Merge pull request #123 from PetrDlouhy/python3-easy

Add Python3 support - easiest changes
This commit is contained in:
anarcat 2018-01-23 14:18:03 -05:00 committed by GitHub
commit b3cc3c1911
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 117 additions and 85 deletions

View file

@@ -83,7 +83,10 @@ def get_plist_data_from_string (data):
return biplist.readPlistFromString(data)
# fall back to normal plistlist
try:
return plistlib.readPlistFromString(data)
if hasattr(plistlib, 'readPlistFromBytes'): # Python 3
return plistlib.readPlistFromBytes(data)
else:
return plistlib.readPlistFromString(data)
except Exception:
# not parseable (eg. not well-formed, or binary)
return {}

View file

@@ -17,12 +17,14 @@
"""
Mixin class for URLs that can be fetched over a proxy.
"""
import urllib
try:
import urlparse
try: # Python 3
from urllib import parse
from urllib import request
from urllib.parse import splitport
except ImportError:
# Python 3
from urllib import parse as urlparse
from urllib import splitport
import urllib as request
import urlparse as parse
import os
from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil, httputil
@@ -40,7 +42,7 @@ class ProxySupport (object):
self.proxyauth = None
if not self.proxy:
return
proxyurl = urlparse.urlparse(self.proxy)
proxyurl = parse.urlparse(self.proxy)
self.proxytype = proxyurl.scheme
if self.proxytype not in ('http', 'https'):
# Note that invalid proxies might raise TypeError in urllib2,
@@ -67,7 +69,7 @@ class ProxySupport (object):
def ignore_proxy_host (self):
"""Check if self.host is in the $no_proxy ignore list."""
if urllib.proxy_bypass(self.host):
if request.proxy_bypass(self.host):
return True
no_proxy = os.environ.get("no_proxy")
if no_proxy:
@@ -96,7 +98,7 @@ class ProxySupport (object):
def parse_host_port (host_port):
"""Parse a host:port string into separate components."""
host, port = urllib.splitport(host_port.strip())
host, port = splitport(host_port.strip())
if port is not None:
if urlutil.is_numeric_port(port):
port = int(port)

View file

@@ -20,12 +20,12 @@ Store metadata and options.
import os
import re
import urllib
try:
import urlparse
except ImportError:
# Python 3
from urllib import parse as urlparse
try: # Python 3
from urllib import parse
from urllib import request
except ImportError: # Python 2
import urlparse as parse
import urllib as request
import shutil
import socket
import _LinkChecker_configdata as configdata
@@ -174,7 +174,7 @@ class Configuration (dict):
self["maxrequestspersecond"] = 10
self["maxhttpredirects"] = 10
self["nntpserver"] = os.environ.get("NNTP_SERVER", None)
self["proxy"] = urllib.getproxies()
self["proxy"] = request.getproxies()
self["sslverify"] = True
self["threads"] = 10
self["timeout"] = 60
@@ -319,7 +319,7 @@ class Configuration (dict):
if not url.lower().startswith(("http:", "https:")):
log.warn(LOG_CHECK, _("login URL is not a HTTP URL."))
disable = True
urlparts = urlparse.urlsplit(url)
urlparts = parse.urlsplit(url)
if not urlparts[0] or not urlparts[1] or not urlparts[2]:
log.warn(LOG_CHECK, _("login URL is incomplete."))
disable = True

View file

@@ -16,7 +16,10 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Parse configuration files"""
import ConfigParser
try: # Python 3
from configparser import RawConfigParser
except ImportError: # Python 2
from ConfigParser import RawConfigParser
import os
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins, logconf
@@ -30,7 +33,7 @@ def read_multiline (value):
yield line
class LCConfigParser (ConfigParser.RawConfigParser, object):
class LCConfigParser (RawConfigParser, object):
"""
Parse a LinkChecker configuration file.
"""

View file

@@ -18,8 +18,14 @@
Parsing of cookies.
"""
import cookielib
import httplib
try: # Python 3
from http.cookiejar import split_header_words
except ImportError: # Python 2
from cookielib import split_header_words
try: # Python 3
from http.client import HTTPMessage
except ImportError: # Python 2
from httplib import HTTPMessage
import requests
from cStringIO import StringIO
@@ -53,14 +59,14 @@ def from_headers (strheader):
"""
res = []
fp = StringIO(strheader)
headers = httplib.HTTPMessage(fp, seekable=True)
headers = HTTPMessage(fp, seekable=True)
if "Host" not in headers:
raise ValueError("Required header 'Host:' missing")
host = headers["Host"]
path= headers.get("Path", "/")
for header in headers.getallmatchingheaders("Set-Cookie"):
headervalue = header.split(':', 1)[1]
for pairs in cookielib.split_header_words([headervalue]):
for pairs in split_header_words([headervalue]):
for name, value in pairs:
cookie = requests.cookies.create_cookie(name, value,
domain=host, path=path)

View file

@@ -18,7 +18,10 @@
Management of checking a queue of links with several threads.
"""
import os
import thread
try: # Python 3
from _thread import error as thread_error
except ImportError: # Python 2
from thread import error as thread_error
import time
from .. import log, LOG_CHECK, LinkCheckerInterrupt, plugins
from ..cache import urlqueue, robots_txt, results
@@ -52,7 +55,7 @@ def check_urls (aggregate):
raise
except KeyboardInterrupt:
interrupt(aggregate)
except thread.error:
except thread_error:
log.warn(LOG_CHECK,
_("Could not start a new thread. Check that the current user" \
" is allowed to start new threads."))

View file

@@ -18,14 +18,17 @@
Aggregate needed object instances for checker threads.
"""
import threading
import thread
try: # Python 3
import _thread
except ImportError:
import thread as _thread
import requests
import time
try:
import urlparse
try: # Python 3
from urllib import parse
except ImportError:
# Python 3
from urllib import parse as urlparse
import urlparse as parse
import random
from .. import log, LOG_CHECK, strformat, LinkCheckerError
from ..decorators import synchronized
@@ -92,7 +95,7 @@ class Aggregate (object):
form.data[cgipassword] = password
for key, value in self.config["loginextrafields"].items():
form.data[key] = value
formurl = urlparse.urljoin(url, form.url)
formurl = parse.urljoin(url, form.url)
response = session.post(formurl, data=form.data)
self.cookies = session.cookies
if len(self.cookies) == 0:
@@ -116,19 +119,19 @@ class Aggregate (object):
self.threads.append(t)
t.start()
else:
self.request_sessions[thread.get_ident()] = new_request_session(self.config, self.cookies)
self.request_sessions[_thread.get_ident()] = new_request_session(self.config, self.cookies)
checker.check_urls(self.urlqueue, self.logger)
@synchronized(_threads_lock)
def add_request_session(self):
"""Add a request session for current thread."""
session = new_request_session(self.config, self.cookies)
self.request_sessions[thread.get_ident()] = session
self.request_sessions[_thread.get_ident()] = session
@synchronized(_threads_lock)
def get_request_session(self):
"""Get the request session for current thread."""
return self.request_sessions[thread.get_ident()]
return self.request_sessions[_thread.get_ident()]
@synchronized(_hosts_lock)
def wait_for_host(self, host):

View file

@@ -16,7 +16,10 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""Logger for aggregator instances"""
import threading
import thread
try: # Python 3
import _thread
except ImportError: # Python 2
import thread as _thread
from ..decorators import synchronized
_lock = threading.Lock()
@@ -75,4 +78,4 @@ class Logger (object):
if logger.is_active:
break
else:
thread.interrupt_main()
_thread.interrupt_main()

View file

@@ -14,7 +14,10 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import thread
try: # Python 3
import _thread
except ImportError: # Python 2
import thread as _thread
from ..decorators import notimplemented
from .. import threader
from . import console
@@ -28,7 +31,7 @@ class CheckedTask (threader.StoppableThread):
try:
self.run_checked()
except KeyboardInterrupt:
thread.interrupt_main()
_thread.interrupt_main()
except Exception:
self.internal_error()

View file

@@ -179,7 +179,7 @@ class GzipFile:
self.fileobj.write(fname + '\000')
def _init_read(self):
self.crc = zlib.crc32("") & 0xffffffffL
self.crc = zlib.crc32("") & 0xffffffff
self.size = 0
def _read_gzip_header(self):
@@ -226,7 +226,7 @@ class GzipFile:
raise ValueError, "write() on closed GzipFile object"
if len(data) > 0:
self.size = self.size + len(data)
self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
self.crc = zlib.crc32(data, self.crc) & 0xffffffff
self.fileobj.write( self.compress.compress(data) )
self.offset += len(data)
return len(data)
@@ -325,7 +325,7 @@ class GzipFile:
self._new_member = True
def _add_read_data(self, data):
self.crc = zlib.crc32(data, self.crc) & 0xffffffffL
self.crc = zlib.crc32(data, self.crc) & 0xffffffff
self.extrabuf = self.extrabuf + data
self.extrasize = self.extrasize + len(data)
self.size = self.size + len(data)
@@ -342,7 +342,7 @@ class GzipFile:
if crc32 != self.crc:
raise IOError("CRC check failed %s != %s" % (hex(crc32),
hex(self.crc)))
elif isize != (self.size & 0xffffffffL):
elif isize != (self.size & 0xffffffff):
raise IOError, "Incorrect length of data produced"
# Gzip files can be padded with zeroes and still have archives.
@@ -365,7 +365,7 @@ class GzipFile:
self.fileobj.write(self.compress.flush())
write32u(self.fileobj, self.crc)
# self.size may exceed 2GB, or even 4GB
write32u(self.fileobj, self.size & 0xffffffffL)
write32u(self.fileobj, self.size & 0xffffffff)
self.fileobj = None
elif self.mode == READ:
self.fileobj = None

View file

@@ -20,20 +20,25 @@ Robots.txt parser.
The robots.txt Exclusion Protocol is implemented as specified in
http://www.robotstxt.org/wc/norobots-rfc.html
"""
try:
import urlparse
except ImportError:
# Python 3
from urllib import parse as urlparse
import urllib
try: # Python 3
from urllib import parse
except ImportError: # Python 2
import urllib as parse
try: # Python 3
from urllib.parse import urlparse
except ImportError: # Python 2
from urlparse import urlparse
import time
import requests
from . import log, LOG_CHECK, configuration
__all__ = ["RobotFileParser"]
ACCEPT_ENCODING = 'x-gzip,gzip,deflate'
class RobotFileParser (object):
"""This class provides a set of methods to read, parse and answer
questions about a single robots.txt file."""
@@ -79,7 +84,7 @@ class RobotFileParser (object):
def set_url (self, url):
"""Set the URL referring to a robots.txt file."""
self.url = url
self.host, self.path = urlparse.urlparse(url)[1:3]
self.host, self.path = urlparse(url)[1:3]
def read (self):
"""Read the robots.txt URL and feeds it to the parser."""
@@ -162,7 +167,7 @@ class RobotFileParser (object):
line = line.split(':', 1)
if len(line) == 2:
line[0] = line[0].strip().lower()
line[1] = urllib.unquote(line[1].strip())
line[1] = parse.unquote(line[1].strip())
if line[0] == "user-agent":
if state == 2:
log.debug(LOG_CHECK, "%r line %d: missing blank line before user-agent directive", self.url, linenumber)
@@ -230,7 +235,7 @@ class RobotFileParser (object):
return True
# search for given user agent matches
# the first match counts
url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/"
url = parse.quote(urlparse(parse.unquote(url))[2]) or "/"
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
@@ -276,7 +281,7 @@ class RuleLine (object):
# an empty value means allow all
allowance = True
path = '/'
self.path = urllib.quote(path)
self.path = parse.quote(path)
self.allowance = allowance
def applies_to (self, path):

View file

@@ -18,15 +18,17 @@
Functions for parsing and matching URL strings.
"""
import re
import os
try:
import urlparse
except ImportError:
# Python 3
import re
try: # Python 3
from urllib import parse
from urllib import parse as urlparse
import urllib
except ImportError: # Python 2
import urllib as parse
import urlparse
import requests
from . import log, LOG_CHECK
for scheme in ('ldap', 'irc'):
@@ -162,9 +164,9 @@ def parse_qsl (qs, keep_blank_values=0, strict_parsing=0):
else:
continue
if nv[1] or keep_blank_values:
name = urllib.unquote(nv[0].replace('+', ' '))
name = parse.unquote(nv[0].replace('+', ' '))
if nv[1]:
value = urllib.unquote(nv[1].replace('+', ' '))
value = parse.unquote(nv[1].replace('+', ' '))
else:
value = nv[1]
r.append((name, value, sep))
@@ -189,12 +191,12 @@ def idna_encode (host):
def url_fix_host (urlparts):
"""Unquote and fix hostname. Returns is_idn."""
if not urlparts[1]:
urlparts[2] = urllib.unquote(urlparts[2])
urlparts[2] = parse.unquote(urlparts[2])
return False
userpass, netloc = urllib.splituser(urlparts[1])
userpass, netloc = parse.splituser(urlparts[1])
if userpass:
userpass = urllib.unquote(userpass)
netloc, is_idn = idna_encode(urllib.unquote(netloc).lower())
userpass = parse.unquote(userpass)
netloc, is_idn = idna_encode(parse.unquote(netloc).lower())
# a leading backslash in path causes urlsplit() to add the
# path components up to the first slash to host
# try to find this case...
@@ -205,7 +207,7 @@ def url_fix_host (urlparts):
if not urlparts[2] or urlparts[2] == '/':
urlparts[2] = comps
else:
urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2]))
urlparts[2] = "%s%s" % (comps, parse.unquote(urlparts[2]))
netloc = netloc[:i]
else:
# a leading ? in path causes urlsplit() to add the query to the
@@ -214,7 +216,7 @@ def url_fix_host (urlparts):
if i != -1:
netloc, urlparts[3] = netloc.split('?', 1)
# path
urlparts[2] = urllib.unquote(urlparts[2])
urlparts[2] = parse.unquote(urlparts[2])
if userpass:
# append AT for easy concatenation
userpass += "@"
@@ -311,7 +313,7 @@ def url_norm (url, encoding=None):
encode_unicode = False
urlparts = list(urlparse.urlsplit(url))
# scheme
urlparts[0] = urllib.unquote(urlparts[0]).lower()
urlparts[0] = parse.unquote(urlparts[0]).lower()
# mailto: urlsplit is broken
if urlparts[0] == 'mailto':
url_fix_mailto_urlsplit(urlparts)
@@ -331,7 +333,7 @@ def url_norm (url, encoding=None):
# fix redundant path parts
urlparts[2] = collapse_segments(urlparts[2])
# anchor
urlparts[4] = urllib.unquote(urlparts[4])
urlparts[4] = parse.unquote(urlparts[4])
# quote parts again
urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme
urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host
@@ -418,11 +420,11 @@ def url_quote_part (s, safechars='/', encoding=None):
if encoding is None:
encoding = url_encoding
s = s.encode(encoding, 'ignore')
return urllib.quote(s, safechars)
return parse.quote(s, safechars)
def document_quote (document):
"""Quote given document."""
doc, query = urllib.splitquery(document)
doc, query = parse.splitquery(document)
doc = url_quote_part(doc, '/=,')
if query:
return "%s?%s" % (doc, query)
@@ -473,8 +475,8 @@ def url_split (url):
hostname is always lowercased.
Precondition: url is syntactically correct URI (eg has no whitespace)
"""
scheme, netloc = urllib.splittype(url)
host, document = urllib.splithost(netloc)
scheme, netloc = parse.splittype(url)
host, document = parse.splithost(netloc)
port = default_ports.get(scheme, 0)
if host:
host = host.lower()

View file

@@ -14,6 +14,8 @@
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
from __future__ import print_function
import signal
import subprocess
import os
@@ -288,9 +290,9 @@ def get_file (filename=None):
if __name__ == '__main__':
print "has clamav", has_clamav()
print "has network", has_network()
print "has msgfmt", has_msgfmt()
print "has POSIX", has_posix()
print "has proxy", has_proxy()
print "has X11", has_x11()
print("has clamav", has_clamav())
print("has network", has_network())
print("has msgfmt", has_msgfmt())
print("has POSIX", has_posix())
print("has proxy", has_proxy())
print("has X11", has_x11())

View file

@@ -56,16 +56,13 @@ class TestGTranslator (unittest.TestCase):
def test_gtranslator (self):
"""Test all pofiles for GTranslator brokenness."""
for f in get_pofiles():
fd = file(f)
try:
with open(f, 'rb') as fd:
self.check_file(fd, f)
finally:
fd.close()
def check_file (self, fd, f):
"""Test for GTranslator broken syntax."""
for line in fd:
if line.strip().startswith("#"):
if line.strip().startswith(b"#"):
continue
self.assertFalse("\xc2\xb7" in line,
self.assertFalse(b"\xc2\xb7" in line,
"Broken GTranslator copy/paste in %r:\n%r" % (f, line))