From a1b300c8926cbd63d667c5a18e22eab958cc52d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Fri, 5 Jan 2018 17:16:35 +0100 Subject: [PATCH 1/6] Python3: fix imports --- linkcheck/configuration/__init__.py | 16 +++++------ linkcheck/configuration/confparse.py | 7 +++-- linkcheck/cookies.py | 14 +++++++--- linkcheck/director/__init__.py | 7 +++-- linkcheck/director/aggregator.py | 21 ++++++++------ linkcheck/director/logger.py | 7 +++-- linkcheck/director/task.py | 7 +++-- linkcheck/robotparser2.py | 25 ++++++++++------- linkcheck/url.py | 42 +++++++++++++++------------- 9 files changed, 87 insertions(+), 59 deletions(-) diff --git a/linkcheck/configuration/__init__.py b/linkcheck/configuration/__init__.py index c841ba27..c22d23df 100644 --- a/linkcheck/configuration/__init__.py +++ b/linkcheck/configuration/__init__.py @@ -20,12 +20,12 @@ Store metadata and options. import os import re -import urllib -try: - import urlparse -except ImportError: - # Python 3 - from urllib import parse as urlparse +try: # Python 3 + from urllib import parse + from urllib import request +except ImportError: # Python 2 + import urlparse as parse + import urllib as request import shutil import socket import _LinkChecker_configdata as configdata @@ -174,7 +174,7 @@ class Configuration (dict): self["maxrequestspersecond"] = 10 self["maxhttpredirects"] = 10 self["nntpserver"] = os.environ.get("NNTP_SERVER", None) - self["proxy"] = urllib.getproxies() + self["proxy"] = request.getproxies() self["sslverify"] = True self["threads"] = 10 self["timeout"] = 60 @@ -319,7 +319,7 @@ class Configuration (dict): if not url.lower().startswith(("http:", "https:")): log.warn(LOG_CHECK, _("login URL is not a HTTP URL.")) disable = True - urlparts = urlparse.urlsplit(url) + urlparts = parse.urlsplit(url) if not urlparts[0] or not urlparts[1] or not urlparts[2]: log.warn(LOG_CHECK, _("login URL is incomplete.")) disable = True diff --git a/linkcheck/configuration/confparse.py 
b/linkcheck/configuration/confparse.py index 845fa955..42db70cc 100644 --- a/linkcheck/configuration/confparse.py +++ b/linkcheck/configuration/confparse.py @@ -16,7 +16,10 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """Parse configuration files""" -import ConfigParser +try: # Python 3 + from configparser import RawConfigParser +except ImportError: # Python 2 + from ConfigParser import RawConfigParser import os from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins, logconf @@ -30,7 +33,7 @@ def read_multiline (value): yield line -class LCConfigParser (ConfigParser.RawConfigParser, object): +class LCConfigParser (RawConfigParser, object): """ Parse a LinkChecker configuration file. """ diff --git a/linkcheck/cookies.py b/linkcheck/cookies.py index 9fab34a6..ffe8a693 100644 --- a/linkcheck/cookies.py +++ b/linkcheck/cookies.py @@ -18,8 +18,14 @@ Parsing of cookies. """ -import cookielib -import httplib +try: # Python 3 + from http.cookiejar import split_header_words +except ImportError: # Python 2 + from cookielib import split_header_words +try: # Python 3 + from http.client import HTTPMessage +except ImportError: # Python 2 + from httplib import HTTPMessage import requests from cStringIO import StringIO @@ -53,14 +59,14 @@ def from_headers (strheader): """ res = [] fp = StringIO(strheader) - headers = httplib.HTTPMessage(fp, seekable=True) + headers = HTTPMessage(fp, seekable=True) if "Host" not in headers: raise ValueError("Required header 'Host:' missing") host = headers["Host"] path= headers.get("Path", "/") for header in headers.getallmatchingheaders("Set-Cookie"): headervalue = header.split(':', 1)[1] - for pairs in cookielib.split_header_words([headervalue]): + for pairs in split_header_words([headervalue]): for name, value in pairs: cookie = requests.cookies.create_cookie(name, value, domain=host, path=path) diff --git a/linkcheck/director/__init__.py b/linkcheck/director/__init__.py index 99a77dd2..afdbfe12 
100644 --- a/linkcheck/director/__init__.py +++ b/linkcheck/director/__init__.py @@ -18,7 +18,10 @@ Management of checking a queue of links with several threads. """ import os -import thread +try: # Python 3 + from _thread import error as thread_error +except ImportError: # Python 2 + from thread import error as thread_error import time from .. import log, LOG_CHECK, LinkCheckerInterrupt, plugins from ..cache import urlqueue, robots_txt, results @@ -52,7 +55,7 @@ def check_urls (aggregate): raise except KeyboardInterrupt: interrupt(aggregate) - except thread.error: + except thread_error: log.warn(LOG_CHECK, _("Could not start a new thread. Check that the current user" \ " is allowed to start new threads.")) diff --git a/linkcheck/director/aggregator.py b/linkcheck/director/aggregator.py index 64abf983..eb54742c 100644 --- a/linkcheck/director/aggregator.py +++ b/linkcheck/director/aggregator.py @@ -18,14 +18,17 @@ Aggregate needed object instances for checker threads. """ import threading -import thread +try: # Python 3 + import _thread +except ImportError: + import thread as _thread + import requests import time -try: - import urlparse +try: # Python 3 + from urllib import parse except ImportError: - # Python 3 - from urllib import parse as urlparse + import urlparse as parse import random from .. 
import log, LOG_CHECK, strformat, LinkCheckerError from ..decorators import synchronized @@ -92,7 +95,7 @@ class Aggregate (object): form.data[cgipassword] = password for key, value in self.config["loginextrafields"].items(): form.data[key] = value - formurl = urlparse.urljoin(url, form.url) + formurl = parse.urljoin(url, form.url) response = session.post(formurl, data=form.data) self.cookies = session.cookies if len(self.cookies) == 0: @@ -116,19 +119,19 @@ class Aggregate (object): self.threads.append(t) t.start() else: - self.request_sessions[thread.get_ident()] = new_request_session(self.config, self.cookies) + self.request_sessions[_thread.get_ident()] = new_request_session(self.config, self.cookies) checker.check_urls(self.urlqueue, self.logger) @synchronized(_threads_lock) def add_request_session(self): """Add a request session for current thread.""" session = new_request_session(self.config, self.cookies) - self.request_sessions[thread.get_ident()] = session + self.request_sessions[_thread.get_ident()] = session @synchronized(_threads_lock) def get_request_session(self): """Get the request session for current thread.""" - return self.request_sessions[thread.get_ident()] + return self.request_sessions[_thread.get_ident()] @synchronized(_hosts_lock) def wait_for_host(self, host): diff --git a/linkcheck/director/logger.py b/linkcheck/director/logger.py index e9dd684f..19861f0d 100644 --- a/linkcheck/director/logger.py +++ b/linkcheck/director/logger.py @@ -16,7 +16,10 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 
"""Logger for aggregator instances""" import threading -import thread +try: # Python 3 + import _thread +except ImportError: # Python 2 + import thread as _thread from ..decorators import synchronized _lock = threading.Lock() @@ -75,4 +78,4 @@ class Logger (object): if logger.is_active: break else: - thread.interrupt_main() + _thread.interrupt_main() diff --git a/linkcheck/director/task.py b/linkcheck/director/task.py index 9abeab93..52120cd4 100644 --- a/linkcheck/director/task.py +++ b/linkcheck/director/task.py @@ -14,7 +14,10 @@ # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -import thread +try: # Python 3 + import _thread +except ImportError: # Python 2 + import thread as _thread from ..decorators import notimplemented from .. import threader from . import console @@ -28,7 +31,7 @@ class CheckedTask (threader.StoppableThread): try: self.run_checked() except KeyboardInterrupt: - thread.interrupt_main() + _thread.interrupt_main() except Exception: self.internal_error() diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py index 91b9548c..38487e8a 100644 --- a/linkcheck/robotparser2.py +++ b/linkcheck/robotparser2.py @@ -20,20 +20,25 @@ Robots.txt parser. The robots.txt Exclusion Protocol is implemented as specified in http://www.robotstxt.org/wc/norobots-rfc.html """ -try: - import urlparse -except ImportError: - # Python 3 - from urllib import parse as urlparse -import urllib +try: # Python 3 + from urllib import parse +except ImportError: # Python 2 + import urllib as parse +try: # Python 3 + from urllib.parse import urlparse +except ImportError: # Python 2 + from urlparse import urlparse import time + import requests + from . 
import log, LOG_CHECK, configuration __all__ = ["RobotFileParser"] ACCEPT_ENCODING = 'x-gzip,gzip,deflate' + class RobotFileParser (object): """This class provides a set of methods to read, parse and answer questions about a single robots.txt file.""" @@ -79,7 +84,7 @@ class RobotFileParser (object): def set_url (self, url): """Set the URL referring to a robots.txt file.""" self.url = url - self.host, self.path = urlparse.urlparse(url)[1:3] + self.host, self.path = urlparse(url)[1:3] def read (self): """Read the robots.txt URL and feeds it to the parser.""" @@ -162,7 +167,7 @@ class RobotFileParser (object): line = line.split(':', 1) if len(line) == 2: line[0] = line[0].strip().lower() - line[1] = urllib.unquote(line[1].strip()) + line[1] = parse.unquote(line[1].strip()) if line[0] == "user-agent": if state == 2: log.debug(LOG_CHECK, "%r line %d: missing blank line before user-agent directive", self.url, linenumber) @@ -230,7 +235,7 @@ class RobotFileParser (object): return True # search for given user agent matches # the first match counts - url = urllib.quote(urlparse.urlparse(urllib.unquote(url))[2]) or "/" + url = parse.quote(urlparse(parse.unquote(url))[2]) or "/" for entry in self.entries: if entry.applies_to(useragent): return entry.allowance(url) @@ -276,7 +281,7 @@ class RuleLine (object): # an empty value means allow all allowance = True path = '/' - self.path = urllib.quote(path) + self.path = parse.quote(path) self.allowance = allowance def applies_to (self, path): diff --git a/linkcheck/url.py b/linkcheck/url.py index 60107d00..71433fbc 100644 --- a/linkcheck/url.py +++ b/linkcheck/url.py @@ -18,15 +18,17 @@ Functions for parsing and matching URL strings. """ -import re import os -try: - import urlparse -except ImportError: - # Python 3 +import re +try: # Python 3 + from urllib import parse from urllib import parse as urlparse -import urllib +except ImportError: # Python 2 + import urllib as parse + import urlparse + import requests + from . 
import log, LOG_CHECK for scheme in ('ldap', 'irc'): @@ -162,9 +164,9 @@ def parse_qsl (qs, keep_blank_values=0, strict_parsing=0): else: continue if nv[1] or keep_blank_values: - name = urllib.unquote(nv[0].replace('+', ' ')) + name = parse.unquote(nv[0].replace('+', ' ')) if nv[1]: - value = urllib.unquote(nv[1].replace('+', ' ')) + value = parse.unquote(nv[1].replace('+', ' ')) else: value = nv[1] r.append((name, value, sep)) @@ -189,12 +191,12 @@ def idna_encode (host): def url_fix_host (urlparts): """Unquote and fix hostname. Returns is_idn.""" if not urlparts[1]: - urlparts[2] = urllib.unquote(urlparts[2]) + urlparts[2] = parse.unquote(urlparts[2]) return False - userpass, netloc = urllib.splituser(urlparts[1]) + userpass, netloc = parse.splituser(urlparts[1]) if userpass: - userpass = urllib.unquote(userpass) - netloc, is_idn = idna_encode(urllib.unquote(netloc).lower()) + userpass = parse.unquote(userpass) + netloc, is_idn = idna_encode(parse.unquote(netloc).lower()) # a leading backslash in path causes urlsplit() to add the # path components up to the first slash to host # try to find this case... @@ -205,7 +207,7 @@ def url_fix_host (urlparts): if not urlparts[2] or urlparts[2] == '/': urlparts[2] = comps else: - urlparts[2] = "%s%s" % (comps, urllib.unquote(urlparts[2])) + urlparts[2] = "%s%s" % (comps, parse.unquote(urlparts[2])) netloc = netloc[:i] else: # a leading ? 
in path causes urlsplit() to add the query to the @@ -214,7 +216,7 @@ def url_fix_host (urlparts): if i != -1: netloc, urlparts[3] = netloc.split('?', 1) # path - urlparts[2] = urllib.unquote(urlparts[2]) + urlparts[2] = parse.unquote(urlparts[2]) if userpass: # append AT for easy concatenation userpass += "@" @@ -311,7 +313,7 @@ def url_norm (url, encoding=None): encode_unicode = False urlparts = list(urlparse.urlsplit(url)) # scheme - urlparts[0] = urllib.unquote(urlparts[0]).lower() + urlparts[0] = parse.unquote(urlparts[0]).lower() # mailto: urlsplit is broken if urlparts[0] == 'mailto': url_fix_mailto_urlsplit(urlparts) @@ -331,7 +333,7 @@ def url_norm (url, encoding=None): # fix redundant path parts urlparts[2] = collapse_segments(urlparts[2]) # anchor - urlparts[4] = urllib.unquote(urlparts[4]) + urlparts[4] = parse.unquote(urlparts[4]) # quote parts again urlparts[0] = url_quote_part(urlparts[0], encoding=encoding) # scheme urlparts[1] = url_quote_part(urlparts[1], safechars='@:', encoding=encoding) # host @@ -418,11 +420,11 @@ def url_quote_part (s, safechars='/', encoding=None): if encoding is None: encoding = url_encoding s = s.encode(encoding, 'ignore') - return urllib.quote(s, safechars) + return parse.quote(s, safechars) def document_quote (document): """Quote given document.""" - doc, query = urllib.splitquery(document) + doc, query = parse.splitquery(document) doc = url_quote_part(doc, '/=,') if query: return "%s?%s" % (doc, query) @@ -473,8 +475,8 @@ def url_split (url): hostname is always lowercased. 
Precondition: url is syntactically correct URI (eg has no whitespace) """ - scheme, netloc = urllib.splittype(url) - host, document = urllib.splithost(netloc) + scheme, netloc = parse.splittype(url) + host, document = parse.splithost(netloc) port = default_ports.get(scheme, 0) if host: host = host.lower() From 1cdc974e6d7da316e356c5750c239b9ac52572c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Fri, 5 Jan 2018 17:19:20 +0100 Subject: [PATCH 2/6] Python3: fix prints --- tests/__init__.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/tests/__init__.py b/tests/__init__.py index 2e3aaea4..995c8b1a 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -14,6 +14,8 @@ # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +from __future__ import print_function + import signal import subprocess import os @@ -288,9 +290,9 @@ def get_file (filename=None): if __name__ == '__main__': - print "has clamav", has_clamav() - print "has network", has_network() - print "has msgfmt", has_msgfmt() - print "has POSIX", has_posix() - print "has proxy", has_proxy() - print "has X11", has_x11() + print("has clamav", has_clamav()) + print("has network", has_network()) + print("has msgfmt", has_msgfmt()) + print("has POSIX", has_posix()) + print("has proxy", has_proxy()) + print("has X11", has_x11()) From f128c9c168d8510b497a393ee3752f88e34e161c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Fri, 5 Jan 2018 17:21:00 +0100 Subject: [PATCH 3/6] Python3: fix gzip2 format --- linkcheck/gzip2.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/linkcheck/gzip2.py b/linkcheck/gzip2.py index 69d913e5..31447252 100644 --- a/linkcheck/gzip2.py +++ b/linkcheck/gzip2.py @@ -179,7 +179,7 @@ class GzipFile: self.fileobj.write(fname + '\000') 
def _init_read(self): - self.crc = zlib.crc32("") & 0xffffffffL + self.crc = zlib.crc32("") & 0xffffffff self.size = 0 def _read_gzip_header(self): @@ -226,7 +226,7 @@ class GzipFile: raise ValueError, "write() on closed GzipFile object" if len(data) > 0: self.size = self.size + len(data) - self.crc = zlib.crc32(data, self.crc) & 0xffffffffL + self.crc = zlib.crc32(data, self.crc) & 0xffffffff self.fileobj.write( self.compress.compress(data) ) self.offset += len(data) return len(data) @@ -325,7 +325,7 @@ class GzipFile: self._new_member = True def _add_read_data(self, data): - self.crc = zlib.crc32(data, self.crc) & 0xffffffffL + self.crc = zlib.crc32(data, self.crc) & 0xffffffff self.extrabuf = self.extrabuf + data self.extrasize = self.extrasize + len(data) self.size = self.size + len(data) @@ -342,7 +342,7 @@ class GzipFile: if crc32 != self.crc: raise IOError("CRC check failed %s != %s" % (hex(crc32), hex(self.crc))) - elif isize != (self.size & 0xffffffffL): + elif isize != (self.size & 0xffffffff): raise IOError, "Incorrect length of data produced" # Gzip files can be padded with zeroes and still have archives. 
@@ -365,7 +365,7 @@ class GzipFile: self.fileobj.write(self.compress.flush()) write32u(self.fileobj, self.crc) # self.size may exceed 2GB, or even 4GB - write32u(self.fileobj, self.size & 0xffffffffL) + write32u(self.fileobj, self.size & 0xffffffff) self.fileobj = None elif self.mode == READ: self.fileobj = None From d6f39b4e1ae02e82d385077d753077760c11c87f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Fri, 27 Jan 2017 13:33:37 +0100 Subject: [PATCH 5/6] fixes for Python 3: fix proxysupport --- linkcheck/checker/proxysupport.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/linkcheck/checker/proxysupport.py b/linkcheck/checker/proxysupport.py index d06c80d6..19f81456 100644 --- a/linkcheck/checker/proxysupport.py +++ b/linkcheck/checker/proxysupport.py @@ -17,12 +17,14 @@ """ Mixin class for URLs that can be fetched over a proxy.
""" -import urllib -try: - import urlparse +try: # Python 3 + from urllib import parse + from urllib import request + from urllib.parse import splitport except ImportError: - # Python 3 - from urllib import parse as urlparse + from urllib import splitport + import urllib as request + import urlparse as parse import os from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil, httputil @@ -40,7 +42,7 @@ class ProxySupport (object): self.proxyauth = None if not self.proxy: return - proxyurl = urlparse.urlparse(self.proxy) + proxyurl = parse.urlparse(self.proxy) self.proxytype = proxyurl.scheme if self.proxytype not in ('http', 'https'): # Note that invalid proxies might raise TypeError in urllib2, @@ -67,7 +69,7 @@ class ProxySupport (object): def ignore_proxy_host (self): """Check if self.host is in the $no_proxy ignore list.""" - if urllib.proxy_bypass(self.host): + if request.proxy_bypass(self.host): return True no_proxy = os.environ.get("no_proxy") if no_proxy: @@ -96,7 +98,7 @@ class ProxySupport (object): def parse_host_port (host_port): """Parse a host:port string into separate components.""" - host, port = urllib.splitport(host_port.strip()) + host, port = splitport(host_port.strip()) if port is not None: if urlutil.is_numeric_port(port): port = int(port) From e615480850eb1b5ff4e84816347c7df8c35d9d30 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Sat, 6 Jan 2018 18:29:55 +0100 Subject: [PATCH 6/6] Python3: fix reading Safari bookmarks --- linkcheck/bookmarks/safari.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/linkcheck/bookmarks/safari.py b/linkcheck/bookmarks/safari.py index 37eafd7f..6baab726 100644 --- a/linkcheck/bookmarks/safari.py +++ b/linkcheck/bookmarks/safari.py @@ -83,7 +83,10 @@ def get_plist_data_from_string (data): return biplist.readPlistFromString(data) # fall back to normal plistlist try: - return plistlib.readPlistFromString(data) + if hasattr(plistlib, 'readPlistFromBytes'): # Python 3 
+ return plistlib.readPlistFromBytes(data) + else: + return plistlib.readPlistFromString(data) except Exception: # not parseable (eg. not well-formed, or binary) return {}