From a92a684ac48dcac017b74688b7bee3d75bb72e9f Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Sat, 30 May 2020 17:01:36 +0100 Subject: [PATCH] Run black on linkcheck/ --- linkcheck/__init__.py | 27 +- linkcheck/ansicolor.py | 100 +++--- linkcheck/better_exchook2.py | 456 +++++++++++++++----------- linkcheck/bookmarks/firefox.py | 2 + linkcheck/bookmarks/safari.py | 3 + linkcheck/cache/robots_txt.py | 3 +- linkcheck/cache/urlqueue.py | 12 +- linkcheck/checker/__init__.py | 59 +++- linkcheck/checker/const.py | 22 +- linkcheck/checker/fileurl.py | 84 +++-- linkcheck/checker/ftpurl.py | 35 +- linkcheck/checker/httpurl.py | 89 +++-- linkcheck/checker/ignoreurl.py | 1 + linkcheck/checker/itmsservicesurl.py | 1 + linkcheck/checker/mailtourl.py | 174 +++++++--- linkcheck/checker/nntpurl.py | 16 +- linkcheck/checker/proxysupport.py | 10 +- linkcheck/checker/telneturl.py | 4 +- linkcheck/checker/unknownurl.py | 9 +- linkcheck/checker/urlbase.py | 296 +++++++++++------ linkcheck/cmdline.py | 9 +- linkcheck/colorama.py | 60 ++-- linkcheck/configuration/__init__.py | 82 +++-- linkcheck/configuration/confparse.py | 87 +++-- linkcheck/containers.py | 3 +- linkcheck/cookies.py | 5 +- linkcheck/decorators.py | 21 +- linkcheck/director/__init__.py | 28 +- linkcheck/director/aggregator.py | 47 ++- linkcheck/director/console.py | 33 +- linkcheck/director/interrupt.py | 6 +- linkcheck/director/logger.py | 1 + linkcheck/dummy.py | 1 + linkcheck/fileutil.py | 3 +- linkcheck/ftpparse.py | 34 +- linkcheck/htmlutil/htmlsoup.py | 12 +- linkcheck/htmlutil/linkparse.py | 79 ++--- linkcheck/htmlutil/loginformsearch.py | 7 +- linkcheck/httputil.py | 10 +- linkcheck/i18n.py | 29 +- linkcheck/lc_cgi.py | 35 +- linkcheck/loader.py | 17 +- linkcheck/lock.py | 1 + linkcheck/log.py | 8 +- linkcheck/logconf.py | 14 +- linkcheck/logger/__init__.py | 82 +++-- linkcheck/logger/blacklist.py | 5 +- linkcheck/logger/csvlog.py | 30 +- linkcheck/logger/customxml.py | 3 +- linkcheck/logger/graph.py | 1 + linkcheck/logger/gxml.py | 2 +- linkcheck/logger/html.py | 248 +++++++++----- linkcheck/logger/sitemapxml.py | 15 +- linkcheck/logger/sql.py | 85 ++--- linkcheck/logger/text.py | 122 ++++--- linkcheck/logger/xmllog.py | 12 +- linkcheck/memoryutil.py | 8 +- linkcheck/mimeutil.py | 2 + linkcheck/network/iputil.py | 1 + linkcheck/parser/__init__.py | 16 +- linkcheck/parser/sitemap.py | 14 +- linkcheck/plugins/__init__.py | 10 +- linkcheck/plugins/anchorcheck.py | 9 +- linkcheck/plugins/httpheaderinfo.py | 6 +- linkcheck/plugins/locationinfo.py | 15 +- linkcheck/plugins/markdowncheck.py | 37 ++- linkcheck/plugins/parsepdf.py | 2 +- linkcheck/plugins/parseword.py | 23 +- linkcheck/plugins/sslcertcheck.py | 24 +- linkcheck/plugins/syntaxchecks.py | 17 +- linkcheck/plugins/viruscheck.py | 10 +- linkcheck/robotparser2.py | 93 +++++- linkcheck/strformat.py | 58 +++- linkcheck/trace.py | 2 +- linkcheck/updater.py | 4 +- linkcheck/url.py | 98 ++++-- 76 files changed, 2021 insertions(+), 1068 deletions(-) diff --git a/linkcheck/__init__.py b/linkcheck/__init__.py index 1e44f7f3..9d8a3f13 100644 --- a/linkcheck/__init__.py +++ b/linkcheck/__init__.py @@ -19,9 +19,14 @@ Main function module for link checking. # version checks import sys + if sys.version_info < (3, 5, 0, 'final', 0): import platform - raise SystemExit("This program requires Python 3.5.0 or later instead of %s." % platform.python_version()) + + raise SystemExit( + "This program requires Python 3.5.0 or later instead of %s." + % platform.python_version() + ) import os import re @@ -48,6 +53,7 @@ def module_path(): def get_install_data(): """Return absolute path of LinkChecker data installation directory.""" from .loader import is_frozen + if is_frozen(): return module_path() return configdata.install_data @@ -55,10 +61,13 @@ def get_install_data(): class LinkCheckerError(Exception): """Exception to be raised on linkchecker-specific check errors.""" + pass + class LinkCheckerInterrupt(Exception): """Used for testing.""" + pass @@ -106,6 +115,7 @@ def init_i18n(loc=None): i18n.init(configdata.name.lower(), locdir, loc=loc) # install translated log level names import logging + logging.addLevelName(logging.CRITICAL, _('CRITICAL')) logging.addLevelName(logging.ERROR, _('ERROR')) logging.addLevelName(logging.WARN, _('WARN')) @@ -124,15 +134,22 @@ def drop_privileges(): if os.name != 'posix': return if os.geteuid() == 0: - log.warn(LOG_CHECK, _("Running as root user; " - "dropping privileges by changing user to nobody.")) + log.warn( + LOG_CHECK, + _( + "Running as root user; " + "dropping privileges by changing user to nobody." + ), + ) import pwd + os.seteuid(pwd.getpwnam('nobody')[3]) if hasattr(signal, "SIGUSR1"): # install SIGUSR1 handler from .decorators import signal_handler + @signal_handler(signal.SIGUSR1) def print_threadstacks(sig, frame): """Print stack traces of all running threads.""" @@ -140,7 +157,9 @@ if hasattr(signal, "SIGUSR1"): for threadId, stack in sys._current_frames().items(): log.warn(LOG_THREAD, "# ThreadID: %s" % threadId) for filename, lineno, name, line in traceback.extract_stack(stack): - log.warn(LOG_THREAD, 'File: "%s", line %d, in %s' % (filename, lineno, name)) + log.warn( + LOG_THREAD, 'File: "%s", line %d, in %s' % (filename, lineno, name) + ) line = line.strip() if line: log.warn(LOG_THREAD, " %s" % line) diff --git a/linkcheck/ansicolor.py b/linkcheck/ansicolor.py index 064aa7f7..01a04507 100644 --- a/linkcheck/ansicolor.py +++ b/linkcheck/ansicolor.py @@ -59,6 +59,7 @@ import os import logging import types from .fileutil import has_module, is_tty + if os.name == 'nt': from . import colorama @@ -79,16 +80,16 @@ concealed = 'concealed' # Control numbers AnsiControl = { - None: '', - bold: '1', - light: '2', - #italic: '3', # unsupported + None: '', + bold: '1', + light: '2', + # italic: '3', # unsupported underline: '4', - blink: '5', - #rapidblink: '6', # unsupported - invert: '7', + blink: '5', + # rapidblink: '6', # unsupported + invert: '7', concealed: '8', - #strikethrough: '9', # unsupported + # strikethrough: '9', # unsupported } # Color constants @@ -116,47 +117,47 @@ InverseColors = (Black, Red, Green, Yellow, Blue, Purple, Cyan, White) # Ansi color numbers; capitalized colors are inverse AnsiColor = { - None: '0', + None: '0', default: '0', - black: '30', - red: '31', - green: '32', - yellow: '33', - blue: '34', - purple: '35', - cyan: '36', - white: '37', - Black: '40', - Red: '41', - Green: '42', - Yellow: '43', - Blue: '44', - Purple: '45', - Cyan: '46', - White: '47', + black: '30', + red: '31', + green: '32', + yellow: '33', + blue: '34', + purple: '35', + cyan: '36', + white: '37', + Black: '40', + Red: '41', + Green: '42', + Yellow: '43', + Blue: '44', + Purple: '45', + Cyan: '46', + White: '47', } if os.name == 'nt': # Windows color numbers; capitalized colors are used as background WinColor = { - None: None, + None: None, default: colorama.GREY, - black: colorama.BLACK, - red: colorama.RED, - green: colorama.GREEN, - yellow: colorama.YELLOW, - blue: colorama.BLUE, - purple: colorama.MAGENTA, - cyan: colorama.CYAN, - white: colorama.GREY, - Black: colorama.BLACK, - Red: colorama.RED, - Green: colorama.GREEN, - Yellow: colorama.YELLOW, - Blue: colorama.BLUE, - Purple: colorama.MAGENTA, - Cyan: colorama.CYAN, - White: colorama.GREY, + black: colorama.BLACK, + red: colorama.RED, + green: colorama.GREEN, + yellow: colorama.YELLOW, + blue: colorama.BLUE, + purple: colorama.MAGENTA, + cyan: colorama.CYAN, + white: colorama.GREY, + Black: colorama.BLACK, + Red: colorama.RED, + Green: colorama.GREEN, + Yellow: colorama.YELLOW, + Blue: colorama.BLUE, + Purple: colorama.MAGENTA, + Cyan: colorama.CYAN, + White: colorama.GREY, } # pc speaker beep escape code @@ -168,9 +169,10 @@ def esc_ansicolor(color): control = '' if ";" in color: control, color = color.split(";", 1) - control = AnsiControl.get(control, '')+";" + control = AnsiControl.get(control, '') + ";" cnum = AnsiColor.get(color, '0') - return AnsiEsc % (control+cnum) + return AnsiEsc % (control + cnum) + AnsiReset = esc_ansicolor(default) @@ -201,6 +203,7 @@ def has_colors(fp): return True elif has_curses: import curses + try: curses.setupterm(os.environ.get("TERM"), fp.fileno()) # More than 8 colors are good enough. @@ -218,19 +221,19 @@ def get_columns(fp): return colorama.get_console_size().X if has_curses: import curses + try: curses.setupterm(os.environ.get("TERM"), fp.fileno()) return curses.tigetnum("cols") except curses.error: - pass + pass return 80 def _write_color_colorama(fp, text, color): """Colorize text with given color.""" foreground, background, style = get_win_color(color) - colorama.set_console(foreground=foreground, background=background, - style=style) + colorama.set_console(foreground=foreground, background=background, style=style) fp.write(text) colorama.reset_console() @@ -314,7 +317,6 @@ class ColoredStreamHandler(logging.StreamHandler): try: self.stream.write("%s" % msg, color=color) except UnicodeError: - self.stream.write("%s" % msg.encode("UTF-8"), - color=color) + self.stream.write("%s" % msg.encode("UTF-8"), color=color) self.stream.write(os.linesep) self.flush() diff --git a/linkcheck/better_exchook2.py b/linkcheck/better_exchook2.py index acafc688..47349df8 100644 --- a/linkcheck/better_exchook2.py +++ b/linkcheck/better_exchook2.py @@ -5,14 +5,14 @@ # Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# +# modification, are permitted provided that the following conditions are met: +# # 1. Redistributions of source code must retain the above copyright notice, this -# list of conditions and the following disclaimer. +# list of conditions and the following disclaimer. # 2. Redistributions in binary form must reproduce the above copyright notice, # this list of conditions and the following disclaimer in the documentation -# and/or other materials provided with the distribution. -# +# and/or other materials provided with the distribution. +# # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND # ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED # WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE @@ -40,201 +40,283 @@ pykeywords = set(keyword.kwlist) def parse_py_statement(line): - state = 0 - curtoken = "" - spaces = " \t\n" - ops = ".,;:+-*/%&=|(){}[]^<>" - i = 0 - def _escape_char(c): - if c == "n": return "\n" - elif c == "t": return "\t" - else: return c - while i < len(line): - c = line[i] - i += 1 - if state == 0: - if c in spaces: pass - elif c in ops: yield ("op", c) - elif c == "#": state = 6 - elif c == "\"": state = 1 - elif c == "'": state = 2 - else: - curtoken = c - state = 3 - elif state == 1: # string via " - if c == "\\": state = 4 - elif c == "\"": - yield ("str", curtoken) - curtoken = "" - state = 0 - else: curtoken += c - elif state == 2: # string via ' - if c == "\\": state = 5 - elif c == "'": - yield ("str", curtoken) - curtoken = "" - state = 0 - else: curtoken += c - elif state == 3: # identifier - if c in spaces + ops + "#\"'": - yield ("id", curtoken) - curtoken = "" - state = 0 - i -= 1 - else: curtoken += c - elif state == 4: # escape in " - curtoken += _escape_char(c) - state = 1 - elif state == 5: # escape in ' - curtoken += _escape_char(c) - state = 2 - elif state == 6: # comment - curtoken += c - if state == 3: yield ("id", curtoken) - elif state == 6: yield ("comment", curtoken) + state = 0 + curtoken = "" + spaces = " \t\n" + ops = ".,;:+-*/%&=|(){}[]^<>" + i = 0 + + def _escape_char(c): + if c == "n": + return "\n" + elif c == "t": + return "\t" + else: + return c + + while i < len(line): + c = line[i] + i += 1 + if state == 0: + if c in spaces: + pass + elif c in ops: + yield ("op", c) + elif c == "#": + state = 6 + elif c == "\"": + state = 1 + elif c == "'": + state = 2 + else: + curtoken = c + state = 3 + elif state == 1: # string via " + if c == "\\": + state = 4 + elif c == "\"": + yield ("str", curtoken) + curtoken = "" + state = 0 + else: + curtoken += c + elif state == 2: # string via ' + if c == "\\": + state = 5 + elif c == "'": + yield ("str", curtoken) + curtoken = "" + state = 0 + else: + curtoken += c + elif state == 3: # identifier + if c in spaces + ops + "#\"'": + yield ("id", curtoken) + curtoken = "" + state = 0 + i -= 1 + else: + curtoken += c + elif state == 4: # escape in " + curtoken += _escape_char(c) + state = 1 + elif state == 5: # escape in ' + curtoken += _escape_char(c) + state = 2 + elif state == 6: # comment + curtoken += c + if state == 3: + yield ("id", curtoken) + elif state == 6: + yield ("comment", curtoken) def grep_full_py_identifiers(tokens): - global pykeywords - tokens = list(tokens) - i = 0 - while i < len(tokens): - tokentype, token = tokens[i] - i += 1 - if tokentype != "id": continue - while i+1 < len(tokens) and tokens[i] == ("op", ".") and tokens[i+1][0] == "id": - token += "." + tokens[i+1][1] - i += 2 - if token == "": continue - if token in pykeywords: continue - if token[0] in ".0123456789": continue - yield token + global pykeywords + tokens = list(tokens) + i = 0 + while i < len(tokens): + tokentype, token = tokens[i] + i += 1 + if tokentype != "id": + continue + while ( + i + 1 < len(tokens) + and tokens[i] == ("op", ".") + and tokens[i + 1][0] == "id" + ): + token += "." + tokens[i + 1][1] + i += 2 + if token == "": + continue + if token in pykeywords: + continue + if token[0] in ".0123456789": + continue + yield token + + +def output(s, out=sys.stdout): + print(s, file=out) -def output(s, out=sys.stdout): print(s, file=out) def output_limit(): - return 300 + return 300 + + +def pp_extra_info(obj, depthlimit=3): + s = [] + if hasattr(obj, "__len__"): + try: + if type(obj) in (bytes, str, list, tuple, dict) and len(obj) <= 5: + pass # don't print len in this case + else: + s += ["len = " + str(obj.__len__())] + except: + pass + if depthlimit > 0 and hasattr(obj, "__getitem__"): + try: + if type(obj) in (bytes, str): + pass # doesn't make sense to get subitems here + else: + subobj = obj.__getitem__(0) + extra_info = pp_extra_info(subobj, depthlimit - 1) + if extra_info != "": + s += ["_[0]: {" + extra_info + "}"] + except: + pass + return ", ".join(s) + -def pp_extra_info(obj, depthlimit = 3): - s = [] - if hasattr(obj, "__len__"): - try: - if type(obj) in (bytes,str,list,tuple,dict) and len(obj) <= 5: - pass # don't print len in this case - else: - s += ["len = " + str(obj.__len__())] - except: pass - if depthlimit > 0 and hasattr(obj, "__getitem__"): - try: - if type(obj) in (bytes,str): - pass # doesn't make sense to get subitems here - else: - subobj = obj.__getitem__(0) - extra_info = pp_extra_info(subobj, depthlimit - 1) - if extra_info != "": - s += ["_[0]: {" + extra_info + "}"] - except: pass - return ", ".join(s) - def pretty_print(obj): - s = repr(obj) - limit = output_limit() - if len(s) > limit: - s = s[:limit - 3] + "..." - extra_info = pp_extra_info(obj) - if extra_info != "": s += ", " + extra_info - return s + s = repr(obj) + limit = output_limit() + if len(s) > limit: + s = s[: limit - 3] + "..." + extra_info = pp_extra_info(obj) + if extra_info != "": + s += ", " + extra_info + return s + def fallback_findfile(filename): - mods = [ m for m in sys.modules.values() if m and hasattr(m, "__file__") and filename in m.__file__ ] - if len(mods) == 0: return None - altfn = mods[0].__file__ - if altfn[-4:-1] == ".py": altfn = altfn[:-1] # *.pyc or whatever - return altfn + mods = [ + m + for m in sys.modules.values() + if m and hasattr(m, "__file__") and filename in m.__file__ + ] + if len(mods) == 0: + return None + altfn = mods[0].__file__ + if altfn[-4:-1] == ".py": + altfn = altfn[:-1] # *.pyc or whatever + return altfn + def better_exchook(etype, value, tb, out=sys.stdout): - output('Traceback (most recent call last):', out=out) - allLocals,allGlobals = {},{} - try: - import linecache - limit = None - if hasattr(sys, 'tracebacklimit'): - limit = sys.tracebacklimit - n = 0 - _tb = tb - def _resolveIdentifier(namespace, id): - obj = namespace[id[0]] - for part in id[1:]: - obj = getattr(obj, part) - return obj - def _trySet(old, prefix, func): - if old is not None: return old - try: return prefix + func() - except KeyError: return old - except Exception as e: - return prefix + "!" + e.__class__.__name__ + ": " + str(e) - while _tb is not None and (limit is None or n < limit): - f = _tb.tb_frame - allLocals.update(f.f_locals) - allGlobals.update(f.f_globals) - lineno = _tb.tb_lineno - co = f.f_code - filename = co.co_filename - name = co.co_name - output(' File "%s", line %d, in %s' % (filename,lineno,name), out=out) - if not os.path.isfile(filename): - altfn = fallback_findfile(filename) - if altfn: - output(" -- couldn't find file, trying this instead: " + altfn, out=out) - filename = altfn - linecache.checkcache(filename) - line = linecache.getline(filename, lineno, f.f_globals) - if line: - line = line.strip() - output(' line: ' + line, out=out) - output(' locals:', out=out) - alreadyPrintedLocals = set() - for tokenstr in grep_full_py_identifiers(parse_py_statement(line)): - splittedtoken = tuple(tokenstr.split(".")) - for token in map(lambda i: splittedtoken[0:i], range(1, len(splittedtoken) + 1)): - if token in alreadyPrintedLocals: continue - tokenvalue = None - tokenvalue = _trySet(tokenvalue, " ", lambda: pretty_print(_resolveIdentifier(f.f_locals, token))) - tokenvalue = _trySet(tokenvalue, " ", lambda: pretty_print(_resolveIdentifier(f.f_globals, token))) - tokenvalue = _trySet(tokenvalue, " ", lambda: pretty_print(_resolveIdentifier(f.f_builtins, token))) - tokenvalue = tokenvalue or "" - output(' ' + ".".join(token) + " = " + tokenvalue, out=out) - alreadyPrintedLocals.add(token) - if len(alreadyPrintedLocals) == 0: output(" no locals", out=out) - else: - output(' -- code not available --', out=out) - _tb = _tb.tb_next - n += 1 + output('Traceback (most recent call last):', out=out) + allLocals, allGlobals = {}, {} + try: + import linecache - except Exception: - output("ERROR: cannot get more detailed exception info because:", out=out) - import traceback - for l in traceback.format_exc().split("\n"): output(" " + l, out=out) - output("simple traceback:", out=out) - traceback.print_tb(tb, None, out) + limit = None + if hasattr(sys, 'tracebacklimit'): + limit = sys.tracebacklimit + n = 0 + _tb = tb + + def _resolveIdentifier(namespace, id): + obj = namespace[id[0]] + for part in id[1:]: + obj = getattr(obj, part) + return obj + + def _trySet(old, prefix, func): + if old is not None: + return old + try: + return prefix + func() + except KeyError: + return old + except Exception as e: + return prefix + "!" + e.__class__.__name__ + ": " + str(e) + + while _tb is not None and (limit is None or n < limit): + f = _tb.tb_frame + allLocals.update(f.f_locals) + allGlobals.update(f.f_globals) + lineno = _tb.tb_lineno + co = f.f_code + filename = co.co_filename + name = co.co_name + output(' File "%s", line %d, in %s' % (filename, lineno, name), out=out) + if not os.path.isfile(filename): + altfn = fallback_findfile(filename) + if altfn: + output( + " -- couldn't find file, trying this instead: " + altfn, + out=out, + ) + filename = altfn + linecache.checkcache(filename) + line = linecache.getline(filename, lineno, f.f_globals) + if line: + line = line.strip() + output(' line: ' + line, out=out) + output(' locals:', out=out) + alreadyPrintedLocals = set() + for tokenstr in grep_full_py_identifiers(parse_py_statement(line)): + splittedtoken = tuple(tokenstr.split(".")) + for token in map( + lambda i: splittedtoken[0:i], range(1, len(splittedtoken) + 1) + ): + if token in alreadyPrintedLocals: + continue + tokenvalue = None + tokenvalue = _trySet( + tokenvalue, + " ", + lambda: pretty_print(_resolveIdentifier(f.f_locals, token)), + ) + tokenvalue = _trySet( + tokenvalue, + " ", + lambda: pretty_print( + _resolveIdentifier(f.f_globals, token) + ), + ) + tokenvalue = _trySet( + tokenvalue, + " ", + lambda: pretty_print( + _resolveIdentifier(f.f_builtins, token) + ), + ) + tokenvalue = tokenvalue or "" + output(' ' + ".".join(token) + " = " + tokenvalue, out=out) + alreadyPrintedLocals.add(token) + if len(alreadyPrintedLocals) == 0: + output(" no locals", out=out) + else: + output(' -- code not available --', out=out) + _tb = _tb.tb_next + n += 1 + + except Exception: + output("ERROR: cannot get more detailed exception info because:", out=out) + import traceback + + for l in traceback.format_exc().split("\n"): + output(" " + l, out=out) + output("simple traceback:", out=out) + traceback.print_tb(tb, None, out) + + import types + + def _some_str(value): + try: + return str(value) + except: + return '' % type(value).__name__ + + def _format_final_exc_line(etype, value): + valuestr = _some_str(value) + if value is None or not valuestr: + line = "%s" % etype + else: + line = "%s: %s" % (etype, valuestr) + return line + + if ( + isinstance(etype, BaseException) + or (hasattr(types, "InstanceType") and isinstance(etype, types.InstanceType)) + or etype is None + or type(etype) is str + ): + output(_format_final_exc_line(etype, value), out=out) + else: + output(_format_final_exc_line(etype.__name__, value), out=out) - import types - def _some_str(value): - try: return str(value) - except: return '' % type(value).__name__ - def _format_final_exc_line(etype, value): - valuestr = _some_str(value) - if value is None or not valuestr: - line = "%s" % etype - else: - line = "%s: %s" % (etype, valuestr) - return line - if (isinstance(etype, BaseException) or - (hasattr(types, "InstanceType") and isinstance(etype, types.InstanceType)) or - etype is None or type(etype) is str): - output(_format_final_exc_line(etype, value), out=out) - else: - output(_format_final_exc_line(etype.__name__, value), out=out) def install(): - sys.excepthook = better_exchook + sys.excepthook = better_exchook diff --git a/linkcheck/bookmarks/firefox.py b/linkcheck/bookmarks/firefox.py index a6eb5e02..5d18ffff 100644 --- a/linkcheck/bookmarks/firefox.py +++ b/linkcheck/bookmarks/firefox.py @@ -16,8 +16,10 @@ """Parser for FireFox bookmark file.""" import re + try: import sqlite3 + has_sqlite = True except ImportError: has_sqlite = False diff --git a/linkcheck/bookmarks/safari.py b/linkcheck/bookmarks/safari.py index bbfe6431..623041c7 100644 --- a/linkcheck/bookmarks/safari.py +++ b/linkcheck/bookmarks/safari.py @@ -15,8 +15,10 @@ # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. import plistlib + try: import biplist + has_biplist = True except ImportError: has_biplist = False @@ -48,6 +50,7 @@ KEY_URIDICTIONARY = 'URIDictionary' KEY_CHILDREN = 'Children' KEY_WEBBOOKMARKTYPE = 'WebBookmarkType' + def parse_plist(entry): """Parse a XML dictionary entry.""" if is_leaf(entry): diff --git a/linkcheck/cache/robots_txt.py b/linkcheck/cache/robots_txt.py index 297428ef..5d2083f1 100644 --- a/linkcheck/cache/robots_txt.py +++ b/linkcheck/cache/robots_txt.py @@ -56,8 +56,7 @@ class RobotsTxt: rp = self.cache[roboturl] return rp.can_fetch(self.useragent, url_data.url) self.misses += 1 - kwargs = dict(auth=url_data.auth, session=url_data.session, - timeout=timeout) + kwargs = dict(auth=url_data.auth, session=url_data.session, timeout=timeout) if hasattr(url_data, "proxy") and hasattr(url_data, "proxy_type"): kwargs["proxies"] = {url_data.proxytype: url_data.proxy} rp = robotparser2.RobotFileParser(**kwargs) diff --git a/linkcheck/cache/urlqueue.py b/linkcheck/cache/urlqueue.py index fae5e0df..6f0ccaa3 100644 --- a/linkcheck/cache/urlqueue.py +++ b/linkcheck/cache/urlqueue.py @@ -24,15 +24,19 @@ from .. import log, LOG_CACHE class Timeout(Exception): """Raised by join()""" + pass + class Empty(Exception): """Exception raised by get().""" + pass NUM_PUTS_CLEANUP = 10000 + class UrlQueue: """A queue supporting several consumer tasks. The task_done() idea is from the Python 2.5 implementation of Queue.Queue().""" @@ -58,7 +62,9 @@ class UrlQueue: # Each put() decreases the number of allowed puts. # This way we can restrict the number of URLs that are checked. if max_allowed_urls is not None and max_allowed_urls <= 0: - raise ValueError("Non-positive number of allowed URLs: %d" % max_allowed_urls) + raise ValueError( + "Non-positive number of allowed URLs: %d" % max_allowed_urls + ) self.max_allowed_urls = max_allowed_urls self.num_puts = 0 @@ -132,7 +138,9 @@ class UrlQueue: self.cleanup() self.queue.append(url_data) self.unfinished_tasks += 1 - cache.add_result(key, None) # add none value to cache to prevent checking this url multiple times + cache.add_result( + key, None + ) # add none value to cache to prevent checking this url multiple times def cleanup(self): """Move cached elements to top.""" diff --git a/linkcheck/checker/__init__.py b/linkcheck/checker/__init__.py index 022e70b3..bfe46f3b 100644 --- a/linkcheck/checker/__init__.py +++ b/linkcheck/checker/__init__.py @@ -23,7 +23,7 @@ import urllib.parse from .. import strformat, url as urlutil, log, LOG_CHECK -MAX_FILESIZE = 1024*1024*10 # 10MB +MAX_FILESIZE = 1024 * 1024 * 10 # 10MB def guess_url(url): @@ -64,9 +64,20 @@ def absolute_url(base_url, base_ref, parent_url): return "" -def get_url_from(base_url, recursion_level, aggregate, - parent_url=None, base_ref=None, line=None, column=None, - page=0, name="", parent_content_type=None, extern=None, url_encoding=None): +def get_url_from( + base_url, + recursion_level, + aggregate, + parent_url=None, + base_ref=None, + line=None, + column=None, + page=0, + name="", + parent_content_type=None, + extern=None, + url_encoding=None, +): """ Get url data from given base data. @@ -112,17 +123,31 @@ def get_url_from(base_url, recursion_level, aggregate, name = base_url.replace("\\", "/") allowed_schemes = aggregate.config["allowedschemes"] # ignore local PHP files with execution directives - local_php = (parent_content_type == 'application/x-httpd-php' and - '' in base_url and scheme == 'file') + local_php = ( + parent_content_type == 'application/x-httpd-php' + and '' in base_url + and scheme == 'file' + ) if local_php or (allowed_schemes and scheme not in allowed_schemes): klass = ignoreurl.IgnoreUrl else: - assume_local_file = (recursion_level == 0) + assume_local_file = recursion_level == 0 klass = get_urlclass_from(scheme, assume_local_file=assume_local_file) log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url) - return klass(base_url, recursion_level, aggregate, - parent_url=parent_url, base_ref=base_ref, - line=line, column=column, page=page, name=name, extern=extern, url_encoding=url_encoding) + return klass( + base_url, + recursion_level, + aggregate, + parent_url=parent_url, + base_ref=base_ref, + line=line, + column=column, + page=page, + name=name, + extern=extern, + url_encoding=url_encoding, + ) def get_urlclass_from(scheme, assume_local_file=False): @@ -175,5 +200,15 @@ def get_index_html(urls): # all the URL classes -from . import (fileurl, unknownurl, ftpurl, httpurl, dnsurl, - mailtourl, telneturl, nntpurl, ignoreurl, itmsservicesurl) # noqa: E402 +from . import ( + fileurl, + unknownurl, + ftpurl, + httpurl, + dnsurl, + mailtourl, + telneturl, + nntpurl, + ignoreurl, + itmsservicesurl, +) # noqa: E402 diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index 74e2600d..0289cbbd 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -33,7 +33,7 @@ ExcSyntaxList = [ # exceptions are internal or system errors ExcCacheList = [ IOError, - OSError, # OSError is thrown on Windows when a file is not found + OSError, # OSError is thrown on Windows when a file is not found LinkCheckerError, DNSException, socket.error, @@ -59,6 +59,7 @@ ExcNoCacheList = [ # firefox bookmark file needs sqlite3 for parsing try: import sqlite3 + ExcCacheList.append(sqlite3.Error) except ImportError: pass @@ -66,6 +67,7 @@ except ImportError: # pyOpenSSL errors try: import OpenSSL + ExcCacheList.append(OpenSSL.SSL.Error) except ImportError: pass @@ -99,22 +101,22 @@ WARN_XML_PARSE_ERROR = "xml-parse-error" # registered warnings Warnings = { - WARN_URL_EFFECTIVE_URL: - _("The effective URL is different from the original."), - WARN_URL_ERROR_GETTING_CONTENT: - _("Could not get the content of the URL."), + WARN_URL_EFFECTIVE_URL: _("The effective URL is different from the original."), + WARN_URL_ERROR_GETTING_CONTENT: _("Could not get the content of the URL."), WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."), WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."), - WARN_URL_RATE_LIMITED: _("The URL request was rate limited so need reduce number of requests."), + WARN_URL_RATE_LIMITED: _( + "The URL request was rate limited so need reduce number of requests." + ), WARN_URL_TOO_LONG: _("The URL is longer than the recommended size."), WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."), WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."), - WARN_FILE_SYSTEM_PATH: - _("The file: path is not the same as the system specific path."), + WARN_FILE_SYSTEM_PATH: _( + "The file: path is not the same as the system specific path." + ), WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."), WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."), - WARN_HTTP_COOKIE_STORE_ERROR: - _("An error occurred while storing a cookie."), + WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."), WARN_IGNORE_URL: _("The URL has been ignored."), WARN_MAIL_NO_MX_HOST: _("The mail MX host could not be found."), WARN_NNTP_NO_SERVER: _("No NNTP server was found."), diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index a0770583..d78e917a 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -39,7 +39,7 @@ def get_files(dirname): if os.path.isfile(fullentry): yield entry elif os.path.isdir(fullentry): - yield entry+"/" + yield entry + "/" def prepare_urlpath_for_nt(path): @@ -48,7 +48,7 @@ def prepare_urlpath_for_nt(path): However urllib.url2pathname expects '////server/path'. """ if '|' not in path: - return "////"+path.lstrip("/") + return "////" + path.lstrip("/") return path @@ -58,9 +58,9 @@ def get_nt_filename(path): head, tail = os.path.split(rest) if not tail: return path - for fname in os.listdir(unc+head): + for fname in os.listdir(unc + head): if fname.lower() == tail.lower(): - return os.path.join(get_nt_filename(unc+head), fname) + return os.path.join(get_nt_filename(unc + head), fname) log.error(LOG_CHECK, "could not find %r in %r", tail, head) return path @@ -92,11 +92,34 @@ class FileUrl(urlbase.UrlBase): Url link with file scheme. """ - def init(self, base_ref, base_url, parent_url, recursion_level, - aggregate, line, column, page, name, url_encoding, extern): + def init( + self, + base_ref, + base_url, + parent_url, + recursion_level, + aggregate, + line, + column, + page, + name, + url_encoding, + extern, + ): """Initialize the scheme.""" - super(FileUrl, self).init(base_ref, base_url, parent_url, - recursion_level, aggregate, line, column, page, name, url_encoding, extern) + super(FileUrl, self).init( + base_ref, + base_url, + parent_url, + recursion_level, + aggregate, + line, + column, + page, + name, + url_encoding, + extern, + ) self.scheme = 'file' def build_base_url(self): @@ -111,14 +134,16 @@ class FileUrl(urlbase.UrlBase): base_url = os.path.expanduser(base_url) if not is_absolute_path(base_url): try: - base_url = os.getcwd()+"/"+base_url + base_url = os.getcwd() + "/" + base_url except OSError as msg: # occurs on stale remote filesystems (eg. NFS) - errmsg = _("Could not get current working directory: %(msg)s") % dict(msg=msg) + errmsg = _( + "Could not get current working directory: %(msg)s" + ) % dict(msg=msg) raise LinkCheckerError(errmsg) if os.path.isdir(base_url): base_url += "/" - base_url = "file://"+base_url + base_url = "file://" + base_url if os.name == "nt": base_url = base_url.replace("\\", "/") # transform c:/windows into /c|/windows @@ -138,6 +163,7 @@ class FileUrl(urlbase.UrlBase): # Otherwise the join function thinks the query is part of # the file name. from .urlbase import url_norm + # norm base url - can raise UnicodeError from url.idna_encode() base_url, is_idn = url_norm(self.base_url, self.encoding) urlparts = list(urllib.parse.urlsplit(base_url)) @@ -148,8 +174,9 @@ class FileUrl(urlbase.UrlBase): # ignore query and fragment url parts for filesystem urls self.urlparts[3] = self.urlparts[4] = '' if self.is_directory() and not self.urlparts[2].endswith('/'): - self.add_warning(_("Added trailing slash to directory."), - tag=WARN_FILE_MISSING_SLASH) + self.add_warning( + _("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH + ) self.urlparts[2] += '/' self.url = urlutil.urlunsplit(self.urlparts) @@ -168,9 +195,10 @@ class FileUrl(urlbase.UrlBase): Try to open the local file. Under NT systems the case sensitivity is checked. """ - if (self.parent_url is not None and - not self.parent_url.startswith("file:")): - msg = _("local files are only checked without parent URL or when the parent URL is also a file") + if self.parent_url is not None and not self.parent_url.startswith("file:"): + msg = _( + "local files are only checked without parent URL or when the parent URL is also a file" + ) raise LinkCheckerError(msg) if self.is_directory(): self.set_result(_("directory")) @@ -190,11 +218,15 @@ class FileUrl(urlbase.UrlBase): path = self.get_os_filename() realpath = get_nt_filename(path) if path != realpath: - self.add_warning(_("The URL path %(path)r is not the same as the " - "system path %(realpath)r. You should always use " - "the system path in URLs.") % \ - {"path": path, "realpath": realpath}, - tag=WARN_FILE_SYSTEM_PATH) + self.add_warning( + _( + "The URL path %(path)r is not the same as the " + "system path %(realpath)r. You should always use " + "the system path in URLs." + ) + % {"path": path, "realpath": realpath}, + tag=WARN_FILE_SYSTEM_PATH, + ) def read_content(self): """Return file content, or in case of directories a dummy HTML file @@ -242,7 +274,9 @@ class FileUrl(urlbase.UrlBase): return True if self.content_type in self.ContentMimetypes: return True - log.debug(LOG_CHECK, "File with content type %r is not parseable.", self.content_type) + log.debug( + LOG_CHECK, "File with content type %r is not parseable.", self.content_type + ) return False def set_content_type(self): @@ -267,7 +301,7 @@ class FileUrl(urlbase.UrlBase): i = url.rindex('/') if i > 6: # remove last filename to make directory internal - url = url[:i+1] + url = url[: i + 1] return re.escape(url) def add_url(self, url, line=0, column=0, page=0, name="", base=None): @@ -277,4 +311,6 @@ class FileUrl(urlbase.UrlBase): if webroot and url and url.startswith("/"): url = webroot + url[1:] log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url) - super(FileUrl, self).add_url(url, line=line, column=column, page=page, name=name, base=base) + super(FileUrl, self).add_url( + url, line=line, column=column, page=page, name=name, base=base + ) diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py index bb37cf68..c8b4be31 100644 --- a/linkcheck/checker/ftpurl.py +++ b/linkcheck/checker/ftpurl.py @@ -50,14 +50,16 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.set_proxy(self.aggregate.config["proxy"].get(self.scheme)) if self.proxy: # using a (HTTP) proxy - http = httpurl.HttpUrl(self.base_url, - self.recursion_level, - self.aggregate, - parent_url=self.parent_url, - base_ref=self.base_ref, - line=self.line, - column=self.column, - name=self.name) + http = httpurl.HttpUrl( + self.base_url, + self.recursion_level, + self.aggregate, + parent_url=self.parent_url, + base_ref=self.base_ref, + line=self.line, + column=self.column, + name=self.name, + ) http.build_url() return http.check() self.login() @@ -91,7 +93,8 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): raise LinkCheckerError(_("Got no answer from FTP server")) except EOFError as msg: raise LinkCheckerError( - _("Remote host has closed connection: %(msg)s") % str(msg)) + _("Remote host has closed connection: %(msg)s") % str(msg) + ) def negotiate_encoding(self): """Check if server can handle UTF-8 encoded filenames. @@ -137,8 +140,9 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): if "%s/" % self.filename in files: if not self.url.endswith('/'): self.add_warning( - _("Missing trailing directory slash in ftp url."), - tag=WARN_FTP_MISSING_SLASH) + _("Missing trailing directory slash in ftp url."), + tag=WARN_FTP_MISSING_SLASH, + ) self.url += '/' return raise ftplib.error_perm("550 File not found") @@ -147,11 +151,13 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): """Get list of filenames in directory. Subdirectories have an ending slash.""" files = [] + def add_entry(line): """Parse list line and add the entry it points to to the file list.""" log.debug(LOG_CHECK, "Directory entry %r", line) from ..ftpparse import ftpparse + fpo = ftpparse(line) if fpo is not None and fpo["name"]: name = fpo["name"] @@ -159,6 +165,7 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): name += "/" if fpo["trycwd"] or fpo["tryretr"]: files.append(name) + self.url_connection.dir(add_entry) return files @@ -168,7 +175,9 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): return True if self.content_type in self.ContentMimetypes: return True - log.debug(LOG_CHECK, "URL with content type %r is not parseable.", self.content_type) + log.debug( + LOG_CHECK, "URL with content type %r is not parseable.", self.content_type + ) return False def is_directory(self): @@ -194,12 +203,14 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): # download file in BINARY mode ftpcmd = "RETR %s" % self.filename buf = StringIO() + def stor_data(s): """Helper method storing given data""" # limit the download size if (buf.tell() + len(s)) > self.max_size: raise LinkCheckerError(_("FTP file size too large")) buf.write(s) + self.url_connection.retrbinary(ftpcmd, stor_data) data = buf.getvalue() buf.close() diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index 9fcdeaa4..e9ed7597 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -17,21 +17,33 @@ Handle http links. """ import requests + # The validity of SSL certs is ignored to be able # the check the URL and recurse into it. # The warning about invalid SSL certs is given to the # user instead. import warnings -warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning) + +warnings.simplefilter( + 'ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning +) from io import BytesIO import re -from .. import (log, LOG_CHECK, strformat, mimeutil, - url as urlutil, LinkCheckerError, httputil) -from . import (internpaturl, proxysupport) +from .. import ( + log, + LOG_CHECK, + strformat, + mimeutil, + url as urlutil, + LinkCheckerError, + httputil, +) +from . import internpaturl, proxysupport + # import warnings -from .const import (WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED) +from .const import WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED from requests.sessions import REDIRECT_STATI # assumed HTTP header encoding @@ -72,9 +84,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): @return: True if access is granted, otherwise False @rtype: bool """ - return (not self.aggregate.config['robotstxt'] - or self.aggregate.robots_txt.allows_url( - self, timeout=self.aggregate.config["timeout"])) + return not self.aggregate.config[ + 'robotstxt' + ] or self.aggregate.robots_txt.allows_url( + self, timeout=self.aggregate.config["timeout"] + ) def content_allows_robots(self): """ @@ -89,8 +103,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): def add_size_info(self): """Get size of URL content from HTTP header.""" - if self.headers and "Content-Length" in self.headers and \ - "Transfer-Encoding" not in self.headers: + if ( + self.headers + and "Content-Length" in self.headers + and "Transfer-Encoding" not in self.headers + ): # Note that content-encoding causes size differences since # the content data is always decoded. try: @@ -139,14 +156,9 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): def build_request(self): """Build a prepared request object.""" clientheaders = {} - if (self.parent_url and - self.parent_url.lower().startswith(HTTP_SCHEMAS)): + if self.parent_url and self.parent_url.lower().startswith(HTTP_SCHEMAS): clientheaders["Referer"] = self.parent_url - kwargs = dict( - method='GET', - url=self.url, - headers=clientheaders, - ) + kwargs = dict(method='GET', url=self.url, headers=clientheaders,) if self.auth: kwargs['auth'] = self.auth log.debug(LOG_CHECK, "Prepare request with %s", kwargs) @@ -223,8 +235,10 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): def is_redirect(self): """Check if current response is a redirect.""" - return ('location' in self.headers and - self.url_connection.status_code in REDIRECT_STATI) + return ( + 'location' in self.headers + and self.url_connection.status_code in REDIRECT_STATI + ) def get_request_kwargs(self): """Construct keyword parameters for Session.request() and @@ -241,8 +255,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): def get_redirects(self, request): """Return iterator of redirects for given request.""" kwargs = self.get_request_kwargs() - return self.session.resolve_redirects(self.url_connection, - request, **kwargs) + return self.session.resolve_redirects(self.url_connection, request, **kwargs) def follow_redirections(self, request): """Follow all redirections of http response.""" @@ -285,21 +298,32 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): def check_response(self): """Check final result and log it.""" - if self.url_connection.status_code >= 400 and self.url_connection.status_code != 429: - self.set_result("%d %s" % (self.url_connection.status_code, self.url_connection.reason), - valid=False) + if ( + self.url_connection.status_code >= 400 + and self.url_connection.status_code != 429 + ): + self.set_result( + "%d %s" % (self.url_connection.status_code, self.url_connection.reason), + valid=False, + ) else: if self.url_connection.status_code == 204: # no content - self.add_warning(self.url_connection.reason, - tag=WARN_HTTP_EMPTY_CONTENT) + self.add_warning( + self.url_connection.reason, tag=WARN_HTTP_EMPTY_CONTENT + ) if self.url_connection.status_code == 429: - self.add_warning("Rate limited (Retry-After: %s)" % self.getheader(_("Retry-After")), - tag=WARN_URL_RATE_LIMITED) + self.add_warning( + "Rate limited (Retry-After: %s)" % self.getheader(_("Retry-After")), + tag=WARN_URL_RATE_LIMITED, + ) if self.url_connection.status_code >= 200: - self.set_result("%r %s" % (self.url_connection.status_code, self.url_connection.reason)) + self.set_result( + "%r %s" + % (self.url_connection.status_code, self.url_connection.reason) + ) else: self.set_result(_("OK")) @@ -325,6 +349,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.add_url(url, name=name) if 'Refresh' in self.headers: from ..htmlutil.linkparse import refresh_re + value = self.headers['Refresh'].strip() mo = refresh_re.match(value) if mo: @@ -352,7 +377,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport): # XXX side effect self.content_type = rtype if self.content_type not in self.ContentMimetypes: - log.debug(LOG_CHECK, "URL with content type %r is not parseable", self.content_type) + log.debug( + LOG_CHECK, + "URL with content type %r is not parseable", + self.content_type, + ) return False return True diff --git a/linkcheck/checker/ignoreurl.py b/linkcheck/checker/ignoreurl.py index 01f7924a..19c129aa 100644 --- a/linkcheck/checker/ignoreurl.py +++ b/linkcheck/checker/ignoreurl.py @@ -19,6 +19,7 @@ Handle ignored URLs. from . import unknownurl + class IgnoreUrl(unknownurl.UnknownUrl): """Always ignored URL.""" diff --git a/linkcheck/checker/itmsservicesurl.py b/linkcheck/checker/itmsservicesurl.py index df201081..919c4df0 100644 --- a/linkcheck/checker/itmsservicesurl.py +++ b/linkcheck/checker/itmsservicesurl.py @@ -20,6 +20,7 @@ Handle itms-services URLs. from . import urlbase from .. import log, LOG_CHECK + class ItmsServicesUrl(urlbase.UrlBase): """Apple iOS application download URLs.""" diff --git a/linkcheck/checker/mailtourl.py b/linkcheck/checker/mailtourl.py index d531798f..d2d7f0b0 100644 --- a/linkcheck/checker/mailtourl.py +++ b/linkcheck/checker/mailtourl.py @@ -53,6 +53,8 @@ def is_literal(domain): _remove_quoted = re.compile(r'\\.').sub _quotes = re.compile(r'["\\]') + + def is_missing_quote(addr): """Return True iff mail address is not correctly quoted.""" return _quotes.match(_remove_quoted("", addr[1:-1])) @@ -62,6 +64,7 @@ def is_missing_quote(addr): EMAIL_CGI_ADDRESS = ("to", "cc", "bcc") EMAIL_CGI_SUBJECT = "subject" + class MailtoUrl(urlbase.UrlBase): """ Url link with mailto scheme. @@ -81,8 +84,10 @@ class MailtoUrl(urlbase.UrlBase): if not self.valid: break elif not self.subject: - self.add_warning(_("No mail addresses or email subject found in `%(url)s'.") % \ - {"url": self.url}) + self.add_warning( + _("No mail addresses or email subject found in `%(url)s'.") + % {"url": self.url} + ) def parse_addresses(self): """Parse all mail addresses out of the URL target. Also parses @@ -92,7 +97,7 @@ class MailtoUrl(urlbase.UrlBase): # cut off leading mailto: and unquote url = urllib.parse.unquote(self.base_url[7:], self.encoding) # search for cc, bcc, to and store in headers - mode = 0 # 0=default, 1=quote, 2=esc + mode = 0 # 0=default, 1=quote, 2=esc quote = None i = 0 for i, c in enumerate(url): @@ -104,7 +109,7 @@ class MailtoUrl(urlbase.UrlBase): mode = 1 elif c == '\\': mode = 2 - elif mode==1: + elif mode == 1: if c == '"' and quote == '"': mode = 0 elif c == '>' and quote == '<': @@ -114,11 +119,13 @@ class MailtoUrl(urlbase.UrlBase): if i < (len(url) - 1): self.addresses.update(getaddresses(url[:i])) try: - headers = urllib.parse.parse_qs(url[(i+1):], strict_parsing=True) + headers = urllib.parse.parse_qs(url[(i + 1) :], strict_parsing=True) for key, vals in headers.items(): if key.lower() in EMAIL_CGI_ADDRESS: # Only the first header value is added - self.addresses.update(getaddresses(urllib.parse.unquote(vals[0], self.encoding))) + self.addresses.update( + getaddresses(urllib.parse.unquote(vals[0], self.encoding)) + ) if key.lower() == EMAIL_CGI_SUBJECT: self.subject = vals[0] except ValueError as err: @@ -145,30 +152,57 @@ class MailtoUrl(urlbase.UrlBase): # restrict email length to 256 characters # http://www.rfc-editor.org/errata_search.php?eid=1003 if len(mail) > 256: - self.set_result(_("Mail address `%(addr)s' too long. Allowed 256 chars, was %(length)d chars.") % \ - {"addr": mail, "length": len(mail)}, valid=False, overwrite=False) + self.set_result( + _( + "Mail address `%(addr)s' too long. Allowed 256 chars, was %(length)d chars." + ) + % {"addr": mail, "length": len(mail)}, + valid=False, + overwrite=False, + ) return if "@" not in mail: - self.set_result(_("Missing `@' in mail address `%(addr)s'.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Missing `@' in mail address `%(addr)s'.") % {"addr": mail}, + valid=False, + overwrite=False, + ) return # note: be sure to use rsplit since "@" can occur in local part local, domain = mail.rsplit("@", 1) if not local: - self.set_result(_("Missing local part of mail address `%(addr)s'.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Missing local part of mail address `%(addr)s'.") % {"addr": mail}, + valid=False, + overwrite=False, + ) return if not domain: - self.set_result(_("Missing domain part of mail address `%(addr)s'.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Missing domain part of mail address `%(addr)s'.") % {"addr": mail}, + valid=False, + overwrite=False, + ) return if len(local) > 64: - self.set_result(_("Local part of mail address `%(addr)s' too long. Allowed 64 chars, was %(length)d chars.") % \ - {"addr": mail, "length": len(local)}, valid=False, overwrite=False) + self.set_result( + _( + "Local part of mail address `%(addr)s' too long. Allowed 64 chars, was %(length)d chars." + ) + % {"addr": mail, "length": len(local)}, + valid=False, + overwrite=False, + ) return if len(domain) > 255: - self.set_result(_("Domain part of mail address `%(addr)s' too long. Allowed 255 chars, was %(length)d chars.") % \ - {"addr": mail, "length": len(local)}, valid=False, overwrite=False) + self.set_result( + _( + "Domain part of mail address `%(addr)s' too long. Allowed 255 chars, was %(length)d chars." + ) + % {"addr": mail, "length": len(local)}, + valid=False, + overwrite=False, + ) return # local part syntax check @@ -176,26 +210,48 @@ class MailtoUrl(urlbase.UrlBase): # Rules taken from http://tools.ietf.org/html/rfc3696#section-3 if is_quoted(local): if is_missing_quote(local): - self.set_result(_("Unquoted double quote or backslash in mail address `%(addr)s'.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Unquoted double quote or backslash in mail address `%(addr)s'.") + % {"addr": mail}, + valid=False, + overwrite=False, + ) return else: if local.startswith("."): - self.set_result(_("Local part of mail address `%(addr)s' may not start with a dot.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Local part of mail address `%(addr)s' may not start with a dot.") + % {"addr": mail}, + valid=False, + overwrite=False, + ) return if local.endswith("."): - self.set_result(_("Local part of mail address `%(addr)s' may not end with a dot.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Local part of mail address `%(addr)s' may not end with a dot.") + % {"addr": mail}, + valid=False, + overwrite=False, + ) return if ".." in local: - self.set_result(_("Local part of mail address `%(addr)s' may not contain two dots.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Local part of mail address `%(addr)s' may not contain two dots.") + % {"addr": mail}, + valid=False, + overwrite=False, + ) return for char in '@ \\",[]': - if char in local.replace("\\%s"%char, ""): - self.set_result(_("Local part of mail address `%(addr)s' contains unquoted character `%(char)s.") % \ - {"addr": mail, "char": char}, valid=False, overwrite=False) + if char in local.replace("\\%s" % char, ""): + self.set_result( + _( + "Local part of mail address `%(addr)s' contains unquoted character `%(char)s." + ) + % {"addr": mail, "char": char}, + valid=False, + overwrite=False, + ) return # domain part syntax check @@ -206,18 +262,30 @@ class MailtoUrl(urlbase.UrlBase): if ip.startswith("IPv6:"): ip = ip[5:] if not iputil.is_valid_ip(ip): - self.set_result(_("Domain part of mail address `%(addr)s' has invalid IP.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Domain part of mail address `%(addr)s' has invalid IP.") + % {"addr": mail}, + valid=False, + overwrite=False, + ) return else: # it's a domain name if not urlutil.is_safe_domain(domain): - self.set_result(_("Invalid domain part of mail address `%(addr)s'.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Invalid domain part of mail address `%(addr)s'.") + % {"addr": mail}, + valid=False, + overwrite=False, + ) return if domain.endswith(".") or domain.split(".")[-1].isdigit(): - self.set_result(_("Invalid top level domain part of mail address `%(addr)s'.") % \ - {"addr": mail}, valid=False, overwrite=False) + self.set_result( + _("Invalid top level domain part of mail address `%(addr)s'.") + % {"addr": mail}, + valid=False, + overwrite=False, + ) return def check_connection(self): @@ -240,6 +308,7 @@ class MailtoUrl(urlbase.UrlBase): Check a single mail address. """ from dns.exception import DNSException + log.debug(LOG_CHECK, "checking mail address %r", mail) mail = strformat.ascii_safe(mail) username, domain = mail.rsplit('@', 1) @@ -249,31 +318,38 @@ class MailtoUrl(urlbase.UrlBase): except DNSException: answers = [] if len(answers) == 0: - self.add_warning(_("No MX mail host for %(domain)s found.") % - {'domain': domain}, - tag=WARN_MAIL_NO_MX_HOST) + self.add_warning( + _("No MX mail host for %(domain)s found.") % {'domain': domain}, + tag=WARN_MAIL_NO_MX_HOST, + ) try: answers = resolver.query(domain, 'A') except DNSException: answers = [] if len(answers) == 0: - self.set_result(_("No host for %(domain)s found.") % - {'domain': domain}, valid=False, - overwrite=True) + self.set_result( + _("No host for %(domain)s found.") % {'domain': domain}, + valid=False, + overwrite=True, + ) return # set preference to zero - mxdata = [(0, rdata.to_text(omit_final_dot=True)) - for rdata in answers] + mxdata = [(0, rdata.to_text(omit_final_dot=True)) for rdata in answers] else: from dns.rdtypes.mxbase import MXBase - mxdata = [(rdata.preference, - rdata.exchange.to_text(omit_final_dot=True)) - for rdata in answers if isinstance(rdata, MXBase)] + + mxdata = [ + (rdata.preference, rdata.exchange.to_text(omit_final_dot=True)) + for rdata in answers + if isinstance(rdata, MXBase) + ] if not mxdata: self.set_result( - _("Got invalid DNS answer %(answer)s for %(domain)s.") % - {'answer': answers, 'domain': domain}, valid=False, - overwrite=True) + _("Got invalid DNS answer %(answer)s for %(domain)s.") + % {'answer': answers, 'domain': domain}, + valid=False, + overwrite=True, + ) return # sort according to preference (lower preference means this # host should be preferred) diff --git a/linkcheck/checker/nntpurl.py b/linkcheck/checker/nntpurl.py index d4f751ec..1b504867 100644 --- a/linkcheck/checker/nntpurl.py +++ b/linkcheck/checker/nntpurl.py @@ -28,6 +28,7 @@ from .const import WARN_NNTP_NO_SERVER, WARN_NNTP_NO_NEWSGROUP random.seed() + class NntpUrl(urlbase.UrlBase): """ Url link with NNTP scheme. @@ -41,8 +42,9 @@ class NntpUrl(urlbase.UrlBase): nntpserver = self.host or self.aggregate.config["nntpserver"] if not nntpserver: self.add_warning( - _("No NNTP server was specified, skipping this URL."), - tag=WARN_NNTP_NO_SERVER) + _("No NNTP server was specified, skipping this URL."), + tag=WARN_NNTP_NO_SERVER, + ) return nntp = self._connect_nntp(nntpserver) group = self.urlparts[2] @@ -50,7 +52,7 @@ class NntpUrl(urlbase.UrlBase): group = group[1:] if '@' in group: # request article info (resp, number mid) - number = nntp.stat("<"+group+">")[1] + number = nntp.stat("<" + group + ">")[1] self.add_info(_('Article number %(num)s found.') % {"num": number}) else: # split off trailing articel span @@ -61,8 +63,9 @@ class NntpUrl(urlbase.UrlBase): self.add_info(_("News group %(name)s found.") % {"name": name}) else: # group name is the empty string - self.add_warning(_("No newsgroup specified in NNTP URL."), - tag=WARN_NNTP_NO_NEWSGROUP) + self.add_warning( + _("No newsgroup specified in NNTP URL."), tag=WARN_NNTP_NO_NEWSGROUP + ) def _connect_nntp(self, nntpserver): """ @@ -85,7 +88,8 @@ class NntpUrl(urlbase.UrlBase): raise if nntp is None: raise LinkCheckerError( - _("NNTP server too busy; tried more than %d times.") % tries) + _("NNTP server too busy; tried more than %d times.") % tries + ) if log.is_debug(LOG_CHECK): nntp.set_debuglevel(1) self.add_info(nntp.getwelcome()) diff --git a/linkcheck/checker/proxysupport.py b/linkcheck/checker/proxysupport.py index b370d15e..8d2c49e7 100644 --- a/linkcheck/checker/proxysupport.py +++ b/linkcheck/checker/proxysupport.py @@ -40,14 +40,14 @@ class ProxySupport: if self.proxytype not in ('http', 'https'): # Note that invalid proxies might raise TypeError in urllib2, # so make sure to stop checking at this point, not later. - msg = _("Proxy value `%(proxy)s' must start with 'http:' or 'https:'.") \ - % dict(proxy=proxy) + msg = _( + "Proxy value `%(proxy)s' must start with 'http:' or 'https:'." + ) % dict(proxy=proxy) raise LinkCheckerError(msg) if self.ignore_proxy_host(): # log proxy without auth info log.debug(LOG_CHECK, "ignoring proxy %r", self.proxy) - self.add_info(_("Ignoring proxy setting `%(proxy)s'.") % - dict(proxy=proxy)) + self.add_info(_("Ignoring proxy setting `%(proxy)s'.") % dict(proxy=proxy)) self.proxy = None return log.debug(LOG_CHECK, "using proxy %r", self.proxy) @@ -58,7 +58,7 @@ class ProxySupport: username = proxyurl.username password = proxyurl.password if proxy.password is not None else "" auth = "%s:%s" % (username, password) - self.proxyauth = "Basic "+httputil.encode_base64(auth) + self.proxyauth = "Basic " + httputil.encode_base64(auth) def ignore_proxy_host(self): """Check if self.host is in the $no_proxy ignore list.""" diff --git a/linkcheck/checker/telneturl.py b/linkcheck/checker/telneturl.py index 613758a6..cfab5893 100644 --- a/linkcheck/checker/telneturl.py +++ b/linkcheck/checker/telneturl.py @@ -64,10 +64,10 @@ class TelnetUrl(urlbase.UrlBase): self.url_connection.open(self.host, self.port) if self.user: self.url_connection.read_until(b"login: ", 10) - self.url_connection.write(encode(self.user)+b"\n") + self.url_connection.write(encode(self.user) + b"\n") if self.password: self.url_connection.read_until(b"Password: ", 10) - self.url_connection.write(encode(self.password)+b"\n") + self.url_connection.write(encode(self.password) + b"\n") # XXX how to tell if we are logged in?? self.url_connection.write(b"exit\n") diff --git a/linkcheck/checker/unknownurl.py b/linkcheck/checker/unknownurl.py index 1701d6c3..6762f2cd 100644 --- a/linkcheck/checker/unknownurl.py +++ b/linkcheck/checker/unknownurl.py @@ -28,12 +28,12 @@ class UnknownUrl(urlbase.UrlBase): """Only logs that this URL is unknown.""" super(UnknownUrl, self).build_url() if self.is_ignored(): - self.add_info(_("%(scheme)s URL ignored.") % - {"scheme": self.scheme.capitalize()}) + self.add_info( + _("%(scheme)s URL ignored.") % {"scheme": self.scheme.capitalize()} + ) self.set_result(_("ignored")) else: - self.set_result(_("URL is unrecognized or has invalid syntax"), - valid=False) + self.set_result(_("URL is unrecognized or has invalid syntax"), valid=False) def is_ignored(self): """Return True if this URL scheme is ignored.""" @@ -260,4 +260,3 @@ ignored_schemes = "^(%s%s%s%s)$" % ( ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE) is_unknown_scheme = ignored_schemes_re.match - diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 1ffcd747..b920bd0c 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -27,15 +27,30 @@ import select from io import BytesIO from . import absolute_url, get_url_from -from .. import (log, LOG_CHECK, - strformat, LinkCheckerError, url as urlutil, trace, get_link_pat) +from .. import ( + log, + LOG_CHECK, + strformat, + LinkCheckerError, + url as urlutil, + trace, + get_link_pat, +) from ..htmlutil import htmlsoup from ..network import iputil -from .const import (WARN_URL_EFFECTIVE_URL, - WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP, - WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE, - WARN_URL_WHITESPACE, URL_MAX_LENGTH, WARN_URL_TOO_LONG, - ExcList, ExcSyntaxList, ExcNoCacheList) +from .const import ( + WARN_URL_EFFECTIVE_URL, + WARN_URL_ERROR_GETTING_CONTENT, + WARN_URL_OBFUSCATED_IP, + WARN_URL_CONTENT_SIZE_ZERO, + WARN_URL_CONTENT_SIZE_TOO_LARGE, + WARN_URL_WHITESPACE, + URL_MAX_LENGTH, + WARN_URL_TOO_LONG, + ExcList, + ExcSyntaxList, + ExcNoCacheList, +) from ..url import url_fix_wayback_query # helper alias @@ -44,6 +59,7 @@ unicode_safe = strformat.unicode_safe # schemes that are invalid with an empty hostname scheme_requires_host = ("ftp", "http", "telnet") + def urljoin(parent, url): """ If url is relative, join parent and url. Else leave url as-is. @@ -61,8 +77,9 @@ def url_norm(url, encoding): try: return urlutil.url_norm(url, encoding=encoding) except UnicodeError: - msg = _("URL has unparsable domain name: %(name)s") % \ - {"name": sys.exc_info()[1]} + msg = _("URL has unparsable domain name: %(name)s") % { + "name": sys.exc_info()[1] + } raise LinkCheckerError(msg) @@ -92,11 +109,22 @@ class UrlBase: } # Read in 16kb chunks - ReadChunkBytes = 1024*16 + ReadChunkBytes = 1024 * 16 - def __init__(self, base_url, recursion_level, aggregate, - parent_url=None, base_ref=None, line=-1, column=-1, page=-1, - name="", url_encoding=None, extern=None): + def __init__( + self, + base_url, + recursion_level, + aggregate, + parent_url=None, + base_ref=None, + line=-1, + column=-1, + page=-1, + name="", + url_encoding=None, + extern=None, + ): """ Initialize check data, and store given variables. @@ -113,20 +141,44 @@ class UrlBase: @param extern: None or (is_extern, is_strict) """ self.reset() - self.init(base_ref, base_url, parent_url, recursion_level, - aggregate, line, column, page, name, url_encoding, extern) + self.init( + base_ref, + base_url, + parent_url, + recursion_level, + aggregate, + line, + column, + page, + name, + url_encoding, + extern, + ) self.check_syntax() if recursion_level == 0: self.add_intern_pattern() self.set_extern(self.url) if self.extern[0] and self.extern[1]: - self.add_info(_("The URL is outside of the domain " - "filter, checked only syntax.")) + self.add_info( + _("The URL is outside of the domain " "filter, checked only syntax.") + ) if not self.has_result: self.set_result(_("filtered")) - def init(self, base_ref, base_url, parent_url, recursion_level, - aggregate, line, column, page, name, url_encoding, extern): + def init( + self, + base_ref, + base_url, + parent_url, + recursion_level, + aggregate, + line, + column, + page, + name, + url_encoding, + extern, + ): """ Initialize internal data. """ @@ -149,17 +201,22 @@ class UrlBase: self.encoding = url_encoding self.extern = extern if self.base_ref: - assert not urlutil.url_needs_quoting(self.base_ref), \ - "unquoted base reference URL %r" % self.base_ref + assert not urlutil.url_needs_quoting(self.base_ref), ( + "unquoted base reference URL %r" % self.base_ref + ) if self.parent_url: - assert not urlutil.url_needs_quoting(self.parent_url), \ - "unquoted parent URL %r" % self.parent_url + assert not urlutil.url_needs_quoting(self.parent_url), ( + "unquoted parent URL %r" % self.parent_url + ) url = absolute_url(self.base_url, base_ref, parent_url) # assume file link if no scheme is found self.scheme = url.split(":", 1)[0].lower() or "file" if self.base_url != base_url: - self.add_warning(_("Leading or trailing whitespace in URL `%(url)s'.") % - {"url": base_url}, tag=WARN_URL_WHITESPACE) + self.add_warning( + _("Leading or trailing whitespace in URL `%(url)s'.") + % {"url": base_url}, + tag=WARN_URL_WHITESPACE, + ) def reset(self): """ @@ -219,8 +276,13 @@ class UrlBase: Set result string and validity. """ if self.has_result and not overwrite: - log.warn(LOG_CHECK, - "Double result %r (previous %r) for %s", msg, self.result, self) + log.warn( + LOG_CHECK, + "Double result %r (previous %r) for %s", + msg, + self.result, + self, + ) else: self.has_result = True if not msg: @@ -288,8 +350,10 @@ class UrlBase: Add a warning string. """ item = (tag, s) - if item not in self.warnings and \ - tag not in self.aggregate.config["ignorewarnings"]: + if ( + item not in self.warnings + and tag not in self.aggregate.config["ignorewarnings"] + ): self.warnings.append(item) def add_info(self, s): @@ -303,7 +367,7 @@ class UrlBase: """Set the URL to be used for caching.""" # remove anchor from cached target url since we assume # URLs with different anchors to have the same content - self.cache_url = urlutil.urlunsplit(self.urlparts[:4]+['']) + self.cache_url = urlutil.urlunsplit(self.urlparts[:4] + ['']) if self.cache_url is not None: assert isinstance(self.cache_url, str), repr(self.cache_url) @@ -332,13 +396,17 @@ class UrlBase: """Check URL name and length.""" effectiveurl = urlutil.urlunsplit(self.urlparts) if self.url != effectiveurl: - self.add_warning(_("Effective URL %(url)r.") % - {"url": effectiveurl}, - tag=WARN_URL_EFFECTIVE_URL) + self.add_warning( + _("Effective URL %(url)r.") % {"url": effectiveurl}, + tag=WARN_URL_EFFECTIVE_URL, + ) self.url = effectiveurl if len(self.url) > URL_MAX_LENGTH and self.scheme != "data": args = dict(len=len(self.url), max=URL_MAX_LENGTH) - self.add_warning(_("URL length %(len)d is longer than %(max)d.") % args, tag=WARN_URL_TOO_LONG) + self.add_warning( + _("URL length %(len)d is longer than %(max)d.") % args, + tag=WARN_URL_TOO_LONG, + ) def build_url(self): """ @@ -367,7 +435,9 @@ class UrlBase: if urlparts[2]: urlparts[2] = urlutil.collapse_segments(urlparts[2]) if not urlparts[0].startswith("feed"): - urlparts[2] = url_fix_wayback_query(urlparts[2]) # restore second / in http[s]:// in wayback path + urlparts[2] = url_fix_wayback_query( + urlparts[2] + ) # restore second / in http[s]:// in wayback path self.url = urlutil.urlunsplit(urlparts) # split into (modifiable) list self.urlparts = strformat.url_unicode_split(self.url) @@ -384,8 +454,9 @@ class UrlBase: port = urlutil.default_ports.get(self.scheme, 0) host, port = urlutil.splitport(host, port=port) if port is None: - raise LinkCheckerError(_("URL host %(host)r has invalid port") % - {"host": host}) + raise LinkCheckerError( + _("URL host %(host)r has invalid port") % {"host": host} + ) self.port = port # set host lowercase self.host = host.lower() @@ -415,9 +486,10 @@ class UrlBase: if ips: self.host = ips[0] self.add_warning( - _("URL %(url)s has obfuscated IP address %(ip)s") % \ - {"url": self.base_url, "ip": ips[0]}, - tag=WARN_URL_OBFUSCATED_IP) + _("URL %(url)s has obfuscated IP address %(ip)s") + % {"url": self.base_url, "ip": ips[0]}, + tag=WARN_URL_OBFUSCATED_IP, + ) def check(self): """Main check function for checking this URL.""" @@ -453,7 +525,10 @@ class UrlBase: value = _('Hostname not found') elif isinstance(exc, UnicodeError): # idna.encode(host) failed - value = _('Bad hostname %(host)r: %(msg)s') % {'host': self.host, 'msg': value} + value = _('Bad hostname %(host)r: %(msg)s') % { + 'host': self.host, + 'msg': value, + } self.set_result(unicode_safe(value), valid=False) def check_content(self): @@ -469,8 +544,10 @@ class UrlBase: return True except tuple(ExcList): value = self.handle_exception() - self.add_warning(_("could not get content: %(msg)s") % - {"msg": value}, tag=WARN_URL_ERROR_GETTING_CONTENT) + self.add_warning( + _("could not get content: %(msg)s") % {"msg": value}, + tag=WARN_URL_ERROR_GETTING_CONTENT, + ) return False def close_connection(self): @@ -492,11 +569,15 @@ class UrlBase: An exception occurred. Log it and set the cache flag. """ etype, evalue = sys.exc_info()[:2] - log.debug(LOG_CHECK, "Error in %s: %s %s", self.url, etype, evalue, exception=True) + log.debug( + LOG_CHECK, "Error in %s: %s %s", self.url, etype, evalue, exception=True + ) # note: etype must be the exact class, not a subclass - if (etype in ExcNoCacheList) or \ - (etype == socket.error and evalue.args[0]==errno.EBADF) or \ - not evalue: + if ( + (etype in ExcNoCacheList) + or (etype == socket.error and evalue.args[0] == errno.EBADF) + or not evalue + ): # EBADF occurs when operating on an already socket self.caching = False # format message ": " @@ -519,10 +600,13 @@ class UrlBase: maxbytes = self.aggregate.config["maxfilesizedownload"] if self.size > maxbytes: self.add_warning( - _("Content size %(size)s is larger than %(maxbytes)s.") % - dict(size=strformat.strsize(self.size), - maxbytes=strformat.strsize(maxbytes)), - tag=WARN_URL_CONTENT_SIZE_TOO_LARGE) + _("Content size %(size)s is larger than %(maxbytes)s.") + % dict( + size=strformat.strsize(self.size), + maxbytes=strformat.strsize(maxbytes), + ), + tag=WARN_URL_CONTENT_SIZE_TOO_LARGE, + ) def allows_simple_recursion(self): """Check recursion level and extern status.""" @@ -579,15 +663,13 @@ class UrlBase: return for entry in self.aggregate.config["externlinks"]: match = entry['pattern'].search(url) - if (entry['negate'] and not match) or \ - (match and not entry['negate']): + if (entry['negate'] and not match) or (match and not entry['negate']): log.debug(LOG_CHECK, "Extern URL %r", url) self.extern = (1, entry['strict']) return for entry in self.aggregate.config["internlinks"]: match = entry['pattern'].search(url) - if (entry['negate'] and not match) or \ - (match and not entry['negate']): + if (entry['negate'] and not match) or (match and not entry['negate']): log.debug(LOG_CHECK, "Intern URL %r", url) self.extern = (0, 0) return @@ -612,8 +694,7 @@ class UrlBase: self.size = len(content) self.dltime = time.time() - t if self.size == 0: - self.add_warning(_("Content size is zero."), - tag=WARN_URL_CONTENT_SIZE_ZERO) + self.add_warning(_("Content size is zero."), tag=WARN_URL_CONTENT_SIZE_ZERO) else: self.aggregate.add_downloaded_bytes(self.size) return content @@ -636,8 +717,9 @@ class UrlBase: # than an internal crash, eh? ISO-8859-1 is a safe fallback in the # sense that any binary blob can be decoded, it'll never cause a # UnicodeDecodeError. - log.debug(LOG_CHECK, "Beautiful Soup detected %s", - self.soup.original_encoding) + log.debug( + LOG_CHECK, "Beautiful Soup detected %s", self.soup.original_encoding + ) self.encoding = self.soup.original_encoding or 'ISO-8859-1' log.debug(LOG_CHECK, "Content encoding %s", self.encoding) self.text = self.data.decode(self.encoding) @@ -675,29 +757,41 @@ class UrlBase: base_ref = urlutil.url_norm(base, encoding=self.encoding)[0] else: base_ref = None - url_data = get_url_from(url, self.recursion_level+1, self.aggregate, - parent_url=self.url, base_ref=base_ref, line=line, column=column, - page=page, name=name, parent_content_type=self.content_type, url_encoding=self.encoding) + url_data = get_url_from( + url, + self.recursion_level + 1, + self.aggregate, + parent_url=self.url, + base_ref=base_ref, + line=line, + column=column, + page=page, + name=name, + parent_content_type=self.content_type, + url_encoding=self.encoding, + ) self.aggregate.urlqueue.put(url_data) def serialized(self, sep=os.linesep): """ Return serialized url check data as unicode string. """ - return unicode_safe(sep).join([ - "%s link" % self.scheme, - "base_url=%r" % self.base_url, - "parent_url=%r" % self.parent_url, - "base_ref=%r" % self.base_ref, - "recursion_level=%d" % self.recursion_level, - "url_connection=%s" % self.url_connection, - "line=%s" % self.line, - "column=%s" % self.column, - "page=%d" % self.page, - "name=%r" % self.name, - "anchor=%r" % self.anchor, - "cache_url=%s" % self.cache_url, - ]) + return unicode_safe(sep).join( + [ + "%s link" % self.scheme, + "base_url=%r" % self.base_url, + "parent_url=%r" % self.parent_url, + "base_ref=%r" % self.base_ref, + "recursion_level=%d" % self.recursion_level, + "url_connection=%s" % self.url_connection, + "line=%s" % self.line, + "column=%s" % self.column, + "page=%d" % self.page, + "name=%r" % self.name, + "anchor=%r" % self.anchor, + "cache_url=%s" % self.cache_url, + ] + ) def get_intern_pattern(self, url=None): """Get pattern for intern URL matching. @@ -717,8 +811,7 @@ class UrlBase: log.debug(LOG_CHECK, "Add intern pattern %r", pat) self.aggregate.config['internlinks'].append(get_link_pat(pat)) except UnicodeError as msg: - res = _("URL has unparsable domain name: %(domain)s") % \ - {"domain": msg} + res = _("URL has unparsable domain name: %(domain)s") % {"domain": msg} self.set_result(res, valid=False) def __str__(self): @@ -792,28 +885,29 @@ class UrlBase: - url_data.last_modified: datetime Last modification date of retrieved page (or None). """ - return dict(valid=self.valid, - extern=self.extern[0], - result=self.result, - warnings=self.warnings[:], - name=self.name or "", - title=self.get_title(), - parent_url=self.parent_url or "", - base_ref=self.base_ref or "", - base_url=self.base_url or "", - url=self.url or "", - domain=(self.urlparts[1] if self.urlparts else ""), - checktime=self.checktime, - dltime=self.dltime, - size=self.size, - info=self.info, - line=self.line, - column=self.column, - page=self.page, - cache_url=self.cache_url, - content_type=self.content_type, - level=self.recursion_level, - modified=self.modified, + return dict( + valid=self.valid, + extern=self.extern[0], + result=self.result, + warnings=self.warnings[:], + name=self.name or "", + title=self.get_title(), + parent_url=self.parent_url or "", + base_ref=self.base_ref or "", + base_url=self.base_url or "", + url=self.url or "", + domain=(self.urlparts[1] if self.urlparts else ""), + checktime=self.checktime, + dltime=self.dltime, + size=self.size, + info=self.info, + line=self.line, + column=self.column, + page=self.page, + cache_url=self.cache_url, + content_type=self.content_type, + level=self.recursion_level, + modified=self.modified, ) def to_wire(self): @@ -847,8 +941,10 @@ urlDataAttr = [ 'level', ] + class CompactUrlData: """Store selected UrlData attributes in slots to minimize memory usage.""" + __slots__ = urlDataAttr def __init__(self, wired_url_data): diff --git a/linkcheck/cmdline.py b/linkcheck/cmdline.py index 9f29e18d..faa4a023 100644 --- a/linkcheck/cmdline.py +++ b/linkcheck/cmdline.py @@ -43,7 +43,9 @@ def print_version(exit_code=0): def print_plugins(folders, exit_code=0): """Print available plugins and exit.""" modules = plugins.get_plugin_modules(folders) - pluginclasses = sorted(plugins.get_plugin_classes(modules), key=lambda x: x.__name__) + pluginclasses = sorted( + plugins.get_plugin_classes(modules), key=lambda x: x.__name__ + ) for pluginclass in pluginclasses: print(pluginclass.__name__) @@ -57,7 +59,10 @@ def print_usage(msg, exit_code=2): """Print a program msg text to stderr and exit.""" program = sys.argv[0] print(_("Error: %(msg)s") % {"msg": msg}, file=console.stderr) - print(_("Execute '%(program)s -h' for help") % {"program": program}, file=console.stderr) + print( + _("Execute '%(program)s -h' for help") % {"program": program}, + file=console.stderr, + ) sys.exit(exit_code) diff --git a/linkcheck/colorama.py b/linkcheck/colorama.py index d3bd1273..49b58c8c 100644 --- a/linkcheck/colorama.py +++ b/linkcheck/colorama.py @@ -26,8 +26,17 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from ctypes import (windll, byref, Structure, c_char, c_short, c_uint32, - c_ushort, ArgumentError, WinError) +from ctypes import ( + windll, + byref, + Structure, + c_char, + c_short, + c_uint32, + c_ushort, + ArgumentError, + WinError, +) # from winbase.h STDOUT = -11 @@ -43,15 +52,19 @@ WORD = c_ushort DWORD = c_uint32 TCHAR = c_char + class COORD(Structure): """struct in wincon.h""" + _fields_ = [ ('X', SHORT), ('Y', SHORT), ] -class SMALL_RECT(Structure): + +class SMALL_RECT(Structure): """struct in wincon.h.""" + _fields_ = [ ("Left", SHORT), ("Top", SHORT), @@ -59,8 +72,10 @@ class SMALL_RECT(Structure): ("Bottom", SHORT), ] + class CONSOLE_SCREEN_BUFFER_INFO(Structure): """struct in wincon.h.""" + _fields_ = [ ("dwSize", COORD), ("dwCursorPosition", COORD), @@ -68,22 +83,29 @@ class CONSOLE_SCREEN_BUFFER_INFO(Structure): ("srWindow", SMALL_RECT), ("dwMaximumWindowSize", COORD), ] + def __str__(self): """Get string representation of console screen buffer info.""" return '(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)' % ( - self.dwSize.Y, self.dwSize.X - , self.dwCursorPosition.Y, self.dwCursorPosition.X - , self.wAttributes - , self.srWindow.Top, self.srWindow.Left, self.srWindow.Bottom, self.srWindow.Right - , self.dwMaximumWindowSize.Y, self.dwMaximumWindowSize.X + self.dwSize.Y, + self.dwSize.X, + self.dwCursorPosition.Y, + self.dwCursorPosition.X, + self.wAttributes, + self.srWindow.Top, + self.srWindow.Left, + self.srWindow.Bottom, + self.srWindow.Right, + self.dwMaximumWindowSize.Y, + self.dwMaximumWindowSize.X, ) + def GetConsoleScreenBufferInfo(stream_id=STDOUT): """Get console screen buffer info object.""" handle = handles[stream_id] csbi = CONSOLE_SCREEN_BUFFER_INFO() - success = windll.kernel32.GetConsoleScreenBufferInfo( - handle, byref(csbi)) + success = windll.kernel32.GetConsoleScreenBufferInfo(handle, byref(csbi)) if not success: raise WinError() return csbi @@ -96,18 +118,18 @@ def SetConsoleTextAttribute(stream_id, attrs): # from wincon.h -BLACK = 0 -BLUE = 1 -GREEN = 2 -CYAN = 3 -RED = 4 +BLACK = 0 +BLUE = 1 +GREEN = 2 +CYAN = 3 +RED = 4 MAGENTA = 5 -YELLOW = 6 -GREY = 7 +YELLOW = 6 +GREY = 7 # from wincon.h -NORMAL = 0x00 # dim text, dim background -BRIGHT = 0x08 # bright text, dim background +NORMAL = 0x00 # dim text, dim background +BRIGHT = 0x08 # bright text, dim background _default_foreground = None _default_background = None diff --git a/linkcheck/configuration/__init__.py b/linkcheck/configuration/__init__.py index 622bda56..aae69b29 100644 --- a/linkcheck/configuration/__init__.py +++ b/linkcheck/configuration/__init__.py @@ -25,30 +25,34 @@ import urllib.request import shutil import socket import _LinkChecker_configdata as configdata -from .. import (log, LOG_CHECK, get_install_data, fileutil) +from .. import log, LOG_CHECK, get_install_data, fileutil from . import confparse from xdg.BaseDirectory import xdg_config_home, xdg_data_home Version = configdata.version ReleaseDate = configdata.release_date AppName = configdata.name -App = AppName+" "+Version +App = AppName + " " + Version Author = configdata.author HtmlAuthor = Author.replace(' ', ' ') -Copyright = "Copyright (C) 2000-2014 "+Author -HtmlCopyright = "Copyright © 2000-2014 "+HtmlAuthor -AppInfo = App+" "+Copyright -HtmlAppInfo = App+", "+HtmlCopyright +Copyright = "Copyright (C) 2000-2014 " + Author +HtmlCopyright = "Copyright © 2000-2014 " + HtmlAuthor +AppInfo = App + " " + Copyright +HtmlAppInfo = App + ", " + HtmlCopyright Url = configdata.url SupportUrl = "https://github.com/linkchecker/linkchecker/issues" Email = configdata.author_email UserAgent = "Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url) -Freeware = AppName+""" comes with ABSOLUTELY NO WARRANTY! +Freeware = ( + AppName + + """ comes with ABSOLUTELY NO WARRANTY! This is free software, and you are welcome to redistribute it under certain conditions. Look at the file `LICENSE' within this distribution.""" +) Portable = configdata.portable + def normpath(path): """Norm given system path with all available norm or expand functions in os.path.""" @@ -58,18 +62,19 @@ def normpath(path): # List Python modules in the form (module, name, version attribute) Modules = ( -# required modules + # required modules ("requests", "Requests", "__version__"), -# optional modules + # optional modules ("argcomplete", "Argcomplete", None), - ("GeoIP", "GeoIP", 'lib_version'), # on Unix systems - ("pygeoip", "GeoIP", 'lib_version'), # on Windows systems + ("GeoIP", "GeoIP", 'lib_version'), # on Unix systems + ("pygeoip", "GeoIP", 'lib_version'), # on Windows systems ("sqlite3", "Pysqlite", 'version'), ("sqlite3", "Sqlite", 'sqlite_version'), ("gconf", "Gconf", '__version__'), ("meliae", "Meliae", '__version__'), ) + def get_modules_info(): """Return unicode string with detected module info.""" module_infos = [] @@ -136,6 +141,7 @@ def get_certifi_file(): the file is not found """ import certifi + filename = certifi.where() if os.path.isfile(filename): return filename @@ -161,8 +167,8 @@ class Configuration(dict): self['robotstxt'] = True self["debugmemory"] = False self["localwebroot"] = None - self["maxfilesizeparse"] = 1*1024*1024 - self["maxfilesizedownload"] = 5*1024*1024 + self["maxfilesizeparse"] = 1 * 1024 * 1024 + self["maxfilesizedownload"] = 5 * 1024 * 1024 self["maxnumurls"] = None self["maxrunseconds"] = None self["maxrequestspersecond"] = 10 @@ -201,6 +207,7 @@ class Configuration(dict): self['logger'] = None self.loggers = {} from ..logger import LoggerClasses + for c in LoggerClasses: key = c.LoggerName self[key] = {} @@ -250,14 +257,11 @@ class Configuration(dict): def add_auth(self, user=None, password=None, pattern=None): """Add given authentication data.""" if not user or not pattern: - log.warn(LOG_CHECK, - _("missing user or URL pattern in authentication data.")) + log.warn( + LOG_CHECK, _("missing user or URL pattern in authentication data.") + ) return - entry = dict( - user=user, - password=password, - pattern=re.compile(pattern), - ) + entry = dict(user=user, password=password, pattern=re.compile(pattern),) self["authentication"].append(entry) def get_user_password(self, url): @@ -299,16 +303,16 @@ class Configuration(dict): url = self["loginurl"] disable = False if not self["loginpasswordfield"]: - log.warn(LOG_CHECK, - _("no CGI password fieldname given for login URL.")) + log.warn(LOG_CHECK, _("no CGI password fieldname given for login URL.")) disable = True if not self["loginuserfield"]: - log.warn(LOG_CHECK, - _("no CGI user fieldname given for login URL.")) + log.warn(LOG_CHECK, _("no CGI user fieldname given for login URL.")) disable = True if self.get_user_password(url) == (None, None): - log.warn(LOG_CHECK, - _("no user/password authentication data found for login URL.")) + log.warn( + LOG_CHECK, + _("no user/password authentication data found for login URL."), + ) disable = True if not url.lower().startswith(("http:", "https:")): log.warn(LOG_CHECK, _("login URL is not a HTTP URL.")) @@ -318,8 +322,7 @@ class Configuration(dict): log.warn(LOG_CHECK, _("login URL is incomplete.")) disable = True if disable: - log.warn(LOG_CHECK, - _("disabling login URL %(url)s.") % {"url": url}) + log.warn(LOG_CHECK, _("disabling login URL %(url)s.") % {"url": url}) self["loginurl"] = None def sanitize_proxies(self): @@ -366,10 +369,14 @@ def get_user_data(): @rtype string """ homedotdir = normpath("~/.linkchecker/") - userdata = homedotdir if os.path.isdir(homedotdir) \ + userdata = ( + homedotdir + if os.path.isdir(homedotdir) else os.path.join(xdg_data_home, "linkchecker") + ) return userdata + def get_plugin_folders(): """Get linkchecker plugin folders. Default is "$XDG_DATA_HOME/linkchecker/plugins/". "~/.linkchecker/plugins/" is also @@ -413,16 +420,20 @@ def get_user_config(): initialconf = normpath(os.path.join(get_share_dir(), "linkcheckerrc")) # per user config settings homedotfile = normpath("~/.linkchecker/linkcheckerrc") - userconf = homedotfile if os.path.isfile(homedotfile) \ + userconf = ( + homedotfile + if os.path.isfile(homedotfile) else os.path.join(xdg_config_home, "linkchecker", "linkcheckerrc") - if os.path.isfile(initialconf) and not os.path.exists(userconf) and \ - not Portable: + ) + if os.path.isfile(initialconf) and not os.path.exists(userconf) and not Portable: # copy the initial configuration to the user configuration try: make_userdir(userconf) shutil.copy(initialconf, userconf) except Exception as errmsg: - msg = _("could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r") + msg = _( + "could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r" + ) args = dict(src=initialconf, dst=userconf, errmsg=errmsg) log.warn(LOG_CHECK, msg % args) return userconf @@ -496,6 +507,7 @@ def get_kde_ftp_proxy(): log.debug(LOG_CHECK, "error getting FTP proxy from KDE: %s", msg) pass + # The following KDE functions are largely ported and ajusted from # Google Chromium: # http://src.chromium.org/viewvc/chrome/trunk/src/net/proxy/proxy_config_service_linux.cc?revision=HEAD&view=markup @@ -527,6 +539,7 @@ def get_kde_ftp_proxy(): # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + def get_kde_config_dir(): """Return KDE configuration directory or None if not found.""" kde_home = get_kde_home_dir() @@ -571,6 +584,7 @@ def get_kde_home_dir(): loc_ro = re.compile(r"\[.*\]$") + @lru_cache(1) def read_kioslaverc(kde_config_dir): """Read kioslaverc into data dictionary.""" @@ -579,7 +593,7 @@ def read_kioslaverc(kde_config_dir): with open(filename) as fd: # First read all lines into dictionary since they can occur # in any order. - for line in fd: + for line in fd: line = line.rstrip() if line.startswith('['): in_proxy_settings = line.startswith("[Proxy Settings]") diff --git a/linkcheck/configuration/confparse.py b/linkcheck/configuration/confparse.py index deed0dbd..74985026 100644 --- a/linkcheck/configuration/confparse.py +++ b/linkcheck/configuration/confparse.py @@ -18,7 +18,15 @@ from configparser import RawConfigParser import os -from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins, logconf +from .. import ( + LinkCheckerError, + get_link_pat, + LOG_CHECK, + log, + fileutil, + plugins, + logconf, +) def read_multiline(value): @@ -50,7 +58,9 @@ class LCConfigParser(RawConfigParser): self.read_ok = super(LCConfigParser, self).read(files) if len(self.read_ok) < len(files): failed_files = set(files) - set(self.read_ok) - log.warn(LOG_CHECK, "Could not read configuration files %s.", failed_files) + log.warn( + LOG_CHECK, "Could not read configuration files %s.", failed_files + ) # Read all the configuration parameters from the given files. self.read_checking_config() self.read_authentication_config() @@ -58,15 +68,16 @@ class LCConfigParser(RawConfigParser): self.read_output_config() self.read_plugin_config() except Exception as msg: - raise LinkCheckerError( - _("Error parsing configuration: %s") % str(msg)) + raise LinkCheckerError(_("Error parsing configuration: %s") % str(msg)) def read_string_option(self, section, option, allowempty=False): """Read a string option.""" if self.has_option(section, option): value = self.get(section, option) if not allowempty and not value: - raise LinkCheckerError(_("invalid empty value for %s: %s\n") % (option, value)) + raise LinkCheckerError( + _("invalid empty value for %s: %s\n") % (option, value) + ) self.config[option] = value def read_boolean_option(self, section, option): @@ -80,10 +91,14 @@ class LCConfigParser(RawConfigParser): num = self.getint(section, option) if min is not None and num < min: raise LinkCheckerError( - _("invalid value for %s: %d must not be less than %d") % (option, num, min)) + _("invalid value for %s: %d must not be less than %d") + % (option, num, min) + ) if max is not None and num < max: raise LinkCheckerError( - _("invalid value for %s: %d must not be greater than %d") % (option, num, max)) + _("invalid value for %s: %d must not be greater than %d") + % (option, num, max) + ) if key is None: key = option self.config[key] = num @@ -92,6 +107,7 @@ class LCConfigParser(RawConfigParser): """Read configuration options in section "output".""" section = "output" from ..logger import LoggerClasses + for c in LoggerClasses: key = c.LoggerName if self.has_section(key): @@ -124,8 +140,12 @@ class LCConfigParser(RawConfigParser): loggers = (x.strip().lower() for x in loggers) # no file output for the blacklist and none Logger from ..logger import LoggerNames - loggers = (x for x in loggers if x in LoggerNames and - x not in ("blacklist", "none")) + + loggers = ( + x + for x in loggers + if x in LoggerNames and x not in ("blacklist", "none") + ) for val in loggers: output = self.config.logger_new(val, fileoutput=1) self.config['fileoutput'].append(output) @@ -145,8 +165,10 @@ class LCConfigParser(RawConfigParser): self.read_int_option(section, "maxfilesizeparse", min=1) self.read_int_option(section, "maxfilesizedownload", min=1) if self.has_option(section, "allowedschemes"): - self.config['allowedschemes'] = [x.strip().lower() for x in \ - self.get(section, 'allowedschemes').split(',')] + self.config['allowedschemes'] = [ + x.strip().lower() + for x in self.get(section, 'allowedschemes').split(',') + ] self.read_boolean_option(section, "debugmemory") self.read_string_option(section, "cookiefile") self.read_boolean_option(section, "robotstxt") @@ -165,21 +187,29 @@ class LCConfigParser(RawConfigParser): for val in read_multiline(self.get(section, "entry")): auth = val.split() if len(auth) == 3: - self.config.add_auth(pattern=auth[0], user=auth[1], - password=auth[2]) + self.config.add_auth( + pattern=auth[0], user=auth[1], password=auth[2] + ) password_fields.append("entry/%s/%s" % (auth[0], auth[1])) elif len(auth) == 2: self.config.add_auth(pattern=auth[0], user=auth[1]) else: raise LinkCheckerError( - _("missing auth part in entry %(val)r") % {"val": val}) + _("missing auth part in entry %(val)r") % {"val": val} + ) # read login URL and field names if self.has_option(section, "loginurl"): val = self.get(section, "loginurl").strip() - if not (val.lower().startswith("http:") or - val.lower().startswith("https:")): - raise LinkCheckerError(_("invalid login URL `%s'. Only " \ - "HTTP and HTTPS URLs are supported.") % val) + if not ( + val.lower().startswith("http:") or val.lower().startswith("https:") + ): + raise LinkCheckerError( + _( + "invalid login URL `%s'. Only " + "HTTP and HTTPS URLs are supported." + ) + % val + ) self.config["loginurl"] = val self.read_string_option(section, "loginuserfield") self.read_string_option(section, "loginpasswordfield") @@ -201,11 +231,22 @@ class LCConfigParser(RawConfigParser): return fn = self.read_ok[0] if fileutil.is_accessable_by_others(fn): - log.warn(LOG_CHECK, "The configuration file %s contains password information (in section [%s] and options %s) and the file is readable by others. Please make the file only readable by you.", fn, section, fields) + log.warn( + LOG_CHECK, + "The configuration file %s contains password information (in section [%s] and options %s) and the file is readable by others. Please make the file only readable by you.", + fn, + section, + fields, + ) if os.name == 'posix': log.warn(LOG_CHECK, _("For example execute 'chmod go-rw %s'.") % fn) elif os.name == 'nt': - log.warn(LOG_CHECK, _("See http://support.microsoft.com/kb/308419 for more info on setting file permissions.")) + log.warn( + LOG_CHECK, + _( + "See http://support.microsoft.com/kb/308419 for more info on setting file permissions." + ), + ) def read_filtering_config(self): """ @@ -213,8 +254,10 @@ class LCConfigParser(RawConfigParser): """ section = "filtering" if self.has_option(section, "ignorewarnings"): - self.config['ignorewarnings'] = [f.strip().lower() for f in \ - self.get(section, 'ignorewarnings').split(',')] + self.config['ignorewarnings'] = [ + f.strip().lower() + for f in self.get(section, 'ignorewarnings').split(',') + ] if self.has_option(section, "ignore"): for line in read_multiline(self.get(section, "ignore")): pat = get_link_pat(line, strict=1) diff --git a/linkcheck/containers.py b/linkcheck/containers.py index 88ec19db..bf541c94 100644 --- a/linkcheck/containers.py +++ b/linkcheck/containers.py @@ -17,6 +17,7 @@ Special container classes. """ + class LFUCache(dict): """Limited cache which purges least frequently used items.""" @@ -40,7 +41,7 @@ class LFUCache(dict): def shrink(self): """Shrink ca. 5% of entries.""" - trim = int(0.05*len(self)) + trim = int(0.05 * len(self)) if trim: items = super(LFUCache, self).items() # sorting function for items diff --git a/linkcheck/cookies.py b/linkcheck/cookies.py index c946ad53..5e17ef29 100644 --- a/linkcheck/cookies.py +++ b/linkcheck/cookies.py @@ -60,7 +60,8 @@ def from_headers(strheader): for headervalue in headers.get_all("Set-Cookie"): for pairs in split_header_words([headervalue]): for name, value in pairs: - cookie = requests.cookies.create_cookie(name, value, - domain=host, path=path) + cookie = requests.cookies.create_cookie( + name, value, domain=host, path=path + ) res.append(cookie) return res diff --git a/linkcheck/decorators.py b/linkcheck/decorators.py index b74c8a04..797c624a 100644 --- a/linkcheck/decorators.py +++ b/linkcheck/decorators.py @@ -56,11 +56,15 @@ def update_func_meta(fake_func, real_func): def deprecated(func): """A decorator which can be used to mark functions as deprecated. It emits a warning when the function is called.""" + def newfunc(*args, **kwargs): """Print deprecated warning and execute original function.""" - warnings.warn("Call to deprecated function %s." % func.__name__, - category=DeprecationWarning) + warnings.warn( + "Call to deprecated function %s." % func.__name__, + category=DeprecationWarning, + ) return func(*args, **kwargs) + return update_func_meta(newfunc, func) @@ -83,19 +87,27 @@ def signal_handler(signal_number): if is_valid_signal and os.name == 'posix': signal.signal(signal_number, function) return function + return newfunc def synchronize(lock, func, log_duration_secs=0): """Return synchronized function acquiring the given lock.""" + def newfunc(*args, **kwargs): """Execute function synchronized.""" t = time.time() with lock: duration = time.time() - t if duration > log_duration_secs > 0: - print("WARN:", func.__name__, "locking took %0.2f seconds" % duration, file=sys.stderr) + print( + "WARN:", + func.__name__, + "locking took %0.2f seconds" % duration, + file=sys.stderr, + ) return func(*args, **kwargs) + return update_func_meta(newfunc, func) @@ -106,11 +118,13 @@ def synchronized(lock): def notimplemented(func): """Raises a NotImplementedError if the function is called.""" + def newfunc(*args, **kwargs): """Raise NotImplementedError""" co = func.func_code attrs = (co.co_name, co.co_filename, co.co_firstlineno) raise NotImplementedError("function %s at %s:%d is not implemented" % attrs) + return update_func_meta(newfunc, func) @@ -127,6 +141,7 @@ def timeit(func, log, limit): print(args, file=log) print(kwargs, file=log) return res + return update_func_meta(newfunc, func) diff --git a/linkcheck/director/__init__.py b/linkcheck/director/__init__.py index 7195a962..e92e3cc1 100644 --- a/linkcheck/director/__init__.py +++ b/linkcheck/director/__init__.py @@ -32,14 +32,12 @@ def check_urls(aggregate): try: aggregate.visit_loginurl() except Exception as msg: - log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % \ - dict(msg=msg)) + log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % dict(msg=msg)) raise try: aggregate.logger.start_log_output() except Exception as msg: - log.error(LOG_CHECK, _("Error starting log output: %(msg)s.") % \ - dict(msg=msg)) + log.error(LOG_CHECK, _("Error starting log output: %(msg)s.") % dict(msg=msg)) raise try: if not aggregate.urlqueue.empty(): @@ -52,9 +50,13 @@ def check_urls(aggregate): except KeyboardInterrupt: interrupt(aggregate) except RuntimeError: - log.warn(LOG_CHECK, - _("Could not start a new thread. Check that the current user" \ - " is allowed to start new threads.")) + log.warn( + LOG_CHECK, + _( + "Could not start a new thread. Check that the current user" + " is allowed to start new threads." + ), + ) abort(aggregate) except Exception: # Catching "Exception" is intentionally done. This saves the program @@ -84,10 +86,8 @@ def interrupt(aggregate): interrupts.""" while True: try: - log.warn(LOG_CHECK, - _("interrupt; waiting for active threads to finish")) - log.warn(LOG_CHECK, - _("another interrupt will exit immediately")) + log.warn(LOG_CHECK, _("interrupt; waiting for active threads to finish")) + log.warn(LOG_CHECK, _("another interrupt will exit immediately")) abort(aggregate) break except KeyboardInterrupt: @@ -113,6 +113,7 @@ def abort_now(): if os.name == 'posix': # Unix systems can use signals import signal + os.kill(os.getpid(), signal.SIGTERM) time.sleep(1) os.kill(os.getpid(), signal.SIGKILL) @@ -130,5 +131,6 @@ def get_aggregate(config): _robots_txt = robots_txt.RobotsTxt(config["useragent"]) plugin_manager = plugins.PluginManager(config) result_cache = results.ResultCache() - return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager, - result_cache) + return aggregator.Aggregate( + config, _urlqueue, _robots_txt, plugin_manager, result_cache + ) diff --git a/linkcheck/director/aggregator.py b/linkcheck/director/aggregator.py index 2bd061d1..5f1597b6 100644 --- a/linkcheck/director/aggregator.py +++ b/linkcheck/director/aggregator.py @@ -34,15 +34,16 @@ _threads_lock = threading.RLock() _hosts_lock = threading.RLock() _downloadedbytes_lock = threading.RLock() + def new_request_session(config, cookies): """Create a new request session.""" session = requests.Session() if cookies: session.cookies = cookies session.max_redirects = config["maxhttpredirects"] - session.headers.update({ - "User-Agent": config["useragent"], - }) + session.headers.update( + {"User-Agent": config["useragent"],} + ) if config["cookiefile"]: for cookie in from_file(config["cookiefile"]): session.cookies.set_cookie(cookie) @@ -52,8 +53,7 @@ def new_request_session(config, cookies): class Aggregate: """Store thread-safe data collections for checker threads.""" - def __init__(self, config, urlqueue, robots_txt, plugin_manager, - result_cache): + def __init__(self, config, urlqueue, robots_txt, plugin_manager, result_cache): """Store given link checking objects.""" self.config = config self.urlqueue = urlqueue @@ -78,7 +78,8 @@ class Aggregate: user, password = self.config.get_user_password(url) if not user and not password: raise LinkCheckerError( - "loginurl is configured but neither user nor password are set") + "loginurl is configured but neither user nor password are set" + ) session = new_request_session(self.config, self.cookies) log.debug(LOG_CHECK, "Getting login form %s", url) kwargs = dict(timeout=self.config["timeout"]) @@ -119,11 +120,15 @@ class Aggregate: num = self.config["threads"] if num > 0: for dummy in range(num): - t = checker.Checker(self.urlqueue, self.logger, self.add_request_session) + t = checker.Checker( + self.urlqueue, self.logger, self.add_request_session + ) self.threads.append(t) t.start() else: - self.request_sessions[threading.get_ident()] = new_request_session(self.config, self.cookies) + self.request_sessions[threading.get_ident()] = new_request_session( + self.config, self.cookies + ) checker.check_urls(self.urlqueue, self.logger) @synchronized(_threads_lock) @@ -162,10 +167,18 @@ class Aggregate: first = False log.info(LOG_CHECK, name[12:]) args = dict( - num=len([x for x in self.threads if x.getName().startswith("CheckThread-")]), + num=len( + [x for x in self.threads if x.getName().startswith("CheckThread-")] + ), timeout=strformat.strduration_long(self.config["aborttimeout"]), ) - log.info(LOG_CHECK, _("%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop.") % args) + log.info( + LOG_CHECK, + _( + "%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop." + ) + % args, + ) @synchronized(_threads_lock) def get_check_threads(self): @@ -187,7 +200,10 @@ class Aggregate: try: self.urlqueue.join(timeout=timeout) except urlqueue.Timeout: - log.warn(LOG_CHECK, "Abort timed out after %d seconds, stopping application." % timeout) + log.warn( + LOG_CHECK, + "Abort timed out after %d seconds, stopping application." % timeout, + ) raise KeyboardInterrupt() @synchronized(_threads_lock) @@ -219,8 +235,9 @@ class Aggregate: def end_log_output(self, **kwargs): """Print ending output to log.""" - kwargs.update(dict( - downloaded_bytes=self.downloaded_bytes, - num_urls = len(self.result_cache), - )) + kwargs.update( + dict( + downloaded_bytes=self.downloaded_bytes, num_urls=len(self.result_cache), + ) + ) self.logger.end_log_output(**kwargs) diff --git a/linkcheck/director/console.py b/linkcheck/director/console.py index fd2f1adc..a32de801 100644 --- a/linkcheck/director/console.py +++ b/linkcheck/director/console.py @@ -35,8 +35,7 @@ class StatusLogger: def log_status(self, checked, in_progress, queue, duration, num_urls): """Write status message to file descriptor.""" - msg = _n("%2d thread active", "%2d threads active", in_progress) % \ - in_progress + msg = _n("%2d thread active", "%2d threads active", in_progress) % in_progress self.write("%s, " % msg) msg = _n("%5d link queued", "%5d links queued", queue) % queue self.write("%s, " % msg) @@ -64,7 +63,9 @@ class StatusLogger: def internal_error(out=stderr, etype=None, evalue=None, tb=None): """Print internal error message (output defaults to stderr).""" print(os.linesep, file=out) - print(_("""********** Oops, I did it again. ************* + print( + _( + """********** Oops, I did it again. ************* You have found an internal error in LinkChecker. Please write a bug report at %s @@ -79,7 +80,11 @@ When using the commandline client: Not disclosing some of the information above due to privacy reasons is ok. I will try to help you nonetheless, but you have to give me something I can work with ;) . -""") % configuration.SupportUrl, file=out) +""" + ) + % configuration.SupportUrl, + file=out, + ) if etype is None: etype = sys.exc_info()[0] if evalue is None: @@ -90,8 +95,11 @@ I can work with ;) . print_app_info(out=out) print_proxy_info(out=out) print_locale_info(out=out) - print(os.linesep, - _("******** LinkChecker internal error, over and out ********"), file=out) + print( + os.linesep, + _("******** LinkChecker internal error, over and out ********"), + file=out, + ) def print_env_info(key, out=stderr): @@ -113,6 +121,7 @@ def print_locale_info(out=stderr): print_env_info(key, out=out) print(_("Default locale:"), i18n.get_locale(), file=out) + # Environment variables influencing the interpreter execution # See python(1) man page. PYTHON_ENV_VARS = ( @@ -131,13 +140,18 @@ PYTHON_ENV_VARS = ( 'PYTHONWARNINGS', 'PYTHONHASHSEED', ) + + def print_app_info(out=stderr): """Print system and application info (output defaults to stderr).""" print(_("System info:"), file=out) print(configuration.App, file=out) print(_("Released on:"), configuration.ReleaseDate, file=out) - print(_("Python %(version)s on %(platform)s") % - {"version": sys.version, "platform": sys.platform}, file=out) + print( + _("Python %(version)s on %(platform)s") + % {"version": sys.version, "platform": sys.platform}, + file=out, + ) for key in PYTHON_ENV_VARS: print_env_info(key, out=out) print(configuration.get_modules_info(), file=out) @@ -148,6 +162,5 @@ def print_app_info(out=stderr): def print_version(out=stdout): """Print the program version (output defaults to stdout).""" - print(configuration.App, _("released"), - configuration.ReleaseDate, file=out) + print(configuration.App, _("released"), configuration.ReleaseDate, file=out) print(configuration.Copyright, file=out) diff --git a/linkcheck/director/interrupt.py b/linkcheck/director/interrupt.py index b6c409de..11a0513e 100644 --- a/linkcheck/director/interrupt.py +++ b/linkcheck/director/interrupt.py @@ -24,6 +24,7 @@ class Interrupt(task.CheckedTask): This gives us a portable SIGALRM implementation. The duration is checked every 5 seconds. """ + WaitSeconds = 5 def __init__(self, duration): @@ -41,5 +42,8 @@ class Interrupt(task.CheckedTask): while not self.stopped(self.WaitSeconds): duration = time.time() - self.start_time if duration > self.duration: - log.warn(LOG_CHECK, "Interrupt after %s" % strformat.strduration_long(duration)) + log.warn( + LOG_CHECK, + "Interrupt after %s" % strformat.strduration_long(duration), + ) raise KeyboardInterrupt() diff --git a/linkcheck/director/logger.py b/linkcheck/director/logger.py index cd706ed1..5428940e 100644 --- a/linkcheck/director/logger.py +++ b/linkcheck/director/logger.py @@ -18,6 +18,7 @@ import threading import _thread from ..decorators import synchronized + _lock = threading.Lock() diff --git a/linkcheck/dummy.py b/linkcheck/dummy.py index e8230c3a..fc32b1da 100644 --- a/linkcheck/dummy.py +++ b/linkcheck/dummy.py @@ -17,6 +17,7 @@ Dummy objects. """ + class Dummy: """A dummy object ignores all access to it. Useful for testing.""" diff --git a/linkcheck/fileutil.py b/linkcheck/fileutil.py index 410cd79b..5c8c8948 100644 --- a/linkcheck/fileutil.py +++ b/linkcheck/fileutil.py @@ -67,6 +67,7 @@ elif "G_BROKEN_FILENAMES" in os.environ: else: FSCODING = "utf-8" + def path_safe(path): """Ensure path string is compatible with the platform file system encoding.""" if isinstance(path, str) and not os.path.supports_unicode_filenames: @@ -83,7 +84,7 @@ def get_temp_file(mode='r', **kwargs): def is_tty(fp): """Check if is a file object pointing to a TTY.""" - return (hasattr(fp, "isatty") and fp.isatty()) + return hasattr(fp, "isatty") and fp.isatty() @lru_cache(128) diff --git a/linkcheck/ftpparse.py b/linkcheck/ftpparse.py index 15b3ac13..b6eff909 100644 --- a/linkcheck/ftpparse.py +++ b/linkcheck/ftpparse.py @@ -19,8 +19,22 @@ Python implementation of a part of Dan Bernstein's ftpparse library. See also http://cr.yp.to/ftpparse.html """ -months = ("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", - "oct", "nov", "dec") +months = ( + "jan", + "feb", + "mar", + "apr", + "may", + "jun", + "jul", + "aug", + "sep", + "oct", + "nov", + "dec", +) + + def ismonth(txt): """Check if given text is a month name.""" return txt.lower() in months @@ -78,20 +92,20 @@ def ftpparse(line): parts = line.split() if len(parts) < 7: return None - del parts[0] # skip permissions + del parts[0] # skip permissions if parts[0] != 'folder': - del parts[0] # skip nlink - del parts[0] # skip uid - del parts[0] # skip gid or size + del parts[0] # skip nlink + del parts[0] # skip uid + del parts[0] # skip gid or size if not ismonth(parts[0]): - del parts[0] # skip size + del parts[0] # skip size if not ismonth(parts[0]): return None - del parts[0] # skip month - del parts[0] # skip day + del parts[0] # skip month + del parts[0] # skip day if not parts: return None - del parts[0] # skip year or time + del parts[0] # skip year or time name = " ".join(parts) # resolve links if line[0] == 'l' and ' -> ' in name: diff --git a/linkcheck/htmlutil/htmlsoup.py b/linkcheck/htmlutil/htmlsoup.py index e5e18799..8819bc6c 100644 --- a/linkcheck/htmlutil/htmlsoup.py +++ b/linkcheck/htmlutil/htmlsoup.py @@ -19,13 +19,17 @@ HTML parser implemented using Beautiful Soup and html.parser. from warnings import filterwarnings -filterwarnings("ignore", +filterwarnings( + "ignore", message="The soupsieve package is not installed. CSS selectors cannot be used.", - category=UserWarning, module="bs4") + category=UserWarning, + module="bs4", +) from bs4 import BeautifulSoup def make_soup(markup, from_encoding=None): - return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding, - multi_valued_attributes=None) + return BeautifulSoup( + markup, "html.parser", from_encoding=from_encoding, multi_valued_attributes=None + ) diff --git a/linkcheck/htmlutil/linkparse.py b/linkcheck/htmlutil/linkparse.py index 43b87479..0e306ff3 100644 --- a/linkcheck/htmlutil/linkparse.py +++ b/linkcheck/htmlutil/linkparse.py @@ -25,41 +25,41 @@ unquote = strformat.unquote # HTML4/5 link tags # ripped mainly from HTML::Tagset.pm with HTML5 added LinkTags = { - 'a': ['href'], - 'applet': ['archive', 'src'], - 'area': ['href'], - 'audio': ['src'], # HTML5 - 'bgsound': ['src'], + 'a': ['href'], + 'applet': ['archive', 'src'], + 'area': ['href'], + 'audio': ['src'], # HTML5 + 'bgsound': ['src'], 'blockquote': ['cite'], - 'body': ['background'], - 'button': ['formaction'], # HTML5 - 'del': ['cite'], - 'embed': ['pluginspage', 'src'], - 'form': ['action'], - 'frame': ['src', 'longdesc'], - 'head': ['profile'], - 'html': ['manifest'], # HTML5 - 'iframe': ['src', 'longdesc'], - 'ilayer': ['background'], - 'img': ['src', 'lowsrc', 'longdesc', 'usemap', 'srcset'], - 'input': ['src', 'usemap', 'formaction'], - 'ins': ['cite'], - 'isindex': ['action'], - 'layer': ['background', 'src'], - 'link': ['href'], - 'meta': ['content', 'href'], - 'object': ['classid', 'data', 'archive', 'usemap', 'codebase'], - 'q': ['cite'], - 'script': ['src'], - 'source': ['src'], # HTML5 - 'table': ['background'], - 'td': ['background'], - 'th': ['background'], - 'tr': ['background'], - 'track': ['src'], # HTML5 - 'video': ['src'], # HTML5 - 'xmp': ['href'], - None: ['style', 'itemtype'], + 'body': ['background'], + 'button': ['formaction'], # HTML5 + 'del': ['cite'], + 'embed': ['pluginspage', 'src'], + 'form': ['action'], + 'frame': ['src', 'longdesc'], + 'head': ['profile'], + 'html': ['manifest'], # HTML5 + 'iframe': ['src', 'longdesc'], + 'ilayer': ['background'], + 'img': ['src', 'lowsrc', 'longdesc', 'usemap', 'srcset'], + 'input': ['src', 'usemap', 'formaction'], + 'ins': ['cite'], + 'isindex': ['action'], + 'layer': ['background', 'src'], + 'link': ['href'], + 'meta': ['content', 'href'], + 'object': ['classid', 'data', 'archive', 'usemap', 'codebase'], + 'q': ['cite'], + 'script': ['src'], + 'source': ['src'], # HTML5 + 'table': ['background'], + 'td': ['background'], + 'th': ['background'], + 'tr': ['background'], + 'track': ['src'], # HTML5 + 'video': ['src'], # HTML5 + 'xmp': ['href'], + None: ['style', 'itemtype'], } # HTML anchor tags @@ -70,8 +70,8 @@ AnchorTags = { # WML tags WmlTags = { - 'a': ['href'], - 'go': ['href'], + 'a': ['href'], + 'go': ['href'], 'img': ['src'], } @@ -219,6 +219,9 @@ def find_links(soup, callback, tags): lf = LinkFinder(callback, tags) for element in soup.find_all(True): lf.html_element( - element.name, element.attrs, element.text.strip(), + element.name, + element.attrs, + element.text.strip(), element.sourceline, - None if element.sourcepos is None else element.sourcepos + 1) + None if element.sourcepos is None else element.sourcepos + 1, + ) diff --git a/linkcheck/htmlutil/loginformsearch.py b/linkcheck/htmlutil/loginformsearch.py index 576bef01..b4db4b67 100644 --- a/linkcheck/htmlutil/loginformsearch.py +++ b/linkcheck/htmlutil/loginformsearch.py @@ -19,6 +19,7 @@ HTML form utils from ..htmlutil import htmlsoup from .. import log, LOG_CHECK + class Form: """Store HTML form URL and form data.""" @@ -44,10 +45,8 @@ def search_form(content, cgiuser, cgipassword): cginames = {cgiuser, cgipassword} - {None} for form_element in soup.find_all("form", action=True): form = Form(form_element["action"]) - for input_element in form_element.find_all("input", - attrs={"name": True}): - form.add_value( - input_element["name"], input_element.attrs.get("value")) + for input_element in form_element.find_all("input", attrs={"name": True}): + form.add_value(input_element["name"], input_element.attrs.get("value")) if cginames <= set(form.data): log.debug(LOG_CHECK, "Found form %s", form) return form diff --git a/linkcheck/httputil.py b/linkcheck/httputil.py index 4ec3e2c6..f7142ce2 100644 --- a/linkcheck/httputil.py +++ b/linkcheck/httputil.py @@ -27,14 +27,10 @@ def x509_to_dict(x509): subject, subjectAltName and optional notAfter. """ from requests.packages.urllib3.contrib.pyopenssl import get_subj_alt_name + res = { - 'subject': ( - (('commonName', x509.get_subject().CN),), - ), - 'subjectAltName': [ - ('DNS', value) - for value in get_subj_alt_name(x509) - ] + 'subject': ((('commonName', x509.get_subject().CN),),), + 'subjectAltName': [('DNS', value) for value in get_subj_alt_name(x509)], } notAfter = x509.get_notAfter() if notAfter is not None: diff --git a/linkcheck/i18n.py b/linkcheck/i18n.py index 132aa024..4fde6bdf 100644 --- a/linkcheck/i18n.py +++ b/linkcheck/i18n.py @@ -30,13 +30,16 @@ default_language = default_encoding = None default_directory = None default_domain = None + def install_builtin(translator, do_unicode): """Install _() and _n() gettext methods into default namespace.""" import builtins + builtins.__dict__['_'] = translator.gettext # also install ngettext builtins.__dict__['_n'] = translator.ngettext + class Translator(gettext.GNUTranslations): """A translation class always installing its gettext methods into the default namespace.""" @@ -84,18 +87,29 @@ def init(domain, directory, loc=None): def install_language(language): """Install translation service routines into default namespace.""" - translator = get_translator(default_domain, default_directory, - languages=[get_lang(language)], fallback=True) + translator = get_translator( + default_domain, default_directory, languages=[get_lang(language)], fallback=True + ) do_unicode = True translator.install(do_unicode) -def get_translator(domain, directory, languages=None, - translatorklass=Translator, fallback=False, - fallbackklass=NullTranslator): +def get_translator( + domain, + directory, + languages=None, + translatorklass=Translator, + fallback=False, + fallbackklass=NullTranslator, +): """Search the appropriate GNUTranslations class.""" - translator = gettext.translation(domain, localedir=directory, - languages=languages, class_=translatorklass, fallback=fallback) + translator = gettext.translation( + domain, + localedir=directory, + languages=languages, + class_=translatorklass, + fallback=fallback, + ) if not isinstance(translator, gettext.GNUTranslations) and fallbackklass: translator = fallbackklass() return translator @@ -175,6 +189,7 @@ lang_transis = { 'en': {'de': 'Englisch'}, } + def lang_name(lang): """Return full name of given language.""" return lang_names[lang] diff --git a/linkcheck/lc_cgi.py b/linkcheck/lc_cgi.py index d5202bef..6b6857d4 100644 --- a/linkcheck/lc_cgi.py +++ b/linkcheck/lc_cgi.py @@ -25,8 +25,15 @@ import re import time import urllib.parse -from . import configuration, strformat, checker, director, get_link_pat, \ - init_i18n, url as urlutil +from . import ( + configuration, + strformat, + checker, + director, + get_link_pat, + init_i18n, + url as urlutil, +) from .decorators import synchronized # 5 minutes timeout for requests @@ -67,17 +74,20 @@ lang_locale = { } _is_level = re.compile(r'^(0|1|2|3|-1)$').match + class LCFormError(Exception): """Form related errors.""" + pass def get_response_headers(): """Get list of response headers in key-value form.""" - return [("Content-type", "text/html"), - ("Cache-Control", "no-cache"), - ("Pragma:", "no-cache") - ] + return [ + ("Content-type", "text/html"), + ("Cache-Control", "no-cache"), + ("Pragma:", "no-cache"), + ] def formvalue(form, key): @@ -89,6 +99,8 @@ def formvalue(form, key): _lock = threading.Lock() + + class ThreadsafeIO: """Thread-safe unicode I/O class.""" @@ -235,7 +247,7 @@ def log(env, msg): def dump(env, form): """Log environment and form.""" for var, value in env.items(): - log(env, var+"="+value) + log(env, var + "=" + value) for key in form: log(env, str(formvalue(form, key))) @@ -247,7 +259,9 @@ def format_error(why): @return: HTML page content @rtype: unicode """ - return _(""" + return ( + _( + """ LinkChecker Online Error @@ -260,4 +274,7 @@ contains only these characters: A-Za-z0-9./_~-

Errors are logged. -""") % html.escape(why) +""" + ) + % html.escape(why) + ) diff --git a/linkcheck/loader.py b/linkcheck/loader.py index b137226a..9beaba8c 100644 --- a/linkcheck/loader.py +++ b/linkcheck/loader.py @@ -42,15 +42,17 @@ def get_package_modules(packagename): parentmodule = os.path.basename(os.path.dirname(__file__)) with zipfile.ZipFile(zipname, 'r') as f: prefix = "%s/%s/" % (parentmodule, packagename) - modnames = [os.path.splitext(n[len(prefix):])[0] - for n in f.namelist() - if n.startswith(prefix) and "__init__" not in n] + modnames = [ + os.path.splitext(n[len(prefix) :])[0] + for n in f.namelist() + if n.startswith(prefix) and "__init__" not in n + ] else: dirname = os.path.join(os.path.dirname(__file__), packagename) modnames = [x[:-3] for x in get_importable_files(dirname)] for modname in modnames: try: - name ="..%s.%s" % (packagename, modname) + name = "..%s.%s" % (packagename, modname) yield importlib.import_module(name, __name__) except ImportError as msg: print("WARN: could not load module %s: %s" % (modname, msg)) @@ -63,7 +65,7 @@ def get_folder_modules(folder, parentpackage): return for filename in get_importable_files(folder): fullname = os.path.join(folder, filename) - modname = parentpackage+"."+filename[:-3] + modname = parentpackage + "." + filename[:-3] try: yield imp.load_source(modname, fullname) except ImportError as msg: @@ -80,7 +82,10 @@ def get_importable_files(folder): if fname.endswith('.py') and not fname.startswith('_'): fullname = os.path.join(folder, fname) if check_writable_by_others(fullname): - print("ERROR: refuse to load module from world writable file %r" % fullname) + print( + "ERROR: refuse to load module from world writable file %r" + % fullname + ) else: yield fname diff --git a/linkcheck/lock.py b/linkcheck/lock.py index a52e2753..80963cf5 100644 --- a/linkcheck/lock.py +++ b/linkcheck/lock.py @@ -19,6 +19,7 @@ Locking utility class. import threading from . import log, LOG_THREAD + def get_lock(name, debug=False): """Get a new lock. @param debug: if True, acquire() and release() will have debug messages diff --git a/linkcheck/log.py b/linkcheck/log.py index cf0551b5..2f3a5977 100644 --- a/linkcheck/log.py +++ b/linkcheck/log.py @@ -24,11 +24,13 @@ import inspect import traceback # memory leak debugging -#import gc -#gc.enable() -#gc.set_debug(gc.DEBUG_LEAK) +# import gc +# gc.enable() +# gc.set_debug(gc.DEBUG_LEAK) PRINT_LOCALVARS = False + + def _stack_format(stack): """Format a stack trace to a message. diff --git a/linkcheck/logconf.py b/linkcheck/logconf.py index 266b6ae5..348615aa 100644 --- a/linkcheck/logconf.py +++ b/linkcheck/logconf.py @@ -41,20 +41,18 @@ lognamelist = ", ".join(repr(name) for name in lognames) # logging configuration configdict = { 'version': 1, - 'loggers': { - }, - 'root': { - 'level': 'WARN', - }, + 'loggers': {}, + 'root': {'level': 'WARN',}, 'incremental': True, } + def init_log_config(handler=None): """Set up the application logging (not to be confused with check loggers). """ for applog in lognames.values(): # propagate except for root app logger 'linkcheck' - propagate = (applog != LOG_ROOT) + propagate = applog != LOG_ROOT configdict['loggers'][applog] = dict(level='INFO', propagate=propagate) logging.config.dictConfig(configdict) @@ -86,8 +84,8 @@ def set_debug(loggers): """Set debugging log level.""" set_loglevel(loggers, logging.DEBUG) # enable for httplib debugging (used by requests.packages.urllib3) - #import httplib - #httplib.HTTPConnection.debuglevel = 1 + # import httplib + # httplib.HTTPConnection.debuglevel = 1 def set_loglevel(loggers, level): diff --git a/linkcheck/logger/__init__.py b/linkcheck/logger/__init__.py index ac1feab5..66e169fe 100644 --- a/linkcheck/logger/__init__.py +++ b/linkcheck/logger/__init__.py @@ -46,15 +46,7 @@ Fields = dict( ) del _ -ContentTypes = dict( - image=0, - text=0, - video=0, - audio=0, - application=0, - mail=0, - other=0, -) +ContentTypes = dict(image=0, text=0, video=0, audio=0, application=0, mail=0, other=0,) class LogStatistics: @@ -236,9 +228,13 @@ class _Logger(abc.ABC): self.close_fd = True except IOError: msg = sys.exc_info()[1] - log.warn(LOG_CHECK, - "Could not open file %r for writing: %s\n" - "Disabling log output of %s", self.filename, msg, self) + log.warn( + LOG_CHECK, + "Could not open file %r for writing: %s\n" "Disabling log output of %s", + self.filename, + msg, + self, + ) self.fd = dummy.Dummy() self.is_active = False self.filename = None @@ -246,10 +242,10 @@ class _Logger(abc.ABC): def create_fd(self): """Create open file descriptor.""" if self.filename is None: - return i18n.get_encoded_writer(encoding=self.output_encoding, - errors=self.codec_errors) - return codecs.open(self.filename, "wb", self.output_encoding, - self.codec_errors) + return i18n.get_encoded_writer( + encoding=self.output_encoding, errors=self.codec_errors + ) + return codecs.open(self.filename, "wb", self.output_encoding, self.codec_errors) def close_fileoutput(self): """ @@ -289,12 +285,14 @@ class _Logger(abc.ABC): """ Return wrapped version of given lines. """ - sep = os.linesep+os.linesep + sep = os.linesep + os.linesep text = sep.join(lines) - kwargs = dict(subsequent_indent=" "*self.max_indent, - initial_indent=" "*self.max_indent, - break_long_words=False, - break_on_hyphens=False) + kwargs = dict( + subsequent_indent=" " * self.max_indent, + initial_indent=" " * self.max_indent, + break_long_words=False, + break_on_hyphens=False, + ) return strformat.wrap(text, width, **kwargs).lstrip() def write(self, s, **args): @@ -311,9 +309,12 @@ class _Logger(abc.ABC): self.fd.write(s, **args) except IOError: msg = sys.exc_info()[1] - log.warn(LOG_CHECK, - "Could not write to output file: %s\n" - "Disabling log output of %s", msg, self) + log.warn( + LOG_CHECK, + "Could not write to output file: %s\n" "Disabling log output of %s", + msg, + self, + ) self.close_fileoutput() self.fd = dummy.Dummy() self.is_active = False @@ -356,9 +357,9 @@ class _Logger(abc.ABC): parts = self.logparts values = (self.part(x) for x in parts) # maximum indent for localized log part names - self.max_indent = max(len(x) for x in values)+1 + self.max_indent = max(len(x) for x in values) + 1 for key in parts: - numspaces = (self.max_indent - len(self.part(key))) + numspaces = self.max_indent - len(self.part(key)) self.logspaces[key] = " " * numspaces self.stats.reset() self.starttime = time.time() @@ -374,22 +375,29 @@ class _Logger(abc.ABC): def write_intro(self): """Write intro comments.""" - self.comment(_("created by %(app)s at %(time)s") % - {"app": configuration.AppName, - "time": strformat.strtime(self.starttime)}) - self.comment(_("Get the newest version at %(url)s") % - {'url': configuration.Url}) - self.comment(_("Write comments and bugs to %(url)s") % - {'url': configuration.SupportUrl}) + self.comment( + _("created by %(app)s at %(time)s") + % {"app": configuration.AppName, "time": strformat.strtime(self.starttime)} + ) + self.comment( + _("Get the newest version at %(url)s") % {'url': configuration.Url} + ) + self.comment( + _("Write comments and bugs to %(url)s") % {'url': configuration.SupportUrl} + ) self.check_date() def write_outro(self): """Write outro comments.""" self.stoptime = time.time() duration = self.stoptime - self.starttime - self.comment(_("Stopped checking at %(time)s (%(duration)s)") % - {"time": strformat.strtime(self.stoptime), - "duration": strformat.strduration_long(duration)}) + self.comment( + _("Stopped checking at %(time)s (%(duration)s)") + % { + "time": strformat.strtime(self.stoptime), + "duration": strformat.strduration_long(duration), + } + ) @abc.abstractmethod def log_url(self, url_data): @@ -445,9 +453,11 @@ class _Logger(abc.ABC): return modified.strftime("%Y-%m-%d{0}%H:%M:%S.%fZ".format(sep)) return "" + def _get_loggers(): """Return list of Logger classes.""" from .. import loader + modules = loader.get_package_modules('logger') return list(loader.get_plugins(modules, [_Logger])) diff --git a/linkcheck/logger/blacklist.py b/linkcheck/logger/blacklist.py index 7c4171c4..203ba36b 100644 --- a/linkcheck/logger/blacklist.py +++ b/linkcheck/logger/blacklist.py @@ -76,8 +76,9 @@ class BlacklistLogger(_Logger): """ Read a previously stored blacklist from file fd. """ - with codecs.open(self.filename, 'r', self.output_encoding, - self.codec_errors) as fd: + with codecs.open( + self.filename, 'r', self.output_encoding, self.codec_errors + ) as fd: for line in fd: line = line.rstrip() if line.startswith('#') or not line: diff --git a/linkcheck/logger/csvlog.py b/linkcheck/logger/csvlog.py index 60435226..6c7fcba7 100644 --- a/linkcheck/logger/csvlog.py +++ b/linkcheck/logger/csvlog.py @@ -24,9 +24,23 @@ from . import _Logger from .. import strformat Columns = ( - "urlname", "parentname", "baseref", "result", "warningstring", - "infostring", "valid", "url", "line", "column", "name", - "dltime", "size", "checktime", "cached", "level", "modified", + "urlname", + "parentname", + "baseref", + "result", + "warningstring", + "infostring", + "valid", + "url", + "line", + "column", + "name", + "dltime", + "size", + "checktime", + "cached", + "level", + "modified", ) @@ -70,9 +84,13 @@ class CSVLogger(_Logger): # write empty string to initialize file output self.write("") self.queue = StringIO() - self.writer = csv.writer(self.queue, dialect=self.dialect, - delimiter=self.separator, lineterminator=self.linesep, - quotechar=self.quotechar) + self.writer = csv.writer( + self.queue, + dialect=self.dialect, + delimiter=self.separator, + lineterminator=self.linesep, + quotechar=self.quotechar, + ) for s in Columns: if self.has_part(s): row.append(s) diff --git a/linkcheck/logger/customxml.py b/linkcheck/logger/customxml.py index e5cf9324..eccbbf8f 100644 --- a/linkcheck/logger/customxml.py +++ b/linkcheck/logger/customxml.py @@ -55,8 +55,7 @@ class CustomXMLLogger(xmllog._XMLLogger): 'line': "%s" % url_data.line, 'column': "%s" % url_data.column, } - self.xml_tag("parent", url_data.parent_url, - attrs=attrs) + self.xml_tag("parent", url_data.parent_url, attrs=attrs) if url_data.base_ref and self.has_part('base'): self.xml_tag("baseref", url_data.base_ref) if self.has_part("realurl"): diff --git a/linkcheck/logger/graph.py b/linkcheck/logger/graph.py index 42b41175..eef6c6e0 100644 --- a/linkcheck/logger/graph.py +++ b/linkcheck/logger/graph.py @@ -93,6 +93,7 @@ class _GraphLogger(_Logger): _disallowed = re.compile(r"[^a-zA-Z0-9 '#(){}\-\[\]\.,;:\!\?]+") + def quote(s): """Replace disallowed characters in node or edge labels. Also remove whitespace from beginning or end of label.""" diff --git a/linkcheck/logger/gxml.py b/linkcheck/logger/gxml.py index 793ed734..d66ed612 100644 --- a/linkcheck/logger/gxml.py +++ b/linkcheck/logger/gxml.py @@ -27,7 +27,7 @@ class GraphXMLLogger(_XMLLogger, _GraphLogger): LoggerName = 'gxml' - LoggerArgs = { + LoggerArgs = { "filename": "linkchecker-out.gxml", } diff --git a/linkcheck/logger/html.py b/linkcheck/logger/html.py index 47cc6197..ecba706e 100644 --- a/linkcheck/logger/html.py +++ b/linkcheck/logger/html.py @@ -28,8 +28,10 @@ from .. import strformat, configuration # ss=1 enables show source validate_html = "http://validator.w3.org/check?ss=1&uri=%(uri)s" # options are the default -validate_css = "http://jigsaw.w3.org/css-validator/validator?" \ - "uri=%(uri)s&warning=1&profile=css2&usermedium=all" +validate_css = ( + "http://jigsaw.w3.org/css-validator/validator?" + "uri=%(uri)s&warning=1&profile=css2&usermedium=all" +) HTML_HEADER = """ @@ -64,15 +66,15 @@ class HtmlLogger(_Logger): LoggerName = 'html' - LoggerArgs = { - "filename": "linkchecker-out.html", + LoggerArgs = { + "filename": "linkchecker-out.html", 'colorbackground': '#fff7e5', - 'colorurl': '#dcd5cf', - 'colorborder': '#000000', - 'colorlink': '#191c83', - 'colorwarning': '#e0954e', - 'colorerror': '#db4930', - 'colorok': '#3ba557', + 'colorurl': '#dcd5cf', + 'colorborder': '#000000', + 'colorlink': '#191c83', + 'colorwarning': '#e0954e', + 'colorerror': '#db4930', + 'colorok': '#3ba557', } def __init__(self, **kwargs): @@ -116,12 +118,16 @@ class HtmlLogger(_Logger): self.write(HTML_HEADER % header) self.comment("Generated by %s" % configuration.App) if self.has_part('intro'): - self.write("

"+configuration.App+ - "


"+ - configuration.Freeware+"

"+ - (_("Start checking at %s") % - strformat.strtime(self.starttime))+ - os.linesep+"
") + self.write( + "

" + + configuration.App + + "


" + + configuration.Freeware + + "

" + + (_("Start checking at %s") % strformat.strtime(self.starttime)) + + os.linesep + + "
" + ) self.check_date() self.flush() @@ -184,10 +190,15 @@ class HtmlLogger(_Logger): def write_parent(self, url_data): """Write url_data.parent_url.""" - self.write(""+self.part("parenturl")+ - ''+ - html.escape(url_data.parent_url)+"") + self.write( + "" + + self.part("parenturl") + + '' + + html.escape(url_data.parent_url) + + "" + ) if url_data.line is not None: self.write(_(", line %d") % url_data.line) if url_data.column is not None: @@ -199,58 +210,98 @@ class HtmlLogger(_Logger): vhtml = validate_html % {'uri': url_data.parent_url} vcss = validate_css % {'uri': url_data.parent_url} self.writeln() - self.writeln('(HTML)') - self.write('(CSS)') + self.writeln('(HTML)') + self.write('(CSS)') self.writeln("") def write_base(self, url_data): """Write url_data.base_ref.""" - self.writeln(""+self.part("base")+""+ - html.escape(url_data.base_ref)+"") + self.writeln( + "" + + self.part("base") + + "" + + html.escape(url_data.base_ref) + + "" + ) def write_real(self, url_data): """Write url_data.url.""" - self.writeln(""+self.part("realurl")+""+ - ''+html.escape(url_data.url)+"") + self.writeln( + "" + + self.part("realurl") + + "" + + '' + + html.escape(url_data.url) + + "" + ) def write_dltime(self, url_data): """Write url_data.dltime.""" - self.writeln(""+self.part("dltime")+""+ - (_("%.3f seconds") % url_data.dltime)+ - "") + self.writeln( + "" + + self.part("dltime") + + "" + + (_("%.3f seconds") % url_data.dltime) + + "" + ) def write_size(self, url_data): """Write url_data.size.""" - self.writeln(""+self.part("dlsize")+""+ - strformat.strsize(url_data.size)+ - "") + self.writeln( + "" + + self.part("dlsize") + + "" + + strformat.strsize(url_data.size) + + "" + ) def write_checktime(self, url_data): """Write url_data.checktime.""" - self.writeln(""+self.part("checktime")+""+ - (_("%.3f seconds") % url_data.checktime)+"") + self.writeln( + "" + + self.part("checktime") + + "" + + (_("%.3f seconds") % url_data.checktime) + + "" + ) def write_info(self, url_data): """Write url_data.info.""" - sep = "
"+os.linesep + sep = "
" + os.linesep text = sep.join(html.escape(x) for x in url_data.info) - self.writeln('' + self.part("info")+ - ""+text+"") + self.writeln( + '' + + self.part("info") + + "" + + text + + "" + ) def write_modified(self, url_data): """Write url_data.modified.""" text = html.escape(self.format_modified(url_data.modified)) - self.writeln('' + self.part("modified") + - ""+text+"") + self.writeln( + '' + + self.part("modified") + + "" + + text + + "" + ) def write_warning(self, url_data): """Write url_data.warnings.""" - sep = "
"+os.linesep + sep = "
" + os.linesep text = sep.join(html.escape(x[1]) for x in url_data.warnings) - self.writeln('' + self.part("warning") + - '' + text + "") + self.writeln( + '' + + self.part("warning") + + '' + + text + + "" + ) def write_result(self, url_data): """Write url_data.result.""" @@ -265,22 +316,30 @@ class HtmlLogger(_Logger): self.write('') self.write(html.escape(_("Error"))) if url_data.result: - self.write(": "+html.escape(url_data.result)) + self.write(": " + html.escape(url_data.result)) self.writeln("") def write_stats(self): """Write check statistic infos.""" self.writeln('
%s
' % _("Statistics")) if self.stats.number > 0: - self.writeln(_( - "Content types: %(image)d image, %(text)d text, %(video)d video, " - "%(audio)d audio, %(application)d application, %(mail)d mail" - " and %(other)d other.") % self.stats.link_types) + self.writeln( + _( + "Content types: %(image)d image, %(text)d text, %(video)d video, " + "%(audio)d audio, %(application)d application, %(mail)d mail" + " and %(other)d other." + ) + % self.stats.link_types + ) self.writeln("
") - self.writeln(_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.") % - dict(min=self.stats.min_url_length, - max=self.stats.max_url_length, - avg=self.stats.avg_url_length)) + self.writeln( + _("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.") + % dict( + min=self.stats.min_url_length, + max=self.stats.max_url_length, + avg=self.stats.avg_url_length, + ) + ) else: self.writeln(_("No statistics available since no URLs were checked.")) self.writeln("
") @@ -288,42 +347,77 @@ class HtmlLogger(_Logger): def write_outro(self): """Write end of check message.""" self.writeln("
") - self.write(_("That's it.")+" ") + self.write(_("That's it.") + " ") if self.stats.number >= 0: - self.write(_n("%d link checked.", "%d links checked.", - self.stats.number) % self.stats.number) + self.write( + _n("%d link checked.", "%d links checked.", self.stats.number) + % self.stats.number + ) self.write(" ") - self.write(_n("%d warning found", "%d warnings found", - self.stats.warnings_printed) % self.stats.warnings_printed) + self.write( + _n("%d warning found", "%d warnings found", self.stats.warnings_printed) + % self.stats.warnings_printed + ) if self.stats.warnings != self.stats.warnings_printed: - self.write(_(" (%d ignored or duplicates not printed)") % - (self.stats.warnings - self.stats.warnings_printed)) + self.write( + _(" (%d ignored or duplicates not printed)") + % (self.stats.warnings - self.stats.warnings_printed) + ) self.write(". ") - self.write(_n("%d error found", "%d errors found", - self.stats.errors_printed) % self.stats.errors_printed) + self.write( + _n("%d error found", "%d errors found", self.stats.errors_printed) + % self.stats.errors_printed + ) if self.stats.errors != self.stats.errors_printed: - self.write(_(" (%d duplicates not printed)") % - (self.stats.errors - self.stats.errors_printed)) + self.write( + _(" (%d duplicates not printed)") + % (self.stats.errors - self.stats.errors_printed) + ) self.writeln(".") self.writeln("
") num = self.stats.internal_errors if num: - self.write(_n("There was %(num)d internal error.", - "There were %(num)d internal errors.", num) % {"num": num}) + self.write( + _n( + "There was %(num)d internal error.", + "There were %(num)d internal errors.", + num, + ) + % {"num": num} + ) self.writeln("
") self.stoptime = time.time() duration = self.stoptime - self.starttime - self.writeln(_("Stopped checking at %(time)s (%(duration)s)") % - {"time": strformat.strtime(self.stoptime), - "duration": strformat.strduration_long(duration)}) - self.writeln('


'+ - configuration.HtmlAppInfo+"
") - self.writeln(_("Get the newest version at %s") % - (''+ - configuration.Url+".
")) - self.writeln(_("Write comments and bugs to %s") % - (''+ - configuration.SupportUrl+".
")) + self.writeln( + _("Stopped checking at %(time)s (%(duration)s)") + % { + "time": strformat.strtime(self.stoptime), + "duration": strformat.strduration_long(duration), + } + ) + self.writeln( + '


' + configuration.HtmlAppInfo + "
" + ) + self.writeln( + _("Get the newest version at %s") + % ( + '' + + configuration.Url + + ".
" + ) + ) + self.writeln( + _("Write comments and bugs to %s") + % ( + '' + + configuration.SupportUrl + + ".
" + ) + ) self.writeln("
") def end_output(self, **kwargs): diff --git a/linkcheck/logger/sitemapxml.py b/linkcheck/logger/sitemapxml.py index d73820c7..2ad67957 100644 --- a/linkcheck/logger/sitemapxml.py +++ b/linkcheck/logger/sitemapxml.py @@ -32,6 +32,7 @@ ChangeFreqs = ( HTTP_SCHEMES = ('http:', 'https:') HTML_TYPES = ('text/html', "application/xhtml+xml") + class SitemapXmlLogger(xmllog._XMLLogger): """Sitemap XML output according to http://www.sitemaps.org/protocol.html """ @@ -81,7 +82,11 @@ class SitemapXmlLogger(xmllog._XMLLogger): # initialize prefix and priority if self.prefix is None: if not url_data.url.startswith(HTTP_SCHEMES): - log.warn(LOG_CHECK, "Sitemap URL %r does not start with http: or https:.", url_data.url) + log.warn( + LOG_CHECK, + "Sitemap URL %r does not start with http: or https:.", + url_data.url, + ) self.disabled = True return self.prefix = url_data.url @@ -94,11 +99,13 @@ class SitemapXmlLogger(xmllog._XMLLogger): priority = 0.5 if self.priority is not None: priority = self.priority - # ignore the do_print flag and determine ourselves if we filter the url - if (url_data.valid + # ignore the do_print flag and determine ourselves if we filter the url + if ( + url_data.valid and url_data.url.startswith(HTTP_SCHEMES) and url_data.url.startswith(self.prefix) - and url_data.content_type in HTML_TYPES): + and url_data.content_type in HTML_TYPES + ): self.log_url(url_data, priority=priority) def log_url(self, url_data, priority=None): diff --git a/linkcheck/logger/sql.py b/linkcheck/logger/sql.py index c4c05c9e..05001937 100644 --- a/linkcheck/logger/sql.py +++ b/linkcheck/logger/sql.py @@ -87,47 +87,50 @@ class SQLLogger(_Logger): """ Store url check info into the database. """ - self.writeln("insert into %(table)s(urlname," - "parentname,baseref,valid,result,warning,info,url,line,col," - "name,checktime,dltime,size,cached,level,modified) values (" - "%(base_url)s," - "%(url_parent)s," - "%(base_ref)s," - "%(valid)d," - "%(result)s," - "%(warning)s," - "%(info)s," - "%(url)s," - "%(line)s," - "%(column)s," - "%(name)s," - "%(checktime)d," - "%(dltime)d," - "%(size)d," - "%(cached)d," - "%(level)d," - "%(modified)s" - ")%(separator)s" % - {'table': self.dbname, - 'base_url': sqlify(url_data.base_url), - 'url_parent': sqlify((url_data.parent_url)), - 'base_ref': sqlify((url_data.base_ref)), - 'valid': intify(url_data.valid), - 'result': sqlify(url_data.result), - 'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)), - 'info': sqlify(os.linesep.join(url_data.info)), - 'url': sqlify(urlutil.url_quote(url_data.url, encoding="utf-8")), - 'line': 'NULL' if url_data.line is None else url_data.line, - 'column': 'NULL' if url_data.column is None else url_data.column, - 'name': sqlify(url_data.name), - 'checktime': url_data.checktime, - 'dltime': url_data.dltime, - 'size': url_data.size, - 'cached': 0, - 'separator': self.separator, - "level": url_data.level, - "modified": sqlify(self.format_modified(url_data.modified)), - }) + self.writeln( + "insert into %(table)s(urlname," + "parentname,baseref,valid,result,warning,info,url,line,col," + "name,checktime,dltime,size,cached,level,modified) values (" + "%(base_url)s," + "%(url_parent)s," + "%(base_ref)s," + "%(valid)d," + "%(result)s," + "%(warning)s," + "%(info)s," + "%(url)s," + "%(line)s," + "%(column)s," + "%(name)s," + "%(checktime)d," + "%(dltime)d," + "%(size)d," + "%(cached)d," + "%(level)d," + "%(modified)s" + ")%(separator)s" + % { + 'table': self.dbname, + 'base_url': sqlify(url_data.base_url), + 'url_parent': sqlify((url_data.parent_url)), + 'base_ref': sqlify((url_data.base_ref)), + 'valid': intify(url_data.valid), + 'result': sqlify(url_data.result), + 'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)), + 'info': sqlify(os.linesep.join(url_data.info)), + 'url': sqlify(urlutil.url_quote(url_data.url, encoding="utf-8")), + 'line': 'NULL' if url_data.line is None else url_data.line, + 'column': 'NULL' if url_data.column is None else url_data.column, + 'name': sqlify(url_data.name), + 'checktime': url_data.checktime, + 'dltime': url_data.dltime, + 'size': url_data.size, + 'cached': 0, + 'separator': self.separator, + "level": url_data.level, + "modified": sqlify(self.format_modified(url_data.modified)), + } + ) self.flush() def end_output(self, **kwargs): diff --git a/linkcheck/logger/text.py b/linkcheck/logger/text.py index 12ba7009..7bb067a0 100644 --- a/linkcheck/logger/text.py +++ b/linkcheck/logger/text.py @@ -38,18 +38,18 @@ class TextLogger(_Logger): LoggerArgs = { "filename": "linkchecker-out.txt", - 'colorparent': "default", - 'colorurl': "default", - 'colorname': "default", - 'colorreal': "cyan", - 'colorbase': "purple", - 'colorvalid': "bold;green", + 'colorparent': "default", + 'colorurl': "default", + 'colorname': "default", + 'colorreal': "cyan", + 'colorbase': "purple", + 'colorvalid': "bold;green", 'colorinvalid': "bold;red", - 'colorinfo': "default", + 'colorinfo': "default", 'colorwarning': "bold;yellow", - 'colordltime': "default", - 'colordlsize': "default", - 'colorreset': "default", + 'colordltime': "default", + 'colordlsize': "default", + 'colorreset': "default", } def __init__(self, **kwargs): @@ -95,14 +95,15 @@ class TextLogger(_Logger): """Log introduction text.""" self.writeln(configuration.AppInfo) self.writeln(configuration.Freeware) - self.writeln(_("Get the newest version at %(url)s") % - {'url': configuration.Url}) - self.writeln(_("Write comments and bugs to %(url)s") % - {'url': configuration.SupportUrl}) + self.writeln( + _("Get the newest version at %(url)s") % {'url': configuration.Url} + ) + self.writeln( + _("Write comments and bugs to %(url)s") % {'url': configuration.SupportUrl} + ) self.check_date() self.writeln() - self.writeln(_("Start checking at %s") % - strformat.strtime(self.starttime)) + self.writeln(_("Start checking at %s") % strformat.strtime(self.starttime)) def log_url(self, url_data): """Write url checking info.""" @@ -175,20 +176,17 @@ class TextLogger(_Logger): def write_dltime(self, url_data): """Write url_data.dltime.""" self.write(self.part("dltime") + self.spaces("dltime")) - self.writeln(_("%.3f seconds") % url_data.dltime, - color=self.colordltime) + self.writeln(_("%.3f seconds") % url_data.dltime, color=self.colordltime) def write_size(self, url_data): """Write url_data.size.""" self.write(self.part("dlsize") + self.spaces("dlsize")) - self.writeln(strformat.strsize(url_data.size), - color=self.colordlsize) + self.writeln(strformat.strsize(url_data.size), color=self.colordlsize) def write_checktime(self, url_data): """Write url_data.checktime.""" self.write(self.part("checktime") + self.spaces("checktime")) - self.writeln(_("%.3f seconds") % url_data.checktime, - color=self.colordltime) + self.writeln(_("%.3f seconds") % url_data.checktime, color=self.colordltime) def write_info(self, url_data): """Write url_data.info.""" @@ -225,60 +223,88 @@ class TextLogger(_Logger): if interrupt: self.writeln(_("The check has been interrupted; results are not complete.")) self.write(_("That's it.") + " ") - self.write(_n("%d link", "%d links", - self.stats.number) % self.stats.number) + self.write(_n("%d link", "%d links", self.stats.number) % self.stats.number) self.write(" ") if self.stats.num_urls is not None: - self.write(_n("in %d URL", "in %d URLs", - self.stats.num_urls) % self.stats.num_urls) + self.write( + _n("in %d URL", "in %d URLs", self.stats.num_urls) % self.stats.num_urls + ) self.write(" checked. ") - warning_text = _n("%d warning found", "%d warnings found", - self.stats.warnings_printed) % self.stats.warnings_printed + warning_text = ( + _n("%d warning found", "%d warnings found", self.stats.warnings_printed) + % self.stats.warnings_printed + ) if self.stats.warnings_printed: warning_color = self.colorwarning else: warning_color = self.colorinfo self.write(warning_text, color=warning_color) if self.stats.warnings != self.stats.warnings_printed: - self.write(_(" (%d ignored or duplicates not printed)") % - (self.stats.warnings - self.stats.warnings_printed)) + self.write( + _(" (%d ignored or duplicates not printed)") + % (self.stats.warnings - self.stats.warnings_printed) + ) self.write(". ") - error_text = _n("%d error found", "%d errors found", - self.stats.errors_printed) % self.stats.errors_printed + error_text = ( + _n("%d error found", "%d errors found", self.stats.errors_printed) + % self.stats.errors_printed + ) if self.stats.errors_printed: error_color = self.colorinvalid else: error_color = self.colorvalid self.write(error_text, color=error_color) if self.stats.errors != self.stats.errors_printed: - self.write(_(" (%d duplicates not printed)") % - (self.stats.errors - self.stats.errors_printed)) + self.write( + _(" (%d duplicates not printed)") + % (self.stats.errors - self.stats.errors_printed) + ) self.writeln(".") num = self.stats.internal_errors if num: - self.writeln(_n("There was %(num)d internal error.", - "There were %(num)d internal errors.", num) % {"num": num}) + self.writeln( + _n( + "There was %(num)d internal error.", + "There were %(num)d internal errors.", + num, + ) + % {"num": num} + ) self.stoptime = time.time() duration = self.stoptime - self.starttime - self.writeln(_("Stopped checking at %(time)s (%(duration)s)") % - {"time": strformat.strtime(self.stoptime), - "duration": strformat.strduration_long(duration)}) + self.writeln( + _("Stopped checking at %(time)s (%(duration)s)") + % { + "time": strformat.strtime(self.stoptime), + "duration": strformat.strduration_long(duration), + } + ) def write_stats(self): """Write check statistic info.""" self.writeln() self.writeln(_("Statistics:")) if self.stats.downloaded_bytes is not None: - self.writeln(_("Downloaded: %s.") % strformat.strsize(self.stats.downloaded_bytes)) + self.writeln( + _("Downloaded: %s.") % strformat.strsize(self.stats.downloaded_bytes) + ) if self.stats.number > 0: - self.writeln(_( - "Content types: %(image)d image, %(text)d text, %(video)d video, " - "%(audio)d audio, %(application)d application, %(mail)d mail" - " and %(other)d other.") % self.stats.link_types) - self.writeln(_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.") % - dict(min=self.stats.min_url_length, - max=self.stats.max_url_length, - avg=self.stats.avg_url_length)) + self.writeln( + _( + "Content types: %(image)d image, %(text)d text, %(video)d video, " + "%(audio)d audio, %(application)d application, %(mail)d mail" + " and %(other)d other." + ) + % self.stats.link_types + ) + self.writeln( + _("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.") + % dict( + min=self.stats.min_url_length, + max=self.stats.max_url_length, + avg=self.stats.avg_url_length, + ) + ) else: self.writeln(_("No statistics available since no URLs were checked.")) diff --git a/linkcheck/logger/xmllog.py b/linkcheck/logger/xmllog.py index 41401216..16a8292a 100644 --- a/linkcheck/logger/xmllog.py +++ b/linkcheck/logger/xmllog.py @@ -66,8 +66,10 @@ class _XMLLogger(_Logger): """ Write start of checking info as xml comment. """ - self.writeln('' % - xmlquoteattr(self.get_charset_encoding())) + self.writeln( + '' + % xmlquoteattr(self.get_charset_encoding()) + ) if self.has_part("intro"): self.write_intro() self.writeln() @@ -83,7 +85,7 @@ class _XMLLogger(_Logger): """ Write XML start tag. """ - self.write(self.indent*self.level) + self.write(self.indent * self.level) self.write("<%s" % xmlquote(name)) if attrs: for name, value in attrs.items(): @@ -98,14 +100,14 @@ class _XMLLogger(_Logger): """ self.level -= 1 assert self.level >= 0 - self.write(self.indent*self.level) + self.write(self.indent * self.level) self.writeln("" % xmlquote(name)) def xml_tag(self, name, content, attrs=None): """ Write XML tag with content. """ - self.write(self.indent*self.level) + self.write(self.indent * self.level) self.write("<%s" % xmlquote(name)) if attrs: for aname, avalue in attrs.items(): diff --git a/linkcheck/memoryutil.py b/linkcheck/memoryutil.py index 96d6bade..e86bb588 100644 --- a/linkcheck/memoryutil.py +++ b/linkcheck/memoryutil.py @@ -22,9 +22,9 @@ from . import strformat, log, LOG_CHECK from .fileutil import get_temp_file # Message to display when meliae package is not installed -MemoryDebugMsg = strformat.format_feature_warning(module='meliae', - feature='memory debugging', - url='https://launchpad.net/meliae') +MemoryDebugMsg = strformat.format_feature_warning( + module='meliae', feature='memory debugging', url='https://launchpad.net/meliae' +) def write_memory_dump(): @@ -37,10 +37,10 @@ def write_memory_dump(): if gc.garbage: log.warn(LOG_CHECK, "Unreachabe objects: %s", pprint.pformat(gc.garbage)) from meliae import scanner + fo, filename = get_temp_file(mode='wb', suffix='.json', prefix='lcdump_') try: scanner.dump_all_objects(fo) finally: fo.close() return filename - diff --git a/linkcheck/mimeutil.py b/linkcheck/mimeutil.py index e0dcec0b..a94b204c 100644 --- a/linkcheck/mimeutil.py +++ b/linkcheck/mimeutil.py @@ -26,6 +26,7 @@ from .logconf import LOG_CHECK mimedb = None + def init_mimedb(): """Initialize the local MIME database.""" global mimedb @@ -59,6 +60,7 @@ PARSE_CONTENTS = { "application/xml+sitemap": re.compile(r'<\?xml[^<]+\s]+)>', re.I), - re.compile(r""" + _link_res = [ + re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I), + re.compile( + r""" \[.+\]: # id [ \t]*\n? # maybe *one* newline [ \t]* @@ -54,20 +56,26 @@ class MarkdownCheck(_ContentPlugin): [ \t]* )? # title is optional (?:\n+|\Z) - """, re.X | re.M | re.U)] + """, + re.X | re.M | re.U, + ), + ] _whitespace = re.compile(r'\s*') _strip_anglebrackets = re.compile(r'<(.*)>.*') - _inline_link_title = re.compile(r''' + _inline_link_title = re.compile( + r''' ( # \1 [ \t]+ (['"]) # quote char (.*?) )? # title is optional \)$ - ''', re.X | re.S) + ''', + re.X | re.S, + ) def __init__(self, config): super(MarkdownCheck, self).__init__(config) @@ -83,8 +91,11 @@ class MarkdownCheck(_ContentPlugin): def read_config(cls, configparser): """Read configuration file options.""" config = dict() - config[cls._filename_re_key] = configparser.get(cls.__name__, cls._filename_re_key) \ - if configparser.has_option(cls.__name__, cls._filename_re_key) else None + config[cls._filename_re_key] = ( + configparser.get(cls.__name__, cls._filename_re_key) + if configparser.has_option(cls.__name__, cls._filename_re_key) + else None + ) return config def applies_to(self, url_data, pagetype=None): @@ -107,7 +118,9 @@ class MarkdownCheck(_ContentPlugin): """ line = content.count('\n', 0, url_pos) + 1 column = url_pos - content.rfind('\n', 0, url_pos) - url_data.add_url(url_text.translate(str.maketrans("", "", '\n ')), line=line, column=column) + url_data.add_url( + url_text.translate(str.maketrans("", "", '\n ')), line=line, column=column + ) def _check_by_re(self, url_data, content): """ Finds urls by re. @@ -144,12 +157,12 @@ class MarkdownCheck(_ContentPlugin): end_idx = idx has_anglebrackets = text[idx] == "<" if has_anglebrackets: - end_idx = self._find_balanced(text, end_idx+1, "<", ">") + end_idx = self._find_balanced(text, end_idx + 1, "<", ">") end_idx = self._find_balanced(text, end_idx, "(", ")") match = self._inline_link_title.search(text, idx, end_idx) if not match: return None, None - url = text[idx:match.start()] + url = text[idx : match.start()] if has_anglebrackets: url = self._strip_anglebrackets.sub(r'\1', url) return url, end_idx @@ -175,7 +188,9 @@ class MarkdownCheck(_ContentPlugin): # Find the matching closing ']'. bracket_depth = 0 - for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, content_length)): + for p in range( + start_idx + 1, min(start_idx + MAX_LINK_TEXT_SENTINEL, content_length) + ): if content[p] == ']': bracket_depth -= 1 if bracket_depth < 0: diff --git a/linkcheck/plugins/parsepdf.py b/linkcheck/plugins/parsepdf.py index 651c62c7..cb56ca74 100755 --- a/linkcheck/plugins/parsepdf.py +++ b/linkcheck/plugins/parsepdf.py @@ -19,6 +19,7 @@ Parse links in PDF files with pdfminer. from io import BytesIO from . import _ParserPlugin + try: from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument @@ -32,7 +33,6 @@ else: from .. import log, LOG_PLUGIN, strformat - def search_url(obj, url_data, pageno, seen_objs): """Recurse through a PDF object, searching for URLs.""" if isinstance(obj, PDFObjRef): diff --git a/linkcheck/plugins/parseword.py b/linkcheck/plugins/parseword.py index 6a8f63a0..00aabed3 100755 --- a/linkcheck/plugins/parseword.py +++ b/linkcheck/plugins/parseword.py @@ -17,9 +17,11 @@ Parse hyperlinks in Word files. """ from . import _ParserPlugin + try: import win32com import pythoncom + has_win32com = True Error = pythoncom.com_error except ImportError: @@ -29,14 +31,17 @@ from .. import fileutil, log, LOG_PLUGIN _initialized = False + + def init_win32com(): """Initialize the win32com.client cache.""" global _initialized if _initialized: return import win32com.client + if win32com.client.gencache.is_readonly: - #allow gencache to create the cached wrapper objects + # allow gencache to create the cached wrapper objects win32com.client.gencache.is_readonly = False # under py2exe the call in gencache to __init__() does not happen # so we use Rebuild() to force the creation of the gen_py folder @@ -79,6 +84,7 @@ def get_word_app(): # the COM layer. pythoncom.CoInitialize() import win32com.client + app = win32com.client.gencache.EnsureDispatch("Word.Application") app.Visible = False return app @@ -91,8 +97,13 @@ def close_word_app(app): def open_wordfile(app, filename): """Open given Word file with application object.""" - return app.Documents.Open(filename, ReadOnly=True, - AddToRecentFiles=False, Visible=False, NoEncodingDialog=True) + return app.Documents.Open( + filename, + ReadOnly=True, + AddToRecentFiles=False, + Visible=False, + NoEncodingDialog=True, + ) def close_wordfile(doc): @@ -128,7 +139,7 @@ class WordParser(_ParserPlugin): try: for link in doc.Hyperlinks: line = get_line_number(link.Range) - name=link.TextToDisplay + name = link.TextToDisplay url_data.add_url(link.Address, name=name, line=line) finally: close_wordfile(doc) @@ -158,11 +169,9 @@ def get_line_number(doc, wrange): def get_temp_filename(content): """Get temporary filename for content to parse.""" # store content in temporary file - fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc', - prefix='lc_') + fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc', prefix='lc_') try: fd.write(content) finally: fd.close() return filename - diff --git a/linkcheck/plugins/sslcertcheck.py b/linkcheck/plugins/sslcertcheck.py index 5af82f45..f1c2f39e 100644 --- a/linkcheck/plugins/sslcertcheck.py +++ b/linkcheck/plugins/sslcertcheck.py @@ -27,6 +27,7 @@ _lock = threading.Lock() # configuration option names sslcertwarndays = "sslcertwarndays" + class SslCertificateCheck(_ConnectionPlugin): """Check SSL certificate expiration date. Only internal https: links will be checked. A domain will only be checked once to avoid duplicate @@ -37,14 +38,20 @@ class SslCertificateCheck(_ConnectionPlugin): def __init__(self, config): """Initialize clamav configuration.""" super(SslCertificateCheck, self).__init__(config) - self.warn_ssl_cert_secs_valid = config[sslcertwarndays] * strformat.SECONDS_PER_DAY + self.warn_ssl_cert_secs_valid = ( + config[sslcertwarndays] * strformat.SECONDS_PER_DAY + ) # do not check hosts multiple times self.checked_hosts = set() def applies_to(self, url_data): """Check validity, scheme, extern and url_connection.""" - return url_data.valid and url_data.scheme == 'https' and \ - not url_data.extern[0] and url_data.url_connection is not None + return ( + url_data.valid + and url_data.scheme == 'https' + and not url_data.extern[0] + and url_data.url_connection is not None + ) @synchronized(_lock) def check(self, url_data): @@ -71,6 +78,7 @@ class SslCertificateCheck(_ConnectionPlugin): if it's at least a number of days valid. """ import ssl + try: notAfter = ssl.cert_time_to_seconds(cert['notAfter']) except ValueError as msg: @@ -88,7 +96,9 @@ class SslCertificateCheck(_ConnectionPlugin): else: args['valid'] = strformat.strduration_long(secondsValid) if secondsValid < self.warn_ssl_cert_secs_valid: - msg = _('SSL certificate expires on %(expire)s and is only %(valid)s valid.') + msg = _( + 'SSL certificate expires on %(expire)s and is only %(valid)s valid.' + ) url_data.add_warning(msg % args) else: msg = _('SSL certificate expires on %(expire)s and is %(valid)s valid.') @@ -105,7 +115,11 @@ class SslCertificateCheck(_ConnectionPlugin): if num > 0: config[option] = num else: - msg = _("invalid value for %s: %d must not be less than %d") % (option, num, 0) + msg = _("invalid value for %s: %d must not be less than %d") % ( + option, + num, + 0, + ) raise LinkCheckerError(msg) else: # set the default diff --git a/linkcheck/plugins/syntaxchecks.py b/linkcheck/plugins/syntaxchecks.py index 0af3ecfa..ad41b9b9 100644 --- a/linkcheck/plugins/syntaxchecks.py +++ b/linkcheck/plugins/syntaxchecks.py @@ -47,6 +47,7 @@ class HtmlSyntaxCheck(_ContentPlugin): """Check the syntax of HTML pages with the online W3C HTML validator. See http://validator.w3.org/docs/api.html. """ + def __init__(self, config): """Initialize plugin.""" super(HtmlSyntaxCheck, self).__init__(config) @@ -69,9 +70,11 @@ class HtmlSyntaxCheck(_ContentPlugin): return check_w3_errors(url_data, response.text, "W3C HTML") except requests.exceptions.RequestException: - pass # ignore service failures + pass # ignore service failures except Exception as msg: - log.warn(LOG_PLUGIN, _("HTML syntax check plugin error: %(msg)s ") % {"msg": msg}) + log.warn( + LOG_PLUGIN, _("HTML syntax check plugin error: %(msg)s ") % {"msg": msg} + ) class CssSyntaxCheck(_ContentPlugin): @@ -106,9 +109,11 @@ class CssSyntaxCheck(_ContentPlugin): return check_w3_errors(url_data, response.text, "W3C HTML") except requests.exceptions.RequestException: - pass # ignore service failures + pass # ignore service failures except Exception as msg: - log.warn(LOG_PLUGIN, _("CSS syntax check plugin error: %(msg)s ") % {"msg": msg}) + log.warn( + LOG_PLUGIN, _("CSS syntax check plugin error: %(msg)s ") % {"msg": msg} + ) def check_w3_errors(url_data, xml, w3type): @@ -116,7 +121,9 @@ def check_w3_errors(url_data, xml, w3type): w3type is either "W3C HTML" or "W3C CSS".""" dom = parseString(xml) for error in dom.getElementsByTagName('m:error'): - warnmsg = _("%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s") + warnmsg = _( + "%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s" + ) attrs = { "w3type": w3type, "line": getXmlText(error, "m:line"), diff --git a/linkcheck/plugins/viruscheck.py b/linkcheck/plugins/viruscheck.py index d5623eca..e63b358e 100644 --- a/linkcheck/plugins/viruscheck.py +++ b/linkcheck/plugins/viruscheck.py @@ -67,6 +67,7 @@ class VirusCheck(_ContentPlugin): class ClamavError(Exception): """Raised on clamav errors.""" + pass @@ -78,8 +79,7 @@ class ClamdScanner: self.infected = [] self.errors = [] self.sock, self.host = clamav_conf.new_connection() - self.sock_rcvbuf = \ - self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF) + self.sock_rcvbuf = self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF) self.wsock = self.new_scansock() def new_scansock(self): @@ -92,7 +92,7 @@ class ClamdScanner: data = self.sock.recv(self.sock_rcvbuf) i = data.find(b"PORT") if i != -1: - port = int(data[i+5:]) + port = int(data[i + 5 :]) break except socket.error: self.sock.close() @@ -159,7 +159,9 @@ class ClamavConfig(dict): if self.get('ScannerDaemonOutputFormat'): raise ClamavError(_("ScannerDaemonOutputFormat must be disabled")) if self.get('TCPSocket') and self.get('LocalSocket'): - raise ClamavError(_("only one of TCPSocket and LocalSocket must be enabled")) + raise ClamavError( + _("only one of TCPSocket and LocalSocket must be enabled") + ) def parseconf(self, filename): """Parse clamav configuration from given file.""" diff --git a/linkcheck/robotparser2.py b/linkcheck/robotparser2.py index 60a3aac2..f54213cb 100644 --- a/linkcheck/robotparser2.py +++ b/linkcheck/robotparser2.py @@ -35,8 +35,7 @@ class RobotFileParser: """This class provides a set of methods to read, parse and answer questions about a single robots.txt file.""" - def __init__(self, url='', session=None, proxies=None, auth=None, - timeout=None): + def __init__(self, url='', session=None, proxies=None, auth=None, timeout=None): """Initialize internal entry lists and store given url and credentials.""" self.set_url(url) @@ -85,7 +84,7 @@ class RobotFileParser: """Read the robots.txt URL and feeds it to the parser.""" self._reset() kwargs = dict( - headers = { + headers={ 'User-Agent': configuration.UserAgent, 'Accept-Encoding': ACCEPT_ENCODING, } @@ -109,7 +108,12 @@ class RobotFileParser: except requests.HTTPError as x: if x.response.status_code in (401, 403): self.disallow_all = True - log.debug(LOG_CHECK, "%r disallow all (code %d)", self.url, x.response.status_code) + log.debug( + LOG_CHECK, + "%r disallow all (code %d)", + self.url, + x.response.status_code, + ) else: self.allow_all = True log.debug(LOG_CHECK, "%r allow all (HTTP error)", self.url) @@ -148,7 +152,12 @@ class RobotFileParser: linenumber += 1 if not line: if state == 1: - log.debug(LOG_CHECK, "%r line %d: allow or disallow directives without any user-agent line", self.url, linenumber) + log.debug( + LOG_CHECK, + "%r line %d: allow or disallow directives without any user-agent line", + self.url, + linenumber, + ) entry = Entry() state = 0 elif state == 2: @@ -168,35 +177,61 @@ class RobotFileParser: line[1] = urllib.parse.unquote(line[1].strip(), self.encoding) if line[0] == "user-agent": if state == 2: - log.debug(LOG_CHECK, "%r line %d: missing blank line before user-agent directive", self.url, linenumber) + log.debug( + LOG_CHECK, + "%r line %d: missing blank line before user-agent directive", + self.url, + linenumber, + ) self._add_entry(entry) entry = Entry() entry.useragents.append(line[1]) state = 1 elif line[0] == "disallow": if state == 0: - log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber) + log.debug( + LOG_CHECK, + "%r line %d: missing user-agent directive before this line", + self.url, + linenumber, + ) pass else: entry.rulelines.append(RuleLine(line[1], False)) state = 2 elif line[0] == "allow": if state == 0: - log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber) + log.debug( + LOG_CHECK, + "%r line %d: missing user-agent directive before this line", + self.url, + linenumber, + ) pass else: entry.rulelines.append(RuleLine(line[1], True)) state = 2 elif line[0] == "crawl-delay": if state == 0: - log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber) + log.debug( + LOG_CHECK, + "%r line %d: missing user-agent directive before this line", + self.url, + linenumber, + ) pass else: try: entry.crawldelay = max(0, int(line[1])) state = 2 except (ValueError, OverflowError): - log.debug(LOG_CHECK, "%r line %d: invalid delay number %r", self.url, linenumber, line[1]) + log.debug( + LOG_CHECK, + "%r line %d: invalid delay number %r", + self.url, + linenumber, + line[1], + ) pass elif line[0] == "sitemap": # Note that sitemap URLs must be absolute according to @@ -204,10 +239,22 @@ class RobotFileParser: # But this should be checked by the calling layer. self.sitemap_urls.append((line[1], linenumber)) else: - log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0]) + log.debug( + LOG_CHECK, + "%r line %d: unknown key %r", + self.url, + linenumber, + line[0], + ) pass else: - log.debug(LOG_CHECK, "%r line %d: malformed line %r", self.url, linenumber, line) + log.debug( + LOG_CHECK, + "%r line %d: malformed line %r", + self.url, + linenumber, + line, + ) pass if state in (1, 2): self.entries.append(entry) @@ -220,7 +267,13 @@ class RobotFileParser: @return: True if agent can fetch url, else False @rtype: bool """ - log.debug(LOG_CHECK, "%r check allowance for:\n user agent: %r\n url: %r ...", self.url, useragent, url) + log.debug( + LOG_CHECK, + "%r check allowance for:\n user agent: %r\n url: %r ...", + self.url, + useragent, + url, + ) if not isinstance(useragent, str): useragent = useragent.encode("ascii", "ignore") if not isinstance(url, str): @@ -233,7 +286,10 @@ class RobotFileParser: return True # search for given user agent matches # the first match counts - url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/" + url = ( + urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) + or "/" + ) for entry in self.entries: if entry.applies_to(useragent): return entry.allowance(url) @@ -296,7 +352,7 @@ class RuleLine: @return: robots.txt format @rtype: string """ - return ("Allow" if self.allowance else "Disallow")+": "+self.path + return ("Allow" if self.allowance else "Disallow") + ": " + self.path class Entry: @@ -352,5 +408,10 @@ class Entry: if line.applies_to(filename): log.debug(LOG_CHECK, " ... rule line %s", line) return line.allowance - log.debug(LOG_CHECK, " ... no rule lines of %s applied to %s; allowed.", self.useragents, filename) + log.debug( + LOG_CHECK, + " ... no rule lines of %s applied to %s; allowed.", + self.useragents, + filename, + ) return True diff --git a/linkcheck/strformat.py b/linkcheck/strformat.py index e4c55f33..02d12582 100644 --- a/linkcheck/strformat.py +++ b/linkcheck/strformat.py @@ -123,6 +123,7 @@ _para_posix = r"(?:%(sep)s)(?:(?:%(sep)s)\s*)+" % {'sep': '\n'} _para_win = r"(?:%(sep)s)(?:(?:%(sep)s)\s*)+" % {'sep': '\r\n'} _para_ro = re.compile("%s|%s|%s" % (_para_mac, _para_posix, _para_win)) + def get_paragraphs(text): """A new paragraph is considered to start at a line which follows one or more blank lines (lines containing nothing or just spaces). @@ -148,8 +149,7 @@ def wrap(text, width, **kwargs): def indent(text, indent_string=" "): """Indent each line of text with the given indent string.""" - return os.linesep.join("%s%s" % (indent_string, x) - for x in text.splitlines()) + return os.linesep.join("%s%s" % (indent_string, x) for x in text.splitlines()) def get_line_number(s, index): @@ -173,11 +173,12 @@ def paginate(text): _markup_re = re.compile("<.*?>", re.DOTALL) + def remove_markup(s): """Remove all <*> html markup tags from s.""" mo = _markup_re.search(s) while mo: - s = s[0:mo.start()] + s[mo.end():] + s = s[0 : mo.start()] + s[mo.end() :] mo = _markup_re.search(s) return s @@ -194,12 +195,20 @@ def strsize(b, grouping=True): if b < 1024 * 1024: return "%sKB" % locale.format_string("%.2f", (float(b) / 1024), grouping) if b < 1024 * 1024 * 10: - return "%sMB" % locale.format_string("%.2f", (float(b) / (1024*1024)), grouping) + return "%sMB" % locale.format_string( + "%.2f", (float(b) / (1024 * 1024)), grouping + ) if b < 1024 * 1024 * 1024: - return "%sMB" % locale.format_string("%.1f", (float(b) / (1024*1024)), grouping) + return "%sMB" % locale.format_string( + "%.1f", (float(b) / (1024 * 1024)), grouping + ) if b < 1024 * 1024 * 1024 * 10: - return "%sGB" % locale.format_string("%.2f", (float(b) / (1024*1024*1024)), grouping) - return "%sGB" % locale.format_string("%.1f", (float(b) / (1024*1024*1024)), grouping) + return "%sGB" % locale.format_string( + "%.2f", (float(b) / (1024 * 1024 * 1024)), grouping + ) + return "%sGB" % locale.format_string( + "%.1f", (float(b) / (1024 * 1024 * 1024)), grouping + ) def strtime(t, func=time.localtime): @@ -216,15 +225,21 @@ def strduration(duration): else: prefix = "" duration = math.ceil(duration) - if duration >= SECONDS_PER_HOUR: # 1 hour + if duration >= SECONDS_PER_HOUR: # 1 hour # time, in hours:minutes:seconds - return "%s%02d:%02d:%02d" % (prefix, duration // SECONDS_PER_HOUR, - (duration % SECONDS_PER_HOUR) // SECONDS_PER_MINUTE, - duration % SECONDS_PER_MINUTE) + return "%s%02d:%02d:%02d" % ( + prefix, + duration // SECONDS_PER_HOUR, + (duration % SECONDS_PER_HOUR) // SECONDS_PER_MINUTE, + duration % SECONDS_PER_MINUTE, + ) else: # time, in minutes:seconds - return "%s%02d:%02d" % (prefix, duration // SECONDS_PER_MINUTE, - duration % SECONDS_PER_MINUTE) + return "%s%02d:%02d" % ( + prefix, + duration // SECONDS_PER_MINUTE, + duration % SECONDS_PER_MINUTE, + ) # from quodlibet @@ -236,15 +251,17 @@ def strduration_long(duration, do_translate=True): else: # do not translate _ = lambda x: x - _n = lambda a, b, n: a if n==1 else b + _n = lambda a, b, n: a if n == 1 else b if duration < 0: duration = abs(duration) prefix = "-" else: prefix = "" if duration < 1: - return _("%(prefix)s%(duration).02f seconds") % \ - {"prefix": prefix, "duration": duration} + return _("%(prefix)s%(duration).02f seconds") % { + "prefix": prefix, + "duration": duration, + } # translation dummies _n("%d second", "%d seconds", 1) _n("%d minute", "%d minutes", 1) @@ -281,7 +298,7 @@ def strtimezone(): zone = time.altzone else: zone = time.timezone - return "%+04d" % (-zone//SECONDS_PER_HOUR) + return "%+04d" % (-zone // SECONDS_PER_HOUR) def stripurl(s): @@ -319,7 +336,12 @@ def format_feature_warning(**kwargs): """Format warning that a module could not be imported and that it should be installed for a certain URL. """ - return _("Could not import %(module)s for %(feature)s. Install %(module)s from %(url)s to use this feature.") % kwargs + return ( + _( + "Could not import %(module)s for %(feature)s. Install %(module)s from %(url)s to use this feature." + ) + % kwargs + ) def strip_control_chars(text): diff --git a/linkcheck/trace.py b/linkcheck/trace.py index a5370240..9fc4f2d4 100644 --- a/linkcheck/trace.py +++ b/linkcheck/trace.py @@ -48,7 +48,7 @@ def _trace(frame, event, arg): elif event in ('return', 'c_return'): _trace_line(frame, event, arg) print(" return:", arg) - #elif event in ('exception', 'c_exception'): + # elif event in ('exception', 'c_exception'): # _trace_line(frame, event, arg) return _trace diff --git a/linkcheck/updater.py b/linkcheck/updater.py index d9bbdeb7..d35dbfc5 100644 --- a/linkcheck/updater.py +++ b/linkcheck/updater.py @@ -24,7 +24,9 @@ from distutils.version import LooseVersion # Use the Freecode submit file as source since that file gets updated # only when releasing a new version. -UPDATE_URL = "https://raw.github.com/linkchecker/linkchecker/master/linkchecker.freecode" +UPDATE_URL = ( + "https://raw.github.com/linkchecker/linkchecker/master/linkchecker.freecode" +) VERSION_TAG = 'Version:' if os.name == 'nt': URL_TAG = 'Windows-installer-URL:' diff --git a/linkcheck/url.py b/linkcheck/url.py index f5529981..37bd4bba 100644 --- a/linkcheck/url.py +++ b/linkcheck/url.py @@ -60,20 +60,23 @@ _basic = { "_hex_full": r"0-9a-f", "_part": r"([a-z0-9][-a-z0-9]{0,61}|[a-z])", } -_safe_char = r"([a-z0-9%(_path)s\+]|"\ - r"(%%[%(_hex_safe)s][%(_hex_full)s]))" % _basic +_safe_char = r"([a-z0-9%(_path)s\+]|" r"(%%[%(_hex_safe)s][%(_hex_full)s]))" % _basic _safe_scheme_pattern = r"(https?|ftp)" _safe_domain_pattern = r"(%(_part)s(\.%(_part)s)*\.?)" % _basic -_safe_host_pattern = _safe_domain_pattern+r"(:(80|8080|8000|443))?" % _basic -_safe_path_pattern = r"((/([a-z0-9%(_path)s]|"\ - r"(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _basic +_safe_host_pattern = _safe_domain_pattern + r"(:(80|8080|8000|443))?" % _basic +_safe_path_pattern = ( + r"((/([a-z0-9%(_path)s]|" r"(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _basic +) _safe_fragment_pattern = r"%s*" % _safe_char _safe_cgi = r"%s+(=(%s|/)+)?" % (_safe_char, _safe_char) _safe_query_pattern = r"(%s(&%s)*)?" % (_safe_cgi, _safe_cgi) _safe_param_pattern = r"(%s(;%s)*)?" % (_safe_cgi, _safe_cgi) -safe_url_pattern = r"%s://%s%s(#%s)?" % \ - (_safe_scheme_pattern, _safe_host_pattern, - _safe_path_pattern, _safe_fragment_pattern) +safe_url_pattern = r"%s://%s%s(#%s)?" % ( + _safe_scheme_pattern, + _safe_host_pattern, + _safe_path_pattern, + _safe_fragment_pattern, +) is_safe_char = re.compile("(?i)^%s$" % _safe_char).match is_safe_url = re.compile("(?i)^%s$" % safe_url_pattern).match @@ -96,7 +99,7 @@ def splitparams(path): i = path.find(';') if i < 0: return path, '' - return path[:i], path[i+1:] + return path[:i], path[i + 1 :] def is_numeric_port(portstr): @@ -113,8 +116,12 @@ def is_numeric_port(portstr): def safe_host_pattern(host): """Return regular expression pattern with given host for URL testing.""" - return "(?i)%s://%s%s(#%s)?" % \ - (_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern) + return "(?i)%s://%s%s(#%s)?" % ( + _safe_scheme_pattern, + host, + _safe_path_pattern, + _safe_fragment_pattern, + ) def parse_qsl(qs, encoding, keep_blank_values=0, strict_parsing=0): @@ -190,18 +197,23 @@ def url_fix_host(urlparts, encoding): userpass, netloc = urllib.parse.splituser(urlparts[1]) if userpass: userpass = urllib.parse.unquote(userpass, encoding=encoding) - netloc, is_idn = idna_encode(urllib.parse.unquote(netloc, encoding=encoding).lower()) + netloc, is_idn = idna_encode( + urllib.parse.unquote(netloc, encoding=encoding).lower() + ) # a leading backslash in path causes urlsplit() to add the # path components up to the first slash to host # try to find this case... i = netloc.find("\\") if i != -1: # ...and fix it by prepending the misplaced components to the path - comps = netloc[i:] # note: still has leading backslash + comps = netloc[i:] # note: still has leading backslash if not urlparts[2] or urlparts[2] == '/': urlparts[2] = comps else: - urlparts[2] = "%s%s" % (comps, urllib.parse.unquote(urlparts[2], encoding=encoding)) + urlparts[2] = "%s%s" % ( + comps, + urllib.parse.unquote(urlparts[2], encoding=encoding), + ) netloc = netloc[:i] else: # a leading ? in path causes urlsplit() to add the query to the @@ -224,7 +236,7 @@ def url_fix_host(urlparts, encoding): if port != dport: host = "%s:%d" % (host, port) netloc = host - urlparts[1] = userpass+netloc + urlparts[1] = userpass + netloc return is_idn @@ -243,21 +255,25 @@ def url_fix_mailto_urlsplit(urlparts): if sep in urlparts[2]: urlparts[2], urlparts[3] = urlparts[2].split(sep, 1) + # wayback urls include in the path http[s]://. By default the # tidying mechanism in linkchecker encodes the : and deletes the second slash # This function reverses these corrections. This function expects only the # path section of the URL as input. wayback_regex = re.compile(r'(https?)(\%3A/|:/)') + + def url_fix_wayback_query(path): return wayback_regex.sub(r'\1://', path) + def url_parse_query(query, encoding): """Parse and re-join the given CGI query.""" # if ? is in the query, split it off, seen at msdn.microsoft.com append = "" while '?' in query: query, rest = query.rsplit('?', 1) - append = '?'+url_parse_query(rest, encoding=encoding)+append + append = '?' + url_parse_query(rest, encoding=encoding) + append l = [] for k, v, sep in parse_qsl(query, keep_blank_values=True, encoding=encoding): k = urllib.parse.quote(k, safe='/-:,;') @@ -316,12 +332,14 @@ def url_norm(url, encoding): # anchor urlparts[4] = urllib.parse.unquote(urlparts[4], encoding=encoding) # quote parts again - urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme - urlparts[1] = urllib.parse.quote(urlparts[1], safe='@:') # host - urlparts[2] = urllib.parse.quote(urlparts[2], safe=_nopathquote_chars) # path + urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme + urlparts[1] = urllib.parse.quote(urlparts[1], safe='@:') # host + urlparts[2] = urllib.parse.quote(urlparts[2], safe=_nopathquote_chars) # path if not urlparts[0].startswith("feed"): - urlparts[2] = url_fix_wayback_query(urlparts[2]) # unencode colon in http[s]:// in wayback path - urlparts[4] = urllib.parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor + urlparts[2] = url_fix_wayback_query( + urlparts[2] + ) # unencode colon in http[s]:// in wayback path + urlparts[4] = urllib.parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor res = urlunsplit(urlparts) if url.endswith('#') and not urlparts[4]: # re-append trailing empty fragment @@ -334,6 +352,8 @@ _thisdir_ro = re.compile(r"^\./") _samedir_ro = re.compile(r"/\./|/\.$") _parentdir_ro = re.compile(r"^/(\.\./)+|/(?!\.\./)[^/]+/\.\.(/|$)") _relparentdir_ro = re.compile(r"^(?!\.\./)[^/]+/\.\.(/|$)") + + def collapse_segments(path): """Remove all redundant segments from the given URL path. Precondition: path is an unquoted url path""" @@ -375,12 +395,14 @@ def url_quote(url, encoding): if not url_is_absolute(url): return document_quote(url) urlparts = list(urllib.parse.urlsplit(url)) - urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme - urlparts[1] = urllib.parse.quote(urlparts[1], safe=':') # host - urlparts[2] = urllib.parse.quote(urlparts[2], safe='/=,') # path - urlparts[3] = urllib.parse.quote(urlparts[3], safe='&=,') # query + urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme + urlparts[1] = urllib.parse.quote(urlparts[1], safe=':') # host + urlparts[2] = urllib.parse.quote(urlparts[2], safe='/=,') # path + urlparts[3] = urllib.parse.quote(urlparts[3], safe='&=,') # query l = [] - for k, v, sep in parse_qsl(urlparts[3], encoding=encoding, keep_blank_values=True): # query + for k, v, sep in parse_qsl( + urlparts[3], encoding=encoding, keep_blank_values=True + ): # query k = urllib.parse.quote(k, safe='/-:,;') if v: v = urllib.parse.quote(v, safe='/-:,;') @@ -388,7 +410,7 @@ def url_quote(url, encoding): else: l.append("%s%s" % (k, sep)) urlparts[3] = ''.join(l) - urlparts[4] = urllib.parse.quote(urlparts[4]) # anchor + urlparts[4] = urllib.parse.quote(urlparts[4]) # anchor return urlunsplit(urlparts) @@ -425,8 +447,10 @@ def match_host(host, domainlist): _nopathquote_chars = "-;/=,~*+()@!" if os.name == 'nt': _nopathquote_chars += "|" -_safe_url_chars = re.escape(_nopathquote_chars + "_:.&#%?[]!")+"a-zA-Z0-9" +_safe_url_chars = re.escape(_nopathquote_chars + "_:.&#%?[]!") + "a-zA-Z0-9" _safe_url_chars_ro = re.compile(r"^[%s]*$" % _safe_url_chars) + + def url_needs_quoting(url): """Check if url needs percent quoting. Note that the method does only check basic character sets, and not any other syntax. @@ -487,8 +511,7 @@ def splitport(host, port=0): return host, port -def get_content(url, user=None, password=None, proxy=None, data=None, - addheaders=None): +def get_content(url, user=None, password=None, proxy=None, data=None, addheaders=None): """Get URL content and info. @return: (decoded text content of URL, headers) or @@ -496,6 +519,7 @@ def get_content(url, user=None, password=None, proxy=None, data=None, @rtype: tuple (String, dict) or (None, String) """ from . import configuration + headers = { 'User-Agent': configuration.UserAgent, } @@ -511,6 +535,7 @@ def get_content(url, user=None, password=None, proxy=None, data=None, if proxy: kwargs['proxy'] = dict(http=proxy) from .configuration import get_share_file + try: kwargs["verify"] = get_share_file('cacert.pem') except ValueError: @@ -518,10 +543,15 @@ def get_content(url, user=None, password=None, proxy=None, data=None, try: response = requests.request(method, url, **kwargs) return response.text, response.headers - except (requests.exceptions.RequestException, - requests.exceptions.BaseHTTPError) as msg: - log.warn(LOG_CHECK, ("Could not get content of URL %(url)s: %(msg)s.") \ - % {"url": url, "msg": str(msg)}) + except ( + requests.exceptions.RequestException, + requests.exceptions.BaseHTTPError, + ) as msg: + log.warn( + LOG_CHECK, + ("Could not get content of URL %(url)s: %(msg)s.") + % {"url": url, "msg": str(msg)}, + ) return None, str(msg)