Run black on linkcheck/

This commit is contained in:
Chris Mayo 2020-05-30 17:01:36 +01:00
parent 152dbeb9b8
commit a92a684ac4
76 changed files with 2021 additions and 1068 deletions

View file

@ -19,9 +19,14 @@ Main function module for link checking.
# version checks
import sys
if sys.version_info < (3, 5, 0, 'final', 0):
import platform
raise SystemExit("This program requires Python 3.5.0 or later instead of %s." % platform.python_version())
raise SystemExit(
"This program requires Python 3.5.0 or later instead of %s."
% platform.python_version()
)
import os
import re
@ -48,6 +53,7 @@ def module_path():
def get_install_data():
"""Return absolute path of LinkChecker data installation directory."""
from .loader import is_frozen
if is_frozen():
return module_path()
return configdata.install_data
@ -55,10 +61,13 @@ def get_install_data():
class LinkCheckerError(Exception):
"""Exception to be raised on linkchecker-specific check errors."""
pass
class LinkCheckerInterrupt(Exception):
"""Used for testing."""
pass
@ -106,6 +115,7 @@ def init_i18n(loc=None):
i18n.init(configdata.name.lower(), locdir, loc=loc)
# install translated log level names
import logging
logging.addLevelName(logging.CRITICAL, _('CRITICAL'))
logging.addLevelName(logging.ERROR, _('ERROR'))
logging.addLevelName(logging.WARN, _('WARN'))
@ -124,15 +134,22 @@ def drop_privileges():
if os.name != 'posix':
return
if os.geteuid() == 0:
log.warn(LOG_CHECK, _("Running as root user; "
"dropping privileges by changing user to nobody."))
log.warn(
LOG_CHECK,
_(
"Running as root user; "
"dropping privileges by changing user to nobody."
),
)
import pwd
os.seteuid(pwd.getpwnam('nobody')[3])
if hasattr(signal, "SIGUSR1"):
# install SIGUSR1 handler
from .decorators import signal_handler
@signal_handler(signal.SIGUSR1)
def print_threadstacks(sig, frame):
"""Print stack traces of all running threads."""
@ -140,7 +157,9 @@ if hasattr(signal, "SIGUSR1"):
for threadId, stack in sys._current_frames().items():
log.warn(LOG_THREAD, "# ThreadID: %s" % threadId)
for filename, lineno, name, line in traceback.extract_stack(stack):
log.warn(LOG_THREAD, 'File: "%s", line %d, in %s' % (filename, lineno, name))
log.warn(
LOG_THREAD, 'File: "%s", line %d, in %s' % (filename, lineno, name)
)
line = line.strip()
if line:
log.warn(LOG_THREAD, " %s" % line)

View file

@ -59,6 +59,7 @@ import os
import logging
import types
from .fileutil import has_module, is_tty
if os.name == 'nt':
from . import colorama
@ -79,16 +80,16 @@ concealed = 'concealed'
# Control numbers
AnsiControl = {
None: '',
bold: '1',
light: '2',
#italic: '3', # unsupported
None: '',
bold: '1',
light: '2',
# italic: '3', # unsupported
underline: '4',
blink: '5',
#rapidblink: '6', # unsupported
invert: '7',
blink: '5',
# rapidblink: '6', # unsupported
invert: '7',
concealed: '8',
#strikethrough: '9', # unsupported
# strikethrough: '9', # unsupported
}
# Color constants
@ -116,47 +117,47 @@ InverseColors = (Black, Red, Green, Yellow, Blue, Purple, Cyan, White)
# Ansi color numbers; capitalized colors are inverse
AnsiColor = {
None: '0',
None: '0',
default: '0',
black: '30',
red: '31',
green: '32',
yellow: '33',
blue: '34',
purple: '35',
cyan: '36',
white: '37',
Black: '40',
Red: '41',
Green: '42',
Yellow: '43',
Blue: '44',
Purple: '45',
Cyan: '46',
White: '47',
black: '30',
red: '31',
green: '32',
yellow: '33',
blue: '34',
purple: '35',
cyan: '36',
white: '37',
Black: '40',
Red: '41',
Green: '42',
Yellow: '43',
Blue: '44',
Purple: '45',
Cyan: '46',
White: '47',
}
if os.name == 'nt':
# Windows color numbers; capitalized colors are used as background
WinColor = {
None: None,
None: None,
default: colorama.GREY,
black: colorama.BLACK,
red: colorama.RED,
green: colorama.GREEN,
yellow: colorama.YELLOW,
blue: colorama.BLUE,
purple: colorama.MAGENTA,
cyan: colorama.CYAN,
white: colorama.GREY,
Black: colorama.BLACK,
Red: colorama.RED,
Green: colorama.GREEN,
Yellow: colorama.YELLOW,
Blue: colorama.BLUE,
Purple: colorama.MAGENTA,
Cyan: colorama.CYAN,
White: colorama.GREY,
black: colorama.BLACK,
red: colorama.RED,
green: colorama.GREEN,
yellow: colorama.YELLOW,
blue: colorama.BLUE,
purple: colorama.MAGENTA,
cyan: colorama.CYAN,
white: colorama.GREY,
Black: colorama.BLACK,
Red: colorama.RED,
Green: colorama.GREEN,
Yellow: colorama.YELLOW,
Blue: colorama.BLUE,
Purple: colorama.MAGENTA,
Cyan: colorama.CYAN,
White: colorama.GREY,
}
# pc speaker beep escape code
@ -168,9 +169,10 @@ def esc_ansicolor(color):
control = ''
if ";" in color:
control, color = color.split(";", 1)
control = AnsiControl.get(control, '')+";"
control = AnsiControl.get(control, '') + ";"
cnum = AnsiColor.get(color, '0')
return AnsiEsc % (control+cnum)
return AnsiEsc % (control + cnum)
AnsiReset = esc_ansicolor(default)
@ -201,6 +203,7 @@ def has_colors(fp):
return True
elif has_curses:
import curses
try:
curses.setupterm(os.environ.get("TERM"), fp.fileno())
# More than 8 colors are good enough.
@ -218,19 +221,19 @@ def get_columns(fp):
return colorama.get_console_size().X
if has_curses:
import curses
try:
curses.setupterm(os.environ.get("TERM"), fp.fileno())
return curses.tigetnum("cols")
except curses.error:
pass
pass
return 80
def _write_color_colorama(fp, text, color):
"""Colorize text with given color."""
foreground, background, style = get_win_color(color)
colorama.set_console(foreground=foreground, background=background,
style=style)
colorama.set_console(foreground=foreground, background=background, style=style)
fp.write(text)
colorama.reset_console()
@ -314,7 +317,6 @@ class ColoredStreamHandler(logging.StreamHandler):
try:
self.stream.write("%s" % msg, color=color)
except UnicodeError:
self.stream.write("%s" % msg.encode("UTF-8"),
color=color)
self.stream.write("%s" % msg.encode("UTF-8"), color=color)
self.stream.write(os.linesep)
self.flush()

View file

@ -5,14 +5,14 @@
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# modification, are permitted provided that the following conditions are met:
#
# 1. Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
# list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
@ -40,201 +40,283 @@ pykeywords = set(keyword.kwlist)
def parse_py_statement(line):
state = 0
curtoken = ""
spaces = " \t\n"
ops = ".,;:+-*/%&=|(){}[]^<>"
i = 0
def _escape_char(c):
if c == "n": return "\n"
elif c == "t": return "\t"
else: return c
while i < len(line):
c = line[i]
i += 1
if state == 0:
if c in spaces: pass
elif c in ops: yield ("op", c)
elif c == "#": state = 6
elif c == "\"": state = 1
elif c == "'": state = 2
else:
curtoken = c
state = 3
elif state == 1: # string via "
if c == "\\": state = 4
elif c == "\"":
yield ("str", curtoken)
curtoken = ""
state = 0
else: curtoken += c
elif state == 2: # string via '
if c == "\\": state = 5
elif c == "'":
yield ("str", curtoken)
curtoken = ""
state = 0
else: curtoken += c
elif state == 3: # identifier
if c in spaces + ops + "#\"'":
yield ("id", curtoken)
curtoken = ""
state = 0
i -= 1
else: curtoken += c
elif state == 4: # escape in "
curtoken += _escape_char(c)
state = 1
elif state == 5: # escape in '
curtoken += _escape_char(c)
state = 2
elif state == 6: # comment
curtoken += c
if state == 3: yield ("id", curtoken)
elif state == 6: yield ("comment", curtoken)
state = 0
curtoken = ""
spaces = " \t\n"
ops = ".,;:+-*/%&=|(){}[]^<>"
i = 0
def _escape_char(c):
if c == "n":
return "\n"
elif c == "t":
return "\t"
else:
return c
while i < len(line):
c = line[i]
i += 1
if state == 0:
if c in spaces:
pass
elif c in ops:
yield ("op", c)
elif c == "#":
state = 6
elif c == "\"":
state = 1
elif c == "'":
state = 2
else:
curtoken = c
state = 3
elif state == 1: # string via "
if c == "\\":
state = 4
elif c == "\"":
yield ("str", curtoken)
curtoken = ""
state = 0
else:
curtoken += c
elif state == 2: # string via '
if c == "\\":
state = 5
elif c == "'":
yield ("str", curtoken)
curtoken = ""
state = 0
else:
curtoken += c
elif state == 3: # identifier
if c in spaces + ops + "#\"'":
yield ("id", curtoken)
curtoken = ""
state = 0
i -= 1
else:
curtoken += c
elif state == 4: # escape in "
curtoken += _escape_char(c)
state = 1
elif state == 5: # escape in '
curtoken += _escape_char(c)
state = 2
elif state == 6: # comment
curtoken += c
if state == 3:
yield ("id", curtoken)
elif state == 6:
yield ("comment", curtoken)
def grep_full_py_identifiers(tokens):
global pykeywords
tokens = list(tokens)
i = 0
while i < len(tokens):
tokentype, token = tokens[i]
i += 1
if tokentype != "id": continue
while i+1 < len(tokens) and tokens[i] == ("op", ".") and tokens[i+1][0] == "id":
token += "." + tokens[i+1][1]
i += 2
if token == "": continue
if token in pykeywords: continue
if token[0] in ".0123456789": continue
yield token
global pykeywords
tokens = list(tokens)
i = 0
while i < len(tokens):
tokentype, token = tokens[i]
i += 1
if tokentype != "id":
continue
while (
i + 1 < len(tokens)
and tokens[i] == ("op", ".")
and tokens[i + 1][0] == "id"
):
token += "." + tokens[i + 1][1]
i += 2
if token == "":
continue
if token in pykeywords:
continue
if token[0] in ".0123456789":
continue
yield token
def output(s, out=sys.stdout):
print(s, file=out)
def output(s, out=sys.stdout): print(s, file=out)
def output_limit():
return 300
return 300
def pp_extra_info(obj, depthlimit=3):
s = []
if hasattr(obj, "__len__"):
try:
if type(obj) in (bytes, str, list, tuple, dict) and len(obj) <= 5:
pass # don't print len in this case
else:
s += ["len = " + str(obj.__len__())]
except:
pass
if depthlimit > 0 and hasattr(obj, "__getitem__"):
try:
if type(obj) in (bytes, str):
pass # doesn't make sense to get subitems here
else:
subobj = obj.__getitem__(0)
extra_info = pp_extra_info(subobj, depthlimit - 1)
if extra_info != "":
s += ["_[0]: {" + extra_info + "}"]
except:
pass
return ", ".join(s)
def pp_extra_info(obj, depthlimit = 3):
s = []
if hasattr(obj, "__len__"):
try:
if type(obj) in (bytes,str,list,tuple,dict) and len(obj) <= 5:
pass # don't print len in this case
else:
s += ["len = " + str(obj.__len__())]
except: pass
if depthlimit > 0 and hasattr(obj, "__getitem__"):
try:
if type(obj) in (bytes,str):
pass # doesn't make sense to get subitems here
else:
subobj = obj.__getitem__(0)
extra_info = pp_extra_info(subobj, depthlimit - 1)
if extra_info != "":
s += ["_[0]: {" + extra_info + "}"]
except: pass
return ", ".join(s)
def pretty_print(obj):
s = repr(obj)
limit = output_limit()
if len(s) > limit:
s = s[:limit - 3] + "..."
extra_info = pp_extra_info(obj)
if extra_info != "": s += ", " + extra_info
return s
s = repr(obj)
limit = output_limit()
if len(s) > limit:
s = s[: limit - 3] + "..."
extra_info = pp_extra_info(obj)
if extra_info != "":
s += ", " + extra_info
return s
def fallback_findfile(filename):
mods = [ m for m in sys.modules.values() if m and hasattr(m, "__file__") and filename in m.__file__ ]
if len(mods) == 0: return None
altfn = mods[0].__file__
if altfn[-4:-1] == ".py": altfn = altfn[:-1] # *.pyc or whatever
return altfn
mods = [
m
for m in sys.modules.values()
if m and hasattr(m, "__file__") and filename in m.__file__
]
if len(mods) == 0:
return None
altfn = mods[0].__file__
if altfn[-4:-1] == ".py":
altfn = altfn[:-1] # *.pyc or whatever
return altfn
def better_exchook(etype, value, tb, out=sys.stdout):
output('Traceback (most recent call last):', out=out)
allLocals,allGlobals = {},{}
try:
import linecache
limit = None
if hasattr(sys, 'tracebacklimit'):
limit = sys.tracebacklimit
n = 0
_tb = tb
def _resolveIdentifier(namespace, id):
obj = namespace[id[0]]
for part in id[1:]:
obj = getattr(obj, part)
return obj
def _trySet(old, prefix, func):
if old is not None: return old
try: return prefix + func()
except KeyError: return old
except Exception as e:
return prefix + "!" + e.__class__.__name__ + ": " + str(e)
while _tb is not None and (limit is None or n < limit):
f = _tb.tb_frame
allLocals.update(f.f_locals)
allGlobals.update(f.f_globals)
lineno = _tb.tb_lineno
co = f.f_code
filename = co.co_filename
name = co.co_name
output(' File "%s", line %d, in %s' % (filename,lineno,name), out=out)
if not os.path.isfile(filename):
altfn = fallback_findfile(filename)
if altfn:
output(" -- couldn't find file, trying this instead: " + altfn, out=out)
filename = altfn
linecache.checkcache(filename)
line = linecache.getline(filename, lineno, f.f_globals)
if line:
line = line.strip()
output(' line: ' + line, out=out)
output(' locals:', out=out)
alreadyPrintedLocals = set()
for tokenstr in grep_full_py_identifiers(parse_py_statement(line)):
splittedtoken = tuple(tokenstr.split("."))
for token in map(lambda i: splittedtoken[0:i], range(1, len(splittedtoken) + 1)):
if token in alreadyPrintedLocals: continue
tokenvalue = None
tokenvalue = _trySet(tokenvalue, "<local> ", lambda: pretty_print(_resolveIdentifier(f.f_locals, token)))
tokenvalue = _trySet(tokenvalue, "<global> ", lambda: pretty_print(_resolveIdentifier(f.f_globals, token)))
tokenvalue = _trySet(tokenvalue, "<builtin> ", lambda: pretty_print(_resolveIdentifier(f.f_builtins, token)))
tokenvalue = tokenvalue or "<not found>"
output(' ' + ".".join(token) + " = " + tokenvalue, out=out)
alreadyPrintedLocals.add(token)
if len(alreadyPrintedLocals) == 0: output(" no locals", out=out)
else:
output(' -- code not available --', out=out)
_tb = _tb.tb_next
n += 1
output('Traceback (most recent call last):', out=out)
allLocals, allGlobals = {}, {}
try:
import linecache
except Exception:
output("ERROR: cannot get more detailed exception info because:", out=out)
import traceback
for l in traceback.format_exc().split("\n"): output(" " + l, out=out)
output("simple traceback:", out=out)
traceback.print_tb(tb, None, out)
limit = None
if hasattr(sys, 'tracebacklimit'):
limit = sys.tracebacklimit
n = 0
_tb = tb
def _resolveIdentifier(namespace, id):
obj = namespace[id[0]]
for part in id[1:]:
obj = getattr(obj, part)
return obj
def _trySet(old, prefix, func):
if old is not None:
return old
try:
return prefix + func()
except KeyError:
return old
except Exception as e:
return prefix + "!" + e.__class__.__name__ + ": " + str(e)
while _tb is not None and (limit is None or n < limit):
f = _tb.tb_frame
allLocals.update(f.f_locals)
allGlobals.update(f.f_globals)
lineno = _tb.tb_lineno
co = f.f_code
filename = co.co_filename
name = co.co_name
output(' File "%s", line %d, in %s' % (filename, lineno, name), out=out)
if not os.path.isfile(filename):
altfn = fallback_findfile(filename)
if altfn:
output(
" -- couldn't find file, trying this instead: " + altfn,
out=out,
)
filename = altfn
linecache.checkcache(filename)
line = linecache.getline(filename, lineno, f.f_globals)
if line:
line = line.strip()
output(' line: ' + line, out=out)
output(' locals:', out=out)
alreadyPrintedLocals = set()
for tokenstr in grep_full_py_identifiers(parse_py_statement(line)):
splittedtoken = tuple(tokenstr.split("."))
for token in map(
lambda i: splittedtoken[0:i], range(1, len(splittedtoken) + 1)
):
if token in alreadyPrintedLocals:
continue
tokenvalue = None
tokenvalue = _trySet(
tokenvalue,
"<local> ",
lambda: pretty_print(_resolveIdentifier(f.f_locals, token)),
)
tokenvalue = _trySet(
tokenvalue,
"<global> ",
lambda: pretty_print(
_resolveIdentifier(f.f_globals, token)
),
)
tokenvalue = _trySet(
tokenvalue,
"<builtin> ",
lambda: pretty_print(
_resolveIdentifier(f.f_builtins, token)
),
)
tokenvalue = tokenvalue or "<not found>"
output(' ' + ".".join(token) + " = " + tokenvalue, out=out)
alreadyPrintedLocals.add(token)
if len(alreadyPrintedLocals) == 0:
output(" no locals", out=out)
else:
output(' -- code not available --', out=out)
_tb = _tb.tb_next
n += 1
except Exception:
output("ERROR: cannot get more detailed exception info because:", out=out)
import traceback
for l in traceback.format_exc().split("\n"):
output(" " + l, out=out)
output("simple traceback:", out=out)
traceback.print_tb(tb, None, out)
import types
def _some_str(value):
try:
return str(value)
except:
return '<unprintable %s object>' % type(value).__name__
def _format_final_exc_line(etype, value):
valuestr = _some_str(value)
if value is None or not valuestr:
line = "%s" % etype
else:
line = "%s: %s" % (etype, valuestr)
return line
if (
isinstance(etype, BaseException)
or (hasattr(types, "InstanceType") and isinstance(etype, types.InstanceType))
or etype is None
or type(etype) is str
):
output(_format_final_exc_line(etype, value), out=out)
else:
output(_format_final_exc_line(etype.__name__, value), out=out)
import types
def _some_str(value):
try: return str(value)
except: return '<unprintable %s object>' % type(value).__name__
def _format_final_exc_line(etype, value):
valuestr = _some_str(value)
if value is None or not valuestr:
line = "%s" % etype
else:
line = "%s: %s" % (etype, valuestr)
return line
if (isinstance(etype, BaseException) or
(hasattr(types, "InstanceType") and isinstance(etype, types.InstanceType)) or
etype is None or type(etype) is str):
output(_format_final_exc_line(etype, value), out=out)
else:
output(_format_final_exc_line(etype.__name__, value), out=out)
def install():
sys.excepthook = better_exchook
sys.excepthook = better_exchook

View file

@ -16,8 +16,10 @@
"""Parser for FireFox bookmark file."""
import re
try:
import sqlite3
has_sqlite = True
except ImportError:
has_sqlite = False

View file

@ -15,8 +15,10 @@
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import plistlib
try:
import biplist
has_biplist = True
except ImportError:
has_biplist = False
@ -48,6 +50,7 @@ KEY_URIDICTIONARY = 'URIDictionary'
KEY_CHILDREN = 'Children'
KEY_WEBBOOKMARKTYPE = 'WebBookmarkType'
def parse_plist(entry):
"""Parse a XML dictionary entry."""
if is_leaf(entry):

View file

@ -56,8 +56,7 @@ class RobotsTxt:
rp = self.cache[roboturl]
return rp.can_fetch(self.useragent, url_data.url)
self.misses += 1
kwargs = dict(auth=url_data.auth, session=url_data.session,
timeout=timeout)
kwargs = dict(auth=url_data.auth, session=url_data.session, timeout=timeout)
if hasattr(url_data, "proxy") and hasattr(url_data, "proxy_type"):
kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
rp = robotparser2.RobotFileParser(**kwargs)

View file

@ -24,15 +24,19 @@ from .. import log, LOG_CACHE
class Timeout(Exception):
"""Raised by join()"""
pass
class Empty(Exception):
"""Exception raised by get()."""
pass
NUM_PUTS_CLEANUP = 10000
class UrlQueue:
"""A queue supporting several consumer tasks. The task_done() idea is
from the Python 2.5 implementation of Queue.Queue()."""
@ -58,7 +62,9 @@ class UrlQueue:
# Each put() decreases the number of allowed puts.
# This way we can restrict the number of URLs that are checked.
if max_allowed_urls is not None and max_allowed_urls <= 0:
raise ValueError("Non-positive number of allowed URLs: %d" % max_allowed_urls)
raise ValueError(
"Non-positive number of allowed URLs: %d" % max_allowed_urls
)
self.max_allowed_urls = max_allowed_urls
self.num_puts = 0
@ -132,7 +138,9 @@ class UrlQueue:
self.cleanup()
self.queue.append(url_data)
self.unfinished_tasks += 1
cache.add_result(key, None) # add none value to cache to prevent checking this url multiple times
cache.add_result(
key, None
) # add none value to cache to prevent checking this url multiple times
def cleanup(self):
"""Move cached elements to top."""

View file

@ -23,7 +23,7 @@ import urllib.parse
from .. import strformat, url as urlutil, log, LOG_CHECK
MAX_FILESIZE = 1024*1024*10 # 10MB
MAX_FILESIZE = 1024 * 1024 * 10 # 10MB
def guess_url(url):
@ -64,9 +64,20 @@ def absolute_url(base_url, base_ref, parent_url):
return ""
def get_url_from(base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=None, column=None,
page=0, name="", parent_content_type=None, extern=None, url_encoding=None):
def get_url_from(
base_url,
recursion_level,
aggregate,
parent_url=None,
base_ref=None,
line=None,
column=None,
page=0,
name="",
parent_content_type=None,
extern=None,
url_encoding=None,
):
"""
Get url data from given base data.
@ -112,17 +123,31 @@ def get_url_from(base_url, recursion_level, aggregate,
name = base_url.replace("\\", "/")
allowed_schemes = aggregate.config["allowedschemes"]
# ignore local PHP files with execution directives
local_php = (parent_content_type == 'application/x-httpd-php' and
'<?' in base_url and '?>' in base_url and scheme == 'file')
local_php = (
parent_content_type == 'application/x-httpd-php'
and '<?' in base_url
and '?>' in base_url
and scheme == 'file'
)
if local_php or (allowed_schemes and scheme not in allowed_schemes):
klass = ignoreurl.IgnoreUrl
else:
assume_local_file = (recursion_level == 0)
assume_local_file = recursion_level == 0
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
return klass(base_url, recursion_level, aggregate,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, page=page, name=name, extern=extern, url_encoding=url_encoding)
return klass(
base_url,
recursion_level,
aggregate,
parent_url=parent_url,
base_ref=base_ref,
line=line,
column=column,
page=page,
name=name,
extern=extern,
url_encoding=url_encoding,
)
def get_urlclass_from(scheme, assume_local_file=False):
@ -175,5 +200,15 @@ def get_index_html(urls):
# all the URL classes
from . import (fileurl, unknownurl, ftpurl, httpurl, dnsurl,
mailtourl, telneturl, nntpurl, ignoreurl, itmsservicesurl) # noqa: E402
from . import (
fileurl,
unknownurl,
ftpurl,
httpurl,
dnsurl,
mailtourl,
telneturl,
nntpurl,
ignoreurl,
itmsservicesurl,
) # noqa: E402

View file

@ -33,7 +33,7 @@ ExcSyntaxList = [
# exceptions are internal or system errors
ExcCacheList = [
IOError,
OSError, # OSError is thrown on Windows when a file is not found
OSError, # OSError is thrown on Windows when a file is not found
LinkCheckerError,
DNSException,
socket.error,
@ -59,6 +59,7 @@ ExcNoCacheList = [
# firefox bookmark file needs sqlite3 for parsing
try:
import sqlite3
ExcCacheList.append(sqlite3.Error)
except ImportError:
pass
@ -66,6 +67,7 @@ except ImportError:
# pyOpenSSL errors
try:
import OpenSSL
ExcCacheList.append(OpenSSL.SSL.Error)
except ImportError:
pass
@ -99,22 +101,22 @@ WARN_XML_PARSE_ERROR = "xml-parse-error"
# registered warnings
Warnings = {
WARN_URL_EFFECTIVE_URL:
_("The effective URL is different from the original."),
WARN_URL_ERROR_GETTING_CONTENT:
_("Could not get the content of the URL."),
WARN_URL_EFFECTIVE_URL: _("The effective URL is different from the original."),
WARN_URL_ERROR_GETTING_CONTENT: _("Could not get the content of the URL."),
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
WARN_URL_RATE_LIMITED: _("The URL request was rate limited so need reduce number of requests."),
WARN_URL_RATE_LIMITED: _(
"The URL request was rate limited so need reduce number of requests."
),
WARN_URL_TOO_LONG: _("The URL is longer than the recommended size."),
WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."),
WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."),
WARN_FILE_SYSTEM_PATH:
_("The file: path is not the same as the system specific path."),
WARN_FILE_SYSTEM_PATH: _(
"The file: path is not the same as the system specific path."
),
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
WARN_HTTP_COOKIE_STORE_ERROR:
_("An error occurred while storing a cookie."),
WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."),
WARN_IGNORE_URL: _("The URL has been ignored."),
WARN_MAIL_NO_MX_HOST: _("The mail MX host could not be found."),
WARN_NNTP_NO_SERVER: _("No NNTP server was found."),

View file

@ -39,7 +39,7 @@ def get_files(dirname):
if os.path.isfile(fullentry):
yield entry
elif os.path.isdir(fullentry):
yield entry+"/"
yield entry + "/"
def prepare_urlpath_for_nt(path):
@ -48,7 +48,7 @@ def prepare_urlpath_for_nt(path):
However urllib.url2pathname expects '////server/path'.
"""
if '|' not in path:
return "////"+path.lstrip("/")
return "////" + path.lstrip("/")
return path
@ -58,9 +58,9 @@ def get_nt_filename(path):
head, tail = os.path.split(rest)
if not tail:
return path
for fname in os.listdir(unc+head):
for fname in os.listdir(unc + head):
if fname.lower() == tail.lower():
return os.path.join(get_nt_filename(unc+head), fname)
return os.path.join(get_nt_filename(unc + head), fname)
log.error(LOG_CHECK, "could not find %r in %r", tail, head)
return path
@ -92,11 +92,34 @@ class FileUrl(urlbase.UrlBase):
Url link with file scheme.
"""
def init(self, base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, page, name, url_encoding, extern):
def init(
self,
base_ref,
base_url,
parent_url,
recursion_level,
aggregate,
line,
column,
page,
name,
url_encoding,
extern,
):
"""Initialize the scheme."""
super(FileUrl, self).init(base_ref, base_url, parent_url,
recursion_level, aggregate, line, column, page, name, url_encoding, extern)
super(FileUrl, self).init(
base_ref,
base_url,
parent_url,
recursion_level,
aggregate,
line,
column,
page,
name,
url_encoding,
extern,
)
self.scheme = 'file'
def build_base_url(self):
@ -111,14 +134,16 @@ class FileUrl(urlbase.UrlBase):
base_url = os.path.expanduser(base_url)
if not is_absolute_path(base_url):
try:
base_url = os.getcwd()+"/"+base_url
base_url = os.getcwd() + "/" + base_url
except OSError as msg:
# occurs on stale remote filesystems (eg. NFS)
errmsg = _("Could not get current working directory: %(msg)s") % dict(msg=msg)
errmsg = _(
"Could not get current working directory: %(msg)s"
) % dict(msg=msg)
raise LinkCheckerError(errmsg)
if os.path.isdir(base_url):
base_url += "/"
base_url = "file://"+base_url
base_url = "file://" + base_url
if os.name == "nt":
base_url = base_url.replace("\\", "/")
# transform c:/windows into /c|/windows
@ -138,6 +163,7 @@ class FileUrl(urlbase.UrlBase):
# Otherwise the join function thinks the query is part of
# the file name.
from .urlbase import url_norm
# norm base url - can raise UnicodeError from url.idna_encode()
base_url, is_idn = url_norm(self.base_url, self.encoding)
urlparts = list(urllib.parse.urlsplit(base_url))
@ -148,8 +174,9 @@ class FileUrl(urlbase.UrlBase):
# ignore query and fragment url parts for filesystem urls
self.urlparts[3] = self.urlparts[4] = ''
if self.is_directory() and not self.urlparts[2].endswith('/'):
self.add_warning(_("Added trailing slash to directory."),
tag=WARN_FILE_MISSING_SLASH)
self.add_warning(
_("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
)
self.urlparts[2] += '/'
self.url = urlutil.urlunsplit(self.urlparts)
@ -168,9 +195,10 @@ class FileUrl(urlbase.UrlBase):
Try to open the local file. Under NT systems the case sensitivity
is checked.
"""
if (self.parent_url is not None and
not self.parent_url.startswith("file:")):
msg = _("local files are only checked without parent URL or when the parent URL is also a file")
if self.parent_url is not None and not self.parent_url.startswith("file:"):
msg = _(
"local files are only checked without parent URL or when the parent URL is also a file"
)
raise LinkCheckerError(msg)
if self.is_directory():
self.set_result(_("directory"))
@ -190,11 +218,15 @@ class FileUrl(urlbase.UrlBase):
path = self.get_os_filename()
realpath = get_nt_filename(path)
if path != realpath:
self.add_warning(_("The URL path %(path)r is not the same as the "
"system path %(realpath)r. You should always use "
"the system path in URLs.") % \
{"path": path, "realpath": realpath},
tag=WARN_FILE_SYSTEM_PATH)
self.add_warning(
_(
"The URL path %(path)r is not the same as the "
"system path %(realpath)r. You should always use "
"the system path in URLs."
)
% {"path": path, "realpath": realpath},
tag=WARN_FILE_SYSTEM_PATH,
)
def read_content(self):
"""Return file content, or in case of directories a dummy HTML file
@ -242,7 +274,9 @@ class FileUrl(urlbase.UrlBase):
return True
if self.content_type in self.ContentMimetypes:
return True
log.debug(LOG_CHECK, "File with content type %r is not parseable.", self.content_type)
log.debug(
LOG_CHECK, "File with content type %r is not parseable.", self.content_type
)
return False
def set_content_type(self):
@ -267,7 +301,7 @@ class FileUrl(urlbase.UrlBase):
i = url.rindex('/')
if i > 6:
# remove last filename to make directory internal
url = url[:i+1]
url = url[: i + 1]
return re.escape(url)
def add_url(self, url, line=0, column=0, page=0, name="", base=None):
@ -277,4 +311,6 @@ class FileUrl(urlbase.UrlBase):
if webroot and url and url.startswith("/"):
url = webroot + url[1:]
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
super(FileUrl, self).add_url(url, line=line, column=column, page=page, name=name, base=base)
super(FileUrl, self).add_url(
url, line=line, column=column, page=page, name=name, base=base
)

View file

@ -50,14 +50,16 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
if self.proxy:
# using a (HTTP) proxy
http = httpurl.HttpUrl(self.base_url,
self.recursion_level,
self.aggregate,
parent_url=self.parent_url,
base_ref=self.base_ref,
line=self.line,
column=self.column,
name=self.name)
http = httpurl.HttpUrl(
self.base_url,
self.recursion_level,
self.aggregate,
parent_url=self.parent_url,
base_ref=self.base_ref,
line=self.line,
column=self.column,
name=self.name,
)
http.build_url()
return http.check()
self.login()
@ -91,7 +93,8 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
raise LinkCheckerError(_("Got no answer from FTP server"))
except EOFError as msg:
raise LinkCheckerError(
_("Remote host has closed connection: %(msg)s") % str(msg))
_("Remote host has closed connection: %(msg)s") % str(msg)
)
def negotiate_encoding(self):
"""Check if server can handle UTF-8 encoded filenames.
@ -137,8 +140,9 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
if "%s/" % self.filename in files:
if not self.url.endswith('/'):
self.add_warning(
_("Missing trailing directory slash in ftp url."),
tag=WARN_FTP_MISSING_SLASH)
_("Missing trailing directory slash in ftp url."),
tag=WARN_FTP_MISSING_SLASH,
)
self.url += '/'
return
raise ftplib.error_perm("550 File not found")
@ -147,11 +151,13 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""Get list of filenames in directory. Subdirectories have an
ending slash."""
files = []
def add_entry(line):
"""Parse list line and add the entry it points to to the file
list."""
log.debug(LOG_CHECK, "Directory entry %r", line)
from ..ftpparse import ftpparse
fpo = ftpparse(line)
if fpo is not None and fpo["name"]:
name = fpo["name"]
@ -159,6 +165,7 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
name += "/"
if fpo["trycwd"] or fpo["tryretr"]:
files.append(name)
self.url_connection.dir(add_entry)
return files
@ -168,7 +175,9 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
return True
if self.content_type in self.ContentMimetypes:
return True
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", self.content_type)
log.debug(
LOG_CHECK, "URL with content type %r is not parseable.", self.content_type
)
return False
def is_directory(self):
@ -194,12 +203,14 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# download file in BINARY mode
ftpcmd = "RETR %s" % self.filename
buf = StringIO()
def stor_data(s):
"""Helper method storing given data"""
# limit the download size
if (buf.tell() + len(s)) > self.max_size:
raise LinkCheckerError(_("FTP file size too large"))
buf.write(s)
self.url_connection.retrbinary(ftpcmd, stor_data)
data = buf.getvalue()
buf.close()

View file

@ -17,21 +17,33 @@
Handle http links.
"""
import requests
# The validity of SSL certs is ignored to be able
# the check the URL and recurse into it.
# The warning about invalid SSL certs is given to the
# user instead.
import warnings
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
warnings.simplefilter(
'ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning
)
from io import BytesIO
import re
from .. import (log, LOG_CHECK, strformat, mimeutil,
url as urlutil, LinkCheckerError, httputil)
from . import (internpaturl, proxysupport)
from .. import (
log,
LOG_CHECK,
strformat,
mimeutil,
url as urlutil,
LinkCheckerError,
httputil,
)
from . import internpaturl, proxysupport
# import warnings
from .const import (WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED)
from .const import WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED
from requests.sessions import REDIRECT_STATI
# assumed HTTP header encoding
@ -72,9 +84,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
@return: True if access is granted, otherwise False
@rtype: bool
"""
return (not self.aggregate.config['robotstxt']
or self.aggregate.robots_txt.allows_url(
self, timeout=self.aggregate.config["timeout"]))
return not self.aggregate.config[
'robotstxt'
] or self.aggregate.robots_txt.allows_url(
self, timeout=self.aggregate.config["timeout"]
)
def content_allows_robots(self):
"""
@ -89,8 +103,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def add_size_info(self):
"""Get size of URL content from HTTP header."""
if self.headers and "Content-Length" in self.headers and \
"Transfer-Encoding" not in self.headers:
if (
self.headers
and "Content-Length" in self.headers
and "Transfer-Encoding" not in self.headers
):
# Note that content-encoding causes size differences since
# the content data is always decoded.
try:
@ -139,14 +156,9 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def build_request(self):
"""Build a prepared request object."""
clientheaders = {}
if (self.parent_url and
self.parent_url.lower().startswith(HTTP_SCHEMAS)):
if self.parent_url and self.parent_url.lower().startswith(HTTP_SCHEMAS):
clientheaders["Referer"] = self.parent_url
kwargs = dict(
method='GET',
url=self.url,
headers=clientheaders,
)
kwargs = dict(method='GET', url=self.url, headers=clientheaders,)
if self.auth:
kwargs['auth'] = self.auth
log.debug(LOG_CHECK, "Prepare request with %s", kwargs)
@ -223,8 +235,10 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def is_redirect(self):
"""Check if current response is a redirect."""
return ('location' in self.headers and
self.url_connection.status_code in REDIRECT_STATI)
return (
'location' in self.headers
and self.url_connection.status_code in REDIRECT_STATI
)
def get_request_kwargs(self):
"""Construct keyword parameters for Session.request() and
@ -241,8 +255,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def get_redirects(self, request):
"""Return iterator of redirects for given request."""
kwargs = self.get_request_kwargs()
return self.session.resolve_redirects(self.url_connection,
request, **kwargs)
return self.session.resolve_redirects(self.url_connection, request, **kwargs)
def follow_redirections(self, request):
"""Follow all redirections of http response."""
@ -285,21 +298,32 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
def check_response(self):
"""Check final result and log it."""
if self.url_connection.status_code >= 400 and self.url_connection.status_code != 429:
self.set_result("%d %s" % (self.url_connection.status_code, self.url_connection.reason),
valid=False)
if (
self.url_connection.status_code >= 400
and self.url_connection.status_code != 429
):
self.set_result(
"%d %s" % (self.url_connection.status_code, self.url_connection.reason),
valid=False,
)
else:
if self.url_connection.status_code == 204:
# no content
self.add_warning(self.url_connection.reason,
tag=WARN_HTTP_EMPTY_CONTENT)
self.add_warning(
self.url_connection.reason, tag=WARN_HTTP_EMPTY_CONTENT
)
if self.url_connection.status_code == 429:
self.add_warning("Rate limited (Retry-After: %s)" % self.getheader(_("Retry-After")),
tag=WARN_URL_RATE_LIMITED)
self.add_warning(
"Rate limited (Retry-After: %s)" % self.getheader(_("Retry-After")),
tag=WARN_URL_RATE_LIMITED,
)
if self.url_connection.status_code >= 200:
self.set_result("%r %s" % (self.url_connection.status_code, self.url_connection.reason))
self.set_result(
"%r %s"
% (self.url_connection.status_code, self.url_connection.reason)
)
else:
self.set_result(_("OK"))
@ -325,6 +349,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.add_url(url, name=name)
if 'Refresh' in self.headers:
from ..htmlutil.linkparse import refresh_re
value = self.headers['Refresh'].strip()
mo = refresh_re.match(value)
if mo:
@ -352,7 +377,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
# XXX side effect
self.content_type = rtype
if self.content_type not in self.ContentMimetypes:
log.debug(LOG_CHECK, "URL with content type %r is not parseable", self.content_type)
log.debug(
LOG_CHECK,
"URL with content type %r is not parseable",
self.content_type,
)
return False
return True

View file

@ -19,6 +19,7 @@ Handle ignored URLs.
from . import unknownurl
class IgnoreUrl(unknownurl.UnknownUrl):
"""Always ignored URL."""

View file

@ -20,6 +20,7 @@ Handle itms-services URLs.
from . import urlbase
from .. import log, LOG_CHECK
class ItmsServicesUrl(urlbase.UrlBase):
"""Apple iOS application download URLs."""

View file

@ -53,6 +53,8 @@ def is_literal(domain):
_remove_quoted = re.compile(r'\\.').sub
_quotes = re.compile(r'["\\]')
def is_missing_quote(addr):
"""Return True iff mail address is not correctly quoted."""
return _quotes.match(_remove_quoted("", addr[1:-1]))
@ -62,6 +64,7 @@ def is_missing_quote(addr):
EMAIL_CGI_ADDRESS = ("to", "cc", "bcc")
EMAIL_CGI_SUBJECT = "subject"
class MailtoUrl(urlbase.UrlBase):
"""
Url link with mailto scheme.
@ -81,8 +84,10 @@ class MailtoUrl(urlbase.UrlBase):
if not self.valid:
break
elif not self.subject:
self.add_warning(_("No mail addresses or email subject found in `%(url)s'.") % \
{"url": self.url})
self.add_warning(
_("No mail addresses or email subject found in `%(url)s'.")
% {"url": self.url}
)
def parse_addresses(self):
"""Parse all mail addresses out of the URL target. Also parses
@ -92,7 +97,7 @@ class MailtoUrl(urlbase.UrlBase):
# cut off leading mailto: and unquote
url = urllib.parse.unquote(self.base_url[7:], self.encoding)
# search for cc, bcc, to and store in headers
mode = 0 # 0=default, 1=quote, 2=esc
mode = 0 # 0=default, 1=quote, 2=esc
quote = None
i = 0
for i, c in enumerate(url):
@ -104,7 +109,7 @@ class MailtoUrl(urlbase.UrlBase):
mode = 1
elif c == '\\':
mode = 2
elif mode==1:
elif mode == 1:
if c == '"' and quote == '"':
mode = 0
elif c == '>' and quote == '<':
@ -114,11 +119,13 @@ class MailtoUrl(urlbase.UrlBase):
if i < (len(url) - 1):
self.addresses.update(getaddresses(url[:i]))
try:
headers = urllib.parse.parse_qs(url[(i+1):], strict_parsing=True)
headers = urllib.parse.parse_qs(url[(i + 1) :], strict_parsing=True)
for key, vals in headers.items():
if key.lower() in EMAIL_CGI_ADDRESS:
# Only the first header value is added
self.addresses.update(getaddresses(urllib.parse.unquote(vals[0], self.encoding)))
self.addresses.update(
getaddresses(urllib.parse.unquote(vals[0], self.encoding))
)
if key.lower() == EMAIL_CGI_SUBJECT:
self.subject = vals[0]
except ValueError as err:
@ -145,30 +152,57 @@ class MailtoUrl(urlbase.UrlBase):
# restrict email length to 256 characters
# http://www.rfc-editor.org/errata_search.php?eid=1003
if len(mail) > 256:
self.set_result(_("Mail address `%(addr)s' too long. Allowed 256 chars, was %(length)d chars.") % \
{"addr": mail, "length": len(mail)}, valid=False, overwrite=False)
self.set_result(
_(
"Mail address `%(addr)s' too long. Allowed 256 chars, was %(length)d chars."
)
% {"addr": mail, "length": len(mail)},
valid=False,
overwrite=False,
)
return
if "@" not in mail:
self.set_result(_("Missing `@' in mail address `%(addr)s'.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Missing `@' in mail address `%(addr)s'.") % {"addr": mail},
valid=False,
overwrite=False,
)
return
# note: be sure to use rsplit since "@" can occur in local part
local, domain = mail.rsplit("@", 1)
if not local:
self.set_result(_("Missing local part of mail address `%(addr)s'.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Missing local part of mail address `%(addr)s'.") % {"addr": mail},
valid=False,
overwrite=False,
)
return
if not domain:
self.set_result(_("Missing domain part of mail address `%(addr)s'.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Missing domain part of mail address `%(addr)s'.") % {"addr": mail},
valid=False,
overwrite=False,
)
return
if len(local) > 64:
self.set_result(_("Local part of mail address `%(addr)s' too long. Allowed 64 chars, was %(length)d chars.") % \
{"addr": mail, "length": len(local)}, valid=False, overwrite=False)
self.set_result(
_(
"Local part of mail address `%(addr)s' too long. Allowed 64 chars, was %(length)d chars."
)
% {"addr": mail, "length": len(local)},
valid=False,
overwrite=False,
)
return
if len(domain) > 255:
self.set_result(_("Domain part of mail address `%(addr)s' too long. Allowed 255 chars, was %(length)d chars.") % \
{"addr": mail, "length": len(local)}, valid=False, overwrite=False)
self.set_result(
_(
"Domain part of mail address `%(addr)s' too long. Allowed 255 chars, was %(length)d chars."
)
% {"addr": mail, "length": len(local)},
valid=False,
overwrite=False,
)
return
# local part syntax check
@ -176,26 +210,48 @@ class MailtoUrl(urlbase.UrlBase):
# Rules taken from http://tools.ietf.org/html/rfc3696#section-3
if is_quoted(local):
if is_missing_quote(local):
self.set_result(_("Unquoted double quote or backslash in mail address `%(addr)s'.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Unquoted double quote or backslash in mail address `%(addr)s'.")
% {"addr": mail},
valid=False,
overwrite=False,
)
return
else:
if local.startswith("."):
self.set_result(_("Local part of mail address `%(addr)s' may not start with a dot.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Local part of mail address `%(addr)s' may not start with a dot.")
% {"addr": mail},
valid=False,
overwrite=False,
)
return
if local.endswith("."):
self.set_result(_("Local part of mail address `%(addr)s' may not end with a dot.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Local part of mail address `%(addr)s' may not end with a dot.")
% {"addr": mail},
valid=False,
overwrite=False,
)
return
if ".." in local:
self.set_result(_("Local part of mail address `%(addr)s' may not contain two dots.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Local part of mail address `%(addr)s' may not contain two dots.")
% {"addr": mail},
valid=False,
overwrite=False,
)
return
for char in '@ \\",[]':
if char in local.replace("\\%s"%char, ""):
self.set_result(_("Local part of mail address `%(addr)s' contains unquoted character `%(char)s.") % \
{"addr": mail, "char": char}, valid=False, overwrite=False)
if char in local.replace("\\%s" % char, ""):
self.set_result(
_(
"Local part of mail address `%(addr)s' contains unquoted character `%(char)s."
)
% {"addr": mail, "char": char},
valid=False,
overwrite=False,
)
return
# domain part syntax check
@ -206,18 +262,30 @@ class MailtoUrl(urlbase.UrlBase):
if ip.startswith("IPv6:"):
ip = ip[5:]
if not iputil.is_valid_ip(ip):
self.set_result(_("Domain part of mail address `%(addr)s' has invalid IP.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Domain part of mail address `%(addr)s' has invalid IP.")
% {"addr": mail},
valid=False,
overwrite=False,
)
return
else:
# it's a domain name
if not urlutil.is_safe_domain(domain):
self.set_result(_("Invalid domain part of mail address `%(addr)s'.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Invalid domain part of mail address `%(addr)s'.")
% {"addr": mail},
valid=False,
overwrite=False,
)
return
if domain.endswith(".") or domain.split(".")[-1].isdigit():
self.set_result(_("Invalid top level domain part of mail address `%(addr)s'.") % \
{"addr": mail}, valid=False, overwrite=False)
self.set_result(
_("Invalid top level domain part of mail address `%(addr)s'.")
% {"addr": mail},
valid=False,
overwrite=False,
)
return
def check_connection(self):
@ -240,6 +308,7 @@ class MailtoUrl(urlbase.UrlBase):
Check a single mail address.
"""
from dns.exception import DNSException
log.debug(LOG_CHECK, "checking mail address %r", mail)
mail = strformat.ascii_safe(mail)
username, domain = mail.rsplit('@', 1)
@ -249,31 +318,38 @@ class MailtoUrl(urlbase.UrlBase):
except DNSException:
answers = []
if len(answers) == 0:
self.add_warning(_("No MX mail host for %(domain)s found.") %
{'domain': domain},
tag=WARN_MAIL_NO_MX_HOST)
self.add_warning(
_("No MX mail host for %(domain)s found.") % {'domain': domain},
tag=WARN_MAIL_NO_MX_HOST,
)
try:
answers = resolver.query(domain, 'A')
except DNSException:
answers = []
if len(answers) == 0:
self.set_result(_("No host for %(domain)s found.") %
{'domain': domain}, valid=False,
overwrite=True)
self.set_result(
_("No host for %(domain)s found.") % {'domain': domain},
valid=False,
overwrite=True,
)
return
# set preference to zero
mxdata = [(0, rdata.to_text(omit_final_dot=True))
for rdata in answers]
mxdata = [(0, rdata.to_text(omit_final_dot=True)) for rdata in answers]
else:
from dns.rdtypes.mxbase import MXBase
mxdata = [(rdata.preference,
rdata.exchange.to_text(omit_final_dot=True))
for rdata in answers if isinstance(rdata, MXBase)]
mxdata = [
(rdata.preference, rdata.exchange.to_text(omit_final_dot=True))
for rdata in answers
if isinstance(rdata, MXBase)
]
if not mxdata:
self.set_result(
_("Got invalid DNS answer %(answer)s for %(domain)s.") %
{'answer': answers, 'domain': domain}, valid=False,
overwrite=True)
_("Got invalid DNS answer %(answer)s for %(domain)s.")
% {'answer': answers, 'domain': domain},
valid=False,
overwrite=True,
)
return
# sort according to preference (lower preference means this
# host should be preferred)

View file

@ -28,6 +28,7 @@ from .const import WARN_NNTP_NO_SERVER, WARN_NNTP_NO_NEWSGROUP
random.seed()
class NntpUrl(urlbase.UrlBase):
"""
Url link with NNTP scheme.
@ -41,8 +42,9 @@ class NntpUrl(urlbase.UrlBase):
nntpserver = self.host or self.aggregate.config["nntpserver"]
if not nntpserver:
self.add_warning(
_("No NNTP server was specified, skipping this URL."),
tag=WARN_NNTP_NO_SERVER)
_("No NNTP server was specified, skipping this URL."),
tag=WARN_NNTP_NO_SERVER,
)
return
nntp = self._connect_nntp(nntpserver)
group = self.urlparts[2]
@ -50,7 +52,7 @@ class NntpUrl(urlbase.UrlBase):
group = group[1:]
if '@' in group:
# request article info (resp, number mid)
number = nntp.stat("<"+group+">")[1]
number = nntp.stat("<" + group + ">")[1]
self.add_info(_('Article number %(num)s found.') % {"num": number})
else:
# split off trailing articel span
@ -61,8 +63,9 @@ class NntpUrl(urlbase.UrlBase):
self.add_info(_("News group %(name)s found.") % {"name": name})
else:
# group name is the empty string
self.add_warning(_("No newsgroup specified in NNTP URL."),
tag=WARN_NNTP_NO_NEWSGROUP)
self.add_warning(
_("No newsgroup specified in NNTP URL."), tag=WARN_NNTP_NO_NEWSGROUP
)
def _connect_nntp(self, nntpserver):
"""
@ -85,7 +88,8 @@ class NntpUrl(urlbase.UrlBase):
raise
if nntp is None:
raise LinkCheckerError(
_("NNTP server too busy; tried more than %d times.") % tries)
_("NNTP server too busy; tried more than %d times.") % tries
)
if log.is_debug(LOG_CHECK):
nntp.set_debuglevel(1)
self.add_info(nntp.getwelcome())

View file

@ -40,14 +40,14 @@ class ProxySupport:
if self.proxytype not in ('http', 'https'):
# Note that invalid proxies might raise TypeError in urllib2,
# so make sure to stop checking at this point, not later.
msg = _("Proxy value `%(proxy)s' must start with 'http:' or 'https:'.") \
% dict(proxy=proxy)
msg = _(
"Proxy value `%(proxy)s' must start with 'http:' or 'https:'."
) % dict(proxy=proxy)
raise LinkCheckerError(msg)
if self.ignore_proxy_host():
# log proxy without auth info
log.debug(LOG_CHECK, "ignoring proxy %r", self.proxy)
self.add_info(_("Ignoring proxy setting `%(proxy)s'.") %
dict(proxy=proxy))
self.add_info(_("Ignoring proxy setting `%(proxy)s'.") % dict(proxy=proxy))
self.proxy = None
return
log.debug(LOG_CHECK, "using proxy %r", self.proxy)
@ -58,7 +58,7 @@ class ProxySupport:
username = proxyurl.username
password = proxyurl.password if proxy.password is not None else ""
auth = "%s:%s" % (username, password)
self.proxyauth = "Basic "+httputil.encode_base64(auth)
self.proxyauth = "Basic " + httputil.encode_base64(auth)
def ignore_proxy_host(self):
"""Check if self.host is in the $no_proxy ignore list."""

View file

@ -64,10 +64,10 @@ class TelnetUrl(urlbase.UrlBase):
self.url_connection.open(self.host, self.port)
if self.user:
self.url_connection.read_until(b"login: ", 10)
self.url_connection.write(encode(self.user)+b"\n")
self.url_connection.write(encode(self.user) + b"\n")
if self.password:
self.url_connection.read_until(b"Password: ", 10)
self.url_connection.write(encode(self.password)+b"\n")
self.url_connection.write(encode(self.password) + b"\n")
# XXX how to tell if we are logged in??
self.url_connection.write(b"exit\n")

View file

@ -28,12 +28,12 @@ class UnknownUrl(urlbase.UrlBase):
"""Only logs that this URL is unknown."""
super(UnknownUrl, self).build_url()
if self.is_ignored():
self.add_info(_("%(scheme)s URL ignored.") %
{"scheme": self.scheme.capitalize()})
self.add_info(
_("%(scheme)s URL ignored.") % {"scheme": self.scheme.capitalize()}
)
self.set_result(_("ignored"))
else:
self.set_result(_("URL is unrecognized or has invalid syntax"),
valid=False)
self.set_result(_("URL is unrecognized or has invalid syntax"), valid=False)
def is_ignored(self):
"""Return True if this URL scheme is ignored."""
@ -260,4 +260,3 @@ ignored_schemes = "^(%s%s%s%s)$" % (
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
is_unknown_scheme = ignored_schemes_re.match

View file

@ -27,15 +27,30 @@ import select
from io import BytesIO
from . import absolute_url, get_url_from
from .. import (log, LOG_CHECK,
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
from .. import (
log,
LOG_CHECK,
strformat,
LinkCheckerError,
url as urlutil,
trace,
get_link_pat,
)
from ..htmlutil import htmlsoup
from ..network import iputil
from .const import (WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE,
WARN_URL_WHITESPACE, URL_MAX_LENGTH, WARN_URL_TOO_LONG,
ExcList, ExcSyntaxList, ExcNoCacheList)
from .const import (
WARN_URL_EFFECTIVE_URL,
WARN_URL_ERROR_GETTING_CONTENT,
WARN_URL_OBFUSCATED_IP,
WARN_URL_CONTENT_SIZE_ZERO,
WARN_URL_CONTENT_SIZE_TOO_LARGE,
WARN_URL_WHITESPACE,
URL_MAX_LENGTH,
WARN_URL_TOO_LONG,
ExcList,
ExcSyntaxList,
ExcNoCacheList,
)
from ..url import url_fix_wayback_query
# helper alias
@ -44,6 +59,7 @@ unicode_safe = strformat.unicode_safe
# schemes that are invalid with an empty hostname
scheme_requires_host = ("ftp", "http", "telnet")
def urljoin(parent, url):
"""
If url is relative, join parent and url. Else leave url as-is.
@ -61,8 +77,9 @@ def url_norm(url, encoding):
try:
return urlutil.url_norm(url, encoding=encoding)
except UnicodeError:
msg = _("URL has unparsable domain name: %(name)s") % \
{"name": sys.exc_info()[1]}
msg = _("URL has unparsable domain name: %(name)s") % {
"name": sys.exc_info()[1]
}
raise LinkCheckerError(msg)
@ -92,11 +109,22 @@ class UrlBase:
}
# Read in 16kb chunks
ReadChunkBytes = 1024*16
ReadChunkBytes = 1024 * 16
def __init__(self, base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=-1, column=-1, page=-1,
name="", url_encoding=None, extern=None):
def __init__(
self,
base_url,
recursion_level,
aggregate,
parent_url=None,
base_ref=None,
line=-1,
column=-1,
page=-1,
name="",
url_encoding=None,
extern=None,
):
"""
Initialize check data, and store given variables.
@ -113,20 +141,44 @@ class UrlBase:
@param extern: None or (is_extern, is_strict)
"""
self.reset()
self.init(base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, page, name, url_encoding, extern)
self.init(
base_ref,
base_url,
parent_url,
recursion_level,
aggregate,
line,
column,
page,
name,
url_encoding,
extern,
)
self.check_syntax()
if recursion_level == 0:
self.add_intern_pattern()
self.set_extern(self.url)
if self.extern[0] and self.extern[1]:
self.add_info(_("The URL is outside of the domain "
"filter, checked only syntax."))
self.add_info(
_("The URL is outside of the domain " "filter, checked only syntax.")
)
if not self.has_result:
self.set_result(_("filtered"))
def init(self, base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, page, name, url_encoding, extern):
def init(
self,
base_ref,
base_url,
parent_url,
recursion_level,
aggregate,
line,
column,
page,
name,
url_encoding,
extern,
):
"""
Initialize internal data.
"""
@ -149,17 +201,22 @@ class UrlBase:
self.encoding = url_encoding
self.extern = extern
if self.base_ref:
assert not urlutil.url_needs_quoting(self.base_ref), \
"unquoted base reference URL %r" % self.base_ref
assert not urlutil.url_needs_quoting(self.base_ref), (
"unquoted base reference URL %r" % self.base_ref
)
if self.parent_url:
assert not urlutil.url_needs_quoting(self.parent_url), \
"unquoted parent URL %r" % self.parent_url
assert not urlutil.url_needs_quoting(self.parent_url), (
"unquoted parent URL %r" % self.parent_url
)
url = absolute_url(self.base_url, base_ref, parent_url)
# assume file link if no scheme is found
self.scheme = url.split(":", 1)[0].lower() or "file"
if self.base_url != base_url:
self.add_warning(_("Leading or trailing whitespace in URL `%(url)s'.") %
{"url": base_url}, tag=WARN_URL_WHITESPACE)
self.add_warning(
_("Leading or trailing whitespace in URL `%(url)s'.")
% {"url": base_url},
tag=WARN_URL_WHITESPACE,
)
def reset(self):
"""
@ -219,8 +276,13 @@ class UrlBase:
Set result string and validity.
"""
if self.has_result and not overwrite:
log.warn(LOG_CHECK,
"Double result %r (previous %r) for %s", msg, self.result, self)
log.warn(
LOG_CHECK,
"Double result %r (previous %r) for %s",
msg,
self.result,
self,
)
else:
self.has_result = True
if not msg:
@ -288,8 +350,10 @@ class UrlBase:
Add a warning string.
"""
item = (tag, s)
if item not in self.warnings and \
tag not in self.aggregate.config["ignorewarnings"]:
if (
item not in self.warnings
and tag not in self.aggregate.config["ignorewarnings"]
):
self.warnings.append(item)
def add_info(self, s):
@ -303,7 +367,7 @@ class UrlBase:
"""Set the URL to be used for caching."""
# remove anchor from cached target url since we assume
# URLs with different anchors to have the same content
self.cache_url = urlutil.urlunsplit(self.urlparts[:4]+[''])
self.cache_url = urlutil.urlunsplit(self.urlparts[:4] + [''])
if self.cache_url is not None:
assert isinstance(self.cache_url, str), repr(self.cache_url)
@ -332,13 +396,17 @@ class UrlBase:
"""Check URL name and length."""
effectiveurl = urlutil.urlunsplit(self.urlparts)
if self.url != effectiveurl:
self.add_warning(_("Effective URL %(url)r.") %
{"url": effectiveurl},
tag=WARN_URL_EFFECTIVE_URL)
self.add_warning(
_("Effective URL %(url)r.") % {"url": effectiveurl},
tag=WARN_URL_EFFECTIVE_URL,
)
self.url = effectiveurl
if len(self.url) > URL_MAX_LENGTH and self.scheme != "data":
args = dict(len=len(self.url), max=URL_MAX_LENGTH)
self.add_warning(_("URL length %(len)d is longer than %(max)d.") % args, tag=WARN_URL_TOO_LONG)
self.add_warning(
_("URL length %(len)d is longer than %(max)d.") % args,
tag=WARN_URL_TOO_LONG,
)
def build_url(self):
"""
@ -367,7 +435,9 @@ class UrlBase:
if urlparts[2]:
urlparts[2] = urlutil.collapse_segments(urlparts[2])
if not urlparts[0].startswith("feed"):
urlparts[2] = url_fix_wayback_query(urlparts[2]) # restore second / in http[s]:// in wayback path
urlparts[2] = url_fix_wayback_query(
urlparts[2]
) # restore second / in http[s]:// in wayback path
self.url = urlutil.urlunsplit(urlparts)
# split into (modifiable) list
self.urlparts = strformat.url_unicode_split(self.url)
@ -384,8 +454,9 @@ class UrlBase:
port = urlutil.default_ports.get(self.scheme, 0)
host, port = urlutil.splitport(host, port=port)
if port is None:
raise LinkCheckerError(_("URL host %(host)r has invalid port") %
{"host": host})
raise LinkCheckerError(
_("URL host %(host)r has invalid port") % {"host": host}
)
self.port = port
# set host lowercase
self.host = host.lower()
@ -415,9 +486,10 @@ class UrlBase:
if ips:
self.host = ips[0]
self.add_warning(
_("URL %(url)s has obfuscated IP address %(ip)s") % \
{"url": self.base_url, "ip": ips[0]},
tag=WARN_URL_OBFUSCATED_IP)
_("URL %(url)s has obfuscated IP address %(ip)s")
% {"url": self.base_url, "ip": ips[0]},
tag=WARN_URL_OBFUSCATED_IP,
)
def check(self):
"""Main check function for checking this URL."""
@ -453,7 +525,10 @@ class UrlBase:
value = _('Hostname not found')
elif isinstance(exc, UnicodeError):
# idna.encode(host) failed
value = _('Bad hostname %(host)r: %(msg)s') % {'host': self.host, 'msg': value}
value = _('Bad hostname %(host)r: %(msg)s') % {
'host': self.host,
'msg': value,
}
self.set_result(unicode_safe(value), valid=False)
def check_content(self):
@ -469,8 +544,10 @@ class UrlBase:
return True
except tuple(ExcList):
value = self.handle_exception()
self.add_warning(_("could not get content: %(msg)s") %
{"msg": value}, tag=WARN_URL_ERROR_GETTING_CONTENT)
self.add_warning(
_("could not get content: %(msg)s") % {"msg": value},
tag=WARN_URL_ERROR_GETTING_CONTENT,
)
return False
def close_connection(self):
@ -492,11 +569,15 @@ class UrlBase:
An exception occurred. Log it and set the cache flag.
"""
etype, evalue = sys.exc_info()[:2]
log.debug(LOG_CHECK, "Error in %s: %s %s", self.url, etype, evalue, exception=True)
log.debug(
LOG_CHECK, "Error in %s: %s %s", self.url, etype, evalue, exception=True
)
# note: etype must be the exact class, not a subclass
if (etype in ExcNoCacheList) or \
(etype == socket.error and evalue.args[0]==errno.EBADF) or \
not evalue:
if (
(etype in ExcNoCacheList)
or (etype == socket.error and evalue.args[0] == errno.EBADF)
or not evalue
):
# EBADF occurs when operating on an already socket
self.caching = False
# format message "<exception name>: <error message>"
@ -519,10 +600,13 @@ class UrlBase:
maxbytes = self.aggregate.config["maxfilesizedownload"]
if self.size > maxbytes:
self.add_warning(
_("Content size %(size)s is larger than %(maxbytes)s.") %
dict(size=strformat.strsize(self.size),
maxbytes=strformat.strsize(maxbytes)),
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
_("Content size %(size)s is larger than %(maxbytes)s.")
% dict(
size=strformat.strsize(self.size),
maxbytes=strformat.strsize(maxbytes),
),
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE,
)
def allows_simple_recursion(self):
"""Check recursion level and extern status."""
@ -579,15 +663,13 @@ class UrlBase:
return
for entry in self.aggregate.config["externlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
if (entry['negate'] and not match) or (match and not entry['negate']):
log.debug(LOG_CHECK, "Extern URL %r", url)
self.extern = (1, entry['strict'])
return
for entry in self.aggregate.config["internlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
if (entry['negate'] and not match) or (match and not entry['negate']):
log.debug(LOG_CHECK, "Intern URL %r", url)
self.extern = (0, 0)
return
@ -612,8 +694,7 @@ class UrlBase:
self.size = len(content)
self.dltime = time.time() - t
if self.size == 0:
self.add_warning(_("Content size is zero."),
tag=WARN_URL_CONTENT_SIZE_ZERO)
self.add_warning(_("Content size is zero."), tag=WARN_URL_CONTENT_SIZE_ZERO)
else:
self.aggregate.add_downloaded_bytes(self.size)
return content
@ -636,8 +717,9 @@ class UrlBase:
# than an internal crash, eh? ISO-8859-1 is a safe fallback in the
# sense that any binary blob can be decoded, it'll never cause a
# UnicodeDecodeError.
log.debug(LOG_CHECK, "Beautiful Soup detected %s",
self.soup.original_encoding)
log.debug(
LOG_CHECK, "Beautiful Soup detected %s", self.soup.original_encoding
)
self.encoding = self.soup.original_encoding or 'ISO-8859-1'
log.debug(LOG_CHECK, "Content encoding %s", self.encoding)
self.text = self.data.decode(self.encoding)
@ -675,29 +757,41 @@ class UrlBase:
base_ref = urlutil.url_norm(base, encoding=self.encoding)[0]
else:
base_ref = None
url_data = get_url_from(url, self.recursion_level+1, self.aggregate,
parent_url=self.url, base_ref=base_ref, line=line, column=column,
page=page, name=name, parent_content_type=self.content_type, url_encoding=self.encoding)
url_data = get_url_from(
url,
self.recursion_level + 1,
self.aggregate,
parent_url=self.url,
base_ref=base_ref,
line=line,
column=column,
page=page,
name=name,
parent_content_type=self.content_type,
url_encoding=self.encoding,
)
self.aggregate.urlqueue.put(url_data)
def serialized(self, sep=os.linesep):
"""
Return serialized url check data as unicode string.
"""
return unicode_safe(sep).join([
"%s link" % self.scheme,
"base_url=%r" % self.base_url,
"parent_url=%r" % self.parent_url,
"base_ref=%r" % self.base_ref,
"recursion_level=%d" % self.recursion_level,
"url_connection=%s" % self.url_connection,
"line=%s" % self.line,
"column=%s" % self.column,
"page=%d" % self.page,
"name=%r" % self.name,
"anchor=%r" % self.anchor,
"cache_url=%s" % self.cache_url,
])
return unicode_safe(sep).join(
[
"%s link" % self.scheme,
"base_url=%r" % self.base_url,
"parent_url=%r" % self.parent_url,
"base_ref=%r" % self.base_ref,
"recursion_level=%d" % self.recursion_level,
"url_connection=%s" % self.url_connection,
"line=%s" % self.line,
"column=%s" % self.column,
"page=%d" % self.page,
"name=%r" % self.name,
"anchor=%r" % self.anchor,
"cache_url=%s" % self.cache_url,
]
)
def get_intern_pattern(self, url=None):
"""Get pattern for intern URL matching.
@ -717,8 +811,7 @@ class UrlBase:
log.debug(LOG_CHECK, "Add intern pattern %r", pat)
self.aggregate.config['internlinks'].append(get_link_pat(pat))
except UnicodeError as msg:
res = _("URL has unparsable domain name: %(domain)s") % \
{"domain": msg}
res = _("URL has unparsable domain name: %(domain)s") % {"domain": msg}
self.set_result(res, valid=False)
def __str__(self):
@ -792,28 +885,29 @@ class UrlBase:
- url_data.last_modified: datetime
Last modification date of retrieved page (or None).
"""
return dict(valid=self.valid,
extern=self.extern[0],
result=self.result,
warnings=self.warnings[:],
name=self.name or "",
title=self.get_title(),
parent_url=self.parent_url or "",
base_ref=self.base_ref or "",
base_url=self.base_url or "",
url=self.url or "",
domain=(self.urlparts[1] if self.urlparts else ""),
checktime=self.checktime,
dltime=self.dltime,
size=self.size,
info=self.info,
line=self.line,
column=self.column,
page=self.page,
cache_url=self.cache_url,
content_type=self.content_type,
level=self.recursion_level,
modified=self.modified,
return dict(
valid=self.valid,
extern=self.extern[0],
result=self.result,
warnings=self.warnings[:],
name=self.name or "",
title=self.get_title(),
parent_url=self.parent_url or "",
base_ref=self.base_ref or "",
base_url=self.base_url or "",
url=self.url or "",
domain=(self.urlparts[1] if self.urlparts else ""),
checktime=self.checktime,
dltime=self.dltime,
size=self.size,
info=self.info,
line=self.line,
column=self.column,
page=self.page,
cache_url=self.cache_url,
content_type=self.content_type,
level=self.recursion_level,
modified=self.modified,
)
def to_wire(self):
@ -847,8 +941,10 @@ urlDataAttr = [
'level',
]
class CompactUrlData:
"""Store selected UrlData attributes in slots to minimize memory usage."""
__slots__ = urlDataAttr
def __init__(self, wired_url_data):

View file

@ -43,7 +43,9 @@ def print_version(exit_code=0):
def print_plugins(folders, exit_code=0):
"""Print available plugins and exit."""
modules = plugins.get_plugin_modules(folders)
pluginclasses = sorted(plugins.get_plugin_classes(modules), key=lambda x: x.__name__)
pluginclasses = sorted(
plugins.get_plugin_classes(modules), key=lambda x: x.__name__
)
for pluginclass in pluginclasses:
print(pluginclass.__name__)
@ -57,7 +59,10 @@ def print_usage(msg, exit_code=2):
"""Print a program msg text to stderr and exit."""
program = sys.argv[0]
print(_("Error: %(msg)s") % {"msg": msg}, file=console.stderr)
print(_("Execute '%(program)s -h' for help") % {"program": program}, file=console.stderr)
print(
_("Execute '%(program)s -h' for help") % {"program": program},
file=console.stderr,
)
sys.exit(exit_code)

View file

@ -26,8 +26,17 @@
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from ctypes import (windll, byref, Structure, c_char, c_short, c_uint32,
c_ushort, ArgumentError, WinError)
from ctypes import (
windll,
byref,
Structure,
c_char,
c_short,
c_uint32,
c_ushort,
ArgumentError,
WinError,
)
# from winbase.h
STDOUT = -11
@ -43,15 +52,19 @@ WORD = c_ushort
DWORD = c_uint32
TCHAR = c_char
class COORD(Structure):
"""struct in wincon.h"""
_fields_ = [
('X', SHORT),
('Y', SHORT),
]
class SMALL_RECT(Structure):
class SMALL_RECT(Structure):
"""struct in wincon.h."""
_fields_ = [
("Left", SHORT),
("Top", SHORT),
@ -59,8 +72,10 @@ class SMALL_RECT(Structure):
("Bottom", SHORT),
]
class CONSOLE_SCREEN_BUFFER_INFO(Structure):
"""struct in wincon.h."""
_fields_ = [
("dwSize", COORD),
("dwCursorPosition", COORD),
@ -68,22 +83,29 @@ class CONSOLE_SCREEN_BUFFER_INFO(Structure):
("srWindow", SMALL_RECT),
("dwMaximumWindowSize", COORD),
]
def __str__(self):
"""Get string representation of console screen buffer info."""
return '(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)' % (
self.dwSize.Y, self.dwSize.X
, self.dwCursorPosition.Y, self.dwCursorPosition.X
, self.wAttributes
, self.srWindow.Top, self.srWindow.Left, self.srWindow.Bottom, self.srWindow.Right
, self.dwMaximumWindowSize.Y, self.dwMaximumWindowSize.X
self.dwSize.Y,
self.dwSize.X,
self.dwCursorPosition.Y,
self.dwCursorPosition.X,
self.wAttributes,
self.srWindow.Top,
self.srWindow.Left,
self.srWindow.Bottom,
self.srWindow.Right,
self.dwMaximumWindowSize.Y,
self.dwMaximumWindowSize.X,
)
def GetConsoleScreenBufferInfo(stream_id=STDOUT):
"""Get console screen buffer info object."""
handle = handles[stream_id]
csbi = CONSOLE_SCREEN_BUFFER_INFO()
success = windll.kernel32.GetConsoleScreenBufferInfo(
handle, byref(csbi))
success = windll.kernel32.GetConsoleScreenBufferInfo(handle, byref(csbi))
if not success:
raise WinError()
return csbi
@ -96,18 +118,18 @@ def SetConsoleTextAttribute(stream_id, attrs):
# from wincon.h
BLACK = 0
BLUE = 1
GREEN = 2
CYAN = 3
RED = 4
BLACK = 0
BLUE = 1
GREEN = 2
CYAN = 3
RED = 4
MAGENTA = 5
YELLOW = 6
GREY = 7
YELLOW = 6
GREY = 7
# from wincon.h
NORMAL = 0x00 # dim text, dim background
BRIGHT = 0x08 # bright text, dim background
NORMAL = 0x00 # dim text, dim background
BRIGHT = 0x08 # bright text, dim background
_default_foreground = None
_default_background = None

View file

@ -25,30 +25,34 @@ import urllib.request
import shutil
import socket
import _LinkChecker_configdata as configdata
from .. import (log, LOG_CHECK, get_install_data, fileutil)
from .. import log, LOG_CHECK, get_install_data, fileutil
from . import confparse
from xdg.BaseDirectory import xdg_config_home, xdg_data_home
Version = configdata.version
ReleaseDate = configdata.release_date
AppName = configdata.name
App = AppName+" "+Version
App = AppName + " " + Version
Author = configdata.author
HtmlAuthor = Author.replace(' ', '&nbsp;')
Copyright = "Copyright (C) 2000-2014 "+Author
HtmlCopyright = "Copyright &copy; 2000-2014 "+HtmlAuthor
AppInfo = App+" "+Copyright
HtmlAppInfo = App+", "+HtmlCopyright
Copyright = "Copyright (C) 2000-2014 " + Author
HtmlCopyright = "Copyright &copy; 2000-2014 " + HtmlAuthor
AppInfo = App + " " + Copyright
HtmlAppInfo = App + ", " + HtmlCopyright
Url = configdata.url
SupportUrl = "https://github.com/linkchecker/linkchecker/issues"
Email = configdata.author_email
UserAgent = "Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url)
Freeware = AppName+""" comes with ABSOLUTELY NO WARRANTY!
Freeware = (
AppName
+ """ comes with ABSOLUTELY NO WARRANTY!
This is free software, and you are welcome to redistribute it
under certain conditions. Look at the file `LICENSE' within this
distribution."""
)
Portable = configdata.portable
def normpath(path):
"""Norm given system path with all available norm or expand functions
in os.path."""
@ -58,18 +62,19 @@ def normpath(path):
# List Python modules in the form (module, name, version attribute)
Modules = (
# required modules
# required modules
("requests", "Requests", "__version__"),
# optional modules
# optional modules
("argcomplete", "Argcomplete", None),
("GeoIP", "GeoIP", 'lib_version'), # on Unix systems
("pygeoip", "GeoIP", 'lib_version'), # on Windows systems
("GeoIP", "GeoIP", 'lib_version'), # on Unix systems
("pygeoip", "GeoIP", 'lib_version'), # on Windows systems
("sqlite3", "Pysqlite", 'version'),
("sqlite3", "Sqlite", 'sqlite_version'),
("gconf", "Gconf", '__version__'),
("meliae", "Meliae", '__version__'),
)
def get_modules_info():
"""Return unicode string with detected module info."""
module_infos = []
@ -136,6 +141,7 @@ def get_certifi_file():
the file is not found
"""
import certifi
filename = certifi.where()
if os.path.isfile(filename):
return filename
@ -161,8 +167,8 @@ class Configuration(dict):
self['robotstxt'] = True
self["debugmemory"] = False
self["localwebroot"] = None
self["maxfilesizeparse"] = 1*1024*1024
self["maxfilesizedownload"] = 5*1024*1024
self["maxfilesizeparse"] = 1 * 1024 * 1024
self["maxfilesizedownload"] = 5 * 1024 * 1024
self["maxnumurls"] = None
self["maxrunseconds"] = None
self["maxrequestspersecond"] = 10
@ -201,6 +207,7 @@ class Configuration(dict):
self['logger'] = None
self.loggers = {}
from ..logger import LoggerClasses
for c in LoggerClasses:
key = c.LoggerName
self[key] = {}
@ -250,14 +257,11 @@ class Configuration(dict):
def add_auth(self, user=None, password=None, pattern=None):
"""Add given authentication data."""
if not user or not pattern:
log.warn(LOG_CHECK,
_("missing user or URL pattern in authentication data."))
log.warn(
LOG_CHECK, _("missing user or URL pattern in authentication data.")
)
return
entry = dict(
user=user,
password=password,
pattern=re.compile(pattern),
)
entry = dict(user=user, password=password, pattern=re.compile(pattern),)
self["authentication"].append(entry)
def get_user_password(self, url):
@ -299,16 +303,16 @@ class Configuration(dict):
url = self["loginurl"]
disable = False
if not self["loginpasswordfield"]:
log.warn(LOG_CHECK,
_("no CGI password fieldname given for login URL."))
log.warn(LOG_CHECK, _("no CGI password fieldname given for login URL."))
disable = True
if not self["loginuserfield"]:
log.warn(LOG_CHECK,
_("no CGI user fieldname given for login URL."))
log.warn(LOG_CHECK, _("no CGI user fieldname given for login URL."))
disable = True
if self.get_user_password(url) == (None, None):
log.warn(LOG_CHECK,
_("no user/password authentication data found for login URL."))
log.warn(
LOG_CHECK,
_("no user/password authentication data found for login URL."),
)
disable = True
if not url.lower().startswith(("http:", "https:")):
log.warn(LOG_CHECK, _("login URL is not a HTTP URL."))
@ -318,8 +322,7 @@ class Configuration(dict):
log.warn(LOG_CHECK, _("login URL is incomplete."))
disable = True
if disable:
log.warn(LOG_CHECK,
_("disabling login URL %(url)s.") % {"url": url})
log.warn(LOG_CHECK, _("disabling login URL %(url)s.") % {"url": url})
self["loginurl"] = None
def sanitize_proxies(self):
@ -366,10 +369,14 @@ def get_user_data():
@rtype string
"""
homedotdir = normpath("~/.linkchecker/")
userdata = homedotdir if os.path.isdir(homedotdir) \
userdata = (
homedotdir
if os.path.isdir(homedotdir)
else os.path.join(xdg_data_home, "linkchecker")
)
return userdata
def get_plugin_folders():
"""Get linkchecker plugin folders. Default is
"$XDG_DATA_HOME/linkchecker/plugins/". "~/.linkchecker/plugins/" is also
@ -413,16 +420,20 @@ def get_user_config():
initialconf = normpath(os.path.join(get_share_dir(), "linkcheckerrc"))
# per user config settings
homedotfile = normpath("~/.linkchecker/linkcheckerrc")
userconf = homedotfile if os.path.isfile(homedotfile) \
userconf = (
homedotfile
if os.path.isfile(homedotfile)
else os.path.join(xdg_config_home, "linkchecker", "linkcheckerrc")
if os.path.isfile(initialconf) and not os.path.exists(userconf) and \
not Portable:
)
if os.path.isfile(initialconf) and not os.path.exists(userconf) and not Portable:
# copy the initial configuration to the user configuration
try:
make_userdir(userconf)
shutil.copy(initialconf, userconf)
except Exception as errmsg:
msg = _("could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r")
msg = _(
"could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r"
)
args = dict(src=initialconf, dst=userconf, errmsg=errmsg)
log.warn(LOG_CHECK, msg % args)
return userconf
@ -496,6 +507,7 @@ def get_kde_ftp_proxy():
log.debug(LOG_CHECK, "error getting FTP proxy from KDE: %s", msg)
pass
# The following KDE functions are largely ported and ajusted from
# Google Chromium:
# http://src.chromium.org/viewvc/chrome/trunk/src/net/proxy/proxy_config_service_linux.cc?revision=HEAD&view=markup
@ -527,6 +539,7 @@ def get_kde_ftp_proxy():
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
def get_kde_config_dir():
"""Return KDE configuration directory or None if not found."""
kde_home = get_kde_home_dir()
@ -571,6 +584,7 @@ def get_kde_home_dir():
loc_ro = re.compile(r"\[.*\]$")
@lru_cache(1)
def read_kioslaverc(kde_config_dir):
"""Read kioslaverc into data dictionary."""
@ -579,7 +593,7 @@ def read_kioslaverc(kde_config_dir):
with open(filename) as fd:
# First read all lines into dictionary since they can occur
# in any order.
for line in fd:
for line in fd:
line = line.rstrip()
if line.startswith('['):
in_proxy_settings = line.startswith("[Proxy Settings]")

View file

@ -18,7 +18,15 @@
from configparser import RawConfigParser
import os
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins, logconf
from .. import (
LinkCheckerError,
get_link_pat,
LOG_CHECK,
log,
fileutil,
plugins,
logconf,
)
def read_multiline(value):
@ -50,7 +58,9 @@ class LCConfigParser(RawConfigParser):
self.read_ok = super(LCConfigParser, self).read(files)
if len(self.read_ok) < len(files):
failed_files = set(files) - set(self.read_ok)
log.warn(LOG_CHECK, "Could not read configuration files %s.", failed_files)
log.warn(
LOG_CHECK, "Could not read configuration files %s.", failed_files
)
# Read all the configuration parameters from the given files.
self.read_checking_config()
self.read_authentication_config()
@ -58,15 +68,16 @@ class LCConfigParser(RawConfigParser):
self.read_output_config()
self.read_plugin_config()
except Exception as msg:
raise LinkCheckerError(
_("Error parsing configuration: %s") % str(msg))
raise LinkCheckerError(_("Error parsing configuration: %s") % str(msg))
def read_string_option(self, section, option, allowempty=False):
"""Read a string option."""
if self.has_option(section, option):
value = self.get(section, option)
if not allowempty and not value:
raise LinkCheckerError(_("invalid empty value for %s: %s\n") % (option, value))
raise LinkCheckerError(
_("invalid empty value for %s: %s\n") % (option, value)
)
self.config[option] = value
def read_boolean_option(self, section, option):
@ -80,10 +91,14 @@ class LCConfigParser(RawConfigParser):
num = self.getint(section, option)
if min is not None and num < min:
raise LinkCheckerError(
_("invalid value for %s: %d must not be less than %d") % (option, num, min))
_("invalid value for %s: %d must not be less than %d")
% (option, num, min)
)
if max is not None and num < max:
raise LinkCheckerError(
_("invalid value for %s: %d must not be greater than %d") % (option, num, max))
_("invalid value for %s: %d must not be greater than %d")
% (option, num, max)
)
if key is None:
key = option
self.config[key] = num
@ -92,6 +107,7 @@ class LCConfigParser(RawConfigParser):
"""Read configuration options in section "output"."""
section = "output"
from ..logger import LoggerClasses
for c in LoggerClasses:
key = c.LoggerName
if self.has_section(key):
@ -124,8 +140,12 @@ class LCConfigParser(RawConfigParser):
loggers = (x.strip().lower() for x in loggers)
# no file output for the blacklist and none Logger
from ..logger import LoggerNames
loggers = (x for x in loggers if x in LoggerNames and
x not in ("blacklist", "none"))
loggers = (
x
for x in loggers
if x in LoggerNames and x not in ("blacklist", "none")
)
for val in loggers:
output = self.config.logger_new(val, fileoutput=1)
self.config['fileoutput'].append(output)
@ -145,8 +165,10 @@ class LCConfigParser(RawConfigParser):
self.read_int_option(section, "maxfilesizeparse", min=1)
self.read_int_option(section, "maxfilesizedownload", min=1)
if self.has_option(section, "allowedschemes"):
self.config['allowedschemes'] = [x.strip().lower() for x in \
self.get(section, 'allowedschemes').split(',')]
self.config['allowedschemes'] = [
x.strip().lower()
for x in self.get(section, 'allowedschemes').split(',')
]
self.read_boolean_option(section, "debugmemory")
self.read_string_option(section, "cookiefile")
self.read_boolean_option(section, "robotstxt")
@ -165,21 +187,29 @@ class LCConfigParser(RawConfigParser):
for val in read_multiline(self.get(section, "entry")):
auth = val.split()
if len(auth) == 3:
self.config.add_auth(pattern=auth[0], user=auth[1],
password=auth[2])
self.config.add_auth(
pattern=auth[0], user=auth[1], password=auth[2]
)
password_fields.append("entry/%s/%s" % (auth[0], auth[1]))
elif len(auth) == 2:
self.config.add_auth(pattern=auth[0], user=auth[1])
else:
raise LinkCheckerError(
_("missing auth part in entry %(val)r") % {"val": val})
_("missing auth part in entry %(val)r") % {"val": val}
)
# read login URL and field names
if self.has_option(section, "loginurl"):
val = self.get(section, "loginurl").strip()
if not (val.lower().startswith("http:") or
val.lower().startswith("https:")):
raise LinkCheckerError(_("invalid login URL `%s'. Only " \
"HTTP and HTTPS URLs are supported.") % val)
if not (
val.lower().startswith("http:") or val.lower().startswith("https:")
):
raise LinkCheckerError(
_(
"invalid login URL `%s'. Only "
"HTTP and HTTPS URLs are supported."
)
% val
)
self.config["loginurl"] = val
self.read_string_option(section, "loginuserfield")
self.read_string_option(section, "loginpasswordfield")
@ -201,11 +231,22 @@ class LCConfigParser(RawConfigParser):
return
fn = self.read_ok[0]
if fileutil.is_accessable_by_others(fn):
log.warn(LOG_CHECK, "The configuration file %s contains password information (in section [%s] and options %s) and the file is readable by others. Please make the file only readable by you.", fn, section, fields)
log.warn(
LOG_CHECK,
"The configuration file %s contains password information (in section [%s] and options %s) and the file is readable by others. Please make the file only readable by you.",
fn,
section,
fields,
)
if os.name == 'posix':
log.warn(LOG_CHECK, _("For example execute 'chmod go-rw %s'.") % fn)
elif os.name == 'nt':
log.warn(LOG_CHECK, _("See http://support.microsoft.com/kb/308419 for more info on setting file permissions."))
log.warn(
LOG_CHECK,
_(
"See http://support.microsoft.com/kb/308419 for more info on setting file permissions."
),
)
def read_filtering_config(self):
"""
@ -213,8 +254,10 @@ class LCConfigParser(RawConfigParser):
"""
section = "filtering"
if self.has_option(section, "ignorewarnings"):
self.config['ignorewarnings'] = [f.strip().lower() for f in \
self.get(section, 'ignorewarnings').split(',')]
self.config['ignorewarnings'] = [
f.strip().lower()
for f in self.get(section, 'ignorewarnings').split(',')
]
if self.has_option(section, "ignore"):
for line in read_multiline(self.get(section, "ignore")):
pat = get_link_pat(line, strict=1)

View file

@ -17,6 +17,7 @@
Special container classes.
"""
class LFUCache(dict):
"""Limited cache which purges least frequently used items."""
@ -40,7 +41,7 @@ class LFUCache(dict):
def shrink(self):
"""Shrink ca. 5% of entries."""
trim = int(0.05*len(self))
trim = int(0.05 * len(self))
if trim:
items = super(LFUCache, self).items()
# sorting function for items

View file

@ -60,7 +60,8 @@ def from_headers(strheader):
for headervalue in headers.get_all("Set-Cookie"):
for pairs in split_header_words([headervalue]):
for name, value in pairs:
cookie = requests.cookies.create_cookie(name, value,
domain=host, path=path)
cookie = requests.cookies.create_cookie(
name, value, domain=host, path=path
)
res.append(cookie)
return res

View file

@ -56,11 +56,15 @@ def update_func_meta(fake_func, real_func):
def deprecated(func):
"""A decorator which can be used to mark functions as deprecated.
It emits a warning when the function is called."""
def newfunc(*args, **kwargs):
"""Print deprecated warning and execute original function."""
warnings.warn("Call to deprecated function %s." % func.__name__,
category=DeprecationWarning)
warnings.warn(
"Call to deprecated function %s." % func.__name__,
category=DeprecationWarning,
)
return func(*args, **kwargs)
return update_func_meta(newfunc, func)
@ -83,19 +87,27 @@ def signal_handler(signal_number):
if is_valid_signal and os.name == 'posix':
signal.signal(signal_number, function)
return function
return newfunc
def synchronize(lock, func, log_duration_secs=0):
"""Return synchronized function acquiring the given lock."""
def newfunc(*args, **kwargs):
"""Execute function synchronized."""
t = time.time()
with lock:
duration = time.time() - t
if duration > log_duration_secs > 0:
print("WARN:", func.__name__, "locking took %0.2f seconds" % duration, file=sys.stderr)
print(
"WARN:",
func.__name__,
"locking took %0.2f seconds" % duration,
file=sys.stderr,
)
return func(*args, **kwargs)
return update_func_meta(newfunc, func)
@ -106,11 +118,13 @@ def synchronized(lock):
def notimplemented(func):
"""Raises a NotImplementedError if the function is called."""
def newfunc(*args, **kwargs):
"""Raise NotImplementedError"""
co = func.func_code
attrs = (co.co_name, co.co_filename, co.co_firstlineno)
raise NotImplementedError("function %s at %s:%d is not implemented" % attrs)
return update_func_meta(newfunc, func)
@ -127,6 +141,7 @@ def timeit(func, log, limit):
print(args, file=log)
print(kwargs, file=log)
return res
return update_func_meta(newfunc, func)

View file

@ -32,14 +32,12 @@ def check_urls(aggregate):
try:
aggregate.visit_loginurl()
except Exception as msg:
log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % \
dict(msg=msg))
log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % dict(msg=msg))
raise
try:
aggregate.logger.start_log_output()
except Exception as msg:
log.error(LOG_CHECK, _("Error starting log output: %(msg)s.") % \
dict(msg=msg))
log.error(LOG_CHECK, _("Error starting log output: %(msg)s.") % dict(msg=msg))
raise
try:
if not aggregate.urlqueue.empty():
@ -52,9 +50,13 @@ def check_urls(aggregate):
except KeyboardInterrupt:
interrupt(aggregate)
except RuntimeError:
log.warn(LOG_CHECK,
_("Could not start a new thread. Check that the current user" \
" is allowed to start new threads."))
log.warn(
LOG_CHECK,
_(
"Could not start a new thread. Check that the current user"
" is allowed to start new threads."
),
)
abort(aggregate)
except Exception:
# Catching "Exception" is intentionally done. This saves the program
@ -84,10 +86,8 @@ def interrupt(aggregate):
interrupts."""
while True:
try:
log.warn(LOG_CHECK,
_("interrupt; waiting for active threads to finish"))
log.warn(LOG_CHECK,
_("another interrupt will exit immediately"))
log.warn(LOG_CHECK, _("interrupt; waiting for active threads to finish"))
log.warn(LOG_CHECK, _("another interrupt will exit immediately"))
abort(aggregate)
break
except KeyboardInterrupt:
@ -113,6 +113,7 @@ def abort_now():
if os.name == 'posix':
# Unix systems can use signals
import signal
os.kill(os.getpid(), signal.SIGTERM)
time.sleep(1)
os.kill(os.getpid(), signal.SIGKILL)
@ -130,5 +131,6 @@ def get_aggregate(config):
_robots_txt = robots_txt.RobotsTxt(config["useragent"])
plugin_manager = plugins.PluginManager(config)
result_cache = results.ResultCache()
return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager,
result_cache)
return aggregator.Aggregate(
config, _urlqueue, _robots_txt, plugin_manager, result_cache
)

View file

@ -34,15 +34,16 @@ _threads_lock = threading.RLock()
_hosts_lock = threading.RLock()
_downloadedbytes_lock = threading.RLock()
def new_request_session(config, cookies):
"""Create a new request session."""
session = requests.Session()
if cookies:
session.cookies = cookies
session.max_redirects = config["maxhttpredirects"]
session.headers.update({
"User-Agent": config["useragent"],
})
session.headers.update(
{"User-Agent": config["useragent"],}
)
if config["cookiefile"]:
for cookie in from_file(config["cookiefile"]):
session.cookies.set_cookie(cookie)
@ -52,8 +53,7 @@ def new_request_session(config, cookies):
class Aggregate:
"""Store thread-safe data collections for checker threads."""
def __init__(self, config, urlqueue, robots_txt, plugin_manager,
result_cache):
def __init__(self, config, urlqueue, robots_txt, plugin_manager, result_cache):
"""Store given link checking objects."""
self.config = config
self.urlqueue = urlqueue
@ -78,7 +78,8 @@ class Aggregate:
user, password = self.config.get_user_password(url)
if not user and not password:
raise LinkCheckerError(
"loginurl is configured but neither user nor password are set")
"loginurl is configured but neither user nor password are set"
)
session = new_request_session(self.config, self.cookies)
log.debug(LOG_CHECK, "Getting login form %s", url)
kwargs = dict(timeout=self.config["timeout"])
@ -119,11 +120,15 @@ class Aggregate:
num = self.config["threads"]
if num > 0:
for dummy in range(num):
t = checker.Checker(self.urlqueue, self.logger, self.add_request_session)
t = checker.Checker(
self.urlqueue, self.logger, self.add_request_session
)
self.threads.append(t)
t.start()
else:
self.request_sessions[threading.get_ident()] = new_request_session(self.config, self.cookies)
self.request_sessions[threading.get_ident()] = new_request_session(
self.config, self.cookies
)
checker.check_urls(self.urlqueue, self.logger)
@synchronized(_threads_lock)
@ -162,10 +167,18 @@ class Aggregate:
first = False
log.info(LOG_CHECK, name[12:])
args = dict(
num=len([x for x in self.threads if x.getName().startswith("CheckThread-")]),
num=len(
[x for x in self.threads if x.getName().startswith("CheckThread-")]
),
timeout=strformat.strduration_long(self.config["aborttimeout"]),
)
log.info(LOG_CHECK, _("%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop.") % args)
log.info(
LOG_CHECK,
_(
"%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop."
)
% args,
)
@synchronized(_threads_lock)
def get_check_threads(self):
@ -187,7 +200,10 @@ class Aggregate:
try:
self.urlqueue.join(timeout=timeout)
except urlqueue.Timeout:
log.warn(LOG_CHECK, "Abort timed out after %d seconds, stopping application." % timeout)
log.warn(
LOG_CHECK,
"Abort timed out after %d seconds, stopping application." % timeout,
)
raise KeyboardInterrupt()
@synchronized(_threads_lock)
@ -219,8 +235,9 @@ class Aggregate:
def end_log_output(self, **kwargs):
"""Print ending output to log."""
kwargs.update(dict(
downloaded_bytes=self.downloaded_bytes,
num_urls = len(self.result_cache),
))
kwargs.update(
dict(
downloaded_bytes=self.downloaded_bytes, num_urls=len(self.result_cache),
)
)
self.logger.end_log_output(**kwargs)

View file

@ -35,8 +35,7 @@ class StatusLogger:
def log_status(self, checked, in_progress, queue, duration, num_urls):
"""Write status message to file descriptor."""
msg = _n("%2d thread active", "%2d threads active", in_progress) % \
in_progress
msg = _n("%2d thread active", "%2d threads active", in_progress) % in_progress
self.write("%s, " % msg)
msg = _n("%5d link queued", "%5d links queued", queue) % queue
self.write("%s, " % msg)
@ -64,7 +63,9 @@ class StatusLogger:
def internal_error(out=stderr, etype=None, evalue=None, tb=None):
"""Print internal error message (output defaults to stderr)."""
print(os.linesep, file=out)
print(_("""********** Oops, I did it again. *************
print(
_(
"""********** Oops, I did it again. *************
You have found an internal error in LinkChecker. Please write a bug report
at %s
@ -79,7 +80,11 @@ When using the commandline client:
Not disclosing some of the information above due to privacy reasons is ok.
I will try to help you nonetheless, but you have to give me something
I can work with ;) .
""") % configuration.SupportUrl, file=out)
"""
)
% configuration.SupportUrl,
file=out,
)
if etype is None:
etype = sys.exc_info()[0]
if evalue is None:
@ -90,8 +95,11 @@ I can work with ;) .
print_app_info(out=out)
print_proxy_info(out=out)
print_locale_info(out=out)
print(os.linesep,
_("******** LinkChecker internal error, over and out ********"), file=out)
print(
os.linesep,
_("******** LinkChecker internal error, over and out ********"),
file=out,
)
def print_env_info(key, out=stderr):
@ -113,6 +121,7 @@ def print_locale_info(out=stderr):
print_env_info(key, out=out)
print(_("Default locale:"), i18n.get_locale(), file=out)
# Environment variables influencing the interpreter execution
# See python(1) man page.
PYTHON_ENV_VARS = (
@ -131,13 +140,18 @@ PYTHON_ENV_VARS = (
'PYTHONWARNINGS',
'PYTHONHASHSEED',
)
def print_app_info(out=stderr):
"""Print system and application info (output defaults to stderr)."""
print(_("System info:"), file=out)
print(configuration.App, file=out)
print(_("Released on:"), configuration.ReleaseDate, file=out)
print(_("Python %(version)s on %(platform)s") %
{"version": sys.version, "platform": sys.platform}, file=out)
print(
_("Python %(version)s on %(platform)s")
% {"version": sys.version, "platform": sys.platform},
file=out,
)
for key in PYTHON_ENV_VARS:
print_env_info(key, out=out)
print(configuration.get_modules_info(), file=out)
@ -148,6 +162,5 @@ def print_app_info(out=stderr):
def print_version(out=stdout):
"""Print the program version (output defaults to stdout)."""
print(configuration.App, _("released"),
configuration.ReleaseDate, file=out)
print(configuration.App, _("released"), configuration.ReleaseDate, file=out)
print(configuration.Copyright, file=out)

View file

@ -24,6 +24,7 @@ class Interrupt(task.CheckedTask):
This gives us a portable SIGALRM implementation.
The duration is checked every 5 seconds.
"""
WaitSeconds = 5
def __init__(self, duration):
@ -41,5 +42,8 @@ class Interrupt(task.CheckedTask):
while not self.stopped(self.WaitSeconds):
duration = time.time() - self.start_time
if duration > self.duration:
log.warn(LOG_CHECK, "Interrupt after %s" % strformat.strduration_long(duration))
log.warn(
LOG_CHECK,
"Interrupt after %s" % strformat.strduration_long(duration),
)
raise KeyboardInterrupt()

View file

@ -18,6 +18,7 @@ import threading
import _thread
from ..decorators import synchronized
_lock = threading.Lock()

View file

@ -17,6 +17,7 @@
Dummy objects.
"""
class Dummy:
"""A dummy object ignores all access to it. Useful for testing."""

View file

@ -67,6 +67,7 @@ elif "G_BROKEN_FILENAMES" in os.environ:
else:
FSCODING = "utf-8"
def path_safe(path):
"""Ensure path string is compatible with the platform file system encoding."""
if isinstance(path, str) and not os.path.supports_unicode_filenames:
@ -83,7 +84,7 @@ def get_temp_file(mode='r', **kwargs):
def is_tty(fp):
"""Check if is a file object pointing to a TTY."""
return (hasattr(fp, "isatty") and fp.isatty())
return hasattr(fp, "isatty") and fp.isatty()
@lru_cache(128)

View file

@ -19,8 +19,22 @@ Python implementation of a part of Dan Bernstein's ftpparse library.
See also http://cr.yp.to/ftpparse.html
"""
months = ("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep",
"oct", "nov", "dec")
months = (
"jan",
"feb",
"mar",
"apr",
"may",
"jun",
"jul",
"aug",
"sep",
"oct",
"nov",
"dec",
)
def ismonth(txt):
"""Check if given text is a month name."""
return txt.lower() in months
@ -78,20 +92,20 @@ def ftpparse(line):
parts = line.split()
if len(parts) < 7:
return None
del parts[0] # skip permissions
del parts[0] # skip permissions
if parts[0] != 'folder':
del parts[0] # skip nlink
del parts[0] # skip uid
del parts[0] # skip gid or size
del parts[0] # skip nlink
del parts[0] # skip uid
del parts[0] # skip gid or size
if not ismonth(parts[0]):
del parts[0] # skip size
del parts[0] # skip size
if not ismonth(parts[0]):
return None
del parts[0] # skip month
del parts[0] # skip day
del parts[0] # skip month
del parts[0] # skip day
if not parts:
return None
del parts[0] # skip year or time
del parts[0] # skip year or time
name = " ".join(parts)
# resolve links
if line[0] == 'l' and ' -> ' in name:

View file

@ -19,13 +19,17 @@ HTML parser implemented using Beautiful Soup and html.parser.
from warnings import filterwarnings
filterwarnings("ignore",
filterwarnings(
"ignore",
message="The soupsieve package is not installed. CSS selectors cannot be used.",
category=UserWarning, module="bs4")
category=UserWarning,
module="bs4",
)
from bs4 import BeautifulSoup
def make_soup(markup, from_encoding=None):
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
multi_valued_attributes=None)
return BeautifulSoup(
markup, "html.parser", from_encoding=from_encoding, multi_valued_attributes=None
)

View file

@ -25,41 +25,41 @@ unquote = strformat.unquote
# HTML4/5 link tags
# ripped mainly from HTML::Tagset.pm with HTML5 added
LinkTags = {
'a': ['href'],
'applet': ['archive', 'src'],
'area': ['href'],
'audio': ['src'], # HTML5
'bgsound': ['src'],
'a': ['href'],
'applet': ['archive', 'src'],
'area': ['href'],
'audio': ['src'], # HTML5
'bgsound': ['src'],
'blockquote': ['cite'],
'body': ['background'],
'button': ['formaction'], # HTML5
'del': ['cite'],
'embed': ['pluginspage', 'src'],
'form': ['action'],
'frame': ['src', 'longdesc'],
'head': ['profile'],
'html': ['manifest'], # HTML5
'iframe': ['src', 'longdesc'],
'ilayer': ['background'],
'img': ['src', 'lowsrc', 'longdesc', 'usemap', 'srcset'],
'input': ['src', 'usemap', 'formaction'],
'ins': ['cite'],
'isindex': ['action'],
'layer': ['background', 'src'],
'link': ['href'],
'meta': ['content', 'href'],
'object': ['classid', 'data', 'archive', 'usemap', 'codebase'],
'q': ['cite'],
'script': ['src'],
'source': ['src'], # HTML5
'table': ['background'],
'td': ['background'],
'th': ['background'],
'tr': ['background'],
'track': ['src'], # HTML5
'video': ['src'], # HTML5
'xmp': ['href'],
None: ['style', 'itemtype'],
'body': ['background'],
'button': ['formaction'], # HTML5
'del': ['cite'],
'embed': ['pluginspage', 'src'],
'form': ['action'],
'frame': ['src', 'longdesc'],
'head': ['profile'],
'html': ['manifest'], # HTML5
'iframe': ['src', 'longdesc'],
'ilayer': ['background'],
'img': ['src', 'lowsrc', 'longdesc', 'usemap', 'srcset'],
'input': ['src', 'usemap', 'formaction'],
'ins': ['cite'],
'isindex': ['action'],
'layer': ['background', 'src'],
'link': ['href'],
'meta': ['content', 'href'],
'object': ['classid', 'data', 'archive', 'usemap', 'codebase'],
'q': ['cite'],
'script': ['src'],
'source': ['src'], # HTML5
'table': ['background'],
'td': ['background'],
'th': ['background'],
'tr': ['background'],
'track': ['src'], # HTML5
'video': ['src'], # HTML5
'xmp': ['href'],
None: ['style', 'itemtype'],
}
# HTML anchor tags
@ -70,8 +70,8 @@ AnchorTags = {
# WML tags
WmlTags = {
'a': ['href'],
'go': ['href'],
'a': ['href'],
'go': ['href'],
'img': ['src'],
}
@ -219,6 +219,9 @@ def find_links(soup, callback, tags):
lf = LinkFinder(callback, tags)
for element in soup.find_all(True):
lf.html_element(
element.name, element.attrs, element.text.strip(),
element.name,
element.attrs,
element.text.strip(),
element.sourceline,
None if element.sourcepos is None else element.sourcepos + 1)
None if element.sourcepos is None else element.sourcepos + 1,
)

View file

@ -19,6 +19,7 @@ HTML form utils
from ..htmlutil import htmlsoup
from .. import log, LOG_CHECK
class Form:
"""Store HTML form URL and form data."""
@ -44,10 +45,8 @@ def search_form(content, cgiuser, cgipassword):
cginames = {cgiuser, cgipassword} - {None}
for form_element in soup.find_all("form", action=True):
form = Form(form_element["action"])
for input_element in form_element.find_all("input",
attrs={"name": True}):
form.add_value(
input_element["name"], input_element.attrs.get("value"))
for input_element in form_element.find_all("input", attrs={"name": True}):
form.add_value(input_element["name"], input_element.attrs.get("value"))
if cginames <= set(form.data):
log.debug(LOG_CHECK, "Found form %s", form)
return form

View file

@ -27,14 +27,10 @@ def x509_to_dict(x509):
subject, subjectAltName and optional notAfter.
"""
from requests.packages.urllib3.contrib.pyopenssl import get_subj_alt_name
res = {
'subject': (
(('commonName', x509.get_subject().CN),),
),
'subjectAltName': [
('DNS', value)
for value in get_subj_alt_name(x509)
]
'subject': ((('commonName', x509.get_subject().CN),),),
'subjectAltName': [('DNS', value) for value in get_subj_alt_name(x509)],
}
notAfter = x509.get_notAfter()
if notAfter is not None:

View file

@ -30,13 +30,16 @@ default_language = default_encoding = None
default_directory = None
default_domain = None
def install_builtin(translator, do_unicode):
"""Install _() and _n() gettext methods into default namespace."""
import builtins
builtins.__dict__['_'] = translator.gettext
# also install ngettext
builtins.__dict__['_n'] = translator.ngettext
class Translator(gettext.GNUTranslations):
"""A translation class always installing its gettext methods into the
default namespace."""
@ -84,18 +87,29 @@ def init(domain, directory, loc=None):
def install_language(language):
"""Install translation service routines into default namespace."""
translator = get_translator(default_domain, default_directory,
languages=[get_lang(language)], fallback=True)
translator = get_translator(
default_domain, default_directory, languages=[get_lang(language)], fallback=True
)
do_unicode = True
translator.install(do_unicode)
def get_translator(domain, directory, languages=None,
translatorklass=Translator, fallback=False,
fallbackklass=NullTranslator):
def get_translator(
domain,
directory,
languages=None,
translatorklass=Translator,
fallback=False,
fallbackklass=NullTranslator,
):
"""Search the appropriate GNUTranslations class."""
translator = gettext.translation(domain, localedir=directory,
languages=languages, class_=translatorklass, fallback=fallback)
translator = gettext.translation(
domain,
localedir=directory,
languages=languages,
class_=translatorklass,
fallback=fallback,
)
if not isinstance(translator, gettext.GNUTranslations) and fallbackklass:
translator = fallbackklass()
return translator
@ -175,6 +189,7 @@ lang_transis = {
'en': {'de': 'Englisch'},
}
def lang_name(lang):
"""Return full name of given language."""
return lang_names[lang]

View file

@ -25,8 +25,15 @@ import re
import time
import urllib.parse
from . import configuration, strformat, checker, director, get_link_pat, \
init_i18n, url as urlutil
from . import (
configuration,
strformat,
checker,
director,
get_link_pat,
init_i18n,
url as urlutil,
)
from .decorators import synchronized
# 5 minutes timeout for requests
@ -67,17 +74,20 @@ lang_locale = {
}
_is_level = re.compile(r'^(0|1|2|3|-1)$').match
class LCFormError(Exception):
"""Form related errors."""
pass
def get_response_headers():
"""Get list of response headers in key-value form."""
return [("Content-type", "text/html"),
("Cache-Control", "no-cache"),
("Pragma:", "no-cache")
]
return [
("Content-type", "text/html"),
("Cache-Control", "no-cache"),
("Pragma:", "no-cache"),
]
def formvalue(form, key):
@ -89,6 +99,8 @@ def formvalue(form, key):
_lock = threading.Lock()
class ThreadsafeIO:
"""Thread-safe unicode I/O class."""
@ -235,7 +247,7 @@ def log(env, msg):
def dump(env, form):
"""Log environment and form."""
for var, value in env.items():
log(env, var+"="+value)
log(env, var + "=" + value)
for key in form:
log(env, str(formvalue(form, key)))
@ -247,7 +259,9 @@ def format_error(why):
@return: HTML page content
@rtype: unicode
"""
return _("""<!DOCTYPE HTML>
return (
_(
"""<!DOCTYPE HTML>
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>LinkChecker Online Error</title></head>
@ -260,4 +274,7 @@ contains only these characters: <code>A-Za-z0-9./_~-</code><br/><br/>
Errors are logged.
</blockquote>
</body>
</html>""") % html.escape(why)
</html>"""
)
% html.escape(why)
)

View file

@ -42,15 +42,17 @@ def get_package_modules(packagename):
parentmodule = os.path.basename(os.path.dirname(__file__))
with zipfile.ZipFile(zipname, 'r') as f:
prefix = "%s/%s/" % (parentmodule, packagename)
modnames = [os.path.splitext(n[len(prefix):])[0]
for n in f.namelist()
if n.startswith(prefix) and "__init__" not in n]
modnames = [
os.path.splitext(n[len(prefix) :])[0]
for n in f.namelist()
if n.startswith(prefix) and "__init__" not in n
]
else:
dirname = os.path.join(os.path.dirname(__file__), packagename)
modnames = [x[:-3] for x in get_importable_files(dirname)]
for modname in modnames:
try:
name ="..%s.%s" % (packagename, modname)
name = "..%s.%s" % (packagename, modname)
yield importlib.import_module(name, __name__)
except ImportError as msg:
print("WARN: could not load module %s: %s" % (modname, msg))
@ -63,7 +65,7 @@ def get_folder_modules(folder, parentpackage):
return
for filename in get_importable_files(folder):
fullname = os.path.join(folder, filename)
modname = parentpackage+"."+filename[:-3]
modname = parentpackage + "." + filename[:-3]
try:
yield imp.load_source(modname, fullname)
except ImportError as msg:
@ -80,7 +82,10 @@ def get_importable_files(folder):
if fname.endswith('.py') and not fname.startswith('_'):
fullname = os.path.join(folder, fname)
if check_writable_by_others(fullname):
print("ERROR: refuse to load module from world writable file %r" % fullname)
print(
"ERROR: refuse to load module from world writable file %r"
% fullname
)
else:
yield fname

View file

@ -19,6 +19,7 @@ Locking utility class.
import threading
from . import log, LOG_THREAD
def get_lock(name, debug=False):
"""Get a new lock.
@param debug: if True, acquire() and release() will have debug messages

View file

@ -24,11 +24,13 @@ import inspect
import traceback
# memory leak debugging
#import gc
#gc.enable()
#gc.set_debug(gc.DEBUG_LEAK)
# import gc
# gc.enable()
# gc.set_debug(gc.DEBUG_LEAK)
PRINT_LOCALVARS = False
def _stack_format(stack):
"""Format a stack trace to a message.

View file

@ -41,20 +41,18 @@ lognamelist = ", ".join(repr(name) for name in lognames)
# logging configuration
configdict = {
'version': 1,
'loggers': {
},
'root': {
'level': 'WARN',
},
'loggers': {},
'root': {'level': 'WARN',},
'incremental': True,
}
def init_log_config(handler=None):
"""Set up the application logging (not to be confused with check loggers).
"""
for applog in lognames.values():
# propagate except for root app logger 'linkcheck'
propagate = (applog != LOG_ROOT)
propagate = applog != LOG_ROOT
configdict['loggers'][applog] = dict(level='INFO', propagate=propagate)
logging.config.dictConfig(configdict)
@ -86,8 +84,8 @@ def set_debug(loggers):
"""Set debugging log level."""
set_loglevel(loggers, logging.DEBUG)
# enable for httplib debugging (used by requests.packages.urllib3)
#import httplib
#httplib.HTTPConnection.debuglevel = 1
# import httplib
# httplib.HTTPConnection.debuglevel = 1
def set_loglevel(loggers, level):

View file

@ -46,15 +46,7 @@ Fields = dict(
)
del _
ContentTypes = dict(
image=0,
text=0,
video=0,
audio=0,
application=0,
mail=0,
other=0,
)
ContentTypes = dict(image=0, text=0, video=0, audio=0, application=0, mail=0, other=0,)
class LogStatistics:
@ -236,9 +228,13 @@ class _Logger(abc.ABC):
self.close_fd = True
except IOError:
msg = sys.exc_info()[1]
log.warn(LOG_CHECK,
"Could not open file %r for writing: %s\n"
"Disabling log output of %s", self.filename, msg, self)
log.warn(
LOG_CHECK,
"Could not open file %r for writing: %s\n" "Disabling log output of %s",
self.filename,
msg,
self,
)
self.fd = dummy.Dummy()
self.is_active = False
self.filename = None
@ -246,10 +242,10 @@ class _Logger(abc.ABC):
def create_fd(self):
"""Create open file descriptor."""
if self.filename is None:
return i18n.get_encoded_writer(encoding=self.output_encoding,
errors=self.codec_errors)
return codecs.open(self.filename, "wb", self.output_encoding,
self.codec_errors)
return i18n.get_encoded_writer(
encoding=self.output_encoding, errors=self.codec_errors
)
return codecs.open(self.filename, "wb", self.output_encoding, self.codec_errors)
def close_fileoutput(self):
"""
@ -289,12 +285,14 @@ class _Logger(abc.ABC):
"""
Return wrapped version of given lines.
"""
sep = os.linesep+os.linesep
sep = os.linesep + os.linesep
text = sep.join(lines)
kwargs = dict(subsequent_indent=" "*self.max_indent,
initial_indent=" "*self.max_indent,
break_long_words=False,
break_on_hyphens=False)
kwargs = dict(
subsequent_indent=" " * self.max_indent,
initial_indent=" " * self.max_indent,
break_long_words=False,
break_on_hyphens=False,
)
return strformat.wrap(text, width, **kwargs).lstrip()
def write(self, s, **args):
@ -311,9 +309,12 @@ class _Logger(abc.ABC):
self.fd.write(s, **args)
except IOError:
msg = sys.exc_info()[1]
log.warn(LOG_CHECK,
"Could not write to output file: %s\n"
"Disabling log output of %s", msg, self)
log.warn(
LOG_CHECK,
"Could not write to output file: %s\n" "Disabling log output of %s",
msg,
self,
)
self.close_fileoutput()
self.fd = dummy.Dummy()
self.is_active = False
@ -356,9 +357,9 @@ class _Logger(abc.ABC):
parts = self.logparts
values = (self.part(x) for x in parts)
# maximum indent for localized log part names
self.max_indent = max(len(x) for x in values)+1
self.max_indent = max(len(x) for x in values) + 1
for key in parts:
numspaces = (self.max_indent - len(self.part(key)))
numspaces = self.max_indent - len(self.part(key))
self.logspaces[key] = " " * numspaces
self.stats.reset()
self.starttime = time.time()
@ -374,22 +375,29 @@ class _Logger(abc.ABC):
def write_intro(self):
"""Write intro comments."""
self.comment(_("created by %(app)s at %(time)s") %
{"app": configuration.AppName,
"time": strformat.strtime(self.starttime)})
self.comment(_("Get the newest version at %(url)s") %
{'url': configuration.Url})
self.comment(_("Write comments and bugs to %(url)s") %
{'url': configuration.SupportUrl})
self.comment(
_("created by %(app)s at %(time)s")
% {"app": configuration.AppName, "time": strformat.strtime(self.starttime)}
)
self.comment(
_("Get the newest version at %(url)s") % {'url': configuration.Url}
)
self.comment(
_("Write comments and bugs to %(url)s") % {'url': configuration.SupportUrl}
)
self.check_date()
def write_outro(self):
"""Write outro comments."""
self.stoptime = time.time()
duration = self.stoptime - self.starttime
self.comment(_("Stopped checking at %(time)s (%(duration)s)") %
{"time": strformat.strtime(self.stoptime),
"duration": strformat.strduration_long(duration)})
self.comment(
_("Stopped checking at %(time)s (%(duration)s)")
% {
"time": strformat.strtime(self.stoptime),
"duration": strformat.strduration_long(duration),
}
)
@abc.abstractmethod
def log_url(self, url_data):
@ -445,9 +453,11 @@ class _Logger(abc.ABC):
return modified.strftime("%Y-%m-%d{0}%H:%M:%S.%fZ".format(sep))
return ""
def _get_loggers():
"""Return list of Logger classes."""
from .. import loader
modules = loader.get_package_modules('logger')
return list(loader.get_plugins(modules, [_Logger]))

View file

@ -76,8 +76,9 @@ class BlacklistLogger(_Logger):
"""
Read a previously stored blacklist from file fd.
"""
with codecs.open(self.filename, 'r', self.output_encoding,
self.codec_errors) as fd:
with codecs.open(
self.filename, 'r', self.output_encoding, self.codec_errors
) as fd:
for line in fd:
line = line.rstrip()
if line.startswith('#') or not line:

View file

@ -24,9 +24,23 @@ from . import _Logger
from .. import strformat
Columns = (
"urlname", "parentname", "baseref", "result", "warningstring",
"infostring", "valid", "url", "line", "column", "name",
"dltime", "size", "checktime", "cached", "level", "modified",
"urlname",
"parentname",
"baseref",
"result",
"warningstring",
"infostring",
"valid",
"url",
"line",
"column",
"name",
"dltime",
"size",
"checktime",
"cached",
"level",
"modified",
)
@ -70,9 +84,13 @@ class CSVLogger(_Logger):
# write empty string to initialize file output
self.write("")
self.queue = StringIO()
self.writer = csv.writer(self.queue, dialect=self.dialect,
delimiter=self.separator, lineterminator=self.linesep,
quotechar=self.quotechar)
self.writer = csv.writer(
self.queue,
dialect=self.dialect,
delimiter=self.separator,
lineterminator=self.linesep,
quotechar=self.quotechar,
)
for s in Columns:
if self.has_part(s):
row.append(s)

View file

@ -55,8 +55,7 @@ class CustomXMLLogger(xmllog._XMLLogger):
'line': "%s" % url_data.line,
'column': "%s" % url_data.column,
}
self.xml_tag("parent", url_data.parent_url,
attrs=attrs)
self.xml_tag("parent", url_data.parent_url, attrs=attrs)
if url_data.base_ref and self.has_part('base'):
self.xml_tag("baseref", url_data.base_ref)
if self.has_part("realurl"):

View file

@ -93,6 +93,7 @@ class _GraphLogger(_Logger):
_disallowed = re.compile(r"[^a-zA-Z0-9 '#(){}\-\[\]\.,;:\!\?]+")
def quote(s):
"""Replace disallowed characters in node or edge labels.
Also remove whitespace from beginning or end of label."""

View file

@ -27,7 +27,7 @@ class GraphXMLLogger(_XMLLogger, _GraphLogger):
LoggerName = 'gxml'
LoggerArgs = {
LoggerArgs = {
"filename": "linkchecker-out.gxml",
}

View file

@ -28,8 +28,10 @@ from .. import strformat, configuration
# ss=1 enables show source
validate_html = "http://validator.w3.org/check?ss=1&amp;uri=%(uri)s"
# options are the default
validate_css = "http://jigsaw.w3.org/css-validator/validator?" \
"uri=%(uri)s&amp;warning=1&amp;profile=css2&amp;usermedium=all"
validate_css = (
"http://jigsaw.w3.org/css-validator/validator?"
"uri=%(uri)s&amp;warning=1&amp;profile=css2&amp;usermedium=all"
)
HTML_HEADER = """<!DOCTYPE HTML>
<html>
@ -64,15 +66,15 @@ class HtmlLogger(_Logger):
LoggerName = 'html'
LoggerArgs = {
"filename": "linkchecker-out.html",
LoggerArgs = {
"filename": "linkchecker-out.html",
'colorbackground': '#fff7e5',
'colorurl': '#dcd5cf',
'colorborder': '#000000',
'colorlink': '#191c83',
'colorwarning': '#e0954e',
'colorerror': '#db4930',
'colorok': '#3ba557',
'colorurl': '#dcd5cf',
'colorborder': '#000000',
'colorlink': '#191c83',
'colorwarning': '#e0954e',
'colorerror': '#db4930',
'colorok': '#3ba557',
}
def __init__(self, **kwargs):
@ -116,12 +118,16 @@ class HtmlLogger(_Logger):
self.write(HTML_HEADER % header)
self.comment("Generated by %s" % configuration.App)
if self.has_part('intro'):
self.write("<h2>"+configuration.App+
"</h2><br/><blockquote>"+
configuration.Freeware+"<br/><br/>"+
(_("Start checking at %s") %
strformat.strtime(self.starttime))+
os.linesep+"<br/>")
self.write(
"<h2>"
+ configuration.App
+ "</h2><br/><blockquote>"
+ configuration.Freeware
+ "<br/><br/>"
+ (_("Start checking at %s") % strformat.strtime(self.starttime))
+ os.linesep
+ "<br/>"
)
self.check_date()
self.flush()
@ -184,10 +190,15 @@ class HtmlLogger(_Logger):
def write_parent(self, url_data):
"""Write url_data.parent_url."""
self.write("<tr><td>"+self.part("parenturl")+
'</td><td><a target="top" href="'+
url_data.parent_url+'">'+
html.escape(url_data.parent_url)+"</a>")
self.write(
"<tr><td>"
+ self.part("parenturl")
+ '</td><td><a target="top" href="'
+ url_data.parent_url
+ '">'
+ html.escape(url_data.parent_url)
+ "</a>"
)
if url_data.line is not None:
self.write(_(", line %d") % url_data.line)
if url_data.column is not None:
@ -199,58 +210,98 @@ class HtmlLogger(_Logger):
vhtml = validate_html % {'uri': url_data.parent_url}
vcss = validate_css % {'uri': url_data.parent_url}
self.writeln()
self.writeln('(<a href="'+vhtml+'">HTML</a>)')
self.write('(<a href="'+vcss+'">CSS</a>)')
self.writeln('(<a href="' + vhtml + '">HTML</a>)')
self.write('(<a href="' + vcss + '">CSS</a>)')
self.writeln("</td></tr>")
def write_base(self, url_data):
"""Write url_data.base_ref."""
self.writeln("<tr><td>"+self.part("base")+"</td><td>"+
html.escape(url_data.base_ref)+"</td></tr>")
self.writeln(
"<tr><td>"
+ self.part("base")
+ "</td><td>"
+ html.escape(url_data.base_ref)
+ "</td></tr>"
)
def write_real(self, url_data):
"""Write url_data.url."""
self.writeln("<tr><td>"+self.part("realurl")+"</td><td>"+
'<a target="top" href="'+url_data.url+
'">'+html.escape(url_data.url)+"</a></td></tr>")
self.writeln(
"<tr><td>"
+ self.part("realurl")
+ "</td><td>"
+ '<a target="top" href="'
+ url_data.url
+ '">'
+ html.escape(url_data.url)
+ "</a></td></tr>"
)
def write_dltime(self, url_data):
"""Write url_data.dltime."""
self.writeln("<tr><td>"+self.part("dltime")+"</td><td>"+
(_("%.3f seconds") % url_data.dltime)+
"</td></tr>")
self.writeln(
"<tr><td>"
+ self.part("dltime")
+ "</td><td>"
+ (_("%.3f seconds") % url_data.dltime)
+ "</td></tr>"
)
def write_size(self, url_data):
"""Write url_data.size."""
self.writeln("<tr><td>"+self.part("dlsize")+"</td><td>"+
strformat.strsize(url_data.size)+
"</td></tr>")
self.writeln(
"<tr><td>"
+ self.part("dlsize")
+ "</td><td>"
+ strformat.strsize(url_data.size)
+ "</td></tr>"
)
def write_checktime(self, url_data):
"""Write url_data.checktime."""
self.writeln("<tr><td>"+self.part("checktime")+"</td><td>"+
(_("%.3f seconds") % url_data.checktime)+"</td></tr>")
self.writeln(
"<tr><td>"
+ self.part("checktime")
+ "</td><td>"
+ (_("%.3f seconds") % url_data.checktime)
+ "</td></tr>"
)
def write_info(self, url_data):
"""Write url_data.info."""
sep = "<br/>"+os.linesep
sep = "<br/>" + os.linesep
text = sep.join(html.escape(x) for x in url_data.info)
self.writeln('<tr><td valign="top">' + self.part("info")+
"</td><td>"+text+"</td></tr>")
self.writeln(
'<tr><td valign="top">'
+ self.part("info")
+ "</td><td>"
+ text
+ "</td></tr>"
)
def write_modified(self, url_data):
"""Write url_data.modified."""
text = html.escape(self.format_modified(url_data.modified))
self.writeln('<tr><td valign="top">' + self.part("modified") +
"</td><td>"+text+"</td></tr>")
self.writeln(
'<tr><td valign="top">'
+ self.part("modified")
+ "</td><td>"
+ text
+ "</td></tr>"
)
def write_warning(self, url_data):
"""Write url_data.warnings."""
sep = "<br/>"+os.linesep
sep = "<br/>" + os.linesep
text = sep.join(html.escape(x[1]) for x in url_data.warnings)
self.writeln('<tr><td class="warning" '+
'valign="top">' + self.part("warning") +
'</td><td class="warning">' + text + "</td></tr>")
self.writeln(
'<tr><td class="warning" '
+ 'valign="top">'
+ self.part("warning")
+ '</td><td class="warning">'
+ text
+ "</td></tr>"
)
def write_result(self, url_data):
"""Write url_data.result."""
@ -265,22 +316,30 @@ class HtmlLogger(_Logger):
self.write('</td><td class="error">')
self.write(html.escape(_("Error")))
if url_data.result:
self.write(": "+html.escape(url_data.result))
self.write(": " + html.escape(url_data.result))
self.writeln("</td></tr>")
def write_stats(self):
"""Write check statistic infos."""
self.writeln('<br/><i>%s</i><br/>' % _("Statistics"))
if self.stats.number > 0:
self.writeln(_(
"Content types: %(image)d image, %(text)d text, %(video)d video, "
"%(audio)d audio, %(application)d application, %(mail)d mail"
" and %(other)d other.") % self.stats.link_types)
self.writeln(
_(
"Content types: %(image)d image, %(text)d text, %(video)d video, "
"%(audio)d audio, %(application)d application, %(mail)d mail"
" and %(other)d other."
)
% self.stats.link_types
)
self.writeln("<br/>")
self.writeln(_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.") %
dict(min=self.stats.min_url_length,
max=self.stats.max_url_length,
avg=self.stats.avg_url_length))
self.writeln(
_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.")
% dict(
min=self.stats.min_url_length,
max=self.stats.max_url_length,
avg=self.stats.avg_url_length,
)
)
else:
self.writeln(_("No statistics available since no URLs were checked."))
self.writeln("<br/>")
@ -288,42 +347,77 @@ class HtmlLogger(_Logger):
def write_outro(self):
"""Write end of check message."""
self.writeln("<br/>")
self.write(_("That's it.")+" ")
self.write(_("That's it.") + " ")
if self.stats.number >= 0:
self.write(_n("%d link checked.", "%d links checked.",
self.stats.number) % self.stats.number)
self.write(
_n("%d link checked.", "%d links checked.", self.stats.number)
% self.stats.number
)
self.write(" ")
self.write(_n("%d warning found", "%d warnings found",
self.stats.warnings_printed) % self.stats.warnings_printed)
self.write(
_n("%d warning found", "%d warnings found", self.stats.warnings_printed)
% self.stats.warnings_printed
)
if self.stats.warnings != self.stats.warnings_printed:
self.write(_(" (%d ignored or duplicates not printed)") %
(self.stats.warnings - self.stats.warnings_printed))
self.write(
_(" (%d ignored or duplicates not printed)")
% (self.stats.warnings - self.stats.warnings_printed)
)
self.write(". ")
self.write(_n("%d error found", "%d errors found",
self.stats.errors_printed) % self.stats.errors_printed)
self.write(
_n("%d error found", "%d errors found", self.stats.errors_printed)
% self.stats.errors_printed
)
if self.stats.errors != self.stats.errors_printed:
self.write(_(" (%d duplicates not printed)") %
(self.stats.errors - self.stats.errors_printed))
self.write(
_(" (%d duplicates not printed)")
% (self.stats.errors - self.stats.errors_printed)
)
self.writeln(".")
self.writeln("<br/>")
num = self.stats.internal_errors
if num:
self.write(_n("There was %(num)d internal error.",
"There were %(num)d internal errors.", num) % {"num": num})
self.write(
_n(
"There was %(num)d internal error.",
"There were %(num)d internal errors.",
num,
)
% {"num": num}
)
self.writeln("<br/>")
self.stoptime = time.time()
duration = self.stoptime - self.starttime
self.writeln(_("Stopped checking at %(time)s (%(duration)s)") %
{"time": strformat.strtime(self.stoptime),
"duration": strformat.strduration_long(duration)})
self.writeln('</blockquote><br/><hr><small>'+
configuration.HtmlAppInfo+"<br/>")
self.writeln(_("Get the newest version at %s") %
('<a href="'+configuration.Url+'" target="_top">'+
configuration.Url+"</a>.<br/>"))
self.writeln(_("Write comments and bugs to %s") %
('<a href="'+configuration.SupportUrl+'">'+
configuration.SupportUrl+"</a>.<br/>"))
self.writeln(
_("Stopped checking at %(time)s (%(duration)s)")
% {
"time": strformat.strtime(self.stoptime),
"duration": strformat.strduration_long(duration),
}
)
self.writeln(
'</blockquote><br/><hr><small>' + configuration.HtmlAppInfo + "<br/>"
)
self.writeln(
_("Get the newest version at %s")
% (
'<a href="'
+ configuration.Url
+ '" target="_top">'
+ configuration.Url
+ "</a>.<br/>"
)
)
self.writeln(
_("Write comments and bugs to %s")
% (
'<a href="'
+ configuration.SupportUrl
+ '">'
+ configuration.SupportUrl
+ "</a>.<br/>"
)
)
self.writeln("</small></body></html>")
def end_output(self, **kwargs):

View file

@ -32,6 +32,7 @@ ChangeFreqs = (
HTTP_SCHEMES = ('http:', 'https:')
HTML_TYPES = ('text/html', "application/xhtml+xml")
class SitemapXmlLogger(xmllog._XMLLogger):
"""Sitemap XML output according to http://www.sitemaps.org/protocol.html
"""
@ -81,7 +82,11 @@ class SitemapXmlLogger(xmllog._XMLLogger):
# initialize prefix and priority
if self.prefix is None:
if not url_data.url.startswith(HTTP_SCHEMES):
log.warn(LOG_CHECK, "Sitemap URL %r does not start with http: or https:.", url_data.url)
log.warn(
LOG_CHECK,
"Sitemap URL %r does not start with http: or https:.",
url_data.url,
)
self.disabled = True
return
self.prefix = url_data.url
@ -94,11 +99,13 @@ class SitemapXmlLogger(xmllog._XMLLogger):
priority = 0.5
if self.priority is not None:
priority = self.priority
# ignore the do_print flag and determine ourselves if we filter the url
if (url_data.valid
# ignore the do_print flag and determine ourselves if we filter the url
if (
url_data.valid
and url_data.url.startswith(HTTP_SCHEMES)
and url_data.url.startswith(self.prefix)
and url_data.content_type in HTML_TYPES):
and url_data.content_type in HTML_TYPES
):
self.log_url(url_data, priority=priority)
def log_url(self, url_data, priority=None):

View file

@ -87,47 +87,50 @@ class SQLLogger(_Logger):
"""
Store url check info into the database.
"""
self.writeln("insert into %(table)s(urlname,"
"parentname,baseref,valid,result,warning,info,url,line,col,"
"name,checktime,dltime,size,cached,level,modified) values ("
"%(base_url)s,"
"%(url_parent)s,"
"%(base_ref)s,"
"%(valid)d,"
"%(result)s,"
"%(warning)s,"
"%(info)s,"
"%(url)s,"
"%(line)s,"
"%(column)s,"
"%(name)s,"
"%(checktime)d,"
"%(dltime)d,"
"%(size)d,"
"%(cached)d,"
"%(level)d,"
"%(modified)s"
")%(separator)s" %
{'table': self.dbname,
'base_url': sqlify(url_data.base_url),
'url_parent': sqlify((url_data.parent_url)),
'base_ref': sqlify((url_data.base_ref)),
'valid': intify(url_data.valid),
'result': sqlify(url_data.result),
'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)),
'info': sqlify(os.linesep.join(url_data.info)),
'url': sqlify(urlutil.url_quote(url_data.url, encoding="utf-8")),
'line': 'NULL' if url_data.line is None else url_data.line,
'column': 'NULL' if url_data.column is None else url_data.column,
'name': sqlify(url_data.name),
'checktime': url_data.checktime,
'dltime': url_data.dltime,
'size': url_data.size,
'cached': 0,
'separator': self.separator,
"level": url_data.level,
"modified": sqlify(self.format_modified(url_data.modified)),
})
self.writeln(
"insert into %(table)s(urlname,"
"parentname,baseref,valid,result,warning,info,url,line,col,"
"name,checktime,dltime,size,cached,level,modified) values ("
"%(base_url)s,"
"%(url_parent)s,"
"%(base_ref)s,"
"%(valid)d,"
"%(result)s,"
"%(warning)s,"
"%(info)s,"
"%(url)s,"
"%(line)s,"
"%(column)s,"
"%(name)s,"
"%(checktime)d,"
"%(dltime)d,"
"%(size)d,"
"%(cached)d,"
"%(level)d,"
"%(modified)s"
")%(separator)s"
% {
'table': self.dbname,
'base_url': sqlify(url_data.base_url),
'url_parent': sqlify((url_data.parent_url)),
'base_ref': sqlify((url_data.base_ref)),
'valid': intify(url_data.valid),
'result': sqlify(url_data.result),
'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)),
'info': sqlify(os.linesep.join(url_data.info)),
'url': sqlify(urlutil.url_quote(url_data.url, encoding="utf-8")),
'line': 'NULL' if url_data.line is None else url_data.line,
'column': 'NULL' if url_data.column is None else url_data.column,
'name': sqlify(url_data.name),
'checktime': url_data.checktime,
'dltime': url_data.dltime,
'size': url_data.size,
'cached': 0,
'separator': self.separator,
"level": url_data.level,
"modified": sqlify(self.format_modified(url_data.modified)),
}
)
self.flush()
def end_output(self, **kwargs):

View file

@ -38,18 +38,18 @@ class TextLogger(_Logger):
LoggerArgs = {
"filename": "linkchecker-out.txt",
'colorparent': "default",
'colorurl': "default",
'colorname': "default",
'colorreal': "cyan",
'colorbase': "purple",
'colorvalid': "bold;green",
'colorparent': "default",
'colorurl': "default",
'colorname': "default",
'colorreal': "cyan",
'colorbase': "purple",
'colorvalid': "bold;green",
'colorinvalid': "bold;red",
'colorinfo': "default",
'colorinfo': "default",
'colorwarning': "bold;yellow",
'colordltime': "default",
'colordlsize': "default",
'colorreset': "default",
'colordltime': "default",
'colordlsize': "default",
'colorreset': "default",
}
def __init__(self, **kwargs):
@ -95,14 +95,15 @@ class TextLogger(_Logger):
"""Log introduction text."""
self.writeln(configuration.AppInfo)
self.writeln(configuration.Freeware)
self.writeln(_("Get the newest version at %(url)s") %
{'url': configuration.Url})
self.writeln(_("Write comments and bugs to %(url)s") %
{'url': configuration.SupportUrl})
self.writeln(
_("Get the newest version at %(url)s") % {'url': configuration.Url}
)
self.writeln(
_("Write comments and bugs to %(url)s") % {'url': configuration.SupportUrl}
)
self.check_date()
self.writeln()
self.writeln(_("Start checking at %s") %
strformat.strtime(self.starttime))
self.writeln(_("Start checking at %s") % strformat.strtime(self.starttime))
def log_url(self, url_data):
"""Write url checking info."""
@ -175,20 +176,17 @@ class TextLogger(_Logger):
def write_dltime(self, url_data):
"""Write url_data.dltime."""
self.write(self.part("dltime") + self.spaces("dltime"))
self.writeln(_("%.3f seconds") % url_data.dltime,
color=self.colordltime)
self.writeln(_("%.3f seconds") % url_data.dltime, color=self.colordltime)
def write_size(self, url_data):
"""Write url_data.size."""
self.write(self.part("dlsize") + self.spaces("dlsize"))
self.writeln(strformat.strsize(url_data.size),
color=self.colordlsize)
self.writeln(strformat.strsize(url_data.size), color=self.colordlsize)
def write_checktime(self, url_data):
"""Write url_data.checktime."""
self.write(self.part("checktime") + self.spaces("checktime"))
self.writeln(_("%.3f seconds") % url_data.checktime,
color=self.colordltime)
self.writeln(_("%.3f seconds") % url_data.checktime, color=self.colordltime)
def write_info(self, url_data):
"""Write url_data.info."""
@ -225,60 +223,88 @@ class TextLogger(_Logger):
if interrupt:
self.writeln(_("The check has been interrupted; results are not complete."))
self.write(_("That's it.") + " ")
self.write(_n("%d link", "%d links",
self.stats.number) % self.stats.number)
self.write(_n("%d link", "%d links", self.stats.number) % self.stats.number)
self.write(" ")
if self.stats.num_urls is not None:
self.write(_n("in %d URL", "in %d URLs",
self.stats.num_urls) % self.stats.num_urls)
self.write(
_n("in %d URL", "in %d URLs", self.stats.num_urls) % self.stats.num_urls
)
self.write(" checked. ")
warning_text = _n("%d warning found", "%d warnings found",
self.stats.warnings_printed) % self.stats.warnings_printed
warning_text = (
_n("%d warning found", "%d warnings found", self.stats.warnings_printed)
% self.stats.warnings_printed
)
if self.stats.warnings_printed:
warning_color = self.colorwarning
else:
warning_color = self.colorinfo
self.write(warning_text, color=warning_color)
if self.stats.warnings != self.stats.warnings_printed:
self.write(_(" (%d ignored or duplicates not printed)") %
(self.stats.warnings - self.stats.warnings_printed))
self.write(
_(" (%d ignored or duplicates not printed)")
% (self.stats.warnings - self.stats.warnings_printed)
)
self.write(". ")
error_text = _n("%d error found", "%d errors found",
self.stats.errors_printed) % self.stats.errors_printed
error_text = (
_n("%d error found", "%d errors found", self.stats.errors_printed)
% self.stats.errors_printed
)
if self.stats.errors_printed:
error_color = self.colorinvalid
else:
error_color = self.colorvalid
self.write(error_text, color=error_color)
if self.stats.errors != self.stats.errors_printed:
self.write(_(" (%d duplicates not printed)") %
(self.stats.errors - self.stats.errors_printed))
self.write(
_(" (%d duplicates not printed)")
% (self.stats.errors - self.stats.errors_printed)
)
self.writeln(".")
num = self.stats.internal_errors
if num:
self.writeln(_n("There was %(num)d internal error.",
"There were %(num)d internal errors.", num) % {"num": num})
self.writeln(
_n(
"There was %(num)d internal error.",
"There were %(num)d internal errors.",
num,
)
% {"num": num}
)
self.stoptime = time.time()
duration = self.stoptime - self.starttime
self.writeln(_("Stopped checking at %(time)s (%(duration)s)") %
{"time": strformat.strtime(self.stoptime),
"duration": strformat.strduration_long(duration)})
self.writeln(
_("Stopped checking at %(time)s (%(duration)s)")
% {
"time": strformat.strtime(self.stoptime),
"duration": strformat.strduration_long(duration),
}
)
def write_stats(self):
"""Write check statistic info."""
self.writeln()
self.writeln(_("Statistics:"))
if self.stats.downloaded_bytes is not None:
self.writeln(_("Downloaded: %s.") % strformat.strsize(self.stats.downloaded_bytes))
self.writeln(
_("Downloaded: %s.") % strformat.strsize(self.stats.downloaded_bytes)
)
if self.stats.number > 0:
self.writeln(_(
"Content types: %(image)d image, %(text)d text, %(video)d video, "
"%(audio)d audio, %(application)d application, %(mail)d mail"
" and %(other)d other.") % self.stats.link_types)
self.writeln(_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.") %
dict(min=self.stats.min_url_length,
max=self.stats.max_url_length,
avg=self.stats.avg_url_length))
self.writeln(
_(
"Content types: %(image)d image, %(text)d text, %(video)d video, "
"%(audio)d audio, %(application)d application, %(mail)d mail"
" and %(other)d other."
)
% self.stats.link_types
)
self.writeln(
_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.")
% dict(
min=self.stats.min_url_length,
max=self.stats.max_url_length,
avg=self.stats.avg_url_length,
)
)
else:
self.writeln(_("No statistics available since no URLs were checked."))

View file

@ -66,8 +66,10 @@ class _XMLLogger(_Logger):
"""
Write start of checking info as xml comment.
"""
self.writeln('<?xml version="1.0" encoding="%s"?>' %
xmlquoteattr(self.get_charset_encoding()))
self.writeln(
'<?xml version="1.0" encoding="%s"?>'
% xmlquoteattr(self.get_charset_encoding())
)
if self.has_part("intro"):
self.write_intro()
self.writeln()
@ -83,7 +85,7 @@ class _XMLLogger(_Logger):
"""
Write XML start tag.
"""
self.write(self.indent*self.level)
self.write(self.indent * self.level)
self.write("<%s" % xmlquote(name))
if attrs:
for name, value in attrs.items():
@ -98,14 +100,14 @@ class _XMLLogger(_Logger):
"""
self.level -= 1
assert self.level >= 0
self.write(self.indent*self.level)
self.write(self.indent * self.level)
self.writeln("</%s>" % xmlquote(name))
def xml_tag(self, name, content, attrs=None):
"""
Write XML tag with content.
"""
self.write(self.indent*self.level)
self.write(self.indent * self.level)
self.write("<%s" % xmlquote(name))
if attrs:
for aname, avalue in attrs.items():

View file

@ -22,9 +22,9 @@ from . import strformat, log, LOG_CHECK
from .fileutil import get_temp_file
# Message to display when meliae package is not installed
MemoryDebugMsg = strformat.format_feature_warning(module='meliae',
feature='memory debugging',
url='https://launchpad.net/meliae')
MemoryDebugMsg = strformat.format_feature_warning(
module='meliae', feature='memory debugging', url='https://launchpad.net/meliae'
)
def write_memory_dump():
@ -37,10 +37,10 @@ def write_memory_dump():
if gc.garbage:
log.warn(LOG_CHECK, "Unreachabe objects: %s", pprint.pformat(gc.garbage))
from meliae import scanner
fo, filename = get_temp_file(mode='wb', suffix='.json', prefix='lcdump_')
try:
scanner.dump_all_objects(fo)
finally:
fo.close()
return filename

View file

@ -26,6 +26,7 @@ from .logconf import LOG_CHECK
mimedb = None
def init_mimedb():
"""Initialize the local MIME database."""
global mimedb
@ -59,6 +60,7 @@ PARSE_CONTENTS = {
"application/xml+sitemap": re.compile(r'<\?xml[^<]+<urlset\s+', re.IGNORECASE),
}
def guess_mimetype(filename, read=None):
"""Return MIME type of file, or 'application/octet-stream' if it could
not be determined."""

View file

@ -22,6 +22,7 @@ import re
import socket
from .. import log, LOG_CHECK
def is_valid_ip(ip):
"""
Return True if given ip is a valid IPv4 or IPv6 address.

View file

@ -26,7 +26,11 @@ def parse_url(url_data):
if url_data.is_directory():
# both ftp and file links represent directories as HTML data
key = "html"
elif url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url):
elif (
url_data.is_file()
and firefox.has_sqlite
and firefox.extension.search(url_data.url)
):
key = "firefox"
elif url_data.scheme == "itms-services":
key = "itms_services"
@ -34,7 +38,7 @@ def parse_url(url_data):
# determine parse routine according to content types
mime = url_data.content_type
key = url_data.ContentMimetypes[mime]
funcname = "parse_"+key
funcname = "parse_" + key
if funcname in globals():
globals()[funcname](url_data)
else:
@ -51,6 +55,7 @@ def parse_html(url_data):
def parse_opera(url_data):
"""Parse an opera bookmark file."""
from ..bookmarks.opera import parse_bookmark_data
for url, name, lineno in parse_bookmark_data(url_data.get_content()):
url_data.add_url(url, line=lineno, name=name)
@ -58,6 +63,7 @@ def parse_opera(url_data):
def parse_chromium(url_data):
"""Parse a Chromium or Google Chrome bookmark file."""
from ..bookmarks.chromium import parse_bookmark_data
for url, name in parse_bookmark_data(url_data.get_content()):
url_data.add_url(url, name=name)
@ -65,6 +71,7 @@ def parse_chromium(url_data):
def parse_safari(url_data):
"""Parse a Safari bookmark file."""
from ..bookmarks.safari import parse_bookmark_data
for url, name in parse_bookmark_data(url_data.get_raw_content()):
url_data.add_url(url, name=name)
@ -124,8 +131,9 @@ def parse_firefox(url_data):
def parse_itms_services(url_data):
"""Get "url" CGI parameter value as child URL."""
query = url_data.urlparts[3]
for k, v, sep in urlutil.parse_qsl(query, encoding=url_data.encoding,
keep_blank_values=True):
for k, v, sep in urlutil.parse_qsl(
query, encoding=url_data.encoding, keep_blank_values=True
):
if k == "url":
url_data.add_url(v)
break

View file

@ -18,7 +18,8 @@ Main functions for link parsing
"""
from xml.parsers.expat import ParserCreate
from xml.parsers.expat import ExpatError
from ..checker.const import (WARN_XML_PARSE_ERROR)
from ..checker.const import WARN_XML_PARSE_ERROR
class XmlTagUrlParser:
"""Parse XML files and find URLs in text content of a tag name."""
@ -42,11 +43,11 @@ class XmlTagUrlParser:
try:
self.parser.Parse(data, isfinal)
except ExpatError as expaterr:
self.url_data.add_warning(expaterr.message,tag=WARN_XML_PARSE_ERROR)
self.url_data.add_warning(expaterr.message, tag=WARN_XML_PARSE_ERROR)
def start_element(self, name, attrs):
"""Set tag status for start element."""
self.in_tag = (name == self.tag)
self.in_tag = name == self.tag
self.url = ""
def end_element(self, name):
@ -58,8 +59,11 @@ class XmlTagUrlParser:
def add_url(self):
"""Add non-empty URLs to the queue."""
if self.url:
self.url_data.add_url(self.url, line=self.parser.CurrentLineNumber,
column=self.parser.CurrentColumnNumber)
self.url_data.add_url(
self.url,
line=self.parser.CurrentLineNumber,
column=self.parser.CurrentColumnNumber,
)
self.url = ""
def char_data(self, data):

View file

@ -46,21 +46,23 @@ class _PluginBase:
class _ConnectionPlugin(_PluginBase):
"""Plugins run after connection checks."""
pass
class _ContentPlugin(_PluginBase):
"""Plugins run for valid URLs with content."""
pass
class _ParserPlugin(_PluginBase):
"""Plugins run for valid URLs to parse their contents."""
pass
def get_plugin_modules(folders, package='plugins',
parentpackage='linkcheck.dummy'):
def get_plugin_modules(folders, package='plugins', parentpackage='linkcheck.dummy'):
"""Get plugin modules for given folders."""
for folder in folders:
for module in loader.get_folder_modules(folder, parentpackage):
@ -114,7 +116,9 @@ class PluginManager:
def run_parser_plugins(self, url_data, pagetype):
"""Run parser plugins for given pagetype."""
run_plugins(self.parser_plugins, url_data, stop_after_match=True, pagetype=pagetype)
run_plugins(
self.parser_plugins, url_data, stop_after_match=True, pagetype=pagetype
)
def run_plugins(plugins, url_data, stop_after_match=False, **kwargs):

View file

@ -35,8 +35,7 @@ class AnchorCheck(_ContentPlugin):
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
# list of parsed anchors
self.anchors = []
linkparse.find_links(url_data.get_soup(), self.add_anchor,
linkparse.AnchorTags)
linkparse.find_links(url_data.get_soup(), self.add_anchor, linkparse.AnchorTags)
self.check_anchor(url_data)
def add_anchor(self, url, line, column, name, base):
@ -56,6 +55,8 @@ class AnchorCheck(_ContentPlugin):
else:
anchors = "-"
args = {"name": url_data.anchor, "anchors": anchors}
msg = "%s %s" % (_("Anchor `%(name)s' not found.") % args,
_("Available anchors: %(anchors)s.") % args)
msg = "%s %s" % (
_("Anchor `%(name)s' not found.") % args,
_("Available anchors: %(anchors)s.") % args,
)
url_data.add_warning(msg)

View file

@ -38,7 +38,10 @@ class HttpHeaderInfo(_ConnectionPlugin):
if name.lower().startswith(self.prefixes):
headers.append(name.lower())
if headers:
items = ["%s=%s" % (name.capitalize(), url_data.headers[name]) for name in headers]
items = [
"%s=%s" % (name.capitalize(), url_data.headers[name])
for name in headers
]
info = "HTTP headers %s" % ", ".join(items)
url_data.add_info(info)
@ -55,4 +58,3 @@ class HttpHeaderInfo(_ConnectionPlugin):
names = []
config[option] = names
return config

View file

@ -25,6 +25,7 @@ from ..decorators import synchronized
from ..strformat import unicode_safe
from .. import log, LOG_PLUGIN
class LocationInfo(_ConnectionPlugin):
"""Adds the country and if possible city name of the URL host as info.
Needs GeoIP or pygeoip and a local country or city lookup DB installed."""
@ -43,13 +44,16 @@ class LocationInfo(_ConnectionPlugin):
"""Try to ask GeoIP database for country info."""
location = get_location(url_data.host)
if location:
url_data.add_info(_("URL is located in %(location)s.") %
{"location": _(location)})
url_data.add_info(
_("URL is located in %(location)s.") % {"location": _(location)}
)
# It is unknown if the geoip library is already thread-safe, so
# no risks should be taken here by using a lock.
_lock = get_lock("geoip")
def get_geoip_dat():
"""Find a GeoIP database, preferring city over country lookup."""
datafiles = ("GeoIPCity.dat", "GeoIP.dat")
@ -63,17 +67,20 @@ def get_geoip_dat():
if os.path.isfile(filename):
return filename
# try importing both the C-library GeoIP and the pure-python pygeoip
geoip_dat = get_geoip_dat()
geoip = None
if geoip_dat:
try:
import GeoIP
geoip = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
geoip_error = GeoIP.error
except ImportError:
try:
import pygeoip
geoip = pygeoip.GeoIP(geoip_dat)
geoip_error = pygeoip.GeoIPError
except ImportError:
@ -81,7 +88,9 @@ if geoip_dat:
if geoip_dat.endswith('GeoIPCity.dat'):
get_geoip_record = lambda host: geoip.record_by_name(host)
else:
get_geoip_record = lambda host: {'country_name': geoip.country_name_by_name(host)}
get_geoip_record = lambda host: {
'country_name': geoip.country_name_by_name(host)
}
@synchronized(_lock)

View file

@ -37,8 +37,10 @@ class MarkdownCheck(_ContentPlugin):
_filename_re_key = "filename_re"
_default_filename_re = re.compile(r'.*\.(markdown|md(own)?|mkdn?)$')
_link_res = [re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
re.compile(r"""
_link_res = [
re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
re.compile(
r"""
\[.+\]: # id
[ \t]*\n? # maybe *one* newline
[ \t]*
@ -54,20 +56,26 @@ class MarkdownCheck(_ContentPlugin):
[ \t]*
)? # title is optional
(?:\n+|\Z)
""", re.X | re.M | re.U)]
""",
re.X | re.M | re.U,
),
]
_whitespace = re.compile(r'\s*')
_strip_anglebrackets = re.compile(r'<(.*)>.*')
_inline_link_title = re.compile(r'''
_inline_link_title = re.compile(
r'''
( # \1
[ \t]+
(['"]) # quote char
(.*?)
)? # title is optional
\)$
''', re.X | re.S)
''',
re.X | re.S,
)
def __init__(self, config):
super(MarkdownCheck, self).__init__(config)
@ -83,8 +91,11 @@ class MarkdownCheck(_ContentPlugin):
def read_config(cls, configparser):
"""Read configuration file options."""
config = dict()
config[cls._filename_re_key] = configparser.get(cls.__name__, cls._filename_re_key) \
if configparser.has_option(cls.__name__, cls._filename_re_key) else None
config[cls._filename_re_key] = (
configparser.get(cls.__name__, cls._filename_re_key)
if configparser.has_option(cls.__name__, cls._filename_re_key)
else None
)
return config
def applies_to(self, url_data, pagetype=None):
@ -107,7 +118,9 @@ class MarkdownCheck(_ContentPlugin):
"""
line = content.count('\n', 0, url_pos) + 1
column = url_pos - content.rfind('\n', 0, url_pos)
url_data.add_url(url_text.translate(str.maketrans("", "", '\n ')), line=line, column=column)
url_data.add_url(
url_text.translate(str.maketrans("", "", '\n ')), line=line, column=column
)
def _check_by_re(self, url_data, content):
""" Finds urls by re.
@ -144,12 +157,12 @@ class MarkdownCheck(_ContentPlugin):
end_idx = idx
has_anglebrackets = text[idx] == "<"
if has_anglebrackets:
end_idx = self._find_balanced(text, end_idx+1, "<", ">")
end_idx = self._find_balanced(text, end_idx + 1, "<", ">")
end_idx = self._find_balanced(text, end_idx, "(", ")")
match = self._inline_link_title.search(text, idx, end_idx)
if not match:
return None, None
url = text[idx:match.start()]
url = text[idx : match.start()]
if has_anglebrackets:
url = self._strip_anglebrackets.sub(r'\1', url)
return url, end_idx
@ -175,7 +188,9 @@ class MarkdownCheck(_ContentPlugin):
# Find the matching closing ']'.
bracket_depth = 0
for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, content_length)):
for p in range(
start_idx + 1, min(start_idx + MAX_LINK_TEXT_SENTINEL, content_length)
):
if content[p] == ']':
bracket_depth -= 1
if bracket_depth < 0:

View file

@ -19,6 +19,7 @@ Parse links in PDF files with pdfminer.
from io import BytesIO
from . import _ParserPlugin
try:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
@ -32,7 +33,6 @@ else:
from .. import log, LOG_PLUGIN, strformat
def search_url(obj, url_data, pageno, seen_objs):
"""Recurse through a PDF object, searching for URLs."""
if isinstance(obj, PDFObjRef):

View file

@ -17,9 +17,11 @@
Parse hyperlinks in Word files.
"""
from . import _ParserPlugin
try:
import win32com
import pythoncom
has_win32com = True
Error = pythoncom.com_error
except ImportError:
@ -29,14 +31,17 @@ from .. import fileutil, log, LOG_PLUGIN
_initialized = False
def init_win32com():
"""Initialize the win32com.client cache."""
global _initialized
if _initialized:
return
import win32com.client
if win32com.client.gencache.is_readonly:
#allow gencache to create the cached wrapper objects
# allow gencache to create the cached wrapper objects
win32com.client.gencache.is_readonly = False
# under py2exe the call in gencache to __init__() does not happen
# so we use Rebuild() to force the creation of the gen_py folder
@ -79,6 +84,7 @@ def get_word_app():
# the COM layer.
pythoncom.CoInitialize()
import win32com.client
app = win32com.client.gencache.EnsureDispatch("Word.Application")
app.Visible = False
return app
@ -91,8 +97,13 @@ def close_word_app(app):
def open_wordfile(app, filename):
"""Open given Word file with application object."""
return app.Documents.Open(filename, ReadOnly=True,
AddToRecentFiles=False, Visible=False, NoEncodingDialog=True)
return app.Documents.Open(
filename,
ReadOnly=True,
AddToRecentFiles=False,
Visible=False,
NoEncodingDialog=True,
)
def close_wordfile(doc):
@ -128,7 +139,7 @@ class WordParser(_ParserPlugin):
try:
for link in doc.Hyperlinks:
line = get_line_number(link.Range)
name=link.TextToDisplay
name = link.TextToDisplay
url_data.add_url(link.Address, name=name, line=line)
finally:
close_wordfile(doc)
@ -158,11 +169,9 @@ def get_line_number(doc, wrange):
def get_temp_filename(content):
"""Get temporary filename for content to parse."""
# store content in temporary file
fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc',
prefix='lc_')
fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc', prefix='lc_')
try:
fd.write(content)
finally:
fd.close()
return filename

View file

@ -27,6 +27,7 @@ _lock = threading.Lock()
# configuration option names
sslcertwarndays = "sslcertwarndays"
class SslCertificateCheck(_ConnectionPlugin):
"""Check SSL certificate expiration date. Only internal https: links
will be checked. A domain will only be checked once to avoid duplicate
@ -37,14 +38,20 @@ class SslCertificateCheck(_ConnectionPlugin):
def __init__(self, config):
"""Initialize clamav configuration."""
super(SslCertificateCheck, self).__init__(config)
self.warn_ssl_cert_secs_valid = config[sslcertwarndays] * strformat.SECONDS_PER_DAY
self.warn_ssl_cert_secs_valid = (
config[sslcertwarndays] * strformat.SECONDS_PER_DAY
)
# do not check hosts multiple times
self.checked_hosts = set()
def applies_to(self, url_data):
"""Check validity, scheme, extern and url_connection."""
return url_data.valid and url_data.scheme == 'https' and \
not url_data.extern[0] and url_data.url_connection is not None
return (
url_data.valid
and url_data.scheme == 'https'
and not url_data.extern[0]
and url_data.url_connection is not None
)
@synchronized(_lock)
def check(self, url_data):
@ -71,6 +78,7 @@ class SslCertificateCheck(_ConnectionPlugin):
if it's at least a number of days valid.
"""
import ssl
try:
notAfter = ssl.cert_time_to_seconds(cert['notAfter'])
except ValueError as msg:
@ -88,7 +96,9 @@ class SslCertificateCheck(_ConnectionPlugin):
else:
args['valid'] = strformat.strduration_long(secondsValid)
if secondsValid < self.warn_ssl_cert_secs_valid:
msg = _('SSL certificate expires on %(expire)s and is only %(valid)s valid.')
msg = _(
'SSL certificate expires on %(expire)s and is only %(valid)s valid.'
)
url_data.add_warning(msg % args)
else:
msg = _('SSL certificate expires on %(expire)s and is %(valid)s valid.')
@ -105,7 +115,11 @@ class SslCertificateCheck(_ConnectionPlugin):
if num > 0:
config[option] = num
else:
msg = _("invalid value for %s: %d must not be less than %d") % (option, num, 0)
msg = _("invalid value for %s: %d must not be less than %d") % (
option,
num,
0,
)
raise LinkCheckerError(msg)
else:
# set the default

View file

@ -47,6 +47,7 @@ class HtmlSyntaxCheck(_ContentPlugin):
"""Check the syntax of HTML pages with the online W3C HTML validator.
See http://validator.w3.org/docs/api.html.
"""
def __init__(self, config):
"""Initialize plugin."""
super(HtmlSyntaxCheck, self).__init__(config)
@ -69,9 +70,11 @@ class HtmlSyntaxCheck(_ContentPlugin):
return
check_w3_errors(url_data, response.text, "W3C HTML")
except requests.exceptions.RequestException:
pass # ignore service failures
pass # ignore service failures
except Exception as msg:
log.warn(LOG_PLUGIN, _("HTML syntax check plugin error: %(msg)s ") % {"msg": msg})
log.warn(
LOG_PLUGIN, _("HTML syntax check plugin error: %(msg)s ") % {"msg": msg}
)
class CssSyntaxCheck(_ContentPlugin):
@ -106,9 +109,11 @@ class CssSyntaxCheck(_ContentPlugin):
return
check_w3_errors(url_data, response.text, "W3C HTML")
except requests.exceptions.RequestException:
pass # ignore service failures
pass # ignore service failures
except Exception as msg:
log.warn(LOG_PLUGIN, _("CSS syntax check plugin error: %(msg)s ") % {"msg": msg})
log.warn(
LOG_PLUGIN, _("CSS syntax check plugin error: %(msg)s ") % {"msg": msg}
)
def check_w3_errors(url_data, xml, w3type):
@ -116,7 +121,9 @@ def check_w3_errors(url_data, xml, w3type):
w3type is either "W3C HTML" or "W3C CSS"."""
dom = parseString(xml)
for error in dom.getElementsByTagName('m:error'):
warnmsg = _("%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s")
warnmsg = _(
"%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s"
)
attrs = {
"w3type": w3type,
"line": getXmlText(error, "m:line"),

View file

@ -67,6 +67,7 @@ class VirusCheck(_ContentPlugin):
class ClamavError(Exception):
"""Raised on clamav errors."""
pass
@ -78,8 +79,7 @@ class ClamdScanner:
self.infected = []
self.errors = []
self.sock, self.host = clamav_conf.new_connection()
self.sock_rcvbuf = \
self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF)
self.sock_rcvbuf = self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF)
self.wsock = self.new_scansock()
def new_scansock(self):
@ -92,7 +92,7 @@ class ClamdScanner:
data = self.sock.recv(self.sock_rcvbuf)
i = data.find(b"PORT")
if i != -1:
port = int(data[i+5:])
port = int(data[i + 5 :])
break
except socket.error:
self.sock.close()
@ -159,7 +159,9 @@ class ClamavConfig(dict):
if self.get('ScannerDaemonOutputFormat'):
raise ClamavError(_("ScannerDaemonOutputFormat must be disabled"))
if self.get('TCPSocket') and self.get('LocalSocket'):
raise ClamavError(_("only one of TCPSocket and LocalSocket must be enabled"))
raise ClamavError(
_("only one of TCPSocket and LocalSocket must be enabled")
)
def parseconf(self, filename):
"""Parse clamav configuration from given file."""

View file

@ -35,8 +35,7 @@ class RobotFileParser:
"""This class provides a set of methods to read, parse and answer
questions about a single robots.txt file."""
def __init__(self, url='', session=None, proxies=None, auth=None,
timeout=None):
def __init__(self, url='', session=None, proxies=None, auth=None, timeout=None):
"""Initialize internal entry lists and store given url and
credentials."""
self.set_url(url)
@ -85,7 +84,7 @@ class RobotFileParser:
"""Read the robots.txt URL and feeds it to the parser."""
self._reset()
kwargs = dict(
headers = {
headers={
'User-Agent': configuration.UserAgent,
'Accept-Encoding': ACCEPT_ENCODING,
}
@ -109,7 +108,12 @@ class RobotFileParser:
except requests.HTTPError as x:
if x.response.status_code in (401, 403):
self.disallow_all = True
log.debug(LOG_CHECK, "%r disallow all (code %d)", self.url, x.response.status_code)
log.debug(
LOG_CHECK,
"%r disallow all (code %d)",
self.url,
x.response.status_code,
)
else:
self.allow_all = True
log.debug(LOG_CHECK, "%r allow all (HTTP error)", self.url)
@ -148,7 +152,12 @@ class RobotFileParser:
linenumber += 1
if not line:
if state == 1:
log.debug(LOG_CHECK, "%r line %d: allow or disallow directives without any user-agent line", self.url, linenumber)
log.debug(
LOG_CHECK,
"%r line %d: allow or disallow directives without any user-agent line",
self.url,
linenumber,
)
entry = Entry()
state = 0
elif state == 2:
@ -168,35 +177,61 @@ class RobotFileParser:
line[1] = urllib.parse.unquote(line[1].strip(), self.encoding)
if line[0] == "user-agent":
if state == 2:
log.debug(LOG_CHECK, "%r line %d: missing blank line before user-agent directive", self.url, linenumber)
log.debug(
LOG_CHECK,
"%r line %d: missing blank line before user-agent directive",
self.url,
linenumber,
)
self._add_entry(entry)
entry = Entry()
entry.useragents.append(line[1])
state = 1
elif line[0] == "disallow":
if state == 0:
log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber)
log.debug(
LOG_CHECK,
"%r line %d: missing user-agent directive before this line",
self.url,
linenumber,
)
pass
else:
entry.rulelines.append(RuleLine(line[1], False))
state = 2
elif line[0] == "allow":
if state == 0:
log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber)
log.debug(
LOG_CHECK,
"%r line %d: missing user-agent directive before this line",
self.url,
linenumber,
)
pass
else:
entry.rulelines.append(RuleLine(line[1], True))
state = 2
elif line[0] == "crawl-delay":
if state == 0:
log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber)
log.debug(
LOG_CHECK,
"%r line %d: missing user-agent directive before this line",
self.url,
linenumber,
)
pass
else:
try:
entry.crawldelay = max(0, int(line[1]))
state = 2
except (ValueError, OverflowError):
log.debug(LOG_CHECK, "%r line %d: invalid delay number %r", self.url, linenumber, line[1])
log.debug(
LOG_CHECK,
"%r line %d: invalid delay number %r",
self.url,
linenumber,
line[1],
)
pass
elif line[0] == "sitemap":
# Note that sitemap URLs must be absolute according to
@ -204,10 +239,22 @@ class RobotFileParser:
# But this should be checked by the calling layer.
self.sitemap_urls.append((line[1], linenumber))
else:
log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
log.debug(
LOG_CHECK,
"%r line %d: unknown key %r",
self.url,
linenumber,
line[0],
)
pass
else:
log.debug(LOG_CHECK, "%r line %d: malformed line %r", self.url, linenumber, line)
log.debug(
LOG_CHECK,
"%r line %d: malformed line %r",
self.url,
linenumber,
line,
)
pass
if state in (1, 2):
self.entries.append(entry)
@ -220,7 +267,13 @@ class RobotFileParser:
@return: True if agent can fetch url, else False
@rtype: bool
"""
log.debug(LOG_CHECK, "%r check allowance for:\n user agent: %r\n url: %r ...", self.url, useragent, url)
log.debug(
LOG_CHECK,
"%r check allowance for:\n user agent: %r\n url: %r ...",
self.url,
useragent,
url,
)
if not isinstance(useragent, str):
useragent = useragent.encode("ascii", "ignore")
if not isinstance(url, str):
@ -233,7 +286,10 @@ class RobotFileParser:
return True
# search for given user agent matches
# the first match counts
url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
url = (
urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2])
or "/"
)
for entry in self.entries:
if entry.applies_to(useragent):
return entry.allowance(url)
@ -296,7 +352,7 @@ class RuleLine:
@return: robots.txt format
@rtype: string
"""
return ("Allow" if self.allowance else "Disallow")+": "+self.path
return ("Allow" if self.allowance else "Disallow") + ": " + self.path
class Entry:
@ -352,5 +408,10 @@ class Entry:
if line.applies_to(filename):
log.debug(LOG_CHECK, " ... rule line %s", line)
return line.allowance
log.debug(LOG_CHECK, " ... no rule lines of %s applied to %s; allowed.", self.useragents, filename)
log.debug(
LOG_CHECK,
" ... no rule lines of %s applied to %s; allowed.",
self.useragents,
filename,
)
return True

View file

@ -123,6 +123,7 @@ _para_posix = r"(?:%(sep)s)(?:(?:%(sep)s)\s*)+" % {'sep': '\n'}
_para_win = r"(?:%(sep)s)(?:(?:%(sep)s)\s*)+" % {'sep': '\r\n'}
_para_ro = re.compile("%s|%s|%s" % (_para_mac, _para_posix, _para_win))
def get_paragraphs(text):
"""A new paragraph is considered to start at a line which follows
one or more blank lines (lines containing nothing or just spaces).
@ -148,8 +149,7 @@ def wrap(text, width, **kwargs):
def indent(text, indent_string=" "):
"""Indent each line of text with the given indent string."""
return os.linesep.join("%s%s" % (indent_string, x)
for x in text.splitlines())
return os.linesep.join("%s%s" % (indent_string, x) for x in text.splitlines())
def get_line_number(s, index):
@ -173,11 +173,12 @@ def paginate(text):
_markup_re = re.compile("<.*?>", re.DOTALL)
def remove_markup(s):
"""Remove all <*> html markup tags from s."""
mo = _markup_re.search(s)
while mo:
s = s[0:mo.start()] + s[mo.end():]
s = s[0 : mo.start()] + s[mo.end() :]
mo = _markup_re.search(s)
return s
@ -194,12 +195,20 @@ def strsize(b, grouping=True):
if b < 1024 * 1024:
return "%sKB" % locale.format_string("%.2f", (float(b) / 1024), grouping)
if b < 1024 * 1024 * 10:
return "%sMB" % locale.format_string("%.2f", (float(b) / (1024*1024)), grouping)
return "%sMB" % locale.format_string(
"%.2f", (float(b) / (1024 * 1024)), grouping
)
if b < 1024 * 1024 * 1024:
return "%sMB" % locale.format_string("%.1f", (float(b) / (1024*1024)), grouping)
return "%sMB" % locale.format_string(
"%.1f", (float(b) / (1024 * 1024)), grouping
)
if b < 1024 * 1024 * 1024 * 10:
return "%sGB" % locale.format_string("%.2f", (float(b) / (1024*1024*1024)), grouping)
return "%sGB" % locale.format_string("%.1f", (float(b) / (1024*1024*1024)), grouping)
return "%sGB" % locale.format_string(
"%.2f", (float(b) / (1024 * 1024 * 1024)), grouping
)
return "%sGB" % locale.format_string(
"%.1f", (float(b) / (1024 * 1024 * 1024)), grouping
)
def strtime(t, func=time.localtime):
@ -216,15 +225,21 @@ def strduration(duration):
else:
prefix = ""
duration = math.ceil(duration)
if duration >= SECONDS_PER_HOUR: # 1 hour
if duration >= SECONDS_PER_HOUR: # 1 hour
# time, in hours:minutes:seconds
return "%s%02d:%02d:%02d" % (prefix, duration // SECONDS_PER_HOUR,
(duration % SECONDS_PER_HOUR) // SECONDS_PER_MINUTE,
duration % SECONDS_PER_MINUTE)
return "%s%02d:%02d:%02d" % (
prefix,
duration // SECONDS_PER_HOUR,
(duration % SECONDS_PER_HOUR) // SECONDS_PER_MINUTE,
duration % SECONDS_PER_MINUTE,
)
else:
# time, in minutes:seconds
return "%s%02d:%02d" % (prefix, duration // SECONDS_PER_MINUTE,
duration % SECONDS_PER_MINUTE)
return "%s%02d:%02d" % (
prefix,
duration // SECONDS_PER_MINUTE,
duration % SECONDS_PER_MINUTE,
)
# from quodlibet
@ -236,15 +251,17 @@ def strduration_long(duration, do_translate=True):
else:
# do not translate
_ = lambda x: x
_n = lambda a, b, n: a if n==1 else b
_n = lambda a, b, n: a if n == 1 else b
if duration < 0:
duration = abs(duration)
prefix = "-"
else:
prefix = ""
if duration < 1:
return _("%(prefix)s%(duration).02f seconds") % \
{"prefix": prefix, "duration": duration}
return _("%(prefix)s%(duration).02f seconds") % {
"prefix": prefix,
"duration": duration,
}
# translation dummies
_n("%d second", "%d seconds", 1)
_n("%d minute", "%d minutes", 1)
@ -281,7 +298,7 @@ def strtimezone():
zone = time.altzone
else:
zone = time.timezone
return "%+04d" % (-zone//SECONDS_PER_HOUR)
return "%+04d" % (-zone // SECONDS_PER_HOUR)
def stripurl(s):
@ -319,7 +336,12 @@ def format_feature_warning(**kwargs):
"""Format warning that a module could not be imported and that it should
be installed for a certain URL.
"""
return _("Could not import %(module)s for %(feature)s. Install %(module)s from %(url)s to use this feature.") % kwargs
return (
_(
"Could not import %(module)s for %(feature)s. Install %(module)s from %(url)s to use this feature."
)
% kwargs
)
def strip_control_chars(text):

View file

@ -48,7 +48,7 @@ def _trace(frame, event, arg):
elif event in ('return', 'c_return'):
_trace_line(frame, event, arg)
print(" return:", arg)
#elif event in ('exception', 'c_exception'):
# elif event in ('exception', 'c_exception'):
# _trace_line(frame, event, arg)
return _trace

View file

@ -24,7 +24,9 @@ from distutils.version import LooseVersion
# Use the Freecode submit file as source since that file gets updated
# only when releasing a new version.
UPDATE_URL = "https://raw.github.com/linkchecker/linkchecker/master/linkchecker.freecode"
UPDATE_URL = (
"https://raw.github.com/linkchecker/linkchecker/master/linkchecker.freecode"
)
VERSION_TAG = 'Version:'
if os.name == 'nt':
URL_TAG = 'Windows-installer-URL:'

View file

@ -60,20 +60,23 @@ _basic = {
"_hex_full": r"0-9a-f",
"_part": r"([a-z0-9][-a-z0-9]{0,61}|[a-z])",
}
_safe_char = r"([a-z0-9%(_path)s\+]|"\
r"(%%[%(_hex_safe)s][%(_hex_full)s]))" % _basic
_safe_char = r"([a-z0-9%(_path)s\+]|" r"(%%[%(_hex_safe)s][%(_hex_full)s]))" % _basic
_safe_scheme_pattern = r"(https?|ftp)"
_safe_domain_pattern = r"(%(_part)s(\.%(_part)s)*\.?)" % _basic
_safe_host_pattern = _safe_domain_pattern+r"(:(80|8080|8000|443))?" % _basic
_safe_path_pattern = r"((/([a-z0-9%(_path)s]|"\
r"(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _basic
_safe_host_pattern = _safe_domain_pattern + r"(:(80|8080|8000|443))?" % _basic
_safe_path_pattern = (
r"((/([a-z0-9%(_path)s]|" r"(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _basic
)
_safe_fragment_pattern = r"%s*" % _safe_char
_safe_cgi = r"%s+(=(%s|/)+)?" % (_safe_char, _safe_char)
_safe_query_pattern = r"(%s(&%s)*)?" % (_safe_cgi, _safe_cgi)
_safe_param_pattern = r"(%s(;%s)*)?" % (_safe_cgi, _safe_cgi)
safe_url_pattern = r"%s://%s%s(#%s)?" % \
(_safe_scheme_pattern, _safe_host_pattern,
_safe_path_pattern, _safe_fragment_pattern)
safe_url_pattern = r"%s://%s%s(#%s)?" % (
_safe_scheme_pattern,
_safe_host_pattern,
_safe_path_pattern,
_safe_fragment_pattern,
)
is_safe_char = re.compile("(?i)^%s$" % _safe_char).match
is_safe_url = re.compile("(?i)^%s$" % safe_url_pattern).match
@ -96,7 +99,7 @@ def splitparams(path):
i = path.find(';')
if i < 0:
return path, ''
return path[:i], path[i+1:]
return path[:i], path[i + 1 :]
def is_numeric_port(portstr):
@ -113,8 +116,12 @@ def is_numeric_port(portstr):
def safe_host_pattern(host):
"""Return regular expression pattern with given host for URL testing."""
return "(?i)%s://%s%s(#%s)?" % \
(_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern)
return "(?i)%s://%s%s(#%s)?" % (
_safe_scheme_pattern,
host,
_safe_path_pattern,
_safe_fragment_pattern,
)
def parse_qsl(qs, encoding, keep_blank_values=0, strict_parsing=0):
@ -190,18 +197,23 @@ def url_fix_host(urlparts, encoding):
userpass, netloc = urllib.parse.splituser(urlparts[1])
if userpass:
userpass = urllib.parse.unquote(userpass, encoding=encoding)
netloc, is_idn = idna_encode(urllib.parse.unquote(netloc, encoding=encoding).lower())
netloc, is_idn = idna_encode(
urllib.parse.unquote(netloc, encoding=encoding).lower()
)
# a leading backslash in path causes urlsplit() to add the
# path components up to the first slash to host
# try to find this case...
i = netloc.find("\\")
if i != -1:
# ...and fix it by prepending the misplaced components to the path
comps = netloc[i:] # note: still has leading backslash
comps = netloc[i:] # note: still has leading backslash
if not urlparts[2] or urlparts[2] == '/':
urlparts[2] = comps
else:
urlparts[2] = "%s%s" % (comps, urllib.parse.unquote(urlparts[2], encoding=encoding))
urlparts[2] = "%s%s" % (
comps,
urllib.parse.unquote(urlparts[2], encoding=encoding),
)
netloc = netloc[:i]
else:
# a leading ? in path causes urlsplit() to add the query to the
@ -224,7 +236,7 @@ def url_fix_host(urlparts, encoding):
if port != dport:
host = "%s:%d" % (host, port)
netloc = host
urlparts[1] = userpass+netloc
urlparts[1] = userpass + netloc
return is_idn
@ -243,21 +255,25 @@ def url_fix_mailto_urlsplit(urlparts):
if sep in urlparts[2]:
urlparts[2], urlparts[3] = urlparts[2].split(sep, 1)
# Wayback Machine URLs embed a full "http[s]://" inside the path.  By
# default, linkchecker's tidying percent-encodes the colon and drops the
# second slash; this regex and function reverse those corrections.  The
# function expects only the path section of the URL as input.
wayback_regex = re.compile(r'(https?)(\%3A/|:/)')
def url_fix_wayback_query(path):
    """Restore "scheme://" separators inside a wayback-style path.

    Tidying may have left them as "http%3A/" or "http:/"; substitute the
    canonical "http://" / "https://" form and return the repaired path.
    """
    repaired = wayback_regex.sub(r'\1://', path)
    return repaired
def url_parse_query(query, encoding):
"""Parse and re-join the given CGI query."""
# if ? is in the query, split it off, seen at msdn.microsoft.com
append = ""
while '?' in query:
query, rest = query.rsplit('?', 1)
append = '?'+url_parse_query(rest, encoding=encoding)+append
append = '?' + url_parse_query(rest, encoding=encoding) + append
l = []
for k, v, sep in parse_qsl(query, keep_blank_values=True, encoding=encoding):
k = urllib.parse.quote(k, safe='/-:,;')
@ -316,12 +332,14 @@ def url_norm(url, encoding):
# anchor
urlparts[4] = urllib.parse.unquote(urlparts[4], encoding=encoding)
# quote parts again
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
urlparts[1] = urllib.parse.quote(urlparts[1], safe='@:') # host
urlparts[2] = urllib.parse.quote(urlparts[2], safe=_nopathquote_chars) # path
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
urlparts[1] = urllib.parse.quote(urlparts[1], safe='@:') # host
urlparts[2] = urllib.parse.quote(urlparts[2], safe=_nopathquote_chars) # path
if not urlparts[0].startswith("feed"):
urlparts[2] = url_fix_wayback_query(urlparts[2]) # unencode colon in http[s]:// in wayback path
urlparts[4] = urllib.parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor
urlparts[2] = url_fix_wayback_query(
urlparts[2]
) # unencode colon in http[s]:// in wayback path
urlparts[4] = urllib.parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor
res = urlunsplit(urlparts)
if url.endswith('#') and not urlparts[4]:
# re-append trailing empty fragment
@ -334,6 +352,8 @@ _thisdir_ro = re.compile(r"^\./")
# A "/./" in the middle of a path, or a trailing "/." segment.
_samedir_ro = re.compile(r"/\./|/\.$")
# Absolute paths: one or more leading "/../", or a "<segment>/.." pair
# (where the segment itself is not "..").
_parentdir_ro = re.compile(r"^/(\.\./)+|/(?!\.\./)[^/]+/\.\.(/|$)")
# Relative paths: a leading "<segment>/.." pair (segment itself not "..").
_relparentdir_ro = re.compile(r"^(?!\.\./)[^/]+/\.\.(/|$)")
def collapse_segments(path):
"""Remove all redundant segments from the given URL path.
Precondition: path is an unquoted url path"""
@ -375,12 +395,14 @@ def url_quote(url, encoding):
if not url_is_absolute(url):
return document_quote(url)
urlparts = list(urllib.parse.urlsplit(url))
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
urlparts[1] = urllib.parse.quote(urlparts[1], safe=':') # host
urlparts[2] = urllib.parse.quote(urlparts[2], safe='/=,') # path
urlparts[3] = urllib.parse.quote(urlparts[3], safe='&=,') # query
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
urlparts[1] = urllib.parse.quote(urlparts[1], safe=':') # host
urlparts[2] = urllib.parse.quote(urlparts[2], safe='/=,') # path
urlparts[3] = urllib.parse.quote(urlparts[3], safe='&=,') # query
l = []
for k, v, sep in parse_qsl(urlparts[3], encoding=encoding, keep_blank_values=True): # query
for k, v, sep in parse_qsl(
urlparts[3], encoding=encoding, keep_blank_values=True
): # query
k = urllib.parse.quote(k, safe='/-:,;')
if v:
v = urllib.parse.quote(v, safe='/-:,;')
@ -388,7 +410,7 @@ def url_quote(url, encoding):
else:
l.append("%s%s" % (k, sep))
urlparts[3] = ''.join(l)
urlparts[4] = urllib.parse.quote(urlparts[4]) # anchor
urlparts[4] = urllib.parse.quote(urlparts[4]) # anchor
return urlunsplit(urlparts)
@ -425,8 +447,10 @@ def match_host(host, domainlist):
_nopathquote_chars = "-;/=,~*+()@!"
if os.name == 'nt':
_nopathquote_chars += "|"
_safe_url_chars = re.escape(_nopathquote_chars + "_:.&#%?[]!")+"a-zA-Z0-9"
_safe_url_chars = re.escape(_nopathquote_chars + "_:.&#%?[]!") + "a-zA-Z0-9"
_safe_url_chars_ro = re.compile(r"^[%s]*$" % _safe_url_chars)
def url_needs_quoting(url):
"""Check if url needs percent quoting. Note that the method does
only check basic character sets, and not any other syntax.
@ -487,8 +511,7 @@ def splitport(host, port=0):
return host, port
def get_content(url, user=None, password=None, proxy=None, data=None,
addheaders=None):
def get_content(url, user=None, password=None, proxy=None, data=None, addheaders=None):
"""Get URL content and info.
@return: (decoded text content of URL, headers) or
@ -496,6 +519,7 @@ def get_content(url, user=None, password=None, proxy=None, data=None,
@rtype: tuple (String, dict) or (None, String)
"""
from . import configuration
headers = {
'User-Agent': configuration.UserAgent,
}
@ -511,6 +535,7 @@ def get_content(url, user=None, password=None, proxy=None, data=None,
if proxy:
kwargs['proxy'] = dict(http=proxy)
from .configuration import get_share_file
try:
kwargs["verify"] = get_share_file('cacert.pem')
except ValueError:
@ -518,10 +543,15 @@ def get_content(url, user=None, password=None, proxy=None, data=None,
try:
response = requests.request(method, url, **kwargs)
return response.text, response.headers
except (requests.exceptions.RequestException,
requests.exceptions.BaseHTTPError) as msg:
log.warn(LOG_CHECK, ("Could not get content of URL %(url)s: %(msg)s.") \
% {"url": url, "msg": str(msg)})
except (
requests.exceptions.RequestException,
requests.exceptions.BaseHTTPError,
) as msg:
log.warn(
LOG_CHECK,
("Could not get content of URL %(url)s: %(msg)s.")
% {"url": url, "msg": str(msg)},
)
return None, str(msg)