mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Run black on linkcheck/
This commit is contained in:
parent
152dbeb9b8
commit
a92a684ac4
76 changed files with 2021 additions and 1068 deletions
|
|
@ -19,9 +19,14 @@ Main function module for link checking.
|
|||
|
||||
# version checks
|
||||
import sys
|
||||
|
||||
if sys.version_info < (3, 5, 0, 'final', 0):
|
||||
import platform
|
||||
raise SystemExit("This program requires Python 3.5.0 or later instead of %s." % platform.python_version())
|
||||
|
||||
raise SystemExit(
|
||||
"This program requires Python 3.5.0 or later instead of %s."
|
||||
% platform.python_version()
|
||||
)
|
||||
|
||||
import os
|
||||
import re
|
||||
|
|
@ -48,6 +53,7 @@ def module_path():
|
|||
def get_install_data():
|
||||
"""Return absolute path of LinkChecker data installation directory."""
|
||||
from .loader import is_frozen
|
||||
|
||||
if is_frozen():
|
||||
return module_path()
|
||||
return configdata.install_data
|
||||
|
|
@ -55,10 +61,13 @@ def get_install_data():
|
|||
|
||||
class LinkCheckerError(Exception):
    """Exception to be raised on linkchecker-specific check errors."""
|
||||
|
||||
|
||||
class LinkCheckerInterrupt(Exception):
    """Used for testing."""
|
||||
|
||||
|
||||
|
|
@ -106,6 +115,7 @@ def init_i18n(loc=None):
|
|||
i18n.init(configdata.name.lower(), locdir, loc=loc)
|
||||
# install translated log level names
|
||||
import logging
|
||||
|
||||
logging.addLevelName(logging.CRITICAL, _('CRITICAL'))
|
||||
logging.addLevelName(logging.ERROR, _('ERROR'))
|
||||
logging.addLevelName(logging.WARN, _('WARN'))
|
||||
|
|
@ -124,15 +134,22 @@ def drop_privileges():
|
|||
if os.name != 'posix':
|
||||
return
|
||||
if os.geteuid() == 0:
|
||||
log.warn(LOG_CHECK, _("Running as root user; "
|
||||
"dropping privileges by changing user to nobody."))
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
_(
|
||||
"Running as root user; "
|
||||
"dropping privileges by changing user to nobody."
|
||||
),
|
||||
)
|
||||
import pwd
|
||||
|
||||
os.seteuid(pwd.getpwnam('nobody')[3])
|
||||
|
||||
|
||||
if hasattr(signal, "SIGUSR1"):
|
||||
# install SIGUSR1 handler
|
||||
from .decorators import signal_handler
|
||||
|
||||
@signal_handler(signal.SIGUSR1)
|
||||
def print_threadstacks(sig, frame):
|
||||
"""Print stack traces of all running threads."""
|
||||
|
|
@ -140,7 +157,9 @@ if hasattr(signal, "SIGUSR1"):
|
|||
for threadId, stack in sys._current_frames().items():
|
||||
log.warn(LOG_THREAD, "# ThreadID: %s" % threadId)
|
||||
for filename, lineno, name, line in traceback.extract_stack(stack):
|
||||
log.warn(LOG_THREAD, 'File: "%s", line %d, in %s' % (filename, lineno, name))
|
||||
log.warn(
|
||||
LOG_THREAD, 'File: "%s", line %d, in %s' % (filename, lineno, name)
|
||||
)
|
||||
line = line.strip()
|
||||
if line:
|
||||
log.warn(LOG_THREAD, " %s" % line)
|
||||
|
|
|
|||
|
|
@ -59,6 +59,7 @@ import os
|
|||
import logging
|
||||
import types
|
||||
from .fileutil import has_module, is_tty
|
||||
|
||||
if os.name == 'nt':
|
||||
from . import colorama
|
||||
|
||||
|
|
@ -79,16 +80,16 @@ concealed = 'concealed'
|
|||
|
||||
# Control numbers
|
||||
# Maps each supported text-style name (or None for "no style") to the
# control number used when building an ANSI escape sequence.
AnsiControl = {
    None: '',
    bold: '1',
    light: '2',
    # italic: '3', # unsupported
    underline: '4',
    blink: '5',
    # rapidblink: '6', # unsupported
    invert: '7',
    concealed: '8',
    # strikethrough: '9', # unsupported
}
|
||||
|
||||
# Color constants
|
||||
|
|
@ -116,47 +117,47 @@ InverseColors = (Black, Red, Green, Yellow, Blue, Purple, Cyan, White)
|
|||
|
||||
# Ansi color numbers; capitalized colors are inverse
|
||||
# Lower-case names map to the 30-37 codes, capitalized names to the 40-47
# codes; None and `default` both map to '0'.
AnsiColor = {
    None: '0',
    default: '0',
    black: '30',
    red: '31',
    green: '32',
    yellow: '33',
    blue: '34',
    purple: '35',
    cyan: '36',
    white: '37',
    Black: '40',
    Red: '41',
    Green: '42',
    Yellow: '43',
    Blue: '44',
    Purple: '45',
    Cyan: '46',
    White: '47',
}
|
||||
|
||||
if os.name == 'nt':
    # Windows color numbers; capitalized colors are used as background.
    # Lower-case and capitalized names share the same colorama constant, and
    # both `default` and `white` map to colorama.GREY.
    WinColor = {
        None: None,
        default: colorama.GREY,
        black: colorama.BLACK,
        red: colorama.RED,
        green: colorama.GREEN,
        yellow: colorama.YELLOW,
        blue: colorama.BLUE,
        purple: colorama.MAGENTA,
        cyan: colorama.CYAN,
        white: colorama.GREY,
        Black: colorama.BLACK,
        Red: colorama.RED,
        Green: colorama.GREEN,
        Yellow: colorama.YELLOW,
        Blue: colorama.BLUE,
        Purple: colorama.MAGENTA,
        Cyan: colorama.CYAN,
        White: colorama.GREY,
    }
|
||||
|
||||
# pc speaker beep escape code
|
||||
|
|
@ -168,9 +169,10 @@ def esc_ansicolor(color):
|
|||
control = ''
|
||||
if ";" in color:
|
||||
control, color = color.split(";", 1)
|
||||
control = AnsiControl.get(control, '')+";"
|
||||
control = AnsiControl.get(control, '') + ";"
|
||||
cnum = AnsiColor.get(color, '0')
|
||||
return AnsiEsc % (control+cnum)
|
||||
return AnsiEsc % (control + cnum)
|
||||
|
||||
|
||||
AnsiReset = esc_ansicolor(default)
|
||||
|
||||
|
|
@ -201,6 +203,7 @@ def has_colors(fp):
|
|||
return True
|
||||
elif has_curses:
|
||||
import curses
|
||||
|
||||
try:
|
||||
curses.setupterm(os.environ.get("TERM"), fp.fileno())
|
||||
# More than 8 colors are good enough.
|
||||
|
|
@ -218,19 +221,19 @@ def get_columns(fp):
|
|||
return colorama.get_console_size().X
|
||||
if has_curses:
|
||||
import curses
|
||||
|
||||
try:
|
||||
curses.setupterm(os.environ.get("TERM"), fp.fileno())
|
||||
return curses.tigetnum("cols")
|
||||
except curses.error:
|
||||
pass
|
||||
pass
|
||||
return 80
|
||||
|
||||
|
||||
def _write_color_colorama(fp, text, color):
    """Colorize text with given color.

    Resolves ``color`` through ``get_win_color()`` into a foreground,
    background and style, applies them with ``colorama.set_console()``,
    writes ``text`` to ``fp`` and then resets the console.

    @param fp: object with a ``write()`` method receiving the text
    @param text: the text to write
    @param color: color specification passed to ``get_win_color()``
    """
    foreground, background, style = get_win_color(color)
    colorama.set_console(foreground=foreground, background=background, style=style)
    try:
        fp.write(text)
    finally:
        # Reset even when the write fails, so an error does not leave the
        # console stuck with the attributes set above.
        colorama.reset_console()
|
||||
|
||||
|
|
@ -314,7 +317,6 @@ class ColoredStreamHandler(logging.StreamHandler):
|
|||
try:
|
||||
self.stream.write("%s" % msg, color=color)
|
||||
except UnicodeError:
|
||||
self.stream.write("%s" % msg.encode("UTF-8"),
|
||||
color=color)
|
||||
self.stream.write("%s" % msg.encode("UTF-8"), color=color)
|
||||
self.stream.write(os.linesep)
|
||||
self.flush()
|
||||
|
|
|
|||
|
|
@ -5,14 +5,14 @@
|
|||
|
||||
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# 1. Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
# list of conditions and the following disclaimer.
|
||||
# 2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
|
|
@ -40,201 +40,283 @@ pykeywords = set(keyword.kwlist)
|
|||
|
||||
|
||||
def parse_py_statement(line):
    """Tokenize a single line of Python source with a small state machine.

    Yields ``(kind, text)`` tuples in source order, where ``kind`` is one of:

    * ``"op"`` -- a single character from the operator/punctuation set;
    * ``"id"`` -- any run of characters that is not whitespace, an operator,
      a quote or ``#`` (names, keywords and number literals all arrive as
      ``"id"``);
    * ``"str"`` -- the contents of a single- or double-quoted string, with
      the quotes removed and backslash escapes resolved;
    * ``"comment"`` -- the text after a ``#``, emitted once at end of line.

    Only ``\\n`` and ``\\t`` are translated as escapes; any other escaped
    character stands for itself.  A string that is still open when the line
    ends produces no token.
    """
    # Scanner states.
    START, DQ_STRING, SQ_STRING, IDENT, DQ_ESCAPE, SQ_ESCAPE, COMMENT = range(7)

    spaces = " \t\n"
    ops = ".,;:+-*/%&=|(){}[]^<>"
    # Any of these characters ends a running identifier.
    ident_terminators = spaces + ops + "#\"'"

    def _escape_char(c):
        if c == "n":
            return "\n"
        elif c == "t":
            return "\t"
        else:
            return c

    state = START
    curtoken = ""
    i = 0
    while i < len(line):
        c = line[i]
        i += 1
        if state == START:
            if c in spaces:
                pass
            elif c in ops:
                yield ("op", c)
            elif c == "#":
                state = COMMENT
            elif c == "\"":
                state = DQ_STRING
            elif c == "'":
                state = SQ_STRING
            else:
                curtoken = c
                state = IDENT
        elif state == DQ_STRING:  # string via "
            if c == "\\":
                state = DQ_ESCAPE
            elif c == "\"":
                yield ("str", curtoken)
                curtoken = ""
                state = START
            else:
                curtoken += c
        elif state == SQ_STRING:  # string via '
            if c == "\\":
                state = SQ_ESCAPE
            elif c == "'":
                yield ("str", curtoken)
                curtoken = ""
                state = START
            else:
                curtoken += c
        elif state == IDENT:  # identifier
            if c in ident_terminators:
                yield ("id", curtoken)
                curtoken = ""
                state = START
                # Re-scan the terminating character from the start state so
                # it is emitted as its own token (or opens a string/comment).
                i -= 1
            else:
                curtoken += c
        elif state == DQ_ESCAPE:  # escape in "
            curtoken += _escape_char(c)
            state = DQ_STRING
        elif state == SQ_ESCAPE:  # escape in '
            curtoken += _escape_char(c)
            state = SQ_STRING
        elif state == COMMENT:  # comment
            curtoken += c
    # Flush whatever was still being collected when the line ended.
    if state == IDENT:
        yield ("id", curtoken)
    elif state == COMMENT:
        yield ("comment", curtoken)
|
||||
|
||||
|
||||
def grep_full_py_identifiers(tokens):
    """Yield the dotted identifier chains contained in a token stream.

    ``tokens`` is an iterable of ``(kind, text)`` tuples as produced by
    ``parse_py_statement()``.  An ``"id"`` token followed by any number of
    ``("op", ".")`` + ``"id"`` pairs is joined into one dotted name such as
    ``"a.b.c"``.  Names that are empty, are listed in the module-level
    ``pykeywords`` set, or start with a digit or a dot are skipped.
    """
    tokens = list(tokens)
    i = 0
    while i < len(tokens):
        tokentype, token = tokens[i]
        i += 1
        if tokentype != "id":
            continue
        # Absorb every following ".name" pair into the current identifier.
        while (
            i + 1 < len(tokens)
            and tokens[i] == ("op", ".")
            and tokens[i + 1][0] == "id"
        ):
            token += "." + tokens[i + 1][1]
            i += 2
        if token == "":
            continue
        if token in pykeywords:
            continue
        if token[0] in ".0123456789":
            continue
        yield token
|
||||
|
||||
|
||||
def output(s, out=sys.stdout):
    """Print ``s`` followed by a newline to the stream ``out``.

    The default stream is whatever ``sys.stdout`` referred to when this
    function was defined, not when it is called.
    """
    print(s, file=out)
|
||||
|
||||
def output_limit():
    """Return the maximum number of characters of a repr() to print."""
    return 300
|
||||
|
||||
|
||||
def pp_extra_info(obj, depthlimit=3):
    """Return a short, comma-separated summary of a container-like object.

    The summary may contain:

    * ``len = N`` when ``obj`` has ``__len__``, unless it is exactly a
      ``bytes``/``str``/``list``/``tuple``/``dict`` with at most 5 items;
    * ``_[0]: {...}`` with the summary of ``obj[0]``, when ``obj`` has
      ``__getitem__``, is not ``bytes``/``str``, and that nested summary is
      not empty.

    @param obj: the object to describe
    @param depthlimit: how many levels of ``obj[0]`` to descend into
    @return: the summary string, ``""`` when there is nothing to report
    """
    parts = []
    if hasattr(obj, "__len__"):
        try:
            is_short_builtin = (
                type(obj) in (bytes, str, list, tuple, dict) and len(obj) <= 5
            )
            if not is_short_builtin:
                parts.append("len = " + str(obj.__len__()))
        except Exception:
            # Best effort only: e.g. a class object has __len__ but cannot
            # be measured. KeyboardInterrupt/SystemExit still propagate.
            pass
    if depthlimit > 0 and hasattr(obj, "__getitem__"):
        # Taking the first item of a string or bytes object is not useful.
        if type(obj) not in (bytes, str):
            try:
                subobj = obj.__getitem__(0)
                extra_info = pp_extra_info(subobj, depthlimit - 1)
                if extra_info != "":
                    parts.append("_[0]: {" + extra_info + "}")
            except Exception:
                # Empty sequences, dicts without key 0, etc.
                pass
    return ", ".join(parts)
|
||||
|
||||
def pretty_print(obj):
    """Return a bounded-length description of ``obj``.

    The ``repr()`` of the object is cut to ``output_limit()`` characters
    (the last three being ``...`` when it was cut) and, if
    ``pp_extra_info()`` has something to report, that summary is appended
    after a comma.
    """
    s = repr(obj)
    limit = output_limit()
    if len(s) > limit:
        s = s[: limit - 3] + "..."
    extra_info = pp_extra_info(obj)
    if extra_info != "":
        s += ", " + extra_info
    return s
|
||||
|
||||
|
||||
def fallback_findfile(filename):
    """Find the file of a loaded module whose path contains ``filename``.

    Walks ``sys.modules`` and returns the ``__file__`` of the first module
    whose ``__file__`` contains ``filename`` as a substring.  When that path
    ends in ``.py`` plus one more character (``*.pyc`` or whatever), the
    last character is dropped so the path names the source file.

    @param filename: path fragment to look for
    @return: the matching path, or None when no loaded module matches
    """
    # Iterate over a snapshot: the module table may change while we look.
    for mod in list(sys.modules.values()):
        # Some modules have no __file__ or have it set to None (for example
        # namespace packages), so a substring test on it would raise.
        altfn = getattr(mod, "__file__", None)
        if not isinstance(altfn, str) or filename not in altfn:
            continue
        if altfn[-4:-1] == ".py":
            altfn = altfn[:-1]  # *.pyc or whatever
        return altfn
    return None
|
||||
|
||||
|
||||
def better_exchook(etype, value, tb, out=sys.stdout):
    """Print a traceback that also shows the values used on each line.

    For every frame of ``tb`` (up to ``sys.tracebacklimit`` frames when that
    is set) the file, line number and function name are written, followed by
    the source line and the value of each dotted identifier found on it.
    Identifiers are looked up in the frame's locals, then globals, then
    builtins.  If collecting this information fails, the error and a plain
    traceback are written instead.  The final line is the exception type
    and message.

    NOTE(review): the message literals below are reproduced as they appear
    in the text this block was recovered from; runs of spaces inside them
    may have been collapsed there -- confirm against the repository.

    @param etype: exception class (an instance, a string or None is also
        accepted for the final line)
    @param value: exception instance or message
    @param tb: traceback object to walk
    @param out: stream receiving the report; defaults to the ``sys.stdout``
        object bound when this function was defined
    """
    output('Traceback (most recent call last):', out=out)
    try:
        import linecache

        limit = getattr(sys, 'tracebacklimit', None)
        n = 0
        _tb = tb

        def _resolveIdentifier(namespace, parts):
            # parts is a tuple such as ("a", "b", "c") for "a.b.c".
            obj = namespace[parts[0]]
            for part in parts[1:]:
                obj = getattr(obj, part)
            return obj

        def _trySet(old, prefix, func):
            # Keep an earlier result; otherwise try func(). A KeyError means
            # "not in this namespace"; any other error is reported inline.
            if old is not None:
                return old
            try:
                return prefix + func()
            except KeyError:
                return old
            except Exception as e:
                return prefix + "!" + e.__class__.__name__ + ": " + str(e)

        while _tb is not None and (limit is None or n < limit):
            f = _tb.tb_frame
            lineno = _tb.tb_lineno
            co = f.f_code
            filename = co.co_filename
            name = co.co_name
            output(' File "%s", line %d, in %s' % (filename, lineno, name), out=out)
            if not os.path.isfile(filename):
                altfn = fallback_findfile(filename)
                if altfn:
                    output(
                        " -- couldn't find file, trying this instead: " + altfn,
                        out=out,
                    )
                    filename = altfn
            linecache.checkcache(filename)
            line = linecache.getline(filename, lineno, f.f_globals)
            if line:
                line = line.strip()
                output(' line: ' + line, out=out)
                output(' locals:', out=out)
                alreadyPrintedLocals = set()
                for tokenstr in grep_full_py_identifiers(parse_py_statement(line)):
                    splittedtoken = tuple(tokenstr.split("."))
                    # Report every prefix: a, then a.b, then a.b.c.
                    for end in range(1, len(splittedtoken) + 1):
                        token = splittedtoken[0:end]
                        if token in alreadyPrintedLocals:
                            continue
                        tokenvalue = None
                        tokenvalue = _trySet(
                            tokenvalue,
                            "<local> ",
                            lambda: pretty_print(_resolveIdentifier(f.f_locals, token)),
                        )
                        tokenvalue = _trySet(
                            tokenvalue,
                            "<global> ",
                            lambda: pretty_print(
                                _resolveIdentifier(f.f_globals, token)
                            ),
                        )
                        tokenvalue = _trySet(
                            tokenvalue,
                            "<builtin> ",
                            lambda: pretty_print(
                                _resolveIdentifier(f.f_builtins, token)
                            ),
                        )
                        tokenvalue = tokenvalue or "<not found>"
                        output(' ' + ".".join(token) + " = " + tokenvalue, out=out)
                        alreadyPrintedLocals.add(token)
                if len(alreadyPrintedLocals) == 0:
                    output(" no locals", out=out)
            else:
                output(' -- code not available --', out=out)
            _tb = _tb.tb_next
            n += 1

    except Exception:
        # Reporting must not fail: fall back to the standard traceback.
        output("ERROR: cannot get more detailed exception info because:", out=out)
        import traceback

        for exc_line in traceback.format_exc().split("\n"):
            output(" " + exc_line, out=out)
        output("simple traceback:", out=out)
        traceback.print_tb(tb, None, out)

    import types

    def _some_str(value):
        try:
            return str(value)
        except Exception:
            return '<unprintable %s object>' % type(value).__name__

    def _format_final_exc_line(etype, value):
        valuestr = _some_str(value)
        if value is None or not valuestr:
            line = "%s" % etype
        else:
            line = "%s: %s" % (etype, valuestr)
        return line

    # Anything that is not a class has no usable __name__; print it as is.
    if (
        isinstance(etype, BaseException)
        or (hasattr(types, "InstanceType") and isinstance(etype, types.InstanceType))
        or etype is None
        or type(etype) is str
    ):
        output(_format_final_exc_line(etype, value), out=out)
    else:
        output(_format_final_exc_line(etype.__name__, value), out=out)
|
||||
|
||||
def install():
    """Make better_exchook the handler for uncaught exceptions."""
    sys.excepthook = better_exchook
|
||||
|
|
|
|||
|
|
@ -16,8 +16,10 @@
|
|||
"""Parser for FireFox bookmark file."""
|
||||
|
||||
import re
|
||||
|
||||
try:
|
||||
import sqlite3
|
||||
|
||||
has_sqlite = True
|
||||
except ImportError:
|
||||
has_sqlite = False
|
||||
|
|
|
|||
|
|
@ -15,8 +15,10 @@
|
|||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
|
||||
import plistlib
|
||||
|
||||
try:
|
||||
import biplist
|
||||
|
||||
has_biplist = True
|
||||
except ImportError:
|
||||
has_biplist = False
|
||||
|
|
@ -48,6 +50,7 @@ KEY_URIDICTIONARY = 'URIDictionary'
|
|||
KEY_CHILDREN = 'Children'
|
||||
KEY_WEBBOOKMARKTYPE = 'WebBookmarkType'
|
||||
|
||||
|
||||
def parse_plist(entry):
|
||||
"""Parse a XML dictionary entry."""
|
||||
if is_leaf(entry):
|
||||
|
|
|
|||
3
linkcheck/cache/robots_txt.py
vendored
3
linkcheck/cache/robots_txt.py
vendored
|
|
@ -56,8 +56,7 @@ class RobotsTxt:
|
|||
rp = self.cache[roboturl]
|
||||
return rp.can_fetch(self.useragent, url_data.url)
|
||||
self.misses += 1
|
||||
kwargs = dict(auth=url_data.auth, session=url_data.session,
|
||||
timeout=timeout)
|
||||
kwargs = dict(auth=url_data.auth, session=url_data.session, timeout=timeout)
|
||||
if hasattr(url_data, "proxy") and hasattr(url_data, "proxy_type"):
|
||||
kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
|
||||
rp = robotparser2.RobotFileParser(**kwargs)
|
||||
|
|
|
|||
12
linkcheck/cache/urlqueue.py
vendored
12
linkcheck/cache/urlqueue.py
vendored
|
|
@ -24,15 +24,19 @@ from .. import log, LOG_CACHE
|
|||
|
||||
class Timeout(Exception):
    """Raised by join()"""
|
||||
|
||||
|
||||
class Empty(Exception):
    """Exception raised by get()."""
|
||||
|
||||
|
||||
NUM_PUTS_CLEANUP = 10000
|
||||
|
||||
|
||||
class UrlQueue:
|
||||
"""A queue supporting several consumer tasks. The task_done() idea is
|
||||
from the Python 2.5 implementation of Queue.Queue()."""
|
||||
|
|
@ -58,7 +62,9 @@ class UrlQueue:
|
|||
# Each put() decreases the number of allowed puts.
|
||||
# This way we can restrict the number of URLs that are checked.
|
||||
if max_allowed_urls is not None and max_allowed_urls <= 0:
|
||||
raise ValueError("Non-positive number of allowed URLs: %d" % max_allowed_urls)
|
||||
raise ValueError(
|
||||
"Non-positive number of allowed URLs: %d" % max_allowed_urls
|
||||
)
|
||||
self.max_allowed_urls = max_allowed_urls
|
||||
self.num_puts = 0
|
||||
|
||||
|
|
@ -132,7 +138,9 @@ class UrlQueue:
|
|||
self.cleanup()
|
||||
self.queue.append(url_data)
|
||||
self.unfinished_tasks += 1
|
||||
cache.add_result(key, None) # add none value to cache to prevent checking this url multiple times
|
||||
cache.add_result(
|
||||
key, None
|
||||
) # add none value to cache to prevent checking this url multiple times
|
||||
|
||||
def cleanup(self):
|
||||
"""Move cached elements to top."""
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ import urllib.parse
|
|||
|
||||
from .. import strformat, url as urlutil, log, LOG_CHECK
|
||||
|
||||
MAX_FILESIZE = 1024 * 1024 * 10  # 10MB
|
||||
|
||||
|
||||
def guess_url(url):
|
||||
|
|
@ -64,9 +64,20 @@ def absolute_url(base_url, base_ref, parent_url):
|
|||
return ""
|
||||
|
||||
|
||||
def get_url_from(base_url, recursion_level, aggregate,
|
||||
parent_url=None, base_ref=None, line=None, column=None,
|
||||
page=0, name="", parent_content_type=None, extern=None, url_encoding=None):
|
||||
def get_url_from(
|
||||
base_url,
|
||||
recursion_level,
|
||||
aggregate,
|
||||
parent_url=None,
|
||||
base_ref=None,
|
||||
line=None,
|
||||
column=None,
|
||||
page=0,
|
||||
name="",
|
||||
parent_content_type=None,
|
||||
extern=None,
|
||||
url_encoding=None,
|
||||
):
|
||||
"""
|
||||
Get url data from given base data.
|
||||
|
||||
|
|
@ -112,17 +123,31 @@ def get_url_from(base_url, recursion_level, aggregate,
|
|||
name = base_url.replace("\\", "/")
|
||||
allowed_schemes = aggregate.config["allowedschemes"]
|
||||
# ignore local PHP files with execution directives
|
||||
local_php = (parent_content_type == 'application/x-httpd-php' and
|
||||
'<?' in base_url and '?>' in base_url and scheme == 'file')
|
||||
local_php = (
|
||||
parent_content_type == 'application/x-httpd-php'
|
||||
and '<?' in base_url
|
||||
and '?>' in base_url
|
||||
and scheme == 'file'
|
||||
)
|
||||
if local_php or (allowed_schemes and scheme not in allowed_schemes):
|
||||
klass = ignoreurl.IgnoreUrl
|
||||
else:
|
||||
assume_local_file = (recursion_level == 0)
|
||||
assume_local_file = recursion_level == 0
|
||||
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
|
||||
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
|
||||
return klass(base_url, recursion_level, aggregate,
|
||||
parent_url=parent_url, base_ref=base_ref,
|
||||
line=line, column=column, page=page, name=name, extern=extern, url_encoding=url_encoding)
|
||||
return klass(
|
||||
base_url,
|
||||
recursion_level,
|
||||
aggregate,
|
||||
parent_url=parent_url,
|
||||
base_ref=base_ref,
|
||||
line=line,
|
||||
column=column,
|
||||
page=page,
|
||||
name=name,
|
||||
extern=extern,
|
||||
url_encoding=url_encoding,
|
||||
)
|
||||
|
||||
|
||||
def get_urlclass_from(scheme, assume_local_file=False):
|
||||
|
|
@ -175,5 +200,15 @@ def get_index_html(urls):
|
|||
|
||||
|
||||
# all the URL classes
|
||||
from . import (fileurl, unknownurl, ftpurl, httpurl, dnsurl,
|
||||
mailtourl, telneturl, nntpurl, ignoreurl, itmsservicesurl) # noqa: E402
|
||||
from . import (
|
||||
fileurl,
|
||||
unknownurl,
|
||||
ftpurl,
|
||||
httpurl,
|
||||
dnsurl,
|
||||
mailtourl,
|
||||
telneturl,
|
||||
nntpurl,
|
||||
ignoreurl,
|
||||
itmsservicesurl,
|
||||
) # noqa: E402
|
||||
|
|
|
|||
|
|
@ -33,7 +33,7 @@ ExcSyntaxList = [
|
|||
# exceptions are internal or system errors
|
||||
ExcCacheList = [
|
||||
IOError,
|
||||
OSError, # OSError is thrown on Windows when a file is not found
|
||||
OSError, # OSError is thrown on Windows when a file is not found
|
||||
LinkCheckerError,
|
||||
DNSException,
|
||||
socket.error,
|
||||
|
|
@ -59,6 +59,7 @@ ExcNoCacheList = [
|
|||
# firefox bookmark file needs sqlite3 for parsing
|
||||
try:
|
||||
import sqlite3
|
||||
|
||||
ExcCacheList.append(sqlite3.Error)
|
||||
except ImportError:
|
||||
pass
|
||||
|
|
@ -66,6 +67,7 @@ except ImportError:
|
|||
# pyOpenSSL errors
|
||||
try:
|
||||
import OpenSSL
|
||||
|
||||
ExcCacheList.append(OpenSSL.SSL.Error)
|
||||
except ImportError:
|
||||
pass
|
||||
|
|
@ -99,22 +101,22 @@ WARN_XML_PARSE_ERROR = "xml-parse-error"
|
|||
|
||||
# registered warnings
|
||||
Warnings = {
|
||||
WARN_URL_EFFECTIVE_URL:
|
||||
_("The effective URL is different from the original."),
|
||||
WARN_URL_ERROR_GETTING_CONTENT:
|
||||
_("Could not get the content of the URL."),
|
||||
WARN_URL_EFFECTIVE_URL: _("The effective URL is different from the original."),
|
||||
WARN_URL_ERROR_GETTING_CONTENT: _("Could not get the content of the URL."),
|
||||
WARN_URL_CONTENT_SIZE_TOO_LARGE: _("The URL content size is too large."),
|
||||
WARN_URL_CONTENT_SIZE_ZERO: _("The URL content size is zero."),
|
||||
WARN_URL_RATE_LIMITED: _("The URL request was rate limited so need reduce number of requests."),
|
||||
WARN_URL_RATE_LIMITED: _(
|
||||
"The URL request was rate limited so need reduce number of requests."
|
||||
),
|
||||
WARN_URL_TOO_LONG: _("The URL is longer than the recommended size."),
|
||||
WARN_URL_WHITESPACE: _("The URL contains leading or trailing whitespace."),
|
||||
WARN_FILE_MISSING_SLASH: _("The file: URL is missing a trailing slash."),
|
||||
WARN_FILE_SYSTEM_PATH:
|
||||
_("The file: path is not the same as the system specific path."),
|
||||
WARN_FILE_SYSTEM_PATH: _(
|
||||
"The file: path is not the same as the system specific path."
|
||||
),
|
||||
WARN_FTP_MISSING_SLASH: _("The ftp: URL is missing a trailing slash."),
|
||||
WARN_HTTP_EMPTY_CONTENT: _("The URL had no content."),
|
||||
WARN_HTTP_COOKIE_STORE_ERROR:
|
||||
_("An error occurred while storing a cookie."),
|
||||
WARN_HTTP_COOKIE_STORE_ERROR: _("An error occurred while storing a cookie."),
|
||||
WARN_IGNORE_URL: _("The URL has been ignored."),
|
||||
WARN_MAIL_NO_MX_HOST: _("The mail MX host could not be found."),
|
||||
WARN_NNTP_NO_SERVER: _("No NNTP server was found."),
|
||||
|
|
|
|||
|
|
@ -39,7 +39,7 @@ def get_files(dirname):
|
|||
if os.path.isfile(fullentry):
|
||||
yield entry
|
||||
elif os.path.isdir(fullentry):
|
||||
yield entry+"/"
|
||||
yield entry + "/"
|
||||
|
||||
|
||||
def prepare_urlpath_for_nt(path):
|
||||
|
|
@ -48,7 +48,7 @@ def prepare_urlpath_for_nt(path):
|
|||
However urllib.url2pathname expects '////server/path'.
|
||||
"""
|
||||
if '|' not in path:
|
||||
return "////"+path.lstrip("/")
|
||||
return "////" + path.lstrip("/")
|
||||
return path
|
||||
|
||||
|
||||
|
|
@ -58,9 +58,9 @@ def get_nt_filename(path):
|
|||
head, tail = os.path.split(rest)
|
||||
if not tail:
|
||||
return path
|
||||
for fname in os.listdir(unc+head):
|
||||
for fname in os.listdir(unc + head):
|
||||
if fname.lower() == tail.lower():
|
||||
return os.path.join(get_nt_filename(unc+head), fname)
|
||||
return os.path.join(get_nt_filename(unc + head), fname)
|
||||
log.error(LOG_CHECK, "could not find %r in %r", tail, head)
|
||||
return path
|
||||
|
||||
|
|
@ -92,11 +92,34 @@ class FileUrl(urlbase.UrlBase):
|
|||
Url link with file scheme.
|
||||
"""
|
||||
|
||||
def init(
    self,
    base_ref,
    base_url,
    parent_url,
    recursion_level,
    aggregate,
    line,
    column,
    page,
    name,
    url_encoding,
    extern,
):
    """Initialize the URL through the parent class and set the file scheme.

    All arguments are handed to the parent ``init`` unchanged and in the
    same positional order in which they are received here.
    """
    # Bundle the arguments once so the forwarding call stays on one line;
    # the order must mirror the parent signature exactly.
    forwarded = (
        base_ref,
        base_url,
        parent_url,
        recursion_level,
        aggregate,
        line,
        column,
        page,
        name,
        url_encoding,
        extern,
    )
    super(FileUrl, self).init(*forwarded)
    # The scheme is fixed for this class rather than derived from the URL.
    self.scheme = 'file'
|
||||
|
||||
def build_base_url(self):
|
||||
|
|
@ -111,14 +134,16 @@ class FileUrl(urlbase.UrlBase):
|
|||
base_url = os.path.expanduser(base_url)
|
||||
if not is_absolute_path(base_url):
|
||||
try:
|
||||
base_url = os.getcwd()+"/"+base_url
|
||||
base_url = os.getcwd() + "/" + base_url
|
||||
except OSError as msg:
|
||||
# occurs on stale remote filesystems (e.g. NFS)
|
||||
errmsg = _("Could not get current working directory: %(msg)s") % dict(msg=msg)
|
||||
errmsg = _(
|
||||
"Could not get current working directory: %(msg)s"
|
||||
) % dict(msg=msg)
|
||||
raise LinkCheckerError(errmsg)
|
||||
if os.path.isdir(base_url):
|
||||
base_url += "/"
|
||||
base_url = "file://"+base_url
|
||||
base_url = "file://" + base_url
|
||||
if os.name == "nt":
|
||||
base_url = base_url.replace("\\", "/")
|
||||
# transform c:/windows into /c|/windows
|
||||
|
|
@ -138,6 +163,7 @@ class FileUrl(urlbase.UrlBase):
|
|||
# Otherwise the join function thinks the query is part of
|
||||
# the file name.
|
||||
from .urlbase import url_norm
|
||||
|
||||
# norm base url - can raise UnicodeError from url.idna_encode()
|
||||
base_url, is_idn = url_norm(self.base_url, self.encoding)
|
||||
urlparts = list(urllib.parse.urlsplit(base_url))
|
||||
|
|
@ -148,8 +174,9 @@ class FileUrl(urlbase.UrlBase):
|
|||
# ignore query and fragment url parts for filesystem urls
|
||||
self.urlparts[3] = self.urlparts[4] = ''
|
||||
if self.is_directory() and not self.urlparts[2].endswith('/'):
|
||||
self.add_warning(_("Added trailing slash to directory."),
|
||||
tag=WARN_FILE_MISSING_SLASH)
|
||||
self.add_warning(
|
||||
_("Added trailing slash to directory."), tag=WARN_FILE_MISSING_SLASH
|
||||
)
|
||||
self.urlparts[2] += '/'
|
||||
self.url = urlutil.urlunsplit(self.urlparts)
|
||||
|
||||
|
|
@ -168,9 +195,10 @@ class FileUrl(urlbase.UrlBase):
|
|||
Try to open the local file. Under NT systems the case sensitivity
|
||||
is checked.
|
||||
"""
|
||||
if (self.parent_url is not None and
|
||||
not self.parent_url.startswith("file:")):
|
||||
msg = _("local files are only checked without parent URL or when the parent URL is also a file")
|
||||
if self.parent_url is not None and not self.parent_url.startswith("file:"):
|
||||
msg = _(
|
||||
"local files are only checked without parent URL or when the parent URL is also a file"
|
||||
)
|
||||
raise LinkCheckerError(msg)
|
||||
if self.is_directory():
|
||||
self.set_result(_("directory"))
|
||||
|
|
@ -190,11 +218,15 @@ class FileUrl(urlbase.UrlBase):
|
|||
path = self.get_os_filename()
|
||||
realpath = get_nt_filename(path)
|
||||
if path != realpath:
|
||||
self.add_warning(_("The URL path %(path)r is not the same as the "
|
||||
"system path %(realpath)r. You should always use "
|
||||
"the system path in URLs.") % \
|
||||
{"path": path, "realpath": realpath},
|
||||
tag=WARN_FILE_SYSTEM_PATH)
|
||||
self.add_warning(
|
||||
_(
|
||||
"The URL path %(path)r is not the same as the "
|
||||
"system path %(realpath)r. You should always use "
|
||||
"the system path in URLs."
|
||||
)
|
||||
% {"path": path, "realpath": realpath},
|
||||
tag=WARN_FILE_SYSTEM_PATH,
|
||||
)
|
||||
|
||||
def read_content(self):
|
||||
"""Return file content, or in case of directories a dummy HTML file
|
||||
|
|
@ -242,7 +274,9 @@ class FileUrl(urlbase.UrlBase):
|
|||
return True
|
||||
if self.content_type in self.ContentMimetypes:
|
||||
return True
|
||||
log.debug(LOG_CHECK, "File with content type %r is not parseable.", self.content_type)
|
||||
log.debug(
|
||||
LOG_CHECK, "File with content type %r is not parseable.", self.content_type
|
||||
)
|
||||
return False
|
||||
|
||||
def set_content_type(self):
|
||||
|
|
@ -267,7 +301,7 @@ class FileUrl(urlbase.UrlBase):
|
|||
i = url.rindex('/')
|
||||
if i > 6:
|
||||
# remove last filename to make directory internal
|
||||
url = url[:i+1]
|
||||
url = url[: i + 1]
|
||||
return re.escape(url)
|
||||
|
||||
def add_url(self, url, line=0, column=0, page=0, name="", base=None):
|
||||
|
|
@ -277,4 +311,6 @@ class FileUrl(urlbase.UrlBase):
|
|||
if webroot and url and url.startswith("/"):
|
||||
url = webroot + url[1:]
|
||||
log.debug(LOG_CHECK, "Applied local webroot `%s' to `%s'.", webroot, url)
|
||||
super(FileUrl, self).add_url(url, line=line, column=column, page=page, name=name, base=base)
|
||||
super(FileUrl, self).add_url(
|
||||
url, line=line, column=column, page=page, name=name, base=base
|
||||
)
|
||||
|
|
|
|||
|
|
@ -50,14 +50,16 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
self.set_proxy(self.aggregate.config["proxy"].get(self.scheme))
|
||||
if self.proxy:
|
||||
# using a (HTTP) proxy
|
||||
http = httpurl.HttpUrl(self.base_url,
|
||||
self.recursion_level,
|
||||
self.aggregate,
|
||||
parent_url=self.parent_url,
|
||||
base_ref=self.base_ref,
|
||||
line=self.line,
|
||||
column=self.column,
|
||||
name=self.name)
|
||||
http = httpurl.HttpUrl(
|
||||
self.base_url,
|
||||
self.recursion_level,
|
||||
self.aggregate,
|
||||
parent_url=self.parent_url,
|
||||
base_ref=self.base_ref,
|
||||
line=self.line,
|
||||
column=self.column,
|
||||
name=self.name,
|
||||
)
|
||||
http.build_url()
|
||||
return http.check()
|
||||
self.login()
|
||||
|
|
@ -91,7 +93,8 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
raise LinkCheckerError(_("Got no answer from FTP server"))
|
||||
except EOFError as msg:
|
||||
raise LinkCheckerError(
|
||||
_("Remote host has closed connection: %(msg)s") % str(msg))
|
||||
_("Remote host has closed connection: %(msg)s") % str(msg)
|
||||
)
|
||||
|
||||
def negotiate_encoding(self):
|
||||
"""Check if server can handle UTF-8 encoded filenames.
|
||||
|
|
@ -137,8 +140,9 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
if "%s/" % self.filename in files:
|
||||
if not self.url.endswith('/'):
|
||||
self.add_warning(
|
||||
_("Missing trailing directory slash in ftp url."),
|
||||
tag=WARN_FTP_MISSING_SLASH)
|
||||
_("Missing trailing directory slash in ftp url."),
|
||||
tag=WARN_FTP_MISSING_SLASH,
|
||||
)
|
||||
self.url += '/'
|
||||
return
|
||||
raise ftplib.error_perm("550 File not found")
|
||||
|
|
@ -147,11 +151,13 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
"""Get list of filenames in directory. Subdirectories have an
|
||||
ending slash."""
|
||||
files = []
|
||||
|
||||
def add_entry(line):
|
||||
"""Parse list line and add the entry it points to to the file
|
||||
list."""
|
||||
log.debug(LOG_CHECK, "Directory entry %r", line)
|
||||
from ..ftpparse import ftpparse
|
||||
|
||||
fpo = ftpparse(line)
|
||||
if fpo is not None and fpo["name"]:
|
||||
name = fpo["name"]
|
||||
|
|
@ -159,6 +165,7 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
name += "/"
|
||||
if fpo["trycwd"] or fpo["tryretr"]:
|
||||
files.append(name)
|
||||
|
||||
self.url_connection.dir(add_entry)
|
||||
return files
|
||||
|
||||
|
|
@ -168,7 +175,9 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
return True
|
||||
if self.content_type in self.ContentMimetypes:
|
||||
return True
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable.", self.content_type)
|
||||
log.debug(
|
||||
LOG_CHECK, "URL with content type %r is not parseable.", self.content_type
|
||||
)
|
||||
return False
|
||||
|
||||
def is_directory(self):
|
||||
|
|
@ -194,12 +203,14 @@ class FtpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
# download file in BINARY mode
|
||||
ftpcmd = "RETR %s" % self.filename
|
||||
buf = StringIO()
|
||||
|
||||
def stor_data(s):
|
||||
"""Helper method storing given data"""
|
||||
# limit the download size
|
||||
if (buf.tell() + len(s)) > self.max_size:
|
||||
raise LinkCheckerError(_("FTP file size too large"))
|
||||
buf.write(s)
|
||||
|
||||
self.url_connection.retrbinary(ftpcmd, stor_data)
|
||||
data = buf.getvalue()
|
||||
buf.close()
|
||||
|
|
|
|||
|
|
@ -17,21 +17,33 @@
|
|||
Handle http links.
|
||||
"""
|
||||
import requests
|
||||
|
||||
# The validity of SSL certs is ignored to be able
|
||||
# to check the URL and recurse into it.
|
||||
# The warning about invalid SSL certs is given to the
|
||||
# user instead.
|
||||
import warnings
|
||||
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
|
||||
warnings.simplefilter(
|
||||
'ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning
|
||||
)
|
||||
|
||||
from io import BytesIO
|
||||
import re
|
||||
|
||||
from .. import (log, LOG_CHECK, strformat, mimeutil,
|
||||
url as urlutil, LinkCheckerError, httputil)
|
||||
from . import (internpaturl, proxysupport)
|
||||
from .. import (
|
||||
log,
|
||||
LOG_CHECK,
|
||||
strformat,
|
||||
mimeutil,
|
||||
url as urlutil,
|
||||
LinkCheckerError,
|
||||
httputil,
|
||||
)
|
||||
from . import internpaturl, proxysupport
|
||||
|
||||
# import warnings
|
||||
from .const import (WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED)
|
||||
from .const import WARN_HTTP_EMPTY_CONTENT, WARN_URL_RATE_LIMITED
|
||||
from requests.sessions import REDIRECT_STATI
|
||||
|
||||
# assumed HTTP header encoding
|
||||
|
|
@ -72,9 +84,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
@return: True if access is granted, otherwise False
|
||||
@rtype: bool
|
||||
"""
|
||||
return (not self.aggregate.config['robotstxt']
|
||||
or self.aggregate.robots_txt.allows_url(
|
||||
self, timeout=self.aggregate.config["timeout"]))
|
||||
return not self.aggregate.config[
|
||||
'robotstxt'
|
||||
] or self.aggregate.robots_txt.allows_url(
|
||||
self, timeout=self.aggregate.config["timeout"]
|
||||
)
|
||||
|
||||
def content_allows_robots(self):
|
||||
"""
|
||||
|
|
@ -89,8 +103,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
|
||||
def add_size_info(self):
|
||||
"""Get size of URL content from HTTP header."""
|
||||
if self.headers and "Content-Length" in self.headers and \
|
||||
"Transfer-Encoding" not in self.headers:
|
||||
if (
|
||||
self.headers
|
||||
and "Content-Length" in self.headers
|
||||
and "Transfer-Encoding" not in self.headers
|
||||
):
|
||||
# Note that content-encoding causes size differences since
|
||||
# the content data is always decoded.
|
||||
try:
|
||||
|
|
@ -139,14 +156,9 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
def build_request(self):
|
||||
"""Build a prepared request object."""
|
||||
clientheaders = {}
|
||||
if (self.parent_url and
|
||||
self.parent_url.lower().startswith(HTTP_SCHEMAS)):
|
||||
if self.parent_url and self.parent_url.lower().startswith(HTTP_SCHEMAS):
|
||||
clientheaders["Referer"] = self.parent_url
|
||||
kwargs = dict(
|
||||
method='GET',
|
||||
url=self.url,
|
||||
headers=clientheaders,
|
||||
)
|
||||
kwargs = dict(method='GET', url=self.url, headers=clientheaders,)
|
||||
if self.auth:
|
||||
kwargs['auth'] = self.auth
|
||||
log.debug(LOG_CHECK, "Prepare request with %s", kwargs)
|
||||
|
|
@ -223,8 +235,10 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
|
||||
def is_redirect(self):
    """Tell whether the current response is a redirect.

    A response counts as a redirect only when it carries a ``location``
    header and its status code is one of ``REDIRECT_STATI``.
    """
    # Without a target location there is nothing to follow.
    if 'location' not in self.headers:
        return False
    return self.url_connection.status_code in REDIRECT_STATI
|
||||
|
||||
def get_request_kwargs(self):
|
||||
"""Construct keyword parameters for Session.request() and
|
||||
|
|
@ -241,8 +255,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
def get_redirects(self, request):
    """Return the session's redirect iterator for the given request.

    The keyword options come from ``get_request_kwargs()`` and are passed
    on to ``self.session.resolve_redirects`` together with the current
    connection and the request.
    """
    redirect_options = self.get_request_kwargs()
    resolve = self.session.resolve_redirects
    return resolve(self.url_connection, request, **redirect_options)
|
||||
|
||||
def follow_redirections(self, request):
|
||||
"""Follow all redirections of http response."""
|
||||
|
|
@ -285,21 +298,32 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
|
||||
def check_response(self):
    """Check final result and log it.

    Behaviour by status code of ``self.url_connection``:

    - ``>= 400`` except 429: the result is set to ``"<code> <reason>"``
      and marked invalid; nothing else is done.
    - 204: a warning with the response reason is added
      (``WARN_HTTP_EMPTY_CONTENT``).
    - 429: a rate-limit warning quoting the ``Retry-After`` header is
      added (``WARN_URL_RATE_LIMITED``).
    - ``>= 200`` (including 204 and 429): the result is set to
      ``"<code> <reason>"``.
    - anything lower: the result is set to the translated string "OK".
    """
    conn = self.url_connection
    status = conn.status_code
    if status >= 400 and status != 429:
        self.set_result("%d %s" % (status, conn.reason), valid=False)
        return

    if status == 204:
        # no content
        self.add_warning(conn.reason, tag=WARN_HTTP_EMPTY_CONTENT)

    if status == 429:
        # Header names are protocol tokens, not user-facing text: look up
        # the literal name instead of a translated one, which would miss
        # the header under any locale that translates the string.
        self.add_warning(
            "Rate limited (Retry-After: %s)" % self.getheader("Retry-After"),
            tag=WARN_URL_RATE_LIMITED,
        )

    if status >= 200:
        self.set_result("%r %s" % (status, conn.reason))
    else:
        self.set_result(_("OK"))
|
||||
|
||||
|
|
@ -325,6 +349,7 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
self.add_url(url, name=name)
|
||||
if 'Refresh' in self.headers:
|
||||
from ..htmlutil.linkparse import refresh_re
|
||||
|
||||
value = self.headers['Refresh'].strip()
|
||||
mo = refresh_re.match(value)
|
||||
if mo:
|
||||
|
|
@ -352,7 +377,11 @@ class HttpUrl(internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
# XXX side effect
|
||||
self.content_type = rtype
|
||||
if self.content_type not in self.ContentMimetypes:
|
||||
log.debug(LOG_CHECK, "URL with content type %r is not parseable", self.content_type)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"URL with content type %r is not parseable",
|
||||
self.content_type,
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ Handle ignored URLs.
|
|||
|
||||
from . import unknownurl
|
||||
|
||||
|
||||
class IgnoreUrl(unknownurl.UnknownUrl):
|
||||
"""Always ignored URL."""
|
||||
|
||||
|
|
|
|||
|
|
@ -20,6 +20,7 @@ Handle itms-services URLs.
|
|||
from . import urlbase
|
||||
from .. import log, LOG_CHECK
|
||||
|
||||
|
||||
class ItmsServicesUrl(urlbase.UrlBase):
|
||||
"""Apple iOS application download URLs."""
|
||||
|
||||
|
|
|
|||
|
|
@ -53,6 +53,8 @@ def is_literal(domain):
|
|||
|
||||
# Substitution helper that targets backslash-escaped characters
# (a backslash together with the character following it).
_remove_quoted = re.compile(r'\\.').sub
# A double quote or a backslash.
_quotes = re.compile(r'["\\]')


def is_missing_quote(addr):
    """Return a true value iff mail address is not correctly quoted.

    The first and last character of ``addr`` (the surrounding quotes) are
    dropped, every backslash-escaped character is removed, and the rest is
    scanned for a double quote or backslash. Any such character left over
    was not escaped.

    Returns the regex match object for the first offending character, or
    None when the quoting is correct.
    """
    # search(), not match(): an unescaped quote or backslash is an error
    # anywhere in the quoted text, not only at its first position.
    return _quotes.search(_remove_quoted("", addr[1:-1]))
|
||||
|
|
@ -62,6 +64,7 @@ def is_missing_quote(addr):
|
|||
EMAIL_CGI_ADDRESS = ("to", "cc", "bcc")
|
||||
EMAIL_CGI_SUBJECT = "subject"
|
||||
|
||||
|
||||
class MailtoUrl(urlbase.UrlBase):
|
||||
"""
|
||||
Url link with mailto scheme.
|
||||
|
|
@ -81,8 +84,10 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
if not self.valid:
|
||||
break
|
||||
elif not self.subject:
|
||||
self.add_warning(_("No mail addresses or email subject found in `%(url)s'.") % \
|
||||
{"url": self.url})
|
||||
self.add_warning(
|
||||
_("No mail addresses or email subject found in `%(url)s'.")
|
||||
% {"url": self.url}
|
||||
)
|
||||
|
||||
def parse_addresses(self):
|
||||
"""Parse all mail addresses out of the URL target. Also parses
|
||||
|
|
@ -92,7 +97,7 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
# cut off leading mailto: and unquote
|
||||
url = urllib.parse.unquote(self.base_url[7:], self.encoding)
|
||||
# search for cc, bcc, to and store in headers
|
||||
mode = 0 # 0=default, 1=quote, 2=esc
|
||||
mode = 0 # 0=default, 1=quote, 2=esc
|
||||
quote = None
|
||||
i = 0
|
||||
for i, c in enumerate(url):
|
||||
|
|
@ -104,7 +109,7 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
mode = 1
|
||||
elif c == '\\':
|
||||
mode = 2
|
||||
elif mode==1:
|
||||
elif mode == 1:
|
||||
if c == '"' and quote == '"':
|
||||
mode = 0
|
||||
elif c == '>' and quote == '<':
|
||||
|
|
@ -114,11 +119,13 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
if i < (len(url) - 1):
|
||||
self.addresses.update(getaddresses(url[:i]))
|
||||
try:
|
||||
headers = urllib.parse.parse_qs(url[(i+1):], strict_parsing=True)
|
||||
headers = urllib.parse.parse_qs(url[(i + 1) :], strict_parsing=True)
|
||||
for key, vals in headers.items():
|
||||
if key.lower() in EMAIL_CGI_ADDRESS:
|
||||
# Only the first header value is added
|
||||
self.addresses.update(getaddresses(urllib.parse.unquote(vals[0], self.encoding)))
|
||||
self.addresses.update(
|
||||
getaddresses(urllib.parse.unquote(vals[0], self.encoding))
|
||||
)
|
||||
if key.lower() == EMAIL_CGI_SUBJECT:
|
||||
self.subject = vals[0]
|
||||
except ValueError as err:
|
||||
|
|
@ -145,30 +152,57 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
# restrict email length to 256 characters
|
||||
# http://www.rfc-editor.org/errata_search.php?eid=1003
|
||||
if len(mail) > 256:
|
||||
self.set_result(_("Mail address `%(addr)s' too long. Allowed 256 chars, was %(length)d chars.") % \
|
||||
{"addr": mail, "length": len(mail)}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_(
|
||||
"Mail address `%(addr)s' too long. Allowed 256 chars, was %(length)d chars."
|
||||
)
|
||||
% {"addr": mail, "length": len(mail)},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
if "@" not in mail:
|
||||
self.set_result(_("Missing `@' in mail address `%(addr)s'.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Missing `@' in mail address `%(addr)s'.") % {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
# note: be sure to use rsplit since "@" can occur in local part
|
||||
local, domain = mail.rsplit("@", 1)
|
||||
if not local:
|
||||
self.set_result(_("Missing local part of mail address `%(addr)s'.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Missing local part of mail address `%(addr)s'.") % {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
if not domain:
|
||||
self.set_result(_("Missing domain part of mail address `%(addr)s'.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Missing domain part of mail address `%(addr)s'.") % {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
if len(local) > 64:
|
||||
self.set_result(_("Local part of mail address `%(addr)s' too long. Allowed 64 chars, was %(length)d chars.") % \
|
||||
{"addr": mail, "length": len(local)}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_(
|
||||
"Local part of mail address `%(addr)s' too long. Allowed 64 chars, was %(length)d chars."
|
||||
)
|
||||
% {"addr": mail, "length": len(local)},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
if len(domain) > 255:
|
||||
self.set_result(_("Domain part of mail address `%(addr)s' too long. Allowed 255 chars, was %(length)d chars.") % \
|
||||
{"addr": mail, "length": len(local)}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_(
|
||||
"Domain part of mail address `%(addr)s' too long. Allowed 255 chars, was %(length)d chars."
|
||||
)
|
||||
% {"addr": mail, "length": len(local)},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
|
||||
# local part syntax check
|
||||
|
|
@ -176,26 +210,48 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
# Rules taken from http://tools.ietf.org/html/rfc3696#section-3
|
||||
if is_quoted(local):
|
||||
if is_missing_quote(local):
|
||||
self.set_result(_("Unquoted double quote or backslash in mail address `%(addr)s'.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Unquoted double quote or backslash in mail address `%(addr)s'.")
|
||||
% {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
else:
|
||||
if local.startswith("."):
|
||||
self.set_result(_("Local part of mail address `%(addr)s' may not start with a dot.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Local part of mail address `%(addr)s' may not start with a dot.")
|
||||
% {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
if local.endswith("."):
|
||||
self.set_result(_("Local part of mail address `%(addr)s' may not end with a dot.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Local part of mail address `%(addr)s' may not end with a dot.")
|
||||
% {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
if ".." in local:
|
||||
self.set_result(_("Local part of mail address `%(addr)s' may not contain two dots.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Local part of mail address `%(addr)s' may not contain two dots.")
|
||||
% {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
for char in '@ \\",[]':
|
||||
if char in local.replace("\\%s"%char, ""):
|
||||
self.set_result(_("Local part of mail address `%(addr)s' contains unquoted character `%(char)s.") % \
|
||||
{"addr": mail, "char": char}, valid=False, overwrite=False)
|
||||
if char in local.replace("\\%s" % char, ""):
|
||||
self.set_result(
|
||||
_(
|
||||
"Local part of mail address `%(addr)s' contains unquoted character `%(char)s."
|
||||
)
|
||||
% {"addr": mail, "char": char},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
|
||||
# domain part syntax check
|
||||
|
|
@ -206,18 +262,30 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
if ip.startswith("IPv6:"):
|
||||
ip = ip[5:]
|
||||
if not iputil.is_valid_ip(ip):
|
||||
self.set_result(_("Domain part of mail address `%(addr)s' has invalid IP.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Domain part of mail address `%(addr)s' has invalid IP.")
|
||||
% {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
else:
|
||||
# it's a domain name
|
||||
if not urlutil.is_safe_domain(domain):
|
||||
self.set_result(_("Invalid domain part of mail address `%(addr)s'.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Invalid domain part of mail address `%(addr)s'.")
|
||||
% {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
if domain.endswith(".") or domain.split(".")[-1].isdigit():
|
||||
self.set_result(_("Invalid top level domain part of mail address `%(addr)s'.") % \
|
||||
{"addr": mail}, valid=False, overwrite=False)
|
||||
self.set_result(
|
||||
_("Invalid top level domain part of mail address `%(addr)s'.")
|
||||
% {"addr": mail},
|
||||
valid=False,
|
||||
overwrite=False,
|
||||
)
|
||||
return
|
||||
|
||||
def check_connection(self):
|
||||
|
|
@ -240,6 +308,7 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
Check a single mail address.
|
||||
"""
|
||||
from dns.exception import DNSException
|
||||
|
||||
log.debug(LOG_CHECK, "checking mail address %r", mail)
|
||||
mail = strformat.ascii_safe(mail)
|
||||
username, domain = mail.rsplit('@', 1)
|
||||
|
|
@ -249,31 +318,38 @@ class MailtoUrl(urlbase.UrlBase):
|
|||
except DNSException:
|
||||
answers = []
|
||||
if len(answers) == 0:
|
||||
self.add_warning(_("No MX mail host for %(domain)s found.") %
|
||||
{'domain': domain},
|
||||
tag=WARN_MAIL_NO_MX_HOST)
|
||||
self.add_warning(
|
||||
_("No MX mail host for %(domain)s found.") % {'domain': domain},
|
||||
tag=WARN_MAIL_NO_MX_HOST,
|
||||
)
|
||||
try:
|
||||
answers = resolver.query(domain, 'A')
|
||||
except DNSException:
|
||||
answers = []
|
||||
if len(answers) == 0:
|
||||
self.set_result(_("No host for %(domain)s found.") %
|
||||
{'domain': domain}, valid=False,
|
||||
overwrite=True)
|
||||
self.set_result(
|
||||
_("No host for %(domain)s found.") % {'domain': domain},
|
||||
valid=False,
|
||||
overwrite=True,
|
||||
)
|
||||
return
|
||||
# set preference to zero
|
||||
mxdata = [(0, rdata.to_text(omit_final_dot=True))
|
||||
for rdata in answers]
|
||||
mxdata = [(0, rdata.to_text(omit_final_dot=True)) for rdata in answers]
|
||||
else:
|
||||
from dns.rdtypes.mxbase import MXBase
|
||||
mxdata = [(rdata.preference,
|
||||
rdata.exchange.to_text(omit_final_dot=True))
|
||||
for rdata in answers if isinstance(rdata, MXBase)]
|
||||
|
||||
mxdata = [
|
||||
(rdata.preference, rdata.exchange.to_text(omit_final_dot=True))
|
||||
for rdata in answers
|
||||
if isinstance(rdata, MXBase)
|
||||
]
|
||||
if not mxdata:
|
||||
self.set_result(
|
||||
_("Got invalid DNS answer %(answer)s for %(domain)s.") %
|
||||
{'answer': answers, 'domain': domain}, valid=False,
|
||||
overwrite=True)
|
||||
_("Got invalid DNS answer %(answer)s for %(domain)s.")
|
||||
% {'answer': answers, 'domain': domain},
|
||||
valid=False,
|
||||
overwrite=True,
|
||||
)
|
||||
return
|
||||
# sort according to preference (lower preference means this
|
||||
# host should be preferred)
|
||||
|
|
|
|||
|
|
@ -28,6 +28,7 @@ from .const import WARN_NNTP_NO_SERVER, WARN_NNTP_NO_NEWSGROUP
|
|||
|
||||
random.seed()
|
||||
|
||||
|
||||
class NntpUrl(urlbase.UrlBase):
|
||||
"""
|
||||
Url link with NNTP scheme.
|
||||
|
|
@ -41,8 +42,9 @@ class NntpUrl(urlbase.UrlBase):
|
|||
nntpserver = self.host or self.aggregate.config["nntpserver"]
|
||||
if not nntpserver:
|
||||
self.add_warning(
|
||||
_("No NNTP server was specified, skipping this URL."),
|
||||
tag=WARN_NNTP_NO_SERVER)
|
||||
_("No NNTP server was specified, skipping this URL."),
|
||||
tag=WARN_NNTP_NO_SERVER,
|
||||
)
|
||||
return
|
||||
nntp = self._connect_nntp(nntpserver)
|
||||
group = self.urlparts[2]
|
||||
|
|
@ -50,7 +52,7 @@ class NntpUrl(urlbase.UrlBase):
|
|||
group = group[1:]
|
||||
if '@' in group:
|
||||
# request article info (resp, number mid)
|
||||
number = nntp.stat("<"+group+">")[1]
|
||||
number = nntp.stat("<" + group + ">")[1]
|
||||
self.add_info(_('Article number %(num)s found.') % {"num": number})
|
||||
else:
|
||||
# split off trailing article span
|
||||
|
|
@ -61,8 +63,9 @@ class NntpUrl(urlbase.UrlBase):
|
|||
self.add_info(_("News group %(name)s found.") % {"name": name})
|
||||
else:
|
||||
# group name is the empty string
|
||||
self.add_warning(_("No newsgroup specified in NNTP URL."),
|
||||
tag=WARN_NNTP_NO_NEWSGROUP)
|
||||
self.add_warning(
|
||||
_("No newsgroup specified in NNTP URL."), tag=WARN_NNTP_NO_NEWSGROUP
|
||||
)
|
||||
|
||||
def _connect_nntp(self, nntpserver):
|
||||
"""
|
||||
|
|
@ -85,7 +88,8 @@ class NntpUrl(urlbase.UrlBase):
|
|||
raise
|
||||
if nntp is None:
|
||||
raise LinkCheckerError(
|
||||
_("NNTP server too busy; tried more than %d times.") % tries)
|
||||
_("NNTP server too busy; tried more than %d times.") % tries
|
||||
)
|
||||
if log.is_debug(LOG_CHECK):
|
||||
nntp.set_debuglevel(1)
|
||||
self.add_info(nntp.getwelcome())
|
||||
|
|
|
|||
|
|
@ -40,14 +40,14 @@ class ProxySupport:
|
|||
if self.proxytype not in ('http', 'https'):
|
||||
# Note that invalid proxies might raise TypeError in urllib2,
|
||||
# so make sure to stop checking at this point, not later.
|
||||
msg = _("Proxy value `%(proxy)s' must start with 'http:' or 'https:'.") \
|
||||
% dict(proxy=proxy)
|
||||
msg = _(
|
||||
"Proxy value `%(proxy)s' must start with 'http:' or 'https:'."
|
||||
) % dict(proxy=proxy)
|
||||
raise LinkCheckerError(msg)
|
||||
if self.ignore_proxy_host():
|
||||
# log proxy without auth info
|
||||
log.debug(LOG_CHECK, "ignoring proxy %r", self.proxy)
|
||||
self.add_info(_("Ignoring proxy setting `%(proxy)s'.") %
|
||||
dict(proxy=proxy))
|
||||
self.add_info(_("Ignoring proxy setting `%(proxy)s'.") % dict(proxy=proxy))
|
||||
self.proxy = None
|
||||
return
|
||||
log.debug(LOG_CHECK, "using proxy %r", self.proxy)
|
||||
|
|
@ -58,7 +58,7 @@ class ProxySupport:
|
|||
username = proxyurl.username
|
||||
password = proxyurl.password if proxy.password is not None else ""
|
||||
auth = "%s:%s" % (username, password)
|
||||
self.proxyauth = "Basic "+httputil.encode_base64(auth)
|
||||
self.proxyauth = "Basic " + httputil.encode_base64(auth)
|
||||
|
||||
def ignore_proxy_host(self):
|
||||
"""Check if self.host is in the $no_proxy ignore list."""
|
||||
|
|
|
|||
|
|
@ -64,10 +64,10 @@ class TelnetUrl(urlbase.UrlBase):
|
|||
self.url_connection.open(self.host, self.port)
|
||||
if self.user:
|
||||
self.url_connection.read_until(b"login: ", 10)
|
||||
self.url_connection.write(encode(self.user)+b"\n")
|
||||
self.url_connection.write(encode(self.user) + b"\n")
|
||||
if self.password:
|
||||
self.url_connection.read_until(b"Password: ", 10)
|
||||
self.url_connection.write(encode(self.password)+b"\n")
|
||||
self.url_connection.write(encode(self.password) + b"\n")
|
||||
# XXX how to tell if we are logged in??
|
||||
self.url_connection.write(b"exit\n")
|
||||
|
||||
|
|
|
|||
|
|
@ -28,12 +28,12 @@ class UnknownUrl(urlbase.UrlBase):
|
|||
"""Only logs that this URL is unknown."""
|
||||
super(UnknownUrl, self).build_url()
|
||||
if self.is_ignored():
|
||||
self.add_info(_("%(scheme)s URL ignored.") %
|
||||
{"scheme": self.scheme.capitalize()})
|
||||
self.add_info(
|
||||
_("%(scheme)s URL ignored.") % {"scheme": self.scheme.capitalize()}
|
||||
)
|
||||
self.set_result(_("ignored"))
|
||||
else:
|
||||
self.set_result(_("URL is unrecognized or has invalid syntax"),
|
||||
valid=False)
|
||||
self.set_result(_("URL is unrecognized or has invalid syntax"), valid=False)
|
||||
|
||||
def is_ignored(self):
|
||||
"""Return True if this URL scheme is ignored."""
|
||||
|
|
@ -260,4 +260,3 @@ ignored_schemes = "^(%s%s%s%s)$" % (
|
|||
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
|
||||
|
||||
is_unknown_scheme = ignored_schemes_re.match
|
||||
|
||||
|
|
|
|||
|
|
@ -27,15 +27,30 @@ import select
|
|||
from io import BytesIO
|
||||
|
||||
from . import absolute_url, get_url_from
|
||||
from .. import (log, LOG_CHECK,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, get_link_pat)
|
||||
from .. import (
|
||||
log,
|
||||
LOG_CHECK,
|
||||
strformat,
|
||||
LinkCheckerError,
|
||||
url as urlutil,
|
||||
trace,
|
||||
get_link_pat,
|
||||
)
|
||||
from ..htmlutil import htmlsoup
|
||||
from ..network import iputil
|
||||
from .const import (WARN_URL_EFFECTIVE_URL,
|
||||
WARN_URL_ERROR_GETTING_CONTENT, WARN_URL_OBFUSCATED_IP,
|
||||
WARN_URL_CONTENT_SIZE_ZERO, WARN_URL_CONTENT_SIZE_TOO_LARGE,
|
||||
WARN_URL_WHITESPACE, URL_MAX_LENGTH, WARN_URL_TOO_LONG,
|
||||
ExcList, ExcSyntaxList, ExcNoCacheList)
|
||||
from .const import (
|
||||
WARN_URL_EFFECTIVE_URL,
|
||||
WARN_URL_ERROR_GETTING_CONTENT,
|
||||
WARN_URL_OBFUSCATED_IP,
|
||||
WARN_URL_CONTENT_SIZE_ZERO,
|
||||
WARN_URL_CONTENT_SIZE_TOO_LARGE,
|
||||
WARN_URL_WHITESPACE,
|
||||
URL_MAX_LENGTH,
|
||||
WARN_URL_TOO_LONG,
|
||||
ExcList,
|
||||
ExcSyntaxList,
|
||||
ExcNoCacheList,
|
||||
)
|
||||
from ..url import url_fix_wayback_query
|
||||
|
||||
# helper alias
|
||||
|
|
@ -44,6 +59,7 @@ unicode_safe = strformat.unicode_safe
|
|||
# schemes that are invalid with an empty hostname
|
||||
scheme_requires_host = ("ftp", "http", "telnet")
|
||||
|
||||
|
||||
def urljoin(parent, url):
|
||||
"""
|
||||
If url is relative, join parent and url. Else leave url as-is.
|
||||
|
|
@ -61,8 +77,9 @@ def url_norm(url, encoding):
|
|||
try:
|
||||
return urlutil.url_norm(url, encoding=encoding)
|
||||
except UnicodeError:
|
||||
msg = _("URL has unparsable domain name: %(name)s") % \
|
||||
{"name": sys.exc_info()[1]}
|
||||
msg = _("URL has unparsable domain name: %(name)s") % {
|
||||
"name": sys.exc_info()[1]
|
||||
}
|
||||
raise LinkCheckerError(msg)
|
||||
|
||||
|
||||
|
|
@ -92,11 +109,22 @@ class UrlBase:
|
|||
}
|
||||
|
||||
# Read in 16kb chunks
|
||||
ReadChunkBytes = 1024*16
|
||||
ReadChunkBytes = 1024 * 16
|
||||
|
||||
def __init__(self, base_url, recursion_level, aggregate,
|
||||
parent_url=None, base_ref=None, line=-1, column=-1, page=-1,
|
||||
name="", url_encoding=None, extern=None):
|
||||
def __init__(
|
||||
self,
|
||||
base_url,
|
||||
recursion_level,
|
||||
aggregate,
|
||||
parent_url=None,
|
||||
base_ref=None,
|
||||
line=-1,
|
||||
column=-1,
|
||||
page=-1,
|
||||
name="",
|
||||
url_encoding=None,
|
||||
extern=None,
|
||||
):
|
||||
"""
|
||||
Initialize check data, and store given variables.
|
||||
|
||||
|
|
@ -113,20 +141,44 @@ class UrlBase:
|
|||
@param extern: None or (is_extern, is_strict)
|
||||
"""
|
||||
self.reset()
|
||||
self.init(base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, page, name, url_encoding, extern)
|
||||
self.init(
|
||||
base_ref,
|
||||
base_url,
|
||||
parent_url,
|
||||
recursion_level,
|
||||
aggregate,
|
||||
line,
|
||||
column,
|
||||
page,
|
||||
name,
|
||||
url_encoding,
|
||||
extern,
|
||||
)
|
||||
self.check_syntax()
|
||||
if recursion_level == 0:
|
||||
self.add_intern_pattern()
|
||||
self.set_extern(self.url)
|
||||
if self.extern[0] and self.extern[1]:
|
||||
self.add_info(_("The URL is outside of the domain "
|
||||
"filter, checked only syntax."))
|
||||
self.add_info(
|
||||
_("The URL is outside of the domain " "filter, checked only syntax.")
|
||||
)
|
||||
if not self.has_result:
|
||||
self.set_result(_("filtered"))
|
||||
|
||||
def init(self, base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, page, name, url_encoding, extern):
|
||||
def init(
|
||||
self,
|
||||
base_ref,
|
||||
base_url,
|
||||
parent_url,
|
||||
recursion_level,
|
||||
aggregate,
|
||||
line,
|
||||
column,
|
||||
page,
|
||||
name,
|
||||
url_encoding,
|
||||
extern,
|
||||
):
|
||||
"""
|
||||
Initialize internal data.
|
||||
"""
|
||||
|
|
@ -149,17 +201,22 @@ class UrlBase:
|
|||
self.encoding = url_encoding
|
||||
self.extern = extern
|
||||
if self.base_ref:
|
||||
assert not urlutil.url_needs_quoting(self.base_ref), \
|
||||
"unquoted base reference URL %r" % self.base_ref
|
||||
assert not urlutil.url_needs_quoting(self.base_ref), (
|
||||
"unquoted base reference URL %r" % self.base_ref
|
||||
)
|
||||
if self.parent_url:
|
||||
assert not urlutil.url_needs_quoting(self.parent_url), \
|
||||
"unquoted parent URL %r" % self.parent_url
|
||||
assert not urlutil.url_needs_quoting(self.parent_url), (
|
||||
"unquoted parent URL %r" % self.parent_url
|
||||
)
|
||||
url = absolute_url(self.base_url, base_ref, parent_url)
|
||||
# assume file link if no scheme is found
|
||||
self.scheme = url.split(":", 1)[0].lower() or "file"
|
||||
if self.base_url != base_url:
|
||||
self.add_warning(_("Leading or trailing whitespace in URL `%(url)s'.") %
|
||||
{"url": base_url}, tag=WARN_URL_WHITESPACE)
|
||||
self.add_warning(
|
||||
_("Leading or trailing whitespace in URL `%(url)s'.")
|
||||
% {"url": base_url},
|
||||
tag=WARN_URL_WHITESPACE,
|
||||
)
|
||||
|
||||
def reset(self):
|
||||
"""
|
||||
|
|
@ -219,8 +276,13 @@ class UrlBase:
|
|||
Set result string and validity.
|
||||
"""
|
||||
if self.has_result and not overwrite:
|
||||
log.warn(LOG_CHECK,
|
||||
"Double result %r (previous %r) for %s", msg, self.result, self)
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
"Double result %r (previous %r) for %s",
|
||||
msg,
|
||||
self.result,
|
||||
self,
|
||||
)
|
||||
else:
|
||||
self.has_result = True
|
||||
if not msg:
|
||||
|
|
@ -288,8 +350,10 @@ class UrlBase:
|
|||
Add a warning string.
|
||||
"""
|
||||
item = (tag, s)
|
||||
if item not in self.warnings and \
|
||||
tag not in self.aggregate.config["ignorewarnings"]:
|
||||
if (
|
||||
item not in self.warnings
|
||||
and tag not in self.aggregate.config["ignorewarnings"]
|
||||
):
|
||||
self.warnings.append(item)
|
||||
|
||||
def add_info(self, s):
|
||||
|
|
@ -303,7 +367,7 @@ class UrlBase:
|
|||
"""Set the URL to be used for caching."""
|
||||
# remove anchor from cached target url since we assume
|
||||
# URLs with different anchors to have the same content
|
||||
self.cache_url = urlutil.urlunsplit(self.urlparts[:4]+[''])
|
||||
self.cache_url = urlutil.urlunsplit(self.urlparts[:4] + [''])
|
||||
if self.cache_url is not None:
|
||||
assert isinstance(self.cache_url, str), repr(self.cache_url)
|
||||
|
||||
|
|
@ -332,13 +396,17 @@ class UrlBase:
|
|||
"""Check URL name and length."""
|
||||
effectiveurl = urlutil.urlunsplit(self.urlparts)
|
||||
if self.url != effectiveurl:
|
||||
self.add_warning(_("Effective URL %(url)r.") %
|
||||
{"url": effectiveurl},
|
||||
tag=WARN_URL_EFFECTIVE_URL)
|
||||
self.add_warning(
|
||||
_("Effective URL %(url)r.") % {"url": effectiveurl},
|
||||
tag=WARN_URL_EFFECTIVE_URL,
|
||||
)
|
||||
self.url = effectiveurl
|
||||
if len(self.url) > URL_MAX_LENGTH and self.scheme != "data":
|
||||
args = dict(len=len(self.url), max=URL_MAX_LENGTH)
|
||||
self.add_warning(_("URL length %(len)d is longer than %(max)d.") % args, tag=WARN_URL_TOO_LONG)
|
||||
self.add_warning(
|
||||
_("URL length %(len)d is longer than %(max)d.") % args,
|
||||
tag=WARN_URL_TOO_LONG,
|
||||
)
|
||||
|
||||
def build_url(self):
|
||||
"""
|
||||
|
|
@ -367,7 +435,9 @@ class UrlBase:
|
|||
if urlparts[2]:
|
||||
urlparts[2] = urlutil.collapse_segments(urlparts[2])
|
||||
if not urlparts[0].startswith("feed"):
|
||||
urlparts[2] = url_fix_wayback_query(urlparts[2]) # restore second / in http[s]:// in wayback path
|
||||
urlparts[2] = url_fix_wayback_query(
|
||||
urlparts[2]
|
||||
) # restore second / in http[s]:// in wayback path
|
||||
self.url = urlutil.urlunsplit(urlparts)
|
||||
# split into (modifiable) list
|
||||
self.urlparts = strformat.url_unicode_split(self.url)
|
||||
|
|
@ -384,8 +454,9 @@ class UrlBase:
|
|||
port = urlutil.default_ports.get(self.scheme, 0)
|
||||
host, port = urlutil.splitport(host, port=port)
|
||||
if port is None:
|
||||
raise LinkCheckerError(_("URL host %(host)r has invalid port") %
|
||||
{"host": host})
|
||||
raise LinkCheckerError(
|
||||
_("URL host %(host)r has invalid port") % {"host": host}
|
||||
)
|
||||
self.port = port
|
||||
# set host lowercase
|
||||
self.host = host.lower()
|
||||
|
|
@ -415,9 +486,10 @@ class UrlBase:
|
|||
if ips:
|
||||
self.host = ips[0]
|
||||
self.add_warning(
|
||||
_("URL %(url)s has obfuscated IP address %(ip)s") % \
|
||||
{"url": self.base_url, "ip": ips[0]},
|
||||
tag=WARN_URL_OBFUSCATED_IP)
|
||||
_("URL %(url)s has obfuscated IP address %(ip)s")
|
||||
% {"url": self.base_url, "ip": ips[0]},
|
||||
tag=WARN_URL_OBFUSCATED_IP,
|
||||
)
|
||||
|
||||
def check(self):
|
||||
"""Main check function for checking this URL."""
|
||||
|
|
@ -453,7 +525,10 @@ class UrlBase:
|
|||
value = _('Hostname not found')
|
||||
elif isinstance(exc, UnicodeError):
|
||||
# idna.encode(host) failed
|
||||
value = _('Bad hostname %(host)r: %(msg)s') % {'host': self.host, 'msg': value}
|
||||
value = _('Bad hostname %(host)r: %(msg)s') % {
|
||||
'host': self.host,
|
||||
'msg': value,
|
||||
}
|
||||
self.set_result(unicode_safe(value), valid=False)
|
||||
|
||||
def check_content(self):
|
||||
|
|
@ -469,8 +544,10 @@ class UrlBase:
|
|||
return True
|
||||
except tuple(ExcList):
|
||||
value = self.handle_exception()
|
||||
self.add_warning(_("could not get content: %(msg)s") %
|
||||
{"msg": value}, tag=WARN_URL_ERROR_GETTING_CONTENT)
|
||||
self.add_warning(
|
||||
_("could not get content: %(msg)s") % {"msg": value},
|
||||
tag=WARN_URL_ERROR_GETTING_CONTENT,
|
||||
)
|
||||
return False
|
||||
|
||||
def close_connection(self):
|
||||
|
|
@ -492,11 +569,15 @@ class UrlBase:
|
|||
An exception occurred. Log it and set the cache flag.
|
||||
"""
|
||||
etype, evalue = sys.exc_info()[:2]
|
||||
log.debug(LOG_CHECK, "Error in %s: %s %s", self.url, etype, evalue, exception=True)
|
||||
log.debug(
|
||||
LOG_CHECK, "Error in %s: %s %s", self.url, etype, evalue, exception=True
|
||||
)
|
||||
# note: etype must be the exact class, not a subclass
|
||||
if (etype in ExcNoCacheList) or \
|
||||
(etype == socket.error and evalue.args[0]==errno.EBADF) or \
|
||||
not evalue:
|
||||
if (
|
||||
(etype in ExcNoCacheList)
|
||||
or (etype == socket.error and evalue.args[0] == errno.EBADF)
|
||||
or not evalue
|
||||
):
|
||||
# EBADF occurs when operating on an already closed socket
|
||||
self.caching = False
|
||||
# format message "<exception name>: <error message>"
|
||||
|
|
@ -519,10 +600,13 @@ class UrlBase:
|
|||
maxbytes = self.aggregate.config["maxfilesizedownload"]
|
||||
if self.size > maxbytes:
|
||||
self.add_warning(
|
||||
_("Content size %(size)s is larger than %(maxbytes)s.") %
|
||||
dict(size=strformat.strsize(self.size),
|
||||
maxbytes=strformat.strsize(maxbytes)),
|
||||
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE)
|
||||
_("Content size %(size)s is larger than %(maxbytes)s.")
|
||||
% dict(
|
||||
size=strformat.strsize(self.size),
|
||||
maxbytes=strformat.strsize(maxbytes),
|
||||
),
|
||||
tag=WARN_URL_CONTENT_SIZE_TOO_LARGE,
|
||||
)
|
||||
|
||||
def allows_simple_recursion(self):
|
||||
"""Check recursion level and extern status."""
|
||||
|
|
@ -579,15 +663,13 @@ class UrlBase:
|
|||
return
|
||||
for entry in self.aggregate.config["externlinks"]:
|
||||
match = entry['pattern'].search(url)
|
||||
if (entry['negate'] and not match) or \
|
||||
(match and not entry['negate']):
|
||||
if (entry['negate'] and not match) or (match and not entry['negate']):
|
||||
log.debug(LOG_CHECK, "Extern URL %r", url)
|
||||
self.extern = (1, entry['strict'])
|
||||
return
|
||||
for entry in self.aggregate.config["internlinks"]:
|
||||
match = entry['pattern'].search(url)
|
||||
if (entry['negate'] and not match) or \
|
||||
(match and not entry['negate']):
|
||||
if (entry['negate'] and not match) or (match and not entry['negate']):
|
||||
log.debug(LOG_CHECK, "Intern URL %r", url)
|
||||
self.extern = (0, 0)
|
||||
return
|
||||
|
|
@ -612,8 +694,7 @@ class UrlBase:
|
|||
self.size = len(content)
|
||||
self.dltime = time.time() - t
|
||||
if self.size == 0:
|
||||
self.add_warning(_("Content size is zero."),
|
||||
tag=WARN_URL_CONTENT_SIZE_ZERO)
|
||||
self.add_warning(_("Content size is zero."), tag=WARN_URL_CONTENT_SIZE_ZERO)
|
||||
else:
|
||||
self.aggregate.add_downloaded_bytes(self.size)
|
||||
return content
|
||||
|
|
@ -636,8 +717,9 @@ class UrlBase:
|
|||
# than an internal crash, eh? ISO-8859-1 is a safe fallback in the
|
||||
# sense that any binary blob can be decoded, it'll never cause a
|
||||
# UnicodeDecodeError.
|
||||
log.debug(LOG_CHECK, "Beautiful Soup detected %s",
|
||||
self.soup.original_encoding)
|
||||
log.debug(
|
||||
LOG_CHECK, "Beautiful Soup detected %s", self.soup.original_encoding
|
||||
)
|
||||
self.encoding = self.soup.original_encoding or 'ISO-8859-1'
|
||||
log.debug(LOG_CHECK, "Content encoding %s", self.encoding)
|
||||
self.text = self.data.decode(self.encoding)
|
||||
|
|
@ -675,29 +757,41 @@ class UrlBase:
|
|||
base_ref = urlutil.url_norm(base, encoding=self.encoding)[0]
|
||||
else:
|
||||
base_ref = None
|
||||
url_data = get_url_from(url, self.recursion_level+1, self.aggregate,
|
||||
parent_url=self.url, base_ref=base_ref, line=line, column=column,
|
||||
page=page, name=name, parent_content_type=self.content_type, url_encoding=self.encoding)
|
||||
url_data = get_url_from(
|
||||
url,
|
||||
self.recursion_level + 1,
|
||||
self.aggregate,
|
||||
parent_url=self.url,
|
||||
base_ref=base_ref,
|
||||
line=line,
|
||||
column=column,
|
||||
page=page,
|
||||
name=name,
|
||||
parent_content_type=self.content_type,
|
||||
url_encoding=self.encoding,
|
||||
)
|
||||
self.aggregate.urlqueue.put(url_data)
|
||||
|
||||
def serialized(self, sep=os.linesep):
|
||||
"""
|
||||
Return serialized url check data as unicode string.
|
||||
"""
|
||||
return unicode_safe(sep).join([
|
||||
"%s link" % self.scheme,
|
||||
"base_url=%r" % self.base_url,
|
||||
"parent_url=%r" % self.parent_url,
|
||||
"base_ref=%r" % self.base_ref,
|
||||
"recursion_level=%d" % self.recursion_level,
|
||||
"url_connection=%s" % self.url_connection,
|
||||
"line=%s" % self.line,
|
||||
"column=%s" % self.column,
|
||||
"page=%d" % self.page,
|
||||
"name=%r" % self.name,
|
||||
"anchor=%r" % self.anchor,
|
||||
"cache_url=%s" % self.cache_url,
|
||||
])
|
||||
return unicode_safe(sep).join(
|
||||
[
|
||||
"%s link" % self.scheme,
|
||||
"base_url=%r" % self.base_url,
|
||||
"parent_url=%r" % self.parent_url,
|
||||
"base_ref=%r" % self.base_ref,
|
||||
"recursion_level=%d" % self.recursion_level,
|
||||
"url_connection=%s" % self.url_connection,
|
||||
"line=%s" % self.line,
|
||||
"column=%s" % self.column,
|
||||
"page=%d" % self.page,
|
||||
"name=%r" % self.name,
|
||||
"anchor=%r" % self.anchor,
|
||||
"cache_url=%s" % self.cache_url,
|
||||
]
|
||||
)
|
||||
|
||||
def get_intern_pattern(self, url=None):
|
||||
"""Get pattern for intern URL matching.
|
||||
|
|
@ -717,8 +811,7 @@ class UrlBase:
|
|||
log.debug(LOG_CHECK, "Add intern pattern %r", pat)
|
||||
self.aggregate.config['internlinks'].append(get_link_pat(pat))
|
||||
except UnicodeError as msg:
|
||||
res = _("URL has unparsable domain name: %(domain)s") % \
|
||||
{"domain": msg}
|
||||
res = _("URL has unparsable domain name: %(domain)s") % {"domain": msg}
|
||||
self.set_result(res, valid=False)
|
||||
|
||||
def __str__(self):
|
||||
|
|
@ -792,28 +885,29 @@ class UrlBase:
|
|||
- url_data.last_modified: datetime
|
||||
Last modification date of retrieved page (or None).
|
||||
"""
|
||||
return dict(valid=self.valid,
|
||||
extern=self.extern[0],
|
||||
result=self.result,
|
||||
warnings=self.warnings[:],
|
||||
name=self.name or "",
|
||||
title=self.get_title(),
|
||||
parent_url=self.parent_url or "",
|
||||
base_ref=self.base_ref or "",
|
||||
base_url=self.base_url or "",
|
||||
url=self.url or "",
|
||||
domain=(self.urlparts[1] if self.urlparts else ""),
|
||||
checktime=self.checktime,
|
||||
dltime=self.dltime,
|
||||
size=self.size,
|
||||
info=self.info,
|
||||
line=self.line,
|
||||
column=self.column,
|
||||
page=self.page,
|
||||
cache_url=self.cache_url,
|
||||
content_type=self.content_type,
|
||||
level=self.recursion_level,
|
||||
modified=self.modified,
|
||||
return dict(
|
||||
valid=self.valid,
|
||||
extern=self.extern[0],
|
||||
result=self.result,
|
||||
warnings=self.warnings[:],
|
||||
name=self.name or "",
|
||||
title=self.get_title(),
|
||||
parent_url=self.parent_url or "",
|
||||
base_ref=self.base_ref or "",
|
||||
base_url=self.base_url or "",
|
||||
url=self.url or "",
|
||||
domain=(self.urlparts[1] if self.urlparts else ""),
|
||||
checktime=self.checktime,
|
||||
dltime=self.dltime,
|
||||
size=self.size,
|
||||
info=self.info,
|
||||
line=self.line,
|
||||
column=self.column,
|
||||
page=self.page,
|
||||
cache_url=self.cache_url,
|
||||
content_type=self.content_type,
|
||||
level=self.recursion_level,
|
||||
modified=self.modified,
|
||||
)
|
||||
|
||||
def to_wire(self):
|
||||
|
|
@ -847,8 +941,10 @@ urlDataAttr = [
|
|||
'level',
|
||||
]
|
||||
|
||||
|
||||
class CompactUrlData:
|
||||
"""Store selected UrlData attributes in slots to minimize memory usage."""
|
||||
|
||||
__slots__ = urlDataAttr
|
||||
|
||||
def __init__(self, wired_url_data):
|
||||
|
|
|
|||
|
|
@ -43,7 +43,9 @@ def print_version(exit_code=0):
|
|||
def print_plugins(folders, exit_code=0):
|
||||
"""Print available plugins and exit."""
|
||||
modules = plugins.get_plugin_modules(folders)
|
||||
pluginclasses = sorted(plugins.get_plugin_classes(modules), key=lambda x: x.__name__)
|
||||
pluginclasses = sorted(
|
||||
plugins.get_plugin_classes(modules), key=lambda x: x.__name__
|
||||
)
|
||||
|
||||
for pluginclass in pluginclasses:
|
||||
print(pluginclass.__name__)
|
||||
|
|
@ -57,7 +59,10 @@ def print_usage(msg, exit_code=2):
|
|||
"""Print a program msg text to stderr and exit."""
|
||||
program = sys.argv[0]
|
||||
print(_("Error: %(msg)s") % {"msg": msg}, file=console.stderr)
|
||||
print(_("Execute '%(program)s -h' for help") % {"program": program}, file=console.stderr)
|
||||
print(
|
||||
_("Execute '%(program)s -h' for help") % {"program": program},
|
||||
file=console.stderr,
|
||||
)
|
||||
sys.exit(exit_code)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -26,8 +26,17 @@
|
|||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from ctypes import (windll, byref, Structure, c_char, c_short, c_uint32,
|
||||
c_ushort, ArgumentError, WinError)
|
||||
from ctypes import (
|
||||
windll,
|
||||
byref,
|
||||
Structure,
|
||||
c_char,
|
||||
c_short,
|
||||
c_uint32,
|
||||
c_ushort,
|
||||
ArgumentError,
|
||||
WinError,
|
||||
)
|
||||
|
||||
# from winbase.h
|
||||
STDOUT = -11
|
||||
|
|
@ -43,15 +52,19 @@ WORD = c_ushort
|
|||
DWORD = c_uint32
|
||||
TCHAR = c_char
|
||||
|
||||
|
||||
class COORD(Structure):
|
||||
"""struct in wincon.h"""
|
||||
|
||||
_fields_ = [
|
||||
('X', SHORT),
|
||||
('Y', SHORT),
|
||||
]
|
||||
|
||||
class SMALL_RECT(Structure):
|
||||
|
||||
class SMALL_RECT(Structure):
|
||||
"""struct in wincon.h."""
|
||||
|
||||
_fields_ = [
|
||||
("Left", SHORT),
|
||||
("Top", SHORT),
|
||||
|
|
@ -59,8 +72,10 @@ class SMALL_RECT(Structure):
|
|||
("Bottom", SHORT),
|
||||
]
|
||||
|
||||
|
||||
class CONSOLE_SCREEN_BUFFER_INFO(Structure):
|
||||
"""struct in wincon.h."""
|
||||
|
||||
_fields_ = [
|
||||
("dwSize", COORD),
|
||||
("dwCursorPosition", COORD),
|
||||
|
|
@ -68,22 +83,29 @@ class CONSOLE_SCREEN_BUFFER_INFO(Structure):
|
|||
("srWindow", SMALL_RECT),
|
||||
("dwMaximumWindowSize", COORD),
|
||||
]
|
||||
|
||||
def __str__(self):
|
||||
"""Get string representation of console screen buffer info."""
|
||||
return '(%d,%d,%d,%d,%d,%d,%d,%d,%d,%d,%d)' % (
|
||||
self.dwSize.Y, self.dwSize.X
|
||||
, self.dwCursorPosition.Y, self.dwCursorPosition.X
|
||||
, self.wAttributes
|
||||
, self.srWindow.Top, self.srWindow.Left, self.srWindow.Bottom, self.srWindow.Right
|
||||
, self.dwMaximumWindowSize.Y, self.dwMaximumWindowSize.X
|
||||
self.dwSize.Y,
|
||||
self.dwSize.X,
|
||||
self.dwCursorPosition.Y,
|
||||
self.dwCursorPosition.X,
|
||||
self.wAttributes,
|
||||
self.srWindow.Top,
|
||||
self.srWindow.Left,
|
||||
self.srWindow.Bottom,
|
||||
self.srWindow.Right,
|
||||
self.dwMaximumWindowSize.Y,
|
||||
self.dwMaximumWindowSize.X,
|
||||
)
|
||||
|
||||
|
||||
def GetConsoleScreenBufferInfo(stream_id=STDOUT):
|
||||
"""Get console screen buffer info object."""
|
||||
handle = handles[stream_id]
|
||||
csbi = CONSOLE_SCREEN_BUFFER_INFO()
|
||||
success = windll.kernel32.GetConsoleScreenBufferInfo(
|
||||
handle, byref(csbi))
|
||||
success = windll.kernel32.GetConsoleScreenBufferInfo(handle, byref(csbi))
|
||||
if not success:
|
||||
raise WinError()
|
||||
return csbi
|
||||
|
|
@ -96,18 +118,18 @@ def SetConsoleTextAttribute(stream_id, attrs):
|
|||
|
||||
|
||||
# from wincon.h
|
||||
BLACK = 0
|
||||
BLUE = 1
|
||||
GREEN = 2
|
||||
CYAN = 3
|
||||
RED = 4
|
||||
BLACK = 0
|
||||
BLUE = 1
|
||||
GREEN = 2
|
||||
CYAN = 3
|
||||
RED = 4
|
||||
MAGENTA = 5
|
||||
YELLOW = 6
|
||||
GREY = 7
|
||||
YELLOW = 6
|
||||
GREY = 7
|
||||
|
||||
# from wincon.h
|
||||
NORMAL = 0x00 # dim text, dim background
|
||||
BRIGHT = 0x08 # bright text, dim background
|
||||
NORMAL = 0x00 # dim text, dim background
|
||||
BRIGHT = 0x08 # bright text, dim background
|
||||
|
||||
_default_foreground = None
|
||||
_default_background = None
|
||||
|
|
|
|||
|
|
@ -25,30 +25,34 @@ import urllib.request
|
|||
import shutil
|
||||
import socket
|
||||
import _LinkChecker_configdata as configdata
|
||||
from .. import (log, LOG_CHECK, get_install_data, fileutil)
|
||||
from .. import log, LOG_CHECK, get_install_data, fileutil
|
||||
from . import confparse
|
||||
from xdg.BaseDirectory import xdg_config_home, xdg_data_home
|
||||
|
||||
Version = configdata.version
|
||||
ReleaseDate = configdata.release_date
|
||||
AppName = configdata.name
|
||||
App = AppName+" "+Version
|
||||
App = AppName + " " + Version
|
||||
Author = configdata.author
|
||||
HtmlAuthor = Author.replace(' ', ' ')
|
||||
Copyright = "Copyright (C) 2000-2014 "+Author
|
||||
HtmlCopyright = "Copyright © 2000-2014 "+HtmlAuthor
|
||||
AppInfo = App+" "+Copyright
|
||||
HtmlAppInfo = App+", "+HtmlCopyright
|
||||
Copyright = "Copyright (C) 2000-2014 " + Author
|
||||
HtmlCopyright = "Copyright © 2000-2014 " + HtmlAuthor
|
||||
AppInfo = App + " " + Copyright
|
||||
HtmlAppInfo = App + ", " + HtmlCopyright
|
||||
Url = configdata.url
|
||||
SupportUrl = "https://github.com/linkchecker/linkchecker/issues"
|
||||
Email = configdata.author_email
|
||||
UserAgent = "Mozilla/5.0 (compatible; %s/%s; +%s)" % (AppName, Version, Url)
|
||||
Freeware = AppName+""" comes with ABSOLUTELY NO WARRANTY!
|
||||
Freeware = (
|
||||
AppName
|
||||
+ """ comes with ABSOLUTELY NO WARRANTY!
|
||||
This is free software, and you are welcome to redistribute it
|
||||
under certain conditions. Look at the file `LICENSE' within this
|
||||
distribution."""
|
||||
)
|
||||
Portable = configdata.portable
|
||||
|
||||
|
||||
def normpath(path):
|
||||
"""Norm given system path with all available norm or expand functions
|
||||
in os.path."""
|
||||
|
|
@ -58,18 +62,19 @@ def normpath(path):
|
|||
|
||||
# List Python modules in the form (module, name, version attribute)
|
||||
Modules = (
|
||||
# required modules
|
||||
# required modules
|
||||
("requests", "Requests", "__version__"),
|
||||
# optional modules
|
||||
# optional modules
|
||||
("argcomplete", "Argcomplete", None),
|
||||
("GeoIP", "GeoIP", 'lib_version'), # on Unix systems
|
||||
("pygeoip", "GeoIP", 'lib_version'), # on Windows systems
|
||||
("GeoIP", "GeoIP", 'lib_version'), # on Unix systems
|
||||
("pygeoip", "GeoIP", 'lib_version'), # on Windows systems
|
||||
("sqlite3", "Pysqlite", 'version'),
|
||||
("sqlite3", "Sqlite", 'sqlite_version'),
|
||||
("gconf", "Gconf", '__version__'),
|
||||
("meliae", "Meliae", '__version__'),
|
||||
)
|
||||
|
||||
|
||||
def get_modules_info():
|
||||
"""Return unicode string with detected module info."""
|
||||
module_infos = []
|
||||
|
|
@ -136,6 +141,7 @@ def get_certifi_file():
|
|||
the file is not found
|
||||
"""
|
||||
import certifi
|
||||
|
||||
filename = certifi.where()
|
||||
if os.path.isfile(filename):
|
||||
return filename
|
||||
|
|
@ -161,8 +167,8 @@ class Configuration(dict):
|
|||
self['robotstxt'] = True
|
||||
self["debugmemory"] = False
|
||||
self["localwebroot"] = None
|
||||
self["maxfilesizeparse"] = 1*1024*1024
|
||||
self["maxfilesizedownload"] = 5*1024*1024
|
||||
self["maxfilesizeparse"] = 1 * 1024 * 1024
|
||||
self["maxfilesizedownload"] = 5 * 1024 * 1024
|
||||
self["maxnumurls"] = None
|
||||
self["maxrunseconds"] = None
|
||||
self["maxrequestspersecond"] = 10
|
||||
|
|
@ -201,6 +207,7 @@ class Configuration(dict):
|
|||
self['logger'] = None
|
||||
self.loggers = {}
|
||||
from ..logger import LoggerClasses
|
||||
|
||||
for c in LoggerClasses:
|
||||
key = c.LoggerName
|
||||
self[key] = {}
|
||||
|
|
@ -250,14 +257,11 @@ class Configuration(dict):
|
|||
def add_auth(self, user=None, password=None, pattern=None):
|
||||
"""Add given authentication data."""
|
||||
if not user or not pattern:
|
||||
log.warn(LOG_CHECK,
|
||||
_("missing user or URL pattern in authentication data."))
|
||||
log.warn(
|
||||
LOG_CHECK, _("missing user or URL pattern in authentication data.")
|
||||
)
|
||||
return
|
||||
entry = dict(
|
||||
user=user,
|
||||
password=password,
|
||||
pattern=re.compile(pattern),
|
||||
)
|
||||
entry = dict(user=user, password=password, pattern=re.compile(pattern),)
|
||||
self["authentication"].append(entry)
|
||||
|
||||
def get_user_password(self, url):
|
||||
|
|
@ -299,16 +303,16 @@ class Configuration(dict):
|
|||
url = self["loginurl"]
|
||||
disable = False
|
||||
if not self["loginpasswordfield"]:
|
||||
log.warn(LOG_CHECK,
|
||||
_("no CGI password fieldname given for login URL."))
|
||||
log.warn(LOG_CHECK, _("no CGI password fieldname given for login URL."))
|
||||
disable = True
|
||||
if not self["loginuserfield"]:
|
||||
log.warn(LOG_CHECK,
|
||||
_("no CGI user fieldname given for login URL."))
|
||||
log.warn(LOG_CHECK, _("no CGI user fieldname given for login URL."))
|
||||
disable = True
|
||||
if self.get_user_password(url) == (None, None):
|
||||
log.warn(LOG_CHECK,
|
||||
_("no user/password authentication data found for login URL."))
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
_("no user/password authentication data found for login URL."),
|
||||
)
|
||||
disable = True
|
||||
if not url.lower().startswith(("http:", "https:")):
|
||||
log.warn(LOG_CHECK, _("login URL is not a HTTP URL."))
|
||||
|
|
@ -318,8 +322,7 @@ class Configuration(dict):
|
|||
log.warn(LOG_CHECK, _("login URL is incomplete."))
|
||||
disable = True
|
||||
if disable:
|
||||
log.warn(LOG_CHECK,
|
||||
_("disabling login URL %(url)s.") % {"url": url})
|
||||
log.warn(LOG_CHECK, _("disabling login URL %(url)s.") % {"url": url})
|
||||
self["loginurl"] = None
|
||||
|
||||
def sanitize_proxies(self):
|
||||
|
|
@ -366,10 +369,14 @@ def get_user_data():
|
|||
@rtype string
|
||||
"""
|
||||
homedotdir = normpath("~/.linkchecker/")
|
||||
userdata = homedotdir if os.path.isdir(homedotdir) \
|
||||
userdata = (
|
||||
homedotdir
|
||||
if os.path.isdir(homedotdir)
|
||||
else os.path.join(xdg_data_home, "linkchecker")
|
||||
)
|
||||
return userdata
|
||||
|
||||
|
||||
def get_plugin_folders():
|
||||
"""Get linkchecker plugin folders. Default is
|
||||
"$XDG_DATA_HOME/linkchecker/plugins/". "~/.linkchecker/plugins/" is also
|
||||
|
|
@ -413,16 +420,20 @@ def get_user_config():
|
|||
initialconf = normpath(os.path.join(get_share_dir(), "linkcheckerrc"))
|
||||
# per user config settings
|
||||
homedotfile = normpath("~/.linkchecker/linkcheckerrc")
|
||||
userconf = homedotfile if os.path.isfile(homedotfile) \
|
||||
userconf = (
|
||||
homedotfile
|
||||
if os.path.isfile(homedotfile)
|
||||
else os.path.join(xdg_config_home, "linkchecker", "linkcheckerrc")
|
||||
if os.path.isfile(initialconf) and not os.path.exists(userconf) and \
|
||||
not Portable:
|
||||
)
|
||||
if os.path.isfile(initialconf) and not os.path.exists(userconf) and not Portable:
|
||||
# copy the initial configuration to the user configuration
|
||||
try:
|
||||
make_userdir(userconf)
|
||||
shutil.copy(initialconf, userconf)
|
||||
except Exception as errmsg:
|
||||
msg = _("could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r")
|
||||
msg = _(
|
||||
"could not copy initial configuration file %(src)r to %(dst)r: %(errmsg)r"
|
||||
)
|
||||
args = dict(src=initialconf, dst=userconf, errmsg=errmsg)
|
||||
log.warn(LOG_CHECK, msg % args)
|
||||
return userconf
|
||||
|
|
@ -496,6 +507,7 @@ def get_kde_ftp_proxy():
|
|||
log.debug(LOG_CHECK, "error getting FTP proxy from KDE: %s", msg)
|
||||
pass
|
||||
|
||||
|
||||
# The following KDE functions are largely ported and adjusted from
|
||||
# Google Chromium:
|
||||
# http://src.chromium.org/viewvc/chrome/trunk/src/net/proxy/proxy_config_service_linux.cc?revision=HEAD&view=markup
|
||||
|
|
@ -527,6 +539,7 @@ def get_kde_ftp_proxy():
|
|||
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
|
||||
def get_kde_config_dir():
|
||||
"""Return KDE configuration directory or None if not found."""
|
||||
kde_home = get_kde_home_dir()
|
||||
|
|
@ -571,6 +584,7 @@ def get_kde_home_dir():
|
|||
|
||||
loc_ro = re.compile(r"\[.*\]$")
|
||||
|
||||
|
||||
@lru_cache(1)
|
||||
def read_kioslaverc(kde_config_dir):
|
||||
"""Read kioslaverc into data dictionary."""
|
||||
|
|
@ -579,7 +593,7 @@ def read_kioslaverc(kde_config_dir):
|
|||
with open(filename) as fd:
|
||||
# First read all lines into dictionary since they can occur
|
||||
# in any order.
|
||||
for line in fd:
|
||||
for line in fd:
|
||||
line = line.rstrip()
|
||||
if line.startswith('['):
|
||||
in_proxy_settings = line.startswith("[Proxy Settings]")
|
||||
|
|
|
|||
|
|
@ -18,7 +18,15 @@
|
|||
from configparser import RawConfigParser
|
||||
import os
|
||||
|
||||
from .. import LinkCheckerError, get_link_pat, LOG_CHECK, log, fileutil, plugins, logconf
|
||||
from .. import (
|
||||
LinkCheckerError,
|
||||
get_link_pat,
|
||||
LOG_CHECK,
|
||||
log,
|
||||
fileutil,
|
||||
plugins,
|
||||
logconf,
|
||||
)
|
||||
|
||||
|
||||
def read_multiline(value):
|
||||
|
|
@ -50,7 +58,9 @@ class LCConfigParser(RawConfigParser):
|
|||
self.read_ok = super(LCConfigParser, self).read(files)
|
||||
if len(self.read_ok) < len(files):
|
||||
failed_files = set(files) - set(self.read_ok)
|
||||
log.warn(LOG_CHECK, "Could not read configuration files %s.", failed_files)
|
||||
log.warn(
|
||||
LOG_CHECK, "Could not read configuration files %s.", failed_files
|
||||
)
|
||||
# Read all the configuration parameters from the given files.
|
||||
self.read_checking_config()
|
||||
self.read_authentication_config()
|
||||
|
|
@ -58,15 +68,16 @@ class LCConfigParser(RawConfigParser):
|
|||
self.read_output_config()
|
||||
self.read_plugin_config()
|
||||
except Exception as msg:
|
||||
raise LinkCheckerError(
|
||||
_("Error parsing configuration: %s") % str(msg))
|
||||
raise LinkCheckerError(_("Error parsing configuration: %s") % str(msg))
|
||||
|
||||
def read_string_option(self, section, option, allowempty=False):
|
||||
"""Read a string option."""
|
||||
if self.has_option(section, option):
|
||||
value = self.get(section, option)
|
||||
if not allowempty and not value:
|
||||
raise LinkCheckerError(_("invalid empty value for %s: %s\n") % (option, value))
|
||||
raise LinkCheckerError(
|
||||
_("invalid empty value for %s: %s\n") % (option, value)
|
||||
)
|
||||
self.config[option] = value
|
||||
|
||||
def read_boolean_option(self, section, option):
|
||||
|
|
@ -80,10 +91,14 @@ class LCConfigParser(RawConfigParser):
|
|||
num = self.getint(section, option)
|
||||
if min is not None and num < min:
|
||||
raise LinkCheckerError(
|
||||
_("invalid value for %s: %d must not be less than %d") % (option, num, min))
|
||||
_("invalid value for %s: %d must not be less than %d")
|
||||
% (option, num, min)
|
||||
)
|
||||
if max is not None and num < max:
|
||||
raise LinkCheckerError(
|
||||
_("invalid value for %s: %d must not be greater than %d") % (option, num, max))
|
||||
_("invalid value for %s: %d must not be greater than %d")
|
||||
% (option, num, max)
|
||||
)
|
||||
if key is None:
|
||||
key = option
|
||||
self.config[key] = num
|
||||
|
|
@ -92,6 +107,7 @@ class LCConfigParser(RawConfigParser):
|
|||
"""Read configuration options in section "output"."""
|
||||
section = "output"
|
||||
from ..logger import LoggerClasses
|
||||
|
||||
for c in LoggerClasses:
|
||||
key = c.LoggerName
|
||||
if self.has_section(key):
|
||||
|
|
@ -124,8 +140,12 @@ class LCConfigParser(RawConfigParser):
|
|||
loggers = (x.strip().lower() for x in loggers)
|
||||
# no file output for the blacklist and none Logger
|
||||
from ..logger import LoggerNames
|
||||
loggers = (x for x in loggers if x in LoggerNames and
|
||||
x not in ("blacklist", "none"))
|
||||
|
||||
loggers = (
|
||||
x
|
||||
for x in loggers
|
||||
if x in LoggerNames and x not in ("blacklist", "none")
|
||||
)
|
||||
for val in loggers:
|
||||
output = self.config.logger_new(val, fileoutput=1)
|
||||
self.config['fileoutput'].append(output)
|
||||
|
|
@ -145,8 +165,10 @@ class LCConfigParser(RawConfigParser):
|
|||
self.read_int_option(section, "maxfilesizeparse", min=1)
|
||||
self.read_int_option(section, "maxfilesizedownload", min=1)
|
||||
if self.has_option(section, "allowedschemes"):
|
||||
self.config['allowedschemes'] = [x.strip().lower() for x in \
|
||||
self.get(section, 'allowedschemes').split(',')]
|
||||
self.config['allowedschemes'] = [
|
||||
x.strip().lower()
|
||||
for x in self.get(section, 'allowedschemes').split(',')
|
||||
]
|
||||
self.read_boolean_option(section, "debugmemory")
|
||||
self.read_string_option(section, "cookiefile")
|
||||
self.read_boolean_option(section, "robotstxt")
|
||||
|
|
@ -165,21 +187,29 @@ class LCConfigParser(RawConfigParser):
|
|||
for val in read_multiline(self.get(section, "entry")):
|
||||
auth = val.split()
|
||||
if len(auth) == 3:
|
||||
self.config.add_auth(pattern=auth[0], user=auth[1],
|
||||
password=auth[2])
|
||||
self.config.add_auth(
|
||||
pattern=auth[0], user=auth[1], password=auth[2]
|
||||
)
|
||||
password_fields.append("entry/%s/%s" % (auth[0], auth[1]))
|
||||
elif len(auth) == 2:
|
||||
self.config.add_auth(pattern=auth[0], user=auth[1])
|
||||
else:
|
||||
raise LinkCheckerError(
|
||||
_("missing auth part in entry %(val)r") % {"val": val})
|
||||
_("missing auth part in entry %(val)r") % {"val": val}
|
||||
)
|
||||
# read login URL and field names
|
||||
if self.has_option(section, "loginurl"):
|
||||
val = self.get(section, "loginurl").strip()
|
||||
if not (val.lower().startswith("http:") or
|
||||
val.lower().startswith("https:")):
|
||||
raise LinkCheckerError(_("invalid login URL `%s'. Only " \
|
||||
"HTTP and HTTPS URLs are supported.") % val)
|
||||
if not (
|
||||
val.lower().startswith("http:") or val.lower().startswith("https:")
|
||||
):
|
||||
raise LinkCheckerError(
|
||||
_(
|
||||
"invalid login URL `%s'. Only "
|
||||
"HTTP and HTTPS URLs are supported."
|
||||
)
|
||||
% val
|
||||
)
|
||||
self.config["loginurl"] = val
|
||||
self.read_string_option(section, "loginuserfield")
|
||||
self.read_string_option(section, "loginpasswordfield")
|
||||
|
|
@ -201,11 +231,22 @@ class LCConfigParser(RawConfigParser):
|
|||
return
|
||||
fn = self.read_ok[0]
|
||||
if fileutil.is_accessable_by_others(fn):
|
||||
log.warn(LOG_CHECK, "The configuration file %s contains password information (in section [%s] and options %s) and the file is readable by others. Please make the file only readable by you.", fn, section, fields)
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
"The configuration file %s contains password information (in section [%s] and options %s) and the file is readable by others. Please make the file only readable by you.",
|
||||
fn,
|
||||
section,
|
||||
fields,
|
||||
)
|
||||
if os.name == 'posix':
|
||||
log.warn(LOG_CHECK, _("For example execute 'chmod go-rw %s'.") % fn)
|
||||
elif os.name == 'nt':
|
||||
log.warn(LOG_CHECK, _("See http://support.microsoft.com/kb/308419 for more info on setting file permissions."))
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
_(
|
||||
"See http://support.microsoft.com/kb/308419 for more info on setting file permissions."
|
||||
),
|
||||
)
|
||||
|
||||
def read_filtering_config(self):
|
||||
"""
|
||||
|
|
@ -213,8 +254,10 @@ class LCConfigParser(RawConfigParser):
|
|||
"""
|
||||
section = "filtering"
|
||||
if self.has_option(section, "ignorewarnings"):
|
||||
self.config['ignorewarnings'] = [f.strip().lower() for f in \
|
||||
self.get(section, 'ignorewarnings').split(',')]
|
||||
self.config['ignorewarnings'] = [
|
||||
f.strip().lower()
|
||||
for f in self.get(section, 'ignorewarnings').split(',')
|
||||
]
|
||||
if self.has_option(section, "ignore"):
|
||||
for line in read_multiline(self.get(section, "ignore")):
|
||||
pat = get_link_pat(line, strict=1)
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
Special container classes.
|
||||
"""
|
||||
|
||||
|
||||
class LFUCache(dict):
|
||||
"""Limited cache which purges least frequently used items."""
|
||||
|
||||
|
|
@ -40,7 +41,7 @@ class LFUCache(dict):
|
|||
|
||||
def shrink(self):
|
||||
"""Shrink ca. 5% of entries."""
|
||||
trim = int(0.05*len(self))
|
||||
trim = int(0.05 * len(self))
|
||||
if trim:
|
||||
items = super(LFUCache, self).items()
|
||||
# sorting function for items
|
||||
|
|
|
|||
|
|
@ -60,7 +60,8 @@ def from_headers(strheader):
|
|||
for headervalue in headers.get_all("Set-Cookie"):
|
||||
for pairs in split_header_words([headervalue]):
|
||||
for name, value in pairs:
|
||||
cookie = requests.cookies.create_cookie(name, value,
|
||||
domain=host, path=path)
|
||||
cookie = requests.cookies.create_cookie(
|
||||
name, value, domain=host, path=path
|
||||
)
|
||||
res.append(cookie)
|
||||
return res
|
||||
|
|
|
|||
|
|
@ -56,11 +56,15 @@ def update_func_meta(fake_func, real_func):
|
|||
def deprecated(func):
    """A decorator which can be used to mark functions as deprecated.

    It emits a DeprecationWarning each time the decorated function is
    called and then delegates to the original function.

    @param func: the function to mark as deprecated
    @return: the result of update_func_meta(newfunc, func)
    """

    def newfunc(*args, **kwargs):
        """Print deprecated warning and execute original function."""
        # stacklevel=2 attributes the warning to the code that called the
        # deprecated function instead of to this wrapper, so the reported
        # file and line point at the call site that needs changing.
        warnings.warn(
            "Call to deprecated function %s." % func.__name__,
            category=DeprecationWarning,
            stacklevel=2,
        )
        return func(*args, **kwargs)

    return update_func_meta(newfunc, func)
|
||||
|
||||
|
||||
|
|
@ -83,19 +87,27 @@ def signal_handler(signal_number):
|
|||
if is_valid_signal and os.name == 'posix':
|
||||
signal.signal(signal_number, function)
|
||||
return function
|
||||
|
||||
return newfunc
|
||||
|
||||
|
||||
def synchronize(lock, func, log_duration_secs=0):
    """Return synchronized function acquiring the given lock.

    @param lock: context manager that is entered around every call of func
    @param func: the function to wrap
    @param log_duration_secs: when greater than zero, a warning is printed
      to stderr whenever acquiring the lock took longer than this many
      seconds; the default of 0 disables the warning
    @return: the result of update_func_meta(newfunc, func)
    """

    def newfunc(*args, **kwargs):
        """Execute function synchronized."""
        # Use the monotonic clock for the elapsed-time measurement: the
        # wall clock may be adjusted while waiting for the lock, which
        # would yield bogus (even negative) durations.
        t = time.monotonic()
        with lock:
            duration = time.monotonic() - t
            # Chained comparison: only warn when a positive threshold is
            # configured and it has been exceeded.
            if duration > log_duration_secs > 0:
                print(
                    "WARN:",
                    func.__name__,
                    "locking took %0.2f seconds" % duration,
                    file=sys.stderr,
                )
            return func(*args, **kwargs)

    return update_func_meta(newfunc, func)
|
||||
|
||||
|
||||
|
|
@ -106,11 +118,13 @@ def synchronized(lock):
|
|||
|
||||
def notimplemented(func):
    """Raises a NotImplementedError if the function is called.

    @param func: the function to mark as not implemented
    @return: the result of update_func_meta(newfunc, func)
    """

    def newfunc(*args, **kwargs):
        """Raise NotImplementedError"""
        # Python 3 exposes the code object as __code__; the former
        # func_code attribute no longer exists and would raise
        # AttributeError instead of the intended NotImplementedError.
        co = func.__code__
        attrs = (co.co_name, co.co_filename, co.co_firstlineno)
        raise NotImplementedError("function %s at %s:%d is not implemented" % attrs)

    return update_func_meta(newfunc, func)
|
||||
|
||||
|
||||
|
|
@ -127,6 +141,7 @@ def timeit(func, log, limit):
|
|||
print(args, file=log)
|
||||
print(kwargs, file=log)
|
||||
return res
|
||||
|
||||
return update_func_meta(newfunc, func)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -32,14 +32,12 @@ def check_urls(aggregate):
|
|||
try:
|
||||
aggregate.visit_loginurl()
|
||||
except Exception as msg:
|
||||
log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % \
|
||||
dict(msg=msg))
|
||||
log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % dict(msg=msg))
|
||||
raise
|
||||
try:
|
||||
aggregate.logger.start_log_output()
|
||||
except Exception as msg:
|
||||
log.error(LOG_CHECK, _("Error starting log output: %(msg)s.") % \
|
||||
dict(msg=msg))
|
||||
log.error(LOG_CHECK, _("Error starting log output: %(msg)s.") % dict(msg=msg))
|
||||
raise
|
||||
try:
|
||||
if not aggregate.urlqueue.empty():
|
||||
|
|
@ -52,9 +50,13 @@ def check_urls(aggregate):
|
|||
except KeyboardInterrupt:
|
||||
interrupt(aggregate)
|
||||
except RuntimeError:
|
||||
log.warn(LOG_CHECK,
|
||||
_("Could not start a new thread. Check that the current user" \
|
||||
" is allowed to start new threads."))
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
_(
|
||||
"Could not start a new thread. Check that the current user"
|
||||
" is allowed to start new threads."
|
||||
),
|
||||
)
|
||||
abort(aggregate)
|
||||
except Exception:
|
||||
# Catching "Exception" is intentionally done. This saves the program
|
||||
|
|
@ -84,10 +86,8 @@ def interrupt(aggregate):
|
|||
interrupts."""
|
||||
while True:
|
||||
try:
|
||||
log.warn(LOG_CHECK,
|
||||
_("interrupt; waiting for active threads to finish"))
|
||||
log.warn(LOG_CHECK,
|
||||
_("another interrupt will exit immediately"))
|
||||
log.warn(LOG_CHECK, _("interrupt; waiting for active threads to finish"))
|
||||
log.warn(LOG_CHECK, _("another interrupt will exit immediately"))
|
||||
abort(aggregate)
|
||||
break
|
||||
except KeyboardInterrupt:
|
||||
|
|
@ -113,6 +113,7 @@ def abort_now():
|
|||
if os.name == 'posix':
|
||||
# Unix systems can use signals
|
||||
import signal
|
||||
|
||||
os.kill(os.getpid(), signal.SIGTERM)
|
||||
time.sleep(1)
|
||||
os.kill(os.getpid(), signal.SIGKILL)
|
||||
|
|
@ -130,5 +131,6 @@ def get_aggregate(config):
|
|||
_robots_txt = robots_txt.RobotsTxt(config["useragent"])
|
||||
plugin_manager = plugins.PluginManager(config)
|
||||
result_cache = results.ResultCache()
|
||||
return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager,
|
||||
result_cache)
|
||||
return aggregator.Aggregate(
|
||||
config, _urlqueue, _robots_txt, plugin_manager, result_cache
|
||||
)
|
||||
|
|
|
|||
|
|
@ -34,15 +34,16 @@ _threads_lock = threading.RLock()
|
|||
_hosts_lock = threading.RLock()
|
||||
_downloadedbytes_lock = threading.RLock()
|
||||
|
||||
|
||||
def new_request_session(config, cookies):
|
||||
"""Create a new request session."""
|
||||
session = requests.Session()
|
||||
if cookies:
|
||||
session.cookies = cookies
|
||||
session.max_redirects = config["maxhttpredirects"]
|
||||
session.headers.update({
|
||||
"User-Agent": config["useragent"],
|
||||
})
|
||||
session.headers.update(
|
||||
{"User-Agent": config["useragent"],}
|
||||
)
|
||||
if config["cookiefile"]:
|
||||
for cookie in from_file(config["cookiefile"]):
|
||||
session.cookies.set_cookie(cookie)
|
||||
|
|
@ -52,8 +53,7 @@ def new_request_session(config, cookies):
|
|||
class Aggregate:
|
||||
"""Store thread-safe data collections for checker threads."""
|
||||
|
||||
def __init__(self, config, urlqueue, robots_txt, plugin_manager,
|
||||
result_cache):
|
||||
def __init__(self, config, urlqueue, robots_txt, plugin_manager, result_cache):
|
||||
"""Store given link checking objects."""
|
||||
self.config = config
|
||||
self.urlqueue = urlqueue
|
||||
|
|
@ -78,7 +78,8 @@ class Aggregate:
|
|||
user, password = self.config.get_user_password(url)
|
||||
if not user and not password:
|
||||
raise LinkCheckerError(
|
||||
"loginurl is configured but neither user nor password are set")
|
||||
"loginurl is configured but neither user nor password are set"
|
||||
)
|
||||
session = new_request_session(self.config, self.cookies)
|
||||
log.debug(LOG_CHECK, "Getting login form %s", url)
|
||||
kwargs = dict(timeout=self.config["timeout"])
|
||||
|
|
@ -119,11 +120,15 @@ class Aggregate:
|
|||
num = self.config["threads"]
|
||||
if num > 0:
|
||||
for dummy in range(num):
|
||||
t = checker.Checker(self.urlqueue, self.logger, self.add_request_session)
|
||||
t = checker.Checker(
|
||||
self.urlqueue, self.logger, self.add_request_session
|
||||
)
|
||||
self.threads.append(t)
|
||||
t.start()
|
||||
else:
|
||||
self.request_sessions[threading.get_ident()] = new_request_session(self.config, self.cookies)
|
||||
self.request_sessions[threading.get_ident()] = new_request_session(
|
||||
self.config, self.cookies
|
||||
)
|
||||
checker.check_urls(self.urlqueue, self.logger)
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
|
|
@ -162,10 +167,18 @@ class Aggregate:
|
|||
first = False
|
||||
log.info(LOG_CHECK, name[12:])
|
||||
args = dict(
|
||||
num=len([x for x in self.threads if x.getName().startswith("CheckThread-")]),
|
||||
num=len(
|
||||
[x for x in self.threads if x.getName().startswith("CheckThread-")]
|
||||
),
|
||||
timeout=strformat.strduration_long(self.config["aborttimeout"]),
|
||||
)
|
||||
log.info(LOG_CHECK, _("%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop.") % args)
|
||||
log.info(
|
||||
LOG_CHECK,
|
||||
_(
|
||||
"%(num)d URLs are still active. After a timeout of %(timeout)s the active URLs will stop."
|
||||
)
|
||||
% args,
|
||||
)
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
def get_check_threads(self):
|
||||
|
|
@ -187,7 +200,10 @@ class Aggregate:
|
|||
try:
|
||||
self.urlqueue.join(timeout=timeout)
|
||||
except urlqueue.Timeout:
|
||||
log.warn(LOG_CHECK, "Abort timed out after %d seconds, stopping application." % timeout)
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
"Abort timed out after %d seconds, stopping application." % timeout,
|
||||
)
|
||||
raise KeyboardInterrupt()
|
||||
|
||||
@synchronized(_threads_lock)
|
||||
|
|
@ -219,8 +235,9 @@ class Aggregate:
|
|||
|
||||
def end_log_output(self, **kwargs):
|
||||
"""Print ending output to log."""
|
||||
kwargs.update(dict(
|
||||
downloaded_bytes=self.downloaded_bytes,
|
||||
num_urls = len(self.result_cache),
|
||||
))
|
||||
kwargs.update(
|
||||
dict(
|
||||
downloaded_bytes=self.downloaded_bytes, num_urls=len(self.result_cache),
|
||||
)
|
||||
)
|
||||
self.logger.end_log_output(**kwargs)
|
||||
|
|
|
|||
|
|
@ -35,8 +35,7 @@ class StatusLogger:
|
|||
|
||||
def log_status(self, checked, in_progress, queue, duration, num_urls):
|
||||
"""Write status message to file descriptor."""
|
||||
msg = _n("%2d thread active", "%2d threads active", in_progress) % \
|
||||
in_progress
|
||||
msg = _n("%2d thread active", "%2d threads active", in_progress) % in_progress
|
||||
self.write("%s, " % msg)
|
||||
msg = _n("%5d link queued", "%5d links queued", queue) % queue
|
||||
self.write("%s, " % msg)
|
||||
|
|
@ -64,7 +63,9 @@ class StatusLogger:
|
|||
def internal_error(out=stderr, etype=None, evalue=None, tb=None):
|
||||
"""Print internal error message (output defaults to stderr)."""
|
||||
print(os.linesep, file=out)
|
||||
print(_("""********** Oops, I did it again. *************
|
||||
print(
|
||||
_(
|
||||
"""********** Oops, I did it again. *************
|
||||
|
||||
You have found an internal error in LinkChecker. Please write a bug report
|
||||
at %s
|
||||
|
|
@ -79,7 +80,11 @@ When using the commandline client:
|
|||
Not disclosing some of the information above due to privacy reasons is ok.
|
||||
I will try to help you nonetheless, but you have to give me something
|
||||
I can work with ;) .
|
||||
""") % configuration.SupportUrl, file=out)
|
||||
"""
|
||||
)
|
||||
% configuration.SupportUrl,
|
||||
file=out,
|
||||
)
|
||||
if etype is None:
|
||||
etype = sys.exc_info()[0]
|
||||
if evalue is None:
|
||||
|
|
@ -90,8 +95,11 @@ I can work with ;) .
|
|||
print_app_info(out=out)
|
||||
print_proxy_info(out=out)
|
||||
print_locale_info(out=out)
|
||||
print(os.linesep,
|
||||
_("******** LinkChecker internal error, over and out ********"), file=out)
|
||||
print(
|
||||
os.linesep,
|
||||
_("******** LinkChecker internal error, over and out ********"),
|
||||
file=out,
|
||||
)
|
||||
|
||||
|
||||
def print_env_info(key, out=stderr):
|
||||
|
|
@ -113,6 +121,7 @@ def print_locale_info(out=stderr):
|
|||
print_env_info(key, out=out)
|
||||
print(_("Default locale:"), i18n.get_locale(), file=out)
|
||||
|
||||
|
||||
# Environment variables influencing the interpreter execution
|
||||
# See python(1) man page.
|
||||
PYTHON_ENV_VARS = (
|
||||
|
|
@ -131,13 +140,18 @@ PYTHON_ENV_VARS = (
|
|||
'PYTHONWARNINGS',
|
||||
'PYTHONHASHSEED',
|
||||
)
|
||||
|
||||
|
||||
def print_app_info(out=stderr):
|
||||
"""Print system and application info (output defaults to stderr)."""
|
||||
print(_("System info:"), file=out)
|
||||
print(configuration.App, file=out)
|
||||
print(_("Released on:"), configuration.ReleaseDate, file=out)
|
||||
print(_("Python %(version)s on %(platform)s") %
|
||||
{"version": sys.version, "platform": sys.platform}, file=out)
|
||||
print(
|
||||
_("Python %(version)s on %(platform)s")
|
||||
% {"version": sys.version, "platform": sys.platform},
|
||||
file=out,
|
||||
)
|
||||
for key in PYTHON_ENV_VARS:
|
||||
print_env_info(key, out=out)
|
||||
print(configuration.get_modules_info(), file=out)
|
||||
|
|
@ -148,6 +162,5 @@ def print_app_info(out=stderr):
|
|||
|
||||
def print_version(out=stdout):
|
||||
"""Print the program version (output defaults to stdout)."""
|
||||
print(configuration.App, _("released"),
|
||||
configuration.ReleaseDate, file=out)
|
||||
print(configuration.App, _("released"), configuration.ReleaseDate, file=out)
|
||||
print(configuration.Copyright, file=out)
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ class Interrupt(task.CheckedTask):
|
|||
This gives us a portable SIGALRM implementation.
|
||||
The duration is checked every 5 seconds.
|
||||
"""
|
||||
|
||||
WaitSeconds = 5
|
||||
|
||||
def __init__(self, duration):
|
||||
|
|
@ -41,5 +42,8 @@ class Interrupt(task.CheckedTask):
|
|||
while not self.stopped(self.WaitSeconds):
|
||||
duration = time.time() - self.start_time
|
||||
if duration > self.duration:
|
||||
log.warn(LOG_CHECK, "Interrupt after %s" % strformat.strduration_long(duration))
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
"Interrupt after %s" % strformat.strduration_long(duration),
|
||||
)
|
||||
raise KeyboardInterrupt()
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ import threading
|
|||
import _thread
|
||||
|
||||
from ..decorators import synchronized
|
||||
|
||||
_lock = threading.Lock()
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@
|
|||
Dummy objects.
|
||||
"""
|
||||
|
||||
|
||||
class Dummy:
|
||||
"""A dummy object ignores all access to it. Useful for testing."""
|
||||
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ elif "G_BROKEN_FILENAMES" in os.environ:
|
|||
else:
|
||||
FSCODING = "utf-8"
|
||||
|
||||
|
||||
def path_safe(path):
|
||||
"""Ensure path string is compatible with the platform file system encoding."""
|
||||
if isinstance(path, str) and not os.path.supports_unicode_filenames:
|
||||
|
|
@ -83,7 +84,7 @@ def get_temp_file(mode='r', **kwargs):
|
|||
|
||||
def is_tty(fp):
|
||||
"""Check if is a file object pointing to a TTY."""
|
||||
return (hasattr(fp, "isatty") and fp.isatty())
|
||||
return hasattr(fp, "isatty") and fp.isatty()
|
||||
|
||||
|
||||
@lru_cache(128)
|
||||
|
|
|
|||
|
|
@ -19,8 +19,22 @@ Python implementation of a part of Dan Bernstein's ftpparse library.
|
|||
See also http://cr.yp.to/ftpparse.html
|
||||
"""
|
||||
|
||||
months = ("jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep",
|
||||
"oct", "nov", "dec")
|
||||
months = (
|
||||
"jan",
|
||||
"feb",
|
||||
"mar",
|
||||
"apr",
|
||||
"may",
|
||||
"jun",
|
||||
"jul",
|
||||
"aug",
|
||||
"sep",
|
||||
"oct",
|
||||
"nov",
|
||||
"dec",
|
||||
)
|
||||
|
||||
|
||||
def ismonth(txt):
|
||||
"""Check if given text is a month name."""
|
||||
return txt.lower() in months
|
||||
|
|
@ -78,20 +92,20 @@ def ftpparse(line):
|
|||
parts = line.split()
|
||||
if len(parts) < 7:
|
||||
return None
|
||||
del parts[0] # skip permissions
|
||||
del parts[0] # skip permissions
|
||||
if parts[0] != 'folder':
|
||||
del parts[0] # skip nlink
|
||||
del parts[0] # skip uid
|
||||
del parts[0] # skip gid or size
|
||||
del parts[0] # skip nlink
|
||||
del parts[0] # skip uid
|
||||
del parts[0] # skip gid or size
|
||||
if not ismonth(parts[0]):
|
||||
del parts[0] # skip size
|
||||
del parts[0] # skip size
|
||||
if not ismonth(parts[0]):
|
||||
return None
|
||||
del parts[0] # skip month
|
||||
del parts[0] # skip day
|
||||
del parts[0] # skip month
|
||||
del parts[0] # skip day
|
||||
if not parts:
|
||||
return None
|
||||
del parts[0] # skip year or time
|
||||
del parts[0] # skip year or time
|
||||
name = " ".join(parts)
|
||||
# resolve links
|
||||
if line[0] == 'l' and ' -> ' in name:
|
||||
|
|
|
|||
|
|
@ -19,13 +19,17 @@ HTML parser implemented using Beautiful Soup and html.parser.
|
|||
|
||||
from warnings import filterwarnings
|
||||
|
||||
filterwarnings("ignore",
|
||||
filterwarnings(
|
||||
"ignore",
|
||||
message="The soupsieve package is not installed. CSS selectors cannot be used.",
|
||||
category=UserWarning, module="bs4")
|
||||
category=UserWarning,
|
||||
module="bs4",
|
||||
)
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
|
||||
def make_soup(markup, from_encoding=None):
|
||||
return BeautifulSoup(markup, "html.parser", from_encoding=from_encoding,
|
||||
multi_valued_attributes=None)
|
||||
return BeautifulSoup(
|
||||
markup, "html.parser", from_encoding=from_encoding, multi_valued_attributes=None
|
||||
)
|
||||
|
|
|
|||
|
|
@ -25,41 +25,41 @@ unquote = strformat.unquote
|
|||
# HTML4/5 link tags
|
||||
# ripped mainly from HTML::Tagset.pm with HTML5 added
|
||||
LinkTags = {
|
||||
'a': ['href'],
|
||||
'applet': ['archive', 'src'],
|
||||
'area': ['href'],
|
||||
'audio': ['src'], # HTML5
|
||||
'bgsound': ['src'],
|
||||
'a': ['href'],
|
||||
'applet': ['archive', 'src'],
|
||||
'area': ['href'],
|
||||
'audio': ['src'], # HTML5
|
||||
'bgsound': ['src'],
|
||||
'blockquote': ['cite'],
|
||||
'body': ['background'],
|
||||
'button': ['formaction'], # HTML5
|
||||
'del': ['cite'],
|
||||
'embed': ['pluginspage', 'src'],
|
||||
'form': ['action'],
|
||||
'frame': ['src', 'longdesc'],
|
||||
'head': ['profile'],
|
||||
'html': ['manifest'], # HTML5
|
||||
'iframe': ['src', 'longdesc'],
|
||||
'ilayer': ['background'],
|
||||
'img': ['src', 'lowsrc', 'longdesc', 'usemap', 'srcset'],
|
||||
'input': ['src', 'usemap', 'formaction'],
|
||||
'ins': ['cite'],
|
||||
'isindex': ['action'],
|
||||
'layer': ['background', 'src'],
|
||||
'link': ['href'],
|
||||
'meta': ['content', 'href'],
|
||||
'object': ['classid', 'data', 'archive', 'usemap', 'codebase'],
|
||||
'q': ['cite'],
|
||||
'script': ['src'],
|
||||
'source': ['src'], # HTML5
|
||||
'table': ['background'],
|
||||
'td': ['background'],
|
||||
'th': ['background'],
|
||||
'tr': ['background'],
|
||||
'track': ['src'], # HTML5
|
||||
'video': ['src'], # HTML5
|
||||
'xmp': ['href'],
|
||||
None: ['style', 'itemtype'],
|
||||
'body': ['background'],
|
||||
'button': ['formaction'], # HTML5
|
||||
'del': ['cite'],
|
||||
'embed': ['pluginspage', 'src'],
|
||||
'form': ['action'],
|
||||
'frame': ['src', 'longdesc'],
|
||||
'head': ['profile'],
|
||||
'html': ['manifest'], # HTML5
|
||||
'iframe': ['src', 'longdesc'],
|
||||
'ilayer': ['background'],
|
||||
'img': ['src', 'lowsrc', 'longdesc', 'usemap', 'srcset'],
|
||||
'input': ['src', 'usemap', 'formaction'],
|
||||
'ins': ['cite'],
|
||||
'isindex': ['action'],
|
||||
'layer': ['background', 'src'],
|
||||
'link': ['href'],
|
||||
'meta': ['content', 'href'],
|
||||
'object': ['classid', 'data', 'archive', 'usemap', 'codebase'],
|
||||
'q': ['cite'],
|
||||
'script': ['src'],
|
||||
'source': ['src'], # HTML5
|
||||
'table': ['background'],
|
||||
'td': ['background'],
|
||||
'th': ['background'],
|
||||
'tr': ['background'],
|
||||
'track': ['src'], # HTML5
|
||||
'video': ['src'], # HTML5
|
||||
'xmp': ['href'],
|
||||
None: ['style', 'itemtype'],
|
||||
}
|
||||
|
||||
# HTML anchor tags
|
||||
|
|
@ -70,8 +70,8 @@ AnchorTags = {
|
|||
|
||||
# WML tags
|
||||
WmlTags = {
|
||||
'a': ['href'],
|
||||
'go': ['href'],
|
||||
'a': ['href'],
|
||||
'go': ['href'],
|
||||
'img': ['src'],
|
||||
}
|
||||
|
||||
|
|
@ -219,6 +219,9 @@ def find_links(soup, callback, tags):
|
|||
lf = LinkFinder(callback, tags)
|
||||
for element in soup.find_all(True):
|
||||
lf.html_element(
|
||||
element.name, element.attrs, element.text.strip(),
|
||||
element.name,
|
||||
element.attrs,
|
||||
element.text.strip(),
|
||||
element.sourceline,
|
||||
None if element.sourcepos is None else element.sourcepos + 1)
|
||||
None if element.sourcepos is None else element.sourcepos + 1,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ HTML form utils
|
|||
from ..htmlutil import htmlsoup
|
||||
from .. import log, LOG_CHECK
|
||||
|
||||
|
||||
class Form:
|
||||
"""Store HTML form URL and form data."""
|
||||
|
||||
|
|
@ -44,10 +45,8 @@ def search_form(content, cgiuser, cgipassword):
|
|||
cginames = {cgiuser, cgipassword} - {None}
|
||||
for form_element in soup.find_all("form", action=True):
|
||||
form = Form(form_element["action"])
|
||||
for input_element in form_element.find_all("input",
|
||||
attrs={"name": True}):
|
||||
form.add_value(
|
||||
input_element["name"], input_element.attrs.get("value"))
|
||||
for input_element in form_element.find_all("input", attrs={"name": True}):
|
||||
form.add_value(input_element["name"], input_element.attrs.get("value"))
|
||||
if cginames <= set(form.data):
|
||||
log.debug(LOG_CHECK, "Found form %s", form)
|
||||
return form
|
||||
|
|
|
|||
|
|
@ -27,14 +27,10 @@ def x509_to_dict(x509):
|
|||
subject, subjectAltName and optional notAfter.
|
||||
"""
|
||||
from requests.packages.urllib3.contrib.pyopenssl import get_subj_alt_name
|
||||
|
||||
res = {
|
||||
'subject': (
|
||||
(('commonName', x509.get_subject().CN),),
|
||||
),
|
||||
'subjectAltName': [
|
||||
('DNS', value)
|
||||
for value in get_subj_alt_name(x509)
|
||||
]
|
||||
'subject': ((('commonName', x509.get_subject().CN),),),
|
||||
'subjectAltName': [('DNS', value) for value in get_subj_alt_name(x509)],
|
||||
}
|
||||
notAfter = x509.get_notAfter()
|
||||
if notAfter is not None:
|
||||
|
|
|
|||
|
|
@ -30,13 +30,16 @@ default_language = default_encoding = None
|
|||
default_directory = None
|
||||
default_domain = None
|
||||
|
||||
|
||||
def install_builtin(translator, do_unicode):
|
||||
"""Install _() and _n() gettext methods into default namespace."""
|
||||
import builtins
|
||||
|
||||
builtins.__dict__['_'] = translator.gettext
|
||||
# also install ngettext
|
||||
builtins.__dict__['_n'] = translator.ngettext
|
||||
|
||||
|
||||
class Translator(gettext.GNUTranslations):
|
||||
"""A translation class always installing its gettext methods into the
|
||||
default namespace."""
|
||||
|
|
@ -84,18 +87,29 @@ def init(domain, directory, loc=None):
|
|||
|
||||
def install_language(language):
|
||||
"""Install translation service routines into default namespace."""
|
||||
translator = get_translator(default_domain, default_directory,
|
||||
languages=[get_lang(language)], fallback=True)
|
||||
translator = get_translator(
|
||||
default_domain, default_directory, languages=[get_lang(language)], fallback=True
|
||||
)
|
||||
do_unicode = True
|
||||
translator.install(do_unicode)
|
||||
|
||||
|
||||
def get_translator(domain, directory, languages=None,
|
||||
translatorklass=Translator, fallback=False,
|
||||
fallbackklass=NullTranslator):
|
||||
def get_translator(
|
||||
domain,
|
||||
directory,
|
||||
languages=None,
|
||||
translatorklass=Translator,
|
||||
fallback=False,
|
||||
fallbackklass=NullTranslator,
|
||||
):
|
||||
"""Search the appropriate GNUTranslations class."""
|
||||
translator = gettext.translation(domain, localedir=directory,
|
||||
languages=languages, class_=translatorklass, fallback=fallback)
|
||||
translator = gettext.translation(
|
||||
domain,
|
||||
localedir=directory,
|
||||
languages=languages,
|
||||
class_=translatorklass,
|
||||
fallback=fallback,
|
||||
)
|
||||
if not isinstance(translator, gettext.GNUTranslations) and fallbackklass:
|
||||
translator = fallbackklass()
|
||||
return translator
|
||||
|
|
@ -175,6 +189,7 @@ lang_transis = {
|
|||
'en': {'de': 'Englisch'},
|
||||
}
|
||||
|
||||
|
||||
def lang_name(lang):
|
||||
"""Return full name of given language."""
|
||||
return lang_names[lang]
|
||||
|
|
|
|||
|
|
@ -25,8 +25,15 @@ import re
|
|||
import time
|
||||
import urllib.parse
|
||||
|
||||
from . import configuration, strformat, checker, director, get_link_pat, \
|
||||
init_i18n, url as urlutil
|
||||
from . import (
|
||||
configuration,
|
||||
strformat,
|
||||
checker,
|
||||
director,
|
||||
get_link_pat,
|
||||
init_i18n,
|
||||
url as urlutil,
|
||||
)
|
||||
from .decorators import synchronized
|
||||
|
||||
# 5 minutes timeout for requests
|
||||
|
|
@ -67,17 +74,20 @@ lang_locale = {
|
|||
}
|
||||
_is_level = re.compile(r'^(0|1|2|3|-1)$').match
|
||||
|
||||
|
||||
class LCFormError(Exception):
|
||||
"""Form related errors."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
def get_response_headers():
    """Get list of response headers in key-value form.

    @return: list of (header name, header value) tuples
    @rtype: list
    """
    # Header names must not include the ":" separator; a name of "Pragma:"
    # would be sent as "Pragma:: no-cache".
    return [
        ("Content-type", "text/html"),
        ("Cache-Control", "no-cache"),
        ("Pragma", "no-cache"),
    ]
|
||||
|
||||
|
||||
def formvalue(form, key):
|
||||
|
|
@ -89,6 +99,8 @@ def formvalue(form, key):
|
|||
|
||||
|
||||
_lock = threading.Lock()
|
||||
|
||||
|
||||
class ThreadsafeIO:
|
||||
"""Thread-safe unicode I/O class."""
|
||||
|
||||
|
|
@ -235,7 +247,7 @@ def log(env, msg):
|
|||
def dump(env, form):
    """Log environment and form.

    @param env: mapping whose items are logged as "name=value" lines; it
      is also passed through to log() as its first argument
    @param form: iterable of form keys; each value is looked up with
      formvalue() and logged
    """
    for var, value in env.items():
        # Format instead of concatenating: string concatenation raises
        # TypeError as soon as a value is not a str. Presumably env is a
        # WSGI environ, which also holds non-string entries - verify
        # against the caller. Output for str values is unchanged.
        log(env, "%s=%s" % (var, value))
    for key in form:
        log(env, str(formvalue(form, key)))
|
||||
|
||||
|
|
@ -247,7 +259,9 @@ def format_error(why):
|
|||
@return: HTML page content
|
||||
@rtype: unicode
|
||||
"""
|
||||
return _("""<!DOCTYPE HTML>
|
||||
return (
|
||||
_(
|
||||
"""<!DOCTYPE HTML>
|
||||
<html><head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
|
||||
<title>LinkChecker Online Error</title></head>
|
||||
|
|
@ -260,4 +274,7 @@ contains only these characters: <code>A-Za-z0-9./_~-</code><br/><br/>
|
|||
Errors are logged.
|
||||
</blockquote>
|
||||
</body>
|
||||
</html>""") % html.escape(why)
|
||||
</html>"""
|
||||
)
|
||||
% html.escape(why)
|
||||
)
|
||||
|
|
|
|||
|
|
@ -42,15 +42,17 @@ def get_package_modules(packagename):
|
|||
parentmodule = os.path.basename(os.path.dirname(__file__))
|
||||
with zipfile.ZipFile(zipname, 'r') as f:
|
||||
prefix = "%s/%s/" % (parentmodule, packagename)
|
||||
modnames = [os.path.splitext(n[len(prefix):])[0]
|
||||
for n in f.namelist()
|
||||
if n.startswith(prefix) and "__init__" not in n]
|
||||
modnames = [
|
||||
os.path.splitext(n[len(prefix) :])[0]
|
||||
for n in f.namelist()
|
||||
if n.startswith(prefix) and "__init__" not in n
|
||||
]
|
||||
else:
|
||||
dirname = os.path.join(os.path.dirname(__file__), packagename)
|
||||
modnames = [x[:-3] for x in get_importable_files(dirname)]
|
||||
for modname in modnames:
|
||||
try:
|
||||
name ="..%s.%s" % (packagename, modname)
|
||||
name = "..%s.%s" % (packagename, modname)
|
||||
yield importlib.import_module(name, __name__)
|
||||
except ImportError as msg:
|
||||
print("WARN: could not load module %s: %s" % (modname, msg))
|
||||
|
|
@ -63,7 +65,7 @@ def get_folder_modules(folder, parentpackage):
|
|||
return
|
||||
for filename in get_importable_files(folder):
|
||||
fullname = os.path.join(folder, filename)
|
||||
modname = parentpackage+"."+filename[:-3]
|
||||
modname = parentpackage + "." + filename[:-3]
|
||||
try:
|
||||
yield imp.load_source(modname, fullname)
|
||||
except ImportError as msg:
|
||||
|
|
@ -80,7 +82,10 @@ def get_importable_files(folder):
|
|||
if fname.endswith('.py') and not fname.startswith('_'):
|
||||
fullname = os.path.join(folder, fname)
|
||||
if check_writable_by_others(fullname):
|
||||
print("ERROR: refuse to load module from world writable file %r" % fullname)
|
||||
print(
|
||||
"ERROR: refuse to load module from world writable file %r"
|
||||
% fullname
|
||||
)
|
||||
else:
|
||||
yield fname
|
||||
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ Locking utility class.
|
|||
import threading
|
||||
from . import log, LOG_THREAD
|
||||
|
||||
|
||||
def get_lock(name, debug=False):
|
||||
"""Get a new lock.
|
||||
@param debug: if True, acquire() and release() will have debug messages
|
||||
|
|
|
|||
|
|
@ -24,11 +24,13 @@ import inspect
|
|||
import traceback
|
||||
|
||||
# memory leak debugging
|
||||
#import gc
|
||||
#gc.enable()
|
||||
#gc.set_debug(gc.DEBUG_LEAK)
|
||||
# import gc
|
||||
# gc.enable()
|
||||
# gc.set_debug(gc.DEBUG_LEAK)
|
||||
|
||||
PRINT_LOCALVARS = False
|
||||
|
||||
|
||||
def _stack_format(stack):
|
||||
"""Format a stack trace to a message.
|
||||
|
||||
|
|
|
|||
|
|
@ -41,20 +41,18 @@ lognamelist = ", ".join(repr(name) for name in lognames)
|
|||
# logging configuration
|
||||
configdict = {
|
||||
'version': 1,
|
||||
'loggers': {
|
||||
},
|
||||
'root': {
|
||||
'level': 'WARN',
|
||||
},
|
||||
'loggers': {},
|
||||
'root': {'level': 'WARN',},
|
||||
'incremental': True,
|
||||
}
|
||||
|
||||
|
||||
def init_log_config(handler=None):
|
||||
"""Set up the application logging (not to be confused with check loggers).
|
||||
"""
|
||||
for applog in lognames.values():
|
||||
# propagate except for root app logger 'linkcheck'
|
||||
propagate = (applog != LOG_ROOT)
|
||||
propagate = applog != LOG_ROOT
|
||||
configdict['loggers'][applog] = dict(level='INFO', propagate=propagate)
|
||||
|
||||
logging.config.dictConfig(configdict)
|
||||
|
|
@ -86,8 +84,8 @@ def set_debug(loggers):
|
|||
"""Set debugging log level."""
|
||||
set_loglevel(loggers, logging.DEBUG)
|
||||
# enable for httplib debugging (used by requests.packages.urllib3)
|
||||
#import httplib
|
||||
#httplib.HTTPConnection.debuglevel = 1
|
||||
# import httplib
|
||||
# httplib.HTTPConnection.debuglevel = 1
|
||||
|
||||
|
||||
def set_loglevel(loggers, level):
|
||||
|
|
|
|||
|
|
@ -46,15 +46,7 @@ Fields = dict(
|
|||
)
|
||||
del _
|
||||
|
||||
ContentTypes = dict(
|
||||
image=0,
|
||||
text=0,
|
||||
video=0,
|
||||
audio=0,
|
||||
application=0,
|
||||
mail=0,
|
||||
other=0,
|
||||
)
|
||||
ContentTypes = dict(image=0, text=0, video=0, audio=0, application=0, mail=0, other=0,)
|
||||
|
||||
|
||||
class LogStatistics:
|
||||
|
|
@ -236,9 +228,13 @@ class _Logger(abc.ABC):
|
|||
self.close_fd = True
|
||||
except IOError:
|
||||
msg = sys.exc_info()[1]
|
||||
log.warn(LOG_CHECK,
|
||||
"Could not open file %r for writing: %s\n"
|
||||
"Disabling log output of %s", self.filename, msg, self)
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
"Could not open file %r for writing: %s\n" "Disabling log output of %s",
|
||||
self.filename,
|
||||
msg,
|
||||
self,
|
||||
)
|
||||
self.fd = dummy.Dummy()
|
||||
self.is_active = False
|
||||
self.filename = None
|
||||
|
|
@ -246,10 +242,10 @@ class _Logger(abc.ABC):
|
|||
def create_fd(self):
|
||||
"""Create open file descriptor."""
|
||||
if self.filename is None:
|
||||
return i18n.get_encoded_writer(encoding=self.output_encoding,
|
||||
errors=self.codec_errors)
|
||||
return codecs.open(self.filename, "wb", self.output_encoding,
|
||||
self.codec_errors)
|
||||
return i18n.get_encoded_writer(
|
||||
encoding=self.output_encoding, errors=self.codec_errors
|
||||
)
|
||||
return codecs.open(self.filename, "wb", self.output_encoding, self.codec_errors)
|
||||
|
||||
def close_fileoutput(self):
|
||||
"""
|
||||
|
|
@ -289,12 +285,14 @@ class _Logger(abc.ABC):
|
|||
"""
|
||||
Return wrapped version of given lines.
|
||||
"""
|
||||
sep = os.linesep+os.linesep
|
||||
sep = os.linesep + os.linesep
|
||||
text = sep.join(lines)
|
||||
kwargs = dict(subsequent_indent=" "*self.max_indent,
|
||||
initial_indent=" "*self.max_indent,
|
||||
break_long_words=False,
|
||||
break_on_hyphens=False)
|
||||
kwargs = dict(
|
||||
subsequent_indent=" " * self.max_indent,
|
||||
initial_indent=" " * self.max_indent,
|
||||
break_long_words=False,
|
||||
break_on_hyphens=False,
|
||||
)
|
||||
return strformat.wrap(text, width, **kwargs).lstrip()
|
||||
|
||||
def write(self, s, **args):
|
||||
|
|
@ -311,9 +309,12 @@ class _Logger(abc.ABC):
|
|||
self.fd.write(s, **args)
|
||||
except IOError:
|
||||
msg = sys.exc_info()[1]
|
||||
log.warn(LOG_CHECK,
|
||||
"Could not write to output file: %s\n"
|
||||
"Disabling log output of %s", msg, self)
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
"Could not write to output file: %s\n" "Disabling log output of %s",
|
||||
msg,
|
||||
self,
|
||||
)
|
||||
self.close_fileoutput()
|
||||
self.fd = dummy.Dummy()
|
||||
self.is_active = False
|
||||
|
|
@ -356,9 +357,9 @@ class _Logger(abc.ABC):
|
|||
parts = self.logparts
|
||||
values = (self.part(x) for x in parts)
|
||||
# maximum indent for localized log part names
|
||||
self.max_indent = max(len(x) for x in values)+1
|
||||
self.max_indent = max(len(x) for x in values) + 1
|
||||
for key in parts:
|
||||
numspaces = (self.max_indent - len(self.part(key)))
|
||||
numspaces = self.max_indent - len(self.part(key))
|
||||
self.logspaces[key] = " " * numspaces
|
||||
self.stats.reset()
|
||||
self.starttime = time.time()
|
||||
|
|
@ -374,22 +375,29 @@ class _Logger(abc.ABC):
|
|||
|
||||
def write_intro(self):
|
||||
"""Write intro comments."""
|
||||
self.comment(_("created by %(app)s at %(time)s") %
|
||||
{"app": configuration.AppName,
|
||||
"time": strformat.strtime(self.starttime)})
|
||||
self.comment(_("Get the newest version at %(url)s") %
|
||||
{'url': configuration.Url})
|
||||
self.comment(_("Write comments and bugs to %(url)s") %
|
||||
{'url': configuration.SupportUrl})
|
||||
self.comment(
|
||||
_("created by %(app)s at %(time)s")
|
||||
% {"app": configuration.AppName, "time": strformat.strtime(self.starttime)}
|
||||
)
|
||||
self.comment(
|
||||
_("Get the newest version at %(url)s") % {'url': configuration.Url}
|
||||
)
|
||||
self.comment(
|
||||
_("Write comments and bugs to %(url)s") % {'url': configuration.SupportUrl}
|
||||
)
|
||||
self.check_date()
|
||||
|
||||
def write_outro(self):
|
||||
"""Write outro comments."""
|
||||
self.stoptime = time.time()
|
||||
duration = self.stoptime - self.starttime
|
||||
self.comment(_("Stopped checking at %(time)s (%(duration)s)") %
|
||||
{"time": strformat.strtime(self.stoptime),
|
||||
"duration": strformat.strduration_long(duration)})
|
||||
self.comment(
|
||||
_("Stopped checking at %(time)s (%(duration)s)")
|
||||
% {
|
||||
"time": strformat.strtime(self.stoptime),
|
||||
"duration": strformat.strduration_long(duration),
|
||||
}
|
||||
)
|
||||
|
||||
@abc.abstractmethod
|
||||
def log_url(self, url_data):
|
||||
|
|
@ -445,9 +453,11 @@ class _Logger(abc.ABC):
|
|||
return modified.strftime("%Y-%m-%d{0}%H:%M:%S.%fZ".format(sep))
|
||||
return ""
|
||||
|
||||
|
||||
def _get_loggers():
|
||||
"""Return list of Logger classes."""
|
||||
from .. import loader
|
||||
|
||||
modules = loader.get_package_modules('logger')
|
||||
return list(loader.get_plugins(modules, [_Logger]))
|
||||
|
||||
|
|
|
|||
|
|
@ -76,8 +76,9 @@ class BlacklistLogger(_Logger):
|
|||
"""
|
||||
Read a previously stored blacklist from file fd.
|
||||
"""
|
||||
with codecs.open(self.filename, 'r', self.output_encoding,
|
||||
self.codec_errors) as fd:
|
||||
with codecs.open(
|
||||
self.filename, 'r', self.output_encoding, self.codec_errors
|
||||
) as fd:
|
||||
for line in fd:
|
||||
line = line.rstrip()
|
||||
if line.startswith('#') or not line:
|
||||
|
|
|
|||
|
|
@ -24,9 +24,23 @@ from . import _Logger
|
|||
from .. import strformat
|
||||
|
||||
Columns = (
|
||||
"urlname", "parentname", "baseref", "result", "warningstring",
|
||||
"infostring", "valid", "url", "line", "column", "name",
|
||||
"dltime", "size", "checktime", "cached", "level", "modified",
|
||||
"urlname",
|
||||
"parentname",
|
||||
"baseref",
|
||||
"result",
|
||||
"warningstring",
|
||||
"infostring",
|
||||
"valid",
|
||||
"url",
|
||||
"line",
|
||||
"column",
|
||||
"name",
|
||||
"dltime",
|
||||
"size",
|
||||
"checktime",
|
||||
"cached",
|
||||
"level",
|
||||
"modified",
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -70,9 +84,13 @@ class CSVLogger(_Logger):
|
|||
# write empty string to initialize file output
|
||||
self.write("")
|
||||
self.queue = StringIO()
|
||||
self.writer = csv.writer(self.queue, dialect=self.dialect,
|
||||
delimiter=self.separator, lineterminator=self.linesep,
|
||||
quotechar=self.quotechar)
|
||||
self.writer = csv.writer(
|
||||
self.queue,
|
||||
dialect=self.dialect,
|
||||
delimiter=self.separator,
|
||||
lineterminator=self.linesep,
|
||||
quotechar=self.quotechar,
|
||||
)
|
||||
for s in Columns:
|
||||
if self.has_part(s):
|
||||
row.append(s)
|
||||
|
|
|
|||
|
|
@ -55,8 +55,7 @@ class CustomXMLLogger(xmllog._XMLLogger):
|
|||
'line': "%s" % url_data.line,
|
||||
'column': "%s" % url_data.column,
|
||||
}
|
||||
self.xml_tag("parent", url_data.parent_url,
|
||||
attrs=attrs)
|
||||
self.xml_tag("parent", url_data.parent_url, attrs=attrs)
|
||||
if url_data.base_ref and self.has_part('base'):
|
||||
self.xml_tag("baseref", url_data.base_ref)
|
||||
if self.has_part("realurl"):
|
||||
|
|
|
|||
|
|
@ -93,6 +93,7 @@ class _GraphLogger(_Logger):
|
|||
|
||||
_disallowed = re.compile(r"[^a-zA-Z0-9 '#(){}\-\[\]\.,;:\!\?]+")
|
||||
|
||||
|
||||
def quote(s):
|
||||
"""Replace disallowed characters in node or edge labels.
|
||||
Also remove whitespace from beginning or end of label."""
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ class GraphXMLLogger(_XMLLogger, _GraphLogger):
|
|||
|
||||
LoggerName = 'gxml'
|
||||
|
||||
LoggerArgs = {
|
||||
LoggerArgs = {
|
||||
"filename": "linkchecker-out.gxml",
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -28,8 +28,10 @@ from .. import strformat, configuration
|
|||
# ss=1 enables show source
|
||||
validate_html = "http://validator.w3.org/check?ss=1&uri=%(uri)s"
|
||||
# options are the default
|
||||
validate_css = "http://jigsaw.w3.org/css-validator/validator?" \
|
||||
"uri=%(uri)s&warning=1&profile=css2&usermedium=all"
|
||||
validate_css = (
|
||||
"http://jigsaw.w3.org/css-validator/validator?"
|
||||
"uri=%(uri)s&warning=1&profile=css2&usermedium=all"
|
||||
)
|
||||
|
||||
HTML_HEADER = """<!DOCTYPE HTML>
|
||||
<html>
|
||||
|
|
@ -64,15 +66,15 @@ class HtmlLogger(_Logger):
|
|||
|
||||
LoggerName = 'html'
|
||||
|
||||
LoggerArgs = {
|
||||
"filename": "linkchecker-out.html",
|
||||
LoggerArgs = {
|
||||
"filename": "linkchecker-out.html",
|
||||
'colorbackground': '#fff7e5',
|
||||
'colorurl': '#dcd5cf',
|
||||
'colorborder': '#000000',
|
||||
'colorlink': '#191c83',
|
||||
'colorwarning': '#e0954e',
|
||||
'colorerror': '#db4930',
|
||||
'colorok': '#3ba557',
|
||||
'colorurl': '#dcd5cf',
|
||||
'colorborder': '#000000',
|
||||
'colorlink': '#191c83',
|
||||
'colorwarning': '#e0954e',
|
||||
'colorerror': '#db4930',
|
||||
'colorok': '#3ba557',
|
||||
}
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
|
@ -116,12 +118,16 @@ class HtmlLogger(_Logger):
|
|||
self.write(HTML_HEADER % header)
|
||||
self.comment("Generated by %s" % configuration.App)
|
||||
if self.has_part('intro'):
|
||||
self.write("<h2>"+configuration.App+
|
||||
"</h2><br/><blockquote>"+
|
||||
configuration.Freeware+"<br/><br/>"+
|
||||
(_("Start checking at %s") %
|
||||
strformat.strtime(self.starttime))+
|
||||
os.linesep+"<br/>")
|
||||
self.write(
|
||||
"<h2>"
|
||||
+ configuration.App
|
||||
+ "</h2><br/><blockquote>"
|
||||
+ configuration.Freeware
|
||||
+ "<br/><br/>"
|
||||
+ (_("Start checking at %s") % strformat.strtime(self.starttime))
|
||||
+ os.linesep
|
||||
+ "<br/>"
|
||||
)
|
||||
self.check_date()
|
||||
self.flush()
|
||||
|
||||
|
|
@ -184,10 +190,15 @@ class HtmlLogger(_Logger):
|
|||
|
||||
def write_parent(self, url_data):
|
||||
"""Write url_data.parent_url."""
|
||||
self.write("<tr><td>"+self.part("parenturl")+
|
||||
'</td><td><a target="top" href="'+
|
||||
url_data.parent_url+'">'+
|
||||
html.escape(url_data.parent_url)+"</a>")
|
||||
self.write(
|
||||
"<tr><td>"
|
||||
+ self.part("parenturl")
|
||||
+ '</td><td><a target="top" href="'
|
||||
+ url_data.parent_url
|
||||
+ '">'
|
||||
+ html.escape(url_data.parent_url)
|
||||
+ "</a>"
|
||||
)
|
||||
if url_data.line is not None:
|
||||
self.write(_(", line %d") % url_data.line)
|
||||
if url_data.column is not None:
|
||||
|
|
@ -199,58 +210,98 @@ class HtmlLogger(_Logger):
|
|||
vhtml = validate_html % {'uri': url_data.parent_url}
|
||||
vcss = validate_css % {'uri': url_data.parent_url}
|
||||
self.writeln()
|
||||
self.writeln('(<a href="'+vhtml+'">HTML</a>)')
|
||||
self.write('(<a href="'+vcss+'">CSS</a>)')
|
||||
self.writeln('(<a href="' + vhtml + '">HTML</a>)')
|
||||
self.write('(<a href="' + vcss + '">CSS</a>)')
|
||||
self.writeln("</td></tr>")
|
||||
|
||||
def write_base(self, url_data):
|
||||
"""Write url_data.base_ref."""
|
||||
self.writeln("<tr><td>"+self.part("base")+"</td><td>"+
|
||||
html.escape(url_data.base_ref)+"</td></tr>")
|
||||
self.writeln(
|
||||
"<tr><td>"
|
||||
+ self.part("base")
|
||||
+ "</td><td>"
|
||||
+ html.escape(url_data.base_ref)
|
||||
+ "</td></tr>"
|
||||
)
|
||||
|
||||
def write_real(self, url_data):
|
||||
"""Write url_data.url."""
|
||||
self.writeln("<tr><td>"+self.part("realurl")+"</td><td>"+
|
||||
'<a target="top" href="'+url_data.url+
|
||||
'">'+html.escape(url_data.url)+"</a></td></tr>")
|
||||
self.writeln(
|
||||
"<tr><td>"
|
||||
+ self.part("realurl")
|
||||
+ "</td><td>"
|
||||
+ '<a target="top" href="'
|
||||
+ url_data.url
|
||||
+ '">'
|
||||
+ html.escape(url_data.url)
|
||||
+ "</a></td></tr>"
|
||||
)
|
||||
|
||||
def write_dltime(self, url_data):
|
||||
"""Write url_data.dltime."""
|
||||
self.writeln("<tr><td>"+self.part("dltime")+"</td><td>"+
|
||||
(_("%.3f seconds") % url_data.dltime)+
|
||||
"</td></tr>")
|
||||
self.writeln(
|
||||
"<tr><td>"
|
||||
+ self.part("dltime")
|
||||
+ "</td><td>"
|
||||
+ (_("%.3f seconds") % url_data.dltime)
|
||||
+ "</td></tr>"
|
||||
)
|
||||
|
||||
def write_size(self, url_data):
|
||||
"""Write url_data.size."""
|
||||
self.writeln("<tr><td>"+self.part("dlsize")+"</td><td>"+
|
||||
strformat.strsize(url_data.size)+
|
||||
"</td></tr>")
|
||||
self.writeln(
|
||||
"<tr><td>"
|
||||
+ self.part("dlsize")
|
||||
+ "</td><td>"
|
||||
+ strformat.strsize(url_data.size)
|
||||
+ "</td></tr>"
|
||||
)
|
||||
|
||||
def write_checktime(self, url_data):
|
||||
"""Write url_data.checktime."""
|
||||
self.writeln("<tr><td>"+self.part("checktime")+"</td><td>"+
|
||||
(_("%.3f seconds") % url_data.checktime)+"</td></tr>")
|
||||
self.writeln(
|
||||
"<tr><td>"
|
||||
+ self.part("checktime")
|
||||
+ "</td><td>"
|
||||
+ (_("%.3f seconds") % url_data.checktime)
|
||||
+ "</td></tr>"
|
||||
)
|
||||
|
||||
def write_info(self, url_data):
|
||||
"""Write url_data.info."""
|
||||
sep = "<br/>"+os.linesep
|
||||
sep = "<br/>" + os.linesep
|
||||
text = sep.join(html.escape(x) for x in url_data.info)
|
||||
self.writeln('<tr><td valign="top">' + self.part("info")+
|
||||
"</td><td>"+text+"</td></tr>")
|
||||
self.writeln(
|
||||
'<tr><td valign="top">'
|
||||
+ self.part("info")
|
||||
+ "</td><td>"
|
||||
+ text
|
||||
+ "</td></tr>"
|
||||
)
|
||||
|
||||
def write_modified(self, url_data):
|
||||
"""Write url_data.modified."""
|
||||
text = html.escape(self.format_modified(url_data.modified))
|
||||
self.writeln('<tr><td valign="top">' + self.part("modified") +
|
||||
"</td><td>"+text+"</td></tr>")
|
||||
self.writeln(
|
||||
'<tr><td valign="top">'
|
||||
+ self.part("modified")
|
||||
+ "</td><td>"
|
||||
+ text
|
||||
+ "</td></tr>"
|
||||
)
|
||||
|
||||
def write_warning(self, url_data):
|
||||
"""Write url_data.warnings."""
|
||||
sep = "<br/>"+os.linesep
|
||||
sep = "<br/>" + os.linesep
|
||||
text = sep.join(html.escape(x[1]) for x in url_data.warnings)
|
||||
self.writeln('<tr><td class="warning" '+
|
||||
'valign="top">' + self.part("warning") +
|
||||
'</td><td class="warning">' + text + "</td></tr>")
|
||||
self.writeln(
|
||||
'<tr><td class="warning" '
|
||||
+ 'valign="top">'
|
||||
+ self.part("warning")
|
||||
+ '</td><td class="warning">'
|
||||
+ text
|
||||
+ "</td></tr>"
|
||||
)
|
||||
|
||||
def write_result(self, url_data):
|
||||
"""Write url_data.result."""
|
||||
|
|
@ -265,22 +316,30 @@ class HtmlLogger(_Logger):
|
|||
self.write('</td><td class="error">')
|
||||
self.write(html.escape(_("Error")))
|
||||
if url_data.result:
|
||||
self.write(": "+html.escape(url_data.result))
|
||||
self.write(": " + html.escape(url_data.result))
|
||||
self.writeln("</td></tr>")
|
||||
|
||||
def write_stats(self):
|
||||
"""Write check statistic infos."""
|
||||
self.writeln('<br/><i>%s</i><br/>' % _("Statistics"))
|
||||
if self.stats.number > 0:
|
||||
self.writeln(_(
|
||||
"Content types: %(image)d image, %(text)d text, %(video)d video, "
|
||||
"%(audio)d audio, %(application)d application, %(mail)d mail"
|
||||
" and %(other)d other.") % self.stats.link_types)
|
||||
self.writeln(
|
||||
_(
|
||||
"Content types: %(image)d image, %(text)d text, %(video)d video, "
|
||||
"%(audio)d audio, %(application)d application, %(mail)d mail"
|
||||
" and %(other)d other."
|
||||
)
|
||||
% self.stats.link_types
|
||||
)
|
||||
self.writeln("<br/>")
|
||||
self.writeln(_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.") %
|
||||
dict(min=self.stats.min_url_length,
|
||||
max=self.stats.max_url_length,
|
||||
avg=self.stats.avg_url_length))
|
||||
self.writeln(
|
||||
_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.")
|
||||
% dict(
|
||||
min=self.stats.min_url_length,
|
||||
max=self.stats.max_url_length,
|
||||
avg=self.stats.avg_url_length,
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.writeln(_("No statistics available since no URLs were checked."))
|
||||
self.writeln("<br/>")
|
||||
|
|
@ -288,42 +347,77 @@ class HtmlLogger(_Logger):
|
|||
def write_outro(self):
|
||||
"""Write end of check message."""
|
||||
self.writeln("<br/>")
|
||||
self.write(_("That's it.")+" ")
|
||||
self.write(_("That's it.") + " ")
|
||||
if self.stats.number >= 0:
|
||||
self.write(_n("%d link checked.", "%d links checked.",
|
||||
self.stats.number) % self.stats.number)
|
||||
self.write(
|
||||
_n("%d link checked.", "%d links checked.", self.stats.number)
|
||||
% self.stats.number
|
||||
)
|
||||
self.write(" ")
|
||||
self.write(_n("%d warning found", "%d warnings found",
|
||||
self.stats.warnings_printed) % self.stats.warnings_printed)
|
||||
self.write(
|
||||
_n("%d warning found", "%d warnings found", self.stats.warnings_printed)
|
||||
% self.stats.warnings_printed
|
||||
)
|
||||
if self.stats.warnings != self.stats.warnings_printed:
|
||||
self.write(_(" (%d ignored or duplicates not printed)") %
|
||||
(self.stats.warnings - self.stats.warnings_printed))
|
||||
self.write(
|
||||
_(" (%d ignored or duplicates not printed)")
|
||||
% (self.stats.warnings - self.stats.warnings_printed)
|
||||
)
|
||||
self.write(". ")
|
||||
self.write(_n("%d error found", "%d errors found",
|
||||
self.stats.errors_printed) % self.stats.errors_printed)
|
||||
self.write(
|
||||
_n("%d error found", "%d errors found", self.stats.errors_printed)
|
||||
% self.stats.errors_printed
|
||||
)
|
||||
if self.stats.errors != self.stats.errors_printed:
|
||||
self.write(_(" (%d duplicates not printed)") %
|
||||
(self.stats.errors - self.stats.errors_printed))
|
||||
self.write(
|
||||
_(" (%d duplicates not printed)")
|
||||
% (self.stats.errors - self.stats.errors_printed)
|
||||
)
|
||||
self.writeln(".")
|
||||
self.writeln("<br/>")
|
||||
num = self.stats.internal_errors
|
||||
if num:
|
||||
self.write(_n("There was %(num)d internal error.",
|
||||
"There were %(num)d internal errors.", num) % {"num": num})
|
||||
self.write(
|
||||
_n(
|
||||
"There was %(num)d internal error.",
|
||||
"There were %(num)d internal errors.",
|
||||
num,
|
||||
)
|
||||
% {"num": num}
|
||||
)
|
||||
self.writeln("<br/>")
|
||||
self.stoptime = time.time()
|
||||
duration = self.stoptime - self.starttime
|
||||
self.writeln(_("Stopped checking at %(time)s (%(duration)s)") %
|
||||
{"time": strformat.strtime(self.stoptime),
|
||||
"duration": strformat.strduration_long(duration)})
|
||||
self.writeln('</blockquote><br/><hr><small>'+
|
||||
configuration.HtmlAppInfo+"<br/>")
|
||||
self.writeln(_("Get the newest version at %s") %
|
||||
('<a href="'+configuration.Url+'" target="_top">'+
|
||||
configuration.Url+"</a>.<br/>"))
|
||||
self.writeln(_("Write comments and bugs to %s") %
|
||||
('<a href="'+configuration.SupportUrl+'">'+
|
||||
configuration.SupportUrl+"</a>.<br/>"))
|
||||
self.writeln(
|
||||
_("Stopped checking at %(time)s (%(duration)s)")
|
||||
% {
|
||||
"time": strformat.strtime(self.stoptime),
|
||||
"duration": strformat.strduration_long(duration),
|
||||
}
|
||||
)
|
||||
self.writeln(
|
||||
'</blockquote><br/><hr><small>' + configuration.HtmlAppInfo + "<br/>"
|
||||
)
|
||||
self.writeln(
|
||||
_("Get the newest version at %s")
|
||||
% (
|
||||
'<a href="'
|
||||
+ configuration.Url
|
||||
+ '" target="_top">'
|
||||
+ configuration.Url
|
||||
+ "</a>.<br/>"
|
||||
)
|
||||
)
|
||||
self.writeln(
|
||||
_("Write comments and bugs to %s")
|
||||
% (
|
||||
'<a href="'
|
||||
+ configuration.SupportUrl
|
||||
+ '">'
|
||||
+ configuration.SupportUrl
|
||||
+ "</a>.<br/>"
|
||||
)
|
||||
)
|
||||
self.writeln("</small></body></html>")
|
||||
|
||||
def end_output(self, **kwargs):
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@ ChangeFreqs = (
|
|||
HTTP_SCHEMES = ('http:', 'https:')
|
||||
HTML_TYPES = ('text/html', "application/xhtml+xml")
|
||||
|
||||
|
||||
class SitemapXmlLogger(xmllog._XMLLogger):
|
||||
"""Sitemap XML output according to http://www.sitemaps.org/protocol.html
|
||||
"""
|
||||
|
|
@ -81,7 +82,11 @@ class SitemapXmlLogger(xmllog._XMLLogger):
|
|||
# initialize prefix and priority
|
||||
if self.prefix is None:
|
||||
if not url_data.url.startswith(HTTP_SCHEMES):
|
||||
log.warn(LOG_CHECK, "Sitemap URL %r does not start with http: or https:.", url_data.url)
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
"Sitemap URL %r does not start with http: or https:.",
|
||||
url_data.url,
|
||||
)
|
||||
self.disabled = True
|
||||
return
|
||||
self.prefix = url_data.url
|
||||
|
|
@ -94,11 +99,13 @@ class SitemapXmlLogger(xmllog._XMLLogger):
|
|||
priority = 0.5
|
||||
if self.priority is not None:
|
||||
priority = self.priority
|
||||
# ignore the do_print flag and determine ourselves if we filter the url
|
||||
if (url_data.valid
|
||||
# ignore the do_print flag and determine ourselves if we filter the url
|
||||
if (
|
||||
url_data.valid
|
||||
and url_data.url.startswith(HTTP_SCHEMES)
|
||||
and url_data.url.startswith(self.prefix)
|
||||
and url_data.content_type in HTML_TYPES):
|
||||
and url_data.content_type in HTML_TYPES
|
||||
):
|
||||
self.log_url(url_data, priority=priority)
|
||||
|
||||
def log_url(self, url_data, priority=None):
|
||||
|
|
|
|||
|
|
@ -87,47 +87,50 @@ class SQLLogger(_Logger):
|
|||
"""
|
||||
Store url check info into the database.
|
||||
"""
|
||||
self.writeln("insert into %(table)s(urlname,"
|
||||
"parentname,baseref,valid,result,warning,info,url,line,col,"
|
||||
"name,checktime,dltime,size,cached,level,modified) values ("
|
||||
"%(base_url)s,"
|
||||
"%(url_parent)s,"
|
||||
"%(base_ref)s,"
|
||||
"%(valid)d,"
|
||||
"%(result)s,"
|
||||
"%(warning)s,"
|
||||
"%(info)s,"
|
||||
"%(url)s,"
|
||||
"%(line)s,"
|
||||
"%(column)s,"
|
||||
"%(name)s,"
|
||||
"%(checktime)d,"
|
||||
"%(dltime)d,"
|
||||
"%(size)d,"
|
||||
"%(cached)d,"
|
||||
"%(level)d,"
|
||||
"%(modified)s"
|
||||
")%(separator)s" %
|
||||
{'table': self.dbname,
|
||||
'base_url': sqlify(url_data.base_url),
|
||||
'url_parent': sqlify((url_data.parent_url)),
|
||||
'base_ref': sqlify((url_data.base_ref)),
|
||||
'valid': intify(url_data.valid),
|
||||
'result': sqlify(url_data.result),
|
||||
'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)),
|
||||
'info': sqlify(os.linesep.join(url_data.info)),
|
||||
'url': sqlify(urlutil.url_quote(url_data.url, encoding="utf-8")),
|
||||
'line': 'NULL' if url_data.line is None else url_data.line,
|
||||
'column': 'NULL' if url_data.column is None else url_data.column,
|
||||
'name': sqlify(url_data.name),
|
||||
'checktime': url_data.checktime,
|
||||
'dltime': url_data.dltime,
|
||||
'size': url_data.size,
|
||||
'cached': 0,
|
||||
'separator': self.separator,
|
||||
"level": url_data.level,
|
||||
"modified": sqlify(self.format_modified(url_data.modified)),
|
||||
})
|
||||
self.writeln(
|
||||
"insert into %(table)s(urlname,"
|
||||
"parentname,baseref,valid,result,warning,info,url,line,col,"
|
||||
"name,checktime,dltime,size,cached,level,modified) values ("
|
||||
"%(base_url)s,"
|
||||
"%(url_parent)s,"
|
||||
"%(base_ref)s,"
|
||||
"%(valid)d,"
|
||||
"%(result)s,"
|
||||
"%(warning)s,"
|
||||
"%(info)s,"
|
||||
"%(url)s,"
|
||||
"%(line)s,"
|
||||
"%(column)s,"
|
||||
"%(name)s,"
|
||||
"%(checktime)d,"
|
||||
"%(dltime)d,"
|
||||
"%(size)d,"
|
||||
"%(cached)d,"
|
||||
"%(level)d,"
|
||||
"%(modified)s"
|
||||
")%(separator)s"
|
||||
% {
|
||||
'table': self.dbname,
|
||||
'base_url': sqlify(url_data.base_url),
|
||||
'url_parent': sqlify((url_data.parent_url)),
|
||||
'base_ref': sqlify((url_data.base_ref)),
|
||||
'valid': intify(url_data.valid),
|
||||
'result': sqlify(url_data.result),
|
||||
'warning': sqlify(os.linesep.join(x[1] for x in url_data.warnings)),
|
||||
'info': sqlify(os.linesep.join(url_data.info)),
|
||||
'url': sqlify(urlutil.url_quote(url_data.url, encoding="utf-8")),
|
||||
'line': 'NULL' if url_data.line is None else url_data.line,
|
||||
'column': 'NULL' if url_data.column is None else url_data.column,
|
||||
'name': sqlify(url_data.name),
|
||||
'checktime': url_data.checktime,
|
||||
'dltime': url_data.dltime,
|
||||
'size': url_data.size,
|
||||
'cached': 0,
|
||||
'separator': self.separator,
|
||||
"level": url_data.level,
|
||||
"modified": sqlify(self.format_modified(url_data.modified)),
|
||||
}
|
||||
)
|
||||
self.flush()
|
||||
|
||||
def end_output(self, **kwargs):
|
||||
|
|
|
|||
|
|
@ -38,18 +38,18 @@ class TextLogger(_Logger):
|
|||
|
||||
LoggerArgs = {
|
||||
"filename": "linkchecker-out.txt",
|
||||
'colorparent': "default",
|
||||
'colorurl': "default",
|
||||
'colorname': "default",
|
||||
'colorreal': "cyan",
|
||||
'colorbase': "purple",
|
||||
'colorvalid': "bold;green",
|
||||
'colorparent': "default",
|
||||
'colorurl': "default",
|
||||
'colorname': "default",
|
||||
'colorreal': "cyan",
|
||||
'colorbase': "purple",
|
||||
'colorvalid': "bold;green",
|
||||
'colorinvalid': "bold;red",
|
||||
'colorinfo': "default",
|
||||
'colorinfo': "default",
|
||||
'colorwarning': "bold;yellow",
|
||||
'colordltime': "default",
|
||||
'colordlsize': "default",
|
||||
'colorreset': "default",
|
||||
'colordltime': "default",
|
||||
'colordlsize': "default",
|
||||
'colorreset': "default",
|
||||
}
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
|
|
@ -95,14 +95,15 @@ class TextLogger(_Logger):
|
|||
"""Log introduction text."""
|
||||
self.writeln(configuration.AppInfo)
|
||||
self.writeln(configuration.Freeware)
|
||||
self.writeln(_("Get the newest version at %(url)s") %
|
||||
{'url': configuration.Url})
|
||||
self.writeln(_("Write comments and bugs to %(url)s") %
|
||||
{'url': configuration.SupportUrl})
|
||||
self.writeln(
|
||||
_("Get the newest version at %(url)s") % {'url': configuration.Url}
|
||||
)
|
||||
self.writeln(
|
||||
_("Write comments and bugs to %(url)s") % {'url': configuration.SupportUrl}
|
||||
)
|
||||
self.check_date()
|
||||
self.writeln()
|
||||
self.writeln(_("Start checking at %s") %
|
||||
strformat.strtime(self.starttime))
|
||||
self.writeln(_("Start checking at %s") % strformat.strtime(self.starttime))
|
||||
|
||||
def log_url(self, url_data):
|
||||
"""Write url checking info."""
|
||||
|
|
@ -175,20 +176,17 @@ class TextLogger(_Logger):
|
|||
def write_dltime(self, url_data):
|
||||
"""Write url_data.dltime."""
|
||||
self.write(self.part("dltime") + self.spaces("dltime"))
|
||||
self.writeln(_("%.3f seconds") % url_data.dltime,
|
||||
color=self.colordltime)
|
||||
self.writeln(_("%.3f seconds") % url_data.dltime, color=self.colordltime)
|
||||
|
||||
def write_size(self, url_data):
|
||||
"""Write url_data.size."""
|
||||
self.write(self.part("dlsize") + self.spaces("dlsize"))
|
||||
self.writeln(strformat.strsize(url_data.size),
|
||||
color=self.colordlsize)
|
||||
self.writeln(strformat.strsize(url_data.size), color=self.colordlsize)
|
||||
|
||||
def write_checktime(self, url_data):
|
||||
"""Write url_data.checktime."""
|
||||
self.write(self.part("checktime") + self.spaces("checktime"))
|
||||
self.writeln(_("%.3f seconds") % url_data.checktime,
|
||||
color=self.colordltime)
|
||||
self.writeln(_("%.3f seconds") % url_data.checktime, color=self.colordltime)
|
||||
|
||||
def write_info(self, url_data):
|
||||
"""Write url_data.info."""
|
||||
|
|
@ -225,60 +223,88 @@ class TextLogger(_Logger):
|
|||
if interrupt:
|
||||
self.writeln(_("The check has been interrupted; results are not complete."))
|
||||
self.write(_("That's it.") + " ")
|
||||
self.write(_n("%d link", "%d links",
|
||||
self.stats.number) % self.stats.number)
|
||||
self.write(_n("%d link", "%d links", self.stats.number) % self.stats.number)
|
||||
self.write(" ")
|
||||
if self.stats.num_urls is not None:
|
||||
self.write(_n("in %d URL", "in %d URLs",
|
||||
self.stats.num_urls) % self.stats.num_urls)
|
||||
self.write(
|
||||
_n("in %d URL", "in %d URLs", self.stats.num_urls) % self.stats.num_urls
|
||||
)
|
||||
self.write(" checked. ")
|
||||
warning_text = _n("%d warning found", "%d warnings found",
|
||||
self.stats.warnings_printed) % self.stats.warnings_printed
|
||||
warning_text = (
|
||||
_n("%d warning found", "%d warnings found", self.stats.warnings_printed)
|
||||
% self.stats.warnings_printed
|
||||
)
|
||||
if self.stats.warnings_printed:
|
||||
warning_color = self.colorwarning
|
||||
else:
|
||||
warning_color = self.colorinfo
|
||||
self.write(warning_text, color=warning_color)
|
||||
if self.stats.warnings != self.stats.warnings_printed:
|
||||
self.write(_(" (%d ignored or duplicates not printed)") %
|
||||
(self.stats.warnings - self.stats.warnings_printed))
|
||||
self.write(
|
||||
_(" (%d ignored or duplicates not printed)")
|
||||
% (self.stats.warnings - self.stats.warnings_printed)
|
||||
)
|
||||
self.write(". ")
|
||||
error_text = _n("%d error found", "%d errors found",
|
||||
self.stats.errors_printed) % self.stats.errors_printed
|
||||
error_text = (
|
||||
_n("%d error found", "%d errors found", self.stats.errors_printed)
|
||||
% self.stats.errors_printed
|
||||
)
|
||||
if self.stats.errors_printed:
|
||||
error_color = self.colorinvalid
|
||||
else:
|
||||
error_color = self.colorvalid
|
||||
self.write(error_text, color=error_color)
|
||||
if self.stats.errors != self.stats.errors_printed:
|
||||
self.write(_(" (%d duplicates not printed)") %
|
||||
(self.stats.errors - self.stats.errors_printed))
|
||||
self.write(
|
||||
_(" (%d duplicates not printed)")
|
||||
% (self.stats.errors - self.stats.errors_printed)
|
||||
)
|
||||
self.writeln(".")
|
||||
num = self.stats.internal_errors
|
||||
if num:
|
||||
self.writeln(_n("There was %(num)d internal error.",
|
||||
"There were %(num)d internal errors.", num) % {"num": num})
|
||||
self.writeln(
|
||||
_n(
|
||||
"There was %(num)d internal error.",
|
||||
"There were %(num)d internal errors.",
|
||||
num,
|
||||
)
|
||||
% {"num": num}
|
||||
)
|
||||
self.stoptime = time.time()
|
||||
duration = self.stoptime - self.starttime
|
||||
self.writeln(_("Stopped checking at %(time)s (%(duration)s)") %
|
||||
{"time": strformat.strtime(self.stoptime),
|
||||
"duration": strformat.strduration_long(duration)})
|
||||
self.writeln(
|
||||
_("Stopped checking at %(time)s (%(duration)s)")
|
||||
% {
|
||||
"time": strformat.strtime(self.stoptime),
|
||||
"duration": strformat.strduration_long(duration),
|
||||
}
|
||||
)
|
||||
|
||||
def write_stats(self):
|
||||
"""Write check statistic info."""
|
||||
self.writeln()
|
||||
self.writeln(_("Statistics:"))
|
||||
if self.stats.downloaded_bytes is not None:
|
||||
self.writeln(_("Downloaded: %s.") % strformat.strsize(self.stats.downloaded_bytes))
|
||||
self.writeln(
|
||||
_("Downloaded: %s.") % strformat.strsize(self.stats.downloaded_bytes)
|
||||
)
|
||||
if self.stats.number > 0:
|
||||
self.writeln(_(
|
||||
"Content types: %(image)d image, %(text)d text, %(video)d video, "
|
||||
"%(audio)d audio, %(application)d application, %(mail)d mail"
|
||||
" and %(other)d other.") % self.stats.link_types)
|
||||
self.writeln(_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.") %
|
||||
dict(min=self.stats.min_url_length,
|
||||
max=self.stats.max_url_length,
|
||||
avg=self.stats.avg_url_length))
|
||||
self.writeln(
|
||||
_(
|
||||
"Content types: %(image)d image, %(text)d text, %(video)d video, "
|
||||
"%(audio)d audio, %(application)d application, %(mail)d mail"
|
||||
" and %(other)d other."
|
||||
)
|
||||
% self.stats.link_types
|
||||
)
|
||||
self.writeln(
|
||||
_("URL lengths: min=%(min)d, max=%(max)d, avg=%(avg)d.")
|
||||
% dict(
|
||||
min=self.stats.min_url_length,
|
||||
max=self.stats.max_url_length,
|
||||
avg=self.stats.avg_url_length,
|
||||
)
|
||||
)
|
||||
else:
|
||||
self.writeln(_("No statistics available since no URLs were checked."))
|
||||
|
||||
|
|
|
|||
|
|
@ -66,8 +66,10 @@ class _XMLLogger(_Logger):
|
|||
"""
|
||||
Write start of checking info as xml comment.
|
||||
"""
|
||||
self.writeln('<?xml version="1.0" encoding="%s"?>' %
|
||||
xmlquoteattr(self.get_charset_encoding()))
|
||||
self.writeln(
|
||||
'<?xml version="1.0" encoding="%s"?>'
|
||||
% xmlquoteattr(self.get_charset_encoding())
|
||||
)
|
||||
if self.has_part("intro"):
|
||||
self.write_intro()
|
||||
self.writeln()
|
||||
|
|
@ -83,7 +85,7 @@ class _XMLLogger(_Logger):
|
|||
"""
|
||||
Write XML start tag.
|
||||
"""
|
||||
self.write(self.indent*self.level)
|
||||
self.write(self.indent * self.level)
|
||||
self.write("<%s" % xmlquote(name))
|
||||
if attrs:
|
||||
for name, value in attrs.items():
|
||||
|
|
@ -98,14 +100,14 @@ class _XMLLogger(_Logger):
|
|||
"""
|
||||
self.level -= 1
|
||||
assert self.level >= 0
|
||||
self.write(self.indent*self.level)
|
||||
self.write(self.indent * self.level)
|
||||
self.writeln("</%s>" % xmlquote(name))
|
||||
|
||||
def xml_tag(self, name, content, attrs=None):
|
||||
"""
|
||||
Write XML tag with content.
|
||||
"""
|
||||
self.write(self.indent*self.level)
|
||||
self.write(self.indent * self.level)
|
||||
self.write("<%s" % xmlquote(name))
|
||||
if attrs:
|
||||
for aname, avalue in attrs.items():
|
||||
|
|
|
|||
|
|
@ -22,9 +22,9 @@ from . import strformat, log, LOG_CHECK
|
|||
from .fileutil import get_temp_file
|
||||
|
||||
# Message to display when meliae package is not installed
|
||||
MemoryDebugMsg = strformat.format_feature_warning(module='meliae',
|
||||
feature='memory debugging',
|
||||
url='https://launchpad.net/meliae')
|
||||
MemoryDebugMsg = strformat.format_feature_warning(
|
||||
module='meliae', feature='memory debugging', url='https://launchpad.net/meliae'
|
||||
)
|
||||
|
||||
|
||||
def write_memory_dump():
|
||||
|
|
@ -37,10 +37,10 @@ def write_memory_dump():
|
|||
if gc.garbage:
|
||||
log.warn(LOG_CHECK, "Unreachabe objects: %s", pprint.pformat(gc.garbage))
|
||||
from meliae import scanner
|
||||
|
||||
fo, filename = get_temp_file(mode='wb', suffix='.json', prefix='lcdump_')
|
||||
try:
|
||||
scanner.dump_all_objects(fo)
|
||||
finally:
|
||||
fo.close()
|
||||
return filename
|
||||
|
||||
|
|
|
|||
|
|
@ -26,6 +26,7 @@ from .logconf import LOG_CHECK
|
|||
|
||||
mimedb = None
|
||||
|
||||
|
||||
def init_mimedb():
|
||||
"""Initialize the local MIME database."""
|
||||
global mimedb
|
||||
|
|
@ -59,6 +60,7 @@ PARSE_CONTENTS = {
|
|||
"application/xml+sitemap": re.compile(r'<\?xml[^<]+<urlset\s+', re.IGNORECASE),
|
||||
}
|
||||
|
||||
|
||||
def guess_mimetype(filename, read=None):
|
||||
"""Return MIME type of file, or 'application/octet-stream' if it could
|
||||
not be determined."""
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ import re
|
|||
import socket
|
||||
from .. import log, LOG_CHECK
|
||||
|
||||
|
||||
def is_valid_ip(ip):
|
||||
"""
|
||||
Return True if given ip is a valid IPv4 or IPv6 address.
|
||||
|
|
|
|||
|
|
@ -26,7 +26,11 @@ def parse_url(url_data):
|
|||
if url_data.is_directory():
|
||||
# both ftp and file links represent directories as HTML data
|
||||
key = "html"
|
||||
elif url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url):
|
||||
elif (
|
||||
url_data.is_file()
|
||||
and firefox.has_sqlite
|
||||
and firefox.extension.search(url_data.url)
|
||||
):
|
||||
key = "firefox"
|
||||
elif url_data.scheme == "itms-services":
|
||||
key = "itms_services"
|
||||
|
|
@ -34,7 +38,7 @@ def parse_url(url_data):
|
|||
# determine parse routine according to content types
|
||||
mime = url_data.content_type
|
||||
key = url_data.ContentMimetypes[mime]
|
||||
funcname = "parse_"+key
|
||||
funcname = "parse_" + key
|
||||
if funcname in globals():
|
||||
globals()[funcname](url_data)
|
||||
else:
|
||||
|
|
@ -51,6 +55,7 @@ def parse_html(url_data):
|
|||
def parse_opera(url_data):
|
||||
"""Parse an opera bookmark file."""
|
||||
from ..bookmarks.opera import parse_bookmark_data
|
||||
|
||||
for url, name, lineno in parse_bookmark_data(url_data.get_content()):
|
||||
url_data.add_url(url, line=lineno, name=name)
|
||||
|
||||
|
|
@ -58,6 +63,7 @@ def parse_opera(url_data):
|
|||
def parse_chromium(url_data):
|
||||
"""Parse a Chromium or Google Chrome bookmark file."""
|
||||
from ..bookmarks.chromium import parse_bookmark_data
|
||||
|
||||
for url, name in parse_bookmark_data(url_data.get_content()):
|
||||
url_data.add_url(url, name=name)
|
||||
|
||||
|
|
@ -65,6 +71,7 @@ def parse_chromium(url_data):
|
|||
def parse_safari(url_data):
|
||||
"""Parse a Safari bookmark file."""
|
||||
from ..bookmarks.safari import parse_bookmark_data
|
||||
|
||||
for url, name in parse_bookmark_data(url_data.get_raw_content()):
|
||||
url_data.add_url(url, name=name)
|
||||
|
||||
|
|
@ -124,8 +131,9 @@ def parse_firefox(url_data):
|
|||
def parse_itms_services(url_data):
|
||||
"""Get "url" CGI parameter value as child URL."""
|
||||
query = url_data.urlparts[3]
|
||||
for k, v, sep in urlutil.parse_qsl(query, encoding=url_data.encoding,
|
||||
keep_blank_values=True):
|
||||
for k, v, sep in urlutil.parse_qsl(
|
||||
query, encoding=url_data.encoding, keep_blank_values=True
|
||||
):
|
||||
if k == "url":
|
||||
url_data.add_url(v)
|
||||
break
|
||||
|
|
|
|||
|
|
@ -18,7 +18,8 @@ Main functions for link parsing
|
|||
"""
|
||||
from xml.parsers.expat import ParserCreate
|
||||
from xml.parsers.expat import ExpatError
|
||||
from ..checker.const import (WARN_XML_PARSE_ERROR)
|
||||
from ..checker.const import WARN_XML_PARSE_ERROR
|
||||
|
||||
|
||||
class XmlTagUrlParser:
|
||||
"""Parse XML files and find URLs in text content of a tag name."""
|
||||
|
|
@ -42,11 +43,11 @@ class XmlTagUrlParser:
|
|||
try:
|
||||
self.parser.Parse(data, isfinal)
|
||||
except ExpatError as expaterr:
|
||||
self.url_data.add_warning(expaterr.message,tag=WARN_XML_PARSE_ERROR)
|
||||
self.url_data.add_warning(expaterr.message, tag=WARN_XML_PARSE_ERROR)
|
||||
|
||||
def start_element(self, name, attrs):
|
||||
"""Set tag status for start element."""
|
||||
self.in_tag = (name == self.tag)
|
||||
self.in_tag = name == self.tag
|
||||
self.url = ""
|
||||
|
||||
def end_element(self, name):
|
||||
|
|
@ -58,8 +59,11 @@ class XmlTagUrlParser:
|
|||
def add_url(self):
|
||||
"""Add non-empty URLs to the queue."""
|
||||
if self.url:
|
||||
self.url_data.add_url(self.url, line=self.parser.CurrentLineNumber,
|
||||
column=self.parser.CurrentColumnNumber)
|
||||
self.url_data.add_url(
|
||||
self.url,
|
||||
line=self.parser.CurrentLineNumber,
|
||||
column=self.parser.CurrentColumnNumber,
|
||||
)
|
||||
self.url = ""
|
||||
|
||||
def char_data(self, data):
|
||||
|
|
|
|||
|
|
@ -46,21 +46,23 @@ class _PluginBase:
|
|||
|
||||
class _ConnectionPlugin(_PluginBase):
|
||||
"""Plugins run after connection checks."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class _ContentPlugin(_PluginBase):
|
||||
"""Plugins run for valid URLs with content."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
class _ParserPlugin(_PluginBase):
|
||||
"""Plugins run for valid URLs to parse their contents."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
def get_plugin_modules(folders, package='plugins',
|
||||
parentpackage='linkcheck.dummy'):
|
||||
def get_plugin_modules(folders, package='plugins', parentpackage='linkcheck.dummy'):
|
||||
"""Get plugin modules for given folders."""
|
||||
for folder in folders:
|
||||
for module in loader.get_folder_modules(folder, parentpackage):
|
||||
|
|
@ -114,7 +116,9 @@ class PluginManager:
|
|||
|
||||
def run_parser_plugins(self, url_data, pagetype):
|
||||
"""Run parser plugins for given pagetype."""
|
||||
run_plugins(self.parser_plugins, url_data, stop_after_match=True, pagetype=pagetype)
|
||||
run_plugins(
|
||||
self.parser_plugins, url_data, stop_after_match=True, pagetype=pagetype
|
||||
)
|
||||
|
||||
|
||||
def run_plugins(plugins, url_data, stop_after_match=False, **kwargs):
|
||||
|
|
|
|||
|
|
@ -35,8 +35,7 @@ class AnchorCheck(_ContentPlugin):
|
|||
log.debug(LOG_PLUGIN, "checking content for invalid anchors")
|
||||
# list of parsed anchors
|
||||
self.anchors = []
|
||||
linkparse.find_links(url_data.get_soup(), self.add_anchor,
|
||||
linkparse.AnchorTags)
|
||||
linkparse.find_links(url_data.get_soup(), self.add_anchor, linkparse.AnchorTags)
|
||||
self.check_anchor(url_data)
|
||||
|
||||
def add_anchor(self, url, line, column, name, base):
|
||||
|
|
@ -56,6 +55,8 @@ class AnchorCheck(_ContentPlugin):
|
|||
else:
|
||||
anchors = "-"
|
||||
args = {"name": url_data.anchor, "anchors": anchors}
|
||||
msg = "%s %s" % (_("Anchor `%(name)s' not found.") % args,
|
||||
_("Available anchors: %(anchors)s.") % args)
|
||||
msg = "%s %s" % (
|
||||
_("Anchor `%(name)s' not found.") % args,
|
||||
_("Available anchors: %(anchors)s.") % args,
|
||||
)
|
||||
url_data.add_warning(msg)
|
||||
|
|
|
|||
|
|
@ -38,7 +38,10 @@ class HttpHeaderInfo(_ConnectionPlugin):
|
|||
if name.lower().startswith(self.prefixes):
|
||||
headers.append(name.lower())
|
||||
if headers:
|
||||
items = ["%s=%s" % (name.capitalize(), url_data.headers[name]) for name in headers]
|
||||
items = [
|
||||
"%s=%s" % (name.capitalize(), url_data.headers[name])
|
||||
for name in headers
|
||||
]
|
||||
info = "HTTP headers %s" % ", ".join(items)
|
||||
url_data.add_info(info)
|
||||
|
||||
|
|
@ -55,4 +58,3 @@ class HttpHeaderInfo(_ConnectionPlugin):
|
|||
names = []
|
||||
config[option] = names
|
||||
return config
|
||||
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ from ..decorators import synchronized
|
|||
from ..strformat import unicode_safe
|
||||
from .. import log, LOG_PLUGIN
|
||||
|
||||
|
||||
class LocationInfo(_ConnectionPlugin):
|
||||
"""Adds the country and if possible city name of the URL host as info.
|
||||
Needs GeoIP or pygeoip and a local country or city lookup DB installed."""
|
||||
|
|
@ -43,13 +44,16 @@ class LocationInfo(_ConnectionPlugin):
|
|||
"""Try to ask GeoIP database for country info."""
|
||||
location = get_location(url_data.host)
|
||||
if location:
|
||||
url_data.add_info(_("URL is located in %(location)s.") %
|
||||
{"location": _(location)})
|
||||
url_data.add_info(
|
||||
_("URL is located in %(location)s.") % {"location": _(location)}
|
||||
)
|
||||
|
||||
|
||||
# It is unknown if the geoip library is already thread-safe, so
|
||||
# no risks should be taken here by using a lock.
|
||||
_lock = get_lock("geoip")
|
||||
|
||||
|
||||
def get_geoip_dat():
|
||||
"""Find a GeoIP database, preferring city over country lookup."""
|
||||
datafiles = ("GeoIPCity.dat", "GeoIP.dat")
|
||||
|
|
@ -63,17 +67,20 @@ def get_geoip_dat():
|
|||
if os.path.isfile(filename):
|
||||
return filename
|
||||
|
||||
|
||||
# try importing both the C-library GeoIP and the pure-python pygeoip
|
||||
geoip_dat = get_geoip_dat()
|
||||
geoip = None
|
||||
if geoip_dat:
|
||||
try:
|
||||
import GeoIP
|
||||
|
||||
geoip = GeoIP.open(geoip_dat, GeoIP.GEOIP_STANDARD)
|
||||
geoip_error = GeoIP.error
|
||||
except ImportError:
|
||||
try:
|
||||
import pygeoip
|
||||
|
||||
geoip = pygeoip.GeoIP(geoip_dat)
|
||||
geoip_error = pygeoip.GeoIPError
|
||||
except ImportError:
|
||||
|
|
@ -81,7 +88,9 @@ if geoip_dat:
|
|||
if geoip_dat.endswith('GeoIPCity.dat'):
|
||||
get_geoip_record = lambda host: geoip.record_by_name(host)
|
||||
else:
|
||||
get_geoip_record = lambda host: {'country_name': geoip.country_name_by_name(host)}
|
||||
get_geoip_record = lambda host: {
|
||||
'country_name': geoip.country_name_by_name(host)
|
||||
}
|
||||
|
||||
|
||||
@synchronized(_lock)
|
||||
|
|
|
|||
|
|
@ -37,8 +37,10 @@ class MarkdownCheck(_ContentPlugin):
|
|||
_filename_re_key = "filename_re"
|
||||
_default_filename_re = re.compile(r'.*\.(markdown|md(own)?|mkdn?)$')
|
||||
|
||||
_link_res = [re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
|
||||
re.compile(r"""
|
||||
_link_res = [
|
||||
re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
|
||||
re.compile(
|
||||
r"""
|
||||
\[.+\]: # id
|
||||
[ \t]*\n? # maybe *one* newline
|
||||
[ \t]*
|
||||
|
|
@ -54,20 +56,26 @@ class MarkdownCheck(_ContentPlugin):
|
|||
[ \t]*
|
||||
)? # title is optional
|
||||
(?:\n+|\Z)
|
||||
""", re.X | re.M | re.U)]
|
||||
""",
|
||||
re.X | re.M | re.U,
|
||||
),
|
||||
]
|
||||
|
||||
_whitespace = re.compile(r'\s*')
|
||||
|
||||
_strip_anglebrackets = re.compile(r'<(.*)>.*')
|
||||
|
||||
_inline_link_title = re.compile(r'''
|
||||
_inline_link_title = re.compile(
|
||||
r'''
|
||||
( # \1
|
||||
[ \t]+
|
||||
(['"]) # quote char
|
||||
(.*?)
|
||||
)? # title is optional
|
||||
\)$
|
||||
''', re.X | re.S)
|
||||
''',
|
||||
re.X | re.S,
|
||||
)
|
||||
|
||||
def __init__(self, config):
|
||||
super(MarkdownCheck, self).__init__(config)
|
||||
|
|
@ -83,8 +91,11 @@ class MarkdownCheck(_ContentPlugin):
|
|||
def read_config(cls, configparser):
|
||||
"""Read configuration file options."""
|
||||
config = dict()
|
||||
config[cls._filename_re_key] = configparser.get(cls.__name__, cls._filename_re_key) \
|
||||
if configparser.has_option(cls.__name__, cls._filename_re_key) else None
|
||||
config[cls._filename_re_key] = (
|
||||
configparser.get(cls.__name__, cls._filename_re_key)
|
||||
if configparser.has_option(cls.__name__, cls._filename_re_key)
|
||||
else None
|
||||
)
|
||||
return config
|
||||
|
||||
def applies_to(self, url_data, pagetype=None):
|
||||
|
|
@ -107,7 +118,9 @@ class MarkdownCheck(_ContentPlugin):
|
|||
"""
|
||||
line = content.count('\n', 0, url_pos) + 1
|
||||
column = url_pos - content.rfind('\n', 0, url_pos)
|
||||
url_data.add_url(url_text.translate(str.maketrans("", "", '\n ')), line=line, column=column)
|
||||
url_data.add_url(
|
||||
url_text.translate(str.maketrans("", "", '\n ')), line=line, column=column
|
||||
)
|
||||
|
||||
def _check_by_re(self, url_data, content):
|
||||
""" Finds urls by re.
|
||||
|
|
@ -144,12 +157,12 @@ class MarkdownCheck(_ContentPlugin):
|
|||
end_idx = idx
|
||||
has_anglebrackets = text[idx] == "<"
|
||||
if has_anglebrackets:
|
||||
end_idx = self._find_balanced(text, end_idx+1, "<", ">")
|
||||
end_idx = self._find_balanced(text, end_idx + 1, "<", ">")
|
||||
end_idx = self._find_balanced(text, end_idx, "(", ")")
|
||||
match = self._inline_link_title.search(text, idx, end_idx)
|
||||
if not match:
|
||||
return None, None
|
||||
url = text[idx:match.start()]
|
||||
url = text[idx : match.start()]
|
||||
if has_anglebrackets:
|
||||
url = self._strip_anglebrackets.sub(r'\1', url)
|
||||
return url, end_idx
|
||||
|
|
@ -175,7 +188,9 @@ class MarkdownCheck(_ContentPlugin):
|
|||
|
||||
# Find the matching closing ']'.
|
||||
bracket_depth = 0
|
||||
for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, content_length)):
|
||||
for p in range(
|
||||
start_idx + 1, min(start_idx + MAX_LINK_TEXT_SENTINEL, content_length)
|
||||
):
|
||||
if content[p] == ']':
|
||||
bracket_depth -= 1
|
||||
if bracket_depth < 0:
|
||||
|
|
|
|||
|
|
@ -19,6 +19,7 @@ Parse links in PDF files with pdfminer.
|
|||
from io import BytesIO
|
||||
|
||||
from . import _ParserPlugin
|
||||
|
||||
try:
|
||||
from pdfminer.pdfparser import PDFParser
|
||||
from pdfminer.pdfdocument import PDFDocument
|
||||
|
|
@ -32,7 +33,6 @@ else:
|
|||
from .. import log, LOG_PLUGIN, strformat
|
||||
|
||||
|
||||
|
||||
def search_url(obj, url_data, pageno, seen_objs):
|
||||
"""Recurse through a PDF object, searching for URLs."""
|
||||
if isinstance(obj, PDFObjRef):
|
||||
|
|
|
|||
|
|
@ -17,9 +17,11 @@
|
|||
Parse hyperlinks in Word files.
|
||||
"""
|
||||
from . import _ParserPlugin
|
||||
|
||||
try:
|
||||
import win32com
|
||||
import pythoncom
|
||||
|
||||
has_win32com = True
|
||||
Error = pythoncom.com_error
|
||||
except ImportError:
|
||||
|
|
@ -29,14 +31,17 @@ from .. import fileutil, log, LOG_PLUGIN
|
|||
|
||||
|
||||
_initialized = False
|
||||
|
||||
|
||||
def init_win32com():
|
||||
"""Initialize the win32com.client cache."""
|
||||
global _initialized
|
||||
if _initialized:
|
||||
return
|
||||
import win32com.client
|
||||
|
||||
if win32com.client.gencache.is_readonly:
|
||||
#allow gencache to create the cached wrapper objects
|
||||
# allow gencache to create the cached wrapper objects
|
||||
win32com.client.gencache.is_readonly = False
|
||||
# under py2exe the call in gencache to __init__() does not happen
|
||||
# so we use Rebuild() to force the creation of the gen_py folder
|
||||
|
|
@ -79,6 +84,7 @@ def get_word_app():
|
|||
# the COM layer.
|
||||
pythoncom.CoInitialize()
|
||||
import win32com.client
|
||||
|
||||
app = win32com.client.gencache.EnsureDispatch("Word.Application")
|
||||
app.Visible = False
|
||||
return app
|
||||
|
|
@ -91,8 +97,13 @@ def close_word_app(app):
|
|||
|
||||
def open_wordfile(app, filename):
|
||||
"""Open given Word file with application object."""
|
||||
return app.Documents.Open(filename, ReadOnly=True,
|
||||
AddToRecentFiles=False, Visible=False, NoEncodingDialog=True)
|
||||
return app.Documents.Open(
|
||||
filename,
|
||||
ReadOnly=True,
|
||||
AddToRecentFiles=False,
|
||||
Visible=False,
|
||||
NoEncodingDialog=True,
|
||||
)
|
||||
|
||||
|
||||
def close_wordfile(doc):
|
||||
|
|
@ -128,7 +139,7 @@ class WordParser(_ParserPlugin):
|
|||
try:
|
||||
for link in doc.Hyperlinks:
|
||||
line = get_line_number(link.Range)
|
||||
name=link.TextToDisplay
|
||||
name = link.TextToDisplay
|
||||
url_data.add_url(link.Address, name=name, line=line)
|
||||
finally:
|
||||
close_wordfile(doc)
|
||||
|
|
@ -158,11 +169,9 @@ def get_line_number(doc, wrange):
|
|||
def get_temp_filename(content):
|
||||
"""Get temporary filename for content to parse."""
|
||||
# store content in temporary file
|
||||
fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc',
|
||||
prefix='lc_')
|
||||
fd, filename = fileutil.get_temp_file(mode='wb', suffix='.doc', prefix='lc_')
|
||||
try:
|
||||
fd.write(content)
|
||||
finally:
|
||||
fd.close()
|
||||
return filename
|
||||
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ _lock = threading.Lock()
|
|||
# configuration option names
|
||||
sslcertwarndays = "sslcertwarndays"
|
||||
|
||||
|
||||
class SslCertificateCheck(_ConnectionPlugin):
|
||||
"""Check SSL certificate expiration date. Only internal https: links
|
||||
will be checked. A domain will only be checked once to avoid duplicate
|
||||
|
|
@ -37,14 +38,20 @@ class SslCertificateCheck(_ConnectionPlugin):
|
|||
def __init__(self, config):
|
||||
"""Initialize clamav configuration."""
|
||||
super(SslCertificateCheck, self).__init__(config)
|
||||
self.warn_ssl_cert_secs_valid = config[sslcertwarndays] * strformat.SECONDS_PER_DAY
|
||||
self.warn_ssl_cert_secs_valid = (
|
||||
config[sslcertwarndays] * strformat.SECONDS_PER_DAY
|
||||
)
|
||||
# do not check hosts multiple times
|
||||
self.checked_hosts = set()
|
||||
|
||||
def applies_to(self, url_data):
|
||||
"""Check validity, scheme, extern and url_connection."""
|
||||
return url_data.valid and url_data.scheme == 'https' and \
|
||||
not url_data.extern[0] and url_data.url_connection is not None
|
||||
return (
|
||||
url_data.valid
|
||||
and url_data.scheme == 'https'
|
||||
and not url_data.extern[0]
|
||||
and url_data.url_connection is not None
|
||||
)
|
||||
|
||||
@synchronized(_lock)
|
||||
def check(self, url_data):
|
||||
|
|
@ -71,6 +78,7 @@ class SslCertificateCheck(_ConnectionPlugin):
|
|||
if it's at least a number of days valid.
|
||||
"""
|
||||
import ssl
|
||||
|
||||
try:
|
||||
notAfter = ssl.cert_time_to_seconds(cert['notAfter'])
|
||||
except ValueError as msg:
|
||||
|
|
@ -88,7 +96,9 @@ class SslCertificateCheck(_ConnectionPlugin):
|
|||
else:
|
||||
args['valid'] = strformat.strduration_long(secondsValid)
|
||||
if secondsValid < self.warn_ssl_cert_secs_valid:
|
||||
msg = _('SSL certificate expires on %(expire)s and is only %(valid)s valid.')
|
||||
msg = _(
|
||||
'SSL certificate expires on %(expire)s and is only %(valid)s valid.'
|
||||
)
|
||||
url_data.add_warning(msg % args)
|
||||
else:
|
||||
msg = _('SSL certificate expires on %(expire)s and is %(valid)s valid.')
|
||||
|
|
@ -105,7 +115,11 @@ class SslCertificateCheck(_ConnectionPlugin):
|
|||
if num > 0:
|
||||
config[option] = num
|
||||
else:
|
||||
msg = _("invalid value for %s: %d must not be less than %d") % (option, num, 0)
|
||||
msg = _("invalid value for %s: %d must not be less than %d") % (
|
||||
option,
|
||||
num,
|
||||
0,
|
||||
)
|
||||
raise LinkCheckerError(msg)
|
||||
else:
|
||||
# set the default
|
||||
|
|
|
|||
|
|
@ -47,6 +47,7 @@ class HtmlSyntaxCheck(_ContentPlugin):
|
|||
"""Check the syntax of HTML pages with the online W3C HTML validator.
|
||||
See http://validator.w3.org/docs/api.html.
|
||||
"""
|
||||
|
||||
def __init__(self, config):
|
||||
"""Initialize plugin."""
|
||||
super(HtmlSyntaxCheck, self).__init__(config)
|
||||
|
|
@ -69,9 +70,11 @@ class HtmlSyntaxCheck(_ContentPlugin):
|
|||
return
|
||||
check_w3_errors(url_data, response.text, "W3C HTML")
|
||||
except requests.exceptions.RequestException:
|
||||
pass # ignore service failures
|
||||
pass # ignore service failures
|
||||
except Exception as msg:
|
||||
log.warn(LOG_PLUGIN, _("HTML syntax check plugin error: %(msg)s ") % {"msg": msg})
|
||||
log.warn(
|
||||
LOG_PLUGIN, _("HTML syntax check plugin error: %(msg)s ") % {"msg": msg}
|
||||
)
|
||||
|
||||
|
||||
class CssSyntaxCheck(_ContentPlugin):
|
||||
|
|
@ -106,9 +109,11 @@ class CssSyntaxCheck(_ContentPlugin):
|
|||
return
|
||||
check_w3_errors(url_data, response.text, "W3C HTML")
|
||||
except requests.exceptions.RequestException:
|
||||
pass # ignore service failures
|
||||
pass # ignore service failures
|
||||
except Exception as msg:
|
||||
log.warn(LOG_PLUGIN, _("CSS syntax check plugin error: %(msg)s ") % {"msg": msg})
|
||||
log.warn(
|
||||
LOG_PLUGIN, _("CSS syntax check plugin error: %(msg)s ") % {"msg": msg}
|
||||
)
|
||||
|
||||
|
||||
def check_w3_errors(url_data, xml, w3type):
|
||||
|
|
@ -116,7 +121,9 @@ def check_w3_errors(url_data, xml, w3type):
|
|||
w3type is either "W3C HTML" or "W3C CSS"."""
|
||||
dom = parseString(xml)
|
||||
for error in dom.getElementsByTagName('m:error'):
|
||||
warnmsg = _("%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s")
|
||||
warnmsg = _(
|
||||
"%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s"
|
||||
)
|
||||
attrs = {
|
||||
"w3type": w3type,
|
||||
"line": getXmlText(error, "m:line"),
|
||||
|
|
|
|||
|
|
@ -67,6 +67,7 @@ class VirusCheck(_ContentPlugin):
|
|||
|
||||
class ClamavError(Exception):
|
||||
"""Raised on clamav errors."""
|
||||
|
||||
pass
|
||||
|
||||
|
||||
|
|
@ -78,8 +79,7 @@ class ClamdScanner:
|
|||
self.infected = []
|
||||
self.errors = []
|
||||
self.sock, self.host = clamav_conf.new_connection()
|
||||
self.sock_rcvbuf = \
|
||||
self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF)
|
||||
self.sock_rcvbuf = self.sock.getsockopt(socket.SOL_SOCKET, socket.SO_RCVBUF)
|
||||
self.wsock = self.new_scansock()
|
||||
|
||||
def new_scansock(self):
|
||||
|
|
@ -92,7 +92,7 @@ class ClamdScanner:
|
|||
data = self.sock.recv(self.sock_rcvbuf)
|
||||
i = data.find(b"PORT")
|
||||
if i != -1:
|
||||
port = int(data[i+5:])
|
||||
port = int(data[i + 5 :])
|
||||
break
|
||||
except socket.error:
|
||||
self.sock.close()
|
||||
|
|
@ -159,7 +159,9 @@ class ClamavConfig(dict):
|
|||
if self.get('ScannerDaemonOutputFormat'):
|
||||
raise ClamavError(_("ScannerDaemonOutputFormat must be disabled"))
|
||||
if self.get('TCPSocket') and self.get('LocalSocket'):
|
||||
raise ClamavError(_("only one of TCPSocket and LocalSocket must be enabled"))
|
||||
raise ClamavError(
|
||||
_("only one of TCPSocket and LocalSocket must be enabled")
|
||||
)
|
||||
|
||||
def parseconf(self, filename):
|
||||
"""Parse clamav configuration from given file."""
|
||||
|
|
|
|||
|
|
@ -35,8 +35,7 @@ class RobotFileParser:
|
|||
"""This class provides a set of methods to read, parse and answer
|
||||
questions about a single robots.txt file."""
|
||||
|
||||
def __init__(self, url='', session=None, proxies=None, auth=None,
|
||||
timeout=None):
|
||||
def __init__(self, url='', session=None, proxies=None, auth=None, timeout=None):
|
||||
"""Initialize internal entry lists and store given url and
|
||||
credentials."""
|
||||
self.set_url(url)
|
||||
|
|
@ -85,7 +84,7 @@ class RobotFileParser:
|
|||
"""Read the robots.txt URL and feeds it to the parser."""
|
||||
self._reset()
|
||||
kwargs = dict(
|
||||
headers = {
|
||||
headers={
|
||||
'User-Agent': configuration.UserAgent,
|
||||
'Accept-Encoding': ACCEPT_ENCODING,
|
||||
}
|
||||
|
|
@ -109,7 +108,12 @@ class RobotFileParser:
|
|||
except requests.HTTPError as x:
|
||||
if x.response.status_code in (401, 403):
|
||||
self.disallow_all = True
|
||||
log.debug(LOG_CHECK, "%r disallow all (code %d)", self.url, x.response.status_code)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r disallow all (code %d)",
|
||||
self.url,
|
||||
x.response.status_code,
|
||||
)
|
||||
else:
|
||||
self.allow_all = True
|
||||
log.debug(LOG_CHECK, "%r allow all (HTTP error)", self.url)
|
||||
|
|
@ -148,7 +152,12 @@ class RobotFileParser:
|
|||
linenumber += 1
|
||||
if not line:
|
||||
if state == 1:
|
||||
log.debug(LOG_CHECK, "%r line %d: allow or disallow directives without any user-agent line", self.url, linenumber)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r line %d: allow or disallow directives without any user-agent line",
|
||||
self.url,
|
||||
linenumber,
|
||||
)
|
||||
entry = Entry()
|
||||
state = 0
|
||||
elif state == 2:
|
||||
|
|
@ -168,35 +177,61 @@ class RobotFileParser:
|
|||
line[1] = urllib.parse.unquote(line[1].strip(), self.encoding)
|
||||
if line[0] == "user-agent":
|
||||
if state == 2:
|
||||
log.debug(LOG_CHECK, "%r line %d: missing blank line before user-agent directive", self.url, linenumber)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r line %d: missing blank line before user-agent directive",
|
||||
self.url,
|
||||
linenumber,
|
||||
)
|
||||
self._add_entry(entry)
|
||||
entry = Entry()
|
||||
entry.useragents.append(line[1])
|
||||
state = 1
|
||||
elif line[0] == "disallow":
|
||||
if state == 0:
|
||||
log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r line %d: missing user-agent directive before this line",
|
||||
self.url,
|
||||
linenumber,
|
||||
)
|
||||
pass
|
||||
else:
|
||||
entry.rulelines.append(RuleLine(line[1], False))
|
||||
state = 2
|
||||
elif line[0] == "allow":
|
||||
if state == 0:
|
||||
log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r line %d: missing user-agent directive before this line",
|
||||
self.url,
|
||||
linenumber,
|
||||
)
|
||||
pass
|
||||
else:
|
||||
entry.rulelines.append(RuleLine(line[1], True))
|
||||
state = 2
|
||||
elif line[0] == "crawl-delay":
|
||||
if state == 0:
|
||||
log.debug(LOG_CHECK, "%r line %d: missing user-agent directive before this line", self.url, linenumber)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r line %d: missing user-agent directive before this line",
|
||||
self.url,
|
||||
linenumber,
|
||||
)
|
||||
pass
|
||||
else:
|
||||
try:
|
||||
entry.crawldelay = max(0, int(line[1]))
|
||||
state = 2
|
||||
except (ValueError, OverflowError):
|
||||
log.debug(LOG_CHECK, "%r line %d: invalid delay number %r", self.url, linenumber, line[1])
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r line %d: invalid delay number %r",
|
||||
self.url,
|
||||
linenumber,
|
||||
line[1],
|
||||
)
|
||||
pass
|
||||
elif line[0] == "sitemap":
|
||||
# Note that sitemap URLs must be absolute according to
|
||||
|
|
@ -204,10 +239,22 @@ class RobotFileParser:
|
|||
# But this should be checked by the calling layer.
|
||||
self.sitemap_urls.append((line[1], linenumber))
|
||||
else:
|
||||
log.debug(LOG_CHECK, "%r line %d: unknown key %r", self.url, linenumber, line[0])
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r line %d: unknown key %r",
|
||||
self.url,
|
||||
linenumber,
|
||||
line[0],
|
||||
)
|
||||
pass
|
||||
else:
|
||||
log.debug(LOG_CHECK, "%r line %d: malformed line %r", self.url, linenumber, line)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r line %d: malformed line %r",
|
||||
self.url,
|
||||
linenumber,
|
||||
line,
|
||||
)
|
||||
pass
|
||||
if state in (1, 2):
|
||||
self.entries.append(entry)
|
||||
|
|
@ -220,7 +267,13 @@ class RobotFileParser:
|
|||
@return: True if agent can fetch url, else False
|
||||
@rtype: bool
|
||||
"""
|
||||
log.debug(LOG_CHECK, "%r check allowance for:\n user agent: %r\n url: %r ...", self.url, useragent, url)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
"%r check allowance for:\n user agent: %r\n url: %r ...",
|
||||
self.url,
|
||||
useragent,
|
||||
url,
|
||||
)
|
||||
if not isinstance(useragent, str):
|
||||
useragent = useragent.encode("ascii", "ignore")
|
||||
if not isinstance(url, str):
|
||||
|
|
@ -233,7 +286,10 @@ class RobotFileParser:
|
|||
return True
|
||||
# search for given user agent matches
|
||||
# the first match counts
|
||||
url = urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
|
||||
url = (
|
||||
urllib.parse.quote(urllib.parse.urlparse(urllib.parse.unquote(url))[2])
|
||||
or "/"
|
||||
)
|
||||
for entry in self.entries:
|
||||
if entry.applies_to(useragent):
|
||||
return entry.allowance(url)
|
||||
|
|
@ -296,7 +352,7 @@ class RuleLine:
|
|||
@return: robots.txt format
|
||||
@rtype: string
|
||||
"""
|
||||
return ("Allow" if self.allowance else "Disallow")+": "+self.path
|
||||
return ("Allow" if self.allowance else "Disallow") + ": " + self.path
|
||||
|
||||
|
||||
class Entry:
|
||||
|
|
@ -352,5 +408,10 @@ class Entry:
|
|||
if line.applies_to(filename):
|
||||
log.debug(LOG_CHECK, " ... rule line %s", line)
|
||||
return line.allowance
|
||||
log.debug(LOG_CHECK, " ... no rule lines of %s applied to %s; allowed.", self.useragents, filename)
|
||||
log.debug(
|
||||
LOG_CHECK,
|
||||
" ... no rule lines of %s applied to %s; allowed.",
|
||||
self.useragents,
|
||||
filename,
|
||||
)
|
||||
return True
|
||||
|
|
|
|||
|
|
@ -123,6 +123,7 @@ _para_posix = r"(?:%(sep)s)(?:(?:%(sep)s)\s*)+" % {'sep': '\n'}
|
|||
_para_win = r"(?:%(sep)s)(?:(?:%(sep)s)\s*)+" % {'sep': '\r\n'}
|
||||
_para_ro = re.compile("%s|%s|%s" % (_para_mac, _para_posix, _para_win))
|
||||
|
||||
|
||||
def get_paragraphs(text):
|
||||
"""A new paragraph is considered to start at a line which follows
|
||||
one or more blank lines (lines containing nothing or just spaces).
|
||||
|
|
@ -148,8 +149,7 @@ def wrap(text, width, **kwargs):
|
|||
|
||||
def indent(text, indent_string=" "):
|
||||
"""Indent each line of text with the given indent string."""
|
||||
return os.linesep.join("%s%s" % (indent_string, x)
|
||||
for x in text.splitlines())
|
||||
return os.linesep.join("%s%s" % (indent_string, x) for x in text.splitlines())
|
||||
|
||||
|
||||
def get_line_number(s, index):
|
||||
|
|
@ -173,11 +173,12 @@ def paginate(text):
|
|||
|
||||
_markup_re = re.compile("<.*?>", re.DOTALL)
|
||||
|
||||
|
||||
def remove_markup(s):
|
||||
"""Remove all <*> html markup tags from s."""
|
||||
mo = _markup_re.search(s)
|
||||
while mo:
|
||||
s = s[0:mo.start()] + s[mo.end():]
|
||||
s = s[0 : mo.start()] + s[mo.end() :]
|
||||
mo = _markup_re.search(s)
|
||||
return s
|
||||
|
||||
|
|
@ -194,12 +195,20 @@ def strsize(b, grouping=True):
|
|||
if b < 1024 * 1024:
|
||||
return "%sKB" % locale.format_string("%.2f", (float(b) / 1024), grouping)
|
||||
if b < 1024 * 1024 * 10:
|
||||
return "%sMB" % locale.format_string("%.2f", (float(b) / (1024*1024)), grouping)
|
||||
return "%sMB" % locale.format_string(
|
||||
"%.2f", (float(b) / (1024 * 1024)), grouping
|
||||
)
|
||||
if b < 1024 * 1024 * 1024:
|
||||
return "%sMB" % locale.format_string("%.1f", (float(b) / (1024*1024)), grouping)
|
||||
return "%sMB" % locale.format_string(
|
||||
"%.1f", (float(b) / (1024 * 1024)), grouping
|
||||
)
|
||||
if b < 1024 * 1024 * 1024 * 10:
|
||||
return "%sGB" % locale.format_string("%.2f", (float(b) / (1024*1024*1024)), grouping)
|
||||
return "%sGB" % locale.format_string("%.1f", (float(b) / (1024*1024*1024)), grouping)
|
||||
return "%sGB" % locale.format_string(
|
||||
"%.2f", (float(b) / (1024 * 1024 * 1024)), grouping
|
||||
)
|
||||
return "%sGB" % locale.format_string(
|
||||
"%.1f", (float(b) / (1024 * 1024 * 1024)), grouping
|
||||
)
|
||||
|
||||
|
||||
def strtime(t, func=time.localtime):
|
||||
|
|
@ -216,15 +225,21 @@ def strduration(duration):
|
|||
else:
|
||||
prefix = ""
|
||||
duration = math.ceil(duration)
|
||||
if duration >= SECONDS_PER_HOUR: # 1 hour
|
||||
if duration >= SECONDS_PER_HOUR: # 1 hour
|
||||
# time, in hours:minutes:seconds
|
||||
return "%s%02d:%02d:%02d" % (prefix, duration // SECONDS_PER_HOUR,
|
||||
(duration % SECONDS_PER_HOUR) // SECONDS_PER_MINUTE,
|
||||
duration % SECONDS_PER_MINUTE)
|
||||
return "%s%02d:%02d:%02d" % (
|
||||
prefix,
|
||||
duration // SECONDS_PER_HOUR,
|
||||
(duration % SECONDS_PER_HOUR) // SECONDS_PER_MINUTE,
|
||||
duration % SECONDS_PER_MINUTE,
|
||||
)
|
||||
else:
|
||||
# time, in minutes:seconds
|
||||
return "%s%02d:%02d" % (prefix, duration // SECONDS_PER_MINUTE,
|
||||
duration % SECONDS_PER_MINUTE)
|
||||
return "%s%02d:%02d" % (
|
||||
prefix,
|
||||
duration // SECONDS_PER_MINUTE,
|
||||
duration % SECONDS_PER_MINUTE,
|
||||
)
|
||||
|
||||
|
||||
# from quodlibet
|
||||
|
|
@ -236,15 +251,17 @@ def strduration_long(duration, do_translate=True):
|
|||
else:
|
||||
# do not translate
|
||||
_ = lambda x: x
|
||||
_n = lambda a, b, n: a if n==1 else b
|
||||
_n = lambda a, b, n: a if n == 1 else b
|
||||
if duration < 0:
|
||||
duration = abs(duration)
|
||||
prefix = "-"
|
||||
else:
|
||||
prefix = ""
|
||||
if duration < 1:
|
||||
return _("%(prefix)s%(duration).02f seconds") % \
|
||||
{"prefix": prefix, "duration": duration}
|
||||
return _("%(prefix)s%(duration).02f seconds") % {
|
||||
"prefix": prefix,
|
||||
"duration": duration,
|
||||
}
|
||||
# translation dummies
|
||||
_n("%d second", "%d seconds", 1)
|
||||
_n("%d minute", "%d minutes", 1)
|
||||
|
|
@ -281,7 +298,7 @@ def strtimezone():
|
|||
zone = time.altzone
|
||||
else:
|
||||
zone = time.timezone
|
||||
return "%+04d" % (-zone//SECONDS_PER_HOUR)
|
||||
return "%+04d" % (-zone // SECONDS_PER_HOUR)
|
||||
|
||||
|
||||
def stripurl(s):
|
||||
|
|
@ -319,7 +336,12 @@ def format_feature_warning(**kwargs):
|
|||
"""Format warning that a module could not be imported and that it should
|
||||
be installed for a certain URL.
|
||||
"""
|
||||
return _("Could not import %(module)s for %(feature)s. Install %(module)s from %(url)s to use this feature.") % kwargs
|
||||
return (
|
||||
_(
|
||||
"Could not import %(module)s for %(feature)s. Install %(module)s from %(url)s to use this feature."
|
||||
)
|
||||
% kwargs
|
||||
)
|
||||
|
||||
|
||||
def strip_control_chars(text):
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ def _trace(frame, event, arg):
|
|||
elif event in ('return', 'c_return'):
|
||||
_trace_line(frame, event, arg)
|
||||
print(" return:", arg)
|
||||
#elif event in ('exception', 'c_exception'):
|
||||
# elif event in ('exception', 'c_exception'):
|
||||
# _trace_line(frame, event, arg)
|
||||
return _trace
|
||||
|
||||
|
|
|
|||
|
|
@ -24,7 +24,9 @@ from distutils.version import LooseVersion
|
|||
|
||||
# Use the Freecode submit file as source since that file gets updated
|
||||
# only when releasing a new version.
|
||||
UPDATE_URL = "https://raw.github.com/linkchecker/linkchecker/master/linkchecker.freecode"
|
||||
UPDATE_URL = (
|
||||
"https://raw.github.com/linkchecker/linkchecker/master/linkchecker.freecode"
|
||||
)
|
||||
VERSION_TAG = 'Version:'
|
||||
if os.name == 'nt':
|
||||
URL_TAG = 'Windows-installer-URL:'
|
||||
|
|
|
|||
|
|
@ -60,20 +60,23 @@ _basic = {
|
|||
"_hex_full": r"0-9a-f",
|
||||
"_part": r"([a-z0-9][-a-z0-9]{0,61}|[a-z])",
|
||||
}
|
||||
_safe_char = r"([a-z0-9%(_path)s\+]|"\
|
||||
r"(%%[%(_hex_safe)s][%(_hex_full)s]))" % _basic
|
||||
_safe_char = r"([a-z0-9%(_path)s\+]|" r"(%%[%(_hex_safe)s][%(_hex_full)s]))" % _basic
|
||||
_safe_scheme_pattern = r"(https?|ftp)"
|
||||
_safe_domain_pattern = r"(%(_part)s(\.%(_part)s)*\.?)" % _basic
|
||||
_safe_host_pattern = _safe_domain_pattern+r"(:(80|8080|8000|443))?" % _basic
|
||||
_safe_path_pattern = r"((/([a-z0-9%(_path)s]|"\
|
||||
r"(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _basic
|
||||
_safe_host_pattern = _safe_domain_pattern + r"(:(80|8080|8000|443))?" % _basic
|
||||
_safe_path_pattern = (
|
||||
r"((/([a-z0-9%(_path)s]|" r"(%%[%(_hex_safe)s][%(_hex_full)s]))+)*/?)" % _basic
|
||||
)
|
||||
_safe_fragment_pattern = r"%s*" % _safe_char
|
||||
_safe_cgi = r"%s+(=(%s|/)+)?" % (_safe_char, _safe_char)
|
||||
_safe_query_pattern = r"(%s(&%s)*)?" % (_safe_cgi, _safe_cgi)
|
||||
_safe_param_pattern = r"(%s(;%s)*)?" % (_safe_cgi, _safe_cgi)
|
||||
safe_url_pattern = r"%s://%s%s(#%s)?" % \
|
||||
(_safe_scheme_pattern, _safe_host_pattern,
|
||||
_safe_path_pattern, _safe_fragment_pattern)
|
||||
safe_url_pattern = r"%s://%s%s(#%s)?" % (
|
||||
_safe_scheme_pattern,
|
||||
_safe_host_pattern,
|
||||
_safe_path_pattern,
|
||||
_safe_fragment_pattern,
|
||||
)
|
||||
|
||||
is_safe_char = re.compile("(?i)^%s$" % _safe_char).match
|
||||
is_safe_url = re.compile("(?i)^%s$" % safe_url_pattern).match
|
||||
|
|
@ -96,7 +99,7 @@ def splitparams(path):
|
|||
i = path.find(';')
|
||||
if i < 0:
|
||||
return path, ''
|
||||
return path[:i], path[i+1:]
|
||||
return path[:i], path[i + 1 :]
|
||||
|
||||
|
||||
def is_numeric_port(portstr):
|
||||
|
|
@ -113,8 +116,12 @@ def is_numeric_port(portstr):
|
|||
|
||||
def safe_host_pattern(host):
|
||||
"""Return regular expression pattern with given host for URL testing."""
|
||||
return "(?i)%s://%s%s(#%s)?" % \
|
||||
(_safe_scheme_pattern, host, _safe_path_pattern, _safe_fragment_pattern)
|
||||
return "(?i)%s://%s%s(#%s)?" % (
|
||||
_safe_scheme_pattern,
|
||||
host,
|
||||
_safe_path_pattern,
|
||||
_safe_fragment_pattern,
|
||||
)
|
||||
|
||||
|
||||
def parse_qsl(qs, encoding, keep_blank_values=0, strict_parsing=0):
|
||||
|
|
@ -190,18 +197,23 @@ def url_fix_host(urlparts, encoding):
|
|||
userpass, netloc = urllib.parse.splituser(urlparts[1])
|
||||
if userpass:
|
||||
userpass = urllib.parse.unquote(userpass, encoding=encoding)
|
||||
netloc, is_idn = idna_encode(urllib.parse.unquote(netloc, encoding=encoding).lower())
|
||||
netloc, is_idn = idna_encode(
|
||||
urllib.parse.unquote(netloc, encoding=encoding).lower()
|
||||
)
|
||||
# a leading backslash in path causes urlsplit() to add the
|
||||
# path components up to the first slash to host
|
||||
# try to find this case...
|
||||
i = netloc.find("\\")
|
||||
if i != -1:
|
||||
# ...and fix it by prepending the misplaced components to the path
|
||||
comps = netloc[i:] # note: still has leading backslash
|
||||
comps = netloc[i:] # note: still has leading backslash
|
||||
if not urlparts[2] or urlparts[2] == '/':
|
||||
urlparts[2] = comps
|
||||
else:
|
||||
urlparts[2] = "%s%s" % (comps, urllib.parse.unquote(urlparts[2], encoding=encoding))
|
||||
urlparts[2] = "%s%s" % (
|
||||
comps,
|
||||
urllib.parse.unquote(urlparts[2], encoding=encoding),
|
||||
)
|
||||
netloc = netloc[:i]
|
||||
else:
|
||||
# a leading ? in path causes urlsplit() to add the query to the
|
||||
|
|
@ -224,7 +236,7 @@ def url_fix_host(urlparts, encoding):
|
|||
if port != dport:
|
||||
host = "%s:%d" % (host, port)
|
||||
netloc = host
|
||||
urlparts[1] = userpass+netloc
|
||||
urlparts[1] = userpass + netloc
|
||||
return is_idn
|
||||
|
||||
|
||||
|
|
@ -243,21 +255,25 @@ def url_fix_mailto_urlsplit(urlparts):
|
|||
if sep in urlparts[2]:
|
||||
urlparts[2], urlparts[3] = urlparts[2].split(sep, 1)
|
||||
|
||||
|
||||
# wayback urls include in the path http[s]://. By default the
|
||||
# tidying mechanism in linkchecker encodes the : and deletes the second slash
|
||||
# This function reverses these corrections. This function expects only the
|
||||
# path section of the URL as input.
|
||||
wayback_regex = re.compile(r'(https?)(\%3A/|:/)')
|
||||
|
||||
|
||||
def url_fix_wayback_query(path):
|
||||
return wayback_regex.sub(r'\1://', path)
|
||||
|
||||
|
||||
def url_parse_query(query, encoding):
|
||||
"""Parse and re-join the given CGI query."""
|
||||
# if ? is in the query, split it off, seen at msdn.microsoft.com
|
||||
append = ""
|
||||
while '?' in query:
|
||||
query, rest = query.rsplit('?', 1)
|
||||
append = '?'+url_parse_query(rest, encoding=encoding)+append
|
||||
append = '?' + url_parse_query(rest, encoding=encoding) + append
|
||||
l = []
|
||||
for k, v, sep in parse_qsl(query, keep_blank_values=True, encoding=encoding):
|
||||
k = urllib.parse.quote(k, safe='/-:,;')
|
||||
|
|
@ -316,12 +332,14 @@ def url_norm(url, encoding):
|
|||
# anchor
|
||||
urlparts[4] = urllib.parse.unquote(urlparts[4], encoding=encoding)
|
||||
# quote parts again
|
||||
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = urllib.parse.quote(urlparts[1], safe='@:') # host
|
||||
urlparts[2] = urllib.parse.quote(urlparts[2], safe=_nopathquote_chars) # path
|
||||
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = urllib.parse.quote(urlparts[1], safe='@:') # host
|
||||
urlparts[2] = urllib.parse.quote(urlparts[2], safe=_nopathquote_chars) # path
|
||||
if not urlparts[0].startswith("feed"):
|
||||
urlparts[2] = url_fix_wayback_query(urlparts[2]) # unencode colon in http[s]:// in wayback path
|
||||
urlparts[4] = urllib.parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor
|
||||
urlparts[2] = url_fix_wayback_query(
|
||||
urlparts[2]
|
||||
) # unencode colon in http[s]:// in wayback path
|
||||
urlparts[4] = urllib.parse.quote(urlparts[4], safe="!$&'()*+,-./;=?@_~") # anchor
|
||||
res = urlunsplit(urlparts)
|
||||
if url.endswith('#') and not urlparts[4]:
|
||||
# re-append trailing empty fragment
|
||||
|
|
@ -334,6 +352,8 @@ _thisdir_ro = re.compile(r"^\./")
|
|||
_samedir_ro = re.compile(r"/\./|/\.$")
|
||||
_parentdir_ro = re.compile(r"^/(\.\./)+|/(?!\.\./)[^/]+/\.\.(/|$)")
|
||||
_relparentdir_ro = re.compile(r"^(?!\.\./)[^/]+/\.\.(/|$)")
|
||||
|
||||
|
||||
def collapse_segments(path):
|
||||
"""Remove all redundant segments from the given URL path.
|
||||
Precondition: path is an unquoted url path"""
|
||||
|
|
@ -375,12 +395,14 @@ def url_quote(url, encoding):
|
|||
if not url_is_absolute(url):
|
||||
return document_quote(url)
|
||||
urlparts = list(urllib.parse.urlsplit(url))
|
||||
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = urllib.parse.quote(urlparts[1], safe=':') # host
|
||||
urlparts[2] = urllib.parse.quote(urlparts[2], safe='/=,') # path
|
||||
urlparts[3] = urllib.parse.quote(urlparts[3], safe='&=,') # query
|
||||
urlparts[0] = urllib.parse.quote(urlparts[0]) # scheme
|
||||
urlparts[1] = urllib.parse.quote(urlparts[1], safe=':') # host
|
||||
urlparts[2] = urllib.parse.quote(urlparts[2], safe='/=,') # path
|
||||
urlparts[3] = urllib.parse.quote(urlparts[3], safe='&=,') # query
|
||||
l = []
|
||||
for k, v, sep in parse_qsl(urlparts[3], encoding=encoding, keep_blank_values=True): # query
|
||||
for k, v, sep in parse_qsl(
|
||||
urlparts[3], encoding=encoding, keep_blank_values=True
|
||||
): # query
|
||||
k = urllib.parse.quote(k, safe='/-:,;')
|
||||
if v:
|
||||
v = urllib.parse.quote(v, safe='/-:,;')
|
||||
|
|
@ -388,7 +410,7 @@ def url_quote(url, encoding):
|
|||
else:
|
||||
l.append("%s%s" % (k, sep))
|
||||
urlparts[3] = ''.join(l)
|
||||
urlparts[4] = urllib.parse.quote(urlparts[4]) # anchor
|
||||
urlparts[4] = urllib.parse.quote(urlparts[4]) # anchor
|
||||
return urlunsplit(urlparts)
|
||||
|
||||
|
||||
|
|
@ -425,8 +447,10 @@ def match_host(host, domainlist):
|
|||
_nopathquote_chars = "-;/=,~*+()@!"
|
||||
if os.name == 'nt':
|
||||
_nopathquote_chars += "|"
|
||||
_safe_url_chars = re.escape(_nopathquote_chars + "_:.&#%?[]!")+"a-zA-Z0-9"
|
||||
_safe_url_chars = re.escape(_nopathquote_chars + "_:.&#%?[]!") + "a-zA-Z0-9"
|
||||
_safe_url_chars_ro = re.compile(r"^[%s]*$" % _safe_url_chars)
|
||||
|
||||
|
||||
def url_needs_quoting(url):
|
||||
"""Check if url needs percent quoting. Note that the method does
|
||||
only check basic character sets, and not any other syntax.
|
||||
|
|
@ -487,8 +511,7 @@ def splitport(host, port=0):
|
|||
return host, port
|
||||
|
||||
|
||||
def get_content(url, user=None, password=None, proxy=None, data=None,
|
||||
addheaders=None):
|
||||
def get_content(url, user=None, password=None, proxy=None, data=None, addheaders=None):
|
||||
"""Get URL content and info.
|
||||
|
||||
@return: (decoded text content of URL, headers) or
|
||||
|
|
@ -496,6 +519,7 @@ def get_content(url, user=None, password=None, proxy=None, data=None,
|
|||
@rtype: tuple (String, dict) or (None, String)
|
||||
"""
|
||||
from . import configuration
|
||||
|
||||
headers = {
|
||||
'User-Agent': configuration.UserAgent,
|
||||
}
|
||||
|
|
@ -511,6 +535,7 @@ def get_content(url, user=None, password=None, proxy=None, data=None,
|
|||
if proxy:
|
||||
kwargs['proxy'] = dict(http=proxy)
|
||||
from .configuration import get_share_file
|
||||
|
||||
try:
|
||||
kwargs["verify"] = get_share_file('cacert.pem')
|
||||
except ValueError:
|
||||
|
|
@ -518,10 +543,15 @@ def get_content(url, user=None, password=None, proxy=None, data=None,
|
|||
try:
|
||||
response = requests.request(method, url, **kwargs)
|
||||
return response.text, response.headers
|
||||
except (requests.exceptions.RequestException,
|
||||
requests.exceptions.BaseHTTPError) as msg:
|
||||
log.warn(LOG_CHECK, ("Could not get content of URL %(url)s: %(msg)s.") \
|
||||
% {"url": url, "msg": str(msg)})
|
||||
except (
|
||||
requests.exceptions.RequestException,
|
||||
requests.exceptions.BaseHTTPError,
|
||||
) as msg:
|
||||
log.warn(
|
||||
LOG_CHECK,
|
||||
("Could not get content of URL %(url)s: %(msg)s.")
|
||||
% {"url": url, "msg": str(msg)},
|
||||
)
|
||||
return None, str(msg)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue