linkchecker/linkcheck/checker/__init__.py
Chris Mayo ac0967e251 Fix remaining flake8 violations in linkcheck/
linkcheck/better_exchook2.py:28:89: E501 line too long (90 > 88 characters)
linkcheck/better_exchook2.py:155:9: E722 do not use bare 'except'
linkcheck/better_exchook2.py:166:9: E722 do not use bare 'except'
linkcheck/better_exchook2.py:289:13: E741 ambiguous variable name 'l'
linkcheck/better_exchook2.py:299:9: E722 do not use bare 'except'
linkcheck/containers.py:48:13: E731 do not assign a lambda expression, use a def
linkcheck/ftpparse.py:123:89: E501 line too long (93 > 88 characters)
linkcheck/loader.py:46:47: E203 whitespace before ':'
linkcheck/logconf.py:45:29: E231 missing whitespace after ','
linkcheck/robotparser2.py:157:89: E501 line too long (95 > 88 characters)
linkcheck/robotparser2.py:182:89: E501 line too long (89 > 88 characters)
linkcheck/strformat.py:181:16: E203 whitespace before ':'
linkcheck/strformat.py:181:43: E203 whitespace before ':'
linkcheck/strformat.py:253:9: E731 do not assign a lambda expression, use a def
linkcheck/strformat.py:254:9: E731 do not assign a lambda expression, use a def
linkcheck/strformat.py:341:89: E501 line too long (111 > 88 characters)
linkcheck/url.py:102:32: E203 whitespace before ':'
linkcheck/url.py:277:5: E741 ambiguous variable name 'l'
linkcheck/url.py:402:5: E741 ambiguous variable name 'l'
linkcheck/checker/__init__.py:203:1: E402 module level import not at top of file
linkcheck/checker/fileurl.py:200:89: E501 line too long (103 > 88 characters)
linkcheck/checker/mailtourl.py:122:60: E203 whitespace before ':'
linkcheck/checker/mailtourl.py:157:89: E501 line too long (96 > 88 characters)
linkcheck/checker/mailtourl.py:190:89: E501 line too long (109 > 88 characters)
linkcheck/checker/mailtourl.py:200:89: E501 line too long (111 > 88 characters)
linkcheck/checker/mailtourl.py:249:89: E501 line too long (106 > 88 characters)
linkcheck/checker/unknownurl.py:226:23: W291 trailing whitespace
linkcheck/checker/urlbase.py:245:89: E501 line too long (101 > 88 characters)
linkcheck/configuration/confparse.py:236:89: E501 line too long (186 > 88 characters)
linkcheck/configuration/confparse.py:247:89: E501 line too long (111 > 88 characters)
linkcheck/configuration/__init__.py:164:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:184:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:190:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:195:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:198:9: E266 too many leading '#' for block comment
linkcheck/configuration/__init__.py:435:89: E501 line too long (90 > 88 characters)
linkcheck/director/aggregator.py:45:43: E231 missing whitespace after ','
linkcheck/director/aggregator.py:178:89: E501 line too long (106 > 88 characters)
linkcheck/logger/__init__.py:29:1: E731 do not assign a lambda expression, use a def
linkcheck/logger/__init__.py:108:13: E741 ambiguous variable name 'l'
linkcheck/logger/__init__.py:275:19: F821 undefined name '_'
linkcheck/logger/__init__.py:342:16: F821 undefined name '_'
linkcheck/logger/__init__.py:380:13: F821 undefined name '_'
linkcheck/logger/__init__.py:384:13: F821 undefined name '_'
linkcheck/logger/__init__.py:387:13: F821 undefined name '_'
linkcheck/logger/__init__.py:396:13: F821 undefined name '_'
linkcheck/network/__init__.py:1:1: W391 blank line at end of file
linkcheck/plugins/locationinfo.py:89:9: E731 do not assign a lambda expression, use a def
linkcheck/plugins/locationinfo.py:91:9: E731 do not assign a lambda expression, use a def
linkcheck/plugins/markdowncheck.py:112:89: E501 line too long (111 > 88 characters)
linkcheck/plugins/markdowncheck.py:141:9: E741 ambiguous variable name 'l'
linkcheck/plugins/markdowncheck.py:165:23: E203 whitespace before ':'
linkcheck/plugins/viruscheck.py:95:42: E203 whitespace before ':'
2020-05-30 17:01:36 +01:00

214 lines
6.3 KiB
Python

# Copyright (C) 2000-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Main functions for link checking.
"""
import os
import html
import urllib.parse
from .. import strformat, url as urlutil, log, LOG_CHECK
MAX_FILESIZE = 1024 * 1024 * 10 # 10MB
def guess_url(url):
"""Guess if URL is a http or ftp URL.
@param url: the URL to check
@ptype url: unicode
@return: url with http:// or ftp:// prepended if it's detected as
a http respective ftp URL.
@rtype: unicode
"""
if url.lower().startswith("www."):
# syntactic sugar
return "http://%s" % url
elif url.lower().startswith("ftp."):
# syntactic sugar
return "ftp://%s" % url
return url
def absolute_url(base_url, base_ref, parent_url):
"""
Search for the absolute url to detect the link type. This does not
join any url fragments together!
@param base_url: base url from a link tag
@type base_url: string or None
@param base_ref: base url from <base> tag
@type base_ref: string or None
@param parent_url: url of parent document
@type parent_url: string or None
"""
if base_url and urlutil.url_is_absolute(base_url):
return base_url
elif base_ref and urlutil.url_is_absolute(base_ref):
return base_ref
elif parent_url and urlutil.url_is_absolute(parent_url):
return parent_url
return ""
def get_url_from(
base_url,
recursion_level,
aggregate,
parent_url=None,
base_ref=None,
line=None,
column=None,
page=0,
name="",
parent_content_type=None,
extern=None,
url_encoding=None,
):
"""
Get url data from given base data.
@param base_url: base url from a link tag
@type base_url: string or None
@param recursion_level: current recursion level
@type recursion_level: number
@param aggregate: aggregate object
@type aggregate: aggregate.Consumer
@param parent_url: parent url
@type parent_url: string or None
@param base_ref: base url from <base> tag
@type base_ref string or None
@param line: line number
@type line: number
@param column: column number
@type column: number
@param page: page number
@type page: number
@param name: link name
@type name: string
@param extern: (is_extern, is_strict) or None
@type extern: tuple(int, int) or None
"""
if base_url is not None:
base_url = strformat.unicode_safe(base_url)
# left strip for detection of URL scheme
base_url_stripped = base_url.lstrip()
else:
base_url_stripped = base_url
if parent_url is not None:
parent_url = strformat.unicode_safe(parent_url)
if base_ref is not None:
base_ref = strformat.unicode_safe(base_ref)
name = strformat.unicode_safe(name)
url = absolute_url(base_url_stripped, base_ref, parent_url).lower()
if ":" in url:
scheme = url.split(":", 1)[0].lower()
else:
scheme = None
if not (url or name):
# use filename as base url, with slash as path seperator
name = base_url.replace("\\", "/")
allowed_schemes = aggregate.config["allowedschemes"]
# ignore local PHP files with execution directives
local_php = (
parent_content_type == 'application/x-httpd-php'
and '<?' in base_url
and '?>' in base_url
and scheme == 'file'
)
if local_php or (allowed_schemes and scheme not in allowed_schemes):
klass = ignoreurl.IgnoreUrl
else:
assume_local_file = recursion_level == 0
klass = get_urlclass_from(scheme, assume_local_file=assume_local_file)
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
return klass(
base_url,
recursion_level,
aggregate,
parent_url=parent_url,
base_ref=base_ref,
line=line,
column=column,
page=page,
name=name,
extern=extern,
url_encoding=url_encoding,
)
def get_urlclass_from(scheme, assume_local_file=False):
"""Return checker class for given URL scheme. If the scheme
cannot be matched and assume_local_file is True, assume a local file.
"""
if scheme in ("http", "https"):
klass = httpurl.HttpUrl
elif scheme == "ftp":
klass = ftpurl.FtpUrl
elif scheme == "file":
klass = fileurl.FileUrl
elif scheme == "telnet":
klass = telneturl.TelnetUrl
elif scheme == "mailto":
klass = mailtourl.MailtoUrl
elif scheme in ("nntp", "news", "snews"):
klass = nntpurl.NntpUrl
elif scheme == "dns":
klass = dnsurl.DnsUrl
elif scheme == "itms-services":
klass = itmsservicesurl.ItmsServicesUrl
elif scheme and unknownurl.is_unknown_scheme(scheme):
klass = unknownurl.UnknownUrl
elif assume_local_file:
klass = fileurl.FileUrl
else:
klass = unknownurl.UnknownUrl
return klass
def get_index_html(urls):
"""
Construct artificial index.html from given URLs.
@param urls: URL strings
@type urls: iterator of string
"""
lines = ["<html>", "<body>"]
for entry in urls:
name = html.escape(entry)
try:
url = html.escape(urllib.parse.quote(entry))
except KeyError:
# Some unicode entries raise KeyError.
url = name
lines.append('<a href="%s">%s</a>' % (url, name))
lines.extend(["</body>", "</html>"])
return os.linesep.join(lines)
# all the URL classes
from . import ( # noqa: E402
fileurl,
unknownurl,
ftpurl,
httpurl,
dnsurl,
mailtourl,
telneturl,
nntpurl,
ignoreurl,
itmsservicesurl,
)