linkchecker/linkcheck/htmlutil/linkparse.py
Chris Mayo 1663e10fe7 Remove spaces after names in function definitions
This is a PEP 8 convention, E211.
2020-05-16 20:19:42 +01:00

225 lines
8.1 KiB
Python

# Copyright (C) 2001-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Find link tags in HTML text.
"""
import re
from .. import strformat, log, LOG_CHECK, url as urlutil
from builtins import str as str_text
unquote = strformat.unquote
# HTML4/5 link tags
# ripped mainly from HTML::Tagset.pm with HTML5 added
# Maps tag name -> list of attribute names whose values contain URLs to
# check.  The special key ``None`` lists attributes that are treated as
# URL-bearing on *any* tag (see LinkFinder.universal_attrs).
LinkTags = {
    'a': ['href'],
    'applet': ['archive', 'src'],
    'area': ['href'],
    'audio': ['src'], # HTML5
    'bgsound': ['src'],
    'blockquote': ['cite'],
    'body': ['background'],
    'button': ['formaction'], # HTML5
    'del': ['cite'],
    'embed': ['pluginspage', 'src'],
    'form': ['action'],
    'frame': ['src', 'longdesc'],
    'head': ['profile'],
    'html': ['manifest'], # HTML5
    'iframe': ['src', 'longdesc'],
    'ilayer': ['background'],
    'img': ['src', 'lowsrc', 'longdesc', 'usemap', 'srcset'],
    'input': ['src', 'usemap', 'formaction'],
    'ins': ['cite'],
    'isindex': ['action'],
    'layer': ['background', 'src'],
    'link': ['href'],
    'meta': ['content', 'href'],
    'object': ['classid', 'data', 'archive', 'usemap', 'codebase'],
    'q': ['cite'],
    'script': ['src'],
    'source': ['src'], # HTML5
    'table': ['background'],
    'td': ['background'],
    'th': ['background'],
    'tr': ['background'],
    'track': ['src'], # HTML5
    'video': ['src'], # HTML5
    'xmp': ['href'],
    None: ['style', 'itemtype'],
}

# HTML anchor tags
# Maps tag name -> attributes that define anchor names (link targets
# within a page); ``None`` again applies to every tag.
AnchorTags = {
    'a': ['name'],
    None: ['id'],
}

# WML tags
# URL-bearing attributes for WML (Wireless Markup Language) documents.
WmlTags = {
    'a': ['href'],
    'go': ['href'],
    'img': ['src'],
}

# matcher for <meta http-equiv=refresh> tags
# Note: only matches the "<seconds>;url=<target>" form; a bare refresh
# interval without a URL does not match.
refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")

# Matches a single-quoted, double-quoted, or bare (no closing paren, no
# whitespace) token -- used inside css_url_re below.
_quoted_pat = r"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
# matcher for CSS url(...) references
css_url_re = re.compile(r"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)

# Note that swf_url_re, unlike all other regular expressions here, is meant
# to match byte strings. Yes, we're scraping binary SWF data for anything
# that looks like a URL. What did you expect, a full SWF format decoder?
swf_url_re = re.compile(b"(?i)%s" % urlutil.safe_url_pattern.encode('ascii'))

# matcher for C/CSS-style block comments, spanning newlines
c_comment_re = re.compile(r"/\*.*?\*/", re.DOTALL)
def strip_c_comments(text):
    """Strip C/CSS-style ``/* ... */`` comments from text.

    Note that comments inside of string literals are deliberately
    removed as well.
    """
    return re.sub(r"/\*.*?\*/", "", text, flags=re.DOTALL)
def is_meta_url(attr, attrs):
    """Return True if the given ``<meta>`` attribute holds a URL.

    A ``content`` attribute carries a URL for refresh redirects or the
    ``dcterms.uri`` scheme; an ``href`` attribute carries a URL for
    (shortcut) icon relations.
    """
    if attr == "content":
        http_equiv = attrs.get('http-equiv', '').lower()
        scheme = attrs.get('scheme', '').lower()
        return http_equiv == 'refresh' or scheme == 'dcterms.uri'
    if attr == "href":
        return attrs.get('rel', '').lower() in ('shortcut icon', 'icon')
    return False
def is_form_get(attr, attrs):
    """Check whether a form ``action`` URL would be fetched with GET.

    Returns False for non-action attributes and for forms whose method
    is POST; any other (or missing) method defaults to GET.
    """
    if attr != "action":
        return False
    return attrs.get('method', '').lower() != 'post'
class LinkFinder:
    """Find HTML links, and apply them to the callback function with the
    format (url, lineno, column, name, codebase)."""

    def __init__(self, callback, tags):
        """Store the callback and build the tag -> attribute lookup table.

        @param callback: called as callback(url, line=, column=, name=, base=)
            for each found URL
        @param tags: mapping of tag name -> list of URL attributes; the
            ``None`` key lists attributes valid for every tag
        """
        self.callback = callback
        # set universal tag attributes using tagname None
        self.universal_attrs = set(tags.get(None, []))
        self.tags = dict()
        for tag, attrs in tags.items():
            self.tags[tag] = set(attrs)
            # add universal tag attributes
            self.tags[tag].update(self.universal_attrs)
        # document base reference from the first <base href=...> seen
        self.base_ref = ''

    def html_element(self, tag, attrs, element_text, lineno, column):
        """Search the given element for links and report each found URL
        through the callback."""
        log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
        log.debug(LOG_CHECK, "line %d col %d", lineno, column)
        # remember only the first <base href=...> of the document
        if tag == "base" and not self.base_ref:
            self.base_ref = attrs.get("href", '')
        tagattrs = self.tags.get(tag, self.universal_attrs)
        # parse URLs in tag (possibly multiple URLs in CSS styles)
        for attr in sorted(tagattrs.intersection(attrs)):
            if tag == "meta" and not is_meta_url(attr, attrs):
                continue
            if tag == "form" and not is_form_get(attr, attrs):
                continue
            # name of this link
            name = self.get_link_name(tag, attrs, attr, element_text)
            # possible codebase
            base = ''
            if tag == 'applet':
                base = attrs.get('codebase', '')
            if not base:
                base = self.base_ref
            # note: value can be None
            value = attrs.get(attr)
            if tag == 'link' and attrs.get('rel') == 'dns-prefetch':
                # rewrite as a dns: URL; guard against a missing href
                # value (would raise TypeError on ':' in None)
                prefetch = value or ''
                if ':' in prefetch:
                    prefetch = prefetch.split(':', 1)[1]
                value = 'dns:' + prefetch.rstrip('/')
            # parse tag for URLs
            self.parse_tag(tag, attr, value, name, base, lineno, column)
        log.debug(LOG_CHECK, "LinkFinder finished tag %s", tag)

    def get_link_name(self, tag, attrs, attr, name=None):
        """Parse attrs for link name. Return name of link.

        For ``<a href>`` the element text is preferred, falling back to
        the title attribute; for ``<img>`` the alt text is preferred,
        falling back to title; all other tags get an empty name.
        """
        if tag == 'a' and attr == 'href':
            if not name:
                name = attrs.get('title', '')
        elif tag == 'img':
            name = attrs.get('alt', '')
            if not name:
                name = attrs.get('title', '')
        else:
            name = ""
        return name

    def parse_tag(self, tag, attr, value, name, base, lineno, column):
        """Extract one or more URLs from the attribute value and hand
        them to found_url()."""
        assert isinstance(tag, str_text), repr(tag)
        assert isinstance(attr, str_text), repr(attr)
        assert isinstance(name, str_text), repr(name)
        assert isinstance(base, str_text), repr(base)
        assert isinstance(value, str_text) or value is None, repr(value)
        # look for meta refresh
        if tag == 'meta' and value:
            mo = refresh_re.match(value)
            if mo:
                self.found_url(mo.group("url"), name, base, lineno, column)
            elif attr != 'content':
                self.found_url(value, name, base, lineno, column)
        elif attr == 'style' and value:
            # a CSS style can reference multiple url(...) values
            for mo in css_url_re.finditer(value):
                url = unquote(mo.group("url"), matching=True)
                self.found_url(url, name, base, lineno, column)
        elif attr == 'archive':
            # applet/object archives are a comma-separated list;
            # guard against a missing attribute value (None)
            for url in (value or '').split(','):
                self.found_url(url, name, base, lineno, column)
        elif attr == 'srcset':
            # srcset is a comma-separated list of "URL [descriptor]"
            # image candidates; skip empty candidates (e.g. from a
            # trailing comma) which would otherwise raise IndexError
            for img_candidate in (value or '').split(','):
                parts = img_candidate.split()
                if parts:
                    self.found_url(parts[0], name, base, lineno, column)
        else:
            self.found_url(value, name, base, lineno, column)

    def found_url(self, url, name, base, lineno, column):
        """Add newly found URL to queue."""
        assert isinstance(url, str_text) or url is None, repr(url)
        self.callback(url, line=lineno, column=column, name=name, base=base)
def find_links(soup, callback, tags):
    """Parse into content and search for URLs to check.
    When a URL is found it is passed to the supplied callback.
    """
    finder = LinkFinder(callback, tags)
    for element in soup.find_all(True):
        pos = element.sourcepos
        finder.html_element(
            element.name,
            element.attrs,
            element.text.strip(),
            element.sourceline,
            pos + 1 if pos is not None else None,
        )