Merge pull request #373 from linkchecker/fix-swf-parsing

SWF files are binary data
This commit is contained in:
anarcat 2020-04-27 09:39:52 -04:00 committed by GitHub
commit 350f8bfef9
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 3 deletions

View file

@ -80,9 +80,15 @@ WmlTags = {
# matcher for <meta http-equiv=refresh> tags
refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
_quoted_pat = r"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
css_url_re = re.compile(r"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)
swf_url_re = re.compile("(?i)%s" % urlutil.safe_url_pattern)
# Note that swf_url_re, unlike all other regular expressions here, is meant
# to match byte strings. Yes, we're scraping binary SWF data for anything
# that looks like a URL. What did you expect, a full SWF format decoder?
swf_url_re = re.compile(b"(?i)%s" % urlutil.safe_url_pattern.encode('ascii'))
c_comment_re = re.compile(r"/\*.*?\*/", re.DOTALL)

View file

@ -100,8 +100,11 @@ def parse_css (url_data):
def parse_swf (url_data):
"""Parse a SWF file for URLs."""
linkfinder = linkparse.swf_url_re.finditer
for mo in linkfinder(url_data.get_content()):
url = mo.group()
for mo in linkfinder(url_data.get_raw_content()):
# We're scraping binary data for anything that looks like a URL using
# a regex that matches only ASCII characters. Any non-ASCII characters
# in the URL are expected to be %-encoded.
url = mo.group().decode('ascii')
url_data.add_url(url)