Merge pull request #373 from linkchecker/fix-swf-parsing

SWF files are binary data
2026-04-28 01:54:42 +00:00 · 2020-04-27 09:39:52 -04:00 · 2020-04-27 09:39:52 -04:00 · 350f8bfef9
commit 350f8bfef9
parent 183d483074 680783b1ff
2 changed files with 12 additions and 3 deletions
--- a/linkcheck/htmlutil/linkparse.py
+++ b/linkcheck/htmlutil/linkparse.py
@@ -80,9 +80,15 @@ WmlTags = {

 # matcher for <meta http-equiv=refresh> tags
 refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
+
 _quoted_pat = r"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
 css_url_re = re.compile(r"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)
-swf_url_re = re.compile("(?i)%s" % urlutil.safe_url_pattern)
+
+# Note that swf_url_re, unlike all other regular expressions here, is meant
+# to match byte strings.  Yes, we're scraping binary SWF data for anything
+# that looks like a URL.  What did you expect, a full SWF format decoder?
+swf_url_re = re.compile(b"(?i)%s" % urlutil.safe_url_pattern.encode('ascii'))
+
 c_comment_re = re.compile(r"/\*.*?\*/", re.DOTALL)


--- a/linkcheck/parser/__init__.py
+++ b/linkcheck/parser/__init__.py
@@ -100,8 +100,11 @@ def parse_css (url_data):
 def parse_swf (url_data):
    """Parse a SWF file for URLs."""
    linkfinder = linkparse.swf_url_re.finditer
-    for mo in linkfinder(url_data.get_content()):
-        url = mo.group()
+    for mo in linkfinder(url_data.get_raw_content()):
+        # We're scraping binary data for anything that looks like an URL using
+        # a regex that matches only ASCII characters.  Any non-ASCII characters
+        # in the URL are expected to be %-encoded.
+        url = mo.group().decode('ascii')
        url_data.add_url(url)