mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-28 01:54:42 +00:00
Merge pull request #373 from linkchecker/fix-swf-parsing
SWF files are binary data
This commit is contained in:
commit
350f8bfef9
2 changed files with 12 additions and 3 deletions
|
|
@ -80,9 +80,15 @@ WmlTags = {
|
|||
|
||||
# matcher for <meta http-equiv=refresh> tags
|
||||
refresh_re = re.compile(r"(?i)^\d+;\s*url=(?P<url>.+)$")
|
||||
|
||||
_quoted_pat = r"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
|
||||
css_url_re = re.compile(r"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)
|
||||
swf_url_re = re.compile("(?i)%s" % urlutil.safe_url_pattern)
|
||||
|
||||
# Note that swf_url_re, unlike all other regular expressions here, is meant
|
||||
# to match byte strings. Yes, we're scraping binary SWF data for anything
|
||||
# that looks like a URL. What did you expect, a full SWF format decoder?
|
||||
swf_url_re = re.compile(b"(?i)%s" % urlutil.safe_url_pattern.encode('ascii'))
|
||||
|
||||
c_comment_re = re.compile(r"/\*.*?\*/", re.DOTALL)
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -100,8 +100,11 @@ def parse_css (url_data):
|
|||
def parse_swf (url_data):
|
||||
"""Parse a SWF file for URLs."""
|
||||
linkfinder = linkparse.swf_url_re.finditer
|
||||
for mo in linkfinder(url_data.get_content()):
|
||||
url = mo.group()
|
||||
for mo in linkfinder(url_data.get_raw_content()):
|
||||
# We're scraping binary data for anything that looks like a URL using
|
||||
# a regex that matches only ASCII characters. Any non-ASCII characters
|
||||
# in the URL are expected to be %-encoded.
|
||||
url = mo.group().decode('ascii')
|
||||
url_data.add_url(url)
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue