Added Shockwave Flash (SWF) parsing

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3656 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2007-12-28 02:12:48 +00:00
parent 9e1795757b
commit c971ebdabf
12 changed files with 77 additions and 33 deletions

View file

@ -1,3 +1,7 @@
4.9 "" (released xx.xx.2008)
* Parse Shockwave Flash (SWF) for URLs to check.
4.8 "Hallam Foe" (released 16.12.2007)
* Fix message typo for not disclosing information.

6
TODO
View file

@ -18,9 +18,3 @@
- [FEATURE] Virus check
- [FEATURE] Allow specifying proxy data in the web interface
- [FEATURE] Parse and check URLs from SWF files. Looks like there are no
Python SWF parsers that I could use for that (2007-12-26), but there
is a Perl module at http://www.thegestalt.org/flash/stuff/.
Alternatively just do a strings(1) like search and try to recognize
URL patterns. Seems to be much easier.

View file

@ -129,8 +129,22 @@ Warnings = {
}
# file extensions we can parse recursively
extensions = {
PARSE_EXTENSIONS = {
"html": re.compile(r'(?i)\.s?html?$'),
"opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file
"css": re.compile(r'(?i)\.css$'), # CSS stylesheet
"swf": re.compile(r'(?i)\.swf$'), # SWF file
}
PARSE_MIMETYPES = (
"text/html",
"text/css",
"application/x-shockwave-flash",
)
# if file extension lookup was unsuccessful, look at the content
PARSE_CONTENTS = {
"html": re.compile(r'^(?i)<(!DOCTYPE html|html|head|title)'),
"opera" : re.compile(r'^Opera Hotlist'),
"text" : re.compile(r'(?i)^# LinkChecker URL list'),
}

View file

@ -29,14 +29,8 @@ import urlbase
import linkcheck.log
import linkcheck.checker
import linkcheck.fileutil
from const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
# if file extension lookup was unsuccessful, look at the content
contents = {
"html": re.compile(r'^(?i)<(!DOCTYPE html|html|head|title)'),
"opera" : re.compile(r'^Opera Hotlist'),
"text" : re.compile(r'(?i)^# LinkChecker URL list'),
}
from const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, \
PARSE_EXTENSIONS, PARSE_CONTENTS
def get_files (dirname):
@ -185,9 +179,9 @@ class FileUrl (urlbase.UrlBase):
"""
Check if file is a parseable HTML file.
"""
if linkcheck.checker.const.extensions['html'].search(self.url):
if PARSE_EXTENSIONS['html'].search(self.url):
return True
if contents['html'].search(self.get_content()):
if PARSE_CONTENTS['html'].search(self.get_content()):
return True
return False
@ -232,12 +226,12 @@ class FileUrl (urlbase.UrlBase):
if self.is_directory():
return True
# guess by extension
for ro in linkcheck.checker.const.extensions.itervalues():
for ro in PARSE_EXTENSIONS.itervalues():
if ro.search(self.url):
return True
# try to read content (can fail, so catch error)
try:
for ro in contents.itervalues():
for ro in PARSE_CONTENTS.itervalues():
if ro.search(self.get_content()[:30]):
return True
except IOError:
@ -251,11 +245,11 @@ class FileUrl (urlbase.UrlBase):
if self.is_directory():
self.parse_html()
return
for key, ro in linkcheck.checker.const.extensions.iteritems():
for key, ro in PARSE_EXTENSIONS.iteritems():
if ro.search(self.url):
getattr(self, "parse_"+key)()
return
for key, ro in contents.iteritems():
for key, ro in PARSE_CONTENTS.iteritems():
if ro.search(self.get_content()[:30]):
getattr(self, "parse_"+key)()
return

View file

@ -28,7 +28,7 @@ import proxysupport
import httpurl
import internpaturl
import linkcheck.ftpparse._ftpparse as ftpparse
from const import WARN_FTP_MISSING_SLASH
from const import WARN_FTP_MISSING_SLASH, PARSE_EXTENSIONS, PARSE_CONTENTS
DEFAULT_TIMEOUT_SECS = 300
@ -180,7 +180,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
See if URL target is a HTML file by looking at the extension.
"""
if linkcheck.checker.const.extensions['html'].search(self.url):
if PARSE_EXTENSIONS['html'].search(self.url):
return True
return False
@ -190,7 +190,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if self.is_directory():
return True
for ro in linkcheck.checker.const.extensions.itervalues():
for ro in PARSE_EXTENSIONS.itervalues():
if ro.search(self.url):
return True
return False
@ -208,7 +208,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
if self.is_directory():
self.parse_html()
return
for key, ro in linkcheck.checker.const.extensions.iteritems():
for key, ro in PARSE_EXTENSIONS.iteritems():
if ro.search(self.url):
getattr(self, "parse_"+key)()

View file

@ -39,7 +39,8 @@ import proxysupport
from const import WARN_HTTP_ROBOTS_DENIED, WARN_HTTP_NO_ANCHOR_SUPPORT, \
WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \
WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING
WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
PARSE_MIMETYPES
# helper alias
unicode_safe = linkcheck.strformat.unicode_safe
@ -619,8 +620,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
if not (self.valid and self.headers):
return False
if headers.get_content_type(self.headers) not in \
("text/html", "text/css"):
if headers.get_content_type(self.headers) not in PARSE_MIMETYPES:
return False
encoding = headers.get_content_encoding(self.headers)
if encoding and encoding not in _supported_encodings and \
@ -634,11 +634,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Parse file contents for new links to check.
"""
ptype = headers.get_content_type(self.headers)
if ptype == "text/html":
ctype = headers.get_content_type(self.headers)
if ctype == "text/html":
self.parse_html()
elif ptype == "text/css":
elif ctype == "text/css":
self.parse_css()
elif ctype == "application/x-shockwave-flash":
self.parse_swf()
def get_robots_txt_url (self):
"""

View file

@ -190,7 +190,6 @@ class LinkCheckTest (unittest.TestCase):
logargs = {'expected': self.get_resultlines(filename)}
aggregate = get_test_aggregate(confargs, logargs)
url_data = get_url_from(url, 0, aggregate)
# XXX if assume_local
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
@ -216,7 +215,6 @@ class LinkCheckTest (unittest.TestCase):
logargs['parts'] = parts
aggregate = get_test_aggregate(confargs, logargs)
url_data = get_url_from(url, 0, aggregate)
# XXX if assume_local:
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)

View file

@ -8,3 +8,6 @@
<!-- empty tag -->
<tr background>
<!-- shockwave flash -->
<a href="test.swf">SWF</a>

View file

@ -28,3 +28,10 @@ url
cache key None
real url None
error
url test.swf
cache key file:///home/calvin/src/linkchecker-svn/linkcheck/checker/tests/data/test.swf
real url file:///home/calvin/src/linkchecker-svn/linkcheck/checker/tests/data/test.swf
name SWF
valid

View file

@ -42,6 +42,7 @@ class TestHttp (httptest.HttpServerTest):
self.robots_txt_test()
self.robots_txt2_test()
self.noproxyfor_test()
self.swf_test()
finally:
self.stop_server()
@ -145,6 +146,21 @@ class TestHttp (httptest.HttpServerTest):
confargs=confargs)
del os.environ["http_proxy"]
def swf_test (self):
    """Verify that URLs embedded in a SWF file are found recursively."""
    swf_url = (u"http://localhost:%d/linkcheck/checker/tests/data/"
               u"test.swf" % self.port)
    # Expected checker output: the SWF itself is valid, and the URL
    # extracted from inside it resolves to an error.
    expected = [
        u"url %s" % swf_url,
        u"cache key %s" % swf_url,
        u"real url %s" % swf_url,
        u"valid",
        u"url http://www.imadoofus.org/",
        u"cache key http://www.imadoofus.org/",
        u"real url http://www.imadoofus.org/",
        u"error",
    ]
    self.direct(swf_url, expected, recursionlevel=1)
def get_cookie (maxage=2000):
data = (

View file

@ -731,6 +731,16 @@ class UrlBase (object):
parent_url=self.url, line=lineno, column=column)
self.aggregate.urlqueue.put(url_data)
def parse_swf (self):
    """Scan SWF file content for URL-like strings and enqueue each one
    for checking."""
    content = self.get_content()
    for match in linkcheck.linkparse.swf_url_re.finditer(content):
        found_url = match.group()
        # Queue the discovered URL one recursion level deeper,
        # with this SWF file as its parent.
        new_url_data = linkcheck.checker.get_url_from(found_url,
            self.recursion_level+1, self.aggregate,
            parent_url=self.url)
        self.aggregate.urlqueue.put(new_url_data)
def serialized (self):
"""
Return serialized url check data as unicode string.

View file

@ -22,6 +22,7 @@ import re
import linkcheck.strformat
import linkcheck.linkname
import linkcheck.log
import linkcheck.url
MAX_NAMELEN = 256
unquote = linkcheck.strformat.unquote
@ -63,6 +64,7 @@ LinkTags = {
refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$")
_quoted_pat = ur"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
css_url_re = re.compile(ur"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)
swf_url_re = re.compile("(?i)%s" % linkcheck.url.safe_url_pattern)
c_comment_re = re.compile(ur"/\*.*?\*/", re.DOTALL)
def strip_c_comments (text):