diff --git a/ChangeLog b/ChangeLog
index d9c321dd..56df0215 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+4.9 "" (released xx.xx.2008)
+
+  * Parse Shockwave Flash (SWF) for URLs to check.
+
 4.8 "Hallam Foe" (released 16.12.2007)
 
   * Fix message typo for not disclosing information.
diff --git a/TODO b/TODO
index 09edc222..946a1617 100644
--- a/TODO
+++ b/TODO
@@ -18,9 +18,3 @@
 - [FEATURE] Virus check
 
 - [FEATURE] Allow specifying proxy data in the web interface
-
-- [FEATURE] Parse and check URLs from SWF files. Looks like there are no
-  Python SWF parsers that I could use for that (2007-12-26), but there
-  is a Perl module at http://www.thegestalt.org/flash/stuff/.
-  Alternatively just do a strings(1) like search and try to recognize
-  URL patterns. Seems to be much easier.
diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py
index 9a40eb5e..c789c041 100644
--- a/linkcheck/checker/const.py
+++ b/linkcheck/checker/const.py
@@ -129,8 +129,22 @@ Warnings = {
 }
 
 # file extensions we can parse recursively
-extensions = {
+PARSE_EXTENSIONS = {
     "html": re.compile(r'(?i)\.s?html?$'),
     "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file
     "css": re.compile(r'(?i)\.css$'), # CSS stylesheet
+    "swf": re.compile(r'(?i)\.swf$'), # SWF file
+}
+
+PARSE_MIMETYPES = (
+    "text/html",
+    "text/css",
+    "application/x-shockwave-flash",
+)
+
+# if file extension lookup was unsuccessful, look at the content
+PARSE_CONTENTS = {
+    "html": re.compile(r'^(?i)<(!DOCTYPE html|html|head|title)'),
+    "opera" : re.compile(r'^Opera Hotlist'),
+    "text" : re.compile(r'(?i)^# LinkChecker URL list'),
 }
diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py
index 3b94bb7a..bc569c64 100644
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@@ -29,14 +29,8 @@ import urlbase
 import linkcheck.log
 import linkcheck.checker
 import linkcheck.fileutil
-from const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
-
-# if file extension lookup was unsuccessful, look at the content
-contents = {
-    "html": re.compile(r'^(?i)<(!DOCTYPE html|html|head|title)'),
-    "opera" : re.compile(r'^Opera Hotlist'),
-    "text" : re.compile(r'(?i)^# LinkChecker URL list'),
-}
+from const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, \
+    PARSE_EXTENSIONS, PARSE_CONTENTS
 
 
 def get_files (dirname):
@@ -185,9 +179,9 @@ class FileUrl (urlbase.UrlBase):
         """
         Check if file is a parseable HTML file.
         """
-        if linkcheck.checker.const.extensions['html'].search(self.url):
+        if PARSE_EXTENSIONS['html'].search(self.url):
             return True
-        if contents['html'].search(self.get_content()):
+        if PARSE_CONTENTS['html'].search(self.get_content()):
             return True
         return False
 
@@ -232,12 +226,12 @@ class FileUrl (urlbase.UrlBase):
         if self.is_directory():
             return True
         # guess by extension
-        for ro in linkcheck.checker.const.extensions.itervalues():
+        for ro in PARSE_EXTENSIONS.itervalues():
             if ro.search(self.url):
                 return True
         # try to read content (can fail, so catch error)
         try:
-            for ro in contents.itervalues():
+            for ro in PARSE_CONTENTS.itervalues():
                 if ro.search(self.get_content()[:30]):
                     return True
         except IOError:
@@ -251,11 +245,11 @@ class FileUrl (urlbase.UrlBase):
         if self.is_directory():
             self.parse_html()
             return
-        for key, ro in linkcheck.checker.const.extensions.iteritems():
+        for key, ro in PARSE_EXTENSIONS.iteritems():
             if ro.search(self.url):
                 getattr(self, "parse_"+key)()
                 return
-        for key, ro in contents.iteritems():
+        for key, ro in PARSE_CONTENTS.iteritems():
             if ro.search(self.get_content()[:30]):
                 getattr(self, "parse_"+key)()
                 return
diff --git a/linkcheck/checker/ftpurl.py b/linkcheck/checker/ftpurl.py
index bf00adaa..417e2590 100644
--- a/linkcheck/checker/ftpurl.py
+++ b/linkcheck/checker/ftpurl.py
@@ -28,7 +28,7 @@ import proxysupport
 import httpurl
 import internpaturl
 import linkcheck.ftpparse._ftpparse as ftpparse
-from const import WARN_FTP_MISSING_SLASH
+from const import WARN_FTP_MISSING_SLASH, PARSE_EXTENSIONS, PARSE_CONTENTS
 
 DEFAULT_TIMEOUT_SECS = 300
 
@@ -180,7 +180,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         See if URL target is a HTML file by looking at the extension.
         """
-        if linkcheck.checker.const.extensions['html'].search(self.url):
+        if PARSE_EXTENSIONS['html'].search(self.url):
             return True
         return False
 
@@ -190,7 +190,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         if self.is_directory():
             return True
-        for ro in linkcheck.checker.const.extensions.itervalues():
+        for ro in PARSE_EXTENSIONS.itervalues():
             if ro.search(self.url):
                 return True
         return False
@@ -208,7 +208,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         if self.is_directory():
             self.parse_html()
             return
-        for key, ro in linkcheck.checker.const.extensions.iteritems():
+        for key, ro in PARSE_EXTENSIONS.iteritems():
             if ro.search(self.url):
                 getattr(self, "parse_"+key)()
 
diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py
index be8645b4..c04c8d6b 100644
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@@ -39,7 +39,8 @@ import proxysupport
 from const import WARN_HTTP_ROBOTS_DENIED, WARN_HTTP_NO_ANCHOR_SUPPORT, \
     WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \
     WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
-    WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING
+    WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
+    PARSE_MIMETYPES
 
 # helper alias
 unicode_safe = linkcheck.strformat.unicode_safe
@@ -619,8 +620,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         if not (self.valid and self.headers):
             return False
-        if headers.get_content_type(self.headers) not in \
-           ("text/html", "text/css"):
+        if headers.get_content_type(self.headers) not in PARSE_MIMETYPES:
             return False
         encoding = headers.get_content_encoding(self.headers)
         if encoding and encoding not in _supported_encodings and \
@@ -634,11 +634,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         Parse file contents for new links to check.
         """
-        ptype = headers.get_content_type(self.headers)
-        if ptype == "text/html":
+        ctype = headers.get_content_type(self.headers)
+        if ctype == "text/html":
             self.parse_html()
-        elif ptype == "text/css":
+        elif ctype == "text/css":
             self.parse_css()
+        elif ctype == "application/x-shockwave-flash":
+            self.parse_swf()
 
     def get_robots_txt_url (self):
         """
diff --git a/linkcheck/checker/tests/__init__.py b/linkcheck/checker/tests/__init__.py
index 84c83bc0..139cb8ed 100644
--- a/linkcheck/checker/tests/__init__.py
+++ b/linkcheck/checker/tests/__init__.py
@@ -190,7 +190,6 @@ class LinkCheckTest (unittest.TestCase):
         logargs = {'expected': self.get_resultlines(filename)}
         aggregate = get_test_aggregate(confargs, logargs)
         url_data = get_url_from(url, 0, aggregate)
-        # XXX if assume_local
         linkcheck.add_intern_pattern(url_data, aggregate.config)
         aggregate.urlqueue.put(url_data)
         linkcheck.director.check_urls(aggregate)
@@ -216,7 +215,6 @@ class LinkCheckTest (unittest.TestCase):
         logargs['parts'] = parts
         aggregate = get_test_aggregate(confargs, logargs)
         url_data = get_url_from(url, 0, aggregate)
-        # XXX if assume_local:
         linkcheck.add_intern_pattern(url_data, aggregate.config)
         aggregate.urlqueue.put(url_data)
         linkcheck.director.check_urls(aggregate)
diff --git a/linkcheck/checker/tests/data/misc.html b/linkcheck/checker/tests/data/misc.html
index 692e36ab..8caf4ad7 100644
--- a/linkcheck/checker/tests/data/misc.html
+++ b/linkcheck/checker/tests/data/misc.html
@@ -8,3 +8,6 @@
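
Note: the httpurl.py hunk dispatches application/x-shockwave-flash content to
self.parse_swf(), whose implementation is not part of this diff. Below is a
minimal sketch of the strings(1)-style scan that the removed TODO entry
suggests, assuming the real parse_swf works along these lines; the names
extract_swf_urls and swf_url_re are illustrative only and do not appear in
the LinkChecker source.

    import re
    import zlib

    # Illustrative strings(1)-style URL pattern; an assumption, not taken
    # from the LinkChecker source.
    swf_url_re = re.compile(r'(?i)\bhttps?://[^\s\x00"\'<>]+')

    def extract_swf_urls (data):
        """Return a list of URLs found in raw SWF file data.

        SWF files whose signature is 'CWS' carry a zlib-compressed body
        after the 8-byte header; 'FWS' files are uncompressed and can be
        scanned directly.
        """
        if data[:3] == "CWS":
            try:
                data = zlib.decompress(data[8:])
            except zlib.error:
                pass # fall back to scanning the raw bytes
        return swf_url_re.findall(data)

Since Flash 6, most SWF files served on the web use the compressed 'CWS'
format, so the decompression step is what makes a naive byte scan useful in
practice.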