Added Shockwave Flash (SWF) parsing
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3656 e7d03fd6-7b0d-0410-9947-9c21f3af8025
parent 9e1795757b
commit c971ebdabf

12 changed files with 77 additions and 33 deletions

@@ -1,3 +1,7 @@
+4.9 "" (released xx.xx.2008)
+
+* Parse Shockwave Flash (SWF) for URLs to check.
+
 4.8 "Hallam Foe" (released 16.12.2007)
 
 * Fix message typo for not disclosing information.

TODO
@@ -18,9 +18,3 @@
 - [FEATURE] Virus check
 
 - [FEATURE] Allow specifying proxy data in the web interface
-
-- [FEATURE] Parse and check URLs from SWF files. Looks like there are no
-  Python SWF parsers that I could use for that (2007-12-26), but there
-  is a Perl module at http://www.thegestalt.org/flash/stuff/.
-  Alternatively just do a strings(1) like search and try to recognize
-  URL patterns. Seems to be much easier.
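
The removed TODO item is exactly what this commit implements: instead of a
real SWF parser, the checker takes the strings(1)-like route and matches URL
patterns directly against the raw file content. A minimal standalone sketch
of that idea, assuming a test.swf in the working directory (the regex is a
simplified stand-in; the commit itself reuses linkcheck.url.safe_url_pattern):

import re

# Simplified stand-in pattern; the real code compiles swf_url_re from
# linkcheck.url.safe_url_pattern. Compressed SWF files (signature "CWS"
# instead of "FWS") would need zlib decompression first.
URL_RE = re.compile(rb'(?i)\bhttps?://[!-~]+')

def extract_urls(data):
    """Scan raw SWF bytes for URL-looking strings, as strings(1) would."""
    return [mo.group().decode('ascii', 'replace')
            for mo in URL_RE.finditer(data)]

with open("test.swf", "rb") as f:
    print(extract_urls(f.read()))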

@@ -129,8 +129,22 @@ Warnings = {
 }
 
 # file extensions we can parse recursively
-extensions = {
+PARSE_EXTENSIONS = {
     "html": re.compile(r'(?i)\.s?html?$'),
     "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file
     "css": re.compile(r'(?i)\.css$'), # CSS stylesheet
+    "swf": re.compile(r'(?i)\.swf$'), # SWF file
 }
+
+PARSE_MIMETYPES = (
+    "text/html",
+    "text/css",
+    "application/x-shockwave-flash",
+)
+
+# if file extension lookup was unsuccessful, look at the content
+PARSE_CONTENTS = {
+    "html": re.compile(r'^(?i)<(!DOCTYPE html|html|head|title)'),
+    "opera" : re.compile(r'^Opera Hotlist'),
+    "text" : re.compile(r'(?i)^# LinkChecker URL list'),
+}
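
Together the three tables answer one question: can this resource be parsed,
and by which parser. A small sketch of the lookup order the checkers below
rely on, with standalone names and trimmed tables (the real ones live in the
const module shown above):

import re

PARSE_EXTENSIONS = {
    "html": re.compile(r'(?i)\.s?html?$'),
    "css": re.compile(r'(?i)\.css$'),
    "swf": re.compile(r'(?i)\.swf$'),
}
PARSE_CONTENTS = {
    "html": re.compile(r'(?i)^<(!DOCTYPE html|html|head|title)'),
}

def guess_parser(url, content):
    """Prefer the file extension; fall back to sniffing the content start."""
    for key, ro in PARSE_EXTENSIONS.items():
        if ro.search(url):
            return key
    for key, ro in PARSE_CONTENTS.items():
        if ro.search(content[:30]):
            return key
    return None

assert guess_parser("http://example.com/movie.swf", "") == "swf"
assert guess_parser("http://example.com/page", "<html>") == "html"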

@@ -29,14 +29,8 @@ import urlbase
 import linkcheck.log
 import linkcheck.checker
 import linkcheck.fileutil
-from const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH
-
-# if file extension lookup was unsuccessful, look at the content
-contents = {
-    "html": re.compile(r'^(?i)<(!DOCTYPE html|html|head|title)'),
-    "opera" : re.compile(r'^Opera Hotlist'),
-    "text" : re.compile(r'(?i)^# LinkChecker URL list'),
-}
+from const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, \
+    PARSE_EXTENSIONS, PARSE_CONTENTS
 
 
 def get_files (dirname):

@@ -185,9 +179,9 @@ class FileUrl (urlbase.UrlBase):
         """
         Check if file is a parseable HTML file.
         """
-        if linkcheck.checker.const.extensions['html'].search(self.url):
+        if PARSE_EXTENSIONS['html'].search(self.url):
             return True
-        if contents['html'].search(self.get_content()):
+        if PARSE_CONTENTS['html'].search(self.get_content()):
             return True
         return False
 
@@ -232,12 +226,12 @@ class FileUrl (urlbase.UrlBase):
         if self.is_directory():
             return True
         # guess by extension
-        for ro in linkcheck.checker.const.extensions.itervalues():
+        for ro in PARSE_EXTENSIONS.itervalues():
             if ro.search(self.url):
                 return True
         # try to read content (can fail, so catch error)
         try:
-            for ro in contents.itervalues():
+            for ro in PARSE_CONTENTS.itervalues():
                 if ro.search(self.get_content()[:30]):
                     return True
         except IOError:

@@ -251,11 +245,11 @@ class FileUrl (urlbase.UrlBase):
         if self.is_directory():
             self.parse_html()
             return
-        for key, ro in linkcheck.checker.const.extensions.iteritems():
+        for key, ro in PARSE_EXTENSIONS.iteritems():
             if ro.search(self.url):
                 getattr(self, "parse_"+key)()
                 return
-        for key, ro in contents.iteritems():
+        for key, ro in PARSE_CONTENTS.iteritems():
             if ro.search(self.get_content()[:30]):
                 getattr(self, "parse_"+key)()
                 return
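
The getattr(self, "parse_"+key)() call is the contract that ties the tables
to the code: every key in PARSE_EXTENSIONS and PARSE_CONTENTS must have a
matching parse_<key> method on the URL class, which is why this commit adds
both the "swf" table entry and a parse_swf method (see the urlbase hunk
further down). A toy illustration of the convention, with hypothetical names:

class UrlStub:
    def parse_html(self):
        return "parsing HTML"
    def parse_swf(self):
        return "parsing SWF"
    def parse(self, key):
        # Table key -> parse_<key> method, same convention as the checkers.
        return getattr(self, "parse_" + key)()

assert UrlStub().parse("swf") == "parsing SWF"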

@@ -28,7 +28,7 @@ import proxysupport
 import httpurl
 import internpaturl
 import linkcheck.ftpparse._ftpparse as ftpparse
-from const import WARN_FTP_MISSING_SLASH
+from const import WARN_FTP_MISSING_SLASH, PARSE_EXTENSIONS, PARSE_CONTENTS
 
 DEFAULT_TIMEOUT_SECS = 300
 
@@ -180,7 +180,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         See if URL target is a HTML file by looking at the extension.
         """
-        if linkcheck.checker.const.extensions['html'].search(self.url):
+        if PARSE_EXTENSIONS['html'].search(self.url):
             return True
         return False
 
@@ -190,7 +190,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         if self.is_directory():
             return True
-        for ro in linkcheck.checker.const.extensions.itervalues():
+        for ro in PARSE_EXTENSIONS.itervalues():
             if ro.search(self.url):
                 return True
         return False

@@ -208,7 +208,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         if self.is_directory():
             self.parse_html()
             return
-        for key, ro in linkcheck.checker.const.extensions.iteritems():
+        for key, ro in PARSE_EXTENSIONS.iteritems():
             if ro.search(self.url):
                 getattr(self, "parse_"+key)()
 

@@ -39,7 +39,8 @@ import proxysupport
 from const import WARN_HTTP_ROBOTS_DENIED, WARN_HTTP_NO_ANCHOR_SUPPORT, \
     WARN_HTTP_WRONG_REDIRECT, WARN_HTTP_MOVED_PERMANENT, \
     WARN_HTTP_EMPTY_CONTENT, WARN_HTTP_COOKIE_STORE_ERROR, \
-    WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING
+    WARN_HTTP_DECOMPRESS_ERROR, WARN_HTTP_UNSUPPORTED_ENCODING, \
+    PARSE_MIMETYPES
 
 # helper alias
 unicode_safe = linkcheck.strformat.unicode_safe

@@ -619,8 +620,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         if not (self.valid and self.headers):
             return False
-        if headers.get_content_type(self.headers) not in \
-           ("text/html", "text/css"):
+        if headers.get_content_type(self.headers) not in PARSE_MIMETYPES:
             return False
         encoding = headers.get_content_encoding(self.headers)
         if encoding and encoding not in _supported_encodings and \

@@ -634,11 +634,13 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
         """
         Parse file contents for new links to check.
         """
-        ptype = headers.get_content_type(self.headers)
-        if ptype == "text/html":
+        ctype = headers.get_content_type(self.headers)
+        if ctype == "text/html":
             self.parse_html()
-        elif ptype == "text/css":
+        elif ctype == "text/css":
             self.parse_css()
+        elif ctype == "application/x-shockwave-flash":
+            self.parse_swf()
 
     def get_robots_txt_url (self):
         """
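
Note the asymmetry with the file and FTP checkers: those guess by extension
or content sniffing, while HTTP trusts the server-sent Content-Type header,
so PARSE_MIMETYPES gates both the content-type check above and this dispatch.
A compact sketch of the same chain (the handler functions are hypothetical):

PARSE_MIMETYPES = (
    "text/html",
    "text/css",
    "application/x-shockwave-flash",
)

def parse_by_mimetype(ctype, handlers):
    """Only whitelisted content types are handed to a parser."""
    if ctype in PARSE_MIMETYPES:
        handlers[ctype]()

parse_by_mimetype("application/x-shockwave-flash",
                  {"application/x-shockwave-flash": lambda: print("parse_swf")})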

@@ -190,7 +190,6 @@ class LinkCheckTest (unittest.TestCase):
         logargs = {'expected': self.get_resultlines(filename)}
         aggregate = get_test_aggregate(confargs, logargs)
         url_data = get_url_from(url, 0, aggregate)
-        # XXX if assume_local
         linkcheck.add_intern_pattern(url_data, aggregate.config)
         aggregate.urlqueue.put(url_data)
         linkcheck.director.check_urls(aggregate)

@@ -216,7 +215,6 @@ class LinkCheckTest (unittest.TestCase):
         logargs['parts'] = parts
         aggregate = get_test_aggregate(confargs, logargs)
         url_data = get_url_from(url, 0, aggregate)
-        # XXX if assume_local:
         linkcheck.add_intern_pattern(url_data, aggregate.config)
         aggregate.urlqueue.put(url_data)
         linkcheck.director.check_urls(aggregate)

@@ -8,3 +8,6 @@
 
 <!-- empty tag -->
 <tr background>
+
+<!-- shockwave flash -->
+<a href="test.swf">SWF</a>

@@ -28,3 +28,10 @@ url
 cache key None
 real url None
 error
+
+url test.swf
+cache key file:///home/calvin/src/linkchecker-svn/linkcheck/checker/tests/data/test.swf
+real url file:///home/calvin/src/linkchecker-svn/linkcheck/checker/tests/data/test.swf
+name SWF
+valid
+
@@ -42,6 +42,7 @@ class TestHttp (httptest.HttpServerTest):
             self.robots_txt_test()
             self.robots_txt2_test()
             self.noproxyfor_test()
+            self.swf_test()
         finally:
             self.stop_server()
 

@@ -145,6 +146,21 @@ class TestHttp (httptest.HttpServerTest):
                     confargs=confargs)
         del os.environ["http_proxy"]
 
+    def swf_test (self):
+        url = u"http://localhost:%d/linkcheck/checker/tests/data/" \
+              u"test.swf" % self.port
+        resultlines = [
+            u"url %s" % url,
+            u"cache key %s" % url,
+            u"real url %s" % url,
+            u"valid",
+            u"url http://www.imadoofus.org/",
+            u"cache key http://www.imadoofus.org/",
+            u"real url http://www.imadoofus.org/",
+            u"error",
+        ]
+        self.direct(url, resultlines, recursionlevel=1)
+
 
 def get_cookie (maxage=2000):
     data = (

@@ -731,6 +731,16 @@ class UrlBase (object):
                 parent_url=self.url, line=lineno, column=column)
             self.aggregate.urlqueue.put(url_data)
 
+    def parse_swf (self):
+        """Parse a SWF file for URLs."""
+        linkfinder = linkcheck.linkparse.swf_url_re.finditer
+        for mo in linkfinder(self.get_content()):
+            url = mo.group()
+            url_data = linkcheck.checker.get_url_from(url,
+                self.recursion_level+1, self.aggregate,
+                parent_url=self.url)
+            self.aggregate.urlqueue.put(url_data)
+
     def serialized (self):
         """
         Return serialized url check data as unicode string.

@@ -22,6 +22,7 @@ import re
 import linkcheck.strformat
 import linkcheck.linkname
 import linkcheck.log
+import linkcheck.url
 
 MAX_NAMELEN = 256
 unquote = linkcheck.strformat.unquote

@@ -63,6 +64,7 @@ LinkTags = {
 refresh_re = re.compile(ur"(?i)^\d+;\s*url=(?P<url>.+)$")
 _quoted_pat = ur"('[^']+'|\"[^\"]+\"|[^\)\s]+)"
 css_url_re = re.compile(ur"url\(\s*(?P<url>%s)\s*\)" % _quoted_pat)
+swf_url_re = re.compile("(?i)%s" % linkcheck.url.safe_url_pattern)
 c_comment_re = re.compile(ur"/\*.*?\*/", re.DOTALL)
 
 def strip_c_comments (text):
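
swf_url_re is nothing more than a case-insensitive compile of
linkcheck.url.safe_url_pattern, so parse_swf will pick up anything URL-shaped
in the binary stream. A rough standalone equivalent (the pattern below is a
simplified stand-in for safe_url_pattern):

import re

# Simplified stand-in for linkcheck.url.safe_url_pattern.
safe_url_pattern = r"https?://[\w.\-]+(?:/[\w./\-%?=&+#]*)?"
swf_url_re = re.compile("(?i)%s" % safe_url_pattern)

# SWF content is opaque binary with URL strings embedded in it.
content = b"FWS\x07...\x00http://www.imadoofus.org/\x00...".decode("latin1")
print([mo.group() for mo in swf_url_re.finditer(content)])
# -> ['http://www.imadoofus.org/']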