content regex fixes

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1708 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-09-03 14:43:11 +00:00
parent b1708fc725
commit 594797b5e2

View file

@ -29,9 +29,9 @@ from linkcheck.i18n import _
# if file extension lookup was unsuccessful, look at the content
contents = {
    # Content-sniffing patterns, keyed by content type name. Code elsewhere
    # in this file searches them against a short leading slice of the file
    # content, so every pattern is anchored to the start with ``^``.
    #
    # An inline flag such as ``(?i)`` must be the very first thing in the
    # pattern: placing it after the anchor (``^(?i)...``) is deprecated
    # since Python 3.6 and raises ``re.error`` on Python 3.11+.
    "html": re.compile(r'(?i)^<(!DOCTYPE html|html|head|title)'),
    "opera": re.compile(r'^Opera Hotlist'),
    "text": re.compile(r'(?i)^# LinkChecker URL list'),
}
@ -138,7 +138,7 @@ class FileUrl (urlbase.UrlBase):
# try to read content (can fail, so catch error)
try:
for ro in contents.values():
if ro.search(self.get_content()):
if ro.search(self.get_content()[:30]):
return True
except IOError:
pass
@ -151,6 +151,6 @@ class FileUrl (urlbase.UrlBase):
if ro.search(self.url):
return getattr(self, "parse_"+key)()
for key, ro in contents.items():
if ro.search(self.get_content()[:20]):
if ro.search(self.get_content()[:30]):
return getattr(self, "parse_"+key)()
return None