git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@673 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2002-12-07 00:45:31 +00:00
parent 66b8d36af1
commit 5b2df85c67
4 changed files with 63 additions and 49 deletions

View file

@ -21,15 +21,6 @@ from UrlData import UrlData, ExcList
# OSError is thrown on Windows when a file is not found
ExcList.append(OSError)
# file extensions we can parse recursively
# Maps a parser name ("html", "opera") to a regex matched against file names.
extensions = {
    # NOTE: inline flags like (?i) must appear at the very start of the
    # pattern; placing them after '^' is an error in Python >= 3.11.
    "html": r'(?i)\.s?html?$',
    # opera bookmark file; the dot is escaped so only a literal '.' matches
    "opera": r'(?i)^opera\.adr$',
    # "text": r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$',
}
# compile each pattern exactly once, up front
for key, pattern in extensions.items():
    extensions[key] = re.compile(pattern)
# if file extension was fruitless, look at the content
contents = {
"html": r'(?i)<html>.*</html>',
@ -119,9 +110,8 @@ class FileUrlData (UrlData):
self.url = urlparse.urlunsplit(self.urlparts)
def getCacheKey (self):
# use that the host is lowercase
# the host in urlparts is lowercase()d
if self.urlparts:
self.urlparts[4] = self.anchor
key = urlparse.urlunsplit(self.urlparts)
@ -137,7 +127,7 @@ class FileUrlData (UrlData):
def isHtml (self):
# guess by extension
for ro in extensions.values():
for ro in linkcheck.extensions.values():
if ro.search(self.url):
return 1
# try to read content (can fail, so catch error)
@ -151,46 +141,10 @@ class FileUrlData (UrlData):
def parseUrl (self):
    """Dispatch to the parse_<key> method for this file's content type.

    First match the file name against the known extension patterns;
    if that is fruitless, sniff the first 20 bytes of the content.
    Returns the parse method's result, or None when nothing matches.
    """
    # guess the parser from the file extension
    for key, ro in linkcheck.extensions.items():
        if ro.search(self.url):
            return getattr(self, "parse_"+key)()
    # extension was fruitless: look at the beginning of the content
    for key, ro in contents.items():
        if ro.search(self.getContent()[:20]):
            return getattr(self, "parse_"+key)()
    return None
def parse_html (self):
    # delegate HTML link extraction to the generic UrlData implementation
    UrlData.parseUrl(self)
def parse_opera (self):
    # An Opera bookmark file consists of NAME=/URL= line pairs; remember
    # the last NAME seen and attach it to the URL entry that follows.
    pending_name = ""
    line_number = 0
    for raw_line in self.getContent().splitlines():
        line_number += 1
        stripped = raw_line.strip()
        if stripped.startswith("NAME="):
            pending_name = stripped[5:]
        elif stripped.startswith("URL="):
            target = stripped[4:]
            if target:
                self.config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(
                    target, self.recursionLevel+1, self.config,
                    self.url, None, line_number, pending_name))
            pending_name = ""
def parse_text (self):
    # unused at the moment: extract raw URLs line by line via _url_re
    line_number = 0
    for text_line in self.getContent().splitlines():
        line_number += 1
        pos = 0
        while 1:
            match = _url_re.search(text_line, pos)
            if match is None:
                break
            self.config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(
                match.group(), self.recursionLevel+1, self.config,
                self.url, None, line_number, ""))
            pos = match.end()

View file

@ -59,6 +59,20 @@ class FtpUrlData (ProxyUrlData):
self.retrieve(filename)
def isHtml (self):
    # treat the URL as HTML when any known extension pattern matches;
    # falls through (returning None) otherwise
    for pattern in linkcheck.extensions.values():
        if pattern.search(self.url):
            return 1
def parseUrl (self):
    # pick the parse_<key> handler whose extension pattern matches the URL
    for key, pattern in linkcheck.extensions.items():
        if pattern.search(self.url):
            handler = getattr(self, "parse_"+key)
            return handler()
    return None
def login (self, _user, _password):
"""log into ftp server and check the welcome message"""
# ready to connect

View file

@ -393,7 +393,12 @@ class UrlData:
def parseUrl (self):
    """Parse this URL's content for further links.

    The default content type is HTML; subclasses dispatch to other
    parse_<type> methods where appropriate.
    """
    debug(BRING_IT_ON, "Parsing recursively into", self)
    # dropped the stray trailing semicolon from the original call
    self.parse_html()
def parse_html (self):
# search for a possible base reference
h = LinkParser(self.getContent(), {'base': ['href']})
baseRef = None
@ -414,6 +419,37 @@ class UrlData:
line=line, column=column, name=name))
def parse_opera (self):
    # An Opera bookmark file stores each bookmark as a NAME= line
    # followed by a URL= line; pair them up while scanning.
    label = ""
    count = 0
    for bookmark_line in self.getContent().splitlines():
        count = count + 1
        entry = bookmark_line.strip()
        if entry.startswith("URL="):
            address = entry[4:]
            if address:
                self.config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(
                    address, self.recursionLevel+1, self.config,
                    self.url, None, count, label))
            label = ""
        elif entry.startswith("NAME="):
            label = entry[5:]
def parse_text (self):
    # unused at the moment: scan plain text for URL-shaped substrings
    row = 0
    for content_line in self.getContent().splitlines():
        row = row + 1
        offset = 0
        match = _url_re.search(content_line, offset)
        while match:
            self.config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(
                match.group(), self.recursionLevel+1, self.config,
                self.url, None, row, ""))
            offset = match.end()
            match = _url_re.search(content_line, offset)
def __str__ (self):
return ("%s link\n"
"urlname=%s\n"

View file

@ -34,6 +34,16 @@ def getLinkPat (arg, strict=0):
"strict": strict,
}
# file extensions we can parse recursively:
# maps a parser name to a regex matched against file names
extensions = {
    # NOTE: (?i) belongs at the pattern start; inline flags placed after
    # other tokens are rejected by modern versions of the re module
    "html": r'(?i)\.s?html?$',
    # opera bookmark file; dot escaped so only a literal '.' matches
    "opera": r'(?i)^opera\.adr$',
    # "text": r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$',
}
# compile each pattern exactly once, up front
for key, pattern in extensions.items():
    extensions[key] = re.compile(pattern)
# i18n suppport
import sys, os, _linkchecker_configdata
def init_gettext ():