diff --git a/TODO b/TODO index 9f3fa948..33825f95 100644 --- a/TODO +++ b/TODO @@ -3,4 +3,5 @@ Check why threaded app wont exit resp. is stalled Another Profiling roundup Named constants for ANSI Color codes Test Proxy Authentication -Test socket timeout (how?) +Cookie support +http://www.host.com:/ syntax check diff --git a/debian/changelog b/debian/changelog index 96b1a7e5..d6db5eac 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,3 +1,9 @@ +linkchecker (1.5.0) unstable; urgency=low + + * More syntax checking for host:port network locations + + -- Bastian Kleineidam Sat, 4 May 2002 00:21:45 +0200 + linkchecker (1.4.3) unstable; urgency=low * UrlData.py: also catch Timeout and other exception on retrieving diff --git a/linkcheck/Config.py b/linkcheck/Config.py index 5325bb5a..68eb2200 100644 --- a/linkcheck/Config.py +++ b/linkcheck/Config.py @@ -103,6 +103,7 @@ class Configuration(UserDict.UserDict): self["recursionlevel"] = 1 self["wait"] = 0 self["robotstxt"] = 1 + self['cookies'] = 0 self["strict"] = 0 self["fileoutput"] = [] # Logger configurations @@ -186,6 +187,8 @@ class Configuration(UserDict.UserDict): self.robotsTxtCache_set = self.robotsTxtCache_set_NoThreads self.robotsTxtCacheLock = None self.incrementLinknumber = self.incrementLinknumber_NoThreads + self.setCookies = self.setCookies_NoThreads + self.storeCookies = self.storeCookies_NoThreads self.log_newUrl = self.log_newUrl_NoThreads self.logLock = None self.urls = [] @@ -215,6 +218,8 @@ class Configuration(UserDict.UserDict): self.robotsTxtCache_set = self.robotsTxtCache_set_Threads self.robotsTxtCacheLock = Lock() self.incrementLinknumber = self.incrementLinknumber_Threads + self.setCookies = self.setCookies_Threads + self.storeCookies = self.storeCookies_Threads self.log_newUrl = self.log_newUrl_Threads self.logLock = Lock() self.urls = Queue.Queue(0) @@ -224,40 +229,48 @@ class Configuration(UserDict.UserDict): def hasMoreUrls_NoThreads(self): return len(self.urls) - + def finished_NoThreads(self): return not self.hasMoreUrls_NoThreads() def finish_NoThreads(self): pass - + def appendUrl_NoThreads(self, url): self.urls.append(url) - + def getUrl_NoThreads(self): return self.urls.pop(0) - + def checkUrl_NoThreads(self, url): - url.check(self) - + url.check() + def urlCache_has_key_NoThreads(self, key): return self.urlCache.has_key(key) - + def urlCache_get_NoThreads(self, key): return self.urlCache[key] - + def urlCache_set_NoThreads(self, key, val): self.urlCache[key] = val def robotsTxtCache_has_key_NoThreads(self, key): return self.robotsTxtCache.has_key(key) - + def robotsTxtCache_get_NoThreads(self, key): return self.robotsTxtCache[key] - + def robotsTxtCache_set_NoThreads(self, key, val): self.robotsTxtCache[key] = val + def storeCookies_NoThreads(self, headers): + pass + # XXX + + def setCookies_NoThreads(self, urlConnection): + pass + # XXX + def newLogger(self, logtype, dict={}): args = {} args.update(self[logtype]) @@ -266,7 +279,7 @@ class Configuration(UserDict.UserDict): def incrementLinknumber_NoThreads(self): self['linknumber'] += 1 - + def log_newUrl_NoThreads(self, url): if not self["quiet"]: self["log"].newUrl(url) for log in self["fileoutput"]: @@ -312,7 +325,7 @@ class Configuration(UserDict.UserDict): return self.urls.get() def checkUrl_Threads(self, url): - self.threader.startThread(url.check, (self,)) + self.threader.startThread(url.check, ()) def urlCache_has_key_Threads(self, key): ret = None @@ -373,6 +386,20 @@ class Configuration(UserDict.UserDict): finally: self.logLock.release() + def storeCookies_Threads(self, headers): + try: + self.dataLock.acquire() + # XXX + finally: + self.dataLock.release() + + def setCookies_Threads(self, urlConnection): + try: + self.dataLock.acquire() + # XXX + finally: + self.dataLock.release() + def read(self, files = []): if not files: # system wide config settings diff --git a/linkcheck/FileUrlData.py b/linkcheck/FileUrlData.py index 0bbba12e..f5aa5c45 100644 --- a/linkcheck/FileUrlData.py +++ b/linkcheck/FileUrlData.py @@ -86,16 +86,18 @@ _url_re = re.compile(_url, re.VERBOSE) class FileUrlData (UrlData): "Url link with file scheme" - def __init__(self, - urlName, - recursionLevel, - parentName = None, - baseRef = None, line=0, name=""): + def __init__ (self, + urlName, + config, + recursionLevel, + parentName = None, + baseRef = None, line=0, name=""): UrlData.__init__(self, - urlName, - recursionLevel, - parentName=parentName, - baseRef=baseRef, line=line, name=name) + urlName, + config, + recursionLevel, + parentName=parentName, + baseRef=baseRef, line=line, name=name) if not parentName and not baseRef and \ not re.compile("^file:").search(self.urlName): self.urlName = os.path.expanduser(self.urlName) @@ -136,18 +138,18 @@ class FileUrlData (UrlData): return None - def parseUrl (self, config): + def parseUrl (self): for key,ro in extensions.items(): if ro.search(self.url): - return getattr(self, "parse_"+key)(config) + return getattr(self, "parse_"+key)() for key,ro in contents.items(): if ro.search(self.getContent()): - return getattr(self, "parse_"+key)(config) + return getattr(self, "parse_"+key)() - def parse_html (self, config): - UrlData.parseUrl(self, config) + def parse_html (self): + UrlData.parseUrl(self) - def parse_opera (self, config): + def parse_opera (self): # parse an opera bookmark file name = "" lineno = 0 @@ -159,11 +161,11 @@ class FileUrlData (UrlData): elif line.startswith("URL="): url = line[4:] if url: - config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, - self.recursionLevel+1, self.url, None, lineno, name)) + self.config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, + self.recursionLevel+1, self.config, self.url, None, lineno, name)) name = "" - def parse_text (self, config): + def parse_text (self): lineno = 0 for line in self.getContent().splitlines(): lineno += 1 @@ -171,8 +173,8 @@ class FileUrlData (UrlData): while 1: mo = _url_re.search(line, i) if not mo: break - config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(mo.group(), - self.recursionLevel+1, self.url, None, lineno, "")) + self.config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(mo.group(), + self.recursionLevel+1, self.config, self.url, None, lineno, "")) i = mo.end() return diff --git a/linkcheck/FtpUrlData.py b/linkcheck/FtpUrlData.py index 92e134dc..24e6c173 100644 --- a/linkcheck/FtpUrlData.py +++ b/linkcheck/FtpUrlData.py @@ -25,13 +25,13 @@ ExcList.extend([ ftplib.error_proto, ]) -class FtpUrlData(UrlData): +class FtpUrlData (UrlData): """ - Url link with ftp scheme. + Url link with ftp scheme. """ - def checkConnection(self, config): - _user, _password = self._getUserPassword(config) + def checkConnection (self): + _user, _password = self._getUserPassword() if _user is None or _password is None: raise linkcheck.error, linkcheck._("No user or password found") try: @@ -45,7 +45,7 @@ class FtpUrlData(UrlData): self.setInfo(info) - def closeConnection(self): + def closeConnection (self): try: self.urlConnection.quit() except: pass self.urlConnection = None diff --git a/linkcheck/GopherUrlData.py b/linkcheck/GopherUrlData.py index 3d20086b..4ca3665d 100644 --- a/linkcheck/GopherUrlData.py +++ b/linkcheck/GopherUrlData.py @@ -17,6 +17,6 @@ from UrlData import UrlData -class GopherUrlData(UrlData): +class GopherUrlData (UrlData): "Url link with gopher scheme" pass diff --git a/linkcheck/HostCheckingUrlData.py b/linkcheck/HostCheckingUrlData.py index e5954f71..58c18755 100644 --- a/linkcheck/HostCheckingUrlData.py +++ b/linkcheck/HostCheckingUrlData.py @@ -18,27 +18,24 @@ import socket, linkcheck from UrlData import UrlData -class HostCheckingUrlData(UrlData): +class HostCheckingUrlData (UrlData): "Url link for which we have to connect to a specific host" - def __init__(self, urlName, recursionLevel, parentName = None, - baseRef = None, line=0, name=""): - UrlData.__init__(self, urlName, recursionLevel, + def __init__ (self, urlName, recursionLevel, config, parentName=None, + baseRef=None, line=0, name=""): + UrlData.__init__(self, urlName, recursionLevel, config, parentName=parentName, baseRef=baseRef, line=line, - name=name) + name=name) self.host = None self.url = urlName - - def buildUrl(self): + def buildUrl (self): # to avoid anchor checking self.urlTuple=None - - def getCacheKey(self): + def getCacheKey (self): return "%s:%s" % (self.scheme, self.host) - - def checkConnection(self, config): + def checkConnection (self): ip = socket.gethostbyname(self.host) self.setValid(self.host+"("+ip+") "+linkcheck._("found")) diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py index cf2eb900..6a34b8b5 100644 --- a/linkcheck/HttpUrlData.py +++ b/linkcheck/HttpUrlData.py @@ -24,11 +24,11 @@ from urllib import splittype, splithost, splituser, splitpasswd from debuglevels import * -class HttpUrlData(UrlData): +class HttpUrlData (UrlData): "Url link with http scheme" netscape_re = re.compile("Netscape-Enterprise/") - def checkConnection(self, config): + def checkConnection (self): """ Check a URL with HTTP protocol. Here is an excerpt from RFC 1945 with common response codes: @@ -70,27 +70,26 @@ class HttpUrlData(UrlData): | extension-code """ - self._setProxy(config["proxy"].get(self.scheme)) - self.mime = None + self._setProxy(self.config["proxy"].get(self.scheme)) + self.headers = None self.auth = None self.proxyauth = None if not self.urlTuple[2]: self.setWarning(linkcheck._("Missing '/' at end of URL")) - if config["robotstxt"] and not self.robotsTxtAllowsUrl(config): + if self.config["robotstxt"] and not self.robotsTxtAllowsUrl(): self.setWarning(linkcheck._("Access denied by robots.txt, checked only syntax")) return # first try - status, statusText, self.mime = self._getHttpRequest() - Config.debug(BRING_IT_ON, status, statusText, self.mime) + status, statusText, self.headers = self._getHttpRequest() + Config.debug(BRING_IT_ON, status, statusText, self.headers) has301status = 0 while 1: # proxy enforcement (overrides standard proxy) - if status == 305 and self.mime: - self._setProxy(self.mime.get("Location")) - status, statusText, self.mime = self._getHttpRequest() - + if status == 305 and self.headers: + self._setProxy(self.headers.getheader("Location")) + status, statusText, self.headers = self._getHttpRequest() # proxy authentication if status==407: if not (self.proxyuser and self.proxypass): @@ -99,34 +98,33 @@ class HttpUrlData(UrlData): import base64 self.proxyauth = "Basic "+base64.encodestring("%s:%s" % \ (self.proxyuser, self.proxypass)) - status, statusText, self.mime = self._getHttpRequest() - + status, statusText, self.headers = self._getHttpRequest() # follow redirections tries = 0 redirected = self.urlName - while status in [301,302] and self.mime and tries < 5: + while status in [301,302] and self.headers and tries < 5: has301status = (status==301) - newurl = self.mime.get("Location", self.mime.get("Uri", "")) + + newurl = self.headers.getheader("Location", + self.headers.getheader("Uri", "")) redirected = urlparse.urljoin(redirected, newurl) self.urlTuple = urlparse.urlparse(redirected) - status, statusText, self.mime = self._getHttpRequest() - Config.debug(BRING_IT_ON, "Redirected", self.mime) + status, statusText, self.headers = self._getHttpRequest() + Config.debug(BRING_IT_ON, "Redirected", self.headers) tries += 1 if tries >= 5: self.setError(linkcheck._("too much redirections (>= 5)")) return - # user authentication if status==401: if not self.auth: import base64 - _user, _password = self._getUserPassword(config) + _user, _password = self._getUserPassword() self.auth = "Basic "+\ base64.encodestring("%s:%s" % (_user, _password)) - status, statusText, self.mime = self._getHttpRequest() + status, statusText, self.headers = self._getHttpRequest() Config.debug(BRING_IT_ON, "Authentication", _user, "/", _password) - # some servers get the HEAD request wrong: # - Netscape Enterprise Server (no HEAD implemented, 404 error) # - Hyperwave Information Server (501 error) @@ -138,24 +136,23 @@ class HttpUrlData(UrlData): # HEAD method not allowed ==> try get self.setWarning(linkcheck._("Server does not support HEAD request (got " "%d status), falling back to GET")%status) - status, statusText, self.mime = self._getHttpRequest("GET") - elif status>=400 and self.mime: - server = self.mime.getheader("Server") + status, statusText, self.headers = self._getHttpRequest("GET") + elif status>=400 and self.headers: + server = self.headers.getheader("Server") if server and self.netscape_re.search(server): self.setWarning(linkcheck._("Netscape Enterprise Server with no " "HEAD support, falling back to GET")) - status,statusText,self.mime = self._getHttpRequest("GET") - elif self.mime: - type = self.mime.gettype() - poweredby = self.mime.getheader('X-Powered-By') - server = self.mime.getheader('Server') + status,statusText,self.headers = self._getHttpRequest("GET") + elif self.headers: + type = self.headers.gettype() + poweredby = self.headers.getheader('X-Powered-By') + server = self.headers.getheader('Server') if type=='application/octet-stream' and \ ((poweredby and poweredby[:4]=='Zope') or \ (server and server[:4]=='Zope')): self.setWarning(linkcheck._("Zope Server cannot determine MIME type" " with HEAD, falling back to GET")) - status,statusText,self.mime = self._getHttpRequest("GET") - + status,statusText,self.headers = self._getHttpRequest("GET") if status not in [301,302]: break effectiveurl = urlparse.urlunparse(self.urlTuple) @@ -183,9 +180,11 @@ class HttpUrlData(UrlData): self.setValid(`status`+" "+statusText) else: self.setValid("OK") + # store cookies for valid links + if self.config['cookies']: + self.config.storeCookies(self.headers) - - def _setProxy(self, proxy): + def _setProxy (self, proxy): self.proxy = proxy self.proxyuser = None self.proxypass = None @@ -198,8 +197,7 @@ class HttpUrlData(UrlData): if self.proxyuser: self.proxyuser, self.proxypass = splitpasswd(self.proxyuser) - - def _getHttpRequest(self, method="HEAD"): + def _getHttpRequest (self, method="HEAD"): """Put request and return (status code, status text, mime object). host can be host:port format """ @@ -226,23 +224,23 @@ class HttpUrlData(UrlData): if self.parentName: self.urlConnection.putheader("Referer", self.parentName) self.urlConnection.putheader("User-agent", Config.UserAgent) + if self.config['cookies']: + self.config.setCookies(self.urlConnection) self.urlConnection.endheaders() return self.urlConnection.getreply() - - def _getHTTPObject(self, host): + def _getHTTPObject (self, host): h = httplib.HTTP() h.set_debuglevel(Config.DebugLevel) h.connect(host) return h - - def getContent(self): + def getContent (self): if not self.has_content: self.has_content = 1 self.closeConnection() t = time.time() - status, statusText, self.mime = self._getHttpRequest("GET") + status, statusText, self.headers = self._getHttpRequest("GET") self.urlConnection = self.urlConnection.getfile() self.data = self.urlConnection.read() self.downloadtime = time.time() - t @@ -250,29 +248,26 @@ class HttpUrlData(UrlData): Config.debug(HURT_ME_PLENTY, "comment spans", self.html_comments) return self.data - - def isHtml(self): - if not (self.valid and self.mime): + def isHtml (self): + if not (self.valid and self.headers): return 0 - return self.mime.gettype()[:9]=="text/html" + return self.headers.gettype()[:9]=="text/html" - - def robotsTxtAllowsUrl(self, config): + def robotsTxtAllowsUrl (self): roboturl = "%s://%s/robots.txt" % self.urlTuple[0:2] Config.debug(HURT_ME_PLENTY, "robots.txt url", roboturl) Config.debug(HURT_ME_PLENTY, "url", self.url) - if not config.robotsTxtCache_has_key(roboturl): + if not self.config.robotsTxtCache_has_key(roboturl): rp = robotparser.RobotFileParser() rp.set_url(roboturl) rp.read() - config.robotsTxtCache_set(roboturl, rp) - rp = config.robotsTxtCache_get(roboturl) + self.config.robotsTxtCache_set(roboturl, rp) + rp = self.config.robotsTxtCache_get(roboturl) return rp.can_fetch(Config.UserAgent, self.url) - - def closeConnection(self): - if self.mime: - try: self.mime.close() - except: pass - self.mime = None + def closeConnection (self): + #if self.headers: + # try: self.headers.close() + # except: pass + # self.headers = None UrlData.closeConnection(self) diff --git a/linkcheck/HttpsUrlData.py b/linkcheck/HttpsUrlData.py index 0603cae5..72861635 100644 --- a/linkcheck/HttpsUrlData.py +++ b/linkcheck/HttpsUrlData.py @@ -22,19 +22,18 @@ import linkcheck, Config _supportHttps = hasattr(httplib, "HTTPS") -class HttpsUrlData(HttpUrlData): +class HttpsUrlData (HttpUrlData): """Url link with https scheme""" - def _getHTTPObject(self, host): + def _getHTTPObject (self, host): h = httplib.HTTPS() h.set_debuglevel(Config.DebugLevel) h.connect(host) return h - - def _check(self, config): + def _check (self): if _supportHttps: - HttpUrlData._check(self, config) + HttpUrlData._check(self) else: self.setWarning(linkcheck._("HTTPS url ignored")) - self.logMe(config) + self.logMe() diff --git a/linkcheck/IgnoredUrlData.py b/linkcheck/IgnoredUrlData.py index ef7f9cc5..b77bc1db 100644 --- a/linkcheck/IgnoredUrlData.py +++ b/linkcheck/IgnoredUrlData.py @@ -54,9 +54,9 @@ acap # application configuration access protocol ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE) -class IgnoredUrlData(UrlData): +class IgnoredUrlData (UrlData): """Some schemes are defined in http://www.w3.org/Addressing/schemes""" - def _check(self, config): + def _check (self): self.setWarning(linkcheck._("%s url ignored")%self.scheme.capitalize()) - self.logMe(config) + self.logMe() diff --git a/linkcheck/MailtoUrlData.py b/linkcheck/MailtoUrlData.py index 0033f82d..c971b576 100644 --- a/linkcheck/MailtoUrlData.py +++ b/linkcheck/MailtoUrlData.py @@ -28,10 +28,10 @@ headers_re = re.compile(r"\?(.+)$") # or read entries from the registry (Windows systems) linkcheck.DNS.init_dns_resolver() -class MailtoUrlData(HostCheckingUrlData): +class MailtoUrlData (HostCheckingUrlData): "Url link with mailto scheme" - def buildUrl(self): + def buildUrl (self): HostCheckingUrlData.buildUrl(self) self.headers = {} self.adresses = AddressList(self._cutout_adresses()).addresslist @@ -42,8 +42,7 @@ class MailtoUrlData(HostCheckingUrlData): self.adresses.extend(AddressList(a).addresslist) Config.debug(BRING_IT_ON, "adresses: ", self.adresses) - - def _cutout_adresses(self): + def _cutout_adresses (self): mo = headers_re.search(self.urlName) if mo: headers = cgi.parse_qs(mo.group(1), strict_parsing=1) @@ -54,7 +53,7 @@ class MailtoUrlData(HostCheckingUrlData): return self.urlName[7:] - def checkConnection(self, config): + def checkConnection (self): """Verify a list of email adresses. If one adress fails, the whole list will fail. For each mail adress we check the following things: @@ -97,7 +96,6 @@ class MailtoUrlData(HostCheckingUrlData): type, value = sys.exc_info()[:2] #print type,value if smtpconnect: break - if not smtpconnect: self.setWarning(linkcheck._("None of the MX mail hosts for %s accepts an " "SMTP connection: %s") % (host, str(value))) @@ -107,7 +105,7 @@ class MailtoUrlData(HostCheckingUrlData): self.setValid(linkcheck._("found MX mail host %s") % mxrecord) - def _split_adress(self, adress): + def _split_adress (self, adress): split = adress.split("@", 1) if len(split)==2: if not split[1]: @@ -117,12 +115,10 @@ class MailtoUrlData(HostCheckingUrlData): return (split[0], "localhost") raise linkcheck.error, linkcheck._("could not split the mail adress") - - def closeConnection(self): + def closeConnection (self): try: self.urlConnection.quit() except: pass self.urlConnection = None - - def getCacheKey(self): + def getCacheKey (self): return "%s:%s" % (self.scheme, str(self.adresses)) diff --git a/linkcheck/NntpUrlData.py b/linkcheck/NntpUrlData.py index 5a70802e..762bc8f6 100644 --- a/linkcheck/NntpUrlData.py +++ b/linkcheck/NntpUrlData.py @@ -26,10 +26,10 @@ ExcList.extend([nntplib.error_reply, nntplib.error_proto, ]) -class NntpUrlData(UrlData): +class NntpUrlData (UrlData): "Url link with NNTP scheme" - def buildUrl(self): + def buildUrl (self): # use nntp instead of news to comply with the unofficial internet # draft of Alfred Gilman which unifies (s)news and nntp URLs # note: we use this only internally (for parsing and caching) @@ -40,9 +40,8 @@ class NntpUrlData(UrlData): self.urlTuple = urlparse.urlparse(self.url) debug(BRING_IT_ON, self.urlTuple) - - def checkConnection(self, config): - nntpserver = self.urlTuple[1] or config["nntpserver"] + def checkConnection (self): + nntpserver = self.urlTuple[1] or self.config["nntpserver"] if not nntpserver: self.setWarning(linkcheck._("No NNTP server specified, skipping this URL")) return @@ -66,8 +65,7 @@ class NntpUrlData(UrlData): # group name is the empty string self.setWarning(linkcheck._("No newsgroup specified in NNTP URL")) - - def _connectNntp(self, nntpserver): + def _connectNntp (self, nntpserver): """This is done only once per checking task.""" timeout = 1 while timeout: @@ -84,6 +82,5 @@ class NntpUrlData(UrlData): raise return nntp - - def getCacheKey(self): + def getCacheKey (self): return self.url diff --git a/linkcheck/StringUtil.py b/linkcheck/StringUtil.py index ece29842..56978017 100644 --- a/linkcheck/StringUtil.py +++ b/linkcheck/StringUtil.py @@ -30,52 +30,7 @@ SQLTable = [ ("'","''") ] -TeXTable = [] - -def stripHtmlComments(data): - "Remove HTML comments from data" - i = data.find("", i) - if j == -1: - break - data = data[:i] + data[j+3:] - i = data.find("