""" linkcheck/UrlData.py Copyright (C) 2000 Bastian Kleineidam This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. """ import sys,re,string,urlparse,urllib,time import Config,StringUtil,linkcheck from linkcheck import _ debug = linkcheck.Config.debug ExcList = [ IOError, ValueError, # from http11lib.py linkcheck.error, EOFError, # from ftplib.py ] try: import socket ExcList.append(socket.error) except ImportError: pass _linkMatcher = r""" (?i) # case insensitive < # open tag \s* # whitespace %s # tag name \s+ # whitespace [^>]*? # skip leading attributes %s # attrib name \s* # whitespace = # equal sign \s* # whitespace (?P # attribute value ".*?" | # in double quotes '.*?' | # in single quotes [^\s>]+) # unquoted ([^">]|".*?")* # skip trailing attributes > # close tag """ LinkTags = ( ("a", "href"), ("img", "src"), ("form", "action"), ("body", "background"), ("frame", "src"), ("link", "href"), ("meta", "url"), # ("area", "href"), ("script", "src"), ) LinkPatterns = [] for tag,attr in LinkTags: LinkPatterns.append({'pattern': re.compile(_linkMatcher % (tag, attr), re.VERBOSE), 'tag': tag, 'attr': attr}) AnchorPattern = { 'pattern': re.compile(_linkMatcher % ("a", "name"), re.VERBOSE), 'tag': 'a', 'attr': 'name', } BasePattern = { 'pattern': re.compile(_linkMatcher % ("base", "href"), re.VERBOSE), 'tag': 'base', 'attr': 'href', } class UrlData: "Representing a URL with additional information like validity etc" def __init__(self, urlName, recursionLevel, parentName = None, baseRef = None, line = 0, name = ""): self.urlName = urlName self.recursionLevel = recursionLevel self.parentName = parentName self.baseRef = baseRef self.errorString = _("Error") self.validString = _("Valid") self.warningString = None self.infoString = None self.valid = 1 self.url = None self.line = line self.name = name self.downloadtime = 0 self.checktime = 0 self.cached = 0 self.urlConnection = None self.extern = 1 self.data = None self.html_comments = [] def setError(self, s): self.valid=0 self.errorString = _("Error")+": "+s def setValid(self, s): self.valid=1 self.validString = _("Valid")+": "+s def isHtml(self): return 0 def setWarning(self, s): if self.warningString: self.warningString = self.warningString+"\n" + s else: self.warningString = s def setInfo(self, s): if self.infoString: self.infoString = self.infoString+"\n"+s else: self.infoString = s def copyFrom(self, urlData): self.errorString = urlData.errorString self.validString = urlData.validString self.warningString = urlData.warningString self.infoString = urlData.infoString self.valid = urlData.valid self.downloadtime = urlData.downloadtime def buildUrl(self): if self.baseRef: self.url = urlparse.urljoin(self.baseRef, self.urlName) elif self.parentName: self.url = urlparse.urljoin(self.parentName, self.urlName) else: self.url = self.urlName self.urlTuple = urlparse.urlparse(self.url) # make host lowercase self.urlTuple = (self.urlTuple[0],string.lower(self.urlTuple[1]), self.urlTuple[2],self.urlTuple[3],self.urlTuple[4], self.urlTuple[5]) self.url = urlparse.urlunparse(self.urlTuple) def logMe(self, config): debug("DEBUG: logging url\n") config.incrementLinknumber() if config["verbose"] or not self.valid or \ (self.warningString and config["warnings"]): config.log_newUrl(self) def check(self, config): debug(Config.DebugDelim+"Checking\n"+str(self)+"\n"+\ Config.DebugDelim) t = time.time() # check syntax debug("DEBUG: checking syntax\n") if not self.urlName or self.urlName=="": self.setError(_("URL is null or empty")) self.logMe(config) return try: self.buildUrl() self.extern = self._getExtern(config) except linkcheck.error: type, value = sys.exc_info()[:2] self.setError(str(value)) self.logMe(config) return # check the cache debug("DEBUG: checking cache\n") if config.urlCache_has_key(self.getCacheKey()): self.copyFrom(config.urlCache_get(self.getCacheKey())) self.cached = 1 self.logMe(config) return # apply filter debug("DEBUG: checking filter\n") debug("DEBUG: extern = %s\n" % str(self.extern)) if self.extern and (config["strict"] or self.extern[1]): self.setWarning(_("outside of domain filter, checked only syntax")) self.logMe(config) return # check connection debug("DEBUG: checking connection\n") try: self.checkConnection(config) if self.urlTuple and config["anchors"]: self.checkAnchors(self.urlTuple[5]) except tuple(ExcList): type, value = sys.exc_info()[:2] self.setError(str(value)) # check content warningregex = config["warningregex"] if warningregex and self.valid: debug("DEBUG: checking content\n") self.checkContent(warningregex) self.checktime = time.time() - t # check recursion debug("DEBUG: checking recursion\n") if self.allowsRecursion(config): self.parseUrl(config) self.closeConnection() self.logMe(config) self.putInCache(config) def closeConnection(self): # brute force closing if self.urlConnection is not None: try: self.urlConnection.close() except: pass # release variable for garbage collection self.urlConnection = None def putInCache(self, config): cacheKey = self.getCacheKey() if cacheKey and not self.cached: config.urlCache_set(cacheKey, self) self.cached = 1 def getCacheKey(self): if self.urlTuple: return urlparse.urlunparse(self.urlTuple) return None def checkConnection(self, config): self.urlConnection = urllib.urlopen(self.url) def allowsRecursion(self, config): Config.debug("extern: %s\n" % str(self.extern)) return self.valid and \ self.isHtml() and \ not self.cached and \ self.recursionLevel < config["recursionlevel"] and \ not self.extern def isHtml(self): return 0 def checkAnchors(self, anchor): if not (anchor!="" and self.isHtml() and self.valid): return self.getContent() for cur_anchor,line in self.searchInForTag(AnchorPattern): if cur_anchor == anchor: return self.setWarning("anchor #"+anchor+" not found") def _getExtern(self, config): if not (config["externlinks"] or config["internlinks"]): return 0 # deny and allow external checking if config["denyallow"]: for pat, strict in config["externlinks"]: if pat.search(self.url): return (1, strict) for pat in config["internlinks"]: if pat.search(self.url): return 0 return 0 else: for pat in config["internlinks"]: if pat.search(self.url): return 0 for pat, strict in config["externlinks"]: if pat.search(self.url): return (1, strict) return (1,0) raise ValueError, "internal error in UrlData._getExtern" def getContent(self): """Precondition: urlConnection is an opened URL.""" if not self.data: t = time.time() self.data = self.urlConnection.read() self.downloadtime = time.time() - t self._init_html_comments() debug("DEBUG: comment spans %s\n" % self.html_comments) return self.data def _init_html_comments(self): # if we find an URL inside HTML comments we ignore it # so build a list of intervalls which are HTML comments pattern = re.compile("