From 58be481d1becc0ec7cc21d088d815531fd87e2e5 Mon Sep 17 00:00:00 2001 From: calvin Date: Wed, 21 Jun 2000 01:27:37 +0000 Subject: [PATCH] profiling git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@123 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- DNS/Lib.py | 2 +- README | 1 + linkcheck/HttpUrlData.py | 27 +++++++++++++++++---------- linkcheck/MailtoUrlData.py | 8 ++++++-- test/profiletest.py.tmpl | 36 +++++++++++++++++------------------- test/viewprof.py | 2 +- 6 files changed, 43 insertions(+), 33 deletions(-) diff --git a/DNS/Lib.py b/DNS/Lib.py index 111fe0c6..424de3b8 100644 --- a/DNS/Lib.py +++ b/DNS/Lib.py @@ -563,7 +563,7 @@ class DnsResult: if hasattr(u, mname): r['data']=getattr(u, mname)() else: - r['data']=u.getbytes(rdlength) + r['data']=u.getbytes(r['rdlength']) return r def dumpQ(u): diff --git a/README b/README index e2a56021..f43e34bf 100644 --- a/README +++ b/README @@ -53,6 +53,7 @@ Note that the following packages are modified by me: httplib.py (renamed to http11lib.py and a bug fixed) fcgi.py (implemented immediate output) sz_fcgi.py (simplified the code) +DNS/Lib.py:566 fixed rdlength name error Internationalization diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py index ae0516b8..d164698a 100644 --- a/linkcheck/HttpUrlData.py +++ b/linkcheck/HttpUrlData.py @@ -58,6 +58,7 @@ class HttpUrlData(UrlData): | "401" ; Unauthorized | "403" ; Forbidden | "404" ; Not Found + | "405" ; Method not allowed | "500" ; Internal Server Error | "501" ; Not Implemented | "502" ; Bad Gateway @@ -84,7 +85,7 @@ class HttpUrlData(UrlData): tries = 0 redirected = self.urlName while status in [301,302] and self.mime and tries < 5: - has301status = status==301 + has301status = (status==301) redirected = urlparse.urljoin(redirected, self.mime.getheader("Location")) self.urlTuple = urlparse.urlparse(redirected) status, statusText, self.mime = self._getHttpRequest() @@ -101,13 +102,26 @@ class HttpUrlData(UrlData): status, statusText, self.mime = self._getHttpRequest() Config.debug("DEBUG: Authentication "+_user+"/"+_password+"\n") - # Netscape Enterprise Server 3 returns errors with HEAD - # request, but valid urls with GET request. Bummer! + # some servers get the HEAD request wrong: + # - Netscape Enterprise Server III (no HEAD implemented) + # - some advertisings (they want only GET, dont ask why ;) + # - Zope server (it has to render the page to get the correct + # content-type + elif status==405: + # HEAD method not allowed ==> try get + status, statusText, self.mime = self._getHttpRequest("GET") + Config.debug("DEBUG: detected 405 error\n") elif status>=400 and self.mime: server = self.mime.getheader("Server") if server and self.netscape_re.search(server): status, statusText, self.mime = self._getHttpRequest("GET") Config.debug("DEBUG: Netscape Enterprise Server detected\n") + elif self.mime: + type = self.mime.gettype() + poweredby = self.mime.getheader('X-Powered-By') + if type=='application/octet-stream' and poweredby[:4]=='Zope': + status,statusText,self.mime = self._getHttpRequest("GET") + if status not in [301,302]: break effectiveurl = urlparse.urlunparse(self.urlTuple) @@ -175,13 +189,6 @@ class HttpUrlData(UrlData): def isHtml(self): if not (self.valid and self.mime): return 0 - # some web servers (Zope) only know the mime-type when they have - # to render the whole page. Before that, they return - # "application/octet-stream" - if self.mime.gettype()=="application/octet-stream": - self.closeConnection() - self.mime = self._getHttpRequest("GET")[2] - if not self.mime: return 0 return self.mime.gettype()=="text/html" def robotsTxtAllowsUrl(self, config): diff --git a/linkcheck/MailtoUrlData.py b/linkcheck/MailtoUrlData.py index b904ccbe..18017e5d 100644 --- a/linkcheck/MailtoUrlData.py +++ b/linkcheck/MailtoUrlData.py @@ -26,6 +26,9 @@ word = r"[-a-zA-Z0-9,./%]+" headers = r"\?(%s=%s(&%s=%s)*)$" % (word, word, word, word) headers_re = re.compile(headers) +# parse /etc/resolv.conf (only on UNIX systems) +DNS.ParseResolvConf() + class MailtoUrlData(HostCheckingUrlData): "Url link with mailto scheme" @@ -38,7 +41,7 @@ class MailtoUrlData(HostCheckingUrlData): for val in self.headers[key]: a = urllib.unquote(val) self.adresses.extend(AddressList(a).addresslist) - Config.debug("DEBUG: %s\nDEBUG: %s\n" % (self.adresses, self.headers)) + Config.debug("DEBUG: mailto headers: %s\n" % self.headers) def _cutout_adresses(self): @@ -64,8 +67,8 @@ class MailtoUrlData(HostCheckingUrlData): self.setWarning(_("No adresses found")) return - DNS.ParseResolvConf() for name,mail in self.adresses: + Config.debug("DEBUG: checking mail address %s" % mail) user,host = self._split_adress(mail) mxrecords = DNS.mxlookup(host) if not len(mxrecords): @@ -74,6 +77,7 @@ class MailtoUrlData(HostCheckingUrlData): smtpconnect = 0 for mxrecord in mxrecords: try: + Config.debug("DEBUG: SMTP check for %s\n" % mxrecord) self.urlConnection = SMTP(mxrecord[1]) smtpconnect = 1 self.urlConnection.helo() diff --git a/test/profiletest.py.tmpl b/test/profiletest.py.tmpl index 85b20c15..8919e73d 100755 --- a/test/profiletest.py.tmpl +++ b/test/profiletest.py.tmpl @@ -21,25 +21,23 @@ import sys,re,profile,pstats # add the path to linkcheck module if you do not install with distutils $syspath import linkcheck -url='http://www.yahoo.de/' -config = linkcheck.Config.Configuration() +#linkcheck.Config.DebugFlag = 1 -config['recursionlevel'] = 2 -config['anchors'] = 1 -config['internlinks'].append(re.compile(r"^(ftp|https?)://.*yahoo.*")) -# avoid checking of local files (security!) -config["externlinks"].append((re.compile("^file:"), 1)) -config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0)) -profile.run("linkcheck.checkUrls(config)", "threads.prof") +def runit(config, name): + url='http://www.heise.de/' + config['recursionlevel'] = 2 + config['anchors'] = 1 + config['internlinks'].append(re.compile(r"^https?://www\.heise\.de")) + # avoid checking of local files (security!) + config["externlinks"].append((re.compile("^file:"), 1)) + config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0)) + profile.run("linkcheck.checkUrls(config)", name) -config.reset() - -config.disableThreading() -config['recursionlevel'] = 2 -config['anchors'] = 1 -config['internlinks'].append(re.compile(r"^(ftp|https?)://.*yahoo.*")) -# avoid checking of local files (security!) -config["externlinks"].append((re.compile("^file:"), 1)) -config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0)) -profile.run("linkcheck.checkUrls(config)", "nothreads.prof") +if __name__=='__main__': + config = linkcheck.Config.Configuration() + config.disableThreading() + runit(config, "nothreads.prof") + config.reset() + config.enableThreading(10) + runit(config, "threads.prof") diff --git a/test/viewprof.py b/test/viewprof.py index 5d6dd5af..32f8c4a6 100644 --- a/test/viewprof.py +++ b/test/viewprof.py @@ -18,4 +18,4 @@ import pstats,glob for file in glob.glob('*.prof'): - pstats.Stats(file).strip_dirs().sort_stats("time").print_stats(20) + pstats.Stats(file).strip_dirs().sort_stats("cumulative").print_stats(20)