profiling

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@123 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-05-22 21:25:48 +00:00 · 2000-06-21 01:27:37 +00:00 · 2000-06-21 01:27:37 +00:00 · 58be481d1b
commit 58be481d1b
parent 86fc8ec652
6 changed files with 43 additions and 33 deletions
--- a/DNS/Lib.py
+++ b/DNS/Lib.py
@ -563,7 +563,7 @@ class DnsResult:
 	if hasattr(u, mname):
 	    r['data']=getattr(u, mname)()
 	else:
-	    r['data']=u.getbytes(rdlength)
+	    r['data']=u.getbytes(r['rdlength'])
 	return r

 def dumpQ(u):
--- a/1
+++ b/1
@ -53,6 +53,7 @@ Note that the following packages are modified by me:
 httplib.py (renamed to http11lib.py and a bug fixed)
 fcgi.py (implemented immediate output)
 sz_fcgi.py (simplified the code)
+DNS/Lib.py:566 fixed rdlength name error


 Internationalization
--- a/linkcheck/HttpUrlData.py
+++ b/linkcheck/HttpUrlData.py
@ -58,6 +58,7 @@ class HttpUrlData(UrlData):
        | "401"   ; Unauthorized
        | "403"   ; Forbidden
        | "404"   ; Not Found
+        | "405"   ; Method Not Allowed
        | "500"   ; Internal Server Error
        | "501"   ; Not Implemented
        | "502"   ; Bad Gateway
@ -84,7 +85,7 @@ class HttpUrlData(UrlData):
            tries = 0
            redirected = self.urlName
            while status in [301,302] and self.mime and tries < 5:
-                has301status = status==301
+                has301status = (status==301)
                redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
                self.urlTuple = urlparse.urlparse(redirected)
                status, statusText, self.mime = self._getHttpRequest()
@ -101,13 +102,26 @@ class HttpUrlData(UrlData):
                status, statusText, self.mime = self._getHttpRequest()
                Config.debug("DEBUG: Authentication "+_user+"/"+_password+"\n")

-            # Netscape Enterprise Server 3 returns errors with HEAD
-            # request, but valid urls with GET request. Bummer!
+            # some servers get the HEAD request wrong:
+            # - Netscape Enterprise Server III (no HEAD implemented)
+            # - some advertising servers (they want only GET, don't ask why ;)
+            # - Zope server (it has to render the page to get the correct
+            #   content-type)
+            elif status==405:
+                # HEAD method not allowed ==> try GET
+                status, statusText, self.mime = self._getHttpRequest("GET")
+                Config.debug("DEBUG: detected 405 error\n")
            elif status>=400 and self.mime:
                server = self.mime.getheader("Server")
                if server and self.netscape_re.search(server):
                    status, statusText, self.mime = self._getHttpRequest("GET")
                    Config.debug("DEBUG: Netscape Enterprise Server detected\n")
+            elif self.mime:
+                type = self.mime.gettype()
+                poweredby = self.mime.getheader('X-Powered-By')
+                if type=='application/octet-stream' and poweredby[:4]=='Zope':
+                    status,statusText,self.mime = self._getHttpRequest("GET")
+
            if status not in [301,302]: break

        effectiveurl = urlparse.urlunparse(self.urlTuple)
@ -175,13 +189,6 @@ class HttpUrlData(UrlData):
    def isHtml(self):
        if not (self.valid and self.mime):
            return 0
-        # some web servers (Zope) only know the mime-type when they have
-        # to render the whole page. Before that, they return
-        # "application/octet-stream"
-        if self.mime.gettype()=="application/octet-stream":
-            self.closeConnection()
-            self.mime = self._getHttpRequest("GET")[2]
-            if not self.mime: return 0
        return self.mime.gettype()=="text/html"

    def robotsTxtAllowsUrl(self, config):
--- a/linkcheck/MailtoUrlData.py
+++ b/linkcheck/MailtoUrlData.py
@ -26,6 +26,9 @@ word = r"[-a-zA-Z0-9,./%]+"
 headers = r"\?(%s=%s(&%s=%s)*)$" % (word, word, word, word)
 headers_re = re.compile(headers)

+# parse /etc/resolv.conf (only on UNIX systems)
+DNS.ParseResolvConf()
+
 class MailtoUrlData(HostCheckingUrlData):
    "Url link with mailto scheme"
    
@ -38,7 +41,7 @@ class MailtoUrlData(HostCheckingUrlData):
                for val in self.headers[key]:
                    a = urllib.unquote(val)
                    self.adresses.extend(AddressList(a).addresslist)
-        Config.debug("DEBUG: %s\nDEBUG: %s\n" % (self.adresses, self.headers))
+        Config.debug("DEBUG: mailto headers: %s\n" % self.headers)


    def _cutout_adresses(self):
@ -64,8 +67,8 @@ class MailtoUrlData(HostCheckingUrlData):
            self.setWarning(_("No adresses found"))
            return

-        DNS.ParseResolvConf()
        for name,mail in self.adresses:
+            Config.debug("DEBUG: checking mail address %s" % mail)
            user,host = self._split_adress(mail)
            mxrecords = DNS.mxlookup(host)
            if not len(mxrecords):
@ -74,6 +77,7 @@ class MailtoUrlData(HostCheckingUrlData):
            smtpconnect = 0
            for mxrecord in mxrecords:
                try:
+                    Config.debug("DEBUG: SMTP check for %s\n" % mxrecord)
                    self.urlConnection = SMTP(mxrecord[1])
                    smtpconnect = 1
                    self.urlConnection.helo()
--- a/test/profiletest.py.tmpl
+++ b/test/profiletest.py.tmpl
@ -21,25 +21,23 @@ import sys,re,profile,pstats
 # add the path to linkcheck module if you do not install with distutils
 $syspath
 import linkcheck
-url='http://www.yahoo.de/'
-config = linkcheck.Config.Configuration()
+#linkcheck.Config.DebugFlag = 1

-config['recursionlevel'] = 2
-config['anchors'] = 1
-config['internlinks'].append(re.compile(r"^(ftp|https?)://.*yahoo.*"))
-# avoid checking of local files (security!)
-config["externlinks"].append((re.compile("^file:"), 1))
-config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
-profile.run("linkcheck.checkUrls(config)", "threads.prof")
+def runit(config, name):
+    url='http://www.heise.de/'
+    config['recursionlevel'] = 2
+    config['anchors'] = 1
+    config['internlinks'].append(re.compile(r"^https?://www\.heise\.de"))
+    # avoid checking of local files (security!)
+    config["externlinks"].append((re.compile("^file:"), 1))
+    config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
+    profile.run("linkcheck.checkUrls(config)", name)

-config.reset()
-
-config.disableThreading()
-config['recursionlevel'] = 2
-config['anchors'] = 1
-config['internlinks'].append(re.compile(r"^(ftp|https?)://.*yahoo.*"))
-# avoid checking of local files (security!)
-config["externlinks"].append((re.compile("^file:"), 1))
-config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
-profile.run("linkcheck.checkUrls(config)", "nothreads.prof")
+if __name__=='__main__':
+    config = linkcheck.Config.Configuration()
+    config.disableThreading()
+    runit(config, "nothreads.prof")
+    config.reset()
+    config.enableThreading(10)
+    runit(config, "threads.prof")

--- a/test/viewprof.py
+++ b/test/viewprof.py
@ -18,4 +18,4 @@
 import pstats,glob

 for file in glob.glob('*.prof'):
-    pstats.Stats(file).strip_dirs().sort_stats("time").print_stats(20)
+    pstats.Stats(file).strip_dirs().sort_stats("cumulative").print_stats(20)