profiling

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@123 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2000-06-21 01:27:37 +00:00
parent 86fc8ec652
commit 58be481d1b
6 changed files with 43 additions and 33 deletions

View file

@ -563,7 +563,7 @@ class DnsResult:
if hasattr(u, mname):
r['data']=getattr(u, mname)()
else:
r['data']=u.getbytes(rdlength)
r['data']=u.getbytes(r['rdlength'])
return r
def dumpQ(u):

1
README
View file

@ -53,6 +53,7 @@ Note that the following packages are modified by me:
httplib.py (renamed to http11lib.py and a bug fixed)
fcgi.py (implemented immediate output)
sz_fcgi.py (simplified the code)
DNS/Lib.py:566 fixed a NameError (rdlength was undefined)
Internationalization

View file

@ -58,6 +58,7 @@ class HttpUrlData(UrlData):
| "401" ; Unauthorized
| "403" ; Forbidden
| "404" ; Not Found
| "405" ; Method not allowed
| "500" ; Internal Server Error
| "501" ; Not Implemented
| "502" ; Bad Gateway
@ -84,7 +85,7 @@ class HttpUrlData(UrlData):
tries = 0
redirected = self.urlName
while status in [301,302] and self.mime and tries < 5:
has301status = status==301
has301status = (status==301)
redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
self.urlTuple = urlparse.urlparse(redirected)
status, statusText, self.mime = self._getHttpRequest()
@ -101,13 +102,26 @@ class HttpUrlData(UrlData):
status, statusText, self.mime = self._getHttpRequest()
Config.debug("DEBUG: Authentication "+_user+"/"+_password+"\n")
# Netscape Enterprise Server 3 returns errors with HEAD
# request, but valid urls with GET request. Bummer!
# some servers get the HEAD request wrong:
# - Netscape Enterprise Server III (no HEAD implemented)
# - some advertising servers (they want only GET, don't ask why ;)
# - Zope server (it has to render the page to get the correct
# content-type)
elif status==405:
# HEAD method not allowed ==> try get
status, statusText, self.mime = self._getHttpRequest("GET")
Config.debug("DEBUG: detected 405 error\n")
elif status>=400 and self.mime:
server = self.mime.getheader("Server")
if server and self.netscape_re.search(server):
status, statusText, self.mime = self._getHttpRequest("GET")
Config.debug("DEBUG: Netscape Enterprise Server detected\n")
elif self.mime:
type = self.mime.gettype()
poweredby = self.mime.getheader('X-Powered-By')
if type=='application/octet-stream' and poweredby[:4]=='Zope':
status,statusText,self.mime = self._getHttpRequest("GET")
if status not in [301,302]: break
effectiveurl = urlparse.urlunparse(self.urlTuple)
@ -175,13 +189,6 @@ class HttpUrlData(UrlData):
def isHtml(self):
if not (self.valid and self.mime):
return 0
# some web servers (Zope) only know the mime-type when they have
# to render the whole page. Before that, they return
# "application/octet-stream"
if self.mime.gettype()=="application/octet-stream":
self.closeConnection()
self.mime = self._getHttpRequest("GET")[2]
if not self.mime: return 0
return self.mime.gettype()=="text/html"
def robotsTxtAllowsUrl(self, config):

View file

@ -26,6 +26,9 @@ word = r"[-a-zA-Z0-9,./%]+"
headers = r"\?(%s=%s(&%s=%s)*)$" % (word, word, word, word)
headers_re = re.compile(headers)
# parse /etc/resolv.conf (only on UNIX systems)
DNS.ParseResolvConf()
class MailtoUrlData(HostCheckingUrlData):
"Url link with mailto scheme"
@ -38,7 +41,7 @@ class MailtoUrlData(HostCheckingUrlData):
for val in self.headers[key]:
a = urllib.unquote(val)
self.adresses.extend(AddressList(a).addresslist)
Config.debug("DEBUG: %s\nDEBUG: %s\n" % (self.adresses, self.headers))
Config.debug("DEBUG: mailto headers: %s\n" % self.headers)
def _cutout_adresses(self):
@ -64,8 +67,8 @@ class MailtoUrlData(HostCheckingUrlData):
self.setWarning(_("No adresses found"))
return
DNS.ParseResolvConf()
for name,mail in self.adresses:
Config.debug("DEBUG: checking mail address %s" % mail)
user,host = self._split_adress(mail)
mxrecords = DNS.mxlookup(host)
if not len(mxrecords):
@ -74,6 +77,7 @@ class MailtoUrlData(HostCheckingUrlData):
smtpconnect = 0
for mxrecord in mxrecords:
try:
Config.debug("DEBUG: SMTP check for %s\n" % mxrecord)
self.urlConnection = SMTP(mxrecord[1])
smtpconnect = 1
self.urlConnection.helo()

View file

@ -21,25 +21,23 @@ import sys,re,profile,pstats
# add the path to linkcheck module if you do not install with distutils
$syspath
import linkcheck
url='http://www.yahoo.de/'
config = linkcheck.Config.Configuration()
#linkcheck.Config.DebugFlag = 1
config['recursionlevel'] = 2
config['anchors'] = 1
config['internlinks'].append(re.compile(r"^(ftp|https?)://.*yahoo.*"))
# avoid checking of local files (security!)
config["externlinks"].append((re.compile("^file:"), 1))
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
profile.run("linkcheck.checkUrls(config)", "threads.prof")
def runit(config, name):
url='http://www.heise.de/'
config['recursionlevel'] = 2
config['anchors'] = 1
config['internlinks'].append(re.compile(r"^https?://www\.heise\.de"))
# avoid checking of local files (security!)
config["externlinks"].append((re.compile("^file:"), 1))
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
profile.run("linkcheck.checkUrls(config)", name)
config.reset()
config.disableThreading()
config['recursionlevel'] = 2
config['anchors'] = 1
config['internlinks'].append(re.compile(r"^(ftp|https?)://.*yahoo.*"))
# avoid checking of local files (security!)
config["externlinks"].append((re.compile("^file:"), 1))
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
profile.run("linkcheck.checkUrls(config)", "nothreads.prof")
if __name__=='__main__':
config = linkcheck.Config.Configuration()
config.disableThreading()
runit(config, "nothreads.prof")
config.reset()
config.enableThreading(10)
runit(config, "threads.prof")

View file

@ -18,4 +18,4 @@
import pstats,glob
for file in glob.glob('*.prof'):
pstats.Stats(file).strip_dirs().sort_stats("time").print_stats(20)
pstats.Stats(file).strip_dirs().sort_stats("cumulative").print_stats(20)