mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-05 07:20:58 +00:00
profiling
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@123 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
86fc8ec652
commit
58be481d1b
6 changed files with 43 additions and 33 deletions
|
|
@ -563,7 +563,7 @@ class DnsResult:
|
|||
if hasattr(u, mname):
|
||||
r['data']=getattr(u, mname)()
|
||||
else:
|
||||
r['data']=u.getbytes(rdlength)
|
||||
r['data']=u.getbytes(r['rdlength'])
|
||||
return r
|
||||
|
||||
def dumpQ(u):
|
||||
|
|
|
|||
1
README
1
README
|
|
@ -53,6 +53,7 @@ Note that the following packages are modified by me:
|
|||
httplib.py (renamed to http11lib.py and a bug fixed)
|
||||
fcgi.py (implemented immediate output)
|
||||
sz_fcgi.py (simplified the code)
|
||||
DNS/Lib.py:566 fixed rdlength name error
|
||||
|
||||
|
||||
Internationalization
|
||||
|
|
|
|||
|
|
@ -58,6 +58,7 @@ class HttpUrlData(UrlData):
|
|||
| "401" ; Unauthorized
|
||||
| "403" ; Forbidden
|
||||
| "404" ; Not Found
|
||||
| "405" ; Method not allowed
|
||||
| "500" ; Internal Server Error
|
||||
| "501" ; Not Implemented
|
||||
| "502" ; Bad Gateway
|
||||
|
|
@ -84,7 +85,7 @@ class HttpUrlData(UrlData):
|
|||
tries = 0
|
||||
redirected = self.urlName
|
||||
while status in [301,302] and self.mime and tries < 5:
|
||||
has301status = status==301
|
||||
has301status = (status==301)
|
||||
redirected = urlparse.urljoin(redirected, self.mime.getheader("Location"))
|
||||
self.urlTuple = urlparse.urlparse(redirected)
|
||||
status, statusText, self.mime = self._getHttpRequest()
|
||||
|
|
@ -101,13 +102,26 @@ class HttpUrlData(UrlData):
|
|||
status, statusText, self.mime = self._getHttpRequest()
|
||||
Config.debug("DEBUG: Authentication "+_user+"/"+_password+"\n")
|
||||
|
||||
# Netscape Enterprise Server 3 returns errors with HEAD
|
||||
# request, but valid urls with GET request. Bummer!
|
||||
# some servers get the HEAD request wrong:
|
||||
# - Netscape Enterprise Server III (no HEAD implemented)
|
||||
# - some advertising servers (they want only GET, don't ask why ;)
|
||||
# - Zope server (it has to render the page to get the correct
|
||||
# content-type)
|
||||
elif status==405:
|
||||
# HEAD method not allowed ==> try get
|
||||
status, statusText, self.mime = self._getHttpRequest("GET")
|
||||
Config.debug("DEBUG: detected 405 error\n")
|
||||
elif status>=400 and self.mime:
|
||||
server = self.mime.getheader("Server")
|
||||
if server and self.netscape_re.search(server):
|
||||
status, statusText, self.mime = self._getHttpRequest("GET")
|
||||
Config.debug("DEBUG: Netscape Enterprise Server detected\n")
|
||||
elif self.mime:
|
||||
type = self.mime.gettype()
|
||||
poweredby = self.mime.getheader('X-Powered-By')
|
||||
if type=='application/octet-stream' and poweredby[:4]=='Zope':
|
||||
status,statusText,self.mime = self._getHttpRequest("GET")
|
||||
|
||||
if status not in [301,302]: break
|
||||
|
||||
effectiveurl = urlparse.urlunparse(self.urlTuple)
|
||||
|
|
@ -175,13 +189,6 @@ class HttpUrlData(UrlData):
|
|||
def isHtml(self):
|
||||
if not (self.valid and self.mime):
|
||||
return 0
|
||||
# some web servers (Zope) only know the mime-type when they have
|
||||
# to render the whole page. Before that, they return
|
||||
# "application/octet-stream"
|
||||
if self.mime.gettype()=="application/octet-stream":
|
||||
self.closeConnection()
|
||||
self.mime = self._getHttpRequest("GET")[2]
|
||||
if not self.mime: return 0
|
||||
return self.mime.gettype()=="text/html"
|
||||
|
||||
def robotsTxtAllowsUrl(self, config):
|
||||
|
|
|
|||
|
|
@ -26,6 +26,9 @@ word = r"[-a-zA-Z0-9,./%]+"
|
|||
headers = r"\?(%s=%s(&%s=%s)*)$" % (word, word, word, word)
|
||||
headers_re = re.compile(headers)
|
||||
|
||||
# parse /etc/resolv.conf (only on UNIX systems)
|
||||
DNS.ParseResolvConf()
|
||||
|
||||
class MailtoUrlData(HostCheckingUrlData):
|
||||
"Url link with mailto scheme"
|
||||
|
||||
|
|
@ -38,7 +41,7 @@ class MailtoUrlData(HostCheckingUrlData):
|
|||
for val in self.headers[key]:
|
||||
a = urllib.unquote(val)
|
||||
self.adresses.extend(AddressList(a).addresslist)
|
||||
Config.debug("DEBUG: %s\nDEBUG: %s\n" % (self.adresses, self.headers))
|
||||
Config.debug("DEBUG: mailto headers: %s\n" % self.headers)
|
||||
|
||||
|
||||
def _cutout_adresses(self):
|
||||
|
|
@ -64,8 +67,8 @@ class MailtoUrlData(HostCheckingUrlData):
|
|||
self.setWarning(_("No adresses found"))
|
||||
return
|
||||
|
||||
DNS.ParseResolvConf()
|
||||
for name,mail in self.adresses:
|
||||
Config.debug("DEBUG: checking mail address %s" % mail)
|
||||
user,host = self._split_adress(mail)
|
||||
mxrecords = DNS.mxlookup(host)
|
||||
if not len(mxrecords):
|
||||
|
|
@ -74,6 +77,7 @@ class MailtoUrlData(HostCheckingUrlData):
|
|||
smtpconnect = 0
|
||||
for mxrecord in mxrecords:
|
||||
try:
|
||||
Config.debug("DEBUG: SMTP check for %s\n" % mxrecord)
|
||||
self.urlConnection = SMTP(mxrecord[1])
|
||||
smtpconnect = 1
|
||||
self.urlConnection.helo()
|
||||
|
|
|
|||
|
|
@ -21,25 +21,23 @@ import sys,re,profile,pstats
|
|||
# add the path to linkcheck module if you do not install with distutils
|
||||
$syspath
|
||||
import linkcheck
|
||||
url='http://www.yahoo.de/'
|
||||
config = linkcheck.Config.Configuration()
|
||||
#linkcheck.Config.DebugFlag = 1
|
||||
|
||||
config['recursionlevel'] = 2
|
||||
config['anchors'] = 1
|
||||
config['internlinks'].append(re.compile(r"^(ftp|https?)://.*yahoo.*"))
|
||||
# avoid checking of local files (security!)
|
||||
config["externlinks"].append((re.compile("^file:"), 1))
|
||||
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
|
||||
profile.run("linkcheck.checkUrls(config)", "threads.prof")
|
||||
def runit(config, name):
|
||||
url='http://www.heise.de/'
|
||||
config['recursionlevel'] = 2
|
||||
config['anchors'] = 1
|
||||
config['internlinks'].append(re.compile(r"^https?://www\.heise\.de"))
|
||||
# avoid checking of local files (security!)
|
||||
config["externlinks"].append((re.compile("^file:"), 1))
|
||||
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
|
||||
profile.run("linkcheck.checkUrls(config)", name)
|
||||
|
||||
config.reset()
|
||||
|
||||
config.disableThreading()
|
||||
config['recursionlevel'] = 2
|
||||
config['anchors'] = 1
|
||||
config['internlinks'].append(re.compile(r"^(ftp|https?)://.*yahoo.*"))
|
||||
# avoid checking of local files (security!)
|
||||
config["externlinks"].append((re.compile("^file:"), 1))
|
||||
config.appendUrl(linkcheck.UrlData.GetUrlDataFrom(url, 0))
|
||||
profile.run("linkcheck.checkUrls(config)", "nothreads.prof")
|
||||
if __name__=='__main__':
|
||||
config = linkcheck.Config.Configuration()
|
||||
config.disableThreading()
|
||||
runit(config, "nothreads.prof")
|
||||
config.reset()
|
||||
config.enableThreading(10)
|
||||
runit(config, "threads.prof")
|
||||
|
||||
|
|
|
|||
|
|
@ -18,4 +18,4 @@
|
|||
import pstats,glob
|
||||
|
||||
for file in glob.glob('*.prof'):
|
||||
pstats.Stats(file).strip_dirs().sort_stats("time").print_stats(20)
|
||||
pstats.Stats(file).strip_dirs().sort_stats("cumulative").print_stats(20)
|
||||
|
|
|
|||
Loading…
Reference in a new issue