add contact email and url to user-agent string

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1284 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-03-11 11:01:00 +00:00
parent 0c8b3653d4
commit 67fabd5d8e
2 changed files with 51 additions and 21 deletions

View file

@ -22,6 +22,11 @@
easier reading
Type: documentation
Changed: linkchecker, linkchecker.1
* added contact url and email to the HTTP User-Agent string, which
gets us more accepted by some bot-blocking software; also see
http://www.livejournal.com/bots/
Type: feature
Changed: linkcheck/Config.py
1.12.1 (release 21.2.2004)
* raise IncompleteRead instead of ValueError on malformed chunked

View file

@ -28,12 +28,11 @@ try:
import threading as _threading
except ImportError:
import dummy_threading as _threading
import Queue, Threader
import Threader
Version = _linkchecker_configdata.version
AppName = "LinkChecker"
App = AppName+" "+Version
UserAgent = AppName+"/"+Version
Author = _linkchecker_configdata.author
HtmlAuthor = Author.replace(' ', ' ')
Copyright = "Copyright © 2000-2004 "+Author
@ -42,6 +41,7 @@ AppInfo = App+" "+Copyright
HtmlAppInfo = App+", "+HtmlCopyright
Url = _linkchecker_configdata.url
Email = _linkchecker_configdata.author_email
UserAgent = "%s/%s (%s; %s)" % (AppName, Version, Url, Email)
Freeware = AppName+""" comes with ABSOLUTELY NO WARRANTY!
This is free software, and you are welcome to redistribute it
under certain conditions. Look at the file `LICENSE' within this
@ -161,21 +161,24 @@ class Configuration (dict):
}
self['none'] = {}
self['log'] = self.newLogger('text')
self.logLock = _threading.Lock()
self["quiet"] = False
self["warningregex"] = None
self["warnsizebytes"] = None
self["nntpserver"] = os.environ.get("NNTP_SERVER",None)
self["threads"] = True
self.threader = Threader.Threader()
self.setThreads(10)
self.urlSeen = Set()
self.urlSeenLock = _threading.Lock()
self.urlCache = LRU(MAX_URL_CACHE)
self.robotsTxtCache = LRU(MAX_ROBOTS_TXT_CACHE)
self["threads"] = True
self.urlsLock = _threading.Lock()
self.urlCacheLock = _threading.Lock()
self.robotsTxtCache = LRU(MAX_ROBOTS_TXT_CACHE)
self.robotsTxtCacheLock = _threading.Lock()
self.logLock = _threading.Lock()
self.urls = Queue.Queue(0)
self.urls = []
self.urlCounter = 0
self.urlsLock = _threading.Lock()
# basic data lock (eg for cookies, link numbers etc.)
self.dataLock = _threading.Lock()
self.cookies = LRU(MAX_COOKIES_CACHE)
@ -226,11 +229,11 @@ class Configuration (dict):
def hasMoreUrls (self):
return not self.urls.empty()
return self.urls
def finished (self):
return self.threader.finished() and self.urls.empty()
return self.threader.finished() and not self.urls
def finish (self):
@ -238,17 +241,39 @@ class Configuration (dict):
def appendUrl (self, urlData):
# check syntax
if not urlData.checkSyntax():
return
# check the cache
if not urlData.checkCache():
return
self.urls.put(urlData)
self.urlsLock.acquire()
try:
# check syntax
if not urlData.checkSyntax():
return
# check the cache
if not urlData.checkCache():
return
self.urlCounter += 1
if self.urlCounter==1000:
self.urlCounter = 0
self.filterUrlQueue()
self.urls.append(urlData)
finally:
self.urlsLock.release()
def filterUrlQueue (self):
"""remove already cached urls from queue"""
# note: url lock must be acquired
olen = len(self.urls)
self.urls = [ u for u in self.urls if u.checkCache() ]
removed = olen - len(self.urls)
print >>sys.stderr, \
i18n._("removed %d cached urls from incoming queue")%removed
def getUrl (self):
return self.urls.get()
self.urlsLock.acquire()
try:
return self.urls.pop()
finally:
self.urlsLock.release()
def checkUrl (self, url):
@ -256,19 +281,19 @@ class Configuration (dict):
def urlSeen_has_key (self, key):
self.urlsLock.acquire()
self.urlSeenLock.acquire()
try:
return key in self.urlSeen
finally:
self.urlsLock.release()
self.urlSeenLock.release()
def urlSeen_set (self, key):
self.urlsLock.acquire()
self.urlSeenLock.acquire()
try:
self.urlSeen.add(key)
finally:
self.urlsLock.release()
self.urlSeenLock.release()
def urlCache_has_key (self, key):