From 67fabd5d8eebf2e9d37df471c6aedd58ae64c42b Mon Sep 17 00:00:00 2001 From: calvin Date: Thu, 11 Mar 2004 11:01:00 +0000 Subject: [PATCH] addd contact email and url to user-agent string git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1284 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- ChangeLog | 5 ++++ linkcheck/Config.py | 67 +++++++++++++++++++++++++++++++-------------- 2 files changed, 51 insertions(+), 21 deletions(-) diff --git a/ChangeLog b/ChangeLog index 0056da91..e4120b85 100644 --- a/ChangeLog +++ b/ChangeLog @@ -22,6 +22,11 @@ easier reading Type: documentation Changed: linkchecker, linkchecker.1 + * added contact url and email to the HTTP User-Agent string, which + gets us more accepted by some bot-blocking software; also see + http://www.livejournal.com/bots/ + Type: feature + Changed: linkcheck/Config.py 1.12.1 (release 21.2.2004) * raise IncompleteRead instead of ValueError on malformed chunked diff --git a/linkcheck/Config.py b/linkcheck/Config.py index 1b04c24e..2c462d6a 100644 --- a/linkcheck/Config.py +++ b/linkcheck/Config.py @@ -28,12 +28,11 @@ try: import threading as _threading except ImportError: import dummy_threading as _threading -import Queue, Threader +import Threader Version = _linkchecker_configdata.version AppName = "LinkChecker" App = AppName+" "+Version -UserAgent = AppName+"/"+Version Author = _linkchecker_configdata.author HtmlAuthor = Author.replace(' ', ' ') Copyright = "Copyright © 2000-2004 "+Author @@ -42,6 +41,7 @@ AppInfo = App+" "+Copyright HtmlAppInfo = App+", "+HtmlCopyright Url = _linkchecker_configdata.url Email = _linkchecker_configdata.author_email +UserAgent = "%s/%s (%s; %s)" % (AppName, Version, Url, Email) Freeware = AppName+""" comes with ABSOLUTELY NO WARRANTY! This is free software, and you are welcome to redistribute it under certain conditions. Look at the file `LICENSE' within this @@ -161,21 +161,24 @@ class Configuration (dict): } self['none'] = {} self['log'] = self.newLogger('text') + self.logLock = _threading.Lock() self["quiet"] = False self["warningregex"] = None self["warnsizebytes"] = None self["nntpserver"] = os.environ.get("NNTP_SERVER",None) + self["threads"] = True self.threader = Threader.Threader() self.setThreads(10) self.urlSeen = Set() + self.urlSeenLock = _threading.Lock() self.urlCache = LRU(MAX_URL_CACHE) - self.robotsTxtCache = LRU(MAX_ROBOTS_TXT_CACHE) - self["threads"] = True - self.urlsLock = _threading.Lock() self.urlCacheLock = _threading.Lock() + self.robotsTxtCache = LRU(MAX_ROBOTS_TXT_CACHE) self.robotsTxtCacheLock = _threading.Lock() - self.logLock = _threading.Lock() - self.urls = Queue.Queue(0) + self.urls = [] + self.urlCounter = 0 + self.urlsLock = _threading.Lock() + # basic data lock (eg for cookies, link numbers etc.) self.dataLock = _threading.Lock() self.cookies = LRU(MAX_COOKIES_CACHE) @@ -226,11 +229,11 @@ class Configuration (dict): def hasMoreUrls (self): - return not self.urls.empty() + return self.urls def finished (self): - return self.threader.finished() and self.urls.empty() + return self.threader.finished() and not self.urls def finish (self): @@ -238,17 +241,39 @@ class Configuration (dict): def appendUrl (self, urlData): - # check syntax - if not urlData.checkSyntax(): - return - # check the cache - if not urlData.checkCache(): - return - self.urls.put(urlData) + self.urlsLock.acquire() + try: + # check syntax + if not urlData.checkSyntax(): + return + # check the cache + if not urlData.checkCache(): + return + self.urlCounter += 1 + if self.urlCounter==1000: + self.urlCounter = 0 + self.filterUrlQueue() + self.urls.append(urlData) + finally: + self.urlsLock.release() + + + def filterUrlQueue (self): + """remove already cached urls from queue""" + # note: url lock must be acquired + olen = len(self.urls) + self.urls = [ u for u in self.urls if u.checkCache() ] + removed = olen - len(self.urls) + print >>sys.stderr, \ + i18n._("removed %d cached urls from incoming queue")%removed def getUrl (self): - return self.urls.get() + self.urlsLock.acquire() + try: + return self.urls.pop() + finally: + self.urlsLock.release() def checkUrl (self, url): @@ -256,19 +281,19 @@ class Configuration (dict): def urlSeen_has_key (self, key): - self.urlsLock.acquire() + self.urlSeenLock.acquire() try: return key in self.urlSeen finally: - self.urlsLock.release() + self.urlSeenLock.release() def urlSeen_set (self, key): - self.urlsLock.acquire() + self.urlSeenLock.acquire() try: self.urlSeen.add(key) finally: - self.urlsLock.release() + self.urlSeenLock.release() def urlCache_has_key (self, key):