# -*- coding: iso-8859-1 -*-
"""Store cached data during checking."""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

import Cookie
try:
    import threading
except ImportError:
    import dummy_threading as threading

import linkcheck
import linkcheck.log
import linkcheck.containers
import linkcheck.configuration
# needed by Cache.robots_txt_allows_url(); previously relied on another
# module having imported it first
import linkcheck.robotparser2
import linkcheck.threader


def _check_morsel (m, host, path):
    """Check given cookie morsel against the desired host and path.

    @param m: a Cookie.Morsel instance
    @param host: the request host to match against the cookie domain
    @param path: the request path to match against the cookie path
    @return: the cookie header value string if the morsel applies,
       else None
    """
    # check domain (if it is stored)
    if m["domain"] and not host.endswith(m["domain"]):
        return None
    # check path (if it is stored)
    if m["path"] and not path.startswith(m["path"]):
        return None
    # check expiry date (if it is stored)
    if m["expires"]:
        linkcheck.log.debug(linkcheck.LOG_CACHE, "Cookie expires %s",
                            m["expires"])
        # XXX check cookie expiration
    return m.output(header='').strip()


class Cache (object):
    """Store and provide routines for cached data. Currently there are
       caches for cookies, check urls and robots.txt contents.
       All public operations (except __init__()) are thread-safe.
    """

    def __init__ (self):
        """Initialize the default options."""
        # one big lock for all caches and queues
        self.lock = threading.Lock()
        # already checked urls: cache_url_key -> cache data
        self.checked = {}
        # urls that are being checked: cache_url_key -> url_data
        self.in_progress = {}
        # to-be-checked urls (list of url_data objects)
        self.incoming = []
        # downloaded robots.txt files: robots url -> parser
        self.robots_txt = {}
        # stored cookies: host -> Cookie.SimpleCookie
        self.cookies = {}

    def incoming_is_empty (self):
        """Check if the incoming queue is empty."""
        self.lock.acquire()
        try:
            return len(self.incoming) <= 0
        finally:
            self.lock.release()

    def incoming_get_url (self):
        """Get first not-in-progress url from the incoming queue and
           return it. If no such url is available return None. The
           url might be already cached.
        """
        self.lock.acquire()
        try:
            for i, url_data in enumerate(self.incoming):
                key = url_data.cache_url_key
                if key not in self.in_progress:
                    del self.incoming[i]
                    if key in self.checked:
                        # url is cached and can be logged
                        url_data.copy_from_cache(self.checked[key])
                    else:
                        # mark as being checked
                        self.in_progress[key] = url_data
                    return url_data
            return None
        finally:
            self.lock.release()

    def incoming_len (self):
        """Return number of entries in incoming queue."""
        self.lock.acquire()
        try:
            return len(self.incoming)
        finally:
            self.lock.release()

    def incoming_add (self, url_data):
        """Add new URL to list of URLs to check.

        @return: True if the url was queued, False if it had invalid
           syntax or was already cached (and was logged from cache)
        """
        self.lock.acquire()
        try:
            linkcheck.log.debug(linkcheck.LOG_CACHE, "Add url %s..",
                                url_data)
            # check syntax
            if not url_data.check_syntax():
                # wrong syntax, do not check any further
                return False
            # check the cache
            key = url_data.cache_url_key
            if key in self.checked:
                # url is cached and can be logged
                url_data.copy_from_cache(self.checked[key])
                return False
            self.incoming.append(url_data)
            linkcheck.log.debug(linkcheck.LOG_CACHE, "..added.")
            return True
        finally:
            self.lock.release()

    def has_incoming (self, key):
        """Check if given cache key is in the incoming queue."""
        self.lock.acquire()
        try:
            return key in self.incoming
        finally:
            self.lock.release()

    def has_in_progress (self, key):
        """Check if given cache key is in the in-progress cache."""
        self.lock.acquire()
        try:
            return key in self.in_progress
        finally:
            self.lock.release()

    def in_progress_remove (self, url_data):
        """Remove url from in-progress cache."""
        self.lock.acquire()
        try:
            key = url_data.cache_url_key
            assert key in self.in_progress
            del self.in_progress[key]
        finally:
            self.lock.release()

    def checked_add (self, url_data):
        """Cache checked url data, moving it out of the in-progress
           cache. The data is also stored under all url aliases.
        """
        self.lock.acquire()
        try:
            data = url_data.get_cache_data()
            key = url_data.cache_url_key
            assert key not in self.checked
            assert key in self.in_progress
            del self.in_progress[key]
            self.checked[key] = data
            # also append all aliases
            # (use a distinct name; the original shadowed 'key' here)
            for alias in url_data.aliases:
                self.checked[alias] = data
        finally:
            self.lock.release()

    def checked_redirect (self, redirect, url_data):
        """Check if redirect is already in cache. If so, copy the
           cached data into url_data and return True, else False.
        """
        self.lock.acquire()
        try:
            if redirect in self.checked:
                url_data.copy_from_cache(self.checked[redirect])
                return True
            return False
        finally:
            self.lock.release()

    def robots_txt_allows_url (self, url_data):
        """Ask robots.txt allowance, downloading and caching the
           robots.txt file on first access for a given robots url.
        """
        self.lock.acquire()
        try:
            roboturl = url_data.get_robots_txt_url()
            linkcheck.log.debug(linkcheck.LOG_CACHE,
                                "robots.txt url %r of %r",
                                roboturl, url_data.url)
            if roboturl not in self.robots_txt:
                user, password = url_data.get_user_password()
                rp = linkcheck.robotparser2.RobotFileParser(
                    user=user, password=password)
                rp.set_url(roboturl)
                rp.read()
                self.robots_txt[roboturl] = rp
            else:
                rp = self.robots_txt[roboturl]
            return rp.can_fetch(linkcheck.configuration.UserAgent,
                                url_data.url)
        finally:
            self.lock.release()

    def store_cookies (self, headers, host):
        """Thread-safe cookie cache setter function. Can raise the
           exception Cookie.CookieError.

        @return: list of stored Set-Cookie header lines
        """
        self.lock.acquire()
        try:
            output = []
            for h in headers.getallmatchingheaders("Set-Cookie"):
                output.append(h)
                linkcheck.log.debug(linkcheck.LOG_CACHE,
                                    "Store Cookie %s", h)
                # create the cookie jar for this host on first use
                c = self.cookies.setdefault(host, Cookie.SimpleCookie())
                c.load(h)
            return output
        finally:
            self.lock.release()

    def get_cookies (self, host, path):
        """Thread-safe cookie cache getter function.

        @return: list of cookie header values matching host and path
        """
        self.lock.acquire()
        try:
            linkcheck.log.debug(linkcheck.LOG_CACHE,
                                "Get Cookie %s (%s)", host, path)
            # 'in' instead of the deprecated dict.has_key()
            if host not in self.cookies:
                return []
            cookievals = []
            for m in self.cookies[host].values():
                val = _check_morsel(m, host, path)
                if val:
                    cookievals.append(val)
            return cookievals
        finally:
            self.lock.release()