linkchecker/linkcheck/checker/cache.py
2004-08-31 22:23:09 +00:00

229 lines
7.6 KiB
Python

# -*- coding: iso-8859-1 -*-
"""store cached data during checking"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import Cookie
try:
import threading
except ImportError:
import dummy_threading as threading
import linkcheck
import linkcheck.log
import linkcheck.containers
import linkcheck.configuration
import linkcheck.threader
from linkcheck.i18n import _
def _check_morsel (m, host, path):
"""check given cookie morsel against the desired host and path"""
# check domain (if its stored)
if m["domain"] and not host.endswith(m["domain"]):
return None
# check path (if its stored)
if m["path"] and not path.startswith(m["path"]):
return None
# check expiry date (if its stored)
if m["expires"]:
linkcheck.log.debug(linkcheck.LOG_CACHE, "Cookie expires %s",
m["expires"])
# XXX check cookie expiration
return m.output(header='').strip()
class Cache (object):
    """Store and provide routines for cached data. Currently there are
       caches for cookies, check urls and robots.txt contents.
       All public operations (except __init__()) are thread-safe.
    """

    def __init__ (self):
        """Initialize the cache data structures.
           Not thread-safe; call before starting checker threads."""
        # one big lock for all caches and queues
        self.lock = threading.Lock()
        # already checked urls: cache key -> cached result data
        self.checked = {}
        # urls that are being checked: cache key -> url_data
        self.in_progress = {}
        # to-be-checked urls (list of url_data objects, FIFO order)
        self.incoming = []
        # downloaded robots.txt files: robots.txt url -> parser object
        self.robots_txt = {}
        # stored cookies: host -> Cookie.SimpleCookie instance
        self.cookies = {}

    def incoming_is_empty (self):
        """Return True if no urls are waiting in the incoming queue."""
        self.lock.acquire()
        try:
            return len(self.incoming) <= 0
        finally:
            self.lock.release()

    def incoming_get_url (self):
        """Get first not-in-progress url from the incoming queue and
           return it. If no such url is available return None. The
           url might be already cached."""
        self.lock.acquire()
        try:
            for i, url_data in enumerate(self.incoming):
                key = url_data.cache_key
                if key not in self.in_progress:
                    del self.incoming[i]
                    if key in self.checked:
                        # url is cached and can be logged
                        url_data.copy_from_cache(self.checked[key])
                    else:
                        # mark as being checked so other threads skip it
                        self.in_progress[key] = url_data
                    return url_data
            return None
        finally:
            self.lock.release()

    def incoming_len (self):
        """Return number of entries in the incoming queue."""
        self.lock.acquire()
        try:
            return len(self.incoming)
        finally:
            self.lock.release()

    def incoming_add (self, url_data):
        """Add new URL to list of URLs to check. Return True if it was
           queued, False on wrong syntax or a cache hit (in which case
           the cached result is copied into url_data)."""
        self.lock.acquire()
        try:
            linkcheck.log.debug(linkcheck.LOG_CACHE, "Add url %s..", url_data)
            # check syntax
            if not url_data.check_syntax():
                # wrong syntax, do not check any further
                return False
            # check the cache
            key = url_data.cache_key
            if key in self.checked:
                # url is cached and can be logged
                url_data.copy_from_cache(self.checked[key])
                return False
            self.incoming.append(url_data)
            linkcheck.log.debug(linkcheck.LOG_CACHE, "..added.")
            return True
        finally:
            self.lock.release()

    def has_incoming (self, key):
        """Return True if key is found in the incoming queue.
           NOTE(review): self.incoming holds url_data objects, not keys,
           so this membership test relies on url_data comparing equal to
           a cache key -- verify against url_data.__eq__ and callers."""
        self.lock.acquire()
        try:
            return key in self.incoming
        finally:
            self.lock.release()

    def has_in_progress (self, key):
        """Return True if the given cache key is currently being checked."""
        self.lock.acquire()
        try:
            return key in self.in_progress
        finally:
            self.lock.release()

    def in_progress_remove (self, url_data):
        """remove url from in-progress cache"""
        self.lock.acquire()
        try:
            key = url_data.cache_key
            assert key in self.in_progress
            del self.in_progress[key]
        finally:
            self.lock.release()

    def checked_add (self, url_data):
        """cache checked url data, moving it out of the in-progress set;
           the data is also stored under all of the url's aliases"""
        self.lock.acquire()
        try:
            data = url_data.get_cache_data()
            key = url_data.cache_key
            assert key not in self.checked
            assert key in self.in_progress
            del self.in_progress[key]
            self.checked[key] = data
            # also store under all aliases (avoid shadowing key above)
            for alias in url_data.aliases:
                self.checked[alias] = data
        finally:
            self.lock.release()

    def checked_redirect (self, redirect, url_data):
        """Check if the redirect target is already in the checked cache.
           On a hit, copy the cached data into url_data and return True,
           else return False."""
        self.lock.acquire()
        try:
            if redirect in self.checked:
                # bugfix: look up the redirect key; the previous code
                # referenced an undefined name and raised NameError here
                url_data.copy_from_cache(self.checked[redirect])
                return True
            return False
        finally:
            self.lock.release()

    def robots_txt_allows_url (self, url_data):
        """Ask robots.txt allowance, downloading and caching the
           robots.txt file on first access per robots url."""
        self.lock.acquire()
        try:
            roboturl = url_data.get_robots_txt_url()
            linkcheck.log.debug(linkcheck.LOG_CACHE,
                "robots.txt url %r of %r", roboturl, url_data.url)
            if roboturl not in self.robots_txt:
                user, password = url_data.get_user_password()
                # NOTE(review): linkcheck.robotparser2 is not imported at
                # the top of this file -- presumably the package exposes
                # it; verify.
                rp = linkcheck.robotparser2.RobotFileParser(
                    user=user, password=password)
                rp.set_url(roboturl)
                rp.read()
                self.robots_txt[roboturl] = rp
            else:
                rp = self.robots_txt[roboturl]
            return rp.can_fetch(linkcheck.configuration.UserAgent,
                                url_data.url)
        finally:
            self.lock.release()

    def store_cookies (self, headers, host):
        """thread-safe cookie cache setter function; parses all
           Set-Cookie headers and returns the list of raw header lines"""
        self.lock.acquire()
        try:
            output = []
            for h in headers.getallmatchingheaders("Set-Cookie"):
                output.append(h)
                linkcheck.log.debug(linkcheck.LOG_CACHE, "Store Cookie %s", h)
                # create the per-host cookie jar lazily, only when a
                # Set-Cookie header is actually present
                c = self.cookies.setdefault(host, Cookie.SimpleCookie())
                c.load(h)
            return output
        finally:
            self.lock.release()

    def get_cookies (self, host, path):
        """thread-safe cookie cache getter function; return list of
           cookie header values matching the given host and path"""
        self.lock.acquire()
        try:
            linkcheck.log.debug(linkcheck.LOG_CACHE,
                "Get Cookie %s (%s)", host, path)
            # use "in" instead of the deprecated dict.has_key()
            if host not in self.cookies:
                return []
            cookievals = []
            for m in self.cookies[host].values():
                val = _check_morsel(m, host, path)
                if val:
                    cookievals.append(val)
            return cookievals
        finally:
            self.lock.release()