mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-21 16:30:28 +00:00
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1657 e7d03fd6-7b0d-0410-9947-9c21f3af8025
229 lines
7.6 KiB
Python
# -*- coding: iso-8859-1 -*-
|
|
"""store cached data during checking"""
|
|
# Copyright (C) 2000-2004 Bastian Kleineidam
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU General Public License as published by
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
# (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# You should have received a copy of the GNU General Public License
|
|
# along with this program; if not, write to the Free Software
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
|
|
|
import Cookie
|
|
try:
|
|
import threading
|
|
except ImportError:
|
|
import dummy_threading as threading
|
|
|
|
import linkcheck
|
|
import linkcheck.log
|
|
import linkcheck.containers
|
|
import linkcheck.configuration
|
|
import linkcheck.threader
|
|
|
|
from linkcheck.i18n import _
|
|
|
|
|
|
def _check_morsel (m, host, path):
|
|
"""check given cookie morsel against the desired host and path"""
|
|
# check domain (if its stored)
|
|
if m["domain"] and not host.endswith(m["domain"]):
|
|
return None
|
|
# check path (if its stored)
|
|
if m["path"] and not path.startswith(m["path"]):
|
|
return None
|
|
# check expiry date (if its stored)
|
|
if m["expires"]:
|
|
linkcheck.log.debug(linkcheck.LOG_CACHE, "Cookie expires %s",
|
|
m["expires"])
|
|
# XXX check cookie expiration
|
|
return m.output(header='').strip()
|
|
|
|
|
|
class Cache (object):
    """Store and provide routines for cached data. Currently there are
       caches for cookies, check urls and robots.txt contents.

       All public operations (except __init__()) are thread-safe.
    """

    def __init__ (self):
        """Initialize the empty caches and the shared lock."""
        # one big lock for all caches and queues
        self.lock = threading.Lock()
        # already checked urls: cache key -> cached check data
        self.checked = {}
        # urls that are being checked: cache key -> url data
        self.in_progress = {}
        # to-be-checked urls (list, checked in FIFO order)
        self.incoming = []
        # downloaded robots.txt files: robots.txt url -> parser object
        self.robots_txt = {}
        # stored cookies: host -> Cookie.SimpleCookie
        self.cookies = {}

    def incoming_is_empty (self):
        """Return True iff no urls are waiting to be checked."""
        self.lock.acquire()
        try:
            return not self.incoming
        finally:
            self.lock.release()

    def incoming_get_url (self):
        """Get first not-in-progress url from the incoming queue and
           return it. If no such url is available return None. The
           url might be already cached."""
        self.lock.acquire()
        try:
            for i, url_data in enumerate(self.incoming):
                key = url_data.cache_key
                if key not in self.in_progress:
                    del self.incoming[i]
                    if key in self.checked:
                        # url is cached and can be logged
                        url_data.copy_from_cache(self.checked[key])
                    else:
                        # mark as being checked to avoid duplicate work
                        self.in_progress[key] = url_data
                    return url_data
            return None
        finally:
            self.lock.release()

    def incoming_len (self):
        """Return number of entries in the incoming queue."""
        self.lock.acquire()
        try:
            return len(self.incoming)
        finally:
            self.lock.release()

    def incoming_add (self, url_data):
        """Add new URL to the list of URLs to check.
           Return True if the url was queued, False if it has wrong
           syntax or its result is already cached (in which case the
           cached result is copied into url_data)."""
        self.lock.acquire()
        try:
            linkcheck.log.debug(linkcheck.LOG_CACHE, "Add url %s..", url_data)
            # check syntax
            if not url_data.check_syntax():
                # wrong syntax, do not check any further
                return False
            # check the cache
            key = url_data.cache_key
            if key in self.checked:
                # url is cached and can be logged
                url_data.copy_from_cache(self.checked[key])
                return False
            self.incoming.append(url_data)
            linkcheck.log.debug(linkcheck.LOG_CACHE, "..added.")
            return True
        finally:
            self.lock.release()

    def has_incoming (self, key):
        """Return True iff the given key is in the incoming queue.
           NOTE(review): self.incoming holds url_data objects, so this
           tests list membership of key against those objects — confirm
           callers pass what they intend to compare."""
        self.lock.acquire()
        try:
            return key in self.incoming
        finally:
            self.lock.release()

    def has_in_progress (self, key):
        """Return True iff an url with the given cache key is being
           checked right now."""
        self.lock.acquire()
        try:
            return key in self.in_progress
        finally:
            self.lock.release()

    def in_progress_remove (self, url_data):
        """Remove url from the in-progress cache."""
        self.lock.acquire()
        try:
            key = url_data.cache_key
            assert key in self.in_progress
            del self.in_progress[key]
        finally:
            self.lock.release()

    def checked_add (self, url_data):
        """Cache checked url data (including all its aliases) and
           remove it from the in-progress cache."""
        self.lock.acquire()
        try:
            data = url_data.get_cache_data()
            key = url_data.cache_key
            assert key not in self.checked
            assert key in self.in_progress
            del self.in_progress[key]
            self.checked[key] = data
            # also append all aliases
            for key in url_data.aliases:
                self.checked[key] = data
        finally:
            self.lock.release()

    def checked_redirect (self, redirect, url_data):
        """Check if redirect target is already in the checked cache.
           If so, copy the cached result into url_data and return True,
           else return False."""
        self.lock.acquire()
        try:
            if redirect in self.checked:
                # bugfix: was self.checked[key] with undefined name 'key',
                # raising NameError whenever a redirect was cached
                url_data.copy_from_cache(self.checked[redirect])
                return True
            return False
        finally:
            self.lock.release()

    def robots_txt_allows_url (self, url_data):
        """Ask robots.txt allowance; the parsed robots.txt file is
           fetched on first use and cached per robots.txt url."""
        self.lock.acquire()
        try:
            roboturl = url_data.get_robots_txt_url()
            linkcheck.log.debug(linkcheck.LOG_CACHE,
                                "robots.txt url %r of %r", roboturl, url_data.url)
            if roboturl not in self.robots_txt:
                user, password = url_data.get_user_password()
                rp = linkcheck.robotparser2.RobotFileParser(
                                              user=user, password=password)
                rp.set_url(roboturl)
                # NOTE: performs network I/O while holding self.lock
                rp.read()
                self.robots_txt[roboturl] = rp
            else:
                rp = self.robots_txt[roboturl]
            return rp.can_fetch(linkcheck.configuration.UserAgent,
                                url_data.url)
        finally:
            self.lock.release()

    def store_cookies (self, headers, host):
        """Thread-safe cookie cache setter function.
           Parse all Set-Cookie headers and store them per host;
           return the list of raw Set-Cookie header lines seen."""
        self.lock.acquire()
        try:
            output = []
            for h in headers.getallmatchingheaders("Set-Cookie"):
                output.append(h)
                linkcheck.log.debug(linkcheck.LOG_CACHE, "Store Cookie %s", h)
                c = self.cookies.setdefault(host, Cookie.SimpleCookie())
                c.load(h)
            return output
        finally:
            self.lock.release()

    def get_cookies (self, host, path):
        """Thread-safe cookie cache getter function.
           Return the list of cookie strings applicable to host/path."""
        self.lock.acquire()
        try:
            linkcheck.log.debug(linkcheck.LOG_CACHE,
                                "Get Cookie %s (%s)", host, path)
            # 'in' instead of deprecated dict.has_key()
            if host not in self.cookies:
                return []
            cookievals = []
            for m in self.cookies[host].values():
                val = _check_morsel(m, host, path)
                if val:
                    cookievals.append(val)
            return cookievals
        finally:
            self.lock.release()