use new cookie parsing

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2983 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-12-18 08:19:11 +00:00
parent 5eb3061ee0
commit 19c0a3c2ed
2 changed files with 28 additions and 42 deletions

View file

@@ -19,6 +19,7 @@ Store cached data during checking.
"""
import Cookie
import time
import collections
import linkcheck
@@ -26,28 +27,11 @@ import linkcheck.log
import linkcheck.lock
import linkcheck.containers
import linkcheck.configuration
import linkcheck.cookies
import linkcheck.threader
import linkcheck.checker.pool
def check_morsel (m, host, path):
    """
    Check given cookie morsel against the desired host and path.

    Return the cookie header value (name=value pair) if the morsel
    applies to the given host and path, else None.
    """
    domain, cookiepath, expires = m["domain"], m["path"], m["expires"]
    # a stored domain must be a suffix of the request host
    if domain and not host.endswith(domain):
        return None
    # a stored path must be a prefix of the request path
    if cookiepath and not path.startswith(cookiepath):
        return None
    if expires:
        # only logged for now
        linkcheck.log.debug(linkcheck.LOG_CACHE, "Cookie expires %s",
            expires)
        # XXX check cookie expiration
    return m.output(header='').strip()
class Cache (object):
"""
Store and provide routines for cached data. Currently there are
@@ -61,17 +45,19 @@ class Cache (object):
"""
super(Cache, self).__init__()
# already checked URLs
# {cache key (string) -> cache data (dict)}
# format: {cache key (string) -> cache data (dict)}
self.checked = {}
# URLs that are being checked
# {cache key (string) -> urldata (UrlData)}
# format: {cache key (string) -> urldata (UrlData)}
self.in_progress = {}
# to-be-checked URLs
# [urldata (UrlData)]
# format: [urldata (UrlData)]
self.incoming = collections.deque()
# downloaded robots.txt files
# format: {cache key (string) -> robots.txt content (RobotFileParser)}
self.robots_txt = {}
# stored cookies
# format: {cache key (string) -> cookie jar (linkcheck.cookielib.CookieJar)}
self.cookies = {}
# pooled connections
self.pool = linkcheck.checker.pool.ConnectionPool()
@@ -221,30 +207,21 @@ class Cache (object):
"""
self.pool.release_connection(key)
def store_cookies (self, headers, host):
def store_cookies (self, headers, scheme, host, path):
"""
Thread-safe cookie cache setter function. Can raise the
exception Cookie.CookieError.
"""
output = []
for h in headers.getallmatchingheaders("Set-Cookie"):
output.append(h)
linkcheck.log.debug(linkcheck.LOG_CACHE, "Store cookie %s", h)
c = self.cookies.setdefault(host, Cookie.SimpleCookie())
c.load(h)
return output
jar = self.cookies.setdefault(host, linkcheck.cookies.CookieJar())
return jar.add_cookies(headers, scheme, host, path)
def get_cookies (self, host, path):
def get_cookies (self, scheme, host, port, path):
"""
Thread-safe cookie cache getter function.
"""
linkcheck.log.debug(linkcheck.LOG_CACHE,
"Get cookies for host %r path %r", host, path)
if not self.cookies.has_key(host):
return []
cookievals = []
for m in self.cookies[host].values():
val = check_morsel(m, host, path)
if val:
cookievals.append(val)
return cookievals
jar = self.cookies.setdefault(host, linkcheck.cookies.CookieJar())
jar.remove_expired()
return [x for x in jar if x.is_valid_for(scheme, host, port, path)]

View file

@@ -19,6 +19,7 @@ Handle http links.
"""
import urlparse
import urllib
import time
import re
import zlib
@@ -408,7 +409,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.add_info(_("Store cookie: %s.") % c)
try:
out = self.consumer.store_cookies(self.headers,
self.urlparts[1])
self.urlparts[0],
self.urlparts[1],
self.urlparts[2])
for h in out:
self.add_info(linkcheck.strformat.unicode_safe(h))
except Cookie.CookieError, msg:
@@ -466,10 +469,16 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.url_connection.putheader("Accept-Encoding",
"gzip;q=1.0, deflate;q=0.9, identity;q=0.5")
if self.consumer.config('cookies'):
self.cookies = self.consumer.get_cookies(self.urlparts[1],
self.urlparts[2])
scheme = self.urlparts[0]
host = self.urlparts[1]
port = linkcheck.url.default_ports.get(scheme, 80)
host, port = urllib.splitnport(host, port)
path = self.urlparts[2]
self.cookies = self.consumer.get_cookies(scheme, host, port, path)
for c in self.cookies:
self.url_connection.putheader("Cookie", c)
name = c.client_header_name()
value = c.client_header_value()
self.url_connection.putheader(name, value)
self.url_connection.endheaders()
response = self.url_connection.getresponse()
self.persistent = headers.http_persistent(response)