From 19c0a3c2eda10392b56ed49b514075b1a483ba7a Mon Sep 17 00:00:00 2001 From: calvin Date: Sun, 18 Dec 2005 08:19:11 +0000 Subject: [PATCH] use new cookie parsing git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2983 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- linkcheck/checker/cache.py | 53 ++++++++++-------------------------- linkcheck/checker/httpurl.py | 17 +++++++++--- 2 files changed, 28 insertions(+), 42 deletions(-) diff --git a/linkcheck/checker/cache.py b/linkcheck/checker/cache.py index 9e3b723d..7fc7c721 100644 --- a/linkcheck/checker/cache.py +++ b/linkcheck/checker/cache.py @@ -19,6 +19,7 @@ Store cached data during checking. """ import Cookie +import time import collections import linkcheck @@ -26,28 +27,11 @@ import linkcheck.log import linkcheck.lock import linkcheck.containers import linkcheck.configuration +import linkcheck.cookies import linkcheck.threader import linkcheck.checker.pool -def check_morsel (m, host, path): - """ - Check given cookie morsel against the desired host and path. - """ - # check domain (if its stored) - if m["domain"] and not host.endswith(m["domain"]): - return None - # check path (if its stored) - if m["path"] and not path.startswith(m["path"]): - return None - # check expiry date (if its stored) - if m["expires"]: - linkcheck.log.debug(linkcheck.LOG_CACHE, "Cookie expires %s", - m["expires"]) - # XXX check cookie expiration - return m.output(header='').strip() - - class Cache (object): """ Store and provide routines for cached data. Currently there are @@ -61,17 +45,19 @@ class Cache (object): """ super(Cache, self).__init__() # already checked URLs - # {cache key (string) -> cache data (dict)} + # format: {cache key (string) -> cache data (dict)} self.checked = {} # URLs that are being checked - # {cache key (string) -> urldata (UrlData)} + # format: {cache key (string) -> urldata (UrlData)} self.in_progress = {} # to-be-checked URLs - # [urldata (UrlData)] + # format: [urldata (UrlData)] self.incoming = collections.deque() # downloaded robots.txt files + # format: {cache key (string) -> robots.txt content (RobotFileParser)} self.robots_txt = {} # stored cookies + # format: {cache key (string) -> cookie jar (linkcheck.cookielib.CookieJar)} self.cookies = {} # pooled connections self.pool = linkcheck.checker.pool.ConnectionPool() @@ -221,30 +207,21 @@ class Cache (object): """ self.pool.release_connection(key) - def store_cookies (self, headers, host): + def store_cookies (self, headers, scheme, host, path): """ Thread-safe cookie cache setter function. Can raise the exception Cookie.CookieError. """ - output = [] - for h in headers.getallmatchingheaders("Set-Cookie"): - output.append(h) - linkcheck.log.debug(linkcheck.LOG_CACHE, "Store cookie %s", h) - c = self.cookies.setdefault(host, Cookie.SimpleCookie()) - c.load(h) - return output + jar = self.cookies.setdefault(host, linkcheck.cookies.CookieJar()) + return jar.add_cookies(headers, scheme, host, path) - def get_cookies (self, host, path): + def get_cookies (self, scheme, host, port, path): """ Thread-safe cookie cache getter function. """ linkcheck.log.debug(linkcheck.LOG_CACHE, "Get cookies for host %r path %r", host, path) - if not self.cookies.has_key(host): - return [] - cookievals = [] - for m in self.cookies[host].values(): - val = check_morsel(m, host, path) - if val: - cookievals.append(val) - return cookievals + jar = self.cookies.setdefault(host, linkcheck.cookies.CookieJar()) + jar.remove_expired() + return [x for x in jar if x.is_valid_for(scheme, host, port, path)] + diff --git a/linkcheck/checker/httpurl.py b/linkcheck/checker/httpurl.py index fffc2c34..d324490b 100644 --- a/linkcheck/checker/httpurl.py +++ b/linkcheck/checker/httpurl.py @@ -19,6 +19,7 @@ Handle http links. """ import urlparse +import urllib import time import re import zlib @@ -408,7 +409,9 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.add_info(_("Store cookie: %s.") % c) try: out = self.consumer.store_cookies(self.headers, - self.urlparts[1]) + self.urlparts[0], + self.urlparts[1], + self.urlparts[2]) for h in out: self.add_info(linkcheck.strformat.unicode_safe(h)) except Cookie.CookieError, msg: @@ -466,10 +469,16 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport): self.url_connection.putheader("Accept-Encoding", "gzip;q=1.0, deflate;q=0.9, identity;q=0.5") if self.consumer.config('cookies'): - self.cookies = self.consumer.get_cookies(self.urlparts[1], - self.urlparts[2]) + scheme = self.urlparts[0] + host = self.urlparts[1] + port = linkcheck.url.default_ports.get(scheme, 80) + host, port = urllib.splitnport(host, port) + path = self.urlparts[2] + self.cookies = self.consumer.get_cookies(scheme, host, port, path) for c in self.cookies: - self.url_connection.putheader("Cookie", c) + name = c.client_header_name() + value = c.client_header_value() + self.url_connection.putheader(name, value) self.url_connection.endheaders() response = self.url_connection.getresponse() self.persistent = headers.http_persistent(response)