diff --git a/linkcheck/cache/cookie.py b/linkcheck/cache/cookie.py index ccbe0c66..5eaab2e7 100644 --- a/linkcheck/cache/cookie.py +++ b/linkcheck/cache/cookie.py @@ -38,8 +38,9 @@ class CookieJar (object): errors = [] for h in headers.getallmatchingheaders("Set-Cookie"): # RFC 2109 (Netscape) cookie type + name, value = h.split(':', 1) try: - cookie = cookies.NetscapeCookie(h, scheme, host, path) + cookie = cookies.NetscapeCookie(value, scheme, host, path) if cookie in self.cache: self.cache.remove(cookie) if not cookie.is_expired(): @@ -50,8 +51,9 @@ class CookieJar (object): errors.append(errmsg) for h in headers.getallmatchingheaders("Set-Cookie2"): # RFC 2965 cookie type + name, value = h.split(':', 1) try: - cookie = cookies.Rfc2965Cookie(h, scheme, host, path) + cookie = cookies.Rfc2965Cookie(value, scheme, host, path) if cookie in self.cache: self.cache.remove(cookie) if not cookie.is_expired(): diff --git a/linkcheck/cookies.py b/linkcheck/cookies.py index 700a8eb4..5059cd88 100644 --- a/linkcheck/cookies.py +++ b/linkcheck/cookies.py @@ -28,40 +28,172 @@ And a cookie storage class is provided. """ import time +import string import re -import Cookie import cookielib import httplib from cStringIO import StringIO from . import strformat +_nulljoin = ''.join +_semispacejoin = '; '.join +_spacejoin = ' '.join + class CookieError (StandardError): """Thrown for invalid cookie syntax or conflicting/impossible values.""" pass +_LegalChars = string.ascii_letters + string.digits + "!#$%&'*+-.^_`|~:" +_Translator = { + '\000' : '\\000', '\001' : '\\001', '\002' : '\\002', + '\003' : '\\003', '\004' : '\\004', '\005' : '\\005', + '\006' : '\\006', '\007' : '\\007', '\010' : '\\010', + '\011' : '\\011', '\012' : '\\012', '\013' : '\\013', + '\014' : '\\014', '\015' : '\\015', '\016' : '\\016', + '\017' : '\\017', '\020' : '\\020', '\021' : '\\021', + '\022' : '\\022', '\023' : '\\023', '\024' : '\\024', + '\025' : '\\025', '\026' : '\\026', '\027' : '\\027', + '\030' : '\\030', '\031' : '\\031', '\032' : '\\032', + '\033' : '\\033', '\034' : '\\034', '\035' : '\\035', + '\036' : '\\036', '\037' : '\\037', + + # Because of the way browsers really handle cookies (as opposed + # to what the RFC says) we also encode , and ; + + ',' : '\\054', ';' : '\\073', + + '"' : '\\"', '\\' : '\\\\', + + '\177' : '\\177', '\200' : '\\200', '\201' : '\\201', + '\202' : '\\202', '\203' : '\\203', '\204' : '\\204', + '\205' : '\\205', '\206' : '\\206', '\207' : '\\207', + '\210' : '\\210', '\211' : '\\211', '\212' : '\\212', + '\213' : '\\213', '\214' : '\\214', '\215' : '\\215', + '\216' : '\\216', '\217' : '\\217', '\220' : '\\220', + '\221' : '\\221', '\222' : '\\222', '\223' : '\\223', + '\224' : '\\224', '\225' : '\\225', '\226' : '\\226', + '\227' : '\\227', '\230' : '\\230', '\231' : '\\231', + '\232' : '\\232', '\233' : '\\233', '\234' : '\\234', + '\235' : '\\235', '\236' : '\\236', '\237' : '\\237', + '\240' : '\\240', '\241' : '\\241', '\242' : '\\242', + '\243' : '\\243', '\244' : '\\244', '\245' : '\\245', + '\246' : '\\246', '\247' : '\\247', '\250' : '\\250', + '\251' : '\\251', '\252' : '\\252', '\253' : '\\253', + '\254' : '\\254', '\255' : '\\255', '\256' : '\\256', + '\257' : '\\257', '\260' : '\\260', '\261' : '\\261', + '\262' : '\\262', '\263' : '\\263', '\264' : '\\264', + '\265' : '\\265', '\266' : '\\266', '\267' : '\\267', + '\270' : '\\270', '\271' : '\\271', '\272' : '\\272', + '\273' : '\\273', '\274' : '\\274', '\275' : '\\275', + '\276' : '\\276', '\277' : '\\277', '\300' : '\\300', + '\301' : '\\301', '\302' : '\\302', '\303' : '\\303', + '\304' : '\\304', '\305' : '\\305', '\306' : '\\306', + '\307' : '\\307', '\310' : '\\310', '\311' : '\\311', + '\312' : '\\312', '\313' : '\\313', '\314' : '\\314', + '\315' : '\\315', '\316' : '\\316', '\317' : '\\317', + '\320' : '\\320', '\321' : '\\321', '\322' : '\\322', + '\323' : '\\323', '\324' : '\\324', '\325' : '\\325', + '\326' : '\\326', '\327' : '\\327', '\330' : '\\330', + '\331' : '\\331', '\332' : '\\332', '\333' : '\\333', + '\334' : '\\334', '\335' : '\\335', '\336' : '\\336', + '\337' : '\\337', '\340' : '\\340', '\341' : '\\341', + '\342' : '\\342', '\343' : '\\343', '\344' : '\\344', + '\345' : '\\345', '\346' : '\\346', '\347' : '\\347', + '\350' : '\\350', '\351' : '\\351', '\352' : '\\352', + '\353' : '\\353', '\354' : '\\354', '\355' : '\\355', + '\356' : '\\356', '\357' : '\\357', '\360' : '\\360', + '\361' : '\\361', '\362' : '\\362', '\363' : '\\363', + '\364' : '\\364', '\365' : '\\365', '\366' : '\\366', + '\367' : '\\367', '\370' : '\\370', '\371' : '\\371', + '\372' : '\\372', '\373' : '\\373', '\374' : '\\374', + '\375' : '\\375', '\376' : '\\376', '\377' : '\\377' + } + +def quote(str, LegalChars=_LegalChars): + r"""Quote a string for use in a cookie header. + + If the string does not need to be double-quoted, then just return the + string. Otherwise, surround the string in doublequotes and quote + (with a \) special characters. + """ + if all(c in LegalChars for c in str): + return str + else: + return '"' + _nulljoin(_Translator.get(s, s) for s in str) + '"' + + +_OctalPatt = re.compile(r"\\[0-3][0-7][0-7]") +_QuotePatt = re.compile(r"[\\].") + +def unquote(str): + # If there aren't any doublequotes, + # then there can't be any special characters. See RFC 2109. + if len(str) < 2: + return str + if str[0] != '"' or str[-1] != '"': + return str + + # We have to assume that we must decode this string. + # Down to work. + + # Remove the "s + str = str[1:-1] + + # Check for special sequences. Examples: + # \012 --> \n + # \" --> " + # + i = 0 + n = len(str) + res = [] + while 0 <= i < n: + o_match = _OctalPatt.search(str, i) + q_match = _QuotePatt.search(str, i) + if not o_match and not q_match: # Neither matched + res.append(str[i:]) + break + # else: + j = k = -1 + if o_match: + j = o_match.start(0) + if q_match: + k = q_match.start(0) + if q_match and (not o_match or k < j): # QuotePatt matched + res.append(str[i:k]) + res.append(str[k+1]) + i = k + 2 + else: # OctalPatt matched + res.append(str[i:j]) + res.append(chr(int(str[j+1:j+4], 8))) + i = j + 4 + return _nulljoin(res) + -unquote = Cookie._unquote -quote = Cookie._quote has_embedded_dot = re.compile(r"[a-zA-Z0-9]\.[a-zA-Z]").search # Pattern for finding cookie snatched from Pythons Cookie.py # Modification: allow whitespace in values. -LegalChars = r"\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=" -CookiePattern = re.compile(r""" - (?P # Start of group 'key' - [%(legalchars)s]+? # Any word of at least one letter, nongreedy - ) # End of group 'key' - \s*=\s* # Equal Sign - (?P # Start of group 'val' - "(?:[^\\"]|\\.)*" # Any doublequoted string - | # or - [%(legalchars)s\s]* # Any word or empty string - ) # End of group 'val' - \s*;? # Probably ending in a semi-colon - """ % {"legalchars": LegalChars}, re.VERBOSE) - +_LegalCharsPatt = r"[\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=]" +_CookiePattern = re.compile(r""" + (?x) # This is a verbose pattern + (?P # Start of group 'key' + """ + _LegalCharsPatt + r"""+? # Any word of at least one letter + ) # End of group 'key' + ( # Optional group: there may not be a value. + \s*=\s* # Equal Sign + (?P # Start of group 'val' + "(?:[^\\"]|\\.)*" # Any doublequoted string + | # or + \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr + | # or + """ + _LegalCharsPatt + r"""* # Any word or empty string + ) # End of group 'val' + )? # End of optional value group + \s* # Any number of spaces. + (\s+|;|$) # Ending either at space, semicolon, or EOS. + """) class HttpCookie (object): """A cookie consists of one name-value pair with attributes. @@ -84,6 +216,8 @@ class HttpCookie (object): "commenturl": "CommentURL", "discard": "Discard", "port": "Port", + # httponly to protect against XSS attacks + "httponly": "httponly", } def __init__ (self, name, value, attributes=None): @@ -203,7 +337,10 @@ class HttpCookie (object): key = key.lower() if key not in self.attribute_names: raise CookieError("invalid attribute %r" % key) - value = unquote(value) + if value: + value = unquote(value) + else: + value = "" if key == "domain": value = value.lower() if not value.startswith(".") and not has_embedded_dot(value): @@ -227,7 +364,7 @@ class HttpCookie (object): raise CookieError("invalid port number: %r" % port) self.attributes[key] = value - def parse (self, text, patt=CookiePattern): + def parse (self, text, patt=_CookiePattern): """Parse cookie data.""" text = strformat.ascii_safe(text.rstrip('\r\n')) # reset values @@ -246,6 +383,8 @@ class HttpCookie (object): # No more key-value pairs. break key, value = match.group("key"), match.group("val") + if value is None: + value = "" i = match.end() # Parse the key, value in case it's metainfo. if self.name is None: @@ -430,6 +569,7 @@ def cookie_str(cookie): #if cookie.port_specified: h.append(("port_spec", None)) #if cookie.domain_initial_dot: h.append(("domain_dot", None)) if cookie.secure: h.append(("secure", None)) + if cookie.httponly: h.append(("httponly", None)) if cookie.expires: h.append(("expires", time2isoz(float(cookie.expires)))) if cookie.discard: h.append(("discard", None)) diff --git a/tests/cache/test_cookiejar.py b/tests/cache/test_cookiejar.py index 7ec88b47..95b301c0 100644 --- a/tests/cache/test_cookiejar.py +++ b/tests/cache/test_cookiejar.py @@ -39,33 +39,36 @@ class TestCookieJar (unittest.TestCase): jar = linkcheck.cache.cookie.CookieJar() data = ( ("Foo", "Bar"), - ("Domain", "example.org"), + ("Domain", host), ("Path", "/"), ) value = "; ".join('%s=%s' % (key, value) for key, value in data) headers = get_headers('Set-Cookie', value) - jar.add(headers, scheme, host, path) + errors = jar.add(headers, scheme, host, path) + self.assertFalse(errors, str(errors)) self.assertEqual(len(jar.cache), 1) # add updated cookie data = ( ("FOO", "Baz"), - ("Domain", "example.org"), + ("Domain", host), ("Path", "/"), ) value = "; ".join('%s=%s' % (key, value) for key, value in data) headers = get_headers('Set-Cookie', value) - jar.add(headers, scheme, host, path) + errors = jar.add(headers, scheme, host, path) + self.assertFalse(errors, str(errors)) self.assertEqual(len(jar.cache), 1) # remove cookie data = ( ("FOO", "Baz"), - ("Domain", "example.org"), + ("Domain", host), ("Path", "/"), ("Max-Age", "0"), ) value = "; ".join('%s=%s' % (key, value) for key, value in data) headers = get_headers('Set-Cookie', value) - jar.add(headers, scheme, host, path) + errors = jar.add(headers, scheme, host, path) + self.assertFalse(errors, str(errors)) self.assertEqual(len(jar.cache), 0) def test_cookie_cache2 (self): @@ -75,31 +78,34 @@ class TestCookieJar (unittest.TestCase): jar = linkcheck.cache.cookie.CookieJar() data = ( ("Foo", "Bar"), - ("Domain", "example.org"), + ("Domain", host), ("Path", "/"), ) value = "; ".join('%s=%s' % (key, value) for key, value in data) headers = get_headers('Set-Cookie2', value) - jar.add(headers, scheme, host, path) + errors = jar.add(headers, scheme, host, path) + self.assertFalse(errors, str(errors)) self.assertEqual(len(jar.cache), 1) # add updated cookie data = ( ("Foo", "Baz"), - ("Domain", "EXAMPLE.org"), + ("Domain", host.upper()), ("Path", "/"), ) value = "; ".join('%s=%s' % (key, value) for key, value in data) headers = get_headers('Set-Cookie2', value) - jar.add(headers, scheme, host, path) + errors = jar.add(headers, scheme, host, path) + self.assertFalse(errors, str(errors)) self.assertEqual(len(jar.cache), 1) # remove cookie data = ( ("FOO", "Baz"), - ("Domain", "example.org"), + ("Domain", host), ("Path", "/"), ("Max-Age", "0"), ) value = "; ".join('%s=%s' % (key, value) for key, value in data) headers = get_headers('Set-Cookie2', value) - jar.add(headers, scheme, host, path) + errors = jar.add(headers, scheme, host, path) + self.assertFalse(errors, str(errors)) self.assertEqual(len(jar.cache), 0) diff --git a/tests/test_cookies.py b/tests/test_cookies.py index 074fd346..0711aec8 100644 --- a/tests/test_cookies.py +++ b/tests/test_cookies.py @@ -58,8 +58,9 @@ class TestCookies (unittest.TestCase): self.assertTrue(cookie.is_expired()) def test_netscape_cookie3 (self): + # invalid port data = ( - ("Foo", "Bar\""), + ("Foo", "Bar"), ("Port", "hul,la"), ) value = "; ".join('%s="%s"' % (key, value) for key, value in data) @@ -71,7 +72,7 @@ class TestCookies (unittest.TestCase): def test_netscape_cookie4 (self): data = ( - ("Foo", "Bar\""), + ("Foo", "Bar"), ("Domain", "localhost"), ("Port", "100,555,76"), ) @@ -200,8 +201,9 @@ class TestCookies (unittest.TestCase): self.assertTrue(cookie.is_expired()) def test_rfc_cookie3 (self): + # invalid port data = ( - ("Foo", "Bar\""), + ("Foo", "Bar"), ("Port", "hul,la"), ) value = "; ".join('%s="%s"' % (key, value) for key, value in data) @@ -213,7 +215,7 @@ class TestCookies (unittest.TestCase): def test_rfc_cookie4 (self): data = ( - ("Foo", "Bar\""), + ("Foo", "Bar"), ("Port", "100,555,76"), ) value = "; ".join('%s="%s"' % (key, value) for key, value in data)