split off header methods in separate file

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2381 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-04-15 03:41:01 +00:00 · 2005-03-09 13:46:58 +00:00 · 2005-03-09 13:46:58 +00:00 · b3c3c3eaaa
commit b3c3c3eaaa
parent ca49e04730
2 changed files with 110 additions and 87 deletions
--- a/linkcheck/checker/httpheaders.py
+++ b/linkcheck/checker/httpheaders.py
@ -0,0 +1,100 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2005  Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+"""
+Helper functions dealing with HTTP headers.
+"""
+
+DEFAULT_TIMEOUT_SECS = 300
+
+def has_header_value (headers, name, value):
+    """
+    Look in headers for a specific header name and value.
+    Both name and value are case insensitive.
+
+    @return: True if header name and value are found
+    @rtype: bool
+    """
+    name = name.lower()
+    value = value.lower()
+    for hname, hvalue in headers:
+        if hname.lower()==name and hvalue.lower()==value:
+            return True
+    return False
+
+
+def http_persistent (response):
+    """
+    See if the HTTP connection can be kept open according the the
+    header values found in the response object.
+
+    @param response: response instance
+    @type response: httplib.HTTPResponse
+    @return: True if connection is persistent
+    @rtype: bool
+    """
+    headers = response.getheaders()
+    if response.version == 11:
+        return has_header_value(headers, 'Connection', 'Close')
+    return has_header_value(headers, "Connection", "Keep-Alive")
+
+
+def http_timeout (response):
+    """
+    Get HTTP timeout value, either from the Keep-Alive header or a
+    default value.
+
+    @param response: response instance
+    @type response: httplib.HTTPResponse
+    @return: timeout
+    @rtype: int
+    """
+    timeout = response.getheader("Keep-Alive")
+    if timeout is not None:
+        try:
+            timeout = int(timeout[8:].strip())
+        except ValueError, msg:
+            timeout = DEFAULT_TIMEOUT_SECS
+    else:
+        timeout = DEFAULT_TIMEOUT_SECS
+    return timeout
+
+
+def get_content_type (headers):
+    """
+    Get the MIME type from the Content-Type header value, or
+    'application/octet-stream' if not found.
+
+    @return: MIME type
+    @rtype: string
+    """
+    ptype = headers.get('Content-Type', 'application/octet-stream')
+    if ";" in ptype:
+        # split off not needed extension info
+        ptype = ptype.split(';')[0]
+    return ptype.strip()
+
+
+def get_content_encoding (headers):
+    """
+    Get the content encoding from the Content-Encoding header value, or
+    an empty string if not found.
+
+    @return: encoding string
+    @rtype: string
+    """
+    return headers.get("Content-Encoding", "").strip()
+
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@ -32,6 +32,7 @@ import linkcheck.url
 import linkcheck.strformat
 import linkcheck.robotparser2
 import linkcheck.httplib2
+import linkcheck.checker.httpheaders as headers
 import urlbase
 import proxysupport

@ -43,61 +44,6 @@ _supported_encodings = ('gzip', 'x-gzip', 'deflate')
 # Amazon blocks all HEAD requests
 _is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search

-DEFAULT_TIMEOUT_SECS = 300
-
-
-def has_header_value (headers, name, value):
-    """
-    Look in headers for a specific header name and value.
-    Both name and value are case insensitive.
-
-    @return: True if header name and value are found
-    @rtype: bool
-    """
-    name = name.lower()
-    value = value.lower()
-    for hname, hvalue in headers:
-        if hname.lower()==name and hvalue.lower()==value:
-            return True
-    return False
-
-
-def http_persistent (response):
-    """
-    See if the HTTP connection can be kept open according the the
-    header values found in the response object.
-
-    @param response: response instance
-    @type response: httplib.HTTPResponse
-    @return: True if connection is persistent
-    @rtype: bool
-    """
-    headers = response.getheaders()
-    if response.version == 11:
-        return has_header_value(headers, 'Connection', 'Close')
-    return has_header_value(headers, "Connection", "Keep-Alive")
-
-
-def http_timeout (response):
-    """
-    Get HTTP timeout value, either from the Keep-Alive header or a
-    default value.
-
-    @param response: response instance
-    @type response: httplib.HTTPResponse
-    @return: timeout
-    @rtype: int
-    """
-    timeout = response.getheader("Keep-Alive")
-    if timeout is not None:
-        try:
-            timeout = int(timeout[8:].strip())
-        except ValueError, msg:
-            timeout = DEFAULT_TIMEOUT_SECS
-    else:
-        timeout = DEFAULT_TIMEOUT_SECS
-    return timeout
-

 class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
    """
@ -486,8 +432,8 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
                self.url_connection.putheader("Cookie", c)
        self.url_connection.endheaders()
        response = self.url_connection.getresponse()
-        self.persistent = http_persistent(response)
-        self.timeout = http_timeout(response)
+        self.persistent = headers.http_persistent(response)
+        self.timeout = headers.http_timeout(response)
        return response

    def get_http_object (self, host, scheme):
@ -535,7 +481,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
            response = self._get_http_response()
            self.headers = response.msg
            self.data = response.read()
-            encoding = self.get_content_encoding()
+            encoding = headers.get_content_encoding(self.headers)
            if encoding in _supported_encodings:
                try:
                    if encoding == 'deflate':
@ -561,9 +507,9 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
        """
        if not (self.valid and self.headers):
            return False
-        if self.headers.gettype()[:9] != "text/html":
+        if headers.get_content_type(self.headers) != "text/html":
            return False
-        encoding = self.get_content_encoding()
+        encoding = headers.get_content_encoding(self.headers)
        if encoding and encoding not in _supported_encodings and \
           encoding != 'identity':
            self.add_warning(_('Unsupported content encoding %r.') % encoding)
@ -579,30 +525,6 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
        """
        return True

-    def get_content_type (self):
-        """
-        Get the MIME type from the Content-Type header value, or
-        'application/octet-stream' if not found.
-
-        @return: MIME type
-        @rtype: string
-        """
-        ptype = self.headers.get('Content-Type', 'application/octet-stream')
-        if ";" in ptype:
-            # split off not needed extension info
-            ptype = ptype.split(';')[0]
-        return ptype.strip()
-
-    def get_content_encoding (self):
-        """
-        Get the content encoding from the Content-Encoding header value, or
-        an empty string if not found.
-
-        @return: encoding string
-        @rtype: string
-        """
-        return self.headers.get("Content-Encoding", "").strip()
-
    def is_parseable (self):
        """
        Check if content is parseable for recursion.
@ -612,9 +534,10 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
        """
        if not (self.valid and self.headers):
            return False
-        if self.get_content_type() not in ("text/html", "text/css"):
+        if headers.get_content_type(self.headers) not in \
+           ("text/html", "text/css"):
            return False
-        encoding = self.get_content_encoding()
+        encoding = headers.get_content_encoding(self.headers)
        if encoding and encoding not in _supported_encodings and \
           encoding != 'identity':
            self.add_warning(_('Unsupported content encoding %r.') % encoding)
@ -625,7 +548,7 @@ class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
        """
        Parse file contents for new links to check.
        """
-        ptype = self.get_content_type()
+        ptype = headers.get_content_type(self.headers)
        if ptype == "text/html":
            self.parse_html()
        elif ptype == "text/css":