updated for python2.3

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@995 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2003-08-11 11:19:38 +00:00
parent f39f25d950
commit 648df944b0
15 changed files with 27 additions and 372 deletions

View file

@ -1,6 +1,7 @@
1.8.23
* merged robotparser.py with newest official version
Changed: linkcheck/robotparser.py
1.9.23
* Require Python 2.3
- removed timeoutsocket.py and robotparser.py
- use True/False values
Closes: SF bug 784977
* start CGI output immediately
Changed: lc.cgi, lc.fcgi, lc.sz_fcgi, linkcheck/lc_cgi.py

View file

@ -3,7 +3,7 @@
Requirements
------------
Python >= 2.2.1 from http://www.python.org/
Python >= 2.3 from http://www.python.org/
For HTTPS support you need to compile Python with the SSL _socket
module.

View file

@ -1,5 +1,5 @@
# This Makefile is only used by developers.
PYTHON=python2.2
PYTHON=python2.3
VERSION=$(shell $(PYTHON) setup.py --version)
PACKAGE=linkchecker
NAME=$(shell $(PYTHON) setup.py --name)
@ -46,11 +46,11 @@ dist: locale config
md5sum dist/* > $(MD5SUMS)
for f in dist/*; do gpg --detach-sign --armor $$f; done
# to build in the current directory (assumes python 2.2)
# to build in the current directory (assumes python 2.3)
localbuild:
$(MAKE) -C linkcheck/parser
$(PYTHON) setup.py build
cp -f build/lib.linux-i686-2.2/linkcheck/parser/htmlsax.so linkcheck/parser
cp -f build/lib.linux-i686-2.3/linkcheck/parser/htmlsax.so linkcheck/parser
# produce the .deb Debian package

2
TODO
View file

@ -4,12 +4,10 @@
- the HTML parser should be even more forgiving with badly formatted html
possible Python 2.3 improvements (ie needs Python >= 2.3)
- get rid of timeoutsocket.py, the default socket has timeouts
- use optparse instead of getopt with more flexible commandline help
- replace the debug() function with the logging module
we'll see how we can insert multiple debug levels into this thing
- use Bool object type
- get rid of the patched robotparser.py
- use new csv module
- use the Set type instead of hashmaps (did I use hashmaps for sets here?)

2
debian/changelog vendored
View file

@ -1,4 +1,4 @@
linkchecker (1.8.23-1) unstable; urgency=low
linkchecker (1.9.0-1) unstable; urgency=low
* New upstream release.

4
debian/control vendored
View file

@ -2,12 +2,12 @@ Source: linkchecker
Section: web
Priority: optional
Maintainer: Bastian Kleineidam <calvin@debian.org>
Build-Depends: python2.2-dev (>= 2.2.1), debhelper (>= 4.1.0), gettext
Build-Depends: python-dev (>= 2.3), python-dev (<< 2.4), debhelper (>= 4.1.0), gettext
Standards-Version: 3.6.0
Package: linkchecker
Architecture: any
Depends: python2.2 (>= 2.2.1)
Depends: ${python:Depends}
Conflicts: linkchecker-ssl
Suggests: httpd
Description: check HTML documents for broken links

View file

@ -1,34 +0,0 @@
#!/bin/sh
#
# Written 1998 by Gregor Hoffleit <flight@debian.org>.
# used by Bastian Kleineidam for LinkChecker
#
# Debian maintainer script (postinst): byte-compile the installed
# Python modules after the package is configured.
set -e
# Target Python version and the derived interpreter name (e.g. python2.2).
PYVER=2.2
PYTHON=python
PYTHONXY=${PYTHON}${PYVER}
SITEPACKAGES=/usr/lib/$PYTHONXY/site-packages
# Package directories to compile recursively with compileall.
DIRLIST=linkcheck
# Standalone modules to compile individually with py_compile.
FILELIST=_linkchecker_configdata.py
COMMAND="'import sys,py_compile;py_compile.compile(sys.argv[1])'"
case "$1" in
configure|abort-upgrade|abort-remove|abort-deconfigure)
# Compile each directory twice: optimized (.pyo) and normal (.pyc).
for i in $DIRLIST; do
$PYTHONXY -O /usr/lib/$PYTHONXY/compileall.py -q $SITEPACKAGES/$i
$PYTHONXY /usr/lib/$PYTHONXY/compileall.py -q $SITEPACKAGES/$i
done
# use /bin/sh -c, otherwise I get a SyntaxError from Python
for i in $FILELIST; do
/bin/sh -c "$PYTHONXY -O -c $COMMAND $SITEPACKAGES/$i"
/bin/sh -c "$PYTHONXY -c $COMMAND $SITEPACKAGES/$i"
done
;;
*)
echo "postinst called with unknown argument \`$1'" >&2
exit 1
;;
esac
#DEBHELPER#
exit 0

View file

@ -1,19 +0,0 @@
#!/bin/sh
#
# Written 1998 by Gregor Hoffleit <flight@debian.org>.
# used by Bastian Kleineidam for LinkChecker
#
# Debian maintainer script (prerm): remove the compiled .pyc/.pyo files
# that postinst created, then drop the now-empty package directory.
set -e
PACKAGE=linkchecker
PYVER=2.2
PYTHON=python
PYTHONXY=${PYTHON}${PYVER}
#DEBHELPER#
# For every installed .py file, delete the matching .pyc and .pyo.
dpkg --listfiles $PACKAGE |
awk '$0~/\.py$/ {print $0"c\n" $0"o"}' |
xargs rm -f >&2
# Ignore failure if the directory is already gone or not empty.
rmdir /usr/lib/$PYTHONXY/site-packages/linkcheck 2>/dev/null || true
exit 0

3
debian/rules vendored
View file

@ -4,7 +4,7 @@
PACKAGE=linkchecker
ROOT=$(CURDIR)/debian/$(PACKAGE)
DOCDIR = $(ROOT)/usr/share/doc/$(PACKAGE)
PYTHON=python2.2
PYTHON=python
CGIDIR=$(ROOT)/usr/lib/cgi-bin/lconline
HTMLDIR=$(ROOT)/var/www/lconline
@ -66,6 +66,7 @@ binary-arch: build install
dh_installchangelogs
dh_link
dh_strip
dh_python
dh_compress
dh_fixperms
dh_installdeb

View file

@ -29,7 +29,7 @@ AnsiType = {
'invert': '7',
}
# color numbers
# color numbers (the capitalized colors are bright)
AnsiColor = {
'default': '0',
'black': '30',
@ -72,6 +72,7 @@ def esc_ansicolor (color):
AnsiReset = esc_ansicolor("default")
def colorize (text, color=None):
"return text colorized if TERM is set"
if (color is not None) and os.environ.get('TERM'):

View file

@ -16,10 +16,9 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
import urlparse, sys, time, re, httplib
import urlparse, sys, time, re, httplib, robotparser
from urllib import quote, unquote
import Config, StringUtil, i18n
from linkcheck import robotparser
from debug import *
# XXX not dynamic
if get_debuglevel() > 0:

View file

@ -1,7 +1,7 @@
# use beta flex from http://lex.sf.net/ for reentrant
# bison parser support
FLEX=flex
PYTHON=python2.2
PYTHON=python2.3
all: htmllex.c htmlparse.c

View file

@ -1,298 +0,0 @@
# -*- coding: iso-8859-1 -*-
""" robotparser.py
Copyright (C) 2000 Bastian Kleineidam
You can choose between two licenses when using this package:
1) GNU GPLv2
2) PSF license for Python 2.2
The robots.txt Exclusion Protocol is implemented as specified in
http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
"""
import time
import urllib
import urllib.parse
import urllib.request
import urlparse
__all__ = ["RobotFileParser"]
debug = 0
def _debug(msg):
if debug: print msg
class RobotFileParser:
    """Read, parse and answer questions about a single robots.txt file.

    The robots.txt Exclusion Protocol is implemented as specified in
    http://info.webcrawler.com/mak/projects/robots/norobots-rfc.html
    """

    def __init__(self, url=''):
        # Entries in file order; the catch-all "*" entry is kept apart
        # in default_entry so it is always consulted last.
        self.entries = []
        self.default_entry = None
        # Set by read() from the HTTP status: 401/403 forbid everything,
        # any other status >= 400 allows everything.
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Return the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.
        """
        return self.last_checked

    def modified(self):
        """Set the time the robots.txt file was last fetched to the
        current time."""
        self.last_checked = time.time()

    def set_url(self, url):
        """Set the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlparse(url)[1:3]

    def read(self):
        """Read the robots.txt URL and feed it to the parser."""
        opener = URLopener()
        f = opener.open(self.url)
        lines = [line.strip() for line in f]
        self.errcode = opener.errcode
        if self.errcode in (401, 403):
            # Authorization required / forbidden: be conservative.
            self.disallow_all = True
            _debug("disallow all")
        elif self.errcode >= 400:
            # Missing or broken robots.txt: everything is allowed.
            self.allow_all = True
            _debug("allow all")
        elif self.errcode == 200 and lines:
            _debug("parse lines")
            self.parse(lines)

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # state: 0 = expecting user-agent, 1 = saw user-agent,
        # 2 = saw at least one rule line for the current entry.
        state = 0
        linenumber = 0
        entry = Entry()
        for line in lines:
            linenumber += 1
            if not line:
                # A blank line terminates the current entry.
                if state == 1:
                    _debug("line %d: warning: you should insert"
                           " allow: or disallow: directives below any"
                           " user-agent: line" % linenumber)
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = urllib.parse.unquote(line[1].strip())
                if line[0] == "user-agent":
                    if state == 2:
                        _debug("line %d: warning: you should insert a blank"
                               " line before any user-agent"
                               " directive" % linenumber)
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 0))
                        state = 2
                elif line[0] == "allow":
                    if state == 0:
                        _debug("line %d: error: you must insert a user-agent:"
                               " directive before this line" % linenumber)
                    else:
                        entry.rulelines.append(RuleLine(line[1], 1))
                else:
                    _debug("line %d: warning: unknown key %s" % (linenumber,
                           line[0]))
            else:
                _debug("line %d: error: malformed line %s" % (linenumber, line))
        if state == 2:
            self.entries.append(entry)
        _debug("Parsed rules:\n%s" % str(self))

    def can_fetch(self, useragent, url):
        """Using the parsed robots.txt, decide if useragent can fetch url."""
        _debug("Checking robot.txt allowance for:\n user agent: %s\n url: %s" %
               (useragent, url))
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Search for given user agent matches; the first match counts.
        url = urllib.parse.quote(
            urllib.parse.urlparse(urllib.parse.unquote(url))[2]) or "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def __str__(self):
        return "".join(str(entry) + "\n" for entry in self.entries)
class RuleLine:
    """A rule line is a single "Allow:" (allowance true) or "Disallow:"
    (allowance false) directive followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty Disallow value means allow all
            allowance = True
        self.path = urllib.parse.quote(path)
        self.allowance = allowance

    def applies_to(self, filename):
        # "*" matches everything; otherwise it is a prefix match.
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
class Entry:
    """An entry has one or more user-agents and zero or more rulelines."""

    def __init__(self):
        self.useragents = []
        self.rulelines = []

    def __str__(self):
        parts = ["User-agent: %s\n" % agent for agent in self.useragents]
        parts.extend("%s\n" % line for line in self.rulelines)
        return "".join(parts)

    def applies_to(self, useragent):
        """Check if this entry applies to the specified agent."""
        # split off the version token and compare case-insensitively
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            if useragent.find(agent.lower()) != -1:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL decoded
        """
        for line in self.rulelines:
            _debug((filename, str(line), line.allowance))
            if line.applies_to(filename):
                return line.allowance
        # no rule matched: allowed by default
        return True
class URLopener(urllib.request.FancyURLopener):
    """FancyURLopener that records the last HTTP error code in
    ``self.errcode`` instead of only delegating, and that never
    prompts interactively for credentials."""

    def __init__(self, *args):
        super().__init__(*args)
        # assume success until an error handler fires
        self.errcode = 200

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        # remember the code, then fall back to the default handling
        self.errcode = errcode
        return urllib.request.FancyURLopener.http_error_default(
            self, url, fp, errcode, errmsg, headers)

    def prompt_user_passwd(self, host, realm):
        """Do not ask interactively for any password; return None, None."""
        return None, None
def _check(a,b):
if not b:
ac = "access denied"
else:
ac = "access allowed"
if a!=b:
print "failed"
else:
print "ok (%s)" % ac
print
def _test():
    """Exercise RobotFileParser against live robots.txt files.

    Requires network access; prints an ok/failed line per check via
    _check().
    """
    global debug
    rp = RobotFileParser()
    debug = 1
    # robots.txt that exists, gotten to by redirection
    rp.set_url('http://www.musi-cal.com/robots.txt')
    rp.read()
    # test for re.escape
    _check(rp.can_fetch('*', 'http://www.musi-cal.com/'), 1)
    # this should match the first rule, which is a disallow
    _check(rp.can_fetch('', 'http://www.musi-cal.com/'), 0)
    # various cherry pickers — every variant must be disallowed
    search_url = ('http://www.musi-cal.com/cgi-bin/event-search'
                  '?city=San+Francisco')
    for picker in ('CherryPickerSE', 'CherryPickerSE/1.0',
                   'CherryPickerSE/1.5'):
        _check(rp.can_fetch(picker, search_url), 0)
    # case sensitivity
    _check(rp.can_fetch('ExtractorPro', 'http://www.musi-cal.com/blubba'), 0)
    _check(rp.can_fetch('extractorpro', 'http://www.musi-cal.com/blubba'), 0)
    # substring test
    _check(rp.can_fetch('toolpak/1.1', 'http://www.musi-cal.com/blubba'), 0)
    # tests for catch-all * agent
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/search'), 0)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/Musician/me'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    _check(rp.can_fetch('spam', 'http://www.musi-cal.com/'), 1)
    # robots.txt that does not exist
    rp.set_url('http://www.lycos.com/robots.txt')
    rp.read()
    _check(rp.can_fetch('Mozilla', 'http://www.lycos.com/search'), 1)


if __name__ == '__main__':
    _test()

View file

@ -19,8 +19,8 @@
# imports and checks
import sys
if not hasattr(sys, 'version_info') or sys.version_info<(2, 2, 1, 'final', 0):
raise SystemExit, "This program requires Python 2.2.1 or later."
if not hasattr(sys, 'version_info') or sys.version_info<(2, 3, 0, 'final', 0):
raise SystemExit, "This program requires Python 2.3 or later."
import getopt, re, os, pprint, linkcheck
import linkcheck.timeoutsocket

View file

@ -146,7 +146,7 @@ myname = "Bastian Kleineidam"
myemail = "calvin@users.sourceforge.net"
setup (name = "linkchecker",
version = "1.8.23",
version = "1.9.0",
description = "check HTML documents for broken links",
author = myname,
author_email = myemail,
@ -199,5 +199,11 @@ o a (Fast)CGI web interface (requires HTTP server)
'lc.cgi','lc.fcgi','lc.sz_fcgi','linkchecker.bat']),
('share/man/man1', ['linkchecker.1']),
],
#classifiers = ['Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking',]
classifiers = [
'Topic :: Internet :: WWW/HTTP :: Site Management :: Link Checking',
'Development Status :: 5 - Production/Stable',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Programming Language :: Python',
'Programming Language :: C',
],
)