From b6de623f4e3428acbb42db865d7a946fff4ad2cd Mon Sep 17 00:00:00 2001 From: calvin Date: Thu, 2 Nov 2000 08:26:14 +0000 Subject: [PATCH] proxy config git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@185 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- TODO | 7 ++++++- linkcheck/HttpUrlData.py | 2 ++ linkcheck/UrlData.py | 2 +- test/test1.html | 1 + test/test2.html | 11 ++++++++--- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/TODO b/TODO index 8eb1b90d..7d460a51 100644 --- a/TODO +++ b/TODO @@ -1,6 +1,11 @@ High priority -o Use Python 2.0 features +o Proxy geht nicht: + - getrennter http/https/ftp proxy + - environment Variablen werden bei RobotParser benutzt, also muß ich + das auch machen. + +o Robot parser testen o I want to be able to supply a "break" command even when multiple threads are running. diff --git a/linkcheck/HttpUrlData.py b/linkcheck/HttpUrlData.py index ed97ab7c..947a8ba5 100644 --- a/linkcheck/HttpUrlData.py +++ b/linkcheck/HttpUrlData.py @@ -196,7 +196,9 @@ class HttpUrlData(UrlData): roboturl="%s://%s/robots.txt" % self.urlTuple[0:2] rp = robotparser.RobotFileParser() rp.set_url(roboturl) + print roboturl rp.read() + print "2" robotsTxt = rp.can_fetch(Config.UserAgent, self.url) config.robotsTxtCache_set(self.urlTuple[0:2], robotsTxt) return config.robotsTxtCache_get(self.url) diff --git a/linkcheck/UrlData.py b/linkcheck/UrlData.py index 404bd2e8..12ca1675 100644 --- a/linkcheck/UrlData.py +++ b/linkcheck/UrlData.py @@ -37,8 +37,8 @@ _linkMatcher = r""" < # open tag \s* # whitespace %s # tag name - \s+ # whitespace [^>]*? # skip leading attributes + \s+ # whitespace %s # attrib name \s* # whitespace = # equal sign diff --git a/test/test1.html b/test/test1.html index 46ad1564..bf2371da 100644 --- a/test/test1.html +++ b/test/test1.html @@ -6,6 +6,7 @@ Just some HTTP links + diff --git a/test/test2.html b/test/test2.html index 016cc7ac..e582702a 100644 --- a/test/test2.html +++ b/test/test2.html @@ -16,6 +16,11 @@ -< img src="blubb"> -< link href="blubb"> -< script src="bla"> +< img src="blubb_image"> +< img lowsrc="blubb_lowimage"> +< link href="blubb_link"> +< script src="blubb_script"> +< area href="blubb_area"> +< body background="blubb_body"> +< area href="blubb_href"> +< form action="blubb_action">