new intern/extern handling

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@2584 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2005-05-09 22:05:21 +00:00
parent 0c2b70cf1e
commit 1a7a771648
16 changed files with 123 additions and 171 deletions

3
TODO
View file

@ -1,8 +1,5 @@
Possible improvements people could work on:
- [USAGE] rethink intern/extern stuff. Especially when specifying
--extern one should not have to also specify --intern suddenly.
- [BUGFIX] when an URL is found in the cache and it has a broken anchor,
the broken anchor name is not displayed as a warning

View file

@ -119,9 +119,6 @@
# check anchors?
#anchors=0
#recursionlevel=1
# overall strict checking. You can specify for each extern URL
# separately if its strict or not. See the [filtering] section
#strict=0
# supply a regular expression for which warnings are printed if found
# in any HTML files.
#warningregex=Oracle DB Error
@ -135,15 +132,14 @@
# filtering options (see FAQ)
# for each extern link we can specify if it is strict or not
[filtering]
# everything with 'lconline' in the URL name is extern
# extern1=lconline 0
# everything with 'bookmark' in the URL name is strict
# extern2=bookmark 1
# links to our domain are intern
# internlinks=calvinsplayground\.de
# check only syntax of all mail addresses
# extern3=^mailto: 1
#denyallow=0
# ignore everything with 'lconline' in the URL name
#ignore1=lconline
# and ignore everything with 'bookmark' in the URL name
#ignore2=bookmark
# and ignore all mailto: URLs
#ignore3=^mailto:
# do not recurse into the following URLs
#nofollow1=http://justahomepage/bla
# specify hosts to contact directly without a proxy
# value is a regular expression
#noproxy1=*\.intra

View file

@ -160,34 +160,6 @@ import linkcheck.checker.nntpurl
import linkcheck.checker.errorurl
def set_intern_url (url, klass, config):
    """
    Add an intern URL pattern for a URL that was given on the
    command line.

    @param url: URL to add
    @type url: string
    @param klass: URL class
    @type klass: class object
    @param config: configuration data
    @type config: linkcheck.configuration.Configuration
    """
    linkcheck.log.debug(linkcheck.LOG_CHECK, "Set intern url for %r", url)
    # local files: everything reachable through file: is intern
    if klass == linkcheck.checker.fileurl.FileUrl:
        linkcheck.log.debug(linkcheck.LOG_CHECK, "Add intern pattern ^file:")
        config['internlinks'].append(linkcheck.get_link_pat("^file:"))
        return
    remote_classes = (linkcheck.checker.httpurl.HttpUrl,
                      linkcheck.checker.httpsurl.HttpsUrl,
                      linkcheck.checker.ftpurl.FtpUrl)
    if klass not in remote_classes:
        return
    # remote URLs: everything on the same (IDNA-encoded) host is intern
    host = linkcheck.strformat.url_unicode_split(url)[1]
    host, is_idn = linkcheck.url.idna_encode(host)
    if not host:
        return
    # prepend the scheme separator so the pattern anchors at the host part
    pattern = "://%s" % re.escape(host)
    linkcheck.log.debug(linkcheck.LOG_CHECK, "Add intern domain %r",
                        pattern)
    config['internlinks'].append(linkcheck.get_link_pat(pattern))
def absolute_url (base_url, base_ref, parent_url):
"""
Search for the absolute url to detect the link type. This does not
@ -212,7 +184,7 @@ def absolute_url (base_url, base_ref, parent_url):
def get_url_from (base_url, recursion_level, consumer,
parent_url=None, base_ref=None, line=0, column=0,
name=u"", cmdline=None):
name=u"", cmdline=True):
"""
Get url data from given base data.
@ -232,8 +204,6 @@ def get_url_from (base_url, recursion_level, consumer,
@type column: number
@param name: link name
@type name: string
@param cmdline: flag if url was given on command line
@type cmdline: bool
"""
base_url = linkcheck.strformat.unicode_safe(base_url)
if parent_url is not None:
@ -241,8 +211,6 @@ def get_url_from (base_url, recursion_level, consumer,
if base_ref is not None:
base_ref = linkcheck.strformat.unicode_safe(base_ref)
name = linkcheck.strformat.unicode_safe(name)
#if cmdline and linkcheck.url.url_needs_quoting(base_url):
# base_url = linkcheck.url.url_quote(base_url)
url = absolute_url(base_url, base_ref, parent_url)
# test scheme
if url.startswith("http:"):
@ -272,13 +240,17 @@ def get_url_from (base_url, recursion_level, consumer,
else:
# error url, no further checking, just log this
klass = linkcheck.checker.errorurl.ErrorUrl
if cmdline and not (consumer.config['internlinks'] or
consumer.config['externlinks']):
# set automatic intern/extern stuff if no filter was given
set_intern_url(url, klass, consumer.config)
return klass(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
url_data = klass(base_url, recursion_level, consumer,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
if cmdline:
# add intern URL regex to config for every URL that was given
# on the command line
pat = url_data.get_intern_pattern()
linkcheck.log.debug(linkcheck.LOG_CMDLINE, "Pattern %r", pat)
if pat:
consumer.config['internlinks'].append(linkcheck.get_link_pat(pat))
return url_data
def get_index_html (urls):

View file

@ -51,7 +51,7 @@ def _check_morsel (m, host, path):
class Cache (linkcheck.lock.AssertLock):
"""
Store and provide routines for cached data. Currently there are
caches for cookies, checked urls, FTP connections and robots.txt
caches for cookies, checked URLs, FTP connections and robots.txt
contents.
All public operations (except __init__()) are thread-safe.
@ -62,11 +62,11 @@ class Cache (linkcheck.lock.AssertLock):
Initialize the default options.
"""
super(Cache, self).__init__()
# already checked urls
# already checked URLs
self.checked = {}
# urls that are being checked
# URLs that are being checked
self.in_progress = {}
# to-be-checked urls
# to-be-checked URLs
self.incoming = collections.deque()
# downloaded robots.txt files
self.robots_txt = {}

View file

@ -51,7 +51,7 @@ def print_duration (duration):
class Consumer (linkcheck.lock.AssertLock):
"""
Consume urls from the url queue in a thread-safe manner.
Consume URLs from the URL queue in a thread-safe manner.
"""
def __init__ (self, config, cache):

View file

@ -255,3 +255,19 @@ class FileUrl (urlbase.UrlBase):
if ro.search(self.get_content()[:30]):
getattr(self, "parse_"+key)()
return
def get_intern_pattern (self):
    """
    Construct an intern matching pattern from this file URL.

    @return: non-empty regex pattern or None
    @rtype: String or None
    """
    url = linkcheck.checker.absolute_url(self.base_url, self.base_ref,
                                         self.parent_url)
    if not url:
        # no absolute URL could be determined, so no pattern either
        return None
    # use only the (escaped) path component for matching
    path = linkcheck.strformat.url_unicode_split(url)[2]
    return "file://%s" % re.escape(path)

View file

@ -27,12 +27,13 @@ import linkcheck
import urlbase
import proxysupport
import httpurl
import internpaturl
import linkcheck.ftpparse._ftpparse as ftpparse
DEFAULT_TIMEOUT_SECS = 300
class FtpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with ftp scheme.
"""

View file

@ -34,6 +34,7 @@ import linkcheck.robotparser2
import linkcheck.httplib2
import httpheaders as headers
import urlbase
import internpaturl
import proxysupport
supportHttps = hasattr(linkcheck.httplib2, "HTTPSConnection") and \
@ -45,7 +46,7 @@ _supported_encodings = ('gzip', 'x-gzip', 'deflate')
_is_amazon = re.compile(r'^www\.amazon\.(com|de|ca|fr|co\.(uk|jp))').search
class HttpUrl (urlbase.UrlBase, proxysupport.ProxySupport):
class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
"""
Url link with http scheme.
"""

View file

@ -22,14 +22,14 @@ import urllib
class ProxySupport (object):
"""
Get support for proxying and for urls with user:pass@host setting.
Get support for proxying and for URLs with user:pass@host setting.
"""
def set_proxy (self, proxy):
"""
Parse given proxy information and store parsed values.
Note that only http:// proxies are supported, both for ftp://
and http:// urls.
and http:// URLs.
"""
self.proxy = proxy
self.proxyauth = None

View file

@ -518,54 +518,34 @@ class UrlBase (object):
@rtype: bool
"""
linkcheck.log.debug(linkcheck.LOG_CHECK, "extern=%s", self.extern)
return self.extern[0] and \
(self.consumer.config["externstrictall"] or self.extern[1])
return self.extern[0] and self.extern[1]
def _get_extern (self, url):
"""
Match URL against intern and extern link patterns, according
to the configured denyallow order.
Match URL against extern and intern link patterns. If no pattern
matches the URL is extern.
@return: a tuple (is_extern, is_strict)
@rtype: tuple (bool, bool)
"""
if not (self.consumer.config["externlinks"] or \
self.consumer.config["internlinks"]):
return (0, 0)
# deny and allow external checking
linkcheck.log.debug(linkcheck.LOG_CHECK, "Url %r", url)
if self.consumer.config["denyallow"]:
for entry in self.consumer.config["externlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
entry)
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return (1, entry['strict'])
for entry in self.consumer.config["internlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
entry)
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return (0, 0)
return (0, 0)
else:
for entry in self.consumer.config["internlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
entry)
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return (0, 0)
for entry in self.consumer.config["externlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
entry)
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
return (1, entry['strict'])
return (1, 0)
for entry in self.consumer.config["externlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern entry %r",
entry)
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern URL %r", url)
return (1, entry['strict'])
for entry in self.consumer.config["internlinks"]:
linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern entry %r",
entry)
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
(match and not entry['negate']):
linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern URL %r", url)
return (0, 0)
linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern URL %r", url)
return (1, 0)
def can_get_content (self):
"""
@ -726,6 +706,15 @@ class UrlBase (object):
u"name=%r" % self.name,
])
def get_intern_pattern (self):
    """
    Get pattern for intern URL matching. The base class has no
    such pattern; subclasses with a notion of "same location"
    (e.g. file, http, ftp URL classes) override this method to
    return a regex string.

    @return: non-empty regex pattern or None
    @rtype: String or None
    """
    return None
def __str__ (self):
"""
Get URL info.

View file

@ -78,11 +78,9 @@ class Configuration (dict):
self['quiet'] = False
self["anchors"] = False
self["anchorcaching"] = True
self["externstrictall"] = False
self["externlinks"] = []
self["internlinks"] = []
self["noproxyfor"] = []
self["denyallow"] = False
self["interactive"] = False
# on ftp, password is set by Pythons ftplib
self["authentication"] = []
@ -270,8 +268,8 @@ class Configuration (dict):
except ConfigParser.Error, msg:
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
try:
self[key]['fields'] = [f.strip() \
for f in cfgparser.get(key, 'fields').split(',')]
self[key]['parts'] = [f.strip() \
for f in cfgparser.get(key, 'parts').split(',')]
except ConfigParser.Error, msg:
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
try:
@ -341,11 +339,6 @@ class Configuration (dict):
self["recursionlevel"] = num
except ConfigParser.Error, msg:
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
try:
self["externstrictall"] = \
cfgparser.getboolean(section, "externstrictall")
except ConfigParser.Error, msg:
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
try:
arg = cfgparser.get(section, "warningregex")
if arg:
@ -415,13 +408,26 @@ class Configuration (dict):
try:
i = 1
while 1:
ctuple = cfgparser.get(section, "extern%d" % i).split()
ctuple = cfgparser.get(section, "nofollow%d" % i).split()
if len(ctuple)!=2:
linkcheck.log.error(
_("extern%d: syntax error %s\n") % (i, ctuple))
_("nofollow%d: syntax error %s\n") % (i, ctuple))
break
self["externlinks"].append(
linkcheck.get_link_pat(ctuple[0], strict=int(ctuple[1])))
linkcheck.get_link_pat(ctuple[0], strict=0))
i += 1
except ConfigParser.Error, msg:
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
try:
i = 1
while 1:
ctuple = cfgparser.get(section, "ignore%d" % i).split()
if len(ctuple)!=2:
linkcheck.log.error(
_("ignore%d: syntax error %s\n") % (i, ctuple))
break
self["externlinks"].append(
linkcheck.get_link_pat(ctuple[0], strict=1))
i += 1
except ConfigParser.Error, msg:
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
@ -430,10 +436,6 @@ class Configuration (dict):
linkcheck.get_link_pat(cfgparser.get(section, "internlinks")))
except ConfigParser.Error, msg:
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
try:
self["denyallow"] = cfgparser.getboolean(section, "denyallow")
except ConfigParser.Error, msg:
linkcheck.log.debug(linkcheck.LOG_CHECK, msg)
def write_boolean_config (self, fp, boolopts):
"""

View file

@ -86,8 +86,6 @@ def checklink (out=sys.stdout, form=None, env=os.environ):
config["recursionlevel"] = int(form["level"].value)
config["logger"] = config.logger_new('html', fd=out)
config["threads"] = 0
if form.has_key('externstrictall'):
config['externstrictall'] = True
if form.has_key("anchors"):
config["anchors"] = True
if not form.has_key("errors"):
@ -146,7 +144,7 @@ def checkform (form):
if not _is_level(level):
raise FormError, _("invalid recursion level")
# check options
for option in ("externstrictall", "anchors", "errors", "intern"):
for option in ("anchors", "errors", "intern"):
if form.has_key(option):
if not form[option].value == "on":
raise FormError, _("invalid %s option syntax") % option

View file

@ -23,6 +23,9 @@ try:
except ImportError:
import dummy_threading as threading
import linkcheck
import linkcheck.log
lock_klass = threading.RLock().__class__
class AssertLock (lock_klass):
@ -36,6 +39,7 @@ class AssertLock (lock_klass):
Acquire lock.
"""
assert not self.is_locked(), "deadlock"
#linkcheck.log.debug(linkcheck.LOG_THREAD, "Acquire %s", self, tb=True)
super(AssertLock, self).acquire(blocking=blocking)
def release (self):
@ -43,6 +47,7 @@ class AssertLock (lock_klass):
Release lock.
"""
assert self.is_locked(), "double release"
#linkcheck.log.debug(linkcheck.LOG_THREAD, "Release %s", self, tb=True)
super(AssertLock, self).release()
def is_locked (self):
@ -50,3 +55,4 @@ class AssertLock (lock_klass):
See if this lock is owned.
"""
return self._is_owned()

View file

@ -339,7 +339,7 @@ def collapse_segments (path):
# backslashes to be left alone, and finally quoted with '%5C')
# But replacing has several positive effects:
# - Prevents path attacks on Windows systems (using \.. parent refs)
# - Fixes bad urls where users used backslashes instead of slashes.
# - Fixes bad URLs where users used backslashes instead of slashes.
# This is a far more probable case than users having an intentional
# backslash in the path name.
path = path.replace('\\', '/')

View file

@ -79,7 +79,7 @@ o URLs on the command line starting with "ftp." are treated like
o If you have your system configured to automatically establish a
connection to the internet (e.g. with diald), it will connect when
checking links not pointing to your local system.
See the --extern-strict-all option on how to prevent this.
See the --ignore-url option on how to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker disables it
automatically.
@ -107,7 +107,7 @@ thousands of URLs. Use the -r option to restrict the recursion depth.
Don't connect to mailto: hosts, only check their URL syntax. All other
links are checked as usual:
linkchecker --intern='!^mailto:' --extern-strict-all www.mysite.org
linkchecker --ignore-url=^mailto: www.mysite.org
Checking a local HTML file on Unix:
linkchecker ../bla.html
@ -140,7 +140,7 @@ sql Log check result as SQL script with INSERT commands. An example
blacklist
Suitable for cron jobs. Logs the check result into a file
~/.linkchecker/blacklist which only contains entries with invalid
urls and the number of times they have failed.
URLs and the number of times they have failed.
none Logs nothing. Suitable for scripts.
""")
@ -396,29 +396,12 @@ group.add_option("-r", "--recursion-level", type="int", dest="recursionlevel",
help=_(
"""Check recursively all links up to given depth. A negative depth
will enable infinite recursion. Default depth is infinite."""))
group.add_option("-i", "--intern", type="string", action="append",
dest="intern", help=_(
""" regex, --intern=regex
Assume URLs that match the given expression as internal.
LinkChecker descends recursively only to internal URLs, not to
external."""))
group.add_option("-e", "--extern", type="string", action="append",
group.add_option("--no-follow-url", type="string", action="append",
dest="extern", help=_(
"""Assume urls that match the given expression as external.
Only internal HTML links are checked recursively."""))
group.add_option("--extern-strict", type="string", action="append",
"""Check but do not recurse into URLs matching the given regex."""))
group.add_option("--ignore-url", type="string", action="append",
dest="externstrict", help=_(
"""Assume urls that match the given expression as strict external.
Only internal HTML links are checked recursively."""))
group.add_option("-s", "--extern-strict-all", action="store_true",
dest="externstrictall", help=_(
"""Check only syntax of external links, do not try to connect to them.
For local file urls, only local files are internal. For
http and ftp urls, all urls at the same domain name are internal."""))
group.add_option("-d", "--denyallow", action="store_true", dest="denyallow",
help=_(
"""Swap checking order to external/internal. Default checking order
is internal/external."""))
"""Only check syntax of URLs matching the given regex."""))
group.add_option("-C", "--cookies", action="store_true", dest="cookies",
help=_(
"""Accept and send HTTP cookies according to RFC 2109. Only cookies
@ -427,8 +410,7 @@ Sent and accepted cookies are provided as additional logging
information."""))
group.add_option("-a", "--anchors", action="store_true", dest="anchors",
help=_(
"""Check HTTP anchor references. This option applies to both internal
and external urls. Default is don't check anchors.
"""Check HTTP anchor references. Default is don't check anchors.
This option implies -w because anchor errors are always warnings."""))
group.add_option("--no-anchor-caching", action="store_false",
dest="anchorcaching", help=_(
@ -462,12 +444,6 @@ group.add_option("--no-proxy-for", type="string", action="append",
going through a proxy."""))
optparser.add_option_group(group)
################# deprecated options ##################
group = optparse.OptionGroup(optparser, _("Deprecated options"))
group.add_option("-w", "--warnings", action="store_true", dest="warnings",
help=_("""Log warnings. This is the default."""))
optparser.add_option_group(group)
################# auto completion #####################
if has_optcomplete:
optcomplete.autocomplete(optparser)
@ -570,11 +546,6 @@ if options.fileoutput:
config['fileoutput'].append(logger)
if options.interactive is not None:
config['interactive'] = options.interactive
if options.intern:
pats = [linkcheck.get_link_pat(arg) for arg in options.intern]
config["internlinks"].extend(pats)
if options.denyallow is not None:
config["denyallow"] = options.denyallow
if options.nntpserver:
config["nntpserver"] = options.nntpserver
if options.anchorcaching is not None:
@ -594,8 +565,6 @@ if options.quiet is not None:
config['logger'] = config.logger_new('none')
if options.recursionlevel is not None:
config["recursionlevel"] = options.recursionlevel
if options.externstrictall is not None:
config["externstrictall"] = options.externstrictall
if options.status is not None:
config['status'] = options.status
if options.threads is not None:
@ -646,10 +615,10 @@ if (linkcheck.logger.gml.GMLLogger in klasses or \
if len(args) <= 0:
if config['interactive']:
urls = raw_input(
_("enter one or more urls, separated by white-space\n--> "))
_("enter one or more URLs, separated by white-space\n--> "))
args = urls.split()
else:
linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or urls given"))
linkcheck.log.warn(linkcheck.LOG_CMDLINE, _("no files or URLs given"))
# initialize the cache and the consumer model
cache = linkcheck.checker.cache.Cache()
@ -661,9 +630,9 @@ for url in args:
elif url.lower().startswith("ftp."):
url = "ftp://%s" % url
url_data = linkcheck.checker.get_url_from(url, 0, consumer, cmdline=True)
# add to consumer queue
consumer.append_url(url_data)
############################# check the urls ################################
############################# check the URLs ################################
if do_profile and not has_profile:
linkcheck.log.warn(linkcheck.LOG_CMDLINE,
_("The `profile' Python module is not installed,"
@ -697,7 +666,7 @@ elif options.psyco:
except ImportError:
# no psyco available, just ignore
pass
linkcheck.checker.check_urls(consumer)
linkcheck.checker.check_urls(consumer)
#############################################################################
# interactive input end

View file

@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: calvin@users.sourceforge.net\n"
"POT-Creation-Date: 2005-05-08 22:12+0200\n"
"POT-Creation-Date: 2005-05-09 23:59+0200\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
@ -618,7 +618,12 @@ msgstr ""
#: ../linkcheck/configuration.py:414
#, python-format
msgid "extern%d: syntax error %s\n"
msgid "nofollow%d: syntax error %s\n"
msgstr ""
#: ../linkcheck/configuration.py:427
#, python-format
msgid "ignore%d: syntax error %s\n"
msgstr ""
#: ../linkcheck/strformat.py:180