mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-02 11:54:43 +00:00
Optimize intern/extern pattern parsing.
This commit is contained in:
parent
5554c16aa2
commit
a03090c20f
13 changed files with 72 additions and 69 deletions
|
|
@ -101,14 +101,6 @@ class LinkCheckerInterrupt (StandardError):
|
|||
"""Used for testing."""
|
||||
pass
|
||||
|
||||
def add_intern_pattern (url_data, config):
    """Register the intern URL pattern of url_data in config.

    Asks url_data for its intern pattern; a non-empty pattern is turned
    into a link matcher and appended to config['internlinks'].
    Afterwards the extern flags of url_data are evaluated against its
    own URL.
    """
    pattern = url_data.get_intern_pattern()
    if pattern:
        log.debug(LOG_CHECK, "Add intern pattern %r", pattern)
        intern_links = config['internlinks']
        intern_links.append(get_link_pat(pattern))
    # Always (re)compute the extern flags, pattern or not.
    url_data.set_extern(url_data.url)
|
||||
|
||||
|
||||
def get_link_pat (arg, strict=False):
|
||||
"""Get a link pattern matcher for intern/extern links.
|
||||
|
|
|
|||
|
|
@ -50,7 +50,7 @@ def absolute_url (base_url, base_ref, parent_url):
|
|||
|
||||
def get_url_from (base_url, recursion_level, aggregate,
|
||||
parent_url=None, base_ref=None, line=0, column=0,
|
||||
name=u"", parent_content_type=None):
|
||||
name=u"", parent_content_type=None, extern=None):
|
||||
"""
|
||||
Get url data from given base data.
|
||||
|
||||
|
|
@ -70,6 +70,8 @@ def get_url_from (base_url, recursion_level, aggregate,
|
|||
@type column: number
|
||||
@param name: link name
|
||||
@type name: string
|
||||
@param extern: (is_extern, is_strict) or None
|
||||
@type extern: tuple(int, int) or None
|
||||
"""
|
||||
if base_url is not None:
|
||||
base_url = strformat.unicode_safe(base_url)
|
||||
|
|
@ -96,7 +98,7 @@ def get_url_from (base_url, recursion_level, aggregate,
|
|||
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
|
||||
return klass(base_url, recursion_level, aggregate,
|
||||
parent_url=parent_url, base_ref=base_ref,
|
||||
line=line, column=column, name=name)
|
||||
line=line, column=column, name=name, extern=extern)
|
||||
|
||||
|
||||
def get_urlclass_from (url, assume_local_file=False):
|
||||
|
|
|
|||
|
|
@ -95,7 +95,7 @@ class FileUrl (urlbase.UrlBase):
|
|||
"""
|
||||
|
||||
def init (self, base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, name, url_encoding):
|
||||
aggregate, line, column, name, url_encoding, extern):
|
||||
"""
|
||||
Besides the usual initialization the URL is normed according
|
||||
to the platform:
|
||||
|
|
@ -103,7 +103,7 @@ class FileUrl (urlbase.UrlBase):
|
|||
- under Windows platform the drive specifier is normed
|
||||
"""
|
||||
super(FileUrl, self).init(base_ref, base_url, parent_url,
|
||||
recursion_level, aggregate, line, column, name, url_encoding)
|
||||
recursion_level, aggregate, line, column, name, url_encoding, extern)
|
||||
self.scheme = u'file'
|
||||
if self.base_url is None:
|
||||
return
|
||||
|
|
@ -286,13 +286,14 @@ class FileUrl (urlbase.UrlBase):
|
|||
self.content_type = u""
|
||||
return self.content_type
|
||||
|
||||
def get_intern_pattern (self):
|
||||
def get_intern_pattern (self, url=None):
|
||||
"""Get pattern for intern URL matching.
|
||||
|
||||
@return non-empty regex pattern or None
|
||||
@rtype String or None
|
||||
"""
|
||||
url = self.url
|
||||
if url is None:
|
||||
url = self.url
|
||||
if not url:
|
||||
return None
|
||||
if url.startswith('file://'):
|
||||
|
|
|
|||
|
|
@ -30,8 +30,7 @@ from cStringIO import StringIO
|
|||
from datetime import datetime
|
||||
|
||||
from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
|
||||
httplib2 as httplib, LinkCheckerError, get_link_pat, httputil,
|
||||
configuration)
|
||||
httplib2 as httplib, LinkCheckerError, httputil, configuration)
|
||||
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
|
||||
get_url_from)
|
||||
# import warnings
|
||||
|
|
@ -380,11 +379,10 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
if self.recursion_level == 0 and urlparts[0] in ('http', 'https'):
|
||||
# Add intern patterns for redirection of URLs given by the
|
||||
# user for HTTP schemes.
|
||||
pat = internpaturl.get_intern_pattern(redirected)
|
||||
log.debug(LOG_CHECK, "Add intern pattern %r", pat)
|
||||
self.aggregate.config['internlinks'].append(get_link_pat(pat))
|
||||
self.add_intern_pattern(url=redirected)
|
||||
return True
|
||||
# check extern filter again
|
||||
self.extern = None
|
||||
self.set_extern(redirected)
|
||||
if self.extern[0] and self.extern[1]:
|
||||
if set_result:
|
||||
|
|
|
|||
|
|
@ -49,14 +49,15 @@ def get_intern_pattern (url):
|
|||
class InternPatternUrl (urlbase.UrlBase):
|
||||
"""Class supporting an intern URL pattern."""
|
||||
|
||||
def get_intern_pattern (self, url=None):
    """
    Get pattern for intern URL matching.

    @param url: URL to build the pattern for; when None, the absolute
        URL computed from base_url, base_ref and parent_url is used
    @return non-empty regex pattern or None
    @rtype String or None
    """
    # Only fall back to the computed URL when the caller supplied none;
    # an explicitly passed URL must not be overwritten.
    if url is None:
        url = absolute_url(self.base_url, self.base_ref, self.parent_url)
    if not url:
        return None
    return get_intern_pattern(url)
|
||||
|
|
|
|||
|
|
@ -74,7 +74,6 @@ class UnknownUrl (urlbase.UrlBase):
|
|||
|
||||
def local_check (self):
|
||||
"""Only logs that this URL is unknown."""
|
||||
self.set_extern(self.url)
|
||||
if self.extern[0] and self.extern[1]:
|
||||
self.add_info(_("Outside of domain filter, checked only syntax."))
|
||||
elif self.ignored():
|
||||
|
|
|
|||
|
|
@ -31,7 +31,7 @@ import select
|
|||
from . import absolute_url, StoringHandler, get_url_from
|
||||
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
|
||||
strformat, LinkCheckerError, url as urlutil, trace, clamav, winutil, geoip,
|
||||
fileutil)
|
||||
fileutil, get_link_pat)
|
||||
from ..HtmlParser import htmlsax
|
||||
from ..htmlutil import linkparse
|
||||
from ..network import iputil
|
||||
|
|
@ -109,7 +109,7 @@ class UrlBase (object):
|
|||
|
||||
def __init__ (self, base_url, recursion_level, aggregate,
|
||||
parent_url=None, base_ref=None, line=-1, column=-1,
|
||||
name=u"", url_encoding=None):
|
||||
name=u"", url_encoding=None, extern=None):
|
||||
"""
|
||||
Initialize check data, and store given variables.
|
||||
|
||||
|
|
@ -122,14 +122,18 @@ class UrlBase (object):
|
|||
@param column: column number of url in parent content
|
||||
@param name: name of url or empty
|
||||
@param url_encoding: encoding of URL or None
|
||||
@param extern: None or (is_extern, is_strict)
|
||||
"""
|
||||
self.reset()
|
||||
self.init(base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, name, url_encoding)
|
||||
aggregate, line, column, name, url_encoding, extern)
|
||||
self.check_syntax()
|
||||
if recursion_level == 0:
|
||||
self.add_intern_pattern()
|
||||
self.set_extern(self.url)
|
||||
|
||||
def init (self, base_ref, base_url, parent_url, recursion_level,
|
||||
aggregate, line, column, name, url_encoding):
|
||||
aggregate, line, column, name, url_encoding, extern):
|
||||
"""
|
||||
Initialize internal data.
|
||||
"""
|
||||
|
|
@ -143,6 +147,7 @@ class UrlBase (object):
|
|||
self.name = name
|
||||
self.encoding = url_encoding
|
||||
self.charset = None
|
||||
self.extern = extern
|
||||
if self.base_ref:
|
||||
assert not urlutil.url_needs_quoting(self.base_ref), \
|
||||
"unquoted base reference URL %r" % self.base_ref
|
||||
|
|
@ -199,8 +204,8 @@ class UrlBase (object):
|
|||
# cache keys, are set by build_url() calling set_cache_keys()
|
||||
self.cache_url_key = None
|
||||
self.cache_content_key = None
|
||||
# extern flags (is_extern, is_strict), both enabled as default
|
||||
self.extern = (1, 1)
|
||||
# extern flags (is_extern, is_strict)
|
||||
self.extern = None
|
||||
# flag if the result should be cached
|
||||
self.caching = True
|
||||
# title is either the URL or parsed from content
|
||||
|
|
@ -399,7 +404,6 @@ class UrlBase (object):
|
|||
self.set_result(unicode_safe(msg), valid=False)
|
||||
else:
|
||||
self.set_cache_keys()
|
||||
self.set_extern(self.url)
|
||||
|
||||
def check_url_warnings(self):
|
||||
"""Check URL name and length."""
|
||||
|
|
@ -704,6 +708,11 @@ class UrlBase (object):
|
|||
|
||||
@return: None
|
||||
"""
|
||||
if self.extern:
|
||||
return
|
||||
if not url:
|
||||
self.extern = (1, 1)
|
||||
return
|
||||
for entry in self.aggregate.config["externlinks"]:
|
||||
match = entry['pattern'].search(url)
|
||||
if (entry['negate'] and not match) or \
|
||||
|
|
@ -1133,15 +1142,28 @@ class UrlBase (object):
|
|||
u"anchor=%r" % self.anchor,
|
||||
])
|
||||
|
||||
def get_intern_pattern (self, url=None):
    """Get pattern for intern URL matching.

    This base implementation provides no pattern and always returns
    None.

    @param url: the URL to set intern pattern for, else self.url
    @type url: unicode or None
    @return non-empty regex pattern or None
    @rtype String or None
    """
    return None
|
||||
|
||||
def add_intern_pattern(self, url=None):
    """Add intern URL regex to config.

    Asks get_intern_pattern() for a pattern; a non-empty pattern is
    compiled with get_link_pat() and appended to the 'internlinks'
    list of the aggregate configuration. A UnicodeError raised while
    doing so is not propagated: it is stored as an invalid result on
    this URL instead.

    @param url: URL passed through to get_intern_pattern(), or None
    """
    try:
        pat = self.get_intern_pattern(url=url)
        if pat:
            log.debug(LOG_CHECK, "Add intern pattern %r", pat)
            self.aggregate.config['internlinks'].append(get_link_pat(pat))
    except UnicodeError as msg:
        # Report the failure on the URL itself rather than aborting.
        res = _("URL has unparsable domain name: %(domain)s") % \
            {"domain": msg}
        self.set_result(res, valid=False)
|
||||
|
||||
def __str__ (self):
|
||||
"""
|
||||
Get URL info.
|
||||
|
|
|
|||
|
|
@ -19,8 +19,7 @@ Utility functions suitable for command line clients.
|
|||
"""
|
||||
import sys
|
||||
import optparse
|
||||
from . import fileutil, ansicolor, strformat, add_intern_pattern, checker, \
|
||||
log, LOG_CMDLINE
|
||||
from . import fileutil, ansicolor, strformat, checker
|
||||
from .director import console
|
||||
from .decorators import notimplemented
|
||||
|
||||
|
|
@ -113,7 +112,7 @@ class LCOptionParser (optparse.OptionParser, object):
|
|||
pass
|
||||
|
||||
|
||||
def aggregate_url (aggregate, config, url, err_exit_code=2):
|
||||
def aggregate_url (aggregate, url, err_exit_code=2):
|
||||
"""Append given commandline URL to input queue."""
|
||||
get_url_from = checker.get_url_from
|
||||
if url.lower().startswith("www."):
|
||||
|
|
@ -122,12 +121,5 @@ def aggregate_url (aggregate, config, url, err_exit_code=2):
|
|||
elif url.lower().startswith("ftp."):
|
||||
# syntactic sugar
|
||||
url = "ftp://%s" % url
|
||||
url_data = get_url_from(url, 0, aggregate)
|
||||
try:
|
||||
add_intern_pattern(url_data, config)
|
||||
except UnicodeError:
|
||||
log.error(LOG_CMDLINE,
|
||||
_("URL has unparsable domain name: %(domain)s") %
|
||||
{"domain": sys.exc_info()[1]})
|
||||
sys.exit(err_exit_code)
|
||||
url_data = get_url_from(url, 0, aggregate, extern=(0, 0))
|
||||
aggregate.urlqueue.put(url_data)
|
||||
|
|
|
|||
|
|
@ -36,8 +36,8 @@ from .urlsave import urlsave
|
|||
from .settings import Settings
|
||||
from .recentdocs import RecentDocumentModel
|
||||
from .projects import openproject, saveproject, loadproject, ProjectExt
|
||||
from .. import configuration, checker, director, add_intern_pattern, \
|
||||
strformat, fileutil, LinkCheckerError, get_link_pat, memoryutil
|
||||
from .. import configuration, checker, director, get_link_pat, \
|
||||
strformat, fileutil, LinkCheckerError, memoryutil
|
||||
from ..containers import enum
|
||||
from .. import url as urlutil
|
||||
from ..checker import httpheaders
|
||||
|
|
@ -458,14 +458,15 @@ Version 2 or later.
|
|||
self.set_statusmsg(_("Error, empty URL"))
|
||||
return
|
||||
self.set_statusmsg(_("Checking '%s'.") % strformat.limit(url, 40))
|
||||
url_data = checker.get_url_from(url, 0, aggregate)
|
||||
try:
|
||||
self.backup_config('internlinks')
|
||||
add_intern_pattern(url_data, self.config)
|
||||
except UnicodeError:
|
||||
self.set_statusmsg(_("Error, invalid URL `%s'.") %
|
||||
strformat.limit(url, 40))
|
||||
return
|
||||
url_data = checker.get_url_from(url, 0, aggregate, extern=(0, 0))
|
||||
# XXX
|
||||
#try:
|
||||
# self.backup_config('internlinks')
|
||||
# add_intern_pattern(url_data, self.config)
|
||||
#except UnicodeError:
|
||||
# self.set_statusmsg(_("Error, invalid URL `%s'.") %
|
||||
# strformat.limit(url, 40))
|
||||
# return
|
||||
self.recent.add_document(url)
|
||||
aggregate.urlqueue.put(url_data)
|
||||
self.aggregate = aggregate
|
||||
|
|
|
|||
|
|
@ -25,8 +25,8 @@ import locale
|
|||
import re
|
||||
import time
|
||||
import urlparse
|
||||
from . import configuration, strformat, checker, director, \
|
||||
add_intern_pattern, get_link_pat, init_i18n, url as urlutil
|
||||
from . import configuration, strformat, checker, director, get_link_pat, \
|
||||
init_i18n, url as urlutil
|
||||
from .decorators import synchronized
|
||||
|
||||
# 5 minutes timeout for requests
|
||||
|
|
@ -139,14 +139,7 @@ def checklink (form=None, env=os.environ):
|
|||
config = get_configuration(form, out)
|
||||
url = strformat.stripurl(formvalue(form, "url"))
|
||||
aggregate = director.get_aggregate(config)
|
||||
url_data = checker.get_url_from(url, 0, aggregate)
|
||||
try:
|
||||
add_intern_pattern(url_data, config)
|
||||
except UnicodeError, errmsg:
|
||||
log(env, errmsg)
|
||||
msg = _("URL has unparsable domain name: %s") % errmsg
|
||||
yield encode(format_error(msg))
|
||||
return
|
||||
url_data = checker.get_url_from(url, 0, aggregate, extern=(0, 0))
|
||||
aggregate.urlqueue.put(url_data)
|
||||
for html_str in start_check(aggregate, out):
|
||||
yield encode(html_str)
|
||||
|
|
|
|||
|
|
@ -454,12 +454,16 @@ if has_optcomplete:
|
|||
|
||||
def read_stdin_urls (stream=None):
    """Read list of URLs, separated by white-space, from stdin.

    Input is consumed in batches of lines (readlines() with a size hint
    of 8 * 1024) and every white-space separated token is yielded as
    one URL. An info message is logged after every 10000 URLs read.

    @param stream: file-like object to read from; defaults to sys.stdin
    @return: generator yielding the URLs in input order
    """
    if stream is None:
        stream = sys.stdin
    num = 0
    while True:
        lines = stream.readlines(8 * 1024)
        if not lines:
            # end of input
            break
        for line in lines:
            for url in line.split():
                num += 1
                if num % 10000 == 0:
                    log.info(LOG_CMDLINE, "Read %d URLs from stdin", num)
                yield url
|
||||
|
||||
|
||||
|
|
@ -662,10 +666,10 @@ if options.trace:
|
|||
# add urls to queue
|
||||
if options.stdin:
|
||||
for url in read_stdin_urls():
|
||||
aggregate_url(aggregate, config, url)
|
||||
aggregate_url(aggregate, url)
|
||||
elif args:
|
||||
for url in args:
|
||||
aggregate_url(aggregate, config, strformat.stripurl(url))
|
||||
aggregate_url(aggregate, strformat.stripurl(url))
|
||||
else:
|
||||
log.warn(LOG_CMDLINE, _("no files or URLs given"))
|
||||
# set up profiling
|
||||
|
|
|
|||
|
|
@ -163,7 +163,7 @@ aggregate = get_aggregate(config)
|
|||
# add urls to queue
|
||||
if args:
|
||||
for url in args:
|
||||
aggregate_url(aggregate, config, strformat.stripurl(url), err_exit_code=3)
|
||||
aggregate_url(aggregate, strformat.stripurl(url), err_exit_code=3)
|
||||
else:
|
||||
log.warn(LOG_CMDLINE, _("no files or URLs given"))
|
||||
sys.exit(3)
|
||||
|
|
|
|||
|
|
@ -184,8 +184,7 @@ class LinkCheckTest (unittest.TestCase):
|
|||
confargs = {}
|
||||
logargs = {'expected': self.get_resultlines(filename)}
|
||||
aggregate = get_test_aggregate(confargs, logargs)
|
||||
url_data = get_url_from(url, 0, aggregate)
|
||||
linkcheck.add_intern_pattern(url_data, aggregate.config)
|
||||
url_data = get_url_from(url, 0, aggregate, extern=(0, 0))
|
||||
aggregate.urlqueue.put(url_data)
|
||||
linkcheck.director.check_urls(aggregate)
|
||||
diff = aggregate.config['logger'].diff
|
||||
|
|
@ -213,7 +212,6 @@ class LinkCheckTest (unittest.TestCase):
|
|||
# initial URL has recursion level zero
|
||||
url_reclevel = 0
|
||||
url_data = get_url_from(url, url_reclevel, aggregate)
|
||||
linkcheck.add_intern_pattern(url_data, aggregate.config)
|
||||
aggregate.urlqueue.put(url_data)
|
||||
linkcheck.director.check_urls(aggregate)
|
||||
diff = aggregate.config['logger'].diff
|
||||
|
|
|
|||
Loading…
Reference in a new issue