Optimize intern/extern pattern parsing.

This commit is contained in:
Bastian Kleineidam 2012-09-20 20:19:13 +02:00
parent 5554c16aa2
commit a03090c20f
13 changed files with 72 additions and 69 deletions

View file

@ -101,14 +101,6 @@ class LinkCheckerInterrupt (StandardError):
"""Used for testing."""
pass
def add_intern_pattern (url_data, config):
    """Register the intern URL regex of url_data in the configuration.

    If url_data yields an intern pattern, compile it into a link
    pattern and append it to config['internlinks'].  The extern flags
    of url_data are recalculated afterwards in either case.
    """
    pattern = url_data.get_intern_pattern()
    if pattern:
        log.debug(LOG_CHECK, "Add intern pattern %r", pattern)
        config['internlinks'].append(get_link_pat(pattern))
    url_data.set_extern(url_data.url)
def get_link_pat (arg, strict=False):
"""Get a link pattern matcher for intern/extern links.

View file

@ -50,7 +50,7 @@ def absolute_url (base_url, base_ref, parent_url):
def get_url_from (base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=0, column=0,
name=u"", parent_content_type=None):
name=u"", parent_content_type=None, extern=None):
"""
Get url data from given base data.
@ -70,6 +70,8 @@ def get_url_from (base_url, recursion_level, aggregate,
@type column: number
@param name: link name
@type name: string
@param extern: (is_extern, is_strict) or None
@type extern: tuple(int, int) or None
"""
if base_url is not None:
base_url = strformat.unicode_safe(base_url)
@ -96,7 +98,7 @@ def get_url_from (base_url, recursion_level, aggregate,
log.debug(LOG_CHECK, "%s handles url %s", klass.__name__, base_url)
return klass(base_url, recursion_level, aggregate,
parent_url=parent_url, base_ref=base_ref,
line=line, column=column, name=name)
line=line, column=column, name=name, extern=extern)
def get_urlclass_from (url, assume_local_file=False):

View file

@ -95,7 +95,7 @@ class FileUrl (urlbase.UrlBase):
"""
def init (self, base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name, url_encoding):
aggregate, line, column, name, url_encoding, extern):
"""
Besides the usual initialization the URL is normed according
to the platform:
@ -103,7 +103,7 @@ class FileUrl (urlbase.UrlBase):
- under Windows platform the drive specifier is normed
"""
super(FileUrl, self).init(base_ref, base_url, parent_url,
recursion_level, aggregate, line, column, name, url_encoding)
recursion_level, aggregate, line, column, name, url_encoding, extern)
self.scheme = u'file'
if self.base_url is None:
return
@ -286,13 +286,14 @@ class FileUrl (urlbase.UrlBase):
self.content_type = u""
return self.content_type
def get_intern_pattern (self):
def get_intern_pattern (self, url=None):
"""Get pattern for intern URL matching.
@return non-empty regex pattern or None
@rtype String or None
"""
url = self.url
if url is None:
url = self.url
if not url:
return None
if url.startswith('file://'):

View file

@ -30,8 +30,7 @@ from cStringIO import StringIO
from datetime import datetime
from .. import (log, LOG_CHECK, gzip2 as gzip, strformat, url as urlutil,
httplib2 as httplib, LinkCheckerError, get_link_pat, httputil,
configuration)
httplib2 as httplib, LinkCheckerError, httputil, configuration)
from . import (internpaturl, proxysupport, httpheaders as headers, urlbase,
get_url_from)
# import warnings
@ -380,11 +379,10 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
if self.recursion_level == 0 and urlparts[0] in ('http', 'https'):
# Add intern patterns for redirection of URLs given by the
# user for HTTP schemes.
pat = internpaturl.get_intern_pattern(redirected)
log.debug(LOG_CHECK, "Add intern pattern %r", pat)
self.aggregate.config['internlinks'].append(get_link_pat(pat))
self.add_intern_pattern(url=redirected)
return True
# check extern filter again
self.extern = None
self.set_extern(redirected)
if self.extern[0] and self.extern[1]:
if set_result:

View file

@ -49,14 +49,15 @@ def get_intern_pattern (url):
class InternPatternUrl (urlbase.UrlBase):
"""Class supporting an intern URL pattern."""
def get_intern_pattern (self):
def get_intern_pattern (self, url=None):
"""
Get pattern for intern URL matching.
@return non-empty regex pattern or None
@rtype String or None
"""
url = absolute_url(self.base_url, self.base_ref, self.parent_url)
if url is None:
url = absolute_url(self.base_url, self.base_ref, self.parent_url)
if not url:
return None
return get_intern_pattern(url)

View file

@ -74,7 +74,6 @@ class UnknownUrl (urlbase.UrlBase):
def local_check (self):
"""Only logs that this URL is unknown."""
self.set_extern(self.url)
if self.extern[0] and self.extern[1]:
self.add_info(_("Outside of domain filter, checked only syntax."))
elif self.ignored():

View file

@ -31,7 +31,7 @@ import select
from . import absolute_url, StoringHandler, get_url_from
from .. import (log, LOG_CHECK, LOG_CACHE, httputil, httplib2 as httplib,
strformat, LinkCheckerError, url as urlutil, trace, clamav, winutil, geoip,
fileutil)
fileutil, get_link_pat)
from ..HtmlParser import htmlsax
from ..htmlutil import linkparse
from ..network import iputil
@ -109,7 +109,7 @@ class UrlBase (object):
def __init__ (self, base_url, recursion_level, aggregate,
parent_url=None, base_ref=None, line=-1, column=-1,
name=u"", url_encoding=None):
name=u"", url_encoding=None, extern=None):
"""
Initialize check data, and store given variables.
@ -122,14 +122,18 @@ class UrlBase (object):
@param column: column number of url in parent content
@param name: name of url or empty
@param url_encoding: encoding of URL or None
@param extern: None or (is_extern, is_strict)
"""
self.reset()
self.init(base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name, url_encoding)
aggregate, line, column, name, url_encoding, extern)
self.check_syntax()
if recursion_level == 0:
self.add_intern_pattern()
self.set_extern(self.url)
def init (self, base_ref, base_url, parent_url, recursion_level,
aggregate, line, column, name, url_encoding):
aggregate, line, column, name, url_encoding, extern):
"""
Initialize internal data.
"""
@ -143,6 +147,7 @@ class UrlBase (object):
self.name = name
self.encoding = url_encoding
self.charset = None
self.extern = extern
if self.base_ref:
assert not urlutil.url_needs_quoting(self.base_ref), \
"unquoted base reference URL %r" % self.base_ref
@ -199,8 +204,8 @@ class UrlBase (object):
# cache keys, are set by build_url() calling set_cache_keys()
self.cache_url_key = None
self.cache_content_key = None
# extern flags (is_extern, is_strict), both enabled as default
self.extern = (1, 1)
# extern flags (is_extern, is_strict)
self.extern = None
# flag if the result should be cached
self.caching = True
# title is either the URL or parsed from content
@ -399,7 +404,6 @@ class UrlBase (object):
self.set_result(unicode_safe(msg), valid=False)
else:
self.set_cache_keys()
self.set_extern(self.url)
def check_url_warnings(self):
"""Check URL name and length."""
@ -704,6 +708,11 @@ class UrlBase (object):
@return: None
"""
if self.extern:
return
if not url:
self.extern = (1, 1)
return
for entry in self.aggregate.config["externlinks"]:
match = entry['pattern'].search(url)
if (entry['negate'] and not match) or \
@ -1133,15 +1142,28 @@ class UrlBase (object):
u"anchor=%r" % self.anchor,
])
def get_intern_pattern (self):
"""
Get pattern for intern URL matching.
def get_intern_pattern (self, url=None):
"""Get pattern for intern URL matching.
@param url: the URL to set intern pattern for, else self.url
@ptype url: unicode or None
@return non-empty regex pattern or None
@rtype String or None
"""
return None
def add_intern_pattern(self, url=None):
"""Add intern URL regex to config."""
try:
pat = self.get_intern_pattern(url=url)
if pat:
log.debug(LOG_CHECK, "Add intern pattern %r", pat)
self.aggregate.config['internlinks'].append(get_link_pat(pat))
except UnicodeError, msg:
res = _("URL has unparsable domain name: %(domain)s") % \
{"domain": msg}
self.set_result(res, valid=False)
def __str__ (self):
"""
Get URL info.

View file

@ -19,8 +19,7 @@ Utility functions suitable for command line clients.
"""
import sys
import optparse
from . import fileutil, ansicolor, strformat, add_intern_pattern, checker, \
log, LOG_CMDLINE
from . import fileutil, ansicolor, strformat, checker
from .director import console
from .decorators import notimplemented
@ -113,7 +112,7 @@ class LCOptionParser (optparse.OptionParser, object):
pass
def aggregate_url (aggregate, config, url, err_exit_code=2):
def aggregate_url (aggregate, url, err_exit_code=2):
"""Append given commandline URL to input queue."""
get_url_from = checker.get_url_from
if url.lower().startswith("www."):
@ -122,12 +121,5 @@ def aggregate_url (aggregate, config, url, err_exit_code=2):
elif url.lower().startswith("ftp."):
# syntactic sugar
url = "ftp://%s" % url
url_data = get_url_from(url, 0, aggregate)
try:
add_intern_pattern(url_data, config)
except UnicodeError:
log.error(LOG_CMDLINE,
_("URL has unparsable domain name: %(domain)s") %
{"domain": sys.exc_info()[1]})
sys.exit(err_exit_code)
url_data = get_url_from(url, 0, aggregate, extern=(0, 0))
aggregate.urlqueue.put(url_data)

View file

@ -36,8 +36,8 @@ from .urlsave import urlsave
from .settings import Settings
from .recentdocs import RecentDocumentModel
from .projects import openproject, saveproject, loadproject, ProjectExt
from .. import configuration, checker, director, add_intern_pattern, \
strformat, fileutil, LinkCheckerError, get_link_pat, memoryutil
from .. import configuration, checker, director, get_link_pat, \
strformat, fileutil, LinkCheckerError, memoryutil
from ..containers import enum
from .. import url as urlutil
from ..checker import httpheaders
@ -458,14 +458,15 @@ Version 2 or later.
self.set_statusmsg(_("Error, empty URL"))
return
self.set_statusmsg(_("Checking '%s'.") % strformat.limit(url, 40))
url_data = checker.get_url_from(url, 0, aggregate)
try:
self.backup_config('internlinks')
add_intern_pattern(url_data, self.config)
except UnicodeError:
self.set_statusmsg(_("Error, invalid URL `%s'.") %
strformat.limit(url, 40))
return
url_data = checker.get_url_from(url, 0, aggregate, extern=(0, 0))
# XXX
#try:
# self.backup_config('internlinks')
# add_intern_pattern(url_data, self.config)
#except UnicodeError:
# self.set_statusmsg(_("Error, invalid URL `%s'.") %
# strformat.limit(url, 40))
# return
self.recent.add_document(url)
aggregate.urlqueue.put(url_data)
self.aggregate = aggregate

View file

@ -25,8 +25,8 @@ import locale
import re
import time
import urlparse
from . import configuration, strformat, checker, director, \
add_intern_pattern, get_link_pat, init_i18n, url as urlutil
from . import configuration, strformat, checker, director, get_link_pat, \
init_i18n, url as urlutil
from .decorators import synchronized
# 5 minutes timeout for requests
@ -139,14 +139,7 @@ def checklink (form=None, env=os.environ):
config = get_configuration(form, out)
url = strformat.stripurl(formvalue(form, "url"))
aggregate = director.get_aggregate(config)
url_data = checker.get_url_from(url, 0, aggregate)
try:
add_intern_pattern(url_data, config)
except UnicodeError, errmsg:
log(env, errmsg)
msg = _("URL has unparsable domain name: %s") % errmsg
yield encode(format_error(msg))
return
url_data = checker.get_url_from(url, 0, aggregate, extern=(0, 0))
aggregate.urlqueue.put(url_data)
for html_str in start_check(aggregate, out):
yield encode(html_str)

View file

@ -454,12 +454,16 @@ if has_optcomplete:
def read_stdin_urls ():
    """Read list of URLs, separated by white-space, from stdin."""
    count = 0
    # Pull stdin in ~8KB batches so arbitrarily long input streams
    # never have to be held in memory at once.
    lines = sys.stdin.readlines(8 * 1024)
    while lines:
        for line in lines:
            for url in line.split():
                count += 1
                # Progress note every 10000 URLs on huge inputs.
                if count % 10000 == 0:
                    log.info(LOG_CMDLINE, "Read %d URLs from stdin", count)
                yield url
        lines = sys.stdin.readlines(8 * 1024)
@ -662,10 +666,10 @@ if options.trace:
# add urls to queue
if options.stdin:
for url in read_stdin_urls():
aggregate_url(aggregate, config, url)
aggregate_url(aggregate, url)
elif args:
for url in args:
aggregate_url(aggregate, config, strformat.stripurl(url))
aggregate_url(aggregate, strformat.stripurl(url))
else:
log.warn(LOG_CMDLINE, _("no files or URLs given"))
# set up profiling

View file

@ -163,7 +163,7 @@ aggregate = get_aggregate(config)
# add urls to queue
if args:
for url in args:
aggregate_url(aggregate, config, strformat.stripurl(url), err_exit_code=3)
aggregate_url(aggregate, strformat.stripurl(url), err_exit_code=3)
else:
log.warn(LOG_CMDLINE, _("no files or URLs given"))
sys.exit(3)

View file

@ -184,8 +184,7 @@ class LinkCheckTest (unittest.TestCase):
confargs = {}
logargs = {'expected': self.get_resultlines(filename)}
aggregate = get_test_aggregate(confargs, logargs)
url_data = get_url_from(url, 0, aggregate)
linkcheck.add_intern_pattern(url_data, aggregate.config)
url_data = get_url_from(url, 0, aggregate, extern=(0, 0))
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
diff = aggregate.config['logger'].diff
@ -213,7 +212,6 @@ class LinkCheckTest (unittest.TestCase):
# initial URL has recursion level zero
url_reclevel = 0
url_data = get_url_from(url, url_reclevel, aggregate)
linkcheck.add_intern_pattern(url_data, aggregate.config)
aggregate.urlqueue.put(url_data)
linkcheck.director.check_urls(aggregate)
diff = aggregate.config['logger'].diff