Increase checked cache in URL queue.

This commit is contained in:
Bastian Kleineidam 2012-09-02 22:21:49 +02:00
parent 4c16d3e702
commit 7a6436f08f
9 changed files with 87 additions and 58 deletions

View file

@ -7,6 +7,7 @@ Features:
Closes: SF bug #3538365
- checking: Support WML sites.
Closes: SF bug #3553175
- checking: Show number of parsed URLs in page content.
- cmdline: Added Nagios plugin script.
Changes:

View file

@ -84,8 +84,8 @@ def parse_bookmark_file (file):
def parse_bookmark_json (data):
"""Parse complete JSON data for Chromium Bookmarks."""
for entry in data["roots"].values():
for entry in parse_bookmark_node(entry):
yield entry
for url, name in parse_bookmark_node(entry):
yield url, name
def parse_bookmark_node (node):

View file

@ -54,7 +54,7 @@ class UrlQueue (object):
self.unfinished_tasks = 0
self.finished_tasks = 0
self.in_progress = {}
self.checked = LFUCache(size=10000)
self.checked = LFUCache(size=100000)
self.shutdown = False
self.unsorted = 0

View file

@ -262,6 +262,7 @@ class FileUrl (urlbase.UrlBase):
mime = self.get_content_type()
key = self.ContentMimetypes[mime]
getattr(self, "parse_"+key)()
self.add_num_url_info()
def parse_firefox (self):
"""Parse a Firefox3 bookmark file."""

View file

@ -198,6 +198,7 @@ class FtpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
return
key = self.ContentMimetypes[self.get_content_type(self.get_content)]
getattr(self, "parse_"+key)()
self.add_num_url_info()
def get_content_type (self, read=None):
"""Return URL content type, or an empty string if content

View file

@ -806,6 +806,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
self.parse_word()
elif ctype == "text/vnd.wap.wml":
self.parse_wml()
self.add_num_url_info()
def get_robots_txt_url (self):
"""

View file

@ -205,6 +205,8 @@ class UrlBase (object):
self.do_check_content = True
# MIME content type
self.content_type = None
# number of URLs in page content
self.num_urls = 0
def set_result (self, msg, valid=True, overwrite=False):
"""
@ -941,6 +943,7 @@ class UrlBase (object):
Default parse type is html.
"""
self.parse_html()
self.add_num_url_info()
def get_user_password (self):
"""Get tuple (user, password) from configured authentication.
@ -960,6 +963,7 @@ class UrlBase (object):
def add_url (self, url, line=0, column=0, name=u"", base=None):
"""Queue URL data for checking."""
self.num_urls += 1
if base:
base_ref = urlutil.url_norm(base)[0]
else:
@ -971,6 +975,13 @@ class UrlBase (object):
# Only queue URLs which have a result or are not strict extern.
self.aggregate.urlqueue.put(url_data)
def add_num_url_info(self):
"""Add number of URLs parsed to info.

Reads self.num_urls and, only when it is greater than zero, passes a
formatted message containing that count to self.add_info(). Nothing
is added when the count is zero.
"""
if self.num_urls > 0:
attrs = {"num": self.num_urls}
# _n() receives the singular template, the plural template and the
# count; presumably it is the plural-aware gettext helper that picks
# the (translated) template for that count -- defined outside this view.
msg = _n("%(num)d URL parsed.", "%(num)d URLs parsed.", self.num_urls)
self.add_info(msg % attrs)
def parse_opera (self):
"""Parse an opera bookmark file."""
log.debug(LOG_CHECK, "Parsing Opera bookmarks %s", self)

View file

@ -5,8 +5,8 @@ msgid ""
msgstr ""
"Project-Id-Version: $Id$\n"
"Report-Msgid-Bugs-To: calvin@users.sourceforge.net\n"
"POT-Creation-Date: 2012-08-26 10:48+0200\n"
"PO-Revision-Date: 2012-08-26 10:48+0100\n"
"POT-Creation-Date: 2012-09-02 19:55+0200\n"
"PO-Revision-Date: 2012-09-02 19:56+0100\n"
"Last-Translator: Bastian Kleineidam <calvin@users.sourceforge.net>\n"
"Language-Team: de <de@li.org>\n"
"Language: \n"
@ -744,122 +744,129 @@ msgstr "URL besitzt einen nicht analysierbaren Rechnernamen: %(name)s"
msgid "Leading or trailing whitespace in URL `%(url)s'."
msgstr "Die URL %(url)s enthält Leerzeichen am Anfang oder Ende."
#: ../linkcheck/checker/urlbase.py:379
#: ../linkcheck/checker/urlbase.py:381
msgid "URL is missing"
msgstr "URL fehlt"
#: ../linkcheck/checker/urlbase.py:382
#: ../linkcheck/checker/urlbase.py:384
msgid "URL is empty"
msgstr "URL ist leer"
#: ../linkcheck/checker/urlbase.py:389
#: ../linkcheck/checker/urlbase.py:391
#, python-format
msgid "Effective URL %(url)r."
msgstr "Effektive URL %(url)r."
#: ../linkcheck/checker/urlbase.py:448
#: ../linkcheck/checker/urlbase.py:450
#, python-format
msgid "URL has invalid port %(port)r"
msgstr "URL hat eine ungültige Portnummer %(port)r"
#: ../linkcheck/checker/urlbase.py:453
#: ../linkcheck/checker/urlbase.py:455
msgid "URL has empty hostname"
msgstr "URL hat leeren Rechnernamen"
#: ../linkcheck/checker/urlbase.py:464
#: ../linkcheck/checker/urlbase.py:466
#, python-format
msgid "URL %(url)s has obfuscated IP address %(ip)s"
msgstr "URL %(url)s besitzt die verschleierte IP-Adresse %(ip)s"
#: ../linkcheck/checker/urlbase.py:491
#: ../linkcheck/checker/urlbase.py:493
#, python-format
msgid "URL is located in %(country)s."
msgstr "URL befindet sich in %(country)s."
#: ../linkcheck/checker/urlbase.py:516
#: ../linkcheck/checker/urlbase.py:518
msgid "Hostname not found"
msgstr "Rechnername nicht gefunden"
#: ../linkcheck/checker/urlbase.py:519
#: ../linkcheck/checker/urlbase.py:521
#, python-format
msgid "Bad HTTP response %(line)r"
msgstr "Ungültige HTTP Antwort %(line)r"
#: ../linkcheck/checker/urlbase.py:532
#: ../linkcheck/checker/urlbase.py:534
#, python-format
msgid "could not get content: %(msg)r"
msgstr "konnte Inhalt nicht lesen: %(msg)r"
#: ../linkcheck/checker/urlbase.py:673
#: ../linkcheck/checker/urlbase.py:675
#, python-format
msgid "Anchor `%(name)s' not found."
msgstr "Anker `%(name)s' nicht gefunden."
#: ../linkcheck/checker/urlbase.py:674
#: ../linkcheck/checker/urlbase.py:676
#, python-format
msgid "Available anchors: %(anchors)s."
msgstr "Verfügbare Anker: %(anchors)s."
#: ../linkcheck/checker/urlbase.py:728
#: ../linkcheck/checker/urlbase.py:730
#: ../linkcheck/checker/fileurl.py:193
#: ../linkcheck/checker/httpurl.py:685
msgid "File size too large"
msgstr "Dateigröße ist zu groß"
#: ../linkcheck/checker/urlbase.py:770
#: ../linkcheck/checker/urlbase.py:772
#, python-format
msgid "Found %(match)r at line %(line)d in link contents."
msgstr "Habe %(match)r in Zeile %(line)d im Inhalt der Verknüpfung gefunden."
#: ../linkcheck/checker/urlbase.py:786
#: ../linkcheck/checker/urlbase.py:788
msgid "Content size is zero."
msgstr "Größe des Inhalts ist Null."
#: ../linkcheck/checker/urlbase.py:792
#: ../linkcheck/checker/urlbase.py:794
#, python-format
msgid "Content size %(dlsize)s is larger than %(maxbytes)s."
msgstr "Inhalt %(dlsize)s ist größer als %(maxbytes)s."
#: ../linkcheck/checker/urlbase.py:797
#: ../linkcheck/checker/urlbase.py:799
#, python-format
msgid "Download size (%(dlsize)d Byte) does not equal content size (%(size)d Byte)."
msgstr "Download Größe (%(dlsize)d Byte) ist ungleich der Inhaltsgröße (%(size)d Byte)."
#: ../linkcheck/checker/urlbase.py:818
#: ../linkcheck/checker/urlbase.py:883
#: ../linkcheck/checker/urlbase.py:820
#: ../linkcheck/checker/urlbase.py:885
msgid "valid HTML syntax"
msgstr "gültige HTML Syntax"
#: ../linkcheck/checker/urlbase.py:824
#: ../linkcheck/checker/urlbase.py:826
#, python-format
msgid "tidy HTML parsing caused error: %(msg)s "
msgstr "tidy HTML Parser verursachte Fehler: %(msg)s"
#: ../linkcheck/checker/urlbase.py:846
#: ../linkcheck/checker/urlbase.py:919
#: ../linkcheck/checker/urlbase.py:848
#: ../linkcheck/checker/urlbase.py:921
msgid "valid CSS syntax"
msgstr "gültige CSS Syntax"
#: ../linkcheck/checker/urlbase.py:852
#: ../linkcheck/checker/urlbase.py:854
#, python-format
msgid "cssutils parsing caused error: %(msg)s"
msgstr "cssutils Parser verursachte Fehler: %(msg)s"
#: ../linkcheck/checker/urlbase.py:861
#: ../linkcheck/checker/urlbase.py:863
#, python-format
msgid "%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s"
msgstr "%(w3type)s Validierungsfehler in Zeile %(line)s Spalte %(column)s: %(msg)s"
#: ../linkcheck/checker/urlbase.py:892
#: ../linkcheck/checker/urlbase.py:894
#, python-format
msgid "HTML W3C validation caused error: %(msg)s "
msgstr "HTML W3C Validierung verursachte Fehler: %(msg)s"
#: ../linkcheck/checker/urlbase.py:927
#: ../linkcheck/checker/urlbase.py:929
#, python-format
msgid "CSS W3C validation caused error: %(msg)s "
msgstr "CSS W3C Validierung verursachte Fehler: %(msg)s"
#: ../linkcheck/checker/urlbase.py:982
#, python-format
msgid "%(num)d URL parsed."
msgid_plural "%(num)d URLs parsed."
msgstr[0] "%(num)d URL geparst."
msgstr[1] "%(num)d URLs geparst."
#: ../linkcheck/checker/proxysupport.py:43
#, python-format
msgid "Proxy value `%(proxy)s' must start with 'http:' or 'https:'."
@ -925,7 +932,7 @@ msgstr "Keine Antwort vom FTP Server"
msgid "Missing trailing directory slash in ftp url."
msgstr "Fehlender / am Ende der FTP url."
#: ../linkcheck/checker/ftpurl.py:224
#: ../linkcheck/checker/ftpurl.py:225
msgid "FTP file size too large"
msgstr "FTP Dateigröße ist zu groß"

View file

@ -8,7 +8,7 @@ msgid ""
msgstr ""
"Project-Id-Version: PACKAGE VERSION\n"
"Report-Msgid-Bugs-To: calvin@users.sourceforge.net\n"
"POT-Creation-Date: 2012-08-26 10:48+0200\n"
"POT-Creation-Date: 2012-09-02 19:55+0200\n"
"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n"
"Last-Translator: FULL NAME <EMAIL@ADDRESS>\n"
"Language-Team: LANGUAGE <LL@li.org>\n"
@ -713,120 +713,127 @@ msgstr ""
msgid "Leading or trailing whitespace in URL `%(url)s'."
msgstr ""
#: ../linkcheck/checker/urlbase.py:379
#: ../linkcheck/checker/urlbase.py:381
msgid "URL is missing"
msgstr ""
#: ../linkcheck/checker/urlbase.py:382
#: ../linkcheck/checker/urlbase.py:384
msgid "URL is empty"
msgstr ""
#: ../linkcheck/checker/urlbase.py:389
#: ../linkcheck/checker/urlbase.py:391
#, python-format
msgid "Effective URL %(url)r."
msgstr ""
#: ../linkcheck/checker/urlbase.py:448
#: ../linkcheck/checker/urlbase.py:450
#, python-format
msgid "URL has invalid port %(port)r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:453
#: ../linkcheck/checker/urlbase.py:455
msgid "URL has empty hostname"
msgstr ""
#: ../linkcheck/checker/urlbase.py:464
#: ../linkcheck/checker/urlbase.py:466
#, python-format
msgid "URL %(url)s has obfuscated IP address %(ip)s"
msgstr ""
#: ../linkcheck/checker/urlbase.py:491
#: ../linkcheck/checker/urlbase.py:493
#, python-format
msgid "URL is located in %(country)s."
msgstr ""
#: ../linkcheck/checker/urlbase.py:516
#: ../linkcheck/checker/urlbase.py:518
msgid "Hostname not found"
msgstr ""
#: ../linkcheck/checker/urlbase.py:519
#: ../linkcheck/checker/urlbase.py:521
#, python-format
msgid "Bad HTTP response %(line)r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:532
#: ../linkcheck/checker/urlbase.py:534
#, python-format
msgid "could not get content: %(msg)r"
msgstr ""
#: ../linkcheck/checker/urlbase.py:673
#: ../linkcheck/checker/urlbase.py:675
#, python-format
msgid "Anchor `%(name)s' not found."
msgstr ""
#: ../linkcheck/checker/urlbase.py:674
#: ../linkcheck/checker/urlbase.py:676
#, python-format
msgid "Available anchors: %(anchors)s."
msgstr ""
#: ../linkcheck/checker/urlbase.py:728 ../linkcheck/checker/fileurl.py:193
#: ../linkcheck/checker/urlbase.py:730 ../linkcheck/checker/fileurl.py:193
#: ../linkcheck/checker/httpurl.py:685
msgid "File size too large"
msgstr ""
#: ../linkcheck/checker/urlbase.py:770
#: ../linkcheck/checker/urlbase.py:772
#, python-format
msgid "Found %(match)r at line %(line)d in link contents."
msgstr ""
#: ../linkcheck/checker/urlbase.py:786
#: ../linkcheck/checker/urlbase.py:788
msgid "Content size is zero."
msgstr ""
#: ../linkcheck/checker/urlbase.py:792
#: ../linkcheck/checker/urlbase.py:794
#, python-format
msgid "Content size %(dlsize)s is larger than %(maxbytes)s."
msgstr ""
#: ../linkcheck/checker/urlbase.py:797
#: ../linkcheck/checker/urlbase.py:799
#, python-format
msgid ""
"Download size (%(dlsize)d Byte) does not equal content size (%(size)d Byte)."
msgstr ""
#: ../linkcheck/checker/urlbase.py:818 ../linkcheck/checker/urlbase.py:883
#: ../linkcheck/checker/urlbase.py:820 ../linkcheck/checker/urlbase.py:885
msgid "valid HTML syntax"
msgstr ""
#: ../linkcheck/checker/urlbase.py:824
#: ../linkcheck/checker/urlbase.py:826
#, python-format
msgid "tidy HTML parsing caused error: %(msg)s "
msgstr ""
#: ../linkcheck/checker/urlbase.py:846 ../linkcheck/checker/urlbase.py:919
#: ../linkcheck/checker/urlbase.py:848 ../linkcheck/checker/urlbase.py:921
msgid "valid CSS syntax"
msgstr ""
#: ../linkcheck/checker/urlbase.py:852
#: ../linkcheck/checker/urlbase.py:854
#, python-format
msgid "cssutils parsing caused error: %(msg)s"
msgstr ""
#: ../linkcheck/checker/urlbase.py:861
#: ../linkcheck/checker/urlbase.py:863
#, python-format
msgid "%(w3type)s validation error at line %(line)s col %(column)s: %(msg)s"
msgstr ""
#: ../linkcheck/checker/urlbase.py:892
#: ../linkcheck/checker/urlbase.py:894
#, python-format
msgid "HTML W3C validation caused error: %(msg)s "
msgstr ""
#: ../linkcheck/checker/urlbase.py:927
#: ../linkcheck/checker/urlbase.py:929
#, python-format
msgid "CSS W3C validation caused error: %(msg)s "
msgstr ""
#: ../linkcheck/checker/urlbase.py:982
#, python-format
msgid "%(num)d URL parsed."
msgid_plural "%(num)d URLs parsed."
msgstr[0] ""
msgstr[1] ""
#: ../linkcheck/checker/proxysupport.py:43
#, python-format
msgid "Proxy value `%(proxy)s' must start with 'http:' or 'https:'."
@ -892,7 +899,7 @@ msgstr ""
msgid "Missing trailing directory slash in ftp url."
msgstr ""
#: ../linkcheck/checker/ftpurl.py:224
#: ../linkcheck/checker/ftpurl.py:225
msgid "FTP file size too large"
msgstr ""