Merge branch 'master' into py3

This commit is contained in:
Bastian Kleineidam 2014-09-12 17:36:46 +02:00
commit d89217efaa
40 changed files with 4802 additions and 4167 deletions

1
.gitignore vendored
View file

@ -29,7 +29,6 @@ Changelog.linkchecker*
/doc/html/*.qch
/.achievements
/doc/*.mo
/po/*.mo
/LinkChecker-*-portable.zip
/LinkChecker-*.exe
/LinkChecker.egg-info

View file

@ -1,4 +1,4 @@
include README.txt COPYING MANIFEST.in
include README.rst COPYING MANIFEST.in
include config/linkchecker-completion config/create.sql
include config/linkcheckerrc
include config/linkchecker.apache2.conf install-rpm.sh
@ -17,7 +17,7 @@ include linkcheck/gui/rc/Makefile
include linkcheck/gui/rc/*.png
include linkcheck/gui/rc/*.qrc
include linkcheck/gui/ui/*.ui
include po/*.po po/*.pot po/Makefile po/msgfmt.py
include po/*.po po/*.mo po/*.pot po/Makefile
include doc/*.example doc/*.txt
include doc/html/*.ico
include doc/html/*.html

View file

@ -75,7 +75,6 @@ all:
clean:
-$(PYTHON) setup.py clean --all
rm -f $(LAPPNAME)-out.* *-stamp*
$(MAKE) -C po clean
$(MAKE) -C doc/html clean
$(MAKE) -C linkcheck/HtmlParser clean
rm -f linkcheck/network/_network*.so
@ -94,7 +93,7 @@ MANIFEST: MANIFEST.in setup.py
$(PYTHON) setup.py sdist --manifest-only
locale:
$(MAKE) -C po mofiles
$(MAKE) -C po
# to build in the current directory
localbuild: MANIFEST locale

View file

@ -1,4 +1,23 @@
9.3 "" (released xx.xx.2014)
9.4 "" (released xx.xx.xxxx)
Features:
- checking: Support itms-services: URLs.
Closes: GH bug #532
Changes:
- installation: Remove dependency on msgfmt.py by pre-generating the
*.mo files and adding them to version control.
The reason was the difficulty of running msgfmt.py under both Python 2 and 3.
- checking: When checking SSL certificates under POSIX systems try
to use the system certificate store.
Fixes:
- checking: Correct typos in the proxy handling code.
Closes: GH bug #536
- cmdline: Reactivate paging of help pages.
9.3 "Better Living Through Chemistry" (released 16.7.2014)
Features:
- checking: Parse and check links in PDF files.
@ -12,6 +31,7 @@ Changes:
import needed third party modules.
- checking: Treat empty URLs the same as the parent URL.
Closes: GH bug #524
- installation: Replaced the twill dependency with local code.
Fixes:
- checking: Catch XML parse errors in sitemap XML files and print them
@ -28,6 +48,8 @@ Fixes:
Closes: GH bug #519
- checking: Use user-supplied authentication and proxies when requesting
robots.txt.
- plugins: Fix Word file check plugin.
Closes: GH bug #530
9.2 "Rick and Morty" (released 23.4.2014)

View file

@ -39,9 +39,6 @@ installation is recommended.
- *Optional, for displaying country codes:*
Pygeoip from http://code.google.com/p/pygeoip/
- *Optional, used for login form submission:*
Twill from http://twill.idyll.org/
Setup for Unix/Linux
--------------------

View file

@ -73,15 +73,12 @@ First, install the required software.
7. *Optional, used for Virus checking:*
ClamAv from http://www.clamav.net/
8. *Optional, used for login form submission:*
Twill from http://twill.idyll.org/
9. *Optional, for GNOME proxy setting parsing:*
8. *Optional, for GNOME proxy setting parsing:*
Python Gtk from http://www.pygtk.org/downloads.html
10. *Optional, to run the WSGI web interface:*
Apache from http://httpd.apache.org/
mod_wsgi from http://code.google.com/p/modwsgi/
9. *Optional, to run the WSGI web interface:*
Apache from http://httpd.apache.org/
mod_wsgi from http://code.google.com/p/modwsgi/
Now install the application.

View file

@ -1,8 +1,8 @@
Date: 15.7.2014
Porting status of dependent Python packages
============================================
Date: 3.3.2014
OK Python
OK requests
OK Qt/PyQt
@ -11,4 +11,8 @@ OK argcomplete from https://pypi.python.org/pypi/argcomplete
OK dnspython (as dnspython3)
OK pygeoip from https://pypi.python.org/pypi/pygeoip/
OK Port Python Gtk stuff to PyGObject https://live.gnome.org/PyGObject/IntrospectionPorting
TODO(optional) Twill from http://twill.idyll.org/
Overall Porting status
=======================
NOT STARTED

View file

@ -1,4 +1,4 @@
version: "9.2"
version: "9.3"
name: "LinkChecker"
lname: "linkchecker"
maintainer: "Bastian Kleineidam"

View file

@ -19,6 +19,11 @@ policy by the webmaster running the website you are checking. Look in
the ``/robots.txt`` file which follows the
[robots.txt exclusion standard](http://www.robotstxt.org/robotstxt.html).
For identification LinkChecker adds to each request a User-Agent header
like this:
Mozilla/5.0 (compatible; LinkChecker/9.3; +http://wummel.github.io/linkchecker/)
If you yourself are the webmaster, consider allowing LinkChecker to
check your web pages by adding the following to your robots.txt file:

View file

@ -6,7 +6,9 @@ Introduction
LinkChecker is a free, [GPL](http://www.gnu.org/licenses/gpl-2.0.html)
licensed website validator.
LinkChecker checks links in web documents or full websites.
It runs on systems with Python 2.7.2 or later.
It runs on Python 2 systems, requiring Python 2.7.2 or later.
Python 3 is not (yet) supported.
Features
---------

View file

@ -17,7 +17,7 @@
"""
Cache robots.txt contents.
"""
from .. import robotparser2, configuration
from .. import robotparser2
from ..containers import LFUCache
from ..decorators import synchronized
from ..lock import get_lock
@ -33,14 +33,14 @@ class RobotsTxt (object):
Thread-safe cache of downloaded robots.txt files.
format: {cache key (string) -> robots.txt content (RobotFileParser)}
"""
useragent = str(configuration.UserAgent)
def __init__ (self):
def __init__ (self, useragent):
"""Initialize per-URL robots.txt cache."""
# mapping {URL -> parsed robots.txt}
self.cache = LFUCache(size=100)
self.hits = self.misses = 0
self.roboturl_locks = {}
self.useragent = useragent
def allows_url (self, url_data):
"""Ask robots.txt allowance."""
@ -59,7 +59,7 @@ class RobotsTxt (object):
self.misses += 1
kwargs = dict(auth=url_data.auth, session=url_data.session)
if url_data.proxy:
kwargs["proxies"] = {url_data.proxy_type, url_data.proxy}
kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
rp = robotparser2.RobotFileParser(**kwargs)
rp.set_url(roboturl)
rp.read()

View file

@ -143,6 +143,8 @@ def get_urlclass_from (scheme, assume_local_file=False):
klass = nntpurl.NntpUrl
elif scheme == "dns":
klass = dnsurl.DnsUrl
elif scheme == "itms-services":
klass = itmsservicesurl.ItmsServicesUrl
elif scheme and unknownurl.is_unknown_scheme(scheme):
klass = unknownurl.UnknownUrl
elif assume_local_file:
@ -174,4 +176,4 @@ def get_index_html (urls):
# all the URL classes
from . import (fileurl, unknownurl, ftpurl, httpurl, dnsurl,
mailtourl, telneturl, nntpurl, ignoreurl)
mailtourl, telneturl, nntpurl, ignoreurl, itmsservicesurl)

View file

@ -47,6 +47,7 @@ ExcCacheList = [
EOFError,
# http errors
requests.exceptions.RequestException,
requests.packages.urllib3.exceptions.HTTPError,
# ftp errors
ftplib.error_reply,
ftplib.error_temp,

View file

@ -19,6 +19,13 @@ Handle http links.
"""
import requests
# The validity of SSL certs is ignored to be able
# to check the URL and recurse into it.
# The warning about invalid SSL certs is given to the
# user instead.
import warnings
warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
from cStringIO import StringIO
from .. import (log, LOG_CHECK, strformat, mimeutil,

View file

@ -0,0 +1,45 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Handle itms-services URLs.
"""
from . import urlbase
from .. import log, LOG_CHECK
class ItmsServicesUrl(urlbase.UrlBase):
    """Apple iOS application download URLs (itms-services: scheme)."""

    def check_syntax(self):
        """Run the base class syntax check, then require a "url" parameter.

        The URL is marked invalid when its query part does not contain
        the text ``url=``.
        """
        super(ItmsServicesUrl, self).check_syntax()
        # urlparts[3] is used as the query part of the URL
        if u"url=" not in self.urlparts[3]:
            self.set_result(_("Missing required url parameter"), valid=False)

    def local_check(self):
        """Only log the URL; nothing else is checked here."""
        log.debug(LOG_CHECK, "Checking %s", unicode(self))

    def check_content(self):
        """Allow recursion to check the url CGI param."""
        return True

    def is_parseable(self):
        """This URL is parseable."""
        return True

View file

@ -18,6 +18,7 @@
Mixin class for URLs that can be fetched over a proxy.
"""
import urllib
import urlparse
import os
from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil, httputil
@ -35,29 +36,30 @@ class ProxySupport (object):
self.proxyauth = None
if not self.proxy:
return
self.proxytype, self.proxy = urllib.splittype(self.proxy)
proxyurl = urlparse.urlparse(self.proxy)
self.proxytype = proxyurl.scheme
if self.proxytype not in ('http', 'https'):
# Note that invalid proxies might raise TypeError in urllib2,
# so make sure to stop checking at this point, not later.
msg = _("Proxy value `%(proxy)s' must start with 'http:' or 'https:'.") \
% dict(proxy=proxy)
raise LinkCheckerError(msg)
self.proxy = urllib.splithost(self.proxy)[0]
self.proxyauth, self.proxy = urllib.splituser(self.proxy)
if self.ignore_proxy_host():
# log proxy without auth info
log.debug(LOG_CHECK, "ignoring proxy %r", self.proxy)
self.add_info(_("Ignoring proxy setting `%(proxy)s'.") %
dict(proxy=proxy))
self.proxy = self.proxyauth = None
self.proxy = None
return
log.debug(LOG_CHECK, "using proxy %r", self.proxy)
self.add_info(_("Using proxy `%(proxy)s'.") % dict(proxy=self.proxy))
if self.proxyauth is not None:
if ":" not in self.proxyauth:
self.proxyauth += ":"
self.proxyauth = httputil.encode_base64(self.proxyauth)
self.proxyauth = "Basic "+self.proxyauth
self.proxyhost = proxyurl.hostname
self.proxyport = proxyurl.port
if proxyurl.username is not None:
    username = proxyurl.username
    # Read the password from the parsed proxy URL object, the same
    # object the username comes from; a missing password becomes "".
    password = proxyurl.password if proxyurl.password is not None else ""
    auth = "%s:%s" % (username, password)
    # value for HTTP basic authentication against the proxy
    self.proxyauth = "Basic "+httputil.encode_base64(auth)
def ignore_proxy_host (self):
"""Check if self.host is in the $no_proxy ignore list."""
@ -79,7 +81,8 @@ class ProxySupport (object):
"""
if self.proxy:
scheme = self.proxytype
host, port = urlutil.splitport(self.proxy)
host = self.proxyhost
port = self.proxyport
else:
scheme = self.scheme
host = self.host

View file

@ -27,7 +27,7 @@ from .director import console
class LCArgumentParser(argparse.ArgumentParser):
"""Custom argument parser to format help text."""
def print_help(self, file=None):
def print_help(self, file=sys.stdout):
"""Print a help message to stdout."""
msg = console.encode(self.format_help())
if fileutil.is_tty(file):

View file

@ -63,7 +63,6 @@ Modules = (
("argcomplete", u"Argcomplete"),
("GeoIP", u"GeoIP"), # on Unix systems
("pygeoip", u"GeoIP"), # on Windows systems
("twill", u"Twill"),
("sqlite3", u"Sqlite"),
("gconf", u"Gconf"),
("meliae", u"Meliae"),
@ -117,6 +116,34 @@ def get_share_file (filename, devel_dir=None):
raise ValueError(msg)
def get_system_cert_file():
    """Locate a system-wide SSL certificate bundle.

    Only POSIX systems are searched, and only the single well-known path
    /etc/ssl/certs/ca-certificates.crt is tried.

    @return: the filename to the cert file
    @raises: ValueError when no system cert file could be found
    """
    candidate = "/etc/ssl/certs/ca-certificates.crt"
    if os.name == 'posix' and os.path.isfile(candidate):
        return candidate
    raise ValueError("no system certificate file found")
def get_certifi_file():
    """Return the certificate bundle shipped with the certifi package.

    @return: the filename to the cert file
    @rtype: string
    @raises: ImportError when certifi is not installed or ValueError when
      the file is not found
    """
    # imported lazily so a missing certifi only fails this lookup
    import certifi
    bundle = certifi.where()
    if not os.path.isfile(bundle):
        raise ValueError("%s not found; check your certifi installation" % bundle)
    return bundle
# dynamic options
class Configuration (dict):
"""
@ -219,7 +246,6 @@ class Configuration (dict):
filtered_cfiles.append(cfile)
log.debug(LOG_CHECK, "reading configuration from %s", filtered_cfiles)
confparse.LCConfigParser(self).read(filtered_cfiles)
self.sanitize()
def add_auth (self, user=None, password=None, pattern=None):
"""Add given authentication data."""
@ -317,12 +343,20 @@ class Configuration (dict):
self[plugin] = {}
def sanitize_ssl(self):
"""Use locally installed certificate file if available."""
"""Use local installed certificate file if available.
Tries to get system, then certifi, then the own
installed certificate file."""
if self["sslverify"] is True:
try:
self["sslverify"] = get_share_file('cacert.pem')
self["sslverify"] = get_system_cert_file()
except ValueError:
pass
try:
self["sslverify"] = get_certifi_file()
except (ValueError, ImportError):
try:
self["sslverify"] = get_share_file('cacert.pem')
except ValueError:
pass
def get_plugin_folders():

View file

@ -20,100 +20,18 @@ Management of checking a queue of links with several threads.
import os
import thread
import time
from .. import log, LOG_CHECK, LinkCheckerInterrupt, dummy, \
fileutil, strformat, plugins
from .. import log, LOG_CHECK, LinkCheckerInterrupt, plugins
from ..cache import urlqueue, robots_txt, results
from . import aggregator, console
def visit_loginurl (aggregate):
"""Check for a login URL and visit it."""
config = aggregate.config
url = config["loginurl"]
if not url:
return
if not fileutil.has_module("twill"):
msg = strformat.format_feature_warning(module=u'twill',
feature=u'login URL visit',
url=u'http://twill.idyll.org/')
log.warn(LOG_CHECK, msg)
return
from twill import commands as tc
log.debug(LOG_CHECK, u"Visiting login URL %s", url)
configure_twill(tc)
tc.go(url)
if tc.get_browser().get_code() != 200:
log.warn(LOG_CHECK, _("Error visiting login URL %(url)s.") % \
{"url": url})
return
submit_login_form(config, url, tc)
if tc.get_browser().get_code() != 200:
log.warn(LOG_CHECK, _("Error posting form at login URL %(url)s.") % \
{"url": url})
return
#XXX store_cookies(tc.get_browser().cj, aggregate.cookies, url)
resulturl = tc.get_browser().get_url()
log.debug(LOG_CHECK, u"URL after POST is %s" % resulturl)
# add result URL to check list
from ..checker import get_url_from
aggregate.urlqueue.put(get_url_from(resulturl, 0, aggregate))
def configure_twill (tc):
"""Configure twill to be used by LinkChecker.
Note that there is no need to set a proxy since twill uses the same
ones (provided from urllib) as LinkChecker does.
"""
# make sure readonly controls are writeable (might be needed)
tc.config("readonly_controls_writeable", True)
# disable page refreshing
tc.config("acknowledge_equiv_refresh", False)
# fake IE 6.0 to talk sense into some sites (eg. SourceForge)
tc.agent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)")
# tell twill to shut up
tc.OUT = dummy.Dummy()
from twill import browser
browser.OUT = dummy.Dummy()
# set debug level
if log.is_debug(LOG_CHECK):
tc.debug("http", 1)
def submit_login_form (config, url, tc):
"""Fill and submit login form."""
user, password = config.get_user_password(url)
cgiuser = config["loginuserfield"]
cgipassword = config["loginpasswordfield"]
formname = search_formname((cgiuser, cgipassword), tc)
tc.formvalue(formname, cgiuser, user)
tc.formvalue(formname, cgipassword, password)
for key, value in config["loginextrafields"].items():
tc.formvalue(formname, key, value)
tc.submit()
def search_formname (fieldnames, tc):
"""Search form that has all given CGI fieldnames."""
browser = tc.get_browser()
for formcounter, form in enumerate(browser.get_all_forms()):
for name in fieldnames:
try:
browser.get_form_field(form, name)
except tc.TwillException:
break
else:
return form.name or form.attrs.get('id') or formcounter
# none found
return None
def check_urls (aggregate):
"""Main check function; checks all configured URLs until interrupted
with Ctrl-C.
@return: None
"""
try:
visit_loginurl(aggregate)
aggregate.visit_loginurl()
except Exception as msg:
log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % \
dict(msg=msg))
@ -210,7 +128,7 @@ def abort_now ():
def get_aggregate (config):
"""Get an aggregator instance with given configuration."""
_urlqueue = urlqueue.UrlQueue(max_allowed_urls=config["maxnumurls"])
_robots_txt = robots_txt.RobotsTxt()
_robots_txt = robots_txt.RobotsTxt(config["useragent"])
plugin_manager = plugins.PluginManager(config)
result_cache = results.ResultCache()
return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager,

View file

@ -21,10 +21,12 @@ import threading
import thread
import requests
import time
import urlparse
import random
from .. import log, LOG_CHECK, strformat, cookies
from .. import log, LOG_CHECK, strformat, LinkCheckerError
from ..decorators import synchronized
from ..cache import urlqueue
from ..htmlutil import formsearch
from . import logger, status, checker, interrupt
@ -32,15 +34,15 @@ _threads_lock = threading.RLock()
_hosts_lock = threading.RLock()
_downloadedbytes_lock = threading.RLock()
def new_request_session(config):
def new_request_session(config, cookies):
"""Create a new request session."""
session = requests.Session()
if cookies:
session.cookies = cookies
session.max_redirects = config["maxhttpredirects"]
session.headers = {
"User-Agent": config["useragent"],
"DNT": "1",
}
# XXX proxies
if config["cookiefile"]:
for cookie in cookies.from_file(config["cookiefile"]):
session.cookies = requests.cookies.merge_cookies(session.cookies, cookie)
@ -62,11 +64,36 @@ class Aggregate (object):
self.plugin_manager = plugin_manager
self.result_cache = result_cache
self.times = {}
self.cookies = None
requests_per_second = config["maxrequestspersecond"]
self.wait_time_min = 1.0 / requests_per_second
self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
self.downloaded_bytes = 0
def visit_loginurl(self):
    """Log in through the configured login URL and keep the cookies.

    Does nothing when no "loginurl" is configured. Otherwise the login
    page is fetched, the form found for the configured user/password
    field names is filled (including any "loginextrafields") and posted.
    The cookies of the login session are stored in self.cookies.

    @raises: LinkCheckerError when no login form is found or when the
      login did not set any cookie
    """
    url = self.config["loginurl"]
    if not url:
        return
    user, password = self.config.get_user_password(url)
    session = requests.Session()
    # XXX user-agent header
    # XXX timeout
    response = session.get(url)
    cgiuser = self.config["loginuserfield"]
    cgipassword = self.config["loginpasswordfield"]
    form = formsearch.search_form(response.content, cgiuser, cgipassword,
          encoding=response.encoding)
    if form is None:
        # report a clear error instead of failing on form.data below
        raise LinkCheckerError("No login form found at login URL %s" % url)
    form.data[cgiuser] = user
    form.data[cgipassword] = password
    for key, value in self.config["loginextrafields"].items():
        form.data[key] = value
    # the form action may be relative to the login page
    formurl = urlparse.urljoin(url, form.url)
    response = session.post(formurl, data=form.data)
    self.cookies = session.cookies
    if len(self.cookies) == 0:
        raise LinkCheckerError("No cookies set by login URL %s" % url)
@synchronized(_threads_lock)
def start_threads (self):
"""Spawn threads for URL checking and status printing."""
@ -85,13 +112,13 @@ class Aggregate (object):
self.threads.append(t)
t.start()
else:
self.request_sessions[thread.get_ident()] = new_request_session(self.config)
self.request_sessions[thread.get_ident()] = new_request_session(self.config, self.cookies)
checker.check_urls(self.urlqueue, self.logger)
@synchronized(_threads_lock)
def add_request_session(self):
"""Add a request session for current thread."""
session = new_request_session(self.config)
session = new_request_session(self.config, self.cookies)
self.request_sessions[thread.get_ident()] = session
@synchronized(_threads_lock)

View file

@ -0,0 +1,113 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
HTML form utils
"""
from ..HtmlParser import htmlsax
from .. import log, LOG_CHECK
class Form(object):
    """Store HTML form URL and form data."""

    def __init__(self, url):
        """Set URL and empty form data.

        @param url: the form target URL (value of the action attribute)
        """
        self.url = url
        # mapping {input name -> input value}
        self.data = {}

    def add_value(self, key, value):
        """Add a form value; an existing value for the key is replaced."""
        self.data[key] = value

    def __repr__(self):
        """Return byte string representation displaying URL and form data.

        repr() has to return a str object under Python 2, so the UTF-8
        encoded form is used instead of the unicode one.
        """
        return str(self)

    def __unicode__(self):
        """Return unicode string displaying URL and form data."""
        return u"<url=%s data=%s>" % (self.url, self.data)

    def __str__(self):
        """Return UTF-8 encoded string displaying URL and form data."""
        return unicode(self).encode('utf-8')
class FormFinder(object):
    """HtmlParser handler that collects the forms of an HTML document.

    Every <form> element with an "action" attribute is stored as a Form
    object together with the name/value pairs of its <input> elements.
    Completed forms are appended to self.forms.
    """

    def __init__(self):
        """Initialize local variables."""
        super(FormFinder, self).__init__()
        # parser object will be initialized when it is used as
        # a handler object
        self.parser = None
        # all completed forms
        self.forms = []
        # the form currently being parsed, or None outside of a form
        self.form = None

    def start_element(self, tag, attrs):
        """Start a new form or add an input value to the current form.

        Forms without an "action" attribute are not tracked; inputs
        outside of a tracked form and inputs without a name are logged
        and ignored.
        """
        if tag == u'form':
            if u'action' in attrs:
                url = attrs['action']
                self.form = Form(url)
        elif tag == u'input':
            if self.form:
                if 'name' in attrs:
                    key = attrs['name']
                    value = attrs.get('value')
                    self.form.add_value(key, value)
                else:
                    log.warn(LOG_CHECK, "nameless form input %s" % attrs)
            else:
                log.warn(LOG_CHECK, "formless input %s" % attrs)

    def start_end_element(self, tag, attrs):
        """Delegate a combined start/end element (eg. <input .../>) to
        the start_element method. Ignore the end element part."""
        self.start_element(tag, attrs)

    def end_element(self, tag):
        """Store the current form when its closing tag is reached."""
        if tag == u'form':
            # self.form is None for forms without an action attribute;
            # do not store None since users iterate over form.data
            if self.form is not None:
                self.forms.append(self.form)
            self.form = None
def search_form(content, cgiuser, cgipassword, encoding='utf-8'):
    """Search the given HTML content for a form with a login field.

    The first form having an input whose name equals (ignoring case)
    the user field name or the password field name is returned.

    @param content: the HTML content to parse
    @param cgiuser: name of the user CGI field
    @param cgipassword: name of the password CGI field
    @param encoding: encoding set on the parser before feeding content
    @return: the found Form object, or None if no form matches
    """
    handler = FormFinder()
    parser = htmlsax.parser(handler)
    handler.parser = parser
    parser.encoding = encoding
    # parse
    parser.feed(content)
    parser.flush()
    # break cyclic dependencies
    handler.parser = None
    parser.handler = None
    log.debug(LOG_CHECK, "Found forms %s", handler.forms)
    cginames = (cgiuser.lower(), cgipassword.lower())
    for form in handler.forms:
        # the form list may contain None for forms without action URL
        if form is None:
            continue
        if any(key.lower() in cginames for key in form.data):
            return form
    # not found
    return None

View file

@ -17,7 +17,7 @@
"""
Main functions for link parsing
"""
from .. import log, LOG_CHECK, strformat
from .. import log, LOG_CHECK, strformat, url as urlutil
from ..htmlutil import linkparse
from ..HtmlParser import htmlsax
from ..bookmarks import firefox
@ -30,6 +30,8 @@ def parse_url(url_data):
key = "html"
elif url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url):
key = "firefox"
elif url_data.scheme == "itms-services":
key = "itms_services"
else:
# determine parse routine according to content types
mime = url_data.content_type
@ -140,4 +142,13 @@ def parse_firefox (url_data):
url_data.add_url(url, name=name)
def parse_itms_services(url_data):
    """Add the value of the first "url" query parameter as child URL."""
    pairs = urlutil.parse_qsl(url_data.urlparts[3], keep_blank_values=True)
    for name, value, _sep in pairs:
        if name != "url":
            continue
        # only the first "url" parameter is used
        url_data.add_url(value)
        break
from .sitemap import parse_sitemap, parse_sitemapindex

View file

@ -32,7 +32,7 @@ from .. import fileutil, log, LOG_PLUGIN
_initialized = False
def init_win32com ():
"""Initialize the win32com.client cache."""
global _initialized
global _initialized
if _initialized:
return
import win32com.client
@ -117,7 +117,8 @@ class WordParser(_ParserPlugin):
def check(self, url_data):
"""Parse Word data."""
filename = get_temp_filename()
content = url_data.get_content()
filename = get_temp_filename(content)
# open word file and parse hyperlinks
try:
app = get_word_app()

View file

@ -255,7 +255,7 @@ def url_parse_query (query, encoding=None):
query, rest = query.rsplit('?', 1)
append = '?'+url_parse_query(rest)+append
l = []
for k, v, sep in parse_qsl(query, True):
for k, v, sep in parse_qsl(query, keep_blank_values=True):
k = url_quote_part(k, '/-:,;')
if v:
v = url_quote_part(v, '/-:,;')
@ -373,7 +373,7 @@ def collapse_segments (path):
return path
url_is_absolute = re.compile("^[a-z]+:", re.I).match
url_is_absolute = re.compile(r"^[-\.a-z]+:", re.I).match
def url_quote (url):

View file

@ -1,5 +1,5 @@
Project: LinkChecker
Version: 9.2
Version: 9.3
Website-URL: http://wummel.github.io/linkchecker/
Changelog-URL: https://github.com/wummel/linkchecker/blob/master/doc/changelog.txt
Source-Package-URL: https://pypi.python.org/packages/source/L/LinkChecker/LinkChecker-${version}.tar.gz

View file

@ -10,17 +10,12 @@ MYMAIL := bastian.kleineidam@web.de
LFILE = LC_MESSAGES/$(PACKAGE).mo
# defined language (add new languages here)
LANGUAGES = de fr es
# one compiled catalog (*.mo) per translation source (*.po)
MOFILES := $(patsubst %.po,%.mo,$(wildcard *.po))
all:
all: $(MOFILES)
mofiles:
@for la in $(LANGUAGES); do \
if [ ! -d $(LDIR)/$$la/LC_MESSAGES ]; then \
mkdir -p $(LDIR)/$$la/LC_MESSAGES; \
fi; \
echo "Formatting language catalog $${la}:"; \
$(MSGFMT) -c --statistics -o $(LDIR)/$$la/$(LFILE) $$la.po; \
done
%.mo: %.po
$(MSGFMT) -c --statistics -o $@ $<
%.po: $(TEMPLATE)
$(MSGMERGE) -U --suffix=.bak $@ $<
@ -42,4 +37,4 @@ clean:
@for f in $(LANGUAGES); do rm -f $(LDIR)/$$f/$(LFILE); done
rm -f *.mo *.bak
.PHONY: mofiles clean
.PHONY: clean

BIN
po/de.mo Normal file

Binary file not shown.

433
po/de.po

File diff suppressed because it is too large Load diff

BIN
po/es.mo Normal file

Binary file not shown.

3663
po/es.po

File diff suppressed because it is too large Load diff

BIN
po/fr.mo Normal file

Binary file not shown.

3702
po/fr.po

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,210 +0,0 @@
# -*- coding: iso-8859-1 -*-
# License: Python license
# Copyright by Martin v. Löwis <loewis@informatik.hu-berlin.de>
# Plural forms support added by alexander smishlajev <alex@tycobka.lv>
"""
Generate binary message catalog from textual translation description.
This program converts a textual Uniforum-style message catalog (.po file) into
a binary GNU catalog (.mo file). This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation.
Usage: msgfmt.py [OPTIONS] filename.po
Options:
-o file
--output-file=file
Specify the output file to write to. If omitted, output will go to a
file named filename.mo (based off the input file name).
-h
--help
Print this message and exit.
-V
--version
Display version information and exit.
"""
import sys
import os
import getopt
import struct
import array
__version__ = "1.1"
MESSAGES = {}
def usage (ecode, msg=''):
"""Print usage and msg and exit with given code."""
print >> sys.stderr, __doc__
if msg:
print >> sys.stderr, msg
sys.exit(ecode)
def add (msgid, transtr, fuzzy):
"""Add a non-fuzzy translation to the dictionary."""
if not fuzzy and transtr and not transtr.startswith('\0'):
MESSAGES[msgid] = transtr
def generate ():
"""Return the generated output."""
keys = MESSAGES.keys()
# the keys are sorted in the .mo file
keys.sort()
offsets = []
ids = strs = ''
for _id in keys:
# For each string, we need size and file offset. Each string is NUL
# terminated; the NUL does not count into the size.
offsets.append((len(ids), len(_id), len(strs), len(MESSAGES[_id])))
ids += _id + '\0'
strs += MESSAGES[_id] + '\0'
# The header is 7 32-bit unsigned integers. We don't use hash tables, so
# the keys start right after the index tables.
# translated string.
keystart = 7*4+16*len(keys)
# and the values start after the keys
valuestart = keystart + len(ids)
koffsets = []
voffsets = []
# The string table first has the list of keys, then the list of values.
# Each entry has first the size of the string, then the file offset.
for o1, l1, o2, l2 in offsets:
koffsets += [l1, o1+keystart]
voffsets += [l2, o2+valuestart]
offsets = koffsets + voffsets
output = struct.pack("Iiiiiii",
0x950412deL, # Magic
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
0, 0) # size and offset of hash table
output += array.array("i", offsets).tostring()
output += ids
output += strs
return output
def make (filename, outfile):
ID = 1
STR = 2
MESSAGES.clear()
# Compute .mo name from .po name and arguments
if filename.endswith('.po'):
infile = filename
else:
infile = filename + '.po'
if outfile is None:
outfile = os.path.splitext(infile)[0] + '.mo'
try:
lines = open(infile).readlines()
except IOError, msg:
print >> sys.stderr, msg
sys.exit(1)
section = None
fuzzy = 0
# Parse the catalog
msgid = msgstr = ''
lno = 0
for l in lines:
lno += 1
# If we get a comment line after a msgstr, this is a new entry
if l[0] == '#' and section == STR:
add(msgid, msgstr, fuzzy)
section = None
fuzzy = 0
# Record a fuzzy mark
if l[:2] == '#,' and (l.find('fuzzy') >= 0):
fuzzy = 1
# Skip comments
if l[0] == '#':
continue
# Start of msgid_plural section, separate from singular form with \0
if l.startswith('msgid_plural'):
msgid += '\0'
l = l[12:]
# Now we are in a msgid section, output previous section
elif l.startswith('msgid'):
if section == STR:
add(msgid, msgstr, fuzzy)
section = ID
l = l[5:]
msgid = msgstr = ''
# Now we are in a msgstr section
elif l.startswith('msgstr'):
section = STR
l = l[6:]
# Check for plural forms
if l.startswith('['):
# Separate plural forms with \0
if not l.startswith('[0]'):
msgstr += '\0'
# Ignore the index - must come in sequence
l = l[l.index(']') + 1:]
# Skip empty lines
l = l.strip()
if not l:
continue
# XXX: Does this always follow Python escape semantics?
l = eval(l)
if section == ID:
msgid += l
elif section == STR:
msgstr += l
else:
print >> sys.stderr, 'Syntax error on %s:%d' % (infile, lno), \
'before:'
print >> sys.stderr, l
sys.exit(1)
# Add last entry
if section == STR:
add(msgid, msgstr, fuzzy)
# Compute output
output = generate()
try:
open(outfile,"wb").write(output)
except IOError,msg:
print >> sys.stderr, msg
def main ():
try:
opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
['help', 'version', 'output-file='])
except getopt.error, msg:
usage(1, msg)
outfile = None
# parse options
for opt, arg in opts:
if opt in ('-h', '--help'):
usage(0)
elif opt in ('-V', '--version'):
print >> sys.stderr, "msgfmt.py", __version__
sys.exit(0)
elif opt in ('-o', '--output-file'):
outfile = arg
# do it
if not args:
print >> sys.stderr, 'No input file given'
print >> sys.stderr, "Try `msgfmt --help' for more information."
return
for filename in args:
make(filename, outfile)
if __name__ == '__main__':
main()

View file

@ -2,7 +2,6 @@
requests>=2.2.0
# optional:
argcomplete
#twill
# for testing:
#pytest
#pyftpdlib

View file

@ -99,7 +99,7 @@ except ImportError:
has_py2app = False
# the application version
AppVersion = "9.3"
AppVersion = "9.4"
# the application name
AppName = "LinkChecker"
Description = "check links in web documents or full websites"
@ -119,8 +119,6 @@ MSVCP90Token = '1fc8b3b9a1e18e3b'
# basic includes for py2exe and py2app
py_includes = ['dns.rdtypes.IN.*', 'dns.rdtypes.ANY.*',
'twill.extensions.*', 'twill.extensions.match_parse.*',
'twill.other_packages.*', 'twill.other_packages._mechanize_dist.*',
'linkcheck.logger.*',
]
# basic excludes for py2exe and py2app
@ -399,20 +397,13 @@ class MyInstallLib (install_lib, object):
class MyInstallData (install_data, object):
"""Handle locale files and permissions."""
"""Fix file permissions."""
def run (self):
"""Adjust permissions on POSIX systems."""
self.add_message_files()
super(MyInstallData, self).run()
self.fix_permissions()
def add_message_files (self):
"""Add locale message files to data_files list."""
for (src, dst) in list_message_files(self.distribution.get_name()):
dstdir = os.path.dirname(dst)
self.data_files.append((dstdir, [os.path.join("build", dst)]))
def fix_permissions (self):
"""Set correct read permissions on POSIX systems. Might also
be possible by setting umask?"""
@ -553,7 +544,7 @@ class MyBuildExt (build_ext, object):
self.build_extension(ext)
def list_message_files (package, suffix=".po"):
def list_message_files (package, suffix=".mo"):
"""Return list of all found message files and their installation paths."""
for fname in glob.glob("po/*" + suffix):
# basename (without extension) is a locale name
@ -587,21 +578,9 @@ def check_manifest ():
class MyBuild (build, object):
"""Custom build command."""
def build_message_files (self):
"""For each po/*.po, build .mo file in target locale directory."""
# msgfmt.py is in the po/ subdirectory
sys.path.append('po')
import msgfmt
for (src, dst) in list_message_files(self.distribution.get_name()):
build_dst = os.path.join("build", dst)
self.mkpath(os.path.dirname(build_dst))
self.announce("Compiling %s -> %s" % (src, build_dst))
msgfmt.make(src, build_dst)
def run (self):
"""Check MANIFEST and build message files before building."""
"""Check MANIFEST before building."""
check_manifest()
self.build_message_files()
build.run(self)
@ -670,6 +649,9 @@ data_files = [
]),
]
for (src, dst) in list_message_files(AppName):
data_files.append((src, dst))
if os.name == 'posix':
data_files.append(('share/man/man1', ['doc/en/linkchecker.1', 'doc/en/linkchecker-gui.1']))
data_files.append(('share/man/man5', ['doc/en/linkcheckerrc.5']))
@ -961,7 +943,6 @@ args = dict(
# See also doc/install.txt for more detailed dependency documentation.
#extra_requires = {
# "IP country info": ['GeoIP'], # http://www.maxmind.com/app/python
# "Login form": ['twill'], # http://twill.idyll.org/
# "GNOME proxies": ['pygtk'], # http://www.pygtk.org/downloads.html
# "Bash completion": ['argcomplete'], # https://pypi.python.org/pypi/argcomplete
# "Memory debugging": ['meliae'], # https://launchpad.net/meliae

View file

@ -15,14 +15,14 @@
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Test miscellaneous html tag parsing.
Test miscellaneous html tag parsing and URL types
"""
from . import LinkCheckTest
class TestMisc (LinkCheckTest):
"""
Test link checking of HTML tags.
Test misc link types.
"""
def test_misc (self):
@ -33,3 +33,17 @@ class TestMisc (LinkCheckTest):
def test_archive (self):
self.file_test("archive.html")
def test_itms_services(self):
url = u"itms-services:?action=download-manifest&url=http://www.example.com/"
resultlines = [
u"url %s" % url,
u"cache key %s" % url,
u"real url %s" % url,
u"valid",
u"url http://www.example.com/",
u"cache key http://www.example.com/",
u"real url http://www.example.com/",
u"valid",
]
self.direct(url, resultlines, recursionlevel=1)

View file

@ -38,6 +38,7 @@ class TestConfig (unittest.TestCase):
config = linkcheck.configuration.Configuration()
files = [get_file("config0.ini")]
config.read(files)
config.sanitize()
# checking section
for scheme in ("http", "https", "ftp"):
self.assertTrue(scheme in config["allowedschemes"])

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2004-2012 Bastian Kleineidam
# Copyright (C) 2004-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by

View file

@ -1,5 +1,5 @@
# -*- coding: iso-8859-1 -*-
# Copyright (C) 2011 Bastian Kleineidam
# Copyright (C) 2011-2014 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by