Merge branch 'master' into py3

2026-04-21 06:41:00 +00:00 · 2014-09-12 17:36:46 +02:00 · 2014-09-12 17:36:46 +02:00 · d89217efaa
commit d89217efaa
parent 628d925716 9960c179fb
40 changed files with 4802 additions and 4167 deletions
--- a/.gitignore
+++ b/.gitignore
@ -29,7 +29,6 @@ Changelog.linkchecker*
 /doc/html/*.qch
 /.achievements
 /doc/*.mo
-/po/*.mo
 /LinkChecker-*-portable.zip
 /LinkChecker-*.exe
 /LinkChecker.egg-info
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,4 +1,4 @@
-include README.txt COPYING MANIFEST.in
+include README.rst COPYING MANIFEST.in
 include config/linkchecker-completion config/create.sql
 include config/linkcheckerrc
 include config/linkchecker.apache2.conf install-rpm.sh
@ -17,7 +17,7 @@ include linkcheck/gui/rc/Makefile
 include linkcheck/gui/rc/*.png
 include linkcheck/gui/rc/*.qrc
 include linkcheck/gui/ui/*.ui
-include po/*.po po/*.pot po/Makefile po/msgfmt.py
+include po/*.po po/*.mo po/*.pot po/Makefile
 include doc/*.example doc/*.txt
 include doc/html/*.ico
 include doc/html/*.html
--- a/3
+++ b/3
@ -75,7 +75,6 @@ all:
 clean:
 	-$(PYTHON) setup.py clean --all
 	rm -f $(LAPPNAME)-out.* *-stamp*
-	$(MAKE) -C po clean
 	$(MAKE) -C doc/html clean
 	$(MAKE) -C linkcheck/HtmlParser clean
 	rm -f linkcheck/network/_network*.so
@ -94,7 +93,7 @@ MANIFEST: MANIFEST.in setup.py
 	$(PYTHON) setup.py sdist --manifest-only

 locale:
-	$(MAKE) -C po mofiles
+	$(MAKE) -C po

 # to build in the current directory
 localbuild: MANIFEST locale
--- a/doc/changelog.txt
+++ b/doc/changelog.txt
@ -1,4 +1,23 @@
-9.3 "" (released xx.xx.2014)
+9.4 "" (released xx.xx.xxxx)
+
+Features:
+- checking: Support itms-services: URLs.
+  Closes: GH bug #532
+
+Changes:
+- installation: Remove dependency on msgfmt.py by pre-generating the
+  *.mo files and adding them to version control.
+  The reason was the difficulty of running msgfmt.py under both Python 2 and 3.
+- checking: When checking SSL certificates on POSIX systems, try
+  to use the system certificate store.
+
+Fixes:
+- checking: Correct typos in the proxy handling code.
+  Closes: GH bug #536
+- cmdline: Reactivate paging of help pages.
+
+
+9.3 "Better Living Through Chemistry" (released 16.7.2014)

 Features:
 - checking: Parse and check links in PDF files.
@ -12,6 +31,7 @@ Changes:
  import needed third party modules.
 - checking: Treat empty URLs as same as parent URL.
  Closes: GH bug #524
+- installation: Replaced the twill dependency with local code.

 Fixes:
 - checking: Catch XML parse errors in sitemap XML files and print them
@ -28,6 +48,8 @@ Fixes:
  Closes: GH bug #519
 - checking: Use user-supplied authentication and proxies when requestiong
  robot.txt.
+- plugins: Fix Word file check plugin.
+  Closes: GH bug #530


 9.2 "Rick and Morty" (released 23.4.2014)
--- a/doc/development.txt
+++ b/doc/development.txt
@ -39,9 +39,6 @@ installation is recommended.
 - *Optional, for displaying country codes:*
  Pygeoip from http://code.google.com/p/pygeoip/

- *Optional, used for login form submission:*
-  Twill from http://twill.idyll.org/
-

 Setup for Unix/Linux
 --------------------
--- a/doc/install.txt
+++ b/doc/install.txt
@ -73,15 +73,12 @@ First, install the required software.
 7. *Optional, used for Virus checking:*
   ClamAv from http://www.clamav.net/

-8. *Optional, used for login form submission:*
-   Twill from http://twill.idyll.org/
-
-9. *Optional, for GNOME proxy setting parsing:*
+8. *Optional, for GNOME proxy setting parsing:*
    Python Gtk from http://www.pygtk.org/downloads.html

-10. *Optional, to run the WSGI web interface:*
-    Apache from http://httpd.apache.org/
-    mod_wsgi from http://code.google.com/p/modwsgi/
+9. *Optional, to run the WSGI web interface:*
+   Apache from http://httpd.apache.org/
+   mod_wsgi from http://code.google.com/p/modwsgi/


 Now install the application.
--- a/doc/python3.txt
+++ b/doc/python3.txt
@ -1,8 +1,8 @@
+Date: 15.7.2014
+
 Porting status of dependent Python packages
 ============================================

-Date: 3.3.2014
-
 OK   Python
 OK   requests
 OK   Qt/PyQt
@ -11,4 +11,8 @@ OK   argcomplete from https://pypi.python.org/pypi/argcomplete
 OK   dnspython (as dnspython3)
 OK   pygeoip from https://pypi.python.org/pypi/pygeoip/
 OK   Port Python Gtk stuff to PyGObject https://live.gnome.org/PyGObject/IntrospectionPorting
-TODO(optional) Twill from http://twill.idyll.org/
+
+Overall Porting status
+=======================
+
+NOT STARTED
--- a/doc/web/app.yaml
+++ b/doc/web/app.yaml
@ -1,4 +1,4 @@
-version: "9.2"
+version: "9.3"
 name: "LinkChecker"
 lname: "linkchecker"
 maintainer: "Bastian Kleineidam"
--- a/doc/web/content/faq.md
+++ b/doc/web/content/faq.md
@ -19,6 +19,11 @@ policy by the webmaster running the website you are checking. Look in
 the ``/robots.txt`` file which follows the
 [robots.txt exclusion standard](http://www.robotstxt.org/robotstxt.html).

+For identification LinkChecker adds to each request a User-Agent header
+like this:
+
+    Mozilla/5.0 (compatible; LinkChecker/9.3; +http://wummel.github.io/linkchecker/)
+
 If you yourself are the webmaster, consider allowing LinkChecker to
 check your web pages by adding the following to your robots.txt file:

--- a/doc/web/content/index.md
+++ b/doc/web/content/index.md
@ -6,7 +6,9 @@ Introduction
 LinkChecker is a free, [GPL](http://www.gnu.org/licenses/gpl-2.0.html)
 licensed website validator.
 LinkChecker checks links in web documents or full websites.
-It runs on systems with Python 2.7.2 or later.
+It runs on Python 2 systems, requiring Python 2.7.2 or later.
+Python 3 is not (yet) supported.
+

 Features
 ---------
--- a/linkcheck/cache/robots_txt.py
+++ b/linkcheck/cache/robots_txt.py
@ -17,7 +17,7 @@
 """
 Cache robots.txt contents.
 """
-from .. import robotparser2, configuration
+from .. import robotparser2
 from ..containers import LFUCache
 from ..decorators import synchronized
 from ..lock import get_lock
@ -33,14 +33,14 @@ class RobotsTxt (object):
    Thread-safe cache of downloaded robots.txt files.
    format: {cache key (string) -> robots.txt content (RobotFileParser)}
    """
-    useragent = str(configuration.UserAgent)

-    def __init__ (self):
+    def __init__ (self, useragent):
        """Initialize per-URL robots.txt cache."""
        # mapping {URL -> parsed robots.txt}
        self.cache = LFUCache(size=100)
        self.hits = self.misses = 0
        self.roboturl_locks = {}
+        self.useragent = useragent

    def allows_url (self, url_data):
        """Ask robots.txt allowance."""
@ -59,7 +59,7 @@ class RobotsTxt (object):
            self.misses += 1
        kwargs = dict(auth=url_data.auth, session=url_data.session)
        if url_data.proxy:
-            kwargs["proxies"] = {url_data.proxy_type, url_data.proxy}
+            kwargs["proxies"] = {url_data.proxytype: url_data.proxy}
        rp = robotparser2.RobotFileParser(**kwargs)
        rp.set_url(roboturl)
        rp.read()
--- a/linkcheck/checker/init.py
+++ b/linkcheck/checker/init.py
@ -143,6 +143,8 @@ def get_urlclass_from (scheme, assume_local_file=False):
        klass = nntpurl.NntpUrl
    elif scheme == "dns":
        klass = dnsurl.DnsUrl
+    elif scheme == "itms-services":
+        klass = itmsservicesurl.ItmsServicesUrl
    elif scheme and unknownurl.is_unknown_scheme(scheme):
        klass = unknownurl.UnknownUrl
    elif assume_local_file:
@ -174,4 +176,4 @@ def get_index_html (urls):

 # all the URL classes
 from . import (fileurl, unknownurl, ftpurl, httpurl, dnsurl,
-    mailtourl, telneturl, nntpurl, ignoreurl)
+    mailtourl, telneturl, nntpurl, ignoreurl, itmsservicesurl)
--- a/linkcheck/checker/const.py
+++ b/linkcheck/checker/const.py
@ -47,6 +47,7 @@ ExcCacheList = [
    EOFError,
    # http errors
    requests.exceptions.RequestException,
+    requests.packages.urllib3.exceptions.HTTPError,
    # ftp errors
    ftplib.error_reply,
    ftplib.error_temp,
--- a/linkcheck/checker/httpurl.py
+++ b/linkcheck/checker/httpurl.py
@ -19,6 +19,13 @@ Handle http links.
 """

 import requests
+# The validity of SSL certs is ignored to be able
+# to check the URL and recurse into it.
+# The warning about invalid SSL certs is given to the
+# user instead.
+import warnings
+warnings.simplefilter('ignore', requests.packages.urllib3.exceptions.InsecureRequestWarning)
+
 from cStringIO import StringIO

 from .. import (log, LOG_CHECK, strformat, mimeutil,
--- a/linkcheck/checker/itmsservicesurl.py
+++ b/linkcheck/checker/itmsservicesurl.py
@ -0,0 +1,45 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2014 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+Handle itms-services URLs.
+"""
+
+from . import urlbase
+from .. import log, LOG_CHECK
+
+
+class ItmsServicesUrl(urlbase.UrlBase):
+    """Apple iOS application download URLs (itms-services: scheme)."""
+
+    def check_syntax(self):
+        """Run the base class syntax check, then require a "url" CGI
+        parameter. The result is set to invalid when the query part of
+        the URL (self.urlparts[3]) does not contain "url="."""
+        super(ItmsServicesUrl, self).check_syntax()
+        if u"url=" not in self.urlparts[3]:
+            self.set_result(_("Missing required url parameter"), valid=False)
+
+    def local_check(self):
+        """Only log the URL being checked; no further checks are run here."""
+        log.debug(LOG_CHECK, "Checking %s", unicode(self))
+
+    def check_content(self):
+        """Allow recursion to check the url CGI param."""
+        return True
+
+    def is_parseable(self):
+        """This URL is parseable."""
+        return True
--- a/linkcheck/checker/proxysupport.py
+++ b/linkcheck/checker/proxysupport.py
@ -18,6 +18,7 @@
 Mixin class for URLs that can be fetched over a proxy.
 """
 import urllib
+import urlparse
 import os
 from .. import LinkCheckerError, log, LOG_CHECK, url as urlutil, httputil

@ -35,29 +36,30 @@ class ProxySupport (object):
        self.proxyauth = None
        if not self.proxy:
            return
-        self.proxytype, self.proxy = urllib.splittype(self.proxy)
+        proxyurl = urlparse.urlparse(self.proxy)
+        self.proxytype = proxyurl.scheme
        if self.proxytype not in ('http', 'https'):
            # Note that invalid proxies might raise TypeError in urllib2,
            # so make sure to stop checking at this point, not later.
            msg = _("Proxy value `%(proxy)s' must start with 'http:' or 'https:'.") \
                 % dict(proxy=proxy)
            raise LinkCheckerError(msg)
-        self.proxy = urllib.splithost(self.proxy)[0]
-        self.proxyauth, self.proxy = urllib.splituser(self.proxy)
        if self.ignore_proxy_host():
            # log proxy without auth info
            log.debug(LOG_CHECK, "ignoring proxy %r", self.proxy)
            self.add_info(_("Ignoring proxy setting `%(proxy)s'.") %
                dict(proxy=proxy))
-            self.proxy = self.proxyauth = None
+            self.proxy = None
            return
        log.debug(LOG_CHECK, "using proxy %r", self.proxy)
        self.add_info(_("Using proxy `%(proxy)s'.") % dict(proxy=self.proxy))
-        if self.proxyauth is not None:
-            if ":" not in self.proxyauth:
-                self.proxyauth += ":"
-            self.proxyauth = httputil.encode_base64(self.proxyauth)
-            self.proxyauth = "Basic "+self.proxyauth
+        self.proxyhost = proxyurl.hostname
+        self.proxyport = proxyurl.port
+        if proxyurl.username is not None:
+            username = proxyurl.username
+            # The password is an attribute of the parsed proxy URL (proxyurl);
+            # a missing password is sent as an empty string.
+            password = proxyurl.password if proxyurl.password is not None else ""
+            auth = "%s:%s" % (username, password)
+            # value for HTTP basic proxy authentication
+            self.proxyauth = "Basic "+httputil.encode_base64(auth)

    def ignore_proxy_host (self):
        """Check if self.host is in the $no_proxy ignore list."""
@ -79,7 +81,8 @@ class ProxySupport (object):
        """
        if self.proxy:
            scheme = self.proxytype
-            host, port = urlutil.splitport(self.proxy)
+            host = self.proxyhost
+            port = self.proxyport
        else:
            scheme = self.scheme
            host = self.host
--- a/linkcheck/cmdline.py
+++ b/linkcheck/cmdline.py
@ -27,7 +27,7 @@ from .director import console
 class LCArgumentParser(argparse.ArgumentParser):
    """Custom argument parser to format help text."""

-    def print_help(self, file=None):
+    def print_help(self, file=sys.stdout):
        """Print a help message to stdout."""
        msg = console.encode(self.format_help())
        if fileutil.is_tty(file):
--- a/linkcheck/configuration/init.py
+++ b/linkcheck/configuration/init.py
@ -63,7 +63,6 @@ Modules = (
    ("argcomplete", u"Argcomplete"),
    ("GeoIP", u"GeoIP"),   # on Unix systems
    ("pygeoip", u"GeoIP"), # on Windows systems
-    ("twill", u"Twill"),
    ("sqlite3", u"Sqlite"),
    ("gconf", u"Gconf"),
    ("meliae", u"Meliae"),
@ -117,6 +116,34 @@ def get_share_file (filename, devel_dir=None):
    raise ValueError(msg)


+def get_system_cert_file():
+    """Locate a system-wide SSL certificate file.
+    Only POSIX systems are searched, at /etc/ssl/certs/ca-certificates.crt.
+    @return: the filename of the cert file
+    @raises: ValueError when no system cert file could be found
+    """
+    candidate = "/etc/ssl/certs/ca-certificates.crt"
+    if os.name == 'posix' and os.path.isfile(candidate):
+        return candidate
+    raise ValueError("no system certificate file found")
+
+
+def get_certifi_file():
+    """Return the SSL certificate file installed by the certifi package.
+    @return: the filename of the cert file
+    @rtype: string
+    @raises: ImportError when certifi is not installed or ValueError when
+             the file is not found
+    """
+    # imported here so that a missing certifi package only affects callers
+    import certifi
+    certfile = certifi.where()
+    if not os.path.isfile(certfile):
+        raise ValueError("%s not found; check your certifi installation" % certfile)
+    return certfile
+
+
 # dynamic options
 class Configuration (dict):
    """
@ -219,7 +246,6 @@ class Configuration (dict):
                filtered_cfiles.append(cfile)
        log.debug(LOG_CHECK, "reading configuration from %s", filtered_cfiles)
        confparse.LCConfigParser(self).read(filtered_cfiles)
-        self.sanitize()

    def add_auth (self, user=None, password=None, pattern=None):
        """Add given authentication data."""
@ -317,12 +343,20 @@ class Configuration (dict):
                self[plugin] = {}

    def sanitize_ssl(self):
-        """Use locally installed certificate file if available."""
+        """Use a locally installed certificate file if available.
+        Tries the system certificate file first, then the one from the
+        certifi package, then the cacert.pem share file."""
        if self["sslverify"] is True:
            try:
-                self["sslverify"] = get_share_file('cacert.pem')
+                self["sslverify"] = get_system_cert_file()
            except ValueError:
-                pass
+                try:
+                    self["sslverify"] = get_certifi_file()
+                except (ValueError, ImportError):
+                    try:
+                        self["sslverify"] = get_share_file('cacert.pem')
+                    except ValueError:
+                        # no certificate file found; sslverify stays True
+                        pass


 def get_plugin_folders():
--- a/linkcheck/director/init.py
+++ b/linkcheck/director/init.py
@ -20,100 +20,18 @@ Management of checking a queue of links with several threads.
 import os
 import thread
 import time
-from .. import log, LOG_CHECK, LinkCheckerInterrupt, dummy, \
-  fileutil, strformat, plugins
+from .. import log, LOG_CHECK, LinkCheckerInterrupt, plugins
 from ..cache import urlqueue, robots_txt, results
 from . import aggregator, console


-def visit_loginurl (aggregate):
-    """Check for a login URL and visit it."""
-    config = aggregate.config
-    url = config["loginurl"]
-    if not url:
-        return
-    if not fileutil.has_module("twill"):
-        msg = strformat.format_feature_warning(module=u'twill',
-            feature=u'login URL visit',
-            url=u'http://twill.idyll.org/')
-        log.warn(LOG_CHECK, msg)
-        return
-    from twill import commands as tc
-    log.debug(LOG_CHECK, u"Visiting login URL %s", url)
-    configure_twill(tc)
-    tc.go(url)
-    if tc.get_browser().get_code() != 200:
-        log.warn(LOG_CHECK, _("Error visiting login URL %(url)s.") % \
-          {"url": url})
-        return
-    submit_login_form(config, url, tc)
-    if tc.get_browser().get_code() != 200:
-        log.warn(LOG_CHECK, _("Error posting form at login URL %(url)s.") % \
-          {"url": url})
-        return
-    #XXX store_cookies(tc.get_browser().cj, aggregate.cookies, url)
-    resulturl = tc.get_browser().get_url()
-    log.debug(LOG_CHECK, u"URL after POST is %s" % resulturl)
-    # add result URL to check list
-    from ..checker import get_url_from
-    aggregate.urlqueue.put(get_url_from(resulturl, 0, aggregate))
-
-
-def configure_twill (tc):
-    """Configure twill to be used by LinkChecker.
-    Note that there is no need to set a proxy since twill uses the same
-    ones (provided from urllib) as LinkChecker does.
-    """
-    # make sure readonly controls are writeable (might be needed)
-    tc.config("readonly_controls_writeable", True)
-    # disable page refreshing
-    tc.config("acknowledge_equiv_refresh", False)
-    # fake IE 6.0 to talk sense into some sites (eg. SourceForge)
-    tc.agent("Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)")
-    # tell twill to shut up
-    tc.OUT = dummy.Dummy()
-    from twill import browser
-    browser.OUT = dummy.Dummy()
-    # set debug level
-    if log.is_debug(LOG_CHECK):
-        tc.debug("http", 1)
-
-
-def submit_login_form (config, url, tc):
-    """Fill and submit login form."""
-    user, password = config.get_user_password(url)
-    cgiuser = config["loginuserfield"]
-    cgipassword = config["loginpasswordfield"]
-    formname = search_formname((cgiuser, cgipassword), tc)
-    tc.formvalue(formname, cgiuser, user)
-    tc.formvalue(formname, cgipassword, password)
-    for key, value in config["loginextrafields"].items():
-        tc.formvalue(formname, key, value)
-    tc.submit()
-
-
-def search_formname (fieldnames, tc):
-    """Search form that has all given CGI fieldnames."""
-    browser = tc.get_browser()
-    for formcounter, form in enumerate(browser.get_all_forms()):
-        for name in fieldnames:
-            try:
-                browser.get_form_field(form, name)
-            except tc.TwillException:
-                break
-        else:
-            return form.name or form.attrs.get('id') or formcounter
-    # none found
-    return None
-
-
 def check_urls (aggregate):
    """Main check function; checks all configured URLs until interrupted
    with Ctrl-C.
    @return: None
    """
    try:
-        visit_loginurl(aggregate)
+        aggregate.visit_loginurl()
    except Exception as msg:
        log.warn(LOG_CHECK, _("Error using login URL: %(msg)s.") % \
                 dict(msg=msg))
@ -210,7 +128,7 @@ def abort_now ():
 def get_aggregate (config):
    """Get an aggregator instance with given configuration."""
    _urlqueue = urlqueue.UrlQueue(max_allowed_urls=config["maxnumurls"])
-    _robots_txt = robots_txt.RobotsTxt()
+    _robots_txt = robots_txt.RobotsTxt(config["useragent"])
    plugin_manager = plugins.PluginManager(config)
    result_cache = results.ResultCache()
    return aggregator.Aggregate(config, _urlqueue, _robots_txt, plugin_manager,
--- a/linkcheck/director/aggregator.py
+++ b/linkcheck/director/aggregator.py
@ -21,10 +21,12 @@ import threading
 import thread
 import requests
 import time
+import urlparse
 import random
-from .. import log, LOG_CHECK, strformat, cookies
+from .. import log, LOG_CHECK, strformat, LinkCheckerError
 from ..decorators import synchronized
 from ..cache import urlqueue
+from ..htmlutil import formsearch
 from . import logger, status, checker, interrupt


@ -32,15 +34,15 @@ _threads_lock = threading.RLock()
 _hosts_lock = threading.RLock()
 _downloadedbytes_lock = threading.RLock()

-def new_request_session(config):
+def new_request_session(config, cookies):
    """Create a new request session."""
    session = requests.Session()
+    if cookies:
+        session.cookies = cookies
    session.max_redirects = config["maxhttpredirects"]
    session.headers = {
        "User-Agent": config["useragent"],
-        "DNT": "1",
    }
-    # XXX proxies
    if config["cookiefile"]:
-        for cookie in cookies.from_file(config["cookiefile"]):
+        # The "cookies" parameter of this function shadows the linkcheck
+        # cookies module (which is no longer imported at module level),
+        # so import the cookie file loader locally.
+        from ..cookies import from_file
+        for cookie in from_file(config["cookiefile"]):
            session.cookies = requests.cookies.merge_cookies(session.cookies, cookie)
@ -62,11 +64,36 @@ class Aggregate (object):
        self.plugin_manager = plugin_manager
        self.result_cache = result_cache
        self.times = {}
+        self.cookies = None
        requests_per_second = config["maxrequestspersecond"]
        self.wait_time_min = 1.0 / requests_per_second
        self.wait_time_max = max(self.wait_time_min + 0.5, 0.5)
        self.downloaded_bytes = 0

+    def visit_loginurl(self):
+        """Check for a login URL, submit its login form and store the
+        resulting session cookies in self.cookies.
+        Does nothing when no login URL is configured.
+        @raises: LinkCheckerError when no login form is found or when the
+                 login did not set any cookies
+        """
+        url = self.config["loginurl"]
+        if not url:
+            return
+        user, password = self.config.get_user_password(url)
+        session = requests.Session()
+        # XXX user-agent header
+        # XXX timeout
+        response = session.get(url)
+        cgiuser = self.config["loginuserfield"]
+        cgipassword = self.config["loginpasswordfield"]
+        form = formsearch.search_form(response.content, cgiuser, cgipassword,
+              encoding=response.encoding)
+        if form is None:
+            # search_form() returns None when no form has the login fields
+            raise LinkCheckerError("No login form found at login URL %s" % url)
+        form.data[cgiuser] = user
+        form.data[cgipassword] = password
+        for key, value in self.config["loginextrafields"].items():
+            form.data[key] = value
+        # the form action may be relative to the login URL
+        formurl = urlparse.urljoin(url, form.url)
+        response = session.post(formurl, data=form.data)
+        self.cookies = session.cookies
+        if len(self.cookies) == 0:
+            raise LinkCheckerError("No cookies set by login URL %s" % url)
+
    @synchronized(_threads_lock)
    def start_threads (self):
        """Spawn threads for URL checking and status printing."""
@ -85,13 +112,13 @@ class Aggregate (object):
                self.threads.append(t)
                t.start()
        else:
-            self.request_sessions[thread.get_ident()] = new_request_session(self.config)
+            self.request_sessions[thread.get_ident()] = new_request_session(self.config, self.cookies)
            checker.check_urls(self.urlqueue, self.logger)

    @synchronized(_threads_lock)
    def add_request_session(self):
        """Add a request session for current thread."""
-        session = new_request_session(self.config)
+        session = new_request_session(self.config, self.cookies)
        self.request_sessions[thread.get_ident()] = session

    @synchronized(_threads_lock)
--- a/linkcheck/htmlutil/formsearch.py
+++ b/linkcheck/htmlutil/formsearch.py
@ -0,0 +1,113 @@
+# -*- coding: iso-8859-1 -*-
+# Copyright (C) 2014 Bastian Kleineidam
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+HTML form utils
+"""
+from ..HtmlParser import htmlsax
+from .. import log, LOG_CHECK
+
+class Form(object):
+    """Store HTML form URL and form data."""
+
+    def __init__(self, url):
+        """Set URL and empty form data."""
+        self.url = url
+        # mapping {key -> value}, filled by add_value()
+        self.data = {}
+
+    def add_value(self, key, value):
+        """Add a form value."""
+        self.data[key] = value
+
+    def __repr__(self):
+        """Return byte string representation displaying URL and form data.
+        Under Python 2 a unicode result of __repr__ is encoded with the
+        ASCII default encoding, which fails for a non-ASCII URL; so return
+        the UTF-8 encoded string instead."""
+        return str(self)
+
+    def __unicode__(self):
+        """Return unicode string displaying URL and form data."""
+        return u"<url=%s data=%s>" % (self.url, self.data)
+
+    def __str__(self):
+        """Return UTF-8 encoded string displaying URL and form data."""
+        return unicode(self).encode('utf-8')
+
+
+class FormFinder(object):
+    """HtmlParser handler collecting the forms of a HTML document.
+    Each <form> element with an action attribute is stored as Form object
+    in self.forms, together with the values of its named <input> elements."""
+
+    def __init__(self):
+        """Initialize local variables."""
+        super(FormFinder, self).__init__()
+        # parser object will be initialized when it is used as
+        # a handler object
+        self.parser = None
+        # completed forms in document order
+        self.forms = []
+        # the form currently being parsed, or None outside of a form
+        self.form = None
+
+    def start_element(self, tag, attrs):
+        """Start a new form on <form action=...> and add the values of
+        named <input> elements to the current form."""
+        if tag == u'form':
+            if u'action' in attrs:
+                url = attrs['action']
+                self.form = Form(url)
+        elif tag == u'input':
+            if self.form:
+                if 'name' in attrs:
+                    key = attrs['name']
+                    value = attrs.get('value')
+                    self.form.add_value(key, value)
+                else:
+                    # log.warn is the warning function used elsewhere
+                    log.warn(LOG_CHECK, "nameless form input %s" % attrs)
+            else:
+                log.warn(LOG_CHECK, "formless input %s" % attrs)
+
+    def start_end_element(self, tag, attrs):
+        """Delegate a combined start/end element (eg. <input .../>) to
+        the start_element method. Ignore the end element part."""
+        self.start_element(tag, attrs)
+
+    def end_element(self, tag):
+        """Store the current form, if there is one, at the form end tag."""
+        if tag == u'form':
+            # A <form> without action attribute did not create a Form
+            # object; do not store None in the form list.
+            if self.form is not None:
+                self.forms.append(self.form)
+            self.form = None
+
+
+def search_form(content, cgiuser, cgipassword, encoding='utf-8'):
+    """Search the given HTML content for a login form.
+    The first form having an input whose name equals cgiuser or
+    cgipassword (compared case-insensitively) is returned.
+    If no form is found return None.
+    """
+    handler = FormFinder()
+    parser = htmlsax.parser(handler)
+    handler.parser = parser
+    parser.encoding = encoding
+    # parse
+    parser.feed(content)
+    parser.flush()
+    # break cyclic dependencies
+    handler.parser = None
+    parser.handler = None
+    log.debug(LOG_CHECK, "Found forms %s", handler.forms)
+    cginames = (cgiuser.lower(), cgipassword.lower())
+    for form in handler.forms:
+        if form is None:
+            # the form list may contain None for a <form> element
+            # without action attribute
+            continue
+        for key in form.data:
+            if key.lower() in cginames:
+                return form
+    # not found
+    return None
--- a/linkcheck/parser/init.py
+++ b/linkcheck/parser/init.py
@ -17,7 +17,7 @@
 """
 Main functions for link parsing
 """
-from .. import log, LOG_CHECK, strformat
+from .. import log, LOG_CHECK, strformat, url as urlutil
 from ..htmlutil import linkparse
 from ..HtmlParser import htmlsax
 from ..bookmarks import firefox
@ -30,6 +30,8 @@ def parse_url(url_data):
        key = "html"
    elif url_data.is_file() and firefox.has_sqlite and firefox.extension.search(url_data.url):
        key = "firefox"
+    elif url_data.scheme == "itms-services":
+        key = "itms_services"
    else:
        # determine parse routine according to content types
        mime = url_data.content_type
@ -140,4 +142,13 @@ def parse_firefox (url_data):
        url_data.add_url(url, name=name)


+def parse_itms_services(url_data):
+    """Add the value of the first "url" CGI parameter as child URL."""
+    pairs = urlutil.parse_qsl(url_data.urlparts[3], keep_blank_values=True)
+    for name, value, separator in pairs:
+        if name != "url":
+            continue
+        # only the first "url" parameter is used
+        url_data.add_url(value)
+        break
+
+
 from .sitemap import parse_sitemap, parse_sitemapindex
--- a/linkcheck/plugins/parseword.py
+++ b/linkcheck/plugins/parseword.py
@ -32,7 +32,7 @@ from .. import fileutil, log, LOG_PLUGIN
 _initialized = False
 def init_win32com ():
    """Initialize the win32com.client cache."""
-    global _initialized 
+    global _initialized
    if _initialized:
        return
    import win32com.client
@ -117,7 +117,8 @@ class WordParser(_ParserPlugin):

    def check(self, url_data):
        """Parse Word data."""
-        filename = get_temp_filename()
+        content = url_data.get_content()
+        filename = get_temp_filename(content)
        # open word file and parse hyperlinks
        try:
            app = get_word_app()
--- a/linkcheck/url.py
+++ b/linkcheck/url.py
@ -255,7 +255,7 @@ def url_parse_query (query, encoding=None):
        query, rest = query.rsplit('?', 1)
        append = '?'+url_parse_query(rest)+append
    l = []
-    for k, v, sep in parse_qsl(query, True):
+    for k, v, sep in parse_qsl(query, keep_blank_values=True):
        k = url_quote_part(k, '/-:,;')
        if v:
            v = url_quote_part(v, '/-:,;')
@ -373,7 +373,7 @@ def collapse_segments (path):
    return path


-url_is_absolute = re.compile("^[a-z]+:", re.I).match
+url_is_absolute = re.compile(r"^[-\.a-z]+:", re.I).match


 def url_quote (url):
--- a/linkchecker.freecode
+++ b/linkchecker.freecode
@ -1,5 +1,5 @@
 Project: LinkChecker
-Version: 9.2
+Version: 9.3
 Website-URL: http://wummel.github.io/linkchecker/
 Changelog-URL: https://github.com/wummel/linkchecker/blob/master/doc/changelog.txt
 Source-Package-URL: https://pypi.python.org/packages/source/L/LinkChecker/LinkChecker-${version}.tar.gz
--- a/po/Makefile
+++ b/po/Makefile
@ -10,17 +10,12 @@ MYMAIL := bastian.kleineidam@web.de
 LFILE = LC_MESSAGES/$(PACKAGE).mo
 # defined language (add new languages here)
 LANGUAGES = de fr es
+# compiled message catalogs, one for each translation (de.po -> de.mo);
+# listing the *.po files here would never trigger the %.mo rule
+MOFILES := $(patsubst %.po,%.mo,$(wildcard *.po))

-all:
+all:	$(MOFILES)

-mofiles:
-	@for la in $(LANGUAGES); do \
-	  if [ ! -d $(LDIR)/$$la/LC_MESSAGES ]; then \
-	    mkdir -p $(LDIR)/$$la/LC_MESSAGES; \
-	  fi; \
-	  echo "Formatting language catalog $${la}:"; \
-	  $(MSGFMT) -c --statistics -o $(LDIR)/$$la/$(LFILE) $$la.po; \
-	done
+%.mo:	%.po
+	$(MSGFMT) -c --statistics -o $@ $<

 %.po:	$(TEMPLATE)
 	$(MSGMERGE) -U --suffix=.bak $@ $<
@ -42,4 +37,4 @@ clean:
 	@for f in $(LANGUAGES); do rm -f $(LDIR)/$$f/$(LFILE); done
 	rm -f *.mo *.bak

-.PHONY: mofiles clean
+# "all" names a command, not a file, so it must be phony as well
+.PHONY: all clean
--- a/po/de.mo
+++ b/po/de.mo
--- a/po/de.po
+++ b/po/de.po
--- a/po/es.mo
+++ b/po/es.mo
--- a/po/es.po
+++ b/po/es.po
--- a/po/fr.mo
+++ b/po/fr.mo
--- a/po/fr.po
+++ b/po/fr.po
--- a/po/linkchecker.pot
+++ b/po/linkchecker.pot
--- a/po/msgfmt.py
+++ b/po/msgfmt.py
@ -1,210 +0,0 @@
-# -*- coding: iso-8859-1 -*-
-# License: Python license
-# Copyright by Martin v. Löwis <loewis@informatik.hu-berlin.de>
-# Plural forms support added by alexander smishlajev <alex@tycobka.lv>
-"""
-Generate binary message catalog from textual translation description.
-
-This program converts a textual Uniforum-style message catalog (.po file) into
-a binary GNU catalog (.mo file).  This is essentially the same function as the
-GNU msgfmt program, however, it is a simpler implementation.
-
-Usage: msgfmt.py [OPTIONS] filename.po
-
-Options:
-    -o file
-    --output-file=file
-        Specify the output file to write to.  If omitted, output will go to a
-        file named filename.mo (based off the input file name).
-
-    -h
-    --help
-        Print this message and exit.
-
-    -V
-    --version
-        Display version information and exit.
-"""
-
-import sys
-import os
-import getopt
-import struct
-import array
-
-__version__ = "1.1"
-
-MESSAGES = {}
-
-
-def usage (ecode, msg=''):
-    """Print usage and msg and exit with given code."""
-    print >> sys.stderr, __doc__
-    if msg:
-        print >> sys.stderr, msg
-    sys.exit(ecode)
-
-
-def add (msgid, transtr, fuzzy):
-    """Add a non-fuzzy translation to the dictionary."""
-    if not fuzzy and transtr and not transtr.startswith('\0'):
-        MESSAGES[msgid] = transtr
-
-
-def generate ():
-    """Return the generated output."""
-    keys = MESSAGES.keys()
-    # the keys are sorted in the .mo file
-    keys.sort()
-    offsets = []
-    ids = strs = ''
-    for _id in keys:
-        # For each string, we need size and file offset.  Each string is NUL
-        # terminated; the NUL does not count into the size.
-        offsets.append((len(ids), len(_id), len(strs), len(MESSAGES[_id])))
-        ids += _id + '\0'
-        strs += MESSAGES[_id] + '\0'
-    # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
-    # the keys start right after the index tables.
-    # translated string.
-    keystart = 7*4+16*len(keys)
-    # and the values start after the keys
-    valuestart = keystart + len(ids)
-    koffsets = []
-    voffsets = []
-    # The string table first has the list of keys, then the list of values.
-    # Each entry has first the size of the string, then the file offset.
-    for o1, l1, o2, l2 in offsets:
-        koffsets += [l1, o1+keystart]
-        voffsets += [l2, o2+valuestart]
-    offsets = koffsets + voffsets
-    output = struct.pack("Iiiiiii",
-                         0x950412deL,       # Magic
-                         0,                 # Version
-                         len(keys),         # # of entries
-                         7*4,               # start of key index
-                         7*4+len(keys)*8,   # start of value index
-                         0, 0)              # size and offset of hash table
-    output += array.array("i", offsets).tostring()
-    output += ids
-    output += strs
-    return output
-
-
-def make (filename, outfile):
-    ID = 1
-    STR = 2
-    MESSAGES.clear()
-
-    # Compute .mo name from .po name and arguments
-    if filename.endswith('.po'):
-        infile = filename
-    else:
-        infile = filename + '.po'
-    if outfile is None:
-        outfile = os.path.splitext(infile)[0] + '.mo'
-
-    try:
-        lines = open(infile).readlines()
-    except IOError, msg:
-        print >> sys.stderr, msg
-        sys.exit(1)
-
-    section = None
-    fuzzy = 0
-
-    # Parse the catalog
-    msgid = msgstr = ''
-    lno = 0
-    for l in lines:
-        lno += 1
-        # If we get a comment line after a msgstr, this is a new entry
-        if l[0] == '#' and section == STR:
-            add(msgid, msgstr, fuzzy)
-            section = None
-            fuzzy = 0
-        # Record a fuzzy mark
-        if l[:2] == '#,' and (l.find('fuzzy') >= 0):
-            fuzzy = 1
-        # Skip comments
-        if l[0] == '#':
-            continue
-        # Start of msgid_plural section, separate from singular form with \0
-        if l.startswith('msgid_plural'):
-            msgid += '\0'
-            l = l[12:]
-        # Now we are in a msgid section, output previous section
-        elif l.startswith('msgid'):
-            if section == STR:
-                add(msgid, msgstr, fuzzy)
-            section = ID
-            l = l[5:]
-            msgid = msgstr = ''
-        # Now we are in a msgstr section
-        elif l.startswith('msgstr'):
-            section = STR
-            l = l[6:]
-            # Check for plural forms
-            if l.startswith('['):
-                # Separate plural forms with \0
-                if not l.startswith('[0]'):
-                    msgstr += '\0'
-                # Ignore the index - must come in sequence
-                l = l[l.index(']') + 1:]
-        # Skip empty lines
-        l = l.strip()
-        if not l:
-            continue
-        # XXX: Does this always follow Python escape semantics?
-        l = eval(l)
-        if section == ID:
-            msgid += l
-        elif section == STR:
-            msgstr += l
-        else:
-            print >> sys.stderr, 'Syntax error on %s:%d' % (infile, lno), \
-                  'before:'
-            print >> sys.stderr, l
-            sys.exit(1)
-    # Add last entry
-    if section == STR:
-        add(msgid, msgstr, fuzzy)
-
-    # Compute output
-    output = generate()
-
-    try:
-        open(outfile,"wb").write(output)
-    except IOError,msg:
-        print >> sys.stderr, msg
-
-
-def main ():
-    try:
-        opts, args = getopt.getopt(sys.argv[1:], 'hVo:',
-                                   ['help', 'version', 'output-file='])
-    except getopt.error, msg:
-        usage(1, msg)
-
-    outfile = None
-    # parse options
-    for opt, arg in opts:
-        if opt in ('-h', '--help'):
-            usage(0)
-        elif opt in ('-V', '--version'):
-            print >> sys.stderr, "msgfmt.py", __version__
-            sys.exit(0)
-        elif opt in ('-o', '--output-file'):
-            outfile = arg
-    # do it
-    if not args:
-        print >> sys.stderr, 'No input file given'
-        print >> sys.stderr, "Try `msgfmt --help' for more information."
-        return
-
-    for filename in args:
-        make(filename, outfile)
-
-
-if __name__ == '__main__':
-    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -2,7 +2,6 @@
 requests>=2.2.0
 # optional:
 argcomplete
-#twill
 # for testing:
 #pytest
 #pyftpdlib
--- a/setup.py
+++ b/setup.py
@ -99,7 +99,7 @@ except ImportError:
    has_py2app = False

 # the application version
-AppVersion = "9.3"
+AppVersion = "9.4"
 # the application name
 AppName = "LinkChecker"
 Description = "check links in web documents or full websites"
@ -119,8 +119,6 @@ MSVCP90Token = '1fc8b3b9a1e18e3b'

 # basic includes for py2exe and py2app
 py_includes = ['dns.rdtypes.IN.*', 'dns.rdtypes.ANY.*',
-    'twill.extensions.*', 'twill.extensions.match_parse.*',
-    'twill.other_packages.*', 'twill.other_packages._mechanize_dist.*',
    'linkcheck.logger.*',
 ]
 # basic excludes for py2exe and py2app
@ -399,20 +397,13 @@ class MyInstallLib (install_lib, object):


 class MyInstallData (install_data, object):
-    """Handle locale files and permissions."""
+    """Fix file permissions."""

    def run (self):
        """Adjust permissions on POSIX systems."""
-        self.add_message_files()
        super(MyInstallData, self).run()
        self.fix_permissions()

-    def add_message_files (self):
-        """Add locale message files to data_files list."""
-        for (src, dst) in list_message_files(self.distribution.get_name()):
-            dstdir = os.path.dirname(dst)
-            self.data_files.append((dstdir, [os.path.join("build", dst)]))
-
    def fix_permissions (self):
        """Set correct read permissions on POSIX systems. Might also
        be possible by setting umask?"""
@ -553,7 +544,7 @@ class MyBuildExt (build_ext, object):
            self.build_extension(ext)


-def list_message_files (package, suffix=".po"):
+def list_message_files (package, suffix=".mo"):
    """Return list of all found message files and their installation paths."""
    for fname in glob.glob("po/*" + suffix):
        # basename (without extension) is a locale name
@ -587,21 +578,9 @@ def check_manifest ():
 class MyBuild (build, object):
    """Custom build command."""

-    def build_message_files (self):
-        """For each po/*.po, build .mo file in target locale directory."""
-        # msgfmt.py is in the po/ subdirectory
-        sys.path.append('po')
-        import msgfmt
-        for (src, dst) in list_message_files(self.distribution.get_name()):
-            build_dst = os.path.join("build", dst)
-            self.mkpath(os.path.dirname(build_dst))
-            self.announce("Compiling %s -> %s" % (src, build_dst))
-            msgfmt.make(src, build_dst)
-
    def run (self):
-        """Check MANIFEST and build message files before building."""
+        """Check MANIFEST before building."""
        check_manifest()
-        self.build_message_files()
        build.run(self)


@ -670,6 +649,9 @@ data_files = [
        ]),
 ]

+for (src, dst) in list_message_files(AppName):  # NOTE(review): installed file keeps its po/ basename
+    data_files.append((os.path.dirname(dst), [src]))  # distutils expects (target_dir, [files])
+
 if os.name == 'posix':
    data_files.append(('share/man/man1', ['doc/en/linkchecker.1', 'doc/en/linkchecker-gui.1']))
    data_files.append(('share/man/man5', ['doc/en/linkcheckerrc.5']))
@ -961,7 +943,6 @@ args = dict(
    # See also doc/install.txt for more detailed dependency documentation.
    #extra_requires = {
    #    "IP country info": ['GeoIP'], # http://www.maxmind.com/app/python
-    #    "Login form": ['twill'], # http://twill.idyll.org/
    #    "GNOME proxies": ['pygtk'], # http://www.pygtk.org/downloads.html
    #    "Bash completion": ['argcomplete'], # https://pypi.python.org/pypi/argcomplete
    #    "Memory debugging": ['meliae'], # https://launchpad.net/meliae
--- a/tests/checker/test_misc.py
+++ b/tests/checker/test_misc.py
@ -15,14 +15,14 @@
 # with this program; if not, write to the Free Software Foundation, Inc.,
 # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 """
-Test miscellaneous html tag parsing.
+Test miscellaneous html tag parsing and URL types.
 """
 from . import LinkCheckTest


 class TestMisc (LinkCheckTest):
    """
-    Test link checking of HTML tags.
+    Test misc link types.
    """

    def test_misc (self):
@ -33,3 +33,17 @@ class TestMisc (LinkCheckTest):

    def test_archive (self):
        self.file_test("archive.html")
+
+    def test_itms_services(self):
+        url = u"itms-services:?action=download-manifest&url=http://www.example.com/"
+        resultlines = [
+            u"url %s" % url,
+            u"cache key %s" % url,
+            u"real url %s" % url,
+            u"valid",
+            u"url http://www.example.com/",
+            u"cache key http://www.example.com/",
+            u"real url http://www.example.com/",
+            u"valid",
+        ]
+        self.direct(url, resultlines, recursionlevel=1)
--- a/tests/configuration/test_config.py
+++ b/tests/configuration/test_config.py
@ -38,6 +38,7 @@ class TestConfig (unittest.TestCase):
        config = linkcheck.configuration.Configuration()
        files = [get_file("config0.ini")]
        config.read(files)
+        config.sanitize()
        # checking section
        for scheme in ("http", "https", "ftp"):
            self.assertTrue(scheme in config["allowedschemes"])
--- a/tests/test_robotparser.py
+++ b/tests/test_robotparser.py
@ -1,5 +1,5 @@
 # -*- coding: iso-8859-1 -*-
-# Copyright (C) 2004-2012 Bastian Kleineidam
+# Copyright (C) 2004-2014 Bastian Kleineidam
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
--- a/tests/test_updater.py
+++ b/tests/test_updater.py
@ -1,5 +1,5 @@
 # -*- coding: iso-8859-1 -*-
-# Copyright (C) 2011 Bastian Kleineidam
+# Copyright (C) 2011-2014 Bastian Kleineidam
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by