# -*- coding: iso-8859-1 -*-
# Copyright (C) 2000-2005 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
"""
Base URL handler.
"""

import sys
import os
import urlparse
import urllib2
import urllib
import time
import errno
import socket
import select
import traceback

import linkcheck
# linkcheck.url and linkcheck.trace are referenced below
# (url_norm, url_needs_quoting, collapse_segments, trace_on)
import linkcheck.url
import linkcheck.trace
import linkcheck.linkparse
import linkcheck.checker
import linkcheck.strformat
import linkcheck.containers
import linkcheck.log
import linkcheck.httplib2
import linkcheck.HtmlParser.htmlsax


def urljoin (parent, url, scheme):
    """
    If url is relative, join parent and url. Else leave url as-is.

    @return: joined url
    """
    if url.startswith(scheme+":"):
        return url
    return urlparse.urljoin(parent, url)


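# A minimal sketch of the urljoin() semantics above, assuming standard
# urlparse behavior; the URLs are hypothetical:
#   urljoin("http://example.com/a/page.html", "sub.html", "http")
#     -> "http://example.com/a/sub.html"
#   urljoin("http://example.com/a/page.html", "http://other.com/", "http")
#     -> "http://other.com/"  (already absolute, returned unchanged)

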
class UrlBase (object):
    """
    A URL with additional information like validity etc.
    """

    def __init__ (self, base_url, recursion_level, consumer,
                  parent_url = None, base_ref = None,
                  line = -1, column = -1, name = u""):
        """
        Initialize check data, and store given variables.

        @param base_url: unquoted and possibly unnormed url
        @param recursion_level: the check level at which the base url was found
        @param consumer: consumer instance
        @param parent_url: quoted and normed url of parent or None
        @param base_ref: quoted and normed url of <base href=""> or None
        @param line: line number of url in parent content
        @param column: column number of url in parent content
        @param name: name of url or empty
        """
        self.init(base_ref, base_url, parent_url, recursion_level,
                  consumer, line, column, name)
        self.reset()
        self.check_syntax()

    def init (self, base_ref, base_url, parent_url, recursion_level,
              consumer, line, column, name):
        """
        Initialize internal data.
        """
        self.base_ref = base_ref
        # note that self.base_url must not be modified
        self.base_url = base_url
        self.parent_url = parent_url
        self.recursion_level = recursion_level
        self.consumer = consumer
        self.line = line
        self.column = column
        self.name = name
        if self.base_ref:
            assert not linkcheck.url.url_needs_quoting(self.base_ref), \
                   "unquoted %r" % self.base_ref
        if self.parent_url:
            assert not linkcheck.url.url_needs_quoting(self.parent_url), \
                   "unquoted %r" % self.parent_url
        url = linkcheck.checker.absolute_url(base_url, base_ref, parent_url)
        # assume file link if no scheme is found
        self.scheme = url.split(":", 1)[0] or "file"

    def reset (self):
        """
        Reset all variables to default values.
        """
        # self.url is constructed by self.build_url() out of base_url
        # and (base_ref or parent) as an absolute and normed url.
        # This is the real url used for checking, so it is also
        # referred to as the 'real url'.
        self.url = None
        # a split version of url for convenience
        self.urlparts = None
        # the anchor part of url
        self.anchor = None
        # the result message string and flag
        self.result = u""
        self.has_result = False
        # cached or not
        self.cached = False
        # valid or not
        self.valid = True
        # list of warnings (without duplicates)
        self.warnings = linkcheck.containers.SetList()
        # list of infos (without duplicates)
        self.info = linkcheck.containers.SetList()
        # download time
        self.dltime = -1
        # download size
        self.dlsize = -1
        # check time
        self.checktime = 0
        # connection object
        self.url_connection = None
        # data of url content
        self.data = None
        # whether data is filled
        self.has_content = False
        # cache keys, set by build_url() calling set_cache_keys()
        self.cache_url_key = None
        self.cache_content_key = None
        # extern flags (is_extern, is_strict), both enabled by default
        self.extern = (1, 1)
        # whether the result should be cached
        self.caching = True

    def set_result (self, msg, valid=True):
        """
        Set result string and validity.
        """
        if self.has_result:
            linkcheck.log.warn(linkcheck.LOG_CHECK,
                "Double result %r (previous %r) for %s", msg, self.result, self)
        else:
            self.has_result = True
        if not isinstance(msg, unicode):
            linkcheck.log.warn(linkcheck.LOG_CHECK,
                "Non-unicode result for %s: %r", self, msg)
        elif not msg:
            linkcheck.log.warn(linkcheck.LOG_CHECK,
                "Empty result for %s", self)
        self.result = msg
        self.valid = valid

    def is_parseable (self):
        """
        Return True iff content of this url is parseable.
        """
        return False

    def is_html (self):
        """
        Return True iff content of this url is HTML formatted.
        """
        return False

    def is_http (self):
        """
        Return True for http:// URLs.
        """
        return False

    def is_file (self):
        """
        Return True for file:// URLs.
        """
        return False

    def add_warning (self, s, tag=None):
        """
        Add a warning string.
        """
        self.warnings.append((tag, s))

    def add_info (self, s):
        """
        Add an info string.
        """
        self.info.append(s)

    def copy_from_cache (self, cache_data):
        """
        Fill attributes from cache data.
        """
        self.result = cache_data["result"]
        self.warnings.extend(cache_data["warnings"])
        self.info.extend(cache_data["info"])
        self.valid = cache_data["valid"]
        self.dltime = cache_data["dltime"]
        self.dlsize = cache_data["dlsize"]
        self.cached = True

    def get_cache_data (self):
        """
        Return all data values that should be put in the cache.
        """
        return {"result": self.result,
                "warnings": self.warnings,
                "info": self.info,
                "valid": self.valid,
                "dltime": self.dltime,
                "dlsize": self.dlsize,
               }

    def set_cache_keys (self):
        """
        Set keys for URL checking and content recursion.
        """
        # remove anchor from the content cache key since we assume
        # URLs with different anchors have the same content
        self.cache_content_key = urlparse.urlunsplit(self.urlparts[:4]+[u''])
        assert isinstance(self.cache_content_key, unicode), self
        linkcheck.log.debug(linkcheck.LOG_CACHE, "Content cache key %r",
                            self.cache_content_key)
        # construct the URL cache key
        if self.consumer.config("anchorcaching") and \
           self.consumer.config("anchors"):
            # do not ignore anchor
            parts = self.urlparts[:]
            parts[4] = self.anchor
            self.cache_url_key = urlparse.urlunsplit(parts)
        else:
            # no anchor caching
            self.cache_url_key = self.cache_content_key
        assert isinstance(self.cache_url_key, unicode), self
        linkcheck.log.debug(linkcheck.LOG_CACHE, "URL cache key %r",
                            self.cache_url_key)

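    # A minimal sketch of the key derivation in set_cache_keys(), using a
    # hypothetical URL; exact strings come from urlparse.urlunsplit:
    #   urlparts = [u'http', u'example.com', u'/doc.html', u'', u'top']
    #   cache_content_key -> u'http://example.com/doc.html'  (anchor dropped)
    #   cache_url_key     -> u'http://example.com/doc.html#top'
    #                        (only if anchorcaching and anchors are enabled)
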
    def check_syntax (self):
        """
        Called before self.check(), this function inspects the
        url syntax. Success enables further checking, failure
        immediately logs this url. Syntax checks must not
        use any network resources.

        @return: None; on failure the result message is set and
            self.valid is False
        """
        linkcheck.log.debug(linkcheck.LOG_CHECK, "checking syntax")
        if (self.base_url is None) or \
           (not self.base_url and not self.parent_url):
            self.set_result(_("URL is empty"), valid=False)
            return
        try:
            self.build_url()
            # check url warnings
            effectiveurl = urlparse.urlunsplit(self.urlparts)
            if self.url != effectiveurl:
                self.add_warning(_("Effective URL %r.") % effectiveurl,
                                 tag="url-effective-url")
                self.url = effectiveurl
        except tuple(linkcheck.checker.ExcSyntaxList), msg:
            self.set_result(linkcheck.strformat.unicode_safe(msg),
                            valid=False)
            return
        self.set_cache_keys()

    def build_url (self):
        """
        Construct self.url and self.urlparts out of the given base
        url information self.base_url, self.parent_url and self.base_ref.
        """
        # norm base url
        base_url, is_idn = linkcheck.url.url_norm(self.base_url)
        if is_idn:
            self.add_warning(_("""URL %r has a unicode domain name which
is not yet widely supported. You should use
the URL %r instead.""") % (self.base_url, base_url),
                             tag="url-unicode-domain")
        elif self.base_url != base_url:
            self.add_warning(
              _("Base URL is not properly normed. Normed URL is %(url)s.") % \
              {'url': base_url}, tag="url-unnormed")
        # make url absolute
        if self.base_ref:
            # use base reference as parent url
            if ":" not in self.base_ref:
                # some websites have a relative base reference
                self.base_ref = urljoin(self.parent_url, self.base_ref,
                                        self.scheme)
            self.url = urljoin(self.base_ref, base_url, self.scheme)
        elif self.parent_url:
            # strip the parent url query and anchor
            urlparts = list(urlparse.urlsplit(self.parent_url))
            urlparts[3] = urlparts[4] = ""
            parent_url = urlparse.urlunsplit(urlparts)
            self.url = urljoin(parent_url, base_url, self.scheme)
        else:
            self.url = base_url
        # note: urljoin can denormalize the url path, so norm it again
        urlparts = list(urlparse.urlsplit(self.url))
        if urlparts[2]:
            urlparts[2] = linkcheck.url.collapse_segments(urlparts[2])
        self.url = urlparse.urlunsplit(urlparts)
        # split into (modifiable) list
        self.urlparts = linkcheck.strformat.url_unicode_split(self.url)
        # and unsplit again
        self.url = urlparse.urlunsplit(self.urlparts)
        # check userinfo@host:port syntax
        self.userinfo, host = urllib.splituser(self.urlparts[1])
        # set host lowercase
        if self.userinfo:
            self.urlparts[1] = "%s@%s" % (self.userinfo, host.lower())
        else:
            self.urlparts[1] = host.lower()
        # save anchor for later checking
        self.anchor = self.urlparts[4]
        self.host, self.port = urllib.splitport(host)
        if self.port is not None:
            if not linkcheck.url.is_numeric_port(self.port):
                raise linkcheck.LinkCheckerError, \
                    _("URL has invalid port %r") % str(self.port)
            self.port = int(self.port)

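    # A rough sketch of how build_url() resolves a link, with hypothetical
    # values; the actual results come from urljoin() and the norming helpers:
    #   parent_url = u"http://example.com/dir/page.html?q=1#frag"
    #   base_url   = u"../other.html"
    # Query and anchor are stripped from the parent, then joined:
    #   self.url -> u"http://example.com/other.html"
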
    def check (self):
        """
        Main check function for checking this URL.
        """
        if self.consumer.config("trace"):
            linkcheck.trace.trace_on()
        try:
            self.local_check()
            self.consumer.checked(self)
        except (socket.error, select.error):
            self.consumer.interrupted(self)
            # on Unix, ctrl-c can raise
            # error: (4, 'Interrupted system call')
            etype, value = sys.exc_info()[:2]
            # compare the error number, not the exception class
            if value[0] == errno.EINTR:
                raise KeyboardInterrupt, value
            else:
                raise
        except KeyboardInterrupt:
            self.consumer.interrupted(self)
            raise
        except:
            self.consumer.interrupted(self)
            linkcheck.checker.internal_error()

    def add_country_info (self):
        """
        Try to ask the GeoIP database for country info.
        """
        country = self.consumer.get_country_name(self.host)
        if country is not None:
            self.add_info(_("URL is located in %s.") % _(country))

    def local_check (self):
        """
        Local check function; can be overridden in subclasses.
        """
        linkcheck.log.debug(linkcheck.LOG_CHECK, "Checking %s", self)
        if self.recursion_level and self.consumer.config('wait'):
            linkcheck.log.debug(linkcheck.LOG_CHECK,
                                "sleeping for %d seconds",
                                self.consumer.config('wait'))
            time.sleep(self.consumer.config('wait'))
        t = time.time()
        self.set_extern(self.url)
        if self.extern[0] and self.extern[1]:
            self.add_info(_("Outside of domain filter, checked only syntax."))
            return

        # check connection
        linkcheck.log.debug(linkcheck.LOG_CHECK, "checking connection")
        try:
            self.check_connection()
            self.add_country_info()
            if self.consumer.config("anchors"):
                self.check_anchors()
        except tuple(linkcheck.checker.ExcList):
            value = self.handle_exception()
            # make nicer error msg for unknown hosts
            if isinstance(value, socket.error) and value[0] == -2:
                value = _('Hostname not found')
            # make nicer error msg for bad status line
            if isinstance(value, linkcheck.httplib2.BadStatusLine):
                value = _('Bad HTTP response %r') % str(value)
            self.set_result(linkcheck.strformat.unicode_safe(value),
                            valid=False)

        # check content
        warningregex = self.consumer.config("warningregex")
        if warningregex and self.valid:
            linkcheck.log.debug(linkcheck.LOG_CHECK, "checking content")
            try:
                self.check_content(warningregex)
            except tuple(linkcheck.checker.ExcList):
                value = self.handle_exception()
                self.set_result(linkcheck.strformat.unicode_safe(value),
                                valid=False)

        self.checktime = time.time() - t
        # check recursion
        try:
            if self.allows_recursion():
                self.parse_url()
            # check content size
            self.check_size()
        except tuple(linkcheck.checker.ExcList):
            value = self.handle_exception()
            self.add_warning(_("could not get content: %r") % str(value),
                             tag="url-error-getting-content")
        # close connection
        self.close_connection()

    def close_connection (self):
        """
        Close an opened url connection.
        """
        if self.url_connection is None:
            # no connection is open
            return
        try:
            self.url_connection.close()
        except:
            # ignore close errors
            pass
        self.url_connection = None

    def handle_exception (self):
        """
        An exception occurred. Log it and set the cache flag.
        """
        etype, value, tb = sys.exc_info()
        linkcheck.log.debug(linkcheck.LOG_CHECK, "exception %s",
                            traceback.format_tb(tb))
        # note: etype must be the exact class, not a subclass
        if (etype in linkcheck.checker.ExcNoCacheList) or \
           (etype == socket.error and value[0] == errno.EBADF) or \
           not value:
            # EBADF occurs when operating on an already closed socket
            self.caching = False
        return value

    def check_connection (self):
        """
        The basic connection check uses urllib2.urlopen to initialize
        a connection object.
        """
        self.url_connection = urllib2.urlopen(self.url)

    def allows_recursion (self):
        """
        Return True iff we can recurse into the url's content.
        """
        linkcheck.log.debug(linkcheck.LOG_CHECK,
                            "checking recursion of %r ...", self.url)
        # Test self.valid before self.is_parseable().
        if not self.valid:
            linkcheck.log.debug(linkcheck.LOG_CHECK, "... no, invalid.")
            return False
        if not self.is_parseable():
            linkcheck.log.debug(linkcheck.LOG_CHECK, "... no, not parseable.")
            return False
        if not self.can_get_content():
            linkcheck.log.debug(linkcheck.LOG_CHECK,
                                "... no, cannot get content.")
            return False
        if self.consumer.config("recursionlevel") >= 0 and \
           self.recursion_level >= self.consumer.config("recursionlevel"):
            linkcheck.log.debug(linkcheck.LOG_CHECK,
                                "... no, maximum recursion level reached.")
            return False
        if self.extern[0]:
            linkcheck.log.debug(linkcheck.LOG_CHECK, "... no, extern.")
            return False
        if not self.content_allows_robots():
            linkcheck.log.debug(linkcheck.LOG_CHECK, "... no, robots.")
            return False
        linkcheck.log.debug(linkcheck.LOG_CHECK, "... yes, recursion.")
        return True

    def content_allows_robots (self):
        """
        Return True if the content of this URL allows robots to
        search it for recursive links.
        """
        if not self.is_html():
            return True
        if not (self.is_http() or self.is_file()):
            return True
        h = linkcheck.linkparse.MetaRobotsFinder(self.get_content())
        p = linkcheck.HtmlParser.htmlsax.parser(h)
        h.parser = p
        p.feed(self.get_content())
        p.flush()
        # break the cyclic reference between parser and handler
        h.parser = None
        p.handler = None
        return h.follow

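    # A minimal sketch of what content_allows_robots() reacts to; this
    # hypothetical HTML forbids following links via a meta robots tag:
    #   <html><head>
    #   <meta name="robots" content="nofollow">
    #   </head><body>...</body></html>
    # With such content, h.follow is expected to be False.
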
    def check_anchors (self):
        """
        If URL was valid and an HTML resource, check the anchors and
        log a warning when an anchor was not found.
        """
        if not (self.valid and self.anchor and self.is_html() and
                self.can_get_content()):
            # do not bother
            return
        linkcheck.log.debug(linkcheck.LOG_CHECK, "checking anchor %r",
                            self.anchor)
        h = linkcheck.linkparse.LinkFinder(self.get_content(),
                              tags={'a': [u'name'], None: [u'id']})
        p = linkcheck.HtmlParser.htmlsax.parser(h)
        h.parser = p
        p.feed(self.get_content())
        p.flush()
        h.parser = None
        p.handler = None
        for cur_anchor, line, column, name, base in h.urls:
            if cur_anchor == self.anchor:
                return
        self.add_warning(_("Anchor #%s not found.") % self.anchor,
                         tag="url-anchor-not-found")

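    # Sketch of the anchor match in check_anchors(), with hypothetical data:
    # for the URL http://example.com/doc.html#section2, self.anchor is
    # u'section2' and the check succeeds if the content contains e.g.
    #   <a name="section2">...</a>   or   <h2 id="section2">...</h2>
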
    def set_extern (self, url):
        """
        Match URL against extern and intern link patterns. If no pattern
        matches, the URL is extern. Sets self.extern to a tuple (bool,
        bool) with content (is_extern, is_strict).

        @return: None
        """
        for entry in self.consumer.config("externlinks"):
            match = entry['pattern'].search(url)
            if (entry['negate'] and not match) or \
               (match and not entry['negate']):
                linkcheck.log.debug(linkcheck.LOG_CHECK, "Extern URL %r", url)
                self.extern = (1, entry['strict'])
                return
        for entry in self.consumer.config("internlinks"):
            match = entry['pattern'].search(url)
            if (entry['negate'] and not match) or \
               (match and not entry['negate']):
                linkcheck.log.debug(linkcheck.LOG_CHECK, "Intern URL %r", url)
                self.extern = (0, 0)
                return
        linkcheck.log.debug(linkcheck.LOG_CHECK,
                            "Explicit extern URL %r", url)
        self.extern = (1, 0)
        return

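    # The link pattern entries used by set_extern() are assumed to look
    # roughly like this (hypothetical values, shape inferred from the
    # keys accessed above):
    #   {'pattern': re.compile(r'^https?://example\.com/'),
    #    'negate': 0, 'strict': 1}
    # A URL matching such an externlinks entry yields self.extern == (1, 1).
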
    def can_get_content (self):
        """
        Indicate whether get_content() can be called for this url.
        """
        return True

    def get_content (self):
        """
        Return the downloaded content of this URL, reading and caching
        it on first access.

        Precondition: url_connection is an opened URL connection.
        """
        if not self.has_content:
            t = time.time()
            self.data = self.url_connection.read()
            self.dltime = time.time() - t
            self.dlsize = len(self.data)
            self.has_content = True
        return self.data

    def check_content (self, warningregex):
        """
        If a warning expression was given, call this function to check it
        against the content of this url.
        """
        if not self.can_get_content():
            return
        match = warningregex.search(self.get_content())
        if match:
            self.add_warning(_("Found %r in link contents.") % match.group(),
                             tag="url-warnregex-found")

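    # Sketch of a warningregex as consumed by check_content(); a compiled
    # regular expression from the configuration, hypothetical pattern:
    #   warningregex = re.compile(r"(404 Not Found|Oracle Error)")
    # If the downloaded page body contains "404 Not Found", the warning
    # "Found '404 Not Found' in link contents." is added.
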
    def check_size (self):
        """
        If a maximum size was given, call this function to check it
        against the content size of this url.
        """
        maxbytes = self.consumer.config("warnsizebytes")
        if maxbytes is not None and self.dlsize >= maxbytes:
            self.add_warning(_("Content size %s is larger than %s.") % \
                             (linkcheck.strformat.strsize(self.dlsize),
                              linkcheck.strformat.strsize(maxbytes)),
                             tag="url-content-too-large")

    def parse_url (self):
        """
        Parse url content and search for recursive links.
        Default parse type is html.
        """
        self.parse_html()

    def get_user_password (self):
        """
        Get tuple (user, password) from configured authentication.
        Both user and password can be None if not specified.
        """
        for auth in self.consumer.config("authentication"):
            if auth['pattern'].match(self.url):
                return auth['user'], auth['password']
        return None, None

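    # The authentication entries are assumed to have this shape
    # (hypothetical values, keys inferred from the lookups above):
    #   {'pattern': re.compile(r'^http://intranet\.example\.com/'),
    #    'user': u'guest', 'password': u'secret'}
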
    def parse_html (self):
        """
        Parse the HTML content and search for URLs to check.
        Found URLs are added to the URL queue.
        """
        linkcheck.log.debug(linkcheck.LOG_CHECK, "Parsing HTML %s", self)
        h = linkcheck.linkparse.LinkFinder(self.get_content())
        p = linkcheck.HtmlParser.htmlsax.parser(h)
        h.parser = p
        p.feed(self.get_content())
        p.flush()
        h.parser = None
        p.handler = None
        for url, line, column, name, codebase in h.urls:
            if codebase:
                base_ref = codebase
            else:
                base_ref = h.base_ref
            url_data = linkcheck.checker.get_url_from(url,
                  self.recursion_level+1, self.consumer, parent_url=self.url,
                  base_ref=base_ref, line=line, column=column, name=name,
                  cmdline=False)
            self.consumer.append_url(url_data)

    def parse_opera (self):
        """
        Parse an Opera bookmark file.
        """
        linkcheck.log.debug(linkcheck.LOG_CHECK,
                            "Parsing Opera bookmarks %s", self)
        name = ""
        lineno = 0
        lines = self.get_content().splitlines()
        for line in lines:
            lineno += 1
            line = line.strip()
            if line.startswith("NAME="):
                name = line[5:]
            elif line.startswith("URL="):
                url = line[4:]
                if url:
                    url_data = linkcheck.checker.get_url_from(url,
                              self.recursion_level+1, self.consumer,
                              parent_url=self.url, line=lineno, name=name,
                              cmdline=False)
                    self.consumer.append_url(url_data)
                name = ""

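    # Excerpt of the Opera bookmark format handled above (hypothetical
    # entry); only the NAME= and URL= lines are interpreted here:
    #   #URL
    #   NAME=Example site
    #   URL=http://example.com/
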
    def parse_text (self):
        """
        Parse a text file with one url per line; comment and blank
        lines are ignored.
        """
        linkcheck.log.debug(linkcheck.LOG_CHECK, "Parsing text %s", self)
        lineno = 0
        for line in self.get_content().splitlines():
            lineno += 1
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            url_data = linkcheck.checker.get_url_from(line,
                              self.recursion_level+1, self.consumer,
                              parent_url=self.url, line=lineno,
                              cmdline=False)
            self.consumer.append_url(url_data)

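    # Example input accepted by parse_text() (hypothetical file content):
    #   # lines starting with '#' are comments and skipped
    #   http://example.com/
    #   ftp://ftp.example.com/pub/
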
    def parse_css (self):
        """
        Parse a CSS file for url() patterns.
        """
        linkcheck.log.debug(linkcheck.LOG_CHECK, "Parsing CSS %s", self)
        lineno = 0
        for line in self.get_content().splitlines():
            lineno += 1
            for mo in linkcheck.linkparse.css_url_re.finditer(line):
                column = mo.start("url")
                url = linkcheck.strformat.unquote(mo.group("url").strip())
                url_data = linkcheck.checker.get_url_from(url,
                             self.recursion_level+1, self.consumer,
                             parent_url=self.url, line=lineno, column=column,
                             cmdline=False)
                self.consumer.append_url(url_data)

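    # Sketch of a CSS line that css_url_re is expected to match
    # (hypothetical stylesheet content):
    #   body { background: url("images/bg.png") no-repeat; }
    # The quoted value is unquoted and then checked as images/bg.png,
    # relative to this stylesheet's URL.
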
    def serialized (self):
        """
        Return serialized url check data as a unicode string.
        """
        sep = linkcheck.strformat.unicode_safe(os.linesep)
        if self.base_url is not None:
            assert isinstance(self.base_url, unicode), self
        if self.parent_url is not None:
            assert isinstance(self.parent_url, unicode), self
        if self.base_ref is not None:
            assert isinstance(self.base_ref, unicode), self
        assert isinstance(self.name, unicode), self
        return sep.join([
            u"%s link" % self.scheme,
            u"base_url=%r" % self.base_url,
            u"parent_url=%r" % self.parent_url,
            u"base_ref=%r" % self.base_ref,
            u"recursion_level=%s" % self.recursion_level,
            u"url_connection=%s" % self.url_connection,
            u"line=%d" % self.line,
            u"column=%d" % self.column,
            u"name=%r" % self.name,
        ])

    def get_intern_pattern (self):
        """
        Get pattern for intern URL matching.

        @return: non-empty regex pattern or None
        @rtype: String or None
        """
        return None

    def __str__ (self):
        """
        Get URL info.

        @return: URL info, encoded with the output logger encoding
        @rtype: string
        """
        s = self.serialized()
        return self.consumer.config('logger').encode(s)

    def __repr__ (self):
        """
        Get URL info.

        @return: URL info
        @rtype: unicode
        """
        return u"<%s >" % self.serialized()