# -*- coding: iso-8859-1 -*-
"""Base URL handler"""
# Copyright (C) 2000-2004 Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

import sys, re, urlparse, urllib2, time, traceback, socket, select, i18n
from urllib import splituser, splitport, unquote
from linkcheck import DNS, LinkCheckerError, getLinkPat, httplib2
from linkcheck.parser import htmlsax
DNS.DiscoverNameServers()

import Config, StringUtil, test_support
from linkparse import LinkFinder, MetaRobotsFinder, css_url_re
from debug import *

ws_at_start_or_end = re.compile(r"(^\s+)|(\s+$)").search

# helper function for internal errors
def internal_error ():
    print >>sys.stderr, i18n._("""\n********** Oops, I did it again. *************

You have found an internal error in LinkChecker. Please write a bug report
at http://sourceforge.net/tracker/?func=add&group_id=1913&atid=101913
or send mail to %s and include the following information:
1) The URL or file you are testing
2) Your command line arguments and/or configuration.
3) The system information below.

If you leave out some information because it is too private to you, that
is ok. I will try to help you nonetheless (but you have to give me
*something* I can work with ;).
""") % Config.Email
    etype, value = sys.exc_info()[:2]
    print >>sys.stderr, etype, value
    traceback.print_exc()
    print_app_info()
    print >>sys.stderr, i18n._("\n******** LinkChecker internal error, bailing out ********")
    sys.exit(1)

def print_app_info ():
    import os
    print >>sys.stderr, i18n._("System info:")
    print >>sys.stderr, Config.App
    print >>sys.stderr, "Python %s on %s" % (sys.version, sys.platform)
    for key in ("LC_ALL", "LC_MESSAGES", "http_proxy", "ftp_proxy"):
        value = os.getenv(key)
        if value is not None:
            print >>sys.stderr, key, "=", repr(value)

def get_absolute_url (urlName, baseRef, parentName):
    """Search for the absolute url to detect the link type. This does not
       join any url fragments together! Returns the url in lower case to
       simplify urltype matching."""
    if urlName and ":" in urlName:
        return urlName.lower()
    elif baseRef and ":" in baseRef:
        return baseRef.lower()
    elif parentName and ":" in parentName:
        return parentName.lower()
    return ""

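# A minimal usage sketch (hypothetical values, not from the test suite):
#   get_absolute_url("HTTP://Example.COM/a", None, None) -> "http://example.com/a"
#   get_absolute_url("a.html", None, "http://example.com/") -> "http://example.com/"
#   get_absolute_url("a.html", None, None) -> ""
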
# we catch these exceptions, all other exceptions are internal
# or system errors
ExcList = [
    IOError,
    ValueError, # from httplib.py
    LinkCheckerError,
    DNS.Error,
    socket.timeout,
    socket.error,
    select.error,
]

if hasattr(socket, "sslerror"):
    ExcList.append(socket.sslerror)

# regular expression for port numbers; anchored so that "80x" is rejected
is_valid_port = re.compile(r"\d+$").match

def GetUrlDataFrom (urlName, recursionLevel, config, parentName=None,
                    baseRef=None, line=0, column=0, name=None,
                    cmdline=None):
    from FileUrlData import FileUrlData
    from IgnoredUrlData import IgnoredUrlData, ignored_schemes_re
    from FtpUrlData import FtpUrlData
    from GopherUrlData import GopherUrlData
    from HttpUrlData import HttpUrlData
    from HttpsUrlData import HttpsUrlData
    from MailtoUrlData import MailtoUrlData
    from TelnetUrlData import TelnetUrlData
    from NntpUrlData import NntpUrlData
    url = get_absolute_url(urlName, baseRef, parentName)
    # test scheme
    if url.startswith("http:"):
        klass = HttpUrlData
    elif url.startswith("ftp:"):
        klass = FtpUrlData
    elif url.startswith("file:"):
        klass = FileUrlData
    elif url.startswith("telnet:"):
        klass = TelnetUrlData
    elif url.startswith("mailto:"):
        klass = MailtoUrlData
    elif url.startswith("gopher:"):
        klass = GopherUrlData
    elif url.startswith("https:"):
        klass = HttpsUrlData
    elif url.startswith("nntp:") or \
         url.startswith("news:") or \
         url.startswith("snews:"):
        klass = NntpUrlData
    # application specific links are ignored
    elif ignored_schemes_re.search(url):
        klass = IgnoredUrlData
    # assume local file
    else:
        klass = FileUrlData
    if config['strict'] and cmdline and \
       not (config['internlinks'] or config['externlinks']):
        # set automatic intern/extern stuff if no filter was given
        set_intern_url(url, klass, config)
    return klass(urlName, recursionLevel, config, parentName, baseRef,
                 line=line, column=column, name=name)

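# Illustrative dispatch (assumed config object): GetUrlDataFrom picks the
# handler class from the scheme of the resolved absolute url, e.g.
#   GetUrlDataFrom("https://example.com/", 0, config) -> HttpsUrlData instance
#   GetUrlDataFrom("news:comp.lang.python", 0, config) -> NntpUrlData instance
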
def set_intern_url (url, klass, config):
    """Precondition: config['strict'] is true (i.e. strict checking) and
       the recursion level is zero (i.e. the url was given on the command
       line)"""
    from FileUrlData import FileUrlData
    from FtpUrlData import FtpUrlData
    from HttpUrlData import HttpUrlData
    from HttpsUrlData import HttpsUrlData
    if klass == FileUrlData:
        debug(BRING_IT_ON, "Add intern pattern ^file:")
        config['internlinks'].append(getLinkPat("^file:"))
    elif klass in [HttpUrlData, HttpsUrlData, FtpUrlData]:
        domain = urlparse.urlsplit(url)[1]
        if domain:
            domain = "://%s" % re.escape(domain)
            debug(BRING_IT_ON, "Add intern domain", domain)
            # add scheme separator to the link pattern
            config['internlinks'].append(getLinkPat(domain))

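# For example (hypothetical url): set_intern_url("http://www.example.com/x",
# HttpUrlData, config) appends the intern pattern "://www\.example\.com",
# so further links count as internal only if they stay on that host.
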
class UrlData (object):
    """Represents a URL with additional information like validity etc."""

    def __init__ (self,
                  urlName,
                  recursionLevel,
                  config,
                  parentName = None,
                  baseRef = None,
                  line = 0,
                  column = 0,
                  name = ""):
        self.urlName = urlName
        self.anchor = None
        self.recursionLevel = recursionLevel
        self.config = config
        self.parentName = parentName
        self.baseRef = baseRef
        self.errorString = i18n._("Error")
        self.validString = i18n._("Valid")
        self.warningString = None
        self.infoString = None
        self.valid = True
        self.url = None
        self.urlparts = None
        self.line = line
        self.column = column
        self.name = name
        self.dltime = -1
        self.dlsize = -1
        self.checktime = 0
        self.cached = False
        self.urlConnection = None
        self.extern = (1, 0)
        self.data = None
        self.has_content = False
        url = get_absolute_url(self.urlName, self.baseRef, self.parentName)
        # assume a file link if no scheme is found
        self.scheme = url.split(":", 1)[0] or "file"

    def setError (self, s):
        self.valid = False
        self.errorString = i18n._("Error")+": "+s

    def setValid (self, s):
        self.valid = True
        self.validString = i18n._("Valid")+": "+s

    def isParseable (self):
        return False

    def isHtml (self):
        return False

    def isHttp (self):
        return False

    def isFile (self):
        return False

    def setWarning (self, s):
        if self.warningString:
            self.warningString += "\n"+s
        else:
            self.warningString = s

    def setInfo (self, s):
        if self.infoString:
            self.infoString += "\n"+s
        else:
            self.infoString = s

    def copyFromCache (self, cacheData):
        """fill attributes from cache data"""
        self.errorString = cacheData["errorString"]
        self.validString = cacheData["validString"]
        if self.warningString:
            if cacheData["warningString"]:
                self.warningString += "\n"+cacheData["warningString"]
        else:
            self.warningString = cacheData["warningString"]
        self.infoString = cacheData["infoString"]
        self.valid = cacheData["valid"]
        self.dltime = cacheData["dltime"]

    def getCacheData (self):
        """return all data values that should be put in the cache"""
        return {"errorString": self.errorString,
                "validString": self.validString,
                "warningString": self.warningString,
                "infoString": self.infoString,
                "valid": self.valid,
                "dltime": self.dltime,
               }

    def buildUrl (self):
        if self.baseRef:
            if ":" not in self.baseRef:
                self.baseRef = urlparse.urljoin(self.parentName, self.baseRef)
            self.url = urlparse.urljoin(self.baseRef, self.urlName)
        elif self.parentName:
            self.url = urlparse.urljoin(self.parentName, self.urlName)
        else:
            self.url = self.urlName
        # unquote url
        self.url = unquote(self.url)
        # split into (modifiable) list
        self.urlparts = list(urlparse.urlsplit(self.url))
        # check userinfo@host:port syntax
        self.userinfo, host = splituser(self.urlparts[1])
        x, port = splitport(host)
        if port is not None and not is_valid_port(port):
            raise LinkCheckerError(i18n._("URL has invalid port number %r")
                                   % str(port))
        # set host lowercase and without userinfo
        self.urlparts[1] = host.lower()
        # save anchor for later checking
        self.anchor = self.urlparts[4]

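    # A worked example for buildUrl (hypothetical inputs):
    #   urlName="b.html", parentName="http://example.com/a/", baseRef=None
    #   -> self.url == "http://example.com/a/b.html", self.anchor == ""
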
    def logMe (self):
        debug(BRING_IT_ON, "logging url")
        self.config.incrementLinknumber()
        if self.config["verbose"] or not self.valid or \
           (self.warningString and self.config["warnings"]):
            self.config.log_newUrl(self)

    def check (self):
        try:
            self._check()
        except KeyboardInterrupt:
            raise
        except (socket.error, select.error):
            # on Unix, ctrl-c can raise
            # error: (4, 'Interrupted system call')
            etype, value = sys.exc_info()[:2]
            # check the errno on the exception value, not on the type
            if value[0] != 4:
                raise
        except test_support.Error:
            raise
        except:
            internal_error()

    def _check (self):
        debug(BRING_IT_ON, "Checking", self)
        if self.recursionLevel and self.config['wait']:
            debug(BRING_IT_ON, "sleeping for", self.config['wait'], "seconds")
            time.sleep(self.config['wait'])
        t = time.time()
        if not self.checkCache():
            return
        # apply filter
        debug(BRING_IT_ON, "extern =", self.extern)
        if self.extern[0] and (self.config["strict"] or self.extern[1]):
            self.setWarning(
                  i18n._("outside of domain filter, checked only syntax"))
            self.logMe()
            return

        # check connection
        debug(BRING_IT_ON, "checking connection")
        try:
            self.checkConnection()
            if self.cached:
                return
            if self.config["anchors"]:
                self.checkAnchors()
        except tuple(ExcList):
            etype, evalue, etb = sys.exc_info()
            debug(HURT_ME_PLENTY, "exception", traceback.format_tb(etb))
            # make a nicer error msg for unknown hosts
            if isinstance(evalue, socket.error) and evalue[0]==-2:
                evalue = i18n._('Hostname not found')
            # make a nicer error msg for a bad status line
            if isinstance(evalue, httplib2.BadStatusLine):
                evalue = i18n._('Bad HTTP response %r')%str(evalue)
            self.setError(str(evalue))

        # check content
        warningregex = self.config["warningregex"]
        if warningregex and self.valid:
            debug(BRING_IT_ON, "checking content")
            try:
                self.checkContent(warningregex)
            except tuple(ExcList):
                value, tb = sys.exc_info()[1:]
                debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
                self.setError(str(value))

        self.checktime = time.time() - t
        # check recursion
        debug(BRING_IT_ON, "checking recursion")
        try:
            if self.allowsRecursion():
                self.parseUrl()
            # check content size
            self.checkSize()
        except tuple(ExcList):
            value, tb = sys.exc_info()[1:]
            debug(HURT_ME_PLENTY, "exception", traceback.format_tb(tb))
            self.setError(i18n._("could not parse content: %r")%str(value))
        # close
        self.closeConnection()
        self.logMe()
        debug(BRING_IT_ON, "caching")
        self.putInCache()

    def checkSyntax (self):
        debug(BRING_IT_ON, "checking syntax")
        if not self.urlName:
            self.setError(i18n._("URL is null or empty"))
            self.logMe()
            return False
        if ws_at_start_or_end(self.urlName):
            self.setError(i18n._("URL has whitespace at beginning or end"))
            self.logMe()
            return False
        try:
            self.buildUrl()
            self.extern = self._getExtern()
        except LinkCheckerError, msg:
            self.setError(str(msg))
            self.logMe()
            return False
        return True

    def checkCache (self):
        debug(BRING_IT_ON, "checking cache")
        for key in self.getCacheKeys():
            if self.config.urlCache_has_key(key):
                self.copyFromCache(self.config.urlCache_get(key))
                self.cached = True
                self.logMe()
                return False
        return True

    def closeConnection (self):
        # brute force closing
        if self.urlConnection is not None:
            try:
                self.urlConnection.close()
            except:
                pass
            # release variable for garbage collection
            self.urlConnection = None

    def putInCache (self):
        if not self.cached:
            data = self.getCacheData()
            for key in self.getCacheKeys():
                self.config.urlCache_set(key, data)
                self.config.urlSeen_set(key)
            self.cached = True

    def getCacheKeys (self):
        key = self.getCacheKey()
        if key is None:
            return []
        return [key]

    def isCached (self):
        key = self.getCacheKey()
        return self.cached or self.config.urlSeen_has_key(key)

    def getCacheKey (self):
        # note: the host is already lowercase
        if self.urlparts:
            if self.config["anchorcaching"]:
                # do not ignore the anchor
                return urlparse.urlunsplit(self.urlparts)
            else:
                # remove the anchor from the cache key
                return urlparse.urlunsplit(self.urlparts[:4]+[''])
        return None

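    # Example (hypothetical url): for "http://example.com/p#top" the cache key
    # is "http://example.com/p#top" with anchorcaching enabled, and
    # "http://example.com/p" without it.
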
    def checkConnection (self):
        self.urlConnection = urllib2.urlopen(self.url)

    def allowsRecursion (self):
        # note: test self.valid before self.isParseable()
        return self.valid and \
               self.isParseable() and \
               self.hasContent() and \
               not self.isCached() and \
               (self.config["recursionlevel"] < 0 or
                self.recursionLevel < self.config["recursionlevel"]) and \
               not self.extern[0] and self.contentAllowsRobots()

    def contentAllowsRobots (self):
        if not self.isHtml():
            return True
        if not (self.isHttp() or self.isFile()):
            return True
        # run the content through the meta robots parser
        h = MetaRobotsFinder(self.getContent())
        p = htmlsax.parser(h)
        h.parser = p
        p.feed(self.getContent())
        p.flush()
        # break the handler/parser reference cycle for garbage collection
        h.parser = None
        p.handler = None
        return h.follow

    def checkAnchors (self):
        if not (self.valid and self.anchor and self.isHtml() and
                self.hasContent()):
            # do not bother
            return
        debug(HURT_ME_PLENTY, "checking anchor", self.anchor)
        h = LinkFinder(self.getContent(), tags={'a': ['name'], None: ['id']})
        p = htmlsax.parser(h)
        h.parser = p
        p.feed(self.getContent())
        p.flush()
        h.parser = None
        p.handler = None
        for cur_anchor, line, column, name, base in h.urls:
            if cur_anchor == self.anchor:
                return
        self.setWarning(i18n._("anchor #%s not found") % self.anchor)

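    # E.g. checking "page.html#intro" (hypothetical) warns unless the fetched
    # HTML contains <a name="intro"> or any tag with id="intro".
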
    def _getExtern (self):
        if not (self.config["externlinks"] or self.config["internlinks"]):
            return (0, 0)
        # deny and allow external checking
        Config.debug(HURT_ME_PLENTY, "Url", self.url)
        if self.config["denyallow"]:
            for entry in self.config["externlinks"]:
                Config.debug(HURT_ME_PLENTY, "Extern entry", entry)
                match = entry['pattern'].search(self.url)
                if (entry['negate'] and not match) or \
                   (match and not entry['negate']):
                    return (1, entry['strict'])
            for entry in self.config["internlinks"]:
                Config.debug(HURT_ME_PLENTY, "Intern entry", entry)
                match = entry['pattern'].search(self.url)
                if (entry['negate'] and not match) or \
                   (match and not entry['negate']):
                    return (0, 0)
            return (0, 0)
        else:
            for entry in self.config["internlinks"]:
                Config.debug(HURT_ME_PLENTY, "Intern entry", entry)
                match = entry['pattern'].search(self.url)
                if (entry['negate'] and not match) or \
                   (match and not entry['negate']):
                    return (0, 0)
            for entry in self.config["externlinks"]:
                Config.debug(HURT_ME_PLENTY, "Extern entry", entry)
                match = entry['pattern'].search(self.url)
                if (entry['negate'] and not match) or \
                   (match and not entry['negate']):
                    return (1, entry['strict'])
            return (1, 0)

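    # Sketch of the return value: (extern, strict). With no filters configured
    # everything is intern, i.e. (0, 0); a url matching an externlinks pattern
    # with strict=1 (hypothetical config) yields (1, 1) and is not recursed into.
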
    def hasContent (self):
        """indicate whether getContent() can be called"""
        return True

    def getContent (self):
        """Precondition: urlConnection is an opened URL."""
        if not self.has_content:
            self.has_content = True
            t = time.time()
            self.data = self.urlConnection.read()
            self.dltime = time.time() - t
            self.dlsize = len(self.data)
        return self.data

    def checkContent (self, warningregex):
        """if a warning expression was given, check it against the
           content of this url"""
        if not self.hasContent():
            return
        match = warningregex.search(self.getContent())
        if match:
            self.setWarning(i18n._("Found %r in link contents")%match.group())

    def checkSize (self):
        """if a maximum size was given, check it against the content size
           of this url"""
        maxbytes = self.config["warnsizebytes"]
        if maxbytes is not None and self.dlsize >= maxbytes:
            self.setWarning(i18n._("Content size %s is larger than %s") %
                            (StringUtil.strsize(self.dlsize),
                             StringUtil.strsize(maxbytes)))

    def parseUrl (self):
        # the default parse type is html
        debug(BRING_IT_ON, "Parsing recursively into", self)
        self.parse_html()

    def getUserPassword (self):
        for auth in self.config["authentication"]:
            if auth['pattern'].match(self.url):
                return auth['user'], auth['password']
        return None, None

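    # Assumed shape of a config["authentication"] entry, as used above
    # (hypothetical values):
    #   {'pattern': re.compile(r"^http://example\.com"),
    #    'user': "bob", 'password': "secret"}
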
    def parse_html (self):
        # search for a possible base reference
        h = LinkFinder(self.getContent(), tags={'base': ['href']})
        p = htmlsax.parser(h)
        h.parser = p
        p.feed(self.getContent())
        p.flush()
        h.parser = None
        p.handler = None
        baseRef = None
        if len(h.urls) >= 1:
            baseRef = h.urls[0][0]
            if len(h.urls) > 1:
                self.setWarning(i18n._(
                    "more than one <base> tag found, using only the first one"))
        # now parse the content for all links
        h = LinkFinder(self.getContent())
        p = htmlsax.parser(h)
        h.parser = p
        p.feed(self.getContent())
        p.flush()
        h.parser = None
        p.handler = None
        for s in h.parse_info:
            # the parser had warnings/errors
            self.setWarning(s)
        for url, line, column, name, codebase in h.urls:
            if codebase:
                base = codebase
            else:
                base = baseRef
            debug(NIGHTMARE, "Put url %r in queue" % url)
            self.config.appendUrl(GetUrlDataFrom(url,
                                  self.recursionLevel+1, self.config,
                                  parentName=self.url, baseRef=base,
                                  line=line, column=column, name=name))

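    # Sketch of the handler/parser wiring used above (hypothetical page):
    #   h = LinkFinder('<a href="x.html">x</a>')
    #   p = htmlsax.parser(h); h.parser = p
    #   p.feed('<a href="x.html">x</a>'); p.flush()
    #   h.urls[0][0] == 'x.html'  # (url, line, column, name, base) tuples
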
    def parse_opera (self):
        # parse an opera bookmark file
        name = ""
        lineno = 0
        lines = self.getContent().splitlines()
        for line in lines:
            lineno += 1
            line = line.strip()
            if line.startswith("NAME="):
                name = line[5:]
            elif line.startswith("URL="):
                url = line[4:]
                if url:
                    self.config.appendUrl(GetUrlDataFrom(url,
                        self.recursionLevel+1, self.config, self.url, None,
                        lineno, name))
                name = ""

    def parse_text (self):
        """parse a text file with one url per line; comment and blank
           lines are ignored

           UNUSED and UNTESTED, just use linkchecker `cat file.txt`
        """
        lineno = 0
        for line in self.getContent().splitlines():
            lineno += 1
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            self.config.appendUrl(GetUrlDataFrom(line, self.recursionLevel+1,
                                  self.config, parentName=self.url, line=lineno))

    def parse_css (self):
        """parse a CSS file for url() patterns"""
        lineno = 0
        for line in self.getContent().splitlines():
            lineno += 1
            for mo in css_url_re.finditer(line):
                column = mo.start("url")
                self.config.appendUrl(GetUrlDataFrom(mo.group("url"),
                                      self.recursionLevel+1, self.config,
                                      parentName=self.url, line=lineno,
                                      column=column))

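    # E.g. a CSS line 'body { background: url(images/bg.png) }' (hypothetical)
    # yields one match whose "url" group is "images/bg.png".
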
    def __str__ (self):
        return ("%s link\n"
                "urlname=%s\n"
                "parentName=%s\n"
                "baseRef=%s\n"
                "cached=%s\n"
                "recursionLevel=%s\n"
                "urlConnection=%s\n"
                "line=%s\n"
                "column=%s\n"
                "name=%s" %
                (self.scheme, self.urlName, self.parentName, self.baseRef,
                 self.cached, self.recursionLevel, self.urlConnection,
                 self.line, self.column, self.name))