2003-07-04 14:24:44 +00:00
|
|
|
# -*- coding: iso-8859-1 -*-
|
2001-03-15 01:19:35 +00:00
|
|
|
"""main function module for link checking"""
|
2004-01-03 14:59:33 +00:00
|
|
|
# Copyright (C) 2000-2004 Bastian Kleineidam
|
2001-03-15 01:19:35 +00:00
|
|
|
#
|
2001-05-23 21:20:44 +00:00
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
# (at your option) any later version.
|
2001-03-15 01:19:35 +00:00
|
|
|
#
|
2001-05-23 21:20:44 +00:00
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
2001-03-15 01:19:35 +00:00
|
|
|
#
|
2001-05-23 21:20:44 +00:00
|
|
|
# You should have received a copy of the GNU General Public License
|
|
|
|
|
# along with this program; if not, write to the Free Software
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
2000-04-10 16:58:05 +00:00
|
|
|
|
2004-01-07 21:27:49 +00:00
|
|
|
|
2003-01-22 19:50:13 +00:00
|
|
|
class LinkCheckerError (Exception):
    """Exception type defined by this module for link checking errors.

    It adds nothing to the built-in Exception class; it only provides a
    distinct type that callers can raise and catch.
    """
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2004-01-07 21:27:49 +00:00
|
|
|
|
2004-01-02 23:30:11 +00:00
|
|
|
import time, re, i18n
|
2003-08-11 11:49:30 +00:00
|
|
|
def getLinkPat (arg, strict=False):
    """Build a link pattern matcher for intern/extern links.

    arg is a regular expression string.  A leading '!' is not part of the
    expression: it is stripped and recorded as negate=True.
    strict is stored unchanged in the result.

    Returns a dictionary with the keys "pattern" (the compiled regular
    expression), "negate" and "strict".
    """
    debug(BRING_IT_ON, "Link pattern %r", arg)
    # slicing instead of indexing so an empty arg does not raise
    negate = (arg[0:1] == '!')
    if negate:
        regex = arg[1:]
    else:
        regex = arg
    compiled = re.compile(regex)
    return {"pattern": compiled, "negate": negate, "strict": strict}
|
|
|
|
|
|
2002-12-07 00:45:31 +00:00
|
|
|
# file extensions we can parse recursively
# Maps a content kind to the compiled, case-insensitive regular expression
# that recognizes it.
extensions = {
    "html": re.compile(r'(?i)\.s?html?$'),
    # opera bookmark file; the inline (?i) flag has to be the very first
    # item of the expression (Python 3.11+ rejects it anywhere else) and
    # the dot is escaped so only a literal "opera.adr" is accepted
    "opera": re.compile(r'(?i)^opera\.adr$'),
    "css": re.compile(r'(?i)\.css$'), # CSS stylesheet
#    "text": re.compile(r'(?i)\.(txt|xml|tsv|csv|sgml?|py|java|cc?|cpp|h)$'),
}
|
|
|
|
|
|
2003-01-05 23:07:46 +00:00
|
|
|
import UrlData
|
2003-01-08 08:47:49 +00:00
|
|
|
from debug import *
|
2004-01-04 09:23:00 +00:00
|
|
|
from linkcheck.log import strduration
|
2000-02-26 10:24:46 +00:00
|
|
|
|
2004-01-07 21:27:49 +00:00
|
|
|
|
2002-01-29 21:20:37 +00:00
|
|
|
# main check function
|
2002-05-04 13:27:02 +00:00
|
|
|
def checkUrls (config):
    """Run the link check described by the given configuration object.

    All runtime-dependent options are stored in config, so checkUrls can
    be called more than once with different configurations.  The config
    object supplies the functions to get a new URL (getUrl) and to check
    it (checkUrl).

    While config['status'] is true, a status line is printed whenever more
    than five seconds have passed since the previous one.

    On KeyboardInterrupt the configuration is finished, the output is
    ended, a warning with the number of still active threads is issued
    and the interrupt is raised again.
    """
    config.log_init()
    try:
        began = time.time()
        last_report = began
        while True:
            if not config.hasMoreUrls():
                if config.finished():
                    break
                # active connections are downloading/parsing, so
                # wait a little
                time.sleep(0.1)
            else:
                config.checkUrl(config.getUrl())
            if not config['status']:
                continue
            now = time.time()
            if (now - last_report) > 5:
                printStatus(config, now, began)
                last_report = now
        config.log_endOfOutput()
    except KeyboardInterrupt:
        config.finish()
        config.log_endOfOutput()
        still_running = config.threader.active_threads()
        warn(i18n._("keyboard interrupt; waiting for %d active threads to finish") % still_running)
        raise
|
2004-01-04 09:23:00 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
def printStatus (config, curtime, start_time):
    """Write a one-line progress report to standard error.

    config is read for the size of its url queue (config.urls.qsize()),
    the value stored under 'linknumber' and the number of active threads
    (config.threader.active_threads()).
    curtime and start_time are time values; their difference is formatted
    with strduration and reported as the runtime.
    """
    # imported here because this file has no explicit "import sys"; the
    # name would otherwise have to leak in through a star import
    import sys
    tocheck = config.urls.qsize()
    links = config['linknumber']
    active = config.threader.active_threads()
    duration = strduration(curtime - start_time)
    msg = i18n._("%5d urls queued, %4d links checked, %2d active threads, runtime %s") % \
        (tocheck, links, active, duration)
    # explicit write plus newline gives the same output as the print
    # statement and runs on every Python version
    sys.stderr.write(msg + "\n")
|