2014-02-28 23:12:34 +00:00
|
|
|
# Copyright (C) 2006-2014 Bastian Kleineidam
|
2006-05-24 22:16:36 +00:00
|
|
|
#
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify
|
|
|
|
|
# it under the terms of the GNU General Public License as published by
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or
|
|
|
|
|
# (at your option) any later version.
|
|
|
|
|
#
|
|
|
|
|
# This program is distributed in the hope that it will be useful,
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
# GNU General Public License for more details.
|
|
|
|
|
#
|
2009-07-24 21:58:20 +00:00
|
|
|
# You should have received a copy of the GNU General Public License along
|
|
|
|
|
# with this program; if not, write to the Free Software Foundation, Inc.,
|
|
|
|
|
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
2007-11-29 07:50:22 +00:00
|
|
|
"""
|
|
|
|
|
URL checking functions.
|
|
|
|
|
"""
|
2014-03-03 22:29:45 +00:00
|
|
|
import copy
|
2014-03-08 18:35:10 +00:00
|
|
|
import time
|
2008-05-09 06:16:03 +00:00
|
|
|
from . import task
|
|
|
|
|
from ..cache import urlqueue
|
2014-03-08 18:35:10 +00:00
|
|
|
from .. import parser
|
2006-05-24 22:16:36 +00:00
|
|
|
|
2014-03-27 16:43:14 +00:00
|
|
|
# Interval (in seconds) at which each check thread wakes up from the queue
# poll to check whether it has been told to stop.
QUEUE_POLL_INTERVALL_SECS = 1.0
|
|
|
|
|
|
2006-05-24 22:16:36 +00:00
|
|
|
|
2020-05-16 19:19:42 +00:00
|
|
|
def check_urls(urlqueue, logger):
    """Check all queued URLs sequentially, without threading.

    Every entry taken from the queue is acknowledged via task_done(),
    even when checking it raises.
    """
    while True:
        if urlqueue.empty():
            break
        entry = urlqueue.get()
        try:
            check_url(entry, logger)
        finally:
            urlqueue.task_done(entry)
|
|
|
|
|
|
|
|
|
|
|
2014-03-08 18:35:10 +00:00
|
|
|
def check_url(url_data, logger):
    """Check a single URL and log the outcome.

    Results are looked up in (and added to) the aggregate's result
    cache, so an already-checked URL is served from the cache with its
    location fields adjusted to the current occurrence.
    """
    if url_data.has_result:
        # The URL object already carries a result; just log it.
        logger.log_url(url_data.to_wire())
        return
    cache = url_data.aggregate.result_cache
    key = url_data.cache_url
    cached = cache.get_result(key)
    if cached is None:
        # No cached entry: perform the actual check and time it.
        check_start = time.time()
        try:
            url_data.check()
            do_parse = url_data.check_content()
            url_data.checktime = time.time() - check_start
            # Store the fresh result in the cache, including all
            # redirect aliases of this URL.
            result = url_data.to_wire()
            cache.add_result(key, result)
            for alias in url_data.aliases:
                cache.add_result(alias, result)
            # Parse content recursively.
            # XXX this could add new warnings which should be cached.
            if do_parse:
                parser.parse_url(url_data)
        finally:
            # Close/release a possibly open connection.
            url_data.close_connection()
    else:
        # Reuse the cached result, but adjust the fields that depend on
        # where this occurrence of the URL was found.
        result = copy.copy(cached)
        result.parent_url = url_data.parent_url
        result.base_ref = url_data.base_ref or ""
        result.base_url = url_data.base_url or ""
        result.line = url_data.line
        result.column = url_data.column
        result.level = url_data.recursion_level
        result.name = url_data.name
    logger.log_url(result)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class Checker(task.LoggedCheckedTask):
    """Thread that repeatedly takes URLs from the queue and checks them."""

    def __init__(self, urlqueue, logger, add_request_session):
        """Remember queue, logger and the session-factory callback."""
        super().__init__(logger)
        self.urlqueue = urlqueue
        # Remember the initial thread name so it can be restored after
        # each checked URL.
        self.origname = self.name
        self.add_request_session = add_request_session

    def run_checked(self):
        """Keep checking queued URLs until this thread is stopped."""
        # Construct a per-thread HTTP/S requests session.
        self.add_request_session()
        while not self.stopped(0):
            self.check_url()

    def check_url(self):
        """Fetch one entry from the queue and check it, if available."""
        try:
            entry = self.urlqueue.get(timeout=QUEUE_POLL_INTERVALL_SECS)
            if entry is None:
                return
            try:
                self.check_url_data(entry)
            finally:
                self.urlqueue.task_done(entry)
            self.name = self.origname
        except urlqueue.Empty:
            # Queue stayed empty for the whole poll interval; try again.
            pass
        except Exception:
            self.internal_error()

    def check_url_data(self, url_data):
        """Check one URL data instance, tagging the thread name with it."""
        self.name = "CheckThread-%s" % (url_data.url or "")
        check_url(url_data, self.logger)
|