mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-03 12:24:46 +00:00
opera bookmark and socket timeout support
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@285 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
637bf3e4aa
commit
f10a3a43b2
6 changed files with 187 additions and 62 deletions
9
README
9
README
|
|
@ -51,16 +51,11 @@ So for example 1.1.5 is the fifth release of the 1.1 development package.
|
|||
Included packages
|
||||
-----------------
|
||||
httpslib from http://home.att.net/~nvsoft1/ssl_wrapper.html
|
||||
DNS see DNS/README
|
||||
DNS from http://pydns.sourceforge.net/
|
||||
fcgi.py and sz_fcgi.py from http://saarland.sz-sb.de/~ajung/sz_fcgi/
|
||||
fintl.py from http://sourceforge.net/snippet/detail.php?type=snippet&id=100059
|
||||
|
||||
Note that the following packages are modified by me:
|
||||
fcgi.py (implemented streamed output)
|
||||
sz_fcgi.py (simplified the code)
|
||||
DNS/Lib.py:566 fixed rdlength name error
|
||||
DNS/Lib.py:105 tuple parameter for Python 1.6 compatibility
|
||||
DNS/Base.py: fixed /etc/resolv.conf parser to cope with empty lines
|
||||
Note that all included packages are modified by me.
|
||||
|
||||
|
||||
Internationalization
|
||||
|
|
|
|||
2
TODO
2
TODO
|
|
@ -1 +1 @@
|
|||
Proxy Authentication
|
||||
Nothing pending
|
||||
|
|
|
|||
|
|
@ -19,7 +19,10 @@ import re,string,os,urlparse,urllib
|
|||
from UrlData import UrlData
|
||||
from linkcheck import _
|
||||
|
||||
html_re = re.compile(r'\.s?html?$')
|
||||
html_re = re.compile(r'(?i)\.s?html?$')
|
||||
html_content_re = re.compile(r'(?i)<html>.*</html>')
|
||||
opera_re = re.compile(r'^(?i)opera.adr$')
|
||||
opera_content_re = re.compile(r'(?i)Opera Hotlist')
|
||||
|
||||
class FileUrlData(UrlData):
|
||||
"Url link with file scheme"
|
||||
|
|
@ -61,7 +64,31 @@ class FileUrlData(UrlData):
|
|||
|
||||
|
||||
def isHtml(self):
|
||||
return self.valid and html_re.search(self.url)
|
||||
return html_re.search(self.url) or opera_re.search(self.url) or \
|
||||
html_content_re.search(self.getContent()) or \
|
||||
opera_content_re.search(self.getContent())
|
||||
|
||||
|
||||
def parseUrl(self, config):
|
||||
if html_re.search(self.url) or \
|
||||
html_content_re.search(self.getContent()):
|
||||
UrlData.parseUrl(self, config)
|
||||
return
|
||||
# parse an opera bookmark file
|
||||
name = ""
|
||||
lineno = 0
|
||||
# XXX use iterators for this?
|
||||
for line in StringUtil.lines(self.getContent()):
|
||||
lineno += 1
|
||||
line = line.strip()
|
||||
if line.startwith("NAME="):
|
||||
name = line[5:]
|
||||
elif line.startswith("URL="):
|
||||
url = line[4:]
|
||||
if url:
|
||||
config.appendUrl(GetUrlDataFrom(url,
|
||||
self.recursionLevel+1, self.url, None, lineno, name))
|
||||
name = ""
|
||||
|
||||
|
||||
def get_scheme(self):
|
||||
|
|
|
|||
|
|
@ -372,6 +372,7 @@ class UrlData:
|
|||
self.html_comments.append((start, match.end()))
|
||||
debug(NIGHTMARE, "comment spans", self.html_comments)
|
||||
|
||||
|
||||
def is_in_comment(self, index):
|
||||
for low,high in self.html_comments:
|
||||
if low < index < high:
|
||||
|
|
|
|||
|
|
@ -1,4 +1,30 @@
|
|||
"""enables a timeout on all TCP connections
|
||||
|
||||
####
|
||||
# Copyright 2000,2001 by Timothy O'Malley <timo@alum.mit.edu>
|
||||
#
|
||||
# All Rights Reserved
|
||||
#
|
||||
# Permission to use, copy, modify, and distribute this software
|
||||
# and its documentation for any purpose and without fee is hereby
|
||||
# granted, provided that the above copyright notice appear in all
|
||||
# copies and that both that copyright notice and this permission
|
||||
# notice appear in supporting documentation, and that the name of
|
||||
# Timothy O'Malley not be used in advertising or publicity
|
||||
# pertaining to distribution of the software without specific, written
|
||||
# prior permission.
|
||||
#
|
||||
# Timothy O'Malley DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS
|
||||
# SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY
|
||||
# AND FITNESS, IN NO EVENT SHALL Timothy O'Malley BE LIABLE FOR
|
||||
# ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
||||
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
|
||||
# WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
|
||||
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
|
||||
# PERFORMANCE OF THIS SOFTWARE.
|
||||
#
|
||||
####
|
||||
|
||||
"""Timeout Socket
|
||||
|
||||
This module enables a timeout mechanism on all TCP connections. It
|
||||
does this by inserting a shim on top of the socket module. After
|
||||
|
|
@ -68,6 +94,28 @@ Good Luck!
|
|||
|
||||
#
|
||||
# Revision history
|
||||
# 1.17 Added these comments.
|
||||
# 1.16 Better handling of non-blocking sockets in connect,
|
||||
# accept, recv, and send.
|
||||
# Minor bug fix to exception short cuts.
|
||||
# 1.15 Accept now returns an instance of TimeoutSocket.
|
||||
# Added new Connected and Busy constants and modified the
|
||||
# connect() and accept() routines to use them.
|
||||
# 1.14 Fixed bug in accept().
|
||||
# Added a fix for windows 10022 error.
|
||||
# Thanks to Alex Martelli for pointing these out.
|
||||
# 1.13 Added license.
|
||||
# 1.12 Better mimicry of makefile()'s ability to duplicate a
|
||||
# file descriptor. This fixes Python 2.0 woes.
|
||||
# 1.10 As Tim Lavoie pointed out, setblocking() still had a bug.
|
||||
# 1.9 Thanks to Doug Fort for pointing these out.
|
||||
# BAD bug with accept() return value fixed.
|
||||
# Forgotten "_" in setblocking() fixed.
|
||||
# 1.8 Removed the error handling from send(). It was just wrong.
|
||||
# 1.7 Updated revision history
|
||||
# 1.6 Added setblocking() method and improved error handling
|
||||
# in connect(), accept(), and send()
|
||||
# 1.5 Updated revision history
|
||||
# 1.4 Updated document string
|
||||
# 1.3 Changed name to timeoutsocket.py on Pehr's suggestion
|
||||
# 1.2 Added the silent replacement of the socket module
|
||||
|
|
@ -80,8 +128,7 @@ __author__ = "Timothy O'Malley <timo@alum.mit.edu>"
|
|||
#
|
||||
# Imports
|
||||
#
|
||||
import select, errno
|
||||
import string
|
||||
import select
|
||||
try:
|
||||
from _timeoutsocket import *
|
||||
except ImportError:
|
||||
|
|
@ -89,6 +136,25 @@ except ImportError:
|
|||
_socket = socket
|
||||
del socket
|
||||
|
||||
#
|
||||
# Set up constants to test for Connected and Blocking operations.
|
||||
# We delete 'os' and 'errno' to keep our namespace clean(er).
|
||||
# Thanks to Alex Martelli and G. Li for the Windows error codes.
|
||||
#
|
||||
import os
|
||||
if os.name == "nt":
|
||||
_IsConnected = ( 10022, )
|
||||
_ConnectBusy = ( 10035, )
|
||||
_AcceptBusy = ( 10035, )
|
||||
else:
|
||||
import errno
|
||||
_IsConnected = ( errno.EISCONN, )
|
||||
_ConnectBusy = ( errno.EINPROGRESS, errno.EALREADY, errno.EWOULDBLOCK )
|
||||
_AcceptBusy = ( errno.EAGAIN, errno.EWOULDBLOCK )
|
||||
del errno
|
||||
del os
|
||||
|
||||
|
||||
#
|
||||
# Default timeout value for ALL TimeoutSockets
|
||||
#
|
||||
|
|
@ -130,9 +196,12 @@ class TimeoutSocket:
|
|||
set_timeout() method.
|
||||
"""
|
||||
|
||||
_copies = 0
|
||||
_blocking = 1
|
||||
|
||||
def __init__(self, sock, timeout):
|
||||
self._sock = sock
|
||||
self._timeout = timeout
|
||||
self._sock = sock
|
||||
self._timeout = timeout
|
||||
# end __init__
|
||||
|
||||
def __getattr__(self, key):
|
||||
|
|
@ -147,33 +216,42 @@ class TimeoutSocket:
|
|||
self._timeout = timeout
|
||||
# end set_timeout
|
||||
|
||||
def connect(self, addr, dumbhack=None):
|
||||
def setblocking(self, blocking):
|
||||
self._blocking = blocking
|
||||
return self._sock.setblocking(blocking)
|
||||
# end setblocking
|
||||
|
||||
def connect(self, addr, port=None, dumbhack=None):
|
||||
# In case we were called as connect(host, port)
|
||||
#if port != None: addr = (addr, port)
|
||||
if port != None: addr = (addr, port)
|
||||
|
||||
# Shortcuts
|
||||
sock = self._sock
|
||||
timeout = self._timeout
|
||||
blocking = self._blocking
|
||||
|
||||
# First, make a non-blocking call to connect
|
||||
try:
|
||||
sock.setblocking(0)
|
||||
sock.connect(addr)
|
||||
sock.setblocking(1)
|
||||
sock.setblocking(blocking)
|
||||
return
|
||||
except Error, why:
|
||||
# If we are already connected, then return success
|
||||
# Set the socket's blocking mode back
|
||||
sock.setblocking(blocking)
|
||||
|
||||
# If we are not blocking, re-raise
|
||||
if not blocking:
|
||||
raise
|
||||
|
||||
# If we are already connected, then return success.
|
||||
# If we got a genuine error, re-raise it.
|
||||
errcode = why[0]
|
||||
if errcode == errno.EISCONN:
|
||||
if dumbhack and errcode in _IsConnected:
|
||||
return
|
||||
if errcode == 10035 and why[1] == 'winsock error':
|
||||
# Windows error code from G.Li@med.ge.com
|
||||
return
|
||||
vals = (errno.EINPROGRESS, errno.EALREADY, errno.EWOULDBLOCK)
|
||||
if errcode not in vals:
|
||||
elif errcode not in _ConnectBusy:
|
||||
raise
|
||||
|
||||
|
||||
# Now, wait for the connect to happen
|
||||
# ONLY if dumbhack indicates this is pass number one.
|
||||
# If select raises an error, we pass it on.
|
||||
|
|
@ -187,25 +265,36 @@ class TimeoutSocket:
|
|||
raise Timeout("Attempted connect to %s timed out." % str(addr) )
|
||||
# end connect
|
||||
|
||||
def accept(self, dumbhack=1):
|
||||
def accept(self, dumbhack=None):
|
||||
# Shortcuts
|
||||
sock = self._sock
|
||||
timeout = self._timeout
|
||||
sock = self._sock
|
||||
timeout = self._timeout
|
||||
blocking = self._blocking
|
||||
|
||||
# First, make a non-blocking call to connect
|
||||
# First, make a non-blocking call to accept
|
||||
# If we get a valid result, then convert the
|
||||
# accept'ed socket into a TimeoutSocket.
|
||||
# Be careful about the blocking mode of ourselves.
|
||||
try:
|
||||
sock.setblocking(0)
|
||||
sa = sock.accept()
|
||||
sock.setblocking(1)
|
||||
return sa
|
||||
newsock, addr = sock.accept()
|
||||
sock.setblocking(blocking)
|
||||
timeoutnewsock = self.__class__(newsock, timeout)
|
||||
timeoutnewsock.setblocking(blocking)
|
||||
return (timeoutnewsock, addr)
|
||||
except Error, why:
|
||||
# If we are already connected, then return success
|
||||
# Set the socket's blocking mode back
|
||||
sock.setblocking(blocking)
|
||||
|
||||
# If we are not supposed to block, then re-raise
|
||||
if not blocking:
|
||||
raise
|
||||
|
||||
# If we got a genuine error, re-raise it.
|
||||
errcode = why[0]
|
||||
vals = (errno.EAGAIN, errno.EWOULDBLOCK)
|
||||
if errcode not in vals:
|
||||
if errcode not in _AcceptBusy:
|
||||
raise
|
||||
|
||||
|
||||
# Now, wait for the accept to happen
|
||||
# ONLY if dumbhack indicates this is pass number one.
|
||||
# If select raises an error, we pass it on.
|
||||
|
|
@ -221,30 +310,34 @@ class TimeoutSocket:
|
|||
|
||||
def send(self, data, flags=0):
|
||||
sock = self._sock
|
||||
totallen = 0
|
||||
while data:
|
||||
r,w,e = select.select([],[sock], [], self._timeout)
|
||||
if self._blocking:
|
||||
r,w,e = select.select([],[sock],[], self._timeout)
|
||||
if not w:
|
||||
raise Timeout("Send timed out")
|
||||
sentlen = sock.send(data, flags)
|
||||
data = data[sentlen:]
|
||||
totallen += sentlen
|
||||
return totallen
|
||||
return sock.send(data, flags)
|
||||
# end send
|
||||
|
||||
def recv(self, bufsize, flags=0):
|
||||
sock = self._sock
|
||||
r,w,e = select.select([sock], [], [], self._timeout)
|
||||
if r:
|
||||
data = sock.recv(bufsize, flags)
|
||||
return data
|
||||
raise Timeout("Recv timed out")
|
||||
if self._blocking:
|
||||
r,w,e = select.select([sock], [], [], self._timeout)
|
||||
if not r:
|
||||
raise Timeout("Recv timed out")
|
||||
return sock.recv(bufsize, flags)
|
||||
# end recv
|
||||
|
||||
def makefile(self, flags="r", bufsize=-1):
|
||||
self._copies = self._copies +1
|
||||
return TimeoutFile(self, flags, bufsize)
|
||||
# end makefile
|
||||
|
||||
def close(self):
|
||||
if self._copies <= 0:
|
||||
self._sock.close()
|
||||
else:
|
||||
self._copies = self._copies -1
|
||||
# end close
|
||||
|
||||
# end TimeoutSocket
|
||||
|
||||
|
||||
|
|
@ -265,6 +358,11 @@ class TimeoutFile:
|
|||
return getattr(self._sock, key)
|
||||
# end __getattr__
|
||||
|
||||
def close(self):
|
||||
self._sock.close()
|
||||
self._sock = None
|
||||
# end close
|
||||
|
||||
def write(self, data):
|
||||
self.send(data)
|
||||
# end write
|
||||
|
|
@ -278,12 +376,12 @@ class TimeoutFile:
|
|||
break
|
||||
bufsize = self._bufsize
|
||||
if size > 0:
|
||||
bufsize = min(bufsize, size - datalen)
|
||||
bufsize = min(bufsize, size - datalen )
|
||||
buf = self.recv(bufsize)
|
||||
if not buf:
|
||||
break
|
||||
data += buf
|
||||
if datalen > size > 0:
|
||||
data = data + buf
|
||||
if size > 0 and datalen > size:
|
||||
self._sock._inqueue = data[size:]
|
||||
data = data[:size]
|
||||
return data
|
||||
|
|
@ -293,7 +391,7 @@ class TimeoutFile:
|
|||
data = self._sock._inqueue
|
||||
self._sock._inqueue = ""
|
||||
while 1:
|
||||
idx = string.find(data, "\n")
|
||||
idx = data.find("\n")
|
||||
if idx >= 0:
|
||||
break
|
||||
datalen = len(data)
|
||||
|
|
@ -305,10 +403,10 @@ class TimeoutFile:
|
|||
buf = self.recv(bufsize)
|
||||
if not buf:
|
||||
break
|
||||
data += buf
|
||||
data = data + buf
|
||||
|
||||
if idx >= 0:
|
||||
idx += 1
|
||||
idx = idx + 1
|
||||
self._sock._inqueue = data[idx:]
|
||||
data = data[:idx]
|
||||
elif size > 0 and datalen > size:
|
||||
|
|
|
|||
18
linkchecker
18
linkchecker
|
|
@ -20,11 +20,7 @@
|
|||
import sys
|
||||
if sys.version[:5] < "2.0":
|
||||
raise SystemExit, "This program requires Python 2.0 or later."
|
||||
import getopt,re,string,os,urlparse
|
||||
# 90 seconds timeout for all connections
|
||||
#import timeoutsocket
|
||||
#timeoutsocket.setDefaultSocketTimeout(90)
|
||||
import linkcheck
|
||||
import getopt, re, os, urlparse, linkcheck
|
||||
from linkcheck import _,StringUtil
|
||||
|
||||
|
||||
|
|
@ -84,6 +80,9 @@ For single-letter option arguments the space is not a necessity. So
|
|||
-t num, --threads=num
|
||||
Generate no more than num threads. Default number of threads is 5.
|
||||
To disable threading specify a non-positive number.
|
||||
--timeout=secs
|
||||
Set the timeout for connection attempts in seconds. Default timeout
|
||||
is system dependant.
|
||||
-u name, --user=name
|
||||
Try username name for HTML and FTP authorization.
|
||||
Default is 'anonymous'. See also -p.
|
||||
|
|
@ -175,6 +174,7 @@ try:
|
|||
"robots-txt",
|
||||
"strict",
|
||||
"threads=",
|
||||
"timeout=",
|
||||
"user=",
|
||||
"version",
|
||||
"verbose",
|
||||
|
|
@ -220,7 +220,7 @@ for opt,arg in options:
|
|||
elif opt=="-F" or opt=="--file-output":
|
||||
ns = {'fileoutput':1}
|
||||
try:
|
||||
type, ns['filename'] = string.split(arg, '/', 1)
|
||||
type, ns['filename'] = arg.split('/', 1)
|
||||
if not ns['filename']: raise ValueError
|
||||
except ValueError: type = arg
|
||||
if linkcheck.Config.Loggers.has_key(type) and type != "blacklist":
|
||||
|
|
@ -273,6 +273,10 @@ for opt,arg in options:
|
|||
else:
|
||||
config.disableThreading()
|
||||
|
||||
elif opt=="--timeout":
|
||||
import timeoutsocket
|
||||
timeoutsocket.setDefaultSocketTimeout(int(arg))
|
||||
|
||||
elif opt=="-u" or opt=="--user":
|
||||
_user = arg
|
||||
constructauth = 1
|
||||
|
|
@ -310,7 +314,7 @@ if len(args)==0:
|
|||
print _("warning: no files or urls given")
|
||||
|
||||
for url in args:
|
||||
url = string.strip(url)
|
||||
url = url.strip()
|
||||
if not (":" in url):
|
||||
if re.compile("^ftp\.").match(url):
|
||||
url = "ftp://"+url
|
||||
|
|
|
|||
Loading…
Reference in a new issue