mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-16 12:21:13 +00:00
Merge pull request #356 from cjmayo/parser1
Remove unnecessary parser related code
This commit is contained in:
commit
7d55855ffb
8 changed files with 20 additions and 420 deletions
1
linkcheck/HtmlParser/.gitignore
vendored
1
linkcheck/HtmlParser/.gitignore
vendored
|
|
@ -1 +0,0 @@
|
|||
htmlparse.output
|
||||
|
|
@ -26,8 +26,6 @@ filterwarnings("ignore",
|
|||
|
||||
from bs4 import BeautifulSoup, Tag
|
||||
|
||||
from ..containers import ListDict
|
||||
|
||||
|
||||
class Parser(object):
|
||||
handler = None
|
||||
|
|
@ -53,34 +51,20 @@ class Parser(object):
|
|||
self.html_doc = None
|
||||
self.tag_lineno = None
|
||||
self.tag_column = None
|
||||
self.last_tag_lineno = None
|
||||
self.last_tag_column = None
|
||||
|
||||
def parse_contents(self, contents):
|
||||
for content in contents:
|
||||
if isinstance(content, Tag):
|
||||
attrs = ListDict()
|
||||
for k, v_list in sorted(content.attrs.items()):
|
||||
if not isinstance(v_list, list):
|
||||
v_list = [v_list]
|
||||
for v in v_list:
|
||||
# empty parameters returned by BS4
|
||||
# are sometimes in bytes:
|
||||
if v == b'':
|
||||
v = u''
|
||||
attrs[k] = v
|
||||
self.last_tag_lineno = self.tag_lineno
|
||||
self.last_tag_column = self.tag_column
|
||||
self.tag_lineno = content.sourceline
|
||||
self.tag_column = None if content.sourcepos is None \
|
||||
else content.sourcepos + 1
|
||||
if content.is_empty_element:
|
||||
self.handler.start_end_element(
|
||||
content.name, attrs, content.text.strip(),
|
||||
content.name, content.attrs, content.text.strip(),
|
||||
)
|
||||
else:
|
||||
self.handler.start_element(
|
||||
content.name, attrs, content.text.strip(),
|
||||
content.name, content.attrs, content.text.strip(),
|
||||
)
|
||||
if hasattr(content, 'contents'): # recursion
|
||||
self.parse_contents(content.contents)
|
||||
|
|
@ -89,26 +73,18 @@ class Parser(object):
|
|||
|
||||
def flush(self):
|
||||
if self.soup is None:
|
||||
self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser')
|
||||
self.soup = BeautifulSoup(self.html_doc.getvalue(), 'html.parser',
|
||||
multi_valued_attributes=None)
|
||||
if hasattr(self.soup, 'contents'):
|
||||
self.parse_contents(self.soup.contents)
|
||||
self.encoding = self.soup.original_encoding
|
||||
|
||||
def debug(self, text):
|
||||
raise NotImplementedError("debug is not implemented")
|
||||
|
||||
def lineno(self):
|
||||
return self.tag_lineno
|
||||
|
||||
def last_lineno(self):
|
||||
return self.last_tag_lineno
|
||||
|
||||
def column(self):
|
||||
return self.tag_column
|
||||
|
||||
def last_column(self):
|
||||
return self.last_tag_column
|
||||
|
||||
|
||||
def parser(handler=None):
|
||||
return Parser(handler)
|
||||
|
|
|
|||
|
|
@ -310,6 +310,7 @@ class HttpUrl (internpaturl.InternPatternUrl, proxysupport.ProxySupport):
|
|||
if self.text is None:
|
||||
self.get_raw_content()
|
||||
self.soup = BeautifulSoup(self.data, "html.parser",
|
||||
multi_valued_attributes=None,
|
||||
from_encoding=self.encoding)
|
||||
self.text = self.data.decode(self.soup.original_encoding)
|
||||
return self.text
|
||||
|
|
|
|||
|
|
@ -657,7 +657,8 @@ class UrlBase (object):
|
|||
def get_content (self):
|
||||
if self.text is None:
|
||||
self.get_raw_content()
|
||||
self.soup = BeautifulSoup(self.data, "html.parser")
|
||||
self.soup = BeautifulSoup(self.data, "html.parser",
|
||||
multi_valued_attributes=None)
|
||||
self.text = self.data.decode(self.soup.original_encoding)
|
||||
self.encoding = self.soup.original_encoding
|
||||
return self.text
|
||||
|
|
|
|||
|
|
@ -18,170 +18,6 @@
|
|||
Special container classes.
|
||||
"""
|
||||
|
||||
from collections import namedtuple
|
||||
from past.builtins import basestring
|
||||
|
||||
class AttrDict (dict):
|
||||
"""Dictionary allowing attribute access to its elements if they
|
||||
are valid attribute names and not already existing methods."""
|
||||
|
||||
def __getattr__ (self, name):
|
||||
"""Return attribute name from dict."""
|
||||
return self[name]
|
||||
|
||||
|
||||
class ListDict (dict):
|
||||
"""A dictionary whose iterators reflect the order in which elements
|
||||
were added.
|
||||
"""
|
||||
|
||||
def __init__ (self):
|
||||
"""Initialize sorted key list."""
|
||||
super(ListDict, self).__init__()
|
||||
# sorted list of keys
|
||||
self._keys = []
|
||||
|
||||
def setdefault (self, key, *args):
|
||||
"""Remember key order if key not found."""
|
||||
if key not in self:
|
||||
self._keys.append(key)
|
||||
return super(ListDict, self).setdefault(key, *args)
|
||||
|
||||
def __setitem__ (self, key, value):
|
||||
"""Add key,value to dict, append key to sorted list."""
|
||||
if key not in self:
|
||||
self._keys.append(key)
|
||||
super(ListDict, self).__setitem__(key, value)
|
||||
|
||||
def __delitem__ (self, key):
|
||||
"""Remove key from dict."""
|
||||
self._keys.remove(key)
|
||||
super(ListDict, self).__delitem__(key)
|
||||
|
||||
def pop (self, key):
|
||||
"""Remove key from dict and return value."""
|
||||
if key in self._keys:
|
||||
self._keys.remove(key)
|
||||
super(ListDict, self).pop(key)
|
||||
|
||||
def popitem (self):
|
||||
"""Remove oldest key from dict and return item."""
|
||||
if self._keys:
|
||||
k = self._keys[0]
|
||||
v = self[k]
|
||||
del self[k]
|
||||
return (k, v)
|
||||
raise KeyError("popitem() on empty dictionary")
|
||||
|
||||
def values (self):
|
||||
"""Return sorted list of values."""
|
||||
return [self[k] for k in self._keys]
|
||||
|
||||
def items (self):
|
||||
"""Return sorted list of items."""
|
||||
return [(k, self[k]) for k in self._keys]
|
||||
|
||||
def keys (self):
|
||||
"""Return sorted list of keys."""
|
||||
return self._keys[:]
|
||||
|
||||
def itervalues (self):
|
||||
"""Return iterator over sorted values."""
|
||||
for k in self._keys:
|
||||
yield self[k]
|
||||
|
||||
def iteritems (self):
|
||||
"""Return iterator over sorted items."""
|
||||
for k in self._keys:
|
||||
yield (k, self[k])
|
||||
|
||||
def iterkeys (self):
|
||||
"""Return iterator over sorted keys."""
|
||||
return iter(self._keys)
|
||||
|
||||
def clear (self):
|
||||
"""Remove all dict entries."""
|
||||
self._keys = []
|
||||
super(ListDict, self).clear()
|
||||
|
||||
def get_true (self, key, default):
|
||||
"""Return default element if key is not in the dict, or if self[key]
|
||||
evaluates to False. Useful for example if value is None, but
|
||||
default value should be an empty string.
|
||||
"""
|
||||
if key not in self or not self[key]:
|
||||
return default
|
||||
return self[key]
|
||||
|
||||
|
||||
class CaselessDict (dict):
|
||||
"""A dictionary ignoring the case of keys (which must be strings)."""
|
||||
|
||||
def __getitem__ (self, key):
|
||||
"""Return lowercase key item."""
|
||||
assert isinstance(key, basestring)
|
||||
return dict.__getitem__(self, key.lower())
|
||||
|
||||
def __delitem__ (self, key):
|
||||
"""Remove lowercase key item."""
|
||||
assert isinstance(key, basestring)
|
||||
return dict.__delitem__(self, key.lower())
|
||||
|
||||
def __setitem__ (self, key, value):
|
||||
"""Set lowercase key item."""
|
||||
assert isinstance(key, basestring)
|
||||
dict.__setitem__(self, key.lower(), value)
|
||||
|
||||
def __contains__ (self, key):
|
||||
"""Check lowercase key item."""
|
||||
assert isinstance(key, basestring)
|
||||
return dict.__contains__(self, key.lower())
|
||||
|
||||
def get (self, key, def_val=None):
|
||||
"""Return lowercase key value."""
|
||||
assert isinstance(key, basestring)
|
||||
return dict.get(self, key.lower(), def_val)
|
||||
|
||||
def setdefault (self, key, *args):
|
||||
"""Set lowercase key value and return."""
|
||||
assert isinstance(key, basestring)
|
||||
return dict.setdefault(self, key.lower(), *args)
|
||||
|
||||
def update (self, other):
|
||||
"""Update this dict with lowercase key from other dict"""
|
||||
for k, v in other.items():
|
||||
dict.__setitem__(self, k.lower(), v)
|
||||
|
||||
def fromkeys (cls, iterable, value=None):
|
||||
"""Construct new caseless dict from given data."""
|
||||
d = cls()
|
||||
for k in iterable:
|
||||
dict.__setitem__(d, k.lower(), value)
|
||||
return d
|
||||
fromkeys = classmethod(fromkeys)
|
||||
|
||||
def pop (self, key, *args):
|
||||
"""Remove lowercase key from dict and return value."""
|
||||
assert isinstance(key, basestring)
|
||||
return dict.pop(self, key.lower(), *args)
|
||||
|
||||
|
||||
class CaselessSortedDict (CaselessDict):
|
||||
"""Caseless dictionary with sorted keys."""
|
||||
|
||||
def keys (self):
|
||||
"""Return sorted key list."""
|
||||
return sorted(super(CaselessSortedDict, self).keys())
|
||||
|
||||
def items (self):
|
||||
"""Return sorted item list."""
|
||||
return [(x, self[x]) for x in self.keys()]
|
||||
|
||||
def iteritems (self):
|
||||
"""Return sorted item iterator."""
|
||||
return ((x, self[x]) for x in self.keys())
|
||||
|
||||
|
||||
class LFUCache (dict):
|
||||
"""Limited cache which purges least frequently used items."""
|
||||
|
||||
|
|
@ -266,18 +102,3 @@ class LFUCache (dict):
|
|||
"""Remove and return a value."""
|
||||
value = super(LFUCache, self).pop()
|
||||
return value[1]
|
||||
|
||||
|
||||
def enum (*names):
|
||||
"""Return an enum datatype instance from given list of keyword names.
|
||||
The enum values are zero-based integers.
|
||||
|
||||
>>> Status = enum('open', 'pending', 'closed')
|
||||
>>> Status.open
|
||||
0
|
||||
>>> Status.pending
|
||||
1
|
||||
>>> Status.closed
|
||||
2
|
||||
"""
|
||||
return namedtuple('Enum', ' '.join(names))(*range(len(names)))
|
||||
|
|
|
|||
|
|
@ -130,7 +130,7 @@ class MetaRobotsFinder (TagFinder):
|
|||
def start_element (self, tag, attrs, element_text=None):
|
||||
"""Search for meta robots.txt "nofollow" and "noindex" flags."""
|
||||
if tag == 'meta' and attrs.get('name') == 'robots':
|
||||
val = attrs.get_true('content', u'').lower().split(u',')
|
||||
val = attrs.get('content', u'').lower().split(u',')
|
||||
self.follow = u'nofollow' not in val
|
||||
self.index = u'noindex' not in val
|
||||
raise StopParse("found <meta name=robots> tag")
|
||||
|
|
@ -142,11 +142,11 @@ def is_meta_url (attr, attrs):
|
|||
"""Check if the meta attributes contain a URL."""
|
||||
res = False
|
||||
if attr == "content":
|
||||
equiv = attrs.get_true('http-equiv', u'').lower()
|
||||
scheme = attrs.get_true('scheme', u'').lower()
|
||||
equiv = attrs.get('http-equiv', u'').lower()
|
||||
scheme = attrs.get('scheme', u'').lower()
|
||||
res = equiv in (u'refresh',) or scheme in (u'dcterms.uri',)
|
||||
if attr == "href":
|
||||
rel = attrs.get_true('rel', u'').lower()
|
||||
rel = attrs.get('rel', u'').lower()
|
||||
res = rel in (u'shortcut icon', u'icon')
|
||||
return res
|
||||
|
||||
|
|
@ -155,7 +155,7 @@ def is_form_get(attr, attrs):
|
|||
"""Check if this is a GET form action URL."""
|
||||
res = False
|
||||
if attr == "action":
|
||||
method = attrs.get_true('method', u'').lower()
|
||||
method = attrs.get('method', u'').lower()
|
||||
res = method != 'post'
|
||||
return res
|
||||
|
||||
|
|
@ -180,9 +180,9 @@ class LinkFinder (TagFinder):
|
|||
def start_element (self, tag, attrs, element_text=None):
|
||||
"""Search for links and store found URLs in a list."""
|
||||
log.debug(LOG_CHECK, "LinkFinder tag %s attrs %s", tag, attrs)
|
||||
log.debug(LOG_CHECK, "line %d col %d old line %s old col %s", self.parser.lineno(), self.parser.column(), self.parser.last_lineno(), self.parser.last_column())
|
||||
log.debug(LOG_CHECK, "line %d col %d", self.parser.lineno(), self.parser.column())
|
||||
if tag == "base" and not self.base_ref:
|
||||
self.base_ref = attrs.get_true("href", u'')
|
||||
self.base_ref = attrs.get("href", u'')
|
||||
tagattrs = self.tags.get(tag, self.universal_attrs)
|
||||
# parse URLs in tag (possibly multiple URLs in CSS styles)
|
||||
for attr in sorted(tagattrs.intersection(attrs)):
|
||||
|
|
@ -195,7 +195,7 @@ class LinkFinder (TagFinder):
|
|||
# possible codebase
|
||||
base = u''
|
||||
if tag == 'applet':
|
||||
base = attrs.get_true('codebase', u'')
|
||||
base = attrs.get('codebase', u'')
|
||||
if not base:
|
||||
base = self.base_ref
|
||||
# note: value can be None
|
||||
|
|
@ -212,11 +212,11 @@ class LinkFinder (TagFinder):
|
|||
"""Parse attrs for link name. Return name of link."""
|
||||
if tag == 'a' and attr == 'href':
|
||||
if not name:
|
||||
name = attrs.get_true('title', u'')
|
||||
name = attrs.get('title', u'')
|
||||
elif tag == 'img':
|
||||
name = attrs.get_true('alt', u'')
|
||||
name = attrs.get('alt', u'')
|
||||
if not name:
|
||||
name = attrs.get_true('title', u'')
|
||||
name = attrs.get('title', u'')
|
||||
else:
|
||||
name = u""
|
||||
return name
|
||||
|
|
|
|||
|
|
@ -114,7 +114,7 @@ class HtmlPrettyPrinter:
|
|||
@return: None
|
||||
"""
|
||||
self.fd.write("<%s" % tag.replace("/", ""))
|
||||
for key, val in attrs.items():
|
||||
for key, val in sorted(attrs.items()):
|
||||
if val is None:
|
||||
self.fd.write(" %s" % key)
|
||||
else:
|
||||
|
|
|
|||
|
|
@ -19,199 +19,11 @@ Test container routines.
|
|||
"""
|
||||
|
||||
import unittest
|
||||
import random
|
||||
|
||||
import linkcheck.containers
|
||||
|
||||
from builtins import range
|
||||
|
||||
class TestAttrDict (unittest.TestCase):
|
||||
|
||||
def setUp (self):
|
||||
self.d = linkcheck.containers.AttrDict()
|
||||
|
||||
def test_access (self):
|
||||
self.d["test"] = 1
|
||||
self.assertEqual(self.d.test, self.d["test"])
|
||||
self.assertEqual(self.d.test, 1)
|
||||
|
||||
def test_method (self):
|
||||
self.d["get"] = 1
|
||||
self.assertTrue(isinstance(self.d.get, type({}.get)))
|
||||
|
||||
|
||||
class TestListDict (unittest.TestCase):
|
||||
"""Test list dictionary routines."""
|
||||
|
||||
def setUp (self):
|
||||
"""Set up self.d as empty listdict."""
|
||||
self.d = linkcheck.containers.ListDict()
|
||||
|
||||
def test_insertion_order (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d[2] = 1
|
||||
self.d[1] = 2
|
||||
self.assertTrue(2 in self.d)
|
||||
self.assertTrue(1 in self.d)
|
||||
|
||||
def test_deletion_order (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d[2] = 1
|
||||
self.d[1] = 2
|
||||
del self.d[1]
|
||||
self.assertTrue(2 in self.d)
|
||||
self.assertTrue(1 not in self.d)
|
||||
|
||||
def test_update_order (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d[2] = 1
|
||||
self.d[1] = 2
|
||||
self.d[1] = 1
|
||||
self.assertEqual(self.d[1], 1)
|
||||
|
||||
def test_sorting (self):
|
||||
self.assertTrue(not self.d)
|
||||
toinsert = random.sample(range(10000000), 60)
|
||||
for x in toinsert:
|
||||
self.d[x] = x
|
||||
for i, k in enumerate(self.d.keys()):
|
||||
self.assertEqual(self.d[k], toinsert[i])
|
||||
for i, k in enumerate(self.d.iterkeys()):
|
||||
self.assertEqual(self.d[k], toinsert[i])
|
||||
for x in self.d.values():
|
||||
self.assertTrue(x in toinsert)
|
||||
for x in self.d.itervalues():
|
||||
self.assertTrue(x in toinsert)
|
||||
for x, y in self.d.items():
|
||||
self.assertTrue(x in toinsert)
|
||||
self.assertTrue(y in toinsert)
|
||||
for x, y in self.d.iteritems():
|
||||
self.assertTrue(x in toinsert)
|
||||
self.assertTrue(y in toinsert)
|
||||
|
||||
def test_clear (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d[2] = 1
|
||||
self.d[1] = 3
|
||||
self.d.clear()
|
||||
self.assertTrue(not self.d)
|
||||
|
||||
def test_get_true (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 0
|
||||
self.d["b"] = 1
|
||||
self.assertEqual(self.d.get_true("a", 2), 2)
|
||||
self.assertEqual(self.d.get_true("b", 2), 1)
|
||||
|
||||
|
||||
class TestCaselessDict (unittest.TestCase):
|
||||
"""Test caseless dictionary routines."""
|
||||
|
||||
def setUp (self):
|
||||
"""Set up self.d as empty caseless dict."""
|
||||
self.d = linkcheck.containers.CaselessDict()
|
||||
|
||||
def test_insert (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 1
|
||||
self.assertTrue("a" in self.d)
|
||||
self.assertTrue("A" in self.d)
|
||||
self.d["aBcD"] = 2
|
||||
self.assertTrue("abcd" in self.d)
|
||||
self.assertTrue("Abcd" in self.d)
|
||||
self.assertTrue("ABCD" in self.d)
|
||||
|
||||
def test_delete (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 1
|
||||
del self.d["A"]
|
||||
self.assertTrue("a" not in self.d)
|
||||
self.assertTrue("A" not in self.d)
|
||||
|
||||
def test_update (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 1
|
||||
self.d["A"] = 2
|
||||
self.assertEqual(self.d["a"], 2)
|
||||
|
||||
def test_clear (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 5
|
||||
self.d["b"] = 6
|
||||
self.d.clear()
|
||||
self.assertTrue(not self.d)
|
||||
|
||||
def test_containment (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.assertTrue("A" not in self.d)
|
||||
self.assertTrue("a" not in self.d)
|
||||
self.d["a"] = 5
|
||||
self.assertTrue("A" in self.d)
|
||||
self.assertTrue("a" in self.d)
|
||||
|
||||
def test_setdefault (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 5
|
||||
self.assertEqual(self.d.setdefault("A", 6), 5)
|
||||
self.assertEqual(self.d.setdefault("b", 7), 7)
|
||||
|
||||
def test_get (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 42
|
||||
self.assertEqual(self.d.get("A"), 42)
|
||||
self.assertTrue(self.d.get("B") is None)
|
||||
|
||||
def test_update2 (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 42
|
||||
self.d.update({"A": 43})
|
||||
self.assertEqual(self.d["a"], 43)
|
||||
|
||||
def test_fromkeys (self):
|
||||
self.assertTrue(not self.d)
|
||||
keys = ["a", "A", "b", "C"]
|
||||
d1 = self.d.fromkeys(keys, 42)
|
||||
for key in keys:
|
||||
self.assertEqual(d1[key], 42)
|
||||
|
||||
def test_pop (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 42
|
||||
self.assertEqual(self.d.pop("A"), 42)
|
||||
self.assertTrue(not self.d)
|
||||
self.assertRaises(KeyError, self.d.pop, "A")
|
||||
|
||||
def test_popitem (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["a"] = 42
|
||||
self.assertEqual(self.d.popitem(), ("a", 42))
|
||||
self.assertTrue(not self.d)
|
||||
self.assertRaises(KeyError, self.d.popitem)
|
||||
|
||||
|
||||
class TestCaselessSortedDict (unittest.TestCase):
|
||||
"""Test caseless sorted dictionary routines."""
|
||||
|
||||
def setUp (self):
|
||||
"""Set up self.d as empty caseless sorted dict."""
|
||||
self.d = linkcheck.containers.CaselessSortedDict()
|
||||
|
||||
def test_sorted (self):
|
||||
self.assertTrue(not self.d)
|
||||
self.d["b"] = 6
|
||||
self.d["a"] = 7
|
||||
self.d["C"] = 8
|
||||
prev = None
|
||||
for key in self.d.keys():
|
||||
if prev is not None:
|
||||
self.assertTrue(key > prev)
|
||||
prev = key
|
||||
prev = None
|
||||
for key, value in self.d.items():
|
||||
self.assertEqual(value, self.d[key])
|
||||
if prev is not None:
|
||||
self.assertTrue(key > prev)
|
||||
prev = key
|
||||
|
||||
|
||||
class TestLFUCache (unittest.TestCase):
|
||||
"""Test LFU cache implementation."""
|
||||
|
|
@ -249,13 +61,3 @@ class TestLFUCache (unittest.TestCase):
|
|||
self.d[i] = i
|
||||
self.d[1001] = 1001
|
||||
self.assertTrue(950 <= len(self.d) <= self.size)
|
||||
|
||||
|
||||
class TestEnum (unittest.TestCase):
|
||||
|
||||
def test_enum (self):
|
||||
e = linkcheck.containers.enum("a", "b", "c")
|
||||
self.assertEqual(e.a, 0)
|
||||
self.assertEqual(e.b, 1)
|
||||
self.assertEqual(e.c, 2)
|
||||
self.assertEqual(e, (0, 1, 2))
|
||||
|
|
|
|||
Loading…
Reference in a new issue