diff --git a/linkcheck/HtmlParser/__init__.py b/linkcheck/HtmlParser/__init__.py
index 2d7d78b0..4e08af68 100644
--- a/linkcheck/HtmlParser/__init__.py
+++ b/linkcheck/HtmlParser/__init__.py
@@ -164,90 +164,49 @@ import codecs
import htmlentitydefs
-def _resolve_ascii_entity (mo):
+def _resolve_entity (mo):
"""
- Resolve one &#XYZ; entity if it is an ASCII character. Else leave as is.
-
- @param mo: matched v{_num_re} object with a "num" match group
- @type mo: MatchObject instance
- @return: resolved ASCII entity char, or original entity
- @rtype: string
- """
- # convert to number
- ent = mo.group()
- num = mo.group("num")
- if ent.lower().startswith('&#x'):
- radix = 16
- else:
- radix = 10
- try:
- num = int(num, radix)
- except (ValueError, OverflowError):
- return ent
- # check 7-bit ASCII char range
- if 0 <= num <= 127:
- return unicode(chr(num))
- # not in range
- return ent
-
-
-_num_re = re.compile(ur'(?i)&#x?(?P<num>[0-9a-z]+);')
-
-def resolve_ascii_entities (s):
- """
- Resolve entities in 7-bit ASCII range to eliminate obfuscation.
-
- @param s: string with entities
- @type s: string
- @return: string with resolved ASCII entities
- @rtype: string
- """
- return _num_re.sub(_resolve_ascii_entity, s)
-
-
-def _resolve_html_entity (mo):
- """
- Resolve html entity.
+ Resolve an HTML entity.
@param mo: matched _entity_re object with a "entity" match group
@type mo: MatchObject instance
- @return: resolved entity char, or original entity
- @rtype: string
+ @return: resolved entity char, or empty string on error
+ @rtype: unicode string
"""
ent = mo.group("entity")
s = mo.group()
- entdef = htmlentitydefs.entitydefs.get(ent)
- if entdef is None:
- return s
- # note: entdef is latin-1 encoded
- return entdef.decode("iso8859-1")
+ if s.startswith('&#'):
+ if s[2] in 'xX':
+ radix = 16
+ else:
+ radix = 10
+ try:
+ num = int(ent, radix)
+ except (ValueError, OverflowError):
+ return u''
+ else:
+ num = htmlentitydefs.name2codepoint.get(ent)
+ if num is None or num < 0:
+ # unknown entity -> ignore
+ return u''
+ try:
+ return unichr(num)
+ except ValueError:
+ return u''
-_entity_re = re.compile(ur'(?i)&(?P<entity>[a-z]+);')
-
-def resolve_html_entities (s):
- """
- Resolve HTML entities in s and return result.
-
- @param s: string with HTML entities
- @type s: string
- @return: string with resolved HTML entities
- @rtype: string
- """
- return _entity_re.sub(_resolve_html_entity, s)
-
+_entity_re = re.compile(ur'(?i)&(#x?)?(?P<entity>[0-9a-z]+);')
def resolve_entities (s):
"""
- Resolve both HTML and 7-bit ASCII entities in s.
+ Resolve HTML entities in s.
@param s: string with entities
@type s: string
@return: string with resolved entities
@rtype: string
"""
- s = resolve_ascii_entities(s)
- return resolve_html_entities(s)
+ return _entity_re.sub(_resolve_entity, s)
def strip_quotes (s):
diff --git a/linkcheck/tests/test_parser.py b/linkcheck/tests/test_parser.py
index f47795ae..39a9cc44 100644
--- a/linkcheck/tests/test_parser.py
+++ b/linkcheck/tests/test_parser.py
@@ -53,7 +53,7 @@ parsetests = [
("""< >""", """< >"""),
("""""", """"""),
("""""", """"""),
- ("""""", """"""),
+ ("""""", """"""),
# reduce test
("""<""", """<"""),
("""d>""", """d>"""),
@@ -123,7 +123,14 @@ parsetests = [
("""""",
""""""),
# entity resolving
+ ("""""", """"""),
+ ("""""", """"""),
+ ("""""", """"""),
+ ("""""", """"""),
("""""", """"""),
+ ("""""", """"""),
+ # note that \u8156 is not valid encoding and therefore gets removed
+ ("""""", """"""),
# non-ascii characters
("""<Üzgür> fahr ¹²³¼½¬{""",
"""<Üzgür> fahr ¹²³¼½¬{"""),