mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-05-13 09:03:11 +00:00
resolve number entity refs in unhtmlify
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@799 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
fc82bc9362
commit
11b662858a
1 changed file with 22 additions and 4 deletions
|
|
@ -20,11 +20,8 @@ import re, sys, htmlentitydefs
|
|||
# Non-greedy match of a single markup tag; DOTALL lets a tag span lines.
markup_re = re.compile("<.*?>", re.DOTALL)
# (name, replacement text) pairs of all known HTML entities
entities = htmlentitydefs.entitydefs.items()
# Translation tables as lists of (search, replace) pairs:
# HtmlTable maps replacement text to "&name;", UnHtmlTable the reverse.
# List comprehensions (instead of map) guarantee real lists, which the
# in-place sort()/reverse() calls below require.
HtmlTable = [(text, "&"+name+";") for name, text in entities]
UnHtmlTable = [("&"+name+";", text) for name, text in entities]
# order matters!
# NOTE(review): presumably because the pairs are applied one after the
# other by applyTable -- confirm against its implementation.
HtmlTable.sort()
UnHtmlTable.sort()
UnHtmlTable.reverse()
|
||||
# standard xml entities
|
||||
entities = {
|
||||
'lt': '<',
|
||||
|
|
@ -118,8 +115,29 @@ def htmlify (s):
|
|||
return applyTable(HtmlTable, s)
|
||||
|
||||
|
||||
# Matches the body of a numeric character reference, i.e. the text between
# "&" and ";": either "#" plus decimal digits (group "num") or "#x" plus
# hexadecimal digits (group "hex").
is_charref = re.compile(r'(?i)#(?:x(?P<hex>[0-9a-f]+)|(?P<num>\d+))$').match

# Matches one complete entity or character reference; group 1 is the text
# between "&" and ";". Entity names may contain digits (frac12, sup2).
_entity_ref_re = re.compile(r'(?i)&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);')


def resolve_entity (mo):
    """Return the replacement text for one matched entity reference.

    mo is a match object whose group 1 holds the reference body without
    the surrounding "&" and ";" (for example "amp", "#65" or "#x41").

    Named entities are looked up in htmlentitydefs.entitydefs. Numeric
    character references are converted with chr() when the code point is
    in the range 0-255. Anything that cannot be resolved is returned
    exactly as it appeared in the input, so no text is lost.
    """
    ent = mo.group(1)
    entitydefs = htmlentitydefs.entitydefs
    # Entity names are case sensitive (&Auml; differs from &auml;), so try
    # the exact spelling first and only then fall back to lowercase to
    # tolerate sloppy markup such as "&AMP;".
    for name in (ent, ent.lower()):
        if name in entitydefs:
            return entitydefs[name]
    charref = is_charref(ent)
    if charref:
        # convert to number
        hexdigits = charref.group("hex")
        if hexdigits is not None:
            num = int(hexdigits, 16)
        else:
            num = int(charref.group("num"), 10)
        # check char range
        if 0 <= num <= 255:
            return chr(num)
    # unknown or out-of-range reference: leave the original text untouched
    return mo.group(0)


def unhtmlify (s):
    """Return s with HTML entity and character references resolved.

    Every "&name;", "&#NNN;" and "&#xHHH;" occurrence is passed to
    resolve_entity(); references that cannot be resolved stay unchanged.
    """
    return _entity_ref_re.sub(resolve_entity, s)
|
||||
|
||||
|
||||
def xmlify (s):
|
||||
|
|
|
|||
Loading…
Reference in a new issue