mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-22 15:14:44 +00:00
resolve entities
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1202 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
24f05f9869
commit
66ecc466b7
6 changed files with 253 additions and 274 deletions
|
|
@ -16,3 +16,60 @@
|
|||
# along with this program; if not, write to the Free Software
|
||||
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
||||
|
||||
__version__ = "$Revision$"[11:-2]
|
||||
__date__ = "$Date$"[7:-2]
|
||||
|
||||
import re, htmlentitydefs
|
||||
|
||||
def _resolve_entity (mo):
    """Resolve one numeric character reference.

    mo is a match object whose complete match is a decimal (&#NNN;) or
    hexadecimal (&#xHHH; or &#XHHH;) character reference.
    Returns the referenced character if its code lies in the 7-bit ASCII
    range, otherwise the matched text unchanged.
    """
    ent = mo.group()
    # strip the leading "&#" and the trailing ";"
    digits = ent[2:-1]
    # the pattern ignores case, so the hex marker can be "x" or "X"
    if digits[:1] in ('x', 'X'):
        num = int(digits[1:], 16)
    else:
        num = int(digits, 10)
    # check 7-bit ASCII char range
    if 0 <= num <= 127:
        return chr(num)
    # not in range
    return ent


def resolve_entities (s):
    """Resolve numeric entities in 7-bit ASCII range to eliminate obfuscation.

    Both decimal (&#106;) and hexadecimal (&#x6a;, &#X6A;) references are
    recognized. References outside the ASCII range and malformed references
    (for example hex digits without the x marker) are left untouched.
    """
    # hex digits are only accepted after an explicit x marker
    return re.sub(r'(?i)&#(?:x[0-9a-f]+|[0-9]+);', _resolve_entity, s)
|
||||
|
||||
# (name, replacement) pairs of all entities defined by htmlentitydefs
entities = htmlentitydefs.entitydefs.items()

# turn each entity name into its "&name;" form, keeping the replacement
UnHtmlTable = map(lambda item: ("&%s;" % item[0], item[1]), entities)
# order matters! the table is kept in descending sort order
UnHtmlTable.sort()
UnHtmlTable.reverse()
|
||||
|
||||
def applyTable (table, s):
    """Apply a table of replacement pairs to the string s.

    Every table entry is indexed as (old, new). The entries are applied
    in table order, each one on the result of the previous replacement.
    Returns the resulting string.
    """
    result = s
    for entry in table:
        old = entry[0]
        new = entry[1]
        result = result.replace(old, new)
    return result
|
||||
|
||||
|
||||
def resolve_html_entities (s):
    """Replace every named entity listed in UnHtmlTable found in s."""
    resolved = applyTable(UnHtmlTable, s)
    return resolved
|
||||
|
||||
|
||||
def strip_quotes (s):
    """Remove one pair of enclosing single or double quotes, if present.

    The string is returned unchanged unless it both starts and ends with
    the same quote character.
    """
    for quote in ("'", '"'):
        if s.startswith(quote) and s.endswith(quote):
            return s[1:-1]
    return s
|
||||
|
||||
|
||||
def _test ():
    """Print the result of resolving the decimal entity for 'a'."""
    entity = "&#%d;" % ord('a')
    print(resolve_entities(entity))


# run the self test when executed as a script
if __name__ == '__main__':
    _test()
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -124,61 +124,6 @@ void yyfree (void* ptr, void* yyscanner) {
|
|||
}
|
||||
|
||||
#include "htmlparse.h"
|
||||
|
||||
/* Find out if the given HTML attribute val must be quoted.
|
||||
The string will be surrounded by double quotes if it contains white space
|
||||
or <> or ends with /.
|
||||
All double quotes inside the string will be replaced with ".
|
||||
val must be a Python String object
|
||||
*/
|
||||
static PyObject* quote_string (PyObject* val) {
|
||||
int quote = 0;
|
||||
int replace = 0;
|
||||
int len = PyString_GET_SIZE(val);
|
||||
char* internal = PyString_AS_STRING(val);
|
||||
int i;
|
||||
PyObject* prefix;
|
||||
for (i=0; i<len; i++) {
|
||||
if (isspace(internal[i]) ||
|
||||
internal[i]=='<' ||
|
||||
internal[i]=='>' ||
|
||||
internal[i]=='\'') {
|
||||
quote = 1;
|
||||
}
|
||||
else if (internal[i]=='"') {
|
||||
replace = 1;
|
||||
}
|
||||
}
|
||||
if (len==0) {
|
||||
/* its an empty string */
|
||||
quote = 1;
|
||||
}
|
||||
else if (internal[len-1]=='/') {
|
||||
quote = 1;
|
||||
}
|
||||
if (replace) {
|
||||
PyObject* nval = PyObject_CallMethod(val, "replace", "ss", "\"", """);
|
||||
Py_DECREF(val);
|
||||
val = nval;
|
||||
}
|
||||
if (quote==0) {
|
||||
return val;
|
||||
}
|
||||
/* quote suffix */
|
||||
if ((prefix = PyString_FromString("\""))==NULL) return NULL;
|
||||
PyString_Concat(&val, prefix);
|
||||
if (val==NULL) {
|
||||
Py_DECREF(prefix);
|
||||
return NULL;
|
||||
}
|
||||
/* quote prefix */
|
||||
PyString_ConcatAndDel(&prefix, val);
|
||||
if (prefix==NULL) {
|
||||
Py_DECREF(val);
|
||||
return NULL;
|
||||
}
|
||||
return prefix;
|
||||
}
|
||||
%}
|
||||
|
||||
%option noyyalloc noyyrealloc noyyfree
|
||||
|
|
@ -800,7 +745,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
|
|
@ -817,7 +763,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
|
|
@ -845,7 +792,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
|
|
@ -862,7 +810,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
|
|
@ -879,7 +828,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_LINE;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
|
|
@ -894,8 +844,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
if (!yyextra->tmp_attrval) return T_ERROR;
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
|
|
@ -917,8 +867,8 @@ RX_DATA [-a-zA-Z0-9_:]+
|
|||
UPDATE_COLUMN;
|
||||
PYSTRING_TMP(yyextra->tmp_attrval);
|
||||
RESIZE_BUF(yyextra->tmp_buf, 1);
|
||||
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
|
||||
if (!yyextra->tmp_attrval) { return T_ERROR; }
|
||||
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
|
||||
if (yyextra->tmp_attrval==NULL) return T_ERROR;
|
||||
if (PyDict_SetItem(yyextra->tmp_attrs,
|
||||
yyextra->tmp_attrname,
|
||||
yyextra->tmp_attrval)==-1) return T_ERROR;
|
||||
|
|
|
|||
|
|
@ -130,6 +130,9 @@ static int yyerror (char* msg) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* parser.resolve_entities */
|
||||
static PyObject* resolve_entities;
|
||||
|
||||
/* macros for easier scanner state manipulation */
|
||||
|
||||
/* test whether tag does not need an HTML end tag */
|
||||
|
|
@ -228,7 +231,7 @@ typedef int YYSTYPE;
|
|||
|
||||
|
||||
/* Line 214 of yacc.c. */
|
||||
#line 232 "htmlparse.c"
|
||||
#line 235 "htmlparse.c"
|
||||
|
||||
#if ! defined (yyoverflow) || YYERROR_VERBOSE
|
||||
|
||||
|
|
@ -398,8 +401,8 @@ static const yysigned_char yyrhs[] =
|
|||
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
|
||||
static const unsigned short yyrline[] =
|
||||
{
|
||||
0, 143, 143, 144, 147, 148, 155, 189, 235, 265,
|
||||
285, 305, 325, 345, 366, 387
|
||||
0, 146, 146, 147, 150, 151, 158, 192, 238, 268,
|
||||
288, 308, 328, 348, 369, 390
|
||||
};
|
||||
#endif
|
||||
|
||||
|
|
@ -1104,22 +1107,22 @@ yyreduce:
|
|||
switch (yyn)
|
||||
{
|
||||
case 2:
|
||||
#line 143 "htmlparse.y"
|
||||
#line 146 "htmlparse.y"
|
||||
{;}
|
||||
break;
|
||||
|
||||
case 3:
|
||||
#line 144 "htmlparse.y"
|
||||
#line 147 "htmlparse.y"
|
||||
{;}
|
||||
break;
|
||||
|
||||
case 4:
|
||||
#line 147 "htmlparse.y"
|
||||
#line 150 "htmlparse.y"
|
||||
{ YYACCEPT; /* wait for more lexer input */ ;}
|
||||
break;
|
||||
|
||||
case 5:
|
||||
#line 149 "htmlparse.y"
|
||||
#line 152 "htmlparse.y"
|
||||
{
|
||||
/* an error occurred in the scanner, the python exception must be set */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1129,7 +1132,7 @@ yyreduce:
|
|||
break;
|
||||
|
||||
case 6:
|
||||
#line 156 "htmlparse.y"
|
||||
#line 159 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1166,7 +1169,7 @@ finish_start:
|
|||
break;
|
||||
|
||||
case 7:
|
||||
#line 190 "htmlparse.y"
|
||||
#line 193 "htmlparse.y"
|
||||
{
|
||||
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1215,7 +1218,7 @@ finish_start_end:
|
|||
break;
|
||||
|
||||
case 8:
|
||||
#line 236 "htmlparse.y"
|
||||
#line 239 "htmlparse.y"
|
||||
{
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1248,7 +1251,7 @@ finish_end:
|
|||
break;
|
||||
|
||||
case 9:
|
||||
#line 266 "htmlparse.y"
|
||||
#line 269 "htmlparse.y"
|
||||
{
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1271,7 +1274,7 @@ finish_comment:
|
|||
break;
|
||||
|
||||
case 10:
|
||||
#line 286 "htmlparse.y"
|
||||
#line 289 "htmlparse.y"
|
||||
{
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1294,7 +1297,7 @@ finish_pi:
|
|||
break;
|
||||
|
||||
case 11:
|
||||
#line 306 "htmlparse.y"
|
||||
#line 309 "htmlparse.y"
|
||||
{
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1317,7 +1320,7 @@ finish_cdata:
|
|||
break;
|
||||
|
||||
case 12:
|
||||
#line 326 "htmlparse.y"
|
||||
#line 329 "htmlparse.y"
|
||||
{
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1340,7 +1343,7 @@ finish_doctype:
|
|||
break;
|
||||
|
||||
case 13:
|
||||
#line 346 "htmlparse.y"
|
||||
#line 349 "htmlparse.y"
|
||||
{
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1364,7 +1367,7 @@ finish_script:
|
|||
break;
|
||||
|
||||
case 14:
|
||||
#line 367 "htmlparse.y"
|
||||
#line 370 "htmlparse.y"
|
||||
{
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
PyObject* callback = NULL;
|
||||
|
|
@ -1388,7 +1391,7 @@ finish_style:
|
|||
break;
|
||||
|
||||
case 15:
|
||||
#line 388 "htmlparse.y"
|
||||
#line 391 "htmlparse.y"
|
||||
{
|
||||
/* Remember this is also called as a lexer error fallback */
|
||||
UserData* ud = yyget_extra(scanner);
|
||||
|
|
@ -1415,7 +1418,7 @@ finish_characters:
|
|||
}
|
||||
|
||||
/* Line 999 of yacc.c. */
|
||||
#line 1419 "htmlparse.c"
|
||||
#line 1422 "htmlparse.c"
|
||||
|
||||
yyvsp -= yylen;
|
||||
yyssp -= yylen;
|
||||
|
|
@ -1609,7 +1612,7 @@ yyreturn:
|
|||
}
|
||||
|
||||
|
||||
#line 410 "htmlparse.y"
|
||||
#line 413 "htmlparse.y"
|
||||
|
||||
|
||||
/* disable python memory interface */
|
||||
|
|
@ -1648,6 +1651,7 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
self->userData->tmp_tag = self->userData->tmp_attrname =
|
||||
self->userData->tmp_attrval = self->userData->tmp_attrs =
|
||||
self->userData->lexbuf = NULL;
|
||||
self->userData->resolve_entities = resolve_entities;
|
||||
self->userData->exc_type = NULL;
|
||||
self->userData->exc_val = NULL;
|
||||
self->userData->exc_tb = NULL;
|
||||
|
|
@ -1989,5 +1993,11 @@ PyMODINIT_FUNC inithtmlsax (void) {
|
|||
/* init error */
|
||||
PyErr_Print();
|
||||
}
|
||||
if ((m = PyImport_ImportModule("linkcheck.parser"))==NULL) {
|
||||
return;
|
||||
}
|
||||
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities"))==NULL) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -44,6 +44,9 @@ static int yyerror (char* msg) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* parser.resolve_entities */
|
||||
static PyObject* resolve_entities;
|
||||
|
||||
/* macros for easier scanner state manipulation */
|
||||
|
||||
/* test whether tag does not need an HTML end tag */
|
||||
|
|
@ -445,6 +448,7 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
|
|||
self->userData->tmp_tag = self->userData->tmp_attrname =
|
||||
self->userData->tmp_attrval = self->userData->tmp_attrs =
|
||||
self->userData->lexbuf = NULL;
|
||||
self->userData->resolve_entities = resolve_entities;
|
||||
self->userData->exc_type = NULL;
|
||||
self->userData->exc_val = NULL;
|
||||
self->userData->exc_tb = NULL;
|
||||
|
|
@ -786,4 +790,10 @@ PyMODINIT_FUNC inithtmlsax (void) {
|
|||
/* init error */
|
||||
PyErr_Print();
|
||||
}
|
||||
if ((m = PyImport_ImportModule("linkcheck.parser"))==NULL) {
|
||||
return;
|
||||
}
|
||||
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities"))==NULL) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -69,6 +69,8 @@ typedef struct {
|
|||
PyObject* tmp_attrval;
|
||||
/* temporary HTML start tag attribute list */
|
||||
PyObject* tmp_attrs;
|
||||
/* parser.resolve_entities */
|
||||
PyObject* resolve_entities;
|
||||
/* stored Python exception (if error occurred in scanner) */
|
||||
PyObject* exc_type;
|
||||
PyObject* exc_val;
|
||||
|
|
|
|||
Loading…
Reference in a new issue