resolve entities

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1202 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-01-28 22:48:50 +00:00
parent 24f05f9869
commit 66ecc466b7
6 changed files with 253 additions and 274 deletions

View file

@ -16,3 +16,60 @@
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
__version__ = "$Revision$"[11:-2]
__date__ = "$Date$"[7:-2]
import re, htmlentitydefs
def _resolve_entity (mo):
    """Resolve one numeric character reference (&#DDD; or &#xHHH;).

    mo is the match object passed in by re.sub() from resolve_entities();
    its named group "hex" holds the digits of a hexadecimal reference and
    "num" those of a decimal one (exactly one of them is set).
    Returns the referenced character if it lies in the 7-bit ASCII range,
    otherwise the matched text unchanged.
    """
    hexdigits = mo.group("hex")
    if hexdigits is not None:
        num = int(hexdigits, 16)
    else:
        num = int(mo.group("num"), 10)
    # check 7-bit ASCII char range
    if 0 <= num <= 127:
        return chr(num)
    # not in range, leave the reference untouched
    return mo.group()


def resolve_entities (s):
    """Resolve numeric entities in 7-bit ASCII range to eliminate obfuscation.

    Both decimal (&#97;) and hexadecimal (&#x61; or &#X61;) references are
    handled. References outside of the ASCII range and malformed ones
    (e.g. &#1a;) are left as they are.
    """
    # The radix is decided by the pattern itself: hex digits are only
    # accepted after an "x", so int() can never see an invalid literal.
    pattern = r'(?i)&#(?:x(?P<hex>[0-9a-f]+)|(?P<num>\d+));'
    return re.sub(pattern, _resolve_entity, s)
entities = htmlentitydefs.entitydefs.items()
# Table of ("&name;", replacement) pairs used to resolve named entities.
# order matters! The table is applied pair by pair, so "&amp;" has to be
# the very last entry: replacing it earlier would turn e.g. "&amp;Auml;"
# into "&Auml;", which a later pair would then resolve a second time.
UnHtmlTable = [("&"+name+";", value) for name, value in entities
               if name != "amp"]
UnHtmlTable.sort()
UnHtmlTable.reverse()
if "amp" in htmlentitydefs.entitydefs:
    UnHtmlTable.append(("&amp;", htmlentitydefs.entitydefs["amp"]))
def applyTable (table, s):
    """Apply a table of replacement pairs to the string s.

    Each entry of table is indexed as (old, new); the pairs are applied
    one after another in table order, each one to the result of the
    previous replacement. Returns the resulting string.
    """
    result = s
    for entry in table:
        old, new = entry[0], entry[1]
        result = result.replace(old, new)
    return result
def resolve_html_entities (s):
    """Return s with the replacement pairs of UnHtmlTable applied to it."""
    resolved = applyTable(UnHtmlTable, s)
    return resolved
def strip_quotes (s):
    """Remove one pair of enclosing single or double quotes, if present.

    The string is returned unchanged unless it both starts and ends with
    the same quote character.
    """
    for quote in ("'", '"'):
        if s.startswith(quote) and s.endswith(quote):
            return s[1:-1]
    return s
def _test ():
    """Print the result of resolving the decimal entity for 'a'."""
    entity = "&#%d;" % ord('a')
    print(resolve_entities(entity))
# run the self test when this file is executed as a script
if '__main__' == __name__:
    _test()

File diff suppressed because it is too large Load diff

View file

@ -124,61 +124,6 @@ void yyfree (void* ptr, void* yyscanner) {
}
#include "htmlparse.h"
/* Find out if the given HTML attribute val must be quoted.
   The string will be surrounded by double quotes if it is empty, contains
   white space, <, > or a single quote, or ends with /.
   All double quotes inside the string will be replaced with &quot;.
   val must be a Python String object.

   The reference to val held by the caller is consumed in every case.
   Returns either val itself or a new string object; returns NULL if one
   of the Python API calls failed.
*/
static PyObject* quote_string (PyObject* val) {
    int quote = 0;
    int replace = 0;
    int len = PyString_GET_SIZE(val);
    char* internal = PyString_AS_STRING(val);
    int i;
    PyObject* prefix;
    for (i=0; i<len; i++) {
        /* cast needed: passing a negative char value to isspace() is
           undefined behavior */
        if (isspace((unsigned char)internal[i]) ||
            internal[i]=='<' ||
            internal[i]=='>' ||
            internal[i]=='\'') {
            quote = 1;
        }
        else if (internal[i]=='"') {
            replace = 1;
        }
    }
    if (len==0) {
        /* it's an empty string */
        quote = 1;
    }
    else if (internal[len-1]=='/') {
        quote = 1;
    }
    if (replace) {
        PyObject* nval = PyObject_CallMethod(val, "replace", "ss", "\"", "&quot;");
        Py_DECREF(val);
        val = nval;
        if (val==NULL) {
            /* replace() failed */
            return NULL;
        }
    }
    if (quote==0) {
        return val;
    }
    /* quote suffix */
    if ((prefix = PyString_FromString("\""))==NULL) {
        /* release val here as well, it would be leaked otherwise */
        Py_DECREF(val);
        return NULL;
    }
    /* on failure PyString_Concat() releases val and sets it to NULL */
    PyString_Concat(&val, prefix);
    if (val==NULL) {
        Py_DECREF(prefix);
        return NULL;
    }
    /* quote prefix */
    PyString_ConcatAndDel(&prefix, val);
    if (prefix==NULL) {
        /* val has already been released by PyString_ConcatAndDel(),
           it must not be released a second time */
        return NULL;
    }
    return prefix;
}
%}
%option noyyalloc noyyrealloc noyyfree
@ -800,7 +745,8 @@ RX_DATA [-a-zA-Z0-9_:]+
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
if (yyextra->tmp_attrval==NULL) return T_ERROR;
if (PyDict_SetItem(yyextra->tmp_attrs,
yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;
@ -817,7 +763,8 @@ RX_DATA [-a-zA-Z0-9_:]+
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
if (yyextra->tmp_attrval==NULL) return T_ERROR;
if (PyDict_SetItem(yyextra->tmp_attrs,
yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;
@ -845,7 +792,8 @@ RX_DATA [-a-zA-Z0-9_:]+
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
if (yyextra->tmp_attrval==NULL) return T_ERROR;
if (PyDict_SetItem(yyextra->tmp_attrs,
yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;
@ -862,7 +810,8 @@ RX_DATA [-a-zA-Z0-9_:]+
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
if (yyextra->tmp_attrval==NULL) return T_ERROR;
if (PyDict_SetItem(yyextra->tmp_attrs,
yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;
@ -879,7 +828,8 @@ RX_DATA [-a-zA-Z0-9_:]+
UPDATE_LINE;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
if (yyextra->tmp_attrval==NULL) return T_ERROR;
if (PyDict_SetItem(yyextra->tmp_attrs,
yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;
@ -894,8 +844,8 @@ RX_DATA [-a-zA-Z0-9_:]+
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
if (!yyextra->tmp_attrval) return T_ERROR;
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
if (yyextra->tmp_attrval==NULL) return T_ERROR;
if (PyDict_SetItem(yyextra->tmp_attrs,
yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;
@ -917,8 +867,8 @@ RX_DATA [-a-zA-Z0-9_:]+
UPDATE_COLUMN;
PYSTRING_TMP(yyextra->tmp_attrval);
RESIZE_BUF(yyextra->tmp_buf, 1);
yyextra->tmp_attrval = quote_string(yyextra->tmp_attrval);
if (!yyextra->tmp_attrval) { return T_ERROR; }
yyextra->tmp_attrval = PyObject_CallFunction(yyextra->resolve_entities, "O", yyextra->tmp_attrval);
if (yyextra->tmp_attrval==NULL) return T_ERROR;
if (PyDict_SetItem(yyextra->tmp_attrs,
yyextra->tmp_attrname,
yyextra->tmp_attrval)==-1) return T_ERROR;

View file

@ -130,6 +130,9 @@ static int yyerror (char* msg) {
return 0;
}
/* parser.resolve_entities */
static PyObject* resolve_entities;
/* macros for easier scanner state manipulation */
/* test whether tag does not need an HTML end tag */
@ -228,7 +231,7 @@ typedef int YYSTYPE;
/* Line 214 of yacc.c. */
#line 232 "htmlparse.c"
#line 235 "htmlparse.c"
#if ! defined (yyoverflow) || YYERROR_VERBOSE
@ -398,8 +401,8 @@ static const yysigned_char yyrhs[] =
/* YYRLINE[YYN] -- source line where rule number YYN was defined. */
static const unsigned short yyrline[] =
{
0, 143, 143, 144, 147, 148, 155, 189, 235, 265,
285, 305, 325, 345, 366, 387
0, 146, 146, 147, 150, 151, 158, 192, 238, 268,
288, 308, 328, 348, 369, 390
};
#endif
@ -1104,22 +1107,22 @@ yyreduce:
switch (yyn)
{
case 2:
#line 143 "htmlparse.y"
#line 146 "htmlparse.y"
{;}
break;
case 3:
#line 144 "htmlparse.y"
#line 147 "htmlparse.y"
{;}
break;
case 4:
#line 147 "htmlparse.y"
#line 150 "htmlparse.y"
{ YYACCEPT; /* wait for more lexer input */ ;}
break;
case 5:
#line 149 "htmlparse.y"
#line 152 "htmlparse.y"
{
/* an error occurred in the scanner, the python exception must be set */
UserData* ud = yyget_extra(scanner);
@ -1129,7 +1132,7 @@ yyreduce:
break;
case 6:
#line 156 "htmlparse.y"
#line 159 "htmlparse.y"
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
UserData* ud = yyget_extra(scanner);
@ -1166,7 +1169,7 @@ finish_start:
break;
case 7:
#line 190 "htmlparse.y"
#line 193 "htmlparse.y"
{
/* $1 is a tuple (<tag>, <attrs>); <attrs> is a dictionary */
UserData* ud = yyget_extra(scanner);
@ -1215,7 +1218,7 @@ finish_start_end:
break;
case 8:
#line 236 "htmlparse.y"
#line 239 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1248,7 +1251,7 @@ finish_end:
break;
case 9:
#line 266 "htmlparse.y"
#line 269 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1271,7 +1274,7 @@ finish_comment:
break;
case 10:
#line 286 "htmlparse.y"
#line 289 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1294,7 +1297,7 @@ finish_pi:
break;
case 11:
#line 306 "htmlparse.y"
#line 309 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1317,7 +1320,7 @@ finish_cdata:
break;
case 12:
#line 326 "htmlparse.y"
#line 329 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1340,7 +1343,7 @@ finish_doctype:
break;
case 13:
#line 346 "htmlparse.y"
#line 349 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1364,7 +1367,7 @@ finish_script:
break;
case 14:
#line 367 "htmlparse.y"
#line 370 "htmlparse.y"
{
UserData* ud = yyget_extra(scanner);
PyObject* callback = NULL;
@ -1388,7 +1391,7 @@ finish_style:
break;
case 15:
#line 388 "htmlparse.y"
#line 391 "htmlparse.y"
{
/* Remember this is also called as a lexer error fallback */
UserData* ud = yyget_extra(scanner);
@ -1415,7 +1418,7 @@ finish_characters:
}
/* Line 999 of yacc.c. */
#line 1419 "htmlparse.c"
#line 1422 "htmlparse.c"
yyvsp -= yylen;
yyssp -= yylen;
@ -1609,7 +1612,7 @@ yyreturn:
}
#line 410 "htmlparse.y"
#line 413 "htmlparse.y"
/* disable python memory interface */
@ -1648,6 +1651,7 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
self->userData->tmp_tag = self->userData->tmp_attrname =
self->userData->tmp_attrval = self->userData->tmp_attrs =
self->userData->lexbuf = NULL;
self->userData->resolve_entities = resolve_entities;
self->userData->exc_type = NULL;
self->userData->exc_val = NULL;
self->userData->exc_tb = NULL;
@ -1989,5 +1993,11 @@ PyMODINIT_FUNC inithtmlsax (void) {
/* init error */
PyErr_Print();
}
if ((m = PyImport_ImportModule("linkcheck.parser"))==NULL) {
return;
}
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities"))==NULL) {
return;
}
}

View file

@ -44,6 +44,9 @@ static int yyerror (char* msg) {
return 0;
}
/* parser.resolve_entities */
static PyObject* resolve_entities;
/* macros for easier scanner state manipulation */
/* test whether tag does not need an HTML end tag */
@ -445,6 +448,7 @@ static PyObject* parser_new (PyTypeObject* type, PyObject* args, PyObject* kwds)
self->userData->tmp_tag = self->userData->tmp_attrname =
self->userData->tmp_attrval = self->userData->tmp_attrs =
self->userData->lexbuf = NULL;
self->userData->resolve_entities = resolve_entities;
self->userData->exc_type = NULL;
self->userData->exc_val = NULL;
self->userData->exc_tb = NULL;
@ -786,4 +790,10 @@ PyMODINIT_FUNC inithtmlsax (void) {
/* init error */
PyErr_Print();
}
if ((m = PyImport_ImportModule("linkcheck.parser"))==NULL) {
return;
}
if ((resolve_entities = PyObject_GetAttrString(m, "resolve_entities"))==NULL) {
return;
}
}

View file

@ -69,6 +69,8 @@ typedef struct {
PyObject* tmp_attrval;
/* temporary HTML start tag attribute list */
PyObject* tmp_attrs;
/* parser.resolve_entities */
PyObject* resolve_entities;
/* stored Python exception (if error occurred in scanner) */
PyObject* exc_type;
PyObject* exc_val;