Fix HTML comment parsing

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1393 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2004-07-24 12:01:31 +00:00
parent 3749ff1982
commit ebd56d8fd3
6 changed files with 1979 additions and 1845 deletions

File diff suppressed because it is too large Load diff

View file

@ -199,29 +199,38 @@ RX_DATA [-a-zA-Z0-9_:]+
BEGIN(S_COMMENT);
}
<S_COMMENT>-*--[ ]*> {
<S_COMMENT>--[ ]*> {
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng-3);
SETLVAL;
BEGIN(INITIAL);
RETURN(T_COMMENT);
}
<S_COMMENT>-/-- {
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT>-/[^-] {
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT>--/[^- >] {
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT>--[ ]+/[^ >] {
UPDATE_COLUMN;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT>[^-]+ {
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT>-+[^ ->]+ {
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT>-+[ ]+[^>] {
UPDATE_LINE;
APPEND_TO_TMP(yyleng);
}
<S_COMMENT>. {
return T_WAIT;
}
@ -337,7 +346,7 @@ RX_DATA [-a-zA-Z0-9_:]+
/*********************** TAGSTART ************************/
<INITIAL><{RX_WHITE_SPACE}*/[A-Za-z] {
<INITIAL><{RX_WHITE_SPACE}*/[A-Za-z0-9] {
UPDATE_LINE;
yyextra->tmp_attrs = PyObject_CallObject(yyextra->list_dict, NULL);
if (yyextra->tmp_attrs==NULL) {

View file

@ -1,7 +1,7 @@
D = ../../bk-python/bk
diff:
diff -BurN . $(D)
diff -BurN -xCVS -x.cvsignore -x.pyc . $(D)
update:
cp -r $(D)/* .

View file

@ -42,15 +42,17 @@ parsetests = [
("""<br/>""", """<br>"""),
("""<a b="50%"><br>""", """<a b="50%"><br>"""),
# comments
("""<!---->""", """<!---->"""),
("""<!-- a - b -->< br>""", """<!-- a - b --><br>"""),
("""<!----->""", """<!----->"""),
("""<!------>""", """<!------>"""),
("""<!------->""", """<!------->"""),
("""<!---- >""", """<!----->"""),
("""<!-- -->""", """<!-- -->"""),
("""<!-- -- >""", """<!-- --->"""),
("""<!---->< 1>""", """<!----><1>"""),
("""<!-- a - b -->< 2>""", """<!-- a - b --><2>"""),
("""<!----->< 3>""", """<!-----><3>"""),
("""<!------>< 4>""", """<!------><4>"""),
("""<!------->< 5>""", """<!-------><5>"""),
("""<!---- >< 6>""", """<!----><6>"""),
("""<!-- -->< 7>""", """<!-- --><7>"""),
("""<!-- -- >< 8>""", """<!-- --><8>"""),
("""<!---- />-->""", """<!---- />-->"""),
("""<!-- a-2 -->< 9>""", """<!-- a-2 --><9>"""),
("""<!-- --- -->< 10>""", """<!-- --- --><10>"""),
# end tags
("""</a>""", """</a>"""),
("""</ a>""", """</a>"""),

View file

@ -10,10 +10,16 @@ class TestUrl (unittest.TestCase):
nurl = "http://server/cskin.zip"
self.assertEquals(bk.url.url_quote(bk.url.url_norm(url)), nurl)
def test_quoting (self):
url = "http://groups.google.com/groups?hl=en&lr=&ie=UTF-8&threadm=3845B54D.E546F9BD%40monmouth.com&rnum=2&prev=/groups%3Fq%3Dlogitech%2Bwingman%2Bextreme%2Bdigital%2B3d%26hl%3Den%26lr%3D%26ie%3DUTF-8%26selm%3D3845B54D.E546F9BD%2540monmouth.com%26rnum%3D2"
def test_norm (self):
url = "http://groups.google.com/groups?hl=en&lr&ie=UTF-8&threadm=3845B54D.E546F9BD%40monmouth.com&rnum=2&prev=/groups%3Fq%3Dlogitech%2Bwingman%2Bextreme%2Bdigital%2B3d%26hl%3Den%26lr%3D%26ie%3DUTF-8%26selm%3D3845B54D.E546F9BD%2540monmouth.com%26rnum%3D2"
nurl = url
self.assertEqual(bk.url.url_quote(bk.url.url_norm(url)), nurl)
self.assertEqual(bk.url.url_norm(url), nurl)
url = "http://redirect.alexa.com/redirect?http://www.offeroptimizer.com"
nurl = url
self.assertEqual(bk.url.url_norm(url), nurl)
url = "http://www.lesgensducinema.com/photo/Philippe%20Nahon.jpg"
nurl = url
self.assertEqual(bk.url.url_norm(url), nurl)
def test_fixing (self):
url = r"http://groups.google.com\test.html"

View file

@ -77,8 +77,45 @@ def stripsite (url):
return url[1], urlparse.urlunsplit( (0,0,url[2],url[3],url[4]) )
def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
    """Parse a query string into a list of (name, value) pairs.

    Arguments:

    qs: URL-encoded query string to be parsed

    keep_blank_values: flag indicating whether blank values in
        URL-encoded queries should be treated as blank strings. A
        true value indicates that blanks should be retained as blank
        strings. The default false value indicates that blank values
        are to be ignored and treated as if they were not included.

    strict_parsing: flag indicating what to do with parsing errors.
        If false (the default), errors are silently ignored. If true,
        errors raise a ValueError exception.

    Returns a list of (name, value) string pairs.
    """
    # Both '&' and ';' are accepted as pair separators (HTML 4.01,
    # appendix B.2.2 recommends supporting ';' as well).
    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
    r = []
    for name_value in pairs:
        nv = name_value.split('=', 1)
        if len(nv) != 2:
            if strict_parsing:
                # repr() instead of the deprecated Python-2-only
                # backtick syntax; parenthesized raise instead of the
                # removed-in-Python-3 comma form. Behavior unchanged.
                raise ValueError("bad query field: %s" % repr(name_value))
            elif len(nv) == 1:
                # A bare name ("a" in "a&b=1") becomes a name with an
                # empty value so keep_blank_values can retain it.
                nv = (nv[0], "")
            else:
                continue
        if len(nv[1]) or keep_blank_values:
            # '+' encodes a space inside query strings; decode it
            # before unquoting the %XX escapes.
            name = urllib.unquote(nv[0].replace('+', ' '))
            value = urllib.unquote(nv[1].replace('+', ' '))
            r.append((name, value))
    return r
def url_norm (url):
"""unquote and normalize url which must be quoted"""
"""normalize url which must be quoted"""
urlparts = list(urlparse.urlsplit(url))
urlparts[0] = urllib.unquote(urlparts[0]) # scheme
urlparts[1] = urllib.unquote(urlparts[1]) # host
@ -96,7 +133,15 @@ def url_norm (url):
urlparts[1] = urlparts[1][:i]
else:
urlparts[2] = urllib.unquote(urlparts[2]) # path
urlparts[4] = urllib.unquote(urlparts[4]) # anchor
l = []
for k,v in parse_qsl(urlparts[3], True): # query
k = urllib.quote(k, '/-:,')
if v:
v = urllib.quote(v, '/-:,')
l.append("%s=%s" % (k, v))
else:
l.append(k)
urlparts[3] = '&'.join(l)
path = urlparts[2].replace('\\', '/').replace('//', '/')
if not path or path=='/':
urlparts[2] = '/'
@ -106,6 +151,10 @@ def url_norm (url):
urlparts[2] = os.path.normpath(path).replace('\\', '/')
if path.endswith('/'):
urlparts[2] += '/'
# quote parts again
urlparts[0] = urllib.quote(urlparts[0]) # scheme
urlparts[1] = urllib.quote(urlparts[1], ':') # host
urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
return urlparse.urlunsplit(urlparts)
@ -115,9 +164,15 @@ def url_quote (url):
urlparts[0] = urllib.quote(urlparts[0]) # scheme
urlparts[1] = urllib.quote(urlparts[1], ':') # host
urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
urlparts[3] = urllib.quote(urlparts[3], '&=,') # query
l = []
for k,v in cgi.parse_qsl(urlparts[3], True): # query
l.append("%s=%s" % (urllib.quote(k, '/-:,'), urllib.quote(v, '/-:,')))
for k,v in parse_qsl(urlparts[3], True): # query
k = urllib.quote(k, '/-:,')
if v:
v = urllib.quote(v, '/-:,')
l.append("%s=%s" % (k, v))
else:
l.append(k)
urlparts[3] = '&'.join(l)
urlparts[4] = urllib.quote(urlparts[4]) # anchor
return urlparse.urlunsplit(urlparts)