commit ebd56d8fd3 (parent 3749ff1982)

    fix html comments

    git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@1393 e7d03fd6-7b0d-0410-9947-9c21f3af8025

6 changed files with 1979 additions and 1845 deletions

File diff suppressed because it is too large.
@@ -199,29 +199,38 @@ RX_DATA [-a-zA-Z0-9_:]+
     BEGIN(S_COMMENT);
 }
 
-<S_COMMENT>-*--[ ]*> {
+<S_COMMENT>--[ ]*> {
     UPDATE_COLUMN;
     APPEND_TO_TMP(yyleng-3);
     SETLVAL;
     BEGIN(INITIAL);
     RETURN(T_COMMENT);
 }
 
+<S_COMMENT>-/-- {
+    UPDATE_COLUMN;
+    APPEND_TO_TMP(yyleng);
+}
+
+<S_COMMENT>-/[^-] {
+    UPDATE_COLUMN;
+    APPEND_TO_TMP(yyleng);
+}
+
+<S_COMMENT>--/[^- >] {
+    UPDATE_COLUMN;
+    APPEND_TO_TMP(yyleng);
+}
+
+<S_COMMENT>--[ ]+/[^ >] {
+    UPDATE_COLUMN;
+    APPEND_TO_TMP(yyleng);
+}
+
 <S_COMMENT>[^-]+ {
     UPDATE_LINE;
     APPEND_TO_TMP(yyleng);
 }
-
-<S_COMMENT>-+[^ ->]+ {
-    UPDATE_LINE;
-    APPEND_TO_TMP(yyleng);
-}
-
-<S_COMMENT>-+[ ]+[^>] {
-    UPDATE_LINE;
-    APPEND_TO_TMP(yyleng);
-}
 <S_COMMENT>. {
     return T_WAIT;
 }
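For orientation (an illustrative aside, not part of the commit): the rules above change comment termination so that single dashes and dash runs may appear inside the comment body, and the comment closes at the first "--" followed only by optional spaces and ">". A rough Python model of the behaviour the new parser tests further down expect; split_comment is a hypothetical helper, not the project's lexer:

    import re

    # Hypothetical model of the flex rules above, not project code:
    # a comment ends at the first "--" + optional spaces + ">".
    _COMMENT_END = re.compile(r"--[ ]*>")

    def split_comment(text):
        """Split text positioned just after "<!--" into (body, rest).

        Returns None while no terminator has been seen yet (the lexer
        returns T_WAIT and keeps scanning in that case).
        """
        match = _COMMENT_END.search(text)
        if match is None:
            return None
        return text[:match.start()], text[match.end():]

    # Mirrors the new parsetests cases below:
    assert split_comment("-- >< 6>") == ("", "< 6>")          # <!---- >
    assert split_comment(" -- >< 8>") == (" ", "< 8>")        # <!-- -- >
    assert split_comment(" --- -->< 10>") == (" --- ", "< 10>")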
@@ -337,7 +346,7 @@ RX_DATA [-a-zA-Z0-9_:]+
 
 
 /*********************** TAGSTART ************************/
-<INITIAL><{RX_WHITE_SPACE}*/[A-Za-z] {
+<INITIAL><{RX_WHITE_SPACE}*/[A-Za-z0-9] {
     UPDATE_LINE;
     yyextra->tmp_attrs = PyObject_CallObject(yyextra->list_dict, NULL);
     if (yyextra->tmp_attrs==NULL) {
@@ -1,7 +1,7 @@
 D = ../../bk-python/bk
 
 diff:
-	diff -BurN . $(D)
+	diff -BurN -xCVS -x.cvsignore -x.pyc . $(D)
 
 update:
 	cp -r $(D)/* .
@@ -42,15 +42,17 @@ parsetests = [
     ("""<br/>""", """<br>"""),
     ("""<a b="50%"><br>""", """<a b="50%"><br>"""),
     # comments
-    ("""<!---->""", """<!---->"""),
-    ("""<!-- a - b -->< br>""", """<!-- a - b --><br>"""),
-    ("""<!----->""", """<!----->"""),
-    ("""<!------>""", """<!------>"""),
-    ("""<!------->""", """<!------->"""),
-    ("""<!---- >""", """<!----->"""),
-    ("""<!-- -->""", """<!-- -->"""),
-    ("""<!-- -- >""", """<!-- --->"""),
+    ("""<!---->< 1>""", """<!----><1>"""),
+    ("""<!-- a - b -->< 2>""", """<!-- a - b --><2>"""),
+    ("""<!----->< 3>""", """<!-----><3>"""),
+    ("""<!------>< 4>""", """<!------><4>"""),
+    ("""<!------->< 5>""", """<!-------><5>"""),
+    ("""<!---- >< 6>""", """<!----><6>"""),
+    ("""<!-- -->< 7>""", """<!-- --><7>"""),
+    ("""<!-- -- >< 8>""", """<!-- --><8>"""),
+    ("""<!---- />-->""", """<!---- />-->"""),
+    ("""<!-- a-2 -->< 9>""", """<!-- a-2 --><9>"""),
+    ("""<!-- --- -->< 10>""", """<!-- --- --><10>"""),
     # end tags
     ("""</a>""", """</a>"""),
     ("""</ a>""", """</a>"""),
@@ -10,10 +10,16 @@ class TestUrl (unittest.TestCase):
         nurl = "http://server/cskin.zip"
         self.assertEquals(bk.url.url_quote(bk.url.url_norm(url)), nurl)
 
-    def test_quoting (self):
-        url = "http://groups.google.com/groups?hl=en&lr=&ie=UTF-8&threadm=3845B54D.E546F9BD%40monmouth.com&rnum=2&prev=/groups%3Fq%3Dlogitech%2Bwingman%2Bextreme%2Bdigital%2B3d%26hl%3Den%26lr%3D%26ie%3DUTF-8%26selm%3D3845B54D.E546F9BD%2540monmouth.com%26rnum%3D2"
+    def test_norm (self):
+        url = "http://groups.google.com/groups?hl=en&lr&ie=UTF-8&threadm=3845B54D.E546F9BD%40monmouth.com&rnum=2&prev=/groups%3Fq%3Dlogitech%2Bwingman%2Bextreme%2Bdigital%2B3d%26hl%3Den%26lr%3D%26ie%3DUTF-8%26selm%3D3845B54D.E546F9BD%2540monmouth.com%26rnum%3D2"
         nurl = url
-        self.assertEqual(bk.url.url_quote(bk.url.url_norm(url)), nurl)
+        self.assertEqual(bk.url.url_norm(url), nurl)
+        url = "http://redirect.alexa.com/redirect?http://www.offeroptimizer.com"
+        nurl = url
+        self.assertEqual(bk.url.url_norm(url), nurl)
+        url = "http://www.lesgensducinema.com/photo/Philippe%20Nahon.jpg"
+        nurl = url
+        self.assertEqual(bk.url.url_norm(url), nurl)
 
     def test_fixing (self):
         url = r"http://groups.google.com\test.html"

bk/url.py (63 changed lines)
@@ -77,8 +77,45 @@ def stripsite (url):
     return url[1], urlparse.urlunsplit( (0,0,url[2],url[3],url[4]) )
 
 
+def parse_qsl(qs, keep_blank_values=0, strict_parsing=0):
+    """Parse a query given as a string argument.
+
+    Arguments:
+
+    qs: URL-encoded query string to be parsed
+
+    keep_blank_values: flag indicating whether blank values in
+    URL encoded queries should be treated as blank strings. A
+    true value indicates that blanks should be retained as blank
+    strings. The default false value indicates that blank values
+    are to be ignored and treated as if they were not included.
+
+    strict_parsing: flag indicating what to do with parsing errors. If
+    false (the default), errors are silently ignored. If true,
+    errors raise a ValueError exception.
+
+    Returns a list, as G-d intended.
+    """
+    pairs = [s2 for s1 in qs.split('&') for s2 in s1.split(';')]
+    r = []
+    for name_value in pairs:
+        nv = name_value.split('=', 1)
+        if len(nv) != 2:
+            if strict_parsing:
+                raise ValueError, "bad query field: %s" % `name_value`
+            elif len(nv) == 1:
+                nv = (nv[0], "")
+            else:
+                continue
+        if len(nv[1]) or keep_blank_values:
+            name = urllib.unquote(nv[0].replace('+', ' '))
+            value = urllib.unquote(nv[1].replace('+', ' '))
+            r.append((name, value))
+    return r
+
+
 def url_norm (url):
-    """unquote and normalize url which must be quoted"""
+    """normalize url which must be quoted"""
     urlparts = list(urlparse.urlsplit(url))
     urlparts[0] = urllib.unquote(urlparts[0]) # scheme
     urlparts[1] = urllib.unquote(urlparts[1]) # host
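An illustrative aside (not part of the commit): the reason for carrying a local parse_qsl is the elif len(nv) == 1 branch, which keeps a query field without '=' as a blank value instead of dropping it, as a hypothetical session shows:

    >>> parse_qsl("hl=en&lr&ie=UTF-8", keep_blank_values=1)
    [('hl', 'en'), ('lr', ''), ('ie', 'UTF-8')]

url_norm's else branch below then re-emits such a field as a bare "lr" with no trailing '=', which is why the new test_norm URL above round-trips unchanged.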
@@ -96,7 +133,15 @@ def url_norm (url):
         urlparts[1] = urlparts[1][:i]
     else:
         urlparts[2] = urllib.unquote(urlparts[2]) # path
     urlparts[4] = urllib.unquote(urlparts[4]) # anchor
+    l = []
+    for k,v in parse_qsl(urlparts[3], True): # query
+        k = urllib.quote(k, '/-:,')
+        if v:
+            v = urllib.quote(v, '/-:,')
+            l.append("%s=%s" % (k, v))
+        else:
+            l.append(k)
+    urlparts[3] = '&'.join(l)
     path = urlparts[2].replace('\\', '/').replace('//', '/')
     if not path or path=='/':
         urlparts[2] = '/'
@@ -106,6 +151,10 @@ def url_norm (url):
     urlparts[2] = os.path.normpath(path).replace('\\', '/')
     if path.endswith('/'):
         urlparts[2] += '/'
+    # quote parts again
+    urlparts[0] = urllib.quote(urlparts[0]) # scheme
+    urlparts[1] = urllib.quote(urlparts[1], ':') # host
+    urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
     return urlparse.urlunsplit(urlparts)
 
 
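Net effect (again an aside): url_norm now unquotes scheme, host, path, and anchor, rebuilds the query through the local parse_qsl, collapses the path with os.path.normpath, and re-quotes the parts. An already-normalized URL is therefore a fixed point, which is exactly what the new test_norm cases above assert:

    >>> import bk.url
    >>> url = "http://redirect.alexa.com/redirect?http://www.offeroptimizer.com"
    >>> bk.url.url_norm(url) == url
    True
    >>> url = "http://www.lesgensducinema.com/photo/Philippe%20Nahon.jpg"
    >>> bk.url.url_norm(url) == url
    True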
@@ -115,9 +164,15 @@ def url_quote (url):
     urlparts[0] = urllib.quote(urlparts[0]) # scheme
     urlparts[1] = urllib.quote(urlparts[1], ':') # host
     urlparts[2] = urllib.quote(urlparts[2], '/=,') # path
-    urlparts[3] = urllib.quote(urlparts[3], '&=,') # query
     l = []
-    for k,v in cgi.parse_qsl(urlparts[3], True): # query
-        l.append("%s=%s" % (urllib.quote(k, '/-:,'), urllib.quote(v, '/-:,')))
+    for k,v in parse_qsl(urlparts[3], True): # query
+        k = urllib.quote(k, '/-:,')
+        if v:
+            v = urllib.quote(v, '/-:,')
+            l.append("%s=%s" % (k, v))
+        else:
+            l.append(k)
     urlparts[3] = '&'.join(l)
     urlparts[4] = urllib.quote(urlparts[4]) # anchor
     return urlparse.urlunsplit(urlparts)
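A final illustrative note (plain Python, not project code): the reworked query loop in url_quote is the same one url_norm gained above, and its one observable difference from the old one-liner is that blank values no longer grow a trailing '='. A self-contained sketch of the two join behaviours, using a made-up pair list:

    import urllib

    pairs = [("hl", "en"), ("lr", ""), ("ie", "UTF-8")]

    # old behaviour: always emit "key=value", so a blank value becomes "lr="
    old = "&".join(["%s=%s" % (urllib.quote(k, '/-:,'), urllib.quote(v, '/-:,'))
                    for k, v in pairs])

    # new behaviour: emit a bare key when the value is blank
    l = []
    for k, v in pairs:
        k = urllib.quote(k, '/-:,')
        if v:
            l.append("%s=%s" % (k, urllib.quote(v, '/-:,')))
        else:
            l.append(k)
    new = "&".join(l)

    assert old == "hl=en&lr=&ie=UTF-8"
    assert new == "hl=en&lr&ie=UTF-8"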