mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-27 03:00:36 +00:00
105 lines
3 KiB
Python
105 lines
3 KiB
Python
import sys
|
|
import re
|
|
import csv
|
|
import requests
|
|
|
|
iana_uri_schemes = "https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml"
|
|
# CSV format: URI Scheme,Template,Description,Reference
|
|
csv_iana_uri_schemes_permanent = 'https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv'
|
|
csv_iana_uri_schemes_provisional = 'https://www.iana.org/assignments/uri-schemes/uri-schemes-2.csv'
|
|
csv_iana_uri_schemes_historical = 'https://www.iana.org/assignments/uri-schemes/uri-schemes-3.csv'
|
|
|
|
iana_uri_schemes_permanent = {}
|
|
iana_uri_schemes_provisional = {}
|
|
iana_uri_schemes_historical = {}
|
|
iana_uri_schemes_other = {
|
|
"clsid": "Microsoft specific",
|
|
"find" : "Mozilla specific",
|
|
"isbn" : "ISBN (int. book numbers)",
|
|
"javascript": "JavaScript",
|
|
}
|
|
|
|
filter_uri_schemes_permanent = (
|
|
"file",
|
|
"ftp",
|
|
"http",
|
|
"https",
|
|
"mailto",
|
|
"news",
|
|
"nntp",
|
|
)
|
|
|
|
template = '''
|
|
# from %(uri)s
|
|
ignored_schemes_permanent = r"""
|
|
%(permanent)s
|
|
"""
|
|
|
|
ignored_schemes_provisional = r"""
|
|
%(provisional)s
|
|
"""
|
|
|
|
ignored_schemes_historical = r"""
|
|
%(historical)s
|
|
"""
|
|
|
|
ignored_schemes_other = r"""
|
|
%(other)s
|
|
"""
|
|
|
|
ignored_schemes = "^(%%s%%s%%s%%s)$" %% (
|
|
ignored_schemes_permanent,
|
|
ignored_schemes_provisional,
|
|
ignored_schemes_historical,
|
|
ignored_schemes_other,
|
|
)
|
|
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
|
|
|
|
is_unknown_scheme = ignored_schemes_re.match
|
|
'''
|
|
|
|
def main(args):
|
|
parse_csv_file(csv_iana_uri_schemes_permanent, iana_uri_schemes_permanent)
|
|
parse_csv_file(csv_iana_uri_schemes_provisional, iana_uri_schemes_provisional)
|
|
parse_csv_file(csv_iana_uri_schemes_historical, iana_uri_schemes_historical)
|
|
for scheme in iana_uri_schemes_other:
|
|
if (scheme in iana_uri_schemes_permanent or
|
|
scheme in iana_uri_schemes_provisional or
|
|
scheme in iana_uri_schemes_historical):
|
|
raise ValueError(scheme)
|
|
for scheme in filter_uri_schemes_permanent:
|
|
if scheme in iana_uri_schemes_permanent:
|
|
del iana_uri_schemes_permanent[scheme]
|
|
args = dict(
|
|
uri = iana_uri_schemes,
|
|
permanent = get_regex(iana_uri_schemes_permanent),
|
|
provisional = get_regex(iana_uri_schemes_provisional),
|
|
historical = get_regex(iana_uri_schemes_historical),
|
|
other = get_regex(iana_uri_schemes_other),
|
|
)
|
|
res = template % args
|
|
print res
|
|
return 0
|
|
|
|
def get_regex(schemes):
|
|
expr = ["|%s # %s" % (re.escape(scheme).ljust(10), description)
|
|
for scheme, description in sorted(schemes.items())]
|
|
return "\n".join(expr)
|
|
|
|
|
|
def parse_csv_file(url, res):
|
|
"""Parse given URL and write res with {scheme -> description}"""
|
|
response = requests.get(url, stream=True)
|
|
reader = csv.reader(response.iter_lines())
|
|
first_row = True
|
|
for row in reader:
|
|
if first_row:
|
|
# skip first row
|
|
first_row = False
|
|
else:
|
|
scheme, template, description, reference = row
|
|
res[scheme] = description
|
|
|
|
if __name__ == '__main__':
|
|
sys.exit(main(sys.argv[1:]))
|
|
|