linkchecker/scripts/update_iana_uri_schemes.py
Chris Mayo cf7aff2770 Fix remaining flake8 violations in doc/ and scripts/
doc/web/hooks/__hooks__.py:31:57: E999 SyntaxError: invalid token
scripts/analyze_memdump.py:42:12: E999 SyntaxError: invalid syntax
scripts/analyze_memdump.py:85:89: E501 line too long (105 > 88 characters)
scripts/analyze_memdump.py:132:89: E501 line too long (97 > 88 characters)
scripts/update_iana_uri_schemes.py:90:13: E999 SyntaxError: invalid syntax
2020-05-26 20:20:57 +01:00

117 lines
3 KiB
Python

import sys
import re
import csv
import requests
iana_uri_schemes = "https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml"
# CSV format: URI Scheme,Template,Description,Reference
csv_iana_uri_schemes_permanent = (
"https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv"
)
csv_iana_uri_schemes_provisional = (
"https://www.iana.org/assignments/uri-schemes/uri-schemes-2.csv"
)
csv_iana_uri_schemes_historical = (
"https://www.iana.org/assignments/uri-schemes/uri-schemes-3.csv"
)
iana_uri_schemes_permanent = {}
iana_uri_schemes_provisional = {}
iana_uri_schemes_historical = {}
iana_uri_schemes_other = {
"clsid": "Microsoft specific",
"find": "Mozilla specific",
"isbn": "ISBN (int. book numbers)",
"javascript": "JavaScript",
}
filter_uri_schemes_permanent = (
"file",
"ftp",
"http",
"https",
"mailto",
"news",
"nntp",
)
template = '''
# from %(uri)s
ignored_schemes_permanent = r"""
%(permanent)s
"""
ignored_schemes_provisional = r"""
%(provisional)s
"""
ignored_schemes_historical = r"""
%(historical)s
"""
ignored_schemes_other = r"""
%(other)s
"""
ignored_schemes = "^(%%s%%s%%s%%s)$" %% (
ignored_schemes_permanent,
ignored_schemes_provisional,
ignored_schemes_historical,
ignored_schemes_other,
)
ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE)
is_unknown_scheme = ignored_schemes_re.match
'''
def main(args):
parse_csv_file(csv_iana_uri_schemes_permanent, iana_uri_schemes_permanent)
parse_csv_file(csv_iana_uri_schemes_provisional, iana_uri_schemes_provisional)
parse_csv_file(csv_iana_uri_schemes_historical, iana_uri_schemes_historical)
for scheme in iana_uri_schemes_other:
if (
scheme in iana_uri_schemes_permanent
or scheme in iana_uri_schemes_provisional
or scheme in iana_uri_schemes_historical
):
raise ValueError(scheme)
for scheme in filter_uri_schemes_permanent:
if scheme in iana_uri_schemes_permanent:
del iana_uri_schemes_permanent[scheme]
args = dict(
uri=iana_uri_schemes,
permanent=get_regex(iana_uri_schemes_permanent),
provisional=get_regex(iana_uri_schemes_provisional),
historical=get_regex(iana_uri_schemes_historical),
other=get_regex(iana_uri_schemes_other),
)
res = template % args
print(res)
return 0
def get_regex(schemes):
expr = [
"|%s # %s" % (re.escape(scheme).ljust(10), description)
for scheme, description in sorted(schemes.items())
]
return "\n".join(expr)
def parse_csv_file(url, res):
"""Parse given URL and write res with {scheme -> description}"""
response = requests.get(url, stream=True)
reader = csv.reader(response.iter_lines())
first_row = True
for row in reader:
if first_row:
# skip first row
first_row = False
else:
scheme, template, description, reference = row
res[scheme] = description
if __name__ == "__main__":
sys.exit(main(sys.argv[1:]))