From ccd0d4ead7a17229f6284c383ee858aaa7259a18 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Wed, 12 Mar 2014 19:20:49 +0100 Subject: [PATCH] Updated the list of unknown or ignored URI schemes. --- doc/changelog.txt | 3 +- linkcheck/checker/unknownurl.py | 322 +++++++++++++++++++---------- scripts/removeafter.py | 19 ++ scripts/update_iana_uri_schemes.py | 105 ++++++++++ scripts/update_iana_uri_schemes.sh | 9 + 5 files changed, 345 insertions(+), 113 deletions(-) create mode 100755 scripts/removeafter.py create mode 100644 scripts/update_iana_uri_schemes.py create mode 100755 scripts/update_iana_uri_schemes.sh diff --git a/doc/changelog.txt b/doc/changelog.txt index 5be4c48c..246caf93 100644 --- a/doc/changelog.txt +++ b/doc/changelog.txt @@ -10,13 +10,14 @@ Changes: - checking: Ignored URLs are reported earlier now. - installation: Check requirement for Python requests >= 2.2.0. Closes: GH bug #478 +- checking: Updated the list of unknown or ignored URI schemes. Fixes: - checking: Fix internal errors in debug output. Closes: GH bug #472 - checking: Fix URL result caching. - checking: Fix assertion in external link checking. -- checking: Fix SSl errors on Windows. +- checking: Fix SSL errors on Windows. Closes: GH bug #471 diff --git a/linkcheck/checker/unknownurl.py b/linkcheck/checker/unknownurl.py index 34ed0b3f..dc628918 100644 --- a/linkcheck/checker/unknownurl.py +++ b/linkcheck/checker/unknownurl.py @@ -21,118 +21,6 @@ Handle uncheckable URLs. import re from . import urlbase -# from http://www.iana.org/assignments/uri-schemes.html -ignored_schemes_permanent = r""" -aaas? 
# Diameter Protocol -|about # about -|acap # application configuration access protocol -|cap # Calendar Access Protocol -|cid # content identifier -|crid # TV-Anytime Content Reference Identifier -|data # data -|dav # dav -|dict # dictionary service protocol -|geo # Geographic Locations -|go # go -|gopher # Gopher -|h323 # H.323 -|iax # Inter-Asterisk eXchange Version 2 -|icap # Internet Content Adaptation Protocol -|im # Instant Messaging -|imap # internet message access protocol -|info # Information Assets with Identifiers in Public Namespaces -|ipp # Internet Printing Protocol -|iris # Internet Registry Information Service -|iris\.(beep|xpcs?|lwz) # -|ldap # Lightweight Directory Access Protocol -|mid # message identifier -|msrps? # Message Session Relay Protocol -|mtqp # Message Tracking Query Protocol -|mupdate # Mailbox Update (MUPDATE) Protocol -|nfs # network file system protocol -|nih? # -|opaquelocktoken # opaquelocktoken -|pop # Post Office Protocol v3 -|pres # Presence -|rtsp # real time streaming protocol -|service # service location -|session # -|shttp # secure HTTP -|sieve # ManageSieve Protocol -|sips? # session initiation protocol -|sms # Short Message Service -|snmp # Simple Network Management Protocol -|soap\.beeps? # -|steam # Steam browser protocol -|tag # -|tel # telephone -|tftp # Trivial File Transfer Protocol -|thismessage # -|tip # Transaction Internet Protocol -|tn3270 # Interactive 3270 emulation sessions -|tv # TV Broadcasts -|urn # Uniform Resource Names -|vemmi # versatile multimedia interface -|wss? # WebSocket connections -|xcon(-userid)? # -|xmlrpc\.beeps? 
# -|xmpp # -|z39\.50r # Z39.50 Retrieval -|z39\.50s # Z39.50 Session -""" - -ignored_schemes_provisional = r""" -|afs # Andrew File System global file names -|callto # -|com-eventbrite-attendee # -|dlna-play(single|container) # -|dtn # DTNRG research and development -|dvb # -|hcp # -|icon # -|ipn # -|jms # Java Message Service -|mms # multimedia stream -|ms-help # -|msnim # -|oid # -|res # -|rsync # rsync protocol -|skype # Skype -|view-source # -""" - -ignored_schemes_historical = r""" -|fax # fax -|mailserver # Access to data available from mail servers -|modem # modem -|prospero # Prospero Directory Service -|videotex # -|wais # Wide Area Information Servers -|z39\.50 # Z39.50 information access -""" - -ignored_schemes_other = r""" -|chrome # Mozilla specific -|clsid # Microsoft specific -|feed # RSS or Atom feeds -|find # Mozilla specific -|isbn # ISBN (int. book numbers) -|ircs? # internet relay chat -|javascript # JavaScript -""" - - -ignored_schemes = "^(%s%s%s%s)$" % ( - ignored_schemes_permanent, - ignored_schemes_provisional, - ignored_schemes_historical, - ignored_schemes_other, -) -ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE) - -is_unknown_scheme = ignored_schemes_re.match - class UnknownUrl (urlbase.UrlBase): """Handle unknown or just plain broken URLs.""" @@ -159,3 +47,213 @@ class UnknownUrl (urlbase.UrlBase): @rtype: bool """ return False + + +# do not edit anything below since these entries are generated by scripts/update_iana_uri_schemes.sh +# DO NOT REMOVE + +# from https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml +ignored_schemes_permanent = r""" +|aaa # Diameter Protocol +|aaas # Diameter Protocol with Secure Transport +|about # about +|acap # application configuration access protocol +|acct # acct +|cap # Calendar Access Protocol +|cid # content identifier +|coap # coap +|coaps # coaps +|crid # TV-Anytime Content Reference Identifier +|data # data +|dav # dav +|dict # dictionary service protocol +|dns # Domain Name System +|geo 
# Geographic Locations +|go # go +|gopher # The Gopher Protocol +|h323 # H.323 +|iax # Inter-Asterisk eXchange Version 2 +|icap # Internet Content Adaptation Protocol +|im # Instant Messaging +|imap # internet message access protocol +|info # Information Assets with Identifiers in Public Namespaces +|ipp # Internet Printing Protocol +|iris # Internet Registry Information Service +|iris\.beep # iris.beep +|iris\.lwz # iris.lwz +|iris\.xpc # iris.xpc +|iris\.xpcs # iris.xpcs +|jabber # jabber +|ldap # Lightweight Directory Access Protocol +|mid # message identifier +|msrp # Message Session Relay Protocol +|msrps # Message Session Relay Protocol Secure +|mtqp # Message Tracking Query Protocol +|mupdate # Mailbox Update (MUPDATE) Protocol +|nfs # network file system protocol +|ni # ni +|nih # nih +|opaquelocktoken # opaquelocktokent +|pop # Post Office Protocol v3 +|pres # Presence +|reload # reload +|rtsp # Real-time Streaming Protocol (RTSP) +|rtsps # Real-time Streaming Protocol (RTSP) over TLS +|rtspu # Real-time Streaming Protocol (RTSP) over unreliable datagram transport +|service # service location +|session # session +|shttp # Secure Hypertext Transfer Protocol +|sieve # ManageSieve Protocol +|sip # session initiation protocol +|sips # secure session initiation protocol +|sms # Short Message Service +|snmp # Simple Network Management Protocol +|soap\.beep # soap.beep +|soap\.beeps # soap.beeps +|stun # stun +|stuns # stuns +|tag # tag +|tel # telephone +|telnet # Reference to interactive sessions +|tftp # Trivial File Transfer Protocol +|thismessage # multipart/related relative reference resolution +|tip # Transaction Internet Protocol +|tn3270 # Interactive 3270 emulation sessions +|turn # turn +|turns # turns +|tv # TV Broadcasts +|urn # Uniform Resource Names +|vemmi # versatile multimedia interface +|ws # WebSocket connections +|wss # Encrypted WebSocket connections +|xcon # xcon +|xcon\-userid # xcon-userid +|xmlrpc\.beep # xmlrpc.beep +|xmlrpc\.beeps # 
xmlrpc.beeps +|xmpp # Extensible Messaging and Presence Protocol +|z39\.50r # Z39.50 Retrieval +|z39\.50s # Z39.50 Session +""" + +ignored_schemes_provisional = r""" +|adiumxtra # adiumxtra +|afp # afp +|afs # Andrew File System global file names +|aim # aim +|apt # apt +|attachment # attachment +|aw # aw +|beshare # beshare +|bitcoin # bitcoin +|bolo # bolo +|callto # callto +|chrome # chrome +|chrome\-extension # chrome-extension +|com\-eventbrite\-attendee # com-eventbrite-attendee +|content # content +|cvs # cvs +|dlna\-playcontainer # dlna-playcontainer +|dlna\-playsingle # dlna-playsingle +|dtn # DTNRG research and development +|dvb # dvb +|ed2k # ed2k +|facetime # facetime +|feed # feed +|feedready # feedready +|finger # finger +|fish # fish +|gg # gg +|git # git +|gizmoproject # gizmoproject +|gtalk # gtalk +|ham # ham +|hcp # hcp +|icon # icon +|ipn # ipn +|irc # irc +|irc6 # irc6 +|ircs # ircs +|itms # itms +|jar # jar +|jms # Java Message Service +|keyparc # keyparc +|lastfm # lastfm +|ldaps # ldaps +|magnet # magnet +|maps # maps +|market # market +|message # message +|mms # mms +|ms\-help # ms-help +|ms\-settings\-power # ms-settings-power +|msnim # msnim +|mumble # mumble +|mvn # mvn +|notes # notes +|oid # oid +|palm # palm +|paparazzi # paparazzi +|pkcs11 # pkcs11 +|platform # platform +|proxy # proxy +|psyc # psyc +|query # query +|res # res +|resource # resource +|rmi # rmi +|rsync # rsync +|rtmp # rtmp +|secondlife # query +|sftp # query +|sgn # sgn +|skype # skype +|smb # smb +|soldat # soldat +|spotify # spotify +|ssh # ssh +|steam # steam +|svn # svn +|teamspeak # teamspeak +|things # things +|udp # udp +|unreal # unreal +|ut2004 # ut2004 +|ventrilo # ventrilo +|view\-source # view-source +|webcal # webcal +|wtai # wtai +|wyciwyg # wyciwyg +|xfire # xfire +|xri # xri +|ymsgr # ymsgr +""" + +ignored_schemes_historical = r""" +|fax # fax +|mailserver # Access to data available from mail servers +|modem # modem +|pack # pack +|prospero # Prospero 
Directory Service +|snews # NNTP over SSL/TLS +|videotex # videotex +|wais # Wide Area Information Servers +|z39\.50 # Z39.50 information access +""" + +ignored_schemes_other = r""" +|clsid # Microsoft specific +|find # Mozilla specific +|isbn # ISBN (int. book numbers) +|javascript # JavaScript +""" + +ignored_schemes = "^(%s%s%s%s)$" % ( + ignored_schemes_permanent, + ignored_schemes_provisional, + ignored_schemes_historical, + ignored_schemes_other, +) +ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE) + +is_unknown_scheme = ignored_schemes_re.match + diff --git a/scripts/removeafter.py b/scripts/removeafter.py new file mode 100755 index 00000000..1c0fab70 --- /dev/null +++ b/scripts/removeafter.py @@ -0,0 +1,19 @@ +#!/usr/bin/env python +# Copyright (C) 2012-2014 Bastian Kleineidam +"""Remove all lines after a given marker line. +""" +from __future__ import print_function +import fileinput +import sys + +def main(args): + """Remove lines after marker.""" + filename = args[0] + marker = args[1] + for line in fileinput.input(filename, inplace=1): + print(line.rstrip()) + if line.startswith(marker): + break + +if __name__ == '__main__': + main(sys.argv[1:]) diff --git a/scripts/update_iana_uri_schemes.py b/scripts/update_iana_uri_schemes.py new file mode 100644 index 00000000..57e21ab9 --- /dev/null +++ b/scripts/update_iana_uri_schemes.py @@ -0,0 +1,105 @@ +import sys +import re +import csv +import requests + +iana_uri_schemes = "https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml" +# CSV format: URI Scheme,Template,Description,Reference +csv_iana_uri_schemes_permanent = 'https://www.iana.org/assignments/uri-schemes/uri-schemes-1.csv' +csv_iana_uri_schemes_provisional = 'https://www.iana.org/assignments/uri-schemes/uri-schemes-2.csv' +csv_iana_uri_schemes_historical = 'https://www.iana.org/assignments/uri-schemes/uri-schemes-3.csv' + +iana_uri_schemes_permanent = {} +iana_uri_schemes_provisional = {} +iana_uri_schemes_historical = {} 
+iana_uri_schemes_other = { + "clsid": "Microsoft specific", + "find" : "Mozilla specific", + "isbn" : "ISBN (int. book numbers)", + "javascript": "JavaScript", +} + +filter_uri_schemes_permanent = ( + "file", + "ftp", + "http", + "https", + "mailto", + "news", + "nntp", +) + +template = ''' +# from %(uri)s +ignored_schemes_permanent = r""" +%(permanent)s +""" + +ignored_schemes_provisional = r""" +%(provisional)s +""" + +ignored_schemes_historical = r""" +%(historical)s +""" + +ignored_schemes_other = r""" +%(other)s +""" + +ignored_schemes = "^(%%s%%s%%s%%s)$" %% ( + ignored_schemes_permanent, + ignored_schemes_provisional, + ignored_schemes_historical, + ignored_schemes_other, +) +ignored_schemes_re = re.compile(ignored_schemes, re.VERBOSE) + +is_unknown_scheme = ignored_schemes_re.match +''' + +def main(args): + parse_csv_file(csv_iana_uri_schemes_permanent, iana_uri_schemes_permanent) + parse_csv_file(csv_iana_uri_schemes_provisional, iana_uri_schemes_provisional) + parse_csv_file(csv_iana_uri_schemes_historical, iana_uri_schemes_historical) + for scheme in iana_uri_schemes_other: + if (scheme in iana_uri_schemes_permanent or + scheme in iana_uri_schemes_provisional or + scheme in iana_uri_schemes_historical): + raise ValueError(scheme) + for scheme in filter_uri_schemes_permanent: + if scheme in iana_uri_schemes_permanent: + del iana_uri_schemes_permanent[scheme] + args = dict( + uri = iana_uri_schemes, + permanent = get_regex(iana_uri_schemes_permanent), + provisional = get_regex(iana_uri_schemes_provisional), + historical = get_regex(iana_uri_schemes_historical), + other = get_regex(iana_uri_schemes_other), + ) + res = template % args + print res + return 0 + +def get_regex(schemes): + expr = ["|%s # %s" % (re.escape(scheme).ljust(10), description) + for scheme, description in sorted(schemes.items())] + return "\n".join(expr) + + +def parse_csv_file(url, res): + """Parse given URL and write res with {scheme -> description}""" + response = requests.get(url, 
stream=True) + reader = csv.reader(response.iter_lines()) + first_row = True + for row in reader: + if first_row: + # skip first row + first_row = False + else: + scheme, template, description, reference = row + res[scheme] = description + +if __name__ == '__main__': + sys.exit(main(sys.argv[1:])) + diff --git a/scripts/update_iana_uri_schemes.sh b/scripts/update_iana_uri_schemes.sh new file mode 100755 index 00000000..088aeef3 --- /dev/null +++ b/scripts/update_iana_uri_schemes.sh @@ -0,0 +1,9 @@ +#!/bin/sh + +set -e +set -u + +target=linkcheck/checker/unknownurl.py + +python scripts/removeafter.py "$target" "# DO NOT REMOVE" +python scripts/update_iana_uri_schemes.py >> "$target"