diff --git a/config/linkcheckerrc b/config/linkcheckerrc index 2dc4e360..e678ccfa 100644 --- a/config/linkcheckerrc +++ b/config/linkcheckerrc @@ -269,3 +269,11 @@ # Parse and check links in Word files #[WordParser] +# Parse and check links in Markdown files. +# Supported links are: +# +# [name](http://link.com "Optional title") +# [id]: http://link.com "Optional title" +#[MarkdownCheck] +# Regexp of filename +#filename_re=.*\.(blog|markdown|md(own)?|mkdn?)$ diff --git a/linkcheck/plugins/markdowncheck.py b/linkcheck/plugins/markdowncheck.py new file mode 100644 index 00000000..f628c7d0 --- /dev/null +++ b/linkcheck/plugins/markdowncheck.py @@ -0,0 +1,203 @@ +# -*- coding: utf-8 -*- +# +# Copyright © 2014 Vadym Khokhlov +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License along +# with this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. +""" +Parse links in Markdown files. + +Supported links are: + + [name](http://link.com "Optional title") + [id]: http://link.com "Optional title" +""" + +# Some ideas and code were borrowed from https://pypi.python.org/pypi/markdown2 project + +import re + +from . import _ContentPlugin +from .. import log, LOG_PLUGIN + + +class MarkdownCheck(_ContentPlugin): + """Markdown parsing plugin.""" + + _filename_re_key = "filename_re" + _default_filename_re = re.compile(r'.*\.(markdown|md(own)?|mkdn?)$') + + _link_res = [re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I), + re.compile(r""" + \[.+\]: # id + [ \t]*\n? # maybe *one* newline + [ \t]* + ? # url = \1 + [ \t]* + (?: + \n? # maybe one newline + [ \t]* + (?<=\s) # lookbehind for whitespace + ['"(] + [^\n]* # title + ['")] + [ \t]* + )? # title is optional + (?:\n+|\Z) + """, re.X | re.M | re.U)] + + _whitespace = re.compile(r'\s*') + + _strip_anglebrackets = re.compile(r'<(.*)>.*') + + _inline_link_title = re.compile(r''' + ( # \1 + [ \t]+ + (['"]) # quote char + (.*?) + )? # title is optional + \)$ + ''', re.X | re.S) + + def __init__(self, config): + super(MarkdownCheck, self).__init__(config) + self.filename_re = self._default_filename_re + pattern = config.get(self._filename_re_key) + if pattern: + try: + self.filename_re = re.compile(pattern) + except re.error as msg: + log.warn(LOG_PLUGIN, "Invalid regex pattern %r: %s" % (pattern, msg)) + + @classmethod + def read_config(cls, configparser): + """Read configuration file options.""" + config = dict() + config[cls._filename_re_key] = configparser.get(cls.__name__, cls._filename_re_key) \ + if configparser.has_option(cls.__name__, cls._filename_re_key) else None + return config + + def applies_to(self, url_data, pagetype=None): + """Check for Markdown file.""" + return self.filename_re.search(url_data.base_url) is not None + + def check(self, url_data): + """Extracts urls from the file.""" + content = url_data.get_content() + self._check_by_re(url_data, content) + self._check_inline_links(url_data, content) + + def _save_url(self, url_data, content, url_text, url_pos): + """Saves url. Converts url to 1-line text and url position as offset from the file beginning to (line, column). + + :param url_data: object for url storing + :param content: file content + :param url_text: url text + :param url_pos: url position from the beginning + """ + line = content.count('\n', 0, url_pos) + 1 + column = url_pos - content.rfind('\n', 0, url_pos) + url_data.add_url(url_text.translate(None, '\n '), line=line, column=column) + + def _check_by_re(self, url_data, content): + """ Finds urls by re. + + :param url_data: object for url storing + :param content: file content + """ + for link_re in self._link_res: + for u in link_re.finditer(content): + self._save_url(url_data, content, u.group(1), u.start(1)) + + def _find_balanced(self, text, start, open_c, close_c): + """Returns the index where the open_c and close_c characters balance + out - the same number of open_c and close_c are encountered - or the + end of string if it's reached before the balance point is found. + """ + i = start + l = len(text) + count = 1 + while count > 0 and i < l: + if text[i] == open_c: + count += 1 + elif text[i] == close_c: + count -= 1 + i += 1 + return i + + def _extract_url_and_title(self, text, start): + """Extracts the url from the tail of a link.""" + # text[start] equals the opening parenthesis + idx = self._whitespace.match(text, start + 1).end() + if idx == len(text): + return None, None + end_idx = idx + has_anglebrackets = text[idx] == "<" + if has_anglebrackets: + end_idx = self._find_balanced(text, end_idx+1, "<", ">") + end_idx = self._find_balanced(text, end_idx, "(", ")") + match = self._inline_link_title.search(text, idx, end_idx) + if not match: + return None, None + url = text[idx:match.start()] + if has_anglebrackets: + url = self._strip_anglebrackets.sub(r'\1', url) + return url, end_idx + + def _check_inline_links(self, url_data, content): + """Checks inline links. + + :param url_data: url_data object + :param content: content for processing + """ + MAX_LINK_TEXT_SENTINEL = 3000 + curr_pos = 0 + content_length = len(content) + while True: # Handle the next link. + # The next '[' is the start of: + # - an inline anchor: [text](url "title") + # - an inline img: ![text](url "title") + # - not markup: [...anything else... + try: + start_idx = content.index('[', curr_pos) + except ValueError: + break + + # Find the matching closing ']'. + bracket_depth = 0 + for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, content_length)): + if content[p] == ']': + bracket_depth -= 1 + if bracket_depth < 0: + break + elif content[p] == '[': + bracket_depth += 1 + else: + # Closing bracket not found within sentinel length. This isn't markup. + curr_pos = start_idx + 1 + continue + + # Now determine what this is by the remainder. + p += 1 + if p >= content_length: + return + + if content[p] == '(': + url, url_end_idx = self._extract_url_and_title(content, p) + if url is not None: + self._save_url(url_data, content, url, p) + start_idx = url_end_idx + + # Otherwise, it isn't markup. + curr_pos = start_idx + 1 diff --git a/tests/checker/data/file.markdown b/tests/checker/data/file.markdown new file mode 100644 index 00000000..a670701a --- /dev/null +++ b/tests/checker/data/file.markdown @@ -0,0 +1,18 @@ +# Test # + + text + +[link]( http://www.urllink.com) + +[link2](http://www + .urllink2.com) + +[test][id1] +[URL][id2] + +[id1]: +http://www.urldef1.com + +[id2]: http://www.urldef2.com "URL" + +![img](http://www.urlimg.com) diff --git a/tests/checker/data/file.markdown.result b/tests/checker/data/file.markdown.result new file mode 100644 index 00000000..56b5ac55 --- /dev/null +++ b/tests/checker/data/file.markdown.result @@ -0,0 +1,42 @@ +url file://%(curdir)s/%(datadir)s/file.markdown +cache key file://%(curdir)s/%(datadir)s/file.markdown +real url file://%(curdir)s/%(datadir)s/file.markdown +name %(datadir)s/file.markdown +valid + +url http://www.url.com +cache key http://www.url.com +real url http://search.url.com/ +info Redirected to `http://search.url.com/'. +valid + +url http://www.url2.com +cache key http://www.url2.com +real url http://www.url2.com +valid + +url http://www.urldef1.com +cache key http://www.urldef1.com +real url http://www.urldef1.com +error + +url http://www.urldef2.com +cache key http://www.urldef2.com +real url http://www.urldef2.com +error + +url http://www.urllink.com +cache key http://www.urllink.com +real url http://www.urllink.com +valid + +url http://www.urllink2.com +cache key http://www.urllink2.com +real url http://www.urllink2.com +error + +url http://www.urlimg.com +cache key http://www.urlimg.com +real url http://www.urlimg.com +valid + diff --git a/tests/checker/test_file.py b/tests/checker/test_file.py index 6580aaa8..b9d78d86 100644 --- a/tests/checker/test_file.py +++ b/tests/checker/test_file.py @@ -73,6 +73,10 @@ class TestFile (LinkCheckTest): confargs = dict(enabledplugins=["PdfParser"]) self.file_test("file.pdf", confargs=confargs) + def test_markdown(self): + confargs = dict(enabledplugins=["MarkdownCheck"]) + self.file_test("file.markdown", confargs=confargs) + def test_urllist (self): self.file_test("urllist.txt")