Added plugin for parsing and checking links in Markdown files

2026-05-23 21:55:49 +00:00 · 2014-11-11 15:35:18 +02:00 · 2014-11-11 15:35:18 +02:00 · d4352fc828
commit d4352fc828
parent 27937e6f83
5 changed files with 275 additions and 0 deletions
--- a/config/linkcheckerrc
+++ b/config/linkcheckerrc
@ -269,3 +269,11 @@
 # Parse and check links in Word files
 #[WordParser]

+# Parse and check links in Markdown files.
+# Supported links are:
+#        <http://autolink.com>
+#        [name](http://link.com "Optional title")
+#        [id]: http://link.com "Optional title"
+#[MarkdownCheck]
+# Regexp of filename
+#filename_re=.*\.(blog|markdown|md(own)?|mkdn?)$
--- a/linkcheck/plugins/markdowncheck.py
+++ b/linkcheck/plugins/markdowncheck.py
@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+#
+# Copyright © 2014 Vadym Khokhlov
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+"""
+Parse links in Markdown files.
+
+Supported links are:
+        <http://autolink.com>
+        [name](http://link.com "Optional title")
+        [id]: http://link.com "Optional title"
+"""
+
+# Some ideas and code were borrowed from https://pypi.python.org/pypi/markdown2 project
+
+import re
+
+from . import _ContentPlugin
+from .. import log, LOG_PLUGIN
+
+
+class MarkdownCheck(_ContentPlugin):
+    """Markdown parsing plugin."""
+
+    _filename_re_key = "filename_re"
+    _default_filename_re = re.compile(r'.*\.(markdown|md(own)?|mkdn?)$')
+
+    _link_res = [re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
+                 re.compile(r"""
+                    \[.+\]: # id
+                    [ \t]*\n?               # maybe *one* newline
+                    [ \t]*
+                    <?(.+?)>?           # url = \1
+                    [ \t]*
+                    (?:
+                    \n?             # maybe one newline
+                    [ \t]*
+                    (?<=\s)         # lookbehind for whitespace
+                    ['"(]
+                    [^\n]*          # title
+                    ['")]
+                    [ \t]*
+                    )?  # title is optional
+                    (?:\n+|\Z)
+                    """, re.X | re.M | re.U)]
+
+    _whitespace = re.compile(r'\s*')
+
+    _strip_anglebrackets = re.compile(r'<(.*)>.*')
+
+    _inline_link_title = re.compile(r'''
+            (                   # \1
+              [ \t]+
+              (['"])            # quote char
+              (.*?)
+            )?                  # title is optional
+          \)$
+        ''', re.X | re.S)
+
+    def __init__(self, config):
+        super(MarkdownCheck, self).__init__(config)
+        self.filename_re = self._default_filename_re
+        pattern = config.get(self._filename_re_key)
+        if pattern:
+            try:
+                self.filename_re = re.compile(pattern)
+            except re.error as msg:
+                log.warn(LOG_PLUGIN, "Invalid regex pattern %r: %s" % (pattern, msg))
+
+    @classmethod
+    def read_config(cls, configparser):
+        """Read configuration file options."""
+        config = dict()
+        config[cls._filename_re_key] = configparser.get(cls.__name__, cls._filename_re_key) \
+            if configparser.has_option(cls.__name__, cls._filename_re_key) else None
+        return config
+
+    def applies_to(self, url_data, pagetype=None):
+        """Check for Markdown file."""
+        return self.filename_re.search(url_data.base_url) is not None
+
+    def check(self, url_data):
+        """Extracts urls from the file."""
+        content = url_data.get_content()
+        self._check_by_re(url_data, content)
+        self._check_inline_links(url_data, content)
+
+    def _save_url(self, url_data, content, url_text, url_pos):
+        """Saves url. Converts url to 1-line text and url position as offset from the file beginning to (line, column).
+
+        :param url_data: object for url storing
+        :param content: file content
+        :param url_text: url text
+        :param url_pos: url position from the beginning
+        """
+        line = content.count('\n', 0, url_pos) + 1
+        column = url_pos - content.rfind('\n', 0, url_pos)
+        url_data.add_url(url_text.translate(None, '\n '), line=line, column=column)
+
+    def _check_by_re(self, url_data, content):
+        """ Finds urls by re.
+
+        :param url_data: object for url storing
+        :param content: file content
+        """
+        for link_re in self._link_res:
+            for u in link_re.finditer(content):
+                self._save_url(url_data, content, u.group(1), u.start(1))
+
+    def _find_balanced(self, text, start, open_c, close_c):
+        """Returns the index where the open_c and close_c characters balance
+        out - the same number of open_c and close_c are encountered - or the
+        end of string if it's reached before the balance point is found.
+        """
+        i = start
+        l = len(text)
+        count = 1
+        while count > 0 and i < l:
+            if text[i] == open_c:
+                count += 1
+            elif text[i] == close_c:
+                count -= 1
+            i += 1
+        return i
+
+    def _extract_url_and_title(self, text, start):
+        """Extracts the url from the tail of a link."""
+        # text[start] equals the opening parenthesis
+        idx = self._whitespace.match(text, start + 1).end()
+        if idx == len(text):
+            return None, None
+        end_idx = idx
+        has_anglebrackets = text[idx] == "<"
+        if has_anglebrackets:
+            end_idx = self._find_balanced(text, end_idx+1, "<", ">")
+        end_idx = self._find_balanced(text, end_idx, "(", ")")
+        match = self._inline_link_title.search(text, idx, end_idx)
+        if not match:
+            return None, None
+        url = text[idx:match.start()]
+        if has_anglebrackets:
+            url = self._strip_anglebrackets.sub(r'\1', url)
+        return url, end_idx
+
+    def _check_inline_links(self, url_data, content):
+        """Checks inline links.
+
+        :param url_data: url_data object
+        :param content: content for processing
+        """
+        MAX_LINK_TEXT_SENTINEL = 3000
+        curr_pos = 0
+        content_length = len(content)
+        while True:  # Handle the next link.
+            # The next '[' is the start of:
+            # - an inline anchor:   [text](url "title")
+            # - an inline img:      ![text](url "title")
+            # - not markup:         [...anything else...
+            try:
+                start_idx = content.index('[', curr_pos)
+            except ValueError:
+                break
+
+            # Find the matching closing ']'.
+            bracket_depth = 0
+            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, content_length)):
+                if content[p] == ']':
+                    bracket_depth -= 1
+                    if bracket_depth < 0:
+                        break
+                elif content[p] == '[':
+                    bracket_depth += 1
+            else:
+                # Closing bracket not found within sentinel length. This isn't markup.
+                curr_pos = start_idx + 1
+                continue
+
+            # Now determine what this is by the remainder.
+            p += 1
+            if p >= content_length:
+                return
+
+            if content[p] == '(':
+                url, url_end_idx = self._extract_url_and_title(content, p)
+                if url is not None:
+                    self._save_url(url_data, content, url, p)
+                    start_idx = url_end_idx
+
+            # Otherwise, it isn't markup.
+            curr_pos = start_idx + 1
--- a/tests/checker/data/file.markdown
+++ b/tests/checker/data/file.markdown
@ -0,0 +1,18 @@
+# Test #
+
+<http://www.url.com>  text <http://www.url2.com>
+
+[link](     http://www.urllink.com)
+
+[link2](http://www
+  .urllink2.com)
+
+[test][id1]
+[URL][id2]
+
+[id1]:       
+http://www.urldef1.com
+
+[id2]: http://www.urldef2.com "URL"
+
+![img](http://www.urlimg.com)
--- a/tests/checker/data/file.markdown.result
+++ b/tests/checker/data/file.markdown.result
@ -0,0 +1,42 @@
+url file://%(curdir)s/%(datadir)s/file.markdown
+cache key file://%(curdir)s/%(datadir)s/file.markdown
+real url file://%(curdir)s/%(datadir)s/file.markdown
+name %(datadir)s/file.markdown
+valid
+
+url http://www.url.com
+cache key http://www.url.com
+real url http://search.url.com/
+info Redirected to `http://search.url.com/'.
+valid
+
+url http://www.url2.com
+cache key http://www.url2.com
+real url http://www.url2.com
+valid
+
+url http://www.urldef1.com
+cache key http://www.urldef1.com
+real url http://www.urldef1.com
+error
+
+url http://www.urldef2.com
+cache key http://www.urldef2.com
+real url http://www.urldef2.com
+error
+
+url http://www.urllink.com
+cache key http://www.urllink.com
+real url http://www.urllink.com
+valid
+
+url http://www.urllink2.com
+cache key http://www.urllink2.com
+real url http://www.urllink2.com
+error
+
+url http://www.urlimg.com
+cache key http://www.urlimg.com
+real url http://www.urlimg.com
+valid
+
--- a/tests/checker/test_file.py
+++ b/tests/checker/test_file.py
@ -73,6 +73,10 @@ class TestFile (LinkCheckTest):
        confargs = dict(enabledplugins=["PdfParser"])
        self.file_test("file.pdf", confargs=confargs)

+    def test_markdown(self):
+        confargs = dict(enabledplugins=["MarkdownCheck"])
+        self.file_test("file.markdown", confargs=confargs)
+
    def test_urllist (self):
        self.file_test("urllist.txt")