mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-08 08:30:59 +00:00
Added plugin for parsing and checking links in Markdown files
This commit is contained in:
parent
27937e6f83
commit
d4352fc828
5 changed files with 275 additions and 0 deletions
|
|
@ -269,3 +269,11 @@
|
|||
# Parse and check links in Word files
|
||||
#[WordParser]
|
||||
|
||||
# Parse and check links in Markdown files.
|
||||
# Supported links are:
|
||||
# <http://autolink.com>
|
||||
# [name](http://link.com "Optional title")
|
||||
# [id]: http://link.com "Optional title"
|
||||
#[MarkdownCheck]
|
||||
# Regexp of filename
|
||||
#filename_re=.*\.(markdown|md(own)?|mkdn?)$
|
||||
|
|
|
|||
203
linkcheck/plugins/markdowncheck.py
Normal file
203
linkcheck/plugins/markdowncheck.py
Normal file
|
|
@ -0,0 +1,203 @@
|
|||
# -*- coding: utf-8 -*-
|
||||
#
|
||||
# Copyright © 2014 Vadym Khokhlov
|
||||
#
|
||||
# This program is free software; you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation; either version 2 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# This program is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License along
|
||||
# with this program; if not, write to the Free Software Foundation, Inc.,
|
||||
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||
"""
|
||||
Parse links in Markdown files.
|
||||
|
||||
Supported links are:
|
||||
<http://autolink.com>
|
||||
[name](http://link.com "Optional title")
|
||||
[id]: http://link.com "Optional title"
|
||||
"""
|
||||
|
||||
# Some ideas and code were borrowed from https://pypi.python.org/pypi/markdown2 project
|
||||
|
||||
import re
|
||||
|
||||
from . import _ContentPlugin
|
||||
from .. import log, LOG_PLUGIN
|
||||
|
||||
|
||||
class MarkdownCheck(_ContentPlugin):
    """Markdown parsing plugin.

    Scans the content of Markdown files for autolinks, reference
    definitions and inline links/images and registers every URL found
    on the url_data object via ``add_url``.
    """

    # Name of the configuration option that overrides the filename regexp.
    _filename_re_key = "filename_re"
    # Filename pattern used when the option is missing or invalid.
    _default_filename_re = re.compile(r'.*\.(markdown|md(own)?|mkdn?)$')

    # Patterns whose group 1 is the URL:
    #   1. autolinks:              <http://autolink.com>
    #   2. reference definitions:  [id]: http://link.com "Optional title"
    _link_res = [re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
                 re.compile(r"""
                    \[.+\]:         # id
                    [ \t]*\n?       # maybe *one* newline
                    [ \t]*
                    <?(.+?)>?       # url = \1
                    [ \t]*
                    (?:
                        \n?         # maybe one newline
                        [ \t]*
                        (?<=\s)     # lookbehind for whitespace
                        ['"(]
                        [^\n]*      # title
                        ['")]
                        [ \t]*
                    )?              # title is optional
                    (?:\n+|\Z)
                 """, re.X | re.M | re.U)]

    # Used to skip whitespace after the opening parenthesis of a link.
    _whitespace = re.compile(r'\s*')

    # Replaces "<url>anything" with "url".
    _strip_anglebrackets = re.compile(r'<(.*)>.*')

    # Matches the optional title and the closing parenthesis at the very
    # end of the searched region of an inline link.
    _inline_link_title = re.compile(r'''
            (                   # \1
              [ \t]+
              (['"])            # quote char
              (.*?)
            )?                  # title is optional
          \)$
        ''', re.X | re.S)

    def __init__(self, config):
        """Initialize the plugin and compile the filename regexp.

        :param config: plugin configuration mapping; the optional
            ``filename_re`` entry overrides the default filename pattern
        """
        super(MarkdownCheck, self).__init__(config)
        self.filename_re = self._default_filename_re
        pattern = config.get(self._filename_re_key)
        if pattern:
            try:
                self.filename_re = re.compile(pattern)
            except re.error as msg:
                # Keep the default pattern when the configured one is invalid.
                log.warn(LOG_PLUGIN, "Invalid regex pattern %r: %s" % (pattern, msg))

    @classmethod
    def read_config(cls, configparser):
        """Read configuration file options.

        :param configparser: parser object offering ``has_option`` and ``get``
        :return: dict with the ``filename_re`` value, or None if not set
        """
        section = cls.__name__
        key = cls._filename_re_key
        value = None
        if configparser.has_option(section, key):
            value = configparser.get(section, key)
        return {key: value}

    def applies_to(self, url_data, pagetype=None):
        """Check for Markdown file.

        :param url_data: object whose ``base_url`` is matched against the
            filename regexp
        :param pagetype: unused by this plugin
        :return: True if the filename regexp matches
        """
        return self.filename_re.search(url_data.base_url) is not None

    def check(self, url_data):
        """Extracts urls from the file.

        :param url_data: object providing the content and storing the urls
        """
        content = url_data.get_content()
        self._check_by_re(url_data, content)
        self._check_inline_links(url_data, content)

    def _save_url(self, url_data, content, url_text, url_pos):
        """Saves url.

        Converts url to 1-line text and url position as offset from the
        file beginning to (line, column).

        :param url_data: object for url storing
        :param content: file content
        :param url_text: url text
        :param url_pos: url position from the beginning
        """
        line = content.count('\n', 0, url_pos) + 1
        # rfind() yields -1 on the first line, which makes the column 1-based.
        column = url_pos - content.rfind('\n', 0, url_pos)
        # Drop newlines and spaces of urls wrapped over several lines.
        # str.replace() behaves the same for byte and unicode strings,
        # unlike the two-argument translate(None, chars) form.
        url = url_text.replace('\n', '').replace(' ', '')
        url_data.add_url(url, line=line, column=column)

    def _check_by_re(self, url_data, content):
        """Finds urls by re.

        :param url_data: object for url storing
        :param content: file content
        """
        for link_re in self._link_res:
            for match in link_re.finditer(content):
                self._save_url(url_data, content, match.group(1), match.start(1))

    def _find_balanced(self, text, start, open_c, close_c):
        """Returns the index where the open_c and close_c characters balance
        out - the same number of open_c and close_c are encountered - or the
        end of string if it's reached before the balance point is found.

        One open_c is assumed to precede ``start``. The returned index is
        the position just after the balancing close_c.

        :param text: text to scan
        :param start: index at which scanning begins
        :param open_c: opening character
        :param close_c: closing character
        """
        i = start
        length = len(text)
        count = 1
        while count > 0 and i < length:
            if text[i] == open_c:
                count += 1
            elif text[i] == close_c:
                count -= 1
            i += 1
        return i

    def _extract_url_and_title(self, text, start):
        """Extracts the url from the tail of a link.

        :param text: text containing the link
        :param start: index of the opening parenthesis of the link tail
        :return: tuple (url, index just after the closing parenthesis), or
            (None, None) if no well-formed link tail is found
        """
        # text[start] equals the opening parenthesis
        idx = self._whitespace.match(text, start + 1).end()
        if idx == len(text):
            return None, None
        end_idx = idx
        has_anglebrackets = text[idx] == "<"
        if has_anglebrackets:
            # Skip the <...> part so parentheses inside it are not counted.
            end_idx = self._find_balanced(text, end_idx+1, "<", ">")
        end_idx = self._find_balanced(text, end_idx, "(", ")")
        match = self._inline_link_title.search(text, idx, end_idx)
        if not match:
            return None, None
        url = text[idx:match.start()]
        if has_anglebrackets:
            url = self._strip_anglebrackets.sub(r'\1', url)
        return url, end_idx

    def _check_inline_links(self, url_data, content):
        """Checks inline links.

        :param url_data: url_data object
        :param content: content for processing
        """
        # Upper bound on how far ahead the closing ']' is searched for.
        MAX_LINK_TEXT_SENTINEL = 3000
        curr_pos = 0
        content_length = len(content)
        while True:  # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - an inline img:      ![text](url "title")
            # - not markup:         [...anything else...
            try:
                start_idx = content.index('[', curr_pos)
            except ValueError:
                break

            # Find the matching closing ']'.
            bracket_depth = 0
            for p in range(start_idx+1, min(start_idx+MAX_LINK_TEXT_SENTINEL, content_length)):
                if content[p] == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif content[p] == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length. This isn't markup.
                curr_pos = start_idx + 1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p >= content_length:
                return

            if content[p] == '(':
                url, url_end_idx = self._extract_url_and_title(content, p)
                if url is not None:
                    # The reported position is that of the opening parenthesis.
                    self._save_url(url_data, content, url, p)
                    # url_end_idx already points just past the closing ')',
                    # so resume exactly there; adding one would skip a '['
                    # that directly follows, e.g. in "[a](u1)[b](u2)".
                    curr_pos = url_end_idx
                    continue

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1
|
||||
18
tests/checker/data/file.markdown
Normal file
18
tests/checker/data/file.markdown
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
# Test #
|
||||
|
||||
<http://www.url.com> text <http://www.url2.com>
|
||||
|
||||
[link]( http://www.urllink.com)
|
||||
|
||||
[link2](http://www
|
||||
.urllink2.com)
|
||||
|
||||
[test][id1]
|
||||
[URL][id2]
|
||||
|
||||
[id1]:
|
||||
http://www.urldef1.com
|
||||
|
||||
[id2]: http://www.urldef2.com "URL"
|
||||
|
||||
![img](http://www.urlimg.com)
|
||||
42
tests/checker/data/file.markdown.result
Normal file
42
tests/checker/data/file.markdown.result
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
url file://%(curdir)s/%(datadir)s/file.markdown
|
||||
cache key file://%(curdir)s/%(datadir)s/file.markdown
|
||||
real url file://%(curdir)s/%(datadir)s/file.markdown
|
||||
name %(datadir)s/file.markdown
|
||||
valid
|
||||
|
||||
url http://www.url.com
|
||||
cache key http://www.url.com
|
||||
real url http://search.url.com/
|
||||
info Redirected to `http://search.url.com/'.
|
||||
valid
|
||||
|
||||
url http://www.url2.com
|
||||
cache key http://www.url2.com
|
||||
real url http://www.url2.com
|
||||
valid
|
||||
|
||||
url http://www.urldef1.com
|
||||
cache key http://www.urldef1.com
|
||||
real url http://www.urldef1.com
|
||||
error
|
||||
|
||||
url http://www.urldef2.com
|
||||
cache key http://www.urldef2.com
|
||||
real url http://www.urldef2.com
|
||||
error
|
||||
|
||||
url http://www.urllink.com
|
||||
cache key http://www.urllink.com
|
||||
real url http://www.urllink.com
|
||||
valid
|
||||
|
||||
url http://www.urllink2.com
|
||||
cache key http://www.urllink2.com
|
||||
real url http://www.urllink2.com
|
||||
error
|
||||
|
||||
url http://www.urlimg.com
|
||||
cache key http://www.urlimg.com
|
||||
real url http://www.urlimg.com
|
||||
valid
|
||||
|
||||
|
|
@ -73,6 +73,10 @@ class TestFile (LinkCheckTest):
|
|||
confargs = dict(enabledplugins=["PdfParser"])
|
||||
self.file_test("file.pdf", confargs=confargs)
|
||||
|
||||
def test_markdown(self):
    """Run the file.markdown fixture with only the MarkdownCheck plugin enabled."""
    self.file_test(
        "file.markdown",
        confargs={"enabledplugins": ["MarkdownCheck"]},
    )
|
||||
|
||||
def test_urllist (self):
|
||||
self.file_test("urllist.txt")
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue