linkchecker/linkcheck/plugins/markdowncheck.py
Petr Dlouhý 6e8da10942 fixes for Python 3: fix markdowncheck
The translate() method of string objects (and Python 2 Unicode objects)
only accepts a single, table argument.
2019-09-30 19:46:24 +01:00

204 lines
7.3 KiB
Python

# -*- coding: utf-8 -*-
#
# Copyright © 2014 Vadym Khokhlov
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Parse links in Markdown files.
Supported links are:
<http://autolink.com>
[name](http://link.com "Optional title")
[id]: http://link.com "Optional title"
"""
# Some ideas and code were borrowed from the markdown2 project:
# https://pypi.python.org/pypi/markdown2
import re
from . import _ContentPlugin
from .. import log, LOG_PLUGIN
from builtins import str as str_text
class MarkdownCheck(_ContentPlugin):
    """Markdown parsing plugin.

    Extracts URLs from Markdown content and registers them on the
    ``url_data`` object via ``url_data.add_url()``.  Three link forms are
    recognised: autolinks (``<http://...>``), reference definitions
    (``[id]: url "title"``) and inline links (``[text](url "title")``).
    """

    # Name of the configuration option holding a custom filename regex.
    _filename_re_key = "filename_re"
    # Filename pattern used when the configuration supplies none.
    _default_filename_re = re.compile(r'.*\.(markdown|md(own)?|mkdn?)$')

    # Patterns whose group 1 is the URL: autolinks and reference definitions.
    _link_res = [
        re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
        re.compile(r"""
            \[.+\]: # id
            [ \t]*\n? # maybe *one* newline
            [ \t]*
            <?(.+?)>? # url = \1
            [ \t]*
            (?:
                \n? # maybe one newline
                [ \t]*
                (?<=\s) # lookbehind for whitespace
                ['"(]
                [^\n]* # title
                ['")]
                [ \t]*
            )? # title is optional
            (?:\n+|\Z)
            """, re.X | re.M | re.U),
    ]

    _whitespace = re.compile(r'\s*')
    _strip_anglebrackets = re.compile(r'<(.*)>.*')

    # Matches the optional title plus the closing parenthesis at the very
    # end of the searched region of an inline link.
    _inline_link_title = re.compile(r'''
            ( # \1
              [ \t]+
              (['"]) # quote char
              (.*?)
            )? # title is optional
          \)$
        ''', re.X | re.S)

    def __init__(self, config):
        """Initialise the plugin and compile the filename pattern.

        :param config: mapping that may contain a regex string under the
            ``filename_re`` key; an invalid pattern is logged and the
            default pattern is kept.
        """
        super(MarkdownCheck, self).__init__(config)
        self.filename_re = self._default_filename_re
        pattern = config.get(self._filename_re_key)
        if pattern:
            try:
                self.filename_re = re.compile(pattern)
            except re.error as msg:
                log.warn(LOG_PLUGIN, "Invalid regex pattern %r: %s" % (pattern, msg))

    @classmethod
    def read_config(cls, configparser):
        """Read configuration file options.

        :param configparser: parser object providing ``has_option()`` and
            ``get()``; the section name is the class name.
        :return: dict with the ``filename_re`` key set to the configured
            value, or to None when the option is absent.
        """
        section = cls.__name__
        option = cls._filename_re_key
        value = None
        if configparser.has_option(section, option):
            value = configparser.get(section, option)
        return {option: value}

    def applies_to(self, url_data, pagetype=None):
        """Check for Markdown file.

        :return: True when ``url_data.base_url`` matches the filename regex.
        """
        return self.filename_re.search(url_data.base_url) is not None

    def check(self, url_data):
        """Extracts urls from the file."""
        content = url_data.get_content()
        self._check_by_re(url_data, content)
        self._check_inline_links(url_data, content)

    def _save_url(self, url_data, content, url_text, url_pos):
        """Saves url.

        Converts the url to 1-line text (newlines and spaces are removed)
        and the url position, given as an offset from the beginning of the
        file, to a 1-based (line, column) pair.

        :param url_data: object for url storing
        :param content: file content
        :param url_text: url text
        :param url_pos: url position from the beginning
        """
        line = content.count('\n', 0, url_pos) + 1
        # rfind() returns -1 when there is no preceding newline, which makes
        # the column 1-based on the first line as well.
        column = url_pos - content.rfind('\n', 0, url_pos)
        url_data.add_url(
            url_text.translate(str_text.maketrans("", "", '\n ')),
            line=line, column=column)

    def _check_by_re(self, url_data, content):
        """Finds urls by re.

        :param url_data: object for url storing
        :param content: file content
        """
        for link_re in self._link_res:
            for u in link_re.finditer(content):
                self._save_url(url_data, content, u.group(1), u.start(1))

    def _find_balanced(self, text, start, open_c, close_c):
        """Returns the index where the open_c and close_c characters balance
        out - the same number of open_c and close_c are encountered - or the
        end of string if it's reached before the balance point is found.

        The scan starts at ``start`` assuming one ``open_c`` has already
        been seen; the returned index is one past the balancing ``close_c``.
        """
        i = start
        length = len(text)
        count = 1
        while count > 0 and i < length:
            if text[i] == open_c:
                count += 1
            elif text[i] == close_c:
                count -= 1
            i += 1
        return i

    def _extract_url_and_title(self, text, start):
        """Extracts the url from the tail of a link.

        :param text: text being parsed
        :param start: index of the opening parenthesis of the link tail
        :return: ``(url, end_idx)`` where ``end_idx`` is the index returned
            by the parenthesis balancing scan, or ``(None, None)`` when no
            link tail could be parsed.
        """
        # text[start] equals the opening parenthesis
        idx = self._whitespace.match(text, start + 1).end()
        if idx == len(text):
            return None, None
        end_idx = idx
        has_anglebrackets = text[idx] == "<"
        if has_anglebrackets:
            # Skip over the <...> part so that parentheses inside it do not
            # take part in the balancing below.
            end_idx = self._find_balanced(text, end_idx + 1, "<", ">")
        end_idx = self._find_balanced(text, end_idx, "(", ")")
        match = self._inline_link_title.search(text, idx, end_idx)
        if not match:
            return None, None
        url = text[idx:match.start()]
        if has_anglebrackets:
            url = self._strip_anglebrackets.sub(r'\1', url)
        return url, end_idx

    def _check_inline_links(self, url_data, content):
        """Checks inline links.

        :param url_data: url_data object
        :param content: content for processing
        """
        MAX_LINK_TEXT_SENTINEL = 3000
        curr_pos = 0
        content_length = len(content)
        while True:  # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor: [text](url "title")
            # - an inline img: ![text](url "title")
            # - not markup: [...anything else...
            try:
                start_idx = content.index('[', curr_pos)
            except ValueError:
                break

            # Find the matching closing ']'.
            bracket_depth = 0
            for p in range(start_idx + 1,
                           min(start_idx + MAX_LINK_TEXT_SENTINEL, content_length)):
                if content[p] == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif content[p] == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length. This isn't markup.
                curr_pos = start_idx + 1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p >= content_length:
                return

            if content[p] == '(':
                url, url_end_idx = self._extract_url_and_title(content, p)
                if url is not None:
                    # NOTE: the position reported is that of the opening
                    # parenthesis, not of the first character of the url.
                    self._save_url(url_data, content, url, p)
                    # url_end_idx already points one past the closing ')'.
                    # Resume exactly there so a link that follows
                    # immediately, e.g. "[a](b)[c](d)", is not skipped.
                    curr_pos = url_end_idx
                    continue

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1