From 0c90c718bf591e73b4a63c0298d49e49ab5fc0bc Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 30 Sep 2019 19:46:24 +0100 Subject: [PATCH 1/6] Revert "Python3: fix bytes mark in parser/__init__.py" This reverts commit aec8243348fd29877152ac3985515290634dd556. --- linkcheck/parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index 59b0cb74..7af8a398 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -83,7 +83,7 @@ def parse_text (url_data): for line in url_data.get_content().splitlines(): lineno += 1 line = line.strip() - if not line or line.startswith(b'#'): + if not line or line.startswith('#'): continue url_data.add_url(line, line=lineno) From 5fc01455b7e2707509accaa58a586f7d1dc41443 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 30 Sep 2019 19:46:24 +0100 Subject: [PATCH 2/6] Decode content when retrieved, use bs4 to detect encoding if non-Unicode UrlBase has been modified as follows: - the "data" variable now holds bytes - decoded content is stored in a new variable "text" - functionality from get_content() has been split out into get_raw_content() which returns "data" and download_content() which calls read_content() and sets the download related variables. This allows for subclasses to do their own decoding and parsers to use bytes. 
--- linkcheck/checker/urlbase.py | 56 ++++++++++++++++++++++++------------ requirements.txt | 1 + setup.py | 1 + 3 files changed, 39 insertions(+), 19 deletions(-) diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py index 10f9da4a..0f50621e 100644 --- a/linkcheck/checker/urlbase.py +++ b/linkcheck/checker/urlbase.py @@ -38,13 +38,16 @@ import time import errno import socket import select -try: - from cStringIO import StringIO -except ImportError: - # Python 3 - from io import StringIO +from io import BytesIO from builtins import str as str_text from future.utils import python_2_unicode_compatible +from warnings import filterwarnings + +filterwarnings("ignore", + message="The soupsieve package is not installed. CSS selectors cannot be used.", + category=UserWarning, module="bs4") + +from bs4 import BeautifulSoup from . import absolute_url, get_url_from from .. import (log, LOG_CHECK, @@ -216,6 +219,8 @@ class UrlBase (object): self.url_connection = None # data of url content, (data == None) means no data is available self.data = None + # url content as a Unicode string + self.text = None # cache url is set by build_url() calling set_cache_url() self.cache_url = None # extern flags (is_extern, is_strict) @@ -625,24 +630,35 @@ class UrlBase (object): """Indicate wether url get_content() can be called.""" return self.size <= self.aggregate.config["maxfilesizedownload"] - def get_content (self): - """Precondition: url_connection is an opened URL.""" - if self.data is None: - log.debug(LOG_CHECK, "Get content of %r", self.url) - t = time.time() - self.data = self.read_content() - self.size = len(self.data) - self.dltime = time.time() - t - if self.size == 0: - self.add_warning(_("Content size is zero."), + def download_content(self): + log.debug(LOG_CHECK, "Get content of %r", self.url) + t = time.time() + content = self.read_content() + self.size = len(content) + self.dltime = time.time() - t + if self.size == 0: + self.add_warning(_("Content size 
is zero."), tag=WARN_URL_CONTENT_SIZE_ZERO) - else: - self.aggregate.add_downloaded_bytes(self.size) + else: + self.aggregate.add_downloaded_bytes(self.size) + return content + + def get_raw_content(self): + if self.data is None: + self.data = self.download_content() return self.data + def get_content (self): + if self.text is None: + self.get_raw_content() + soup = BeautifulSoup(self.data, "html.parser") + self.text = self.data.decode(soup.original_encoding) + self.encoding = soup.original_encoding + return self.text + def read_content(self): """Return data for this URL. Can be overridden in subclasses.""" - buf = StringIO() + buf = BytesIO() data = self.read_content_chunk() while data: if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]: @@ -652,7 +668,9 @@ class UrlBase (object): return buf.getvalue() def read_content_chunk(self): - """Read one chunk of content from this URL.""" + """Read one chunk of content from this URL. + Precondition: url_connection is an opened URL. 
+ """ return self.url_connection.read(self.ReadChunkBytes) def get_user_password (self): diff --git a/requirements.txt b/requirements.txt index 332a381c..76646350 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ # required: +bs4 requests >= 2.4 pyxdg dnspython diff --git a/setup.py b/setup.py index 9e85e5e2..39b91490 100755 --- a/setup.py +++ b/setup.py @@ -503,6 +503,7 @@ args = dict( install_requires = [ 'requests >= 2.4', 'dnspython', + 'bs4', 'pyxdg', 'future', ], From 9460064084645ef6d3f53c0db6d60accfb719c68 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 30 Sep 2019 19:46:24 +0100 Subject: [PATCH 3/6] Use requests to decode the content of login form --- linkcheck/director/aggregator.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/linkcheck/director/aggregator.py b/linkcheck/director/aggregator.py index feef67af..92538b14 100644 --- a/linkcheck/director/aggregator.py +++ b/linkcheck/director/aggregator.py @@ -90,8 +90,7 @@ class Aggregate (object): response = session.get(url) cgiuser = self.config["loginuserfield"] cgipassword = self.config["loginpasswordfield"] - form = formsearch.search_form(response.content, cgiuser, cgipassword, - encoding=response.encoding) + form = formsearch.search_form(response.text, cgiuser, cgipassword) form.data[cgiuser] = user form.data[cgipassword] = password for key, value in self.config["loginextrafields"].items(): From ad33d359c1e9282d1caa83178a0365dc8ec63cb5 Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 30 Sep 2019 19:46:24 +0100 Subject: [PATCH 4/6] Adapt Opera bookmark parser to work with decoded data --- linkcheck/bookmarks/opera.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/linkcheck/bookmarks/opera.py b/linkcheck/bookmarks/opera.py index 7f462e79..80dc0d40 100644 --- a/linkcheck/bookmarks/opera.py +++ b/linkcheck/bookmarks/opera.py @@ -63,9 +63,9 @@ def parse_bookmark_data (data): for line in data.splitlines(): lineno += 1 line = 
line.strip() - if line.startswith(b"NAME="): + if line.startswith("NAME="): name = line[5:] - elif line.startswith(b"URL="): + elif line.startswith("URL="): url = line[4:] if url and name is not None: yield (url, name, lineno) From e01ea0d9f05e8ab3bfe151f46738e8d91d9ec96c Mon Sep 17 00:00:00 2001 From: Chris Mayo Date: Mon, 30 Sep 2019 19:46:24 +0100 Subject: [PATCH 5/6] Safari bookmark parser requires bytes --- linkcheck/parser/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py index 7af8a398..7f6ad51b 100644 --- a/linkcheck/parser/__init__.py +++ b/linkcheck/parser/__init__.py @@ -72,7 +72,7 @@ def parse_chromium (url_data): def parse_safari (url_data): """Parse a Safari bookmark file.""" from ..bookmarks.safari import parse_bookmark_data - for url, name in parse_bookmark_data(url_data.get_content()): + for url, name in parse_bookmark_data(url_data.get_raw_content()): url_data.add_url(url, name=name) From 6e8da10942956db8314c871fde17c0999a44e167 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= Date: Mon, 30 Sep 2019 19:46:24 +0100 Subject: [PATCH 6/6] fixes for Python 3: fix markdowncheck The translate() method of string objects (and Python 2 Unicode objects) only accepts a single table argument. --- linkcheck/plugins/markdowncheck.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/linkcheck/plugins/markdowncheck.py b/linkcheck/plugins/markdowncheck.py index f628c7d0..116f21b2 100644 --- a/linkcheck/plugins/markdowncheck.py +++ b/linkcheck/plugins/markdowncheck.py @@ -31,6 +31,7 @@ import re from . import _ContentPlugin from .. 
import log, LOG_PLUGIN +from builtins import str as str_text class MarkdownCheck(_ContentPlugin): """Markdown parsing plugin.""" @@ -108,7 +109,7 @@ class MarkdownCheck(_ContentPlugin): """ line = content.count('\n', 0, url_pos) + 1 column = url_pos - content.rfind('\n', 0, url_pos) - url_data.add_url(url_text.translate(None, '\n '), line=line, column=column) + url_data.add_url(url_text.translate(str_text.maketrans("", "", '\n ')), line=line, column=column) def _check_by_re(self, url_data, content): """ Finds urls by re.