From 0c90c718bf591e73b4a63c0298d49e49ab5fc0bc Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Mon, 30 Sep 2019 19:46:24 +0100
Subject: [PATCH 1/6] Revert "Python3: fix bytes mark in parser/__init__.py"

This reverts commit aec8243348fd29877152ac3985515290634dd556.
---
 linkcheck/parser/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py
index 59b0cb74..7af8a398 100644
--- a/linkcheck/parser/__init__.py
+++ b/linkcheck/parser/__init__.py
@@ -83,7 +83,7 @@ def parse_text (url_data):
     for line in url_data.get_content().splitlines():
         lineno += 1
         line = line.strip()
-        if not line or line.startswith(b'#'):
+        if not line or line.startswith('#'):
             continue
         url_data.add_url(line, line=lineno)
 

From 5fc01455b7e2707509accaa58a586f7d1dc41443 Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Mon, 30 Sep 2019 19:46:24 +0100
Subject: [PATCH 2/6] Decode content when retrieved, use bs4 to detect encoding
 if non-Unicode

UrlBase has been modified as follows:
- the "data" variable now holds bytes
- decoded content is stored in a new variable "text"
- functionality from get_content() has been split out into
  get_raw_content(), which returns "data", and download_content(), which
  calls read_content() and sets the download-related variables.
  This allows subclasses to do their own decoding and lets parsers
  work on bytes.
---
 linkcheck/checker/urlbase.py | 56 ++++++++++++++++++++++++------------
 requirements.txt             |  1 +
 setup.py                     |  1 +
 3 files changed, 39 insertions(+), 19 deletions(-)

diff --git a/linkcheck/checker/urlbase.py b/linkcheck/checker/urlbase.py
index 10f9da4a..0f50621e 100644
--- a/linkcheck/checker/urlbase.py
+++ b/linkcheck/checker/urlbase.py
@@ -38,13 +38,16 @@ import time
 import errno
 import socket
 import select
-try:
-    from cStringIO import StringIO
-except ImportError:
-    # Python 3
-    from io import StringIO
+from io import BytesIO
 from builtins import str as str_text
 from future.utils import python_2_unicode_compatible
+from warnings import filterwarnings
+
+filterwarnings("ignore",
+    message="The soupsieve package is not installed. CSS selectors cannot be used.",
+    category=UserWarning, module="bs4")
+
+from bs4 import BeautifulSoup
 
 from . import absolute_url, get_url_from
 from .. import (log, LOG_CHECK,
@@ -216,6 +219,8 @@ class UrlBase (object):
         self.url_connection = None
         # data of url content,  (data == None) means no data is available
         self.data = None
+        # url content as a Unicode string
+        self.text = None
         # cache url is set by build_url() calling set_cache_url()
         self.cache_url = None
         # extern flags (is_extern, is_strict)
@@ -625,24 +630,35 @@ class UrlBase (object):
         """Indicate wether url get_content() can be called."""
         return self.size <= self.aggregate.config["maxfilesizedownload"]
 
-    def get_content (self):
-        """Precondition: url_connection is an opened URL."""
-        if self.data is None:
-            log.debug(LOG_CHECK, "Get content of %r", self.url)
-            t = time.time()
-            self.data = self.read_content()
-            self.size = len(self.data)
-            self.dltime = time.time() - t
-            if self.size == 0:
-                self.add_warning(_("Content size is zero."),
+    def download_content(self):
+        log.debug(LOG_CHECK, "Get content of %r", self.url)
+        t = time.time()
+        content = self.read_content()
+        self.size = len(content)
+        self.dltime = time.time() - t
+        if self.size == 0:
+            self.add_warning(_("Content size is zero."),
                              tag=WARN_URL_CONTENT_SIZE_ZERO)
-            else:
-                self.aggregate.add_downloaded_bytes(self.size)
+        else:
+            self.aggregate.add_downloaded_bytes(self.size)
+        return content
+
+    def get_raw_content(self):
+        if self.data is None:
+            self.data = self.download_content()
         return self.data
 
+    def get_content (self):
+        if self.text is None:
+            self.get_raw_content()
+            soup = BeautifulSoup(self.data, "html.parser")
+            self.text = self.data.decode(soup.original_encoding)
+            self.encoding = soup.original_encoding
+        return self.text
+
     def read_content(self):
         """Return data for this URL. Can be overridden in subclasses."""
-        buf = StringIO()
+        buf = BytesIO()
         data = self.read_content_chunk()
         while data:
             if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]:
@@ -652,7 +668,9 @@ class UrlBase (object):
         return buf.getvalue()
 
     def read_content_chunk(self):
-        """Read one chunk of content from this URL."""
+        """Read one chunk of content from this URL.
+        Precondition: url_connection is an opened URL.
+        """
         return self.url_connection.read(self.ReadChunkBytes)
 
     def get_user_password (self):
diff --git a/requirements.txt b/requirements.txt
index 332a381c..76646350 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 # required:
+bs4
 requests >= 2.4
 pyxdg
 dnspython
diff --git a/setup.py b/setup.py
index 9e85e5e2..39b91490 100755
--- a/setup.py
+++ b/setup.py
@@ -503,6 +503,7 @@ args = dict(
     install_requires = [
         'requests >= 2.4',
         'dnspython',
+        'bs4',
         'pyxdg',
         'future',
     ],

From 9460064084645ef6d3f53c0db6d60accfb719c68 Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Mon, 30 Sep 2019 19:46:24 +0100
Subject: [PATCH 3/6] Use requests to decode the content of login form

---
 linkcheck/director/aggregator.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/linkcheck/director/aggregator.py b/linkcheck/director/aggregator.py
index feef67af..92538b14 100644
--- a/linkcheck/director/aggregator.py
+++ b/linkcheck/director/aggregator.py
@@ -90,8 +90,7 @@ class Aggregate (object):
         response = session.get(url)
         cgiuser = self.config["loginuserfield"]
         cgipassword = self.config["loginpasswordfield"]
-        form = formsearch.search_form(response.content, cgiuser, cgipassword,
-              encoding=response.encoding)
+        form = formsearch.search_form(response.text, cgiuser, cgipassword)
         form.data[cgiuser] = user
         form.data[cgipassword] = password
         for key, value in self.config["loginextrafields"].items():

From ad33d359c1e9282d1caa83178a0365dc8ec63cb5 Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Mon, 30 Sep 2019 19:46:24 +0100
Subject: [PATCH 4/6] Adapt Opera bookmark parser to work with decoded data

---
 linkcheck/bookmarks/opera.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/linkcheck/bookmarks/opera.py b/linkcheck/bookmarks/opera.py
index 7f462e79..80dc0d40 100644
--- a/linkcheck/bookmarks/opera.py
+++ b/linkcheck/bookmarks/opera.py
@@ -63,9 +63,9 @@ def parse_bookmark_data (data):
     for line in data.splitlines():
         lineno += 1
         line = line.strip()
-        if line.startswith(b"NAME="):
+        if line.startswith("NAME="):
             name = line[5:]
-        elif line.startswith(b"URL="):
+        elif line.startswith("URL="):
             url = line[4:]
             if url and name is not None:
                 yield (url, name, lineno)

From e01ea0d9f05e8ab3bfe151f46738e8d91d9ec96c Mon Sep 17 00:00:00 2001
From: Chris Mayo <aklhfex@gmail.com>
Date: Mon, 30 Sep 2019 19:46:24 +0100
Subject: [PATCH 5/6] Safari bookmark parser requires bytes

---
 linkcheck/parser/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/linkcheck/parser/__init__.py b/linkcheck/parser/__init__.py
index 7af8a398..7f6ad51b 100644
--- a/linkcheck/parser/__init__.py
+++ b/linkcheck/parser/__init__.py
@@ -72,7 +72,7 @@ def parse_chromium (url_data):
 def parse_safari (url_data):
     """Parse a Safari bookmark file."""
     from ..bookmarks.safari import parse_bookmark_data
-    for url, name in parse_bookmark_data(url_data.get_content()):
+    for url, name in parse_bookmark_data(url_data.get_raw_content()):
         url_data.add_url(url, name=name)
 
 

From 6e8da10942956db8314c871fde17c0999a44e167 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Petr=20Dlouh=C3=BD?= <petr.dlouhy@email.cz>
Date: Mon, 30 Sep 2019 19:46:24 +0100
Subject: [PATCH 6/6] fixes for Python 3: fix markdowncheck

The translate() method of Python 3 string objects (and Python 2 Unicode
objects) accepts only a single table argument, so the characters to
delete are now passed via a translation table built with maketrans().
---
 linkcheck/plugins/markdowncheck.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/linkcheck/plugins/markdowncheck.py b/linkcheck/plugins/markdowncheck.py
index f628c7d0..116f21b2 100644
--- a/linkcheck/plugins/markdowncheck.py
+++ b/linkcheck/plugins/markdowncheck.py
@@ -31,6 +31,7 @@ import re
 from . import _ContentPlugin
 from .. import log, LOG_PLUGIN
 
+from builtins import str as str_text
 
 class MarkdownCheck(_ContentPlugin):
     """Markdown parsing plugin."""
@@ -108,7 +109,7 @@ class MarkdownCheck(_ContentPlugin):
         """
         line = content.count('\n', 0, url_pos) + 1
         column = url_pos - content.rfind('\n', 0, url_pos)
-        url_data.add_url(url_text.translate(None, '\n '), line=line, column=column)
+        url_data.add_url(url_text.translate(str_text.maketrans("", "", '\n ')), line=line, column=column)
 
     def _check_by_re(self, url_data, content):
         """ Finds urls by re.