mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Merge pull request #308 from cjmayo/decode
Decode content when retrieved
This commit is contained in:
commit
f73ba54a2a
7 changed files with 46 additions and 26 deletions
|
|
@ -63,9 +63,9 @@ def parse_bookmark_data (data):
|
|||
for line in data.splitlines():
|
||||
lineno += 1
|
||||
line = line.strip()
|
||||
if line.startswith(b"NAME="):
|
||||
if line.startswith("NAME="):
|
||||
name = line[5:]
|
||||
elif line.startswith(b"URL="):
|
||||
elif line.startswith("URL="):
|
||||
url = line[4:]
|
||||
if url and name is not None:
|
||||
yield (url, name, lineno)
|
||||
|
|
|
|||
|
|
@ -38,13 +38,16 @@ import time
|
|||
import errno
|
||||
import socket
|
||||
import select
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
# Python 3
|
||||
from io import StringIO
|
||||
from io import BytesIO
|
||||
from builtins import str as str_text
|
||||
from future.utils import python_2_unicode_compatible
|
||||
from warnings import filterwarnings
|
||||
|
||||
filterwarnings("ignore",
|
||||
message="The soupsieve package is not installed. CSS selectors cannot be used.",
|
||||
category=UserWarning, module="bs4")
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from . import absolute_url, get_url_from
|
||||
from .. import (log, LOG_CHECK,
|
||||
|
|
@ -216,6 +219,8 @@ class UrlBase (object):
|
|||
self.url_connection = None
|
||||
# data of url content, (data == None) means no data is available
|
||||
self.data = None
|
||||
# url content as a Unicode string
|
||||
self.text = None
|
||||
# cache url is set by build_url() calling set_cache_url()
|
||||
self.cache_url = None
|
||||
# extern flags (is_extern, is_strict)
|
||||
|
|
@ -625,24 +630,35 @@ class UrlBase (object):
|
|||
"""Indicate wether url get_content() can be called."""
|
||||
return self.size <= self.aggregate.config["maxfilesizedownload"]
|
||||
|
||||
def get_content (self):
|
||||
"""Precondition: url_connection is an opened URL."""
|
||||
if self.data is None:
|
||||
log.debug(LOG_CHECK, "Get content of %r", self.url)
|
||||
t = time.time()
|
||||
self.data = self.read_content()
|
||||
self.size = len(self.data)
|
||||
self.dltime = time.time() - t
|
||||
if self.size == 0:
|
||||
self.add_warning(_("Content size is zero."),
|
||||
def download_content(self):
|
||||
log.debug(LOG_CHECK, "Get content of %r", self.url)
|
||||
t = time.time()
|
||||
content = self.read_content()
|
||||
self.size = len(content)
|
||||
self.dltime = time.time() - t
|
||||
if self.size == 0:
|
||||
self.add_warning(_("Content size is zero."),
|
||||
tag=WARN_URL_CONTENT_SIZE_ZERO)
|
||||
else:
|
||||
self.aggregate.add_downloaded_bytes(self.size)
|
||||
else:
|
||||
self.aggregate.add_downloaded_bytes(self.size)
|
||||
return content
|
||||
|
||||
def get_raw_content(self):
|
||||
if self.data is None:
|
||||
self.data = self.download_content()
|
||||
return self.data
|
||||
|
||||
def get_content (self):
|
||||
if self.text is None:
|
||||
self.get_raw_content()
|
||||
soup = BeautifulSoup(self.data, "html.parser")
|
||||
self.text = self.data.decode(soup.original_encoding)
|
||||
self.encoding = soup.original_encoding
|
||||
return self.text
|
||||
|
||||
def read_content(self):
|
||||
"""Return data for this URL. Can be overridden in subclasses."""
|
||||
buf = StringIO()
|
||||
buf = BytesIO()
|
||||
data = self.read_content_chunk()
|
||||
while data:
|
||||
if buf.tell() + len(data) > self.aggregate.config["maxfilesizedownload"]:
|
||||
|
|
@ -652,7 +668,9 @@ class UrlBase (object):
|
|||
return buf.getvalue()
|
||||
|
||||
def read_content_chunk(self):
|
||||
"""Read one chunk of content from this URL."""
|
||||
"""Read one chunk of content from this URL.
|
||||
Precondition: url_connection is an opened URL.
|
||||
"""
|
||||
return self.url_connection.read(self.ReadChunkBytes)
|
||||
|
||||
def get_user_password (self):
|
||||
|
|
|
|||
|
|
@ -90,8 +90,7 @@ class Aggregate (object):
|
|||
response = session.get(url)
|
||||
cgiuser = self.config["loginuserfield"]
|
||||
cgipassword = self.config["loginpasswordfield"]
|
||||
form = formsearch.search_form(response.content, cgiuser, cgipassword,
|
||||
encoding=response.encoding)
|
||||
form = formsearch.search_form(response.text, cgiuser, cgipassword)
|
||||
form.data[cgiuser] = user
|
||||
form.data[cgipassword] = password
|
||||
for key, value in self.config["loginextrafields"].items():
|
||||
|
|
|
|||
|
|
@ -72,7 +72,7 @@ def parse_chromium (url_data):
|
|||
def parse_safari (url_data):
|
||||
"""Parse a Safari bookmark file."""
|
||||
from ..bookmarks.safari import parse_bookmark_data
|
||||
for url, name in parse_bookmark_data(url_data.get_content()):
|
||||
for url, name in parse_bookmark_data(url_data.get_raw_content()):
|
||||
url_data.add_url(url, name=name)
|
||||
|
||||
|
||||
|
|
@ -83,7 +83,7 @@ def parse_text (url_data):
|
|||
for line in url_data.get_content().splitlines():
|
||||
lineno += 1
|
||||
line = line.strip()
|
||||
if not line or line.startswith(b'#'):
|
||||
if not line or line.startswith('#'):
|
||||
continue
|
||||
url_data.add_url(line, line=lineno)
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ import re
|
|||
from . import _ContentPlugin
|
||||
from .. import log, LOG_PLUGIN
|
||||
|
||||
from builtins import str as str_text
|
||||
|
||||
class MarkdownCheck(_ContentPlugin):
|
||||
"""Markdown parsing plugin."""
|
||||
|
|
@ -108,7 +109,7 @@ class MarkdownCheck(_ContentPlugin):
|
|||
"""
|
||||
line = content.count('\n', 0, url_pos) + 1
|
||||
column = url_pos - content.rfind('\n', 0, url_pos)
|
||||
url_data.add_url(url_text.translate(None, '\n '), line=line, column=column)
|
||||
url_data.add_url(url_text.translate(str_text.maketrans("", "", '\n ')), line=line, column=column)
|
||||
|
||||
def _check_by_re(self, url_data, content):
|
||||
""" Finds urls by re.
|
||||
|
|
|
|||
|
|
@ -1,4 +1,5 @@
|
|||
# required:
|
||||
bs4
|
||||
requests >= 2.4
|
||||
pyxdg
|
||||
dnspython
|
||||
|
|
|
|||
1
setup.py
1
setup.py
|
|
@ -503,6 +503,7 @@ args = dict(
|
|||
install_requires = [
|
||||
'requests >= 2.4',
|
||||
'dnspython',
|
||||
'bs4',
|
||||
'pyxdg',
|
||||
'future',
|
||||
],
|
||||
|
|
|
|||
Loading…
Reference in a new issue