Merge pull request #646 from cjmayo/unidir

Fix checking directory containing Unicode filenames
This commit is contained in:
Chris Mayo 2022-09-06 19:23:07 +01:00 committed by GitHub
commit 595ce32e55
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 34 additions and 4 deletions

View file

@ -188,6 +188,9 @@ def get_index_html(urls):
name = html.escape(entry)
try:
url = html.escape(urllib.parse.quote(entry))
except UnicodeEncodeError:
log.warn(LOG_CHECK, "Unable to convert entry to Unicode")
continue
except KeyError:
# Some unicode entries raise KeyError.
url = name

View file

@ -234,7 +234,7 @@ class FileUrl(urlbase.UrlBase):
with links to the files."""
if self.is_directory():
data = get_index_html(get_files(self.get_os_filename()))
data = data.encode("iso8859-1", "ignore")
data = data.encode()
else:
data = super().read_content()
return data

View file

@ -0,0 +1,11 @@
url file://%(curdir)s/%(datadir)s/udir/
cache key file://%(curdir)s/%(datadir)s/udir/
real url file://%(curdir)s/%(datadir)s/udir/
name %(datadir)s/udir
valid
url %%C3%%AD%%C2%%BB%%C2%%AD%%C2%%AF%%C2%%BF.dat
cache key file://%(curdir)s/%(datadir)s/udir/%%C3%%AD%%C2%%BB%%C2%%AD%%C2%%AF%%C2%%BF.dat
real url file://%(curdir)s/%(datadir)s/udir/%%C3%%AD%%C2%%BB%%C2%%AD%%C2%%AF%%C2%%BF.dat
name í»­¯¿.dat
valid

View file

@ -17,15 +17,22 @@
Test file parsing.
"""
import os
from pathlib import Path
import sys
import zipfile
import pytest
from tests import need_network, need_word, need_pdflib
from . import LinkCheckTest, get_file
def unzip(filename, targetdir):
"""Unzip given zipfile into targetdir."""
# There are likely problems with zipfile and non-Unicode filenames
# https://github.com/python/cpython/issues/83042
# https://github.com/python/cpython/issues/72267
# https://github.com/python/cpython/issues/95463
zf = zipfile.ZipFile(filename)
for name in zf.namelist():
if name.endswith("/"):
@ -85,16 +92,25 @@ class TestFile(LinkCheckTest):
def test_urllist(self):
# Run the shared file_test helper against the "urllist.txt" fixture.
self.file_test("urllist.txt")
@pytest.mark.xfail(strict=True)
def test_directory_listing(self):
# Run file_test on the "dir" fixture directory. If the directory does not
# exist yet, it is unpacked from "dir.zip" into the fixture's parent
# directory first. The test is marked as a strict expected failure.
# unpack non-unicode filename which cannot be stored
# in the SF subversion repository
# NOTE(review): two platform guards follow ("linux2"/return and
# "linux"/pytest.skip). This looks like the removed and added sides of a
# diff shown together -- confirm which one the real file contains.
if os.name != "posix" or sys.platform != "linux2":
return
if os.name != "posix" or sys.platform != "linux":
pytest.skip("Not running on POSIX or Linux")
dirname = get_file("dir")
if not os.path.isdir(dirname):
unzip(dirname + ".zip", os.path.dirname(dirname))
self.file_test("dir")
def test_directory_listing_unicode(self):
# Run file_test on the "udir" fixture directory, which is given a file
# with a non-ASCII name. Skipped unless os.name is "posix" and
# sys.platform is "linux".
if os.name != "posix" or sys.platform != "linux":
pytest.skip("Not running on POSIX or Linux")
dirname = Path(get_file("udir"))
# The fixture is created on the fly: mkdir tolerates an existing directory
# and touch() ensures the file is present before the check runs.
dirname.mkdir(exist_ok=True)
Path(dirname, "í»­¯¿.dat").touch()
self.file_test("udir")
def test_unicode_filename(self):
# Run file_test on a fixture whose filename contains Cyrillic characters.
# a unicode filename
self.file_test("Мошкова.bin")
@ -115,7 +131,7 @@ class TestFile(LinkCheckTest):
# Fails on NT platforms and I am too lazy to fix
# Cause: url get quoted %7C which gets lowercased to
# %7c and this fails.
return
pytest.skip("Not running on NT")
url = "file:/%(curdir)s/%(datadir)s/file.txt" % self.get_attrs()
nurl = self.norm(url)
resultlines = [