diff --git a/linkcheck/plugins/parsepdf.py b/linkcheck/plugins/parsepdf.py index 557810a7..d33c800a 100755 --- a/linkcheck/plugins/parsepdf.py +++ b/linkcheck/plugins/parsepdf.py @@ -44,6 +44,9 @@ def search_url(obj, url_data, pageno, seen_objs): if isinstance(obj, dict): for key, value in obj.items(): if key == 'URI' and isinstance(value, basestring): + # URIs should be 7bit ASCII encoded, but be safe and encode + # to unicode + # XXX this does not use an optional specified base URL url = strformat.unicode_safe(value) url_data.add_url(url, page=pageno) else: @@ -78,7 +81,7 @@ class PdfParser(_ParserPlugin): try: parser = PDFParser(fp) doc = PDFDocument(parser, password=password) - for (pageno, page) in enumerate(PDFPage.create_pages(doc)): + for (pageno, page) in enumerate(PDFPage.create_pages(doc), start=1): if "Contents" in page.attrs: search_url(page.attrs["Contents"], url_data, pageno, set()) if "Annots" in page.attrs: diff --git a/tests/__init__.py b/tests/__init__.py index 179b4a7b..41d3e10f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -249,6 +249,14 @@ def has_word(): need_word = _need_func(has_word, 'Word') +@memoized +def has_pdflib(): + from linkcheck.plugins import parsepdf + return parsepdf.has_pdflib + +need_pdflib = _need_func(has_pdflib, 'pdflib') + + @contextmanager def _limit_time (seconds): """Raises LinkCheckerInterrupt if given number of seconds have passed.""" diff --git a/tests/checker/data/file.pdf b/tests/checker/data/file.pdf new file mode 100644 index 00000000..244d57ff Binary files /dev/null and b/tests/checker/data/file.pdf differ diff --git a/tests/checker/data/file.pdf.result b/tests/checker/data/file.pdf.result new file mode 100644 index 00000000..752592a4 --- /dev/null +++ b/tests/checker/data/file.pdf.result @@ -0,0 +1,15 @@ +url file://%(curdir)s/%(datadir)s/file.pdf +cache key file://%(curdir)s/%(datadir)s/file.pdf +real url file://%(curdir)s/%(datadir)s/file.pdf +name %(datadir)s/file.pdf +valid + +url http://www.example.com/link1 +cache key http://www.example.com/link1 +real url http://www.example.com/link1 +error + +url http://www.example.com/link2 +cache key http://www.example.com/link2 +real url http://www.example.com/link2 +error diff --git a/tests/checker/test_file.py b/tests/checker/test_file.py index 481846bf..6580aaa8 100644 --- a/tests/checker/test_file.py +++ b/tests/checker/test_file.py @@ -20,7 +20,7 @@ Test file parsing. import os import sys import zipfile -from tests import need_word +from tests import need_word, need_pdflib from . import LinkCheckTest, get_file @@ -65,7 +65,13 @@ class TestFile (LinkCheckTest): @need_word def test_word (self): - self.file_test("file.doc") + confargs = dict(enabledplugins=["WordParser"]) + self.file_test("file.doc", confargs=confargs) + + @need_pdflib + def test_pdf(self): + confargs = dict(enabledplugins=["PdfParser"]) + self.file_test("file.pdf", confargs=confargs) def test_urllist (self): self.file_test("urllist.txt")