Add PDF test and fix page number.

2026-05-09 15:14:45 +00:00 · 2014-04-29 18:53:24 +02:00 · 2014-04-29 18:53:24 +02:00 · b152ce7a6e
commit b152ce7a6e
parent 0d9881cf03
5 changed files with 35 additions and 3 deletions
--- a/linkcheck/plugins/parsepdf.py
+++ b/linkcheck/plugins/parsepdf.py
@@ -44,6 +44,9 @@ def search_url(obj, url_data, pageno, seen_objs):
    if isinstance(obj, dict):
        for key, value in obj.items():
            if key == 'URI' and isinstance(value, basestring):
+                # URIs should be 7-bit ASCII encoded, but to be safe convert
+                # the value to unicode.
+                # XXX this does not use an optionally specified base URL
                url = strformat.unicode_safe(value)
                url_data.add_url(url, page=pageno)
            else:
@@ -78,7 +81,7 @@ class PdfParser(_ParserPlugin):
        try:
            parser = PDFParser(fp)
            doc = PDFDocument(parser, password=password)
-            for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
+            for (pageno, page) in enumerate(PDFPage.create_pages(doc), start=1):
                if "Contents" in page.attrs:
                    search_url(page.attrs["Contents"], url_data, pageno, set())
                if "Annots" in page.attrs:
--- a/tests/init.py
+++ b/tests/init.py
@@ -249,6 +249,14 @@ def has_word():
 need_word = _need_func(has_word, 'Word')


+@memoized
+def has_pdflib():
+    from linkcheck.plugins import parsepdf
+    return parsepdf.has_pdflib
+
+need_pdflib = _need_func(has_pdflib, 'pdflib')
+
+
@contextmanager
 def _limit_time (seconds):
    """Raises LinkCheckerInterrupt if given number of seconds have passed."""
--- a/tests/checker/data/file.pdf
+++ b/tests/checker/data/file.pdf
--- a/tests/checker/data/file.pdf.result
+++ b/tests/checker/data/file.pdf.result
@@ -0,0 +1,15 @@
+url file://%(curdir)s/%(datadir)s/file.pdf
+cache key file://%(curdir)s/%(datadir)s/file.pdf
+real url file://%(curdir)s/%(datadir)s/file.pdf
+name %(datadir)s/file.pdf
+valid
+
+url http://www.example.com/link1
+cache key http://www.example.com/link1
+real url http://www.example.com/link1
+error
+
+url http://www.example.com/link2
+cache key http://www.example.com/link2
+real url http://www.example.com/link2
+error
--- a/tests/checker/test_file.py
+++ b/tests/checker/test_file.py
@@ -20,7 +20,7 @@ Test file parsing.
 import os
 import sys
 import zipfile
-from tests import need_word
+from tests import need_word, need_pdflib
 from . import LinkCheckTest, get_file


@@ -65,7 +65,13 @@ class TestFile (LinkCheckTest):

    @need_word
    def test_word (self):
-        self.file_test("file.doc")
+        confargs = dict(enabledplugins=["WordParser"])
+        self.file_test("file.doc", confargs=confargs)
+
+    @need_pdflib
+    def test_pdf(self):
+        confargs = dict(enabledplugins=["PdfParser"])
+        self.file_test("file.pdf", confargs=confargs)

    def test_urllist (self):
        self.file_test("urllist.txt")