Add PDF test and fix page number.

This commit is contained in:
Bastian Kleineidam 2014-04-29 18:53:24 +02:00
parent 0d9881cf03
commit b152ce7a6e
5 changed files with 35 additions and 3 deletions

View file

@ -44,6 +44,9 @@ def search_url(obj, url_data, pageno, seen_objs):
if isinstance(obj, dict):
for key, value in obj.items():
if key == 'URI' and isinstance(value, basestring):
# URIs should be 7-bit ASCII, but to be safe convert the value
# to unicode anyway.
# XXX: an optionally specified base URL is not taken into account here.
url = strformat.unicode_safe(value)
url_data.add_url(url, page=pageno)
else:
@ -78,7 +81,7 @@ class PdfParser(_ParserPlugin):
try:
parser = PDFParser(fp)
doc = PDFDocument(parser, password=password)
for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
for (pageno, page) in enumerate(PDFPage.create_pages(doc), start=1):
if "Contents" in page.attrs:
search_url(page.attrs["Contents"], url_data, pageno, set())
if "Annots" in page.attrs:

View file

@ -249,6 +249,14 @@ def has_word():
need_word = _need_func(has_word, 'Word')
@memoized
def has_pdflib():
    """Return the ``has_pdflib`` flag of ``linkcheck.plugins.parsepdf``.

    The plugin module is imported inside the function body rather than
    at module level, so the import only happens when this is called.
    """
    from linkcheck.plugins import parsepdf as pdf_plugin
    pdflib_flag = pdf_plugin.has_pdflib
    return pdflib_flag
need_pdflib = _need_func(has_pdflib, 'pdflib')
@contextmanager
def _limit_time (seconds):
"""Raises LinkCheckerInterrupt if given number of seconds have passed."""

BIN
tests/checker/data/file.pdf Normal file

Binary file not shown.

View file

@ -0,0 +1,15 @@
url file://%(curdir)s/%(datadir)s/file.pdf
cache key file://%(curdir)s/%(datadir)s/file.pdf
real url file://%(curdir)s/%(datadir)s/file.pdf
name %(datadir)s/file.pdf
valid
url http://www.example.com/link1
cache key http://www.example.com/link1
real url http://www.example.com/link1
error
url http://www.example.com/link2
cache key http://www.example.com/link2
real url http://www.example.com/link2
error

View file

@ -20,7 +20,7 @@ Test file parsing.
import os
import sys
import zipfile
from tests import need_word
from tests import need_word, need_pdflib
from . import LinkCheckTest, get_file
@ -65,7 +65,13 @@ class TestFile (LinkCheckTest):
@need_word
def test_word (self):
self.file_test("file.doc")
confargs = dict(enabledplugins=["WordParser"])
self.file_test("file.doc", confargs=confargs)
@need_pdflib
def test_pdf(self):
    """Run the file test on ``file.pdf``.

    ``PdfParser`` is passed in the ``enabledplugins`` configuration
    argument for this check.
    """
    plugins = ["PdfParser"]
    self.file_test("file.pdf", confargs=dict(enabledplugins=plugins))
def test_urllist (self):
self.file_test("urllist.txt")