mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-03-16 22:10:26 +00:00
Add PDF test and fix page number.
This commit is contained in:
parent
0d9881cf03
commit
b152ce7a6e
5 changed files with 35 additions and 3 deletions
|
|
@ -44,6 +44,9 @@ def search_url(obj, url_data, pageno, seen_objs):
|
|||
if isinstance(obj, dict):
|
||||
for key, value in obj.items():
|
||||
if key == 'URI' and isinstance(value, basestring):
|
||||
# URIs should be 7bit ASCII encoded, but be safe and encode
|
||||
# to unicode
|
||||
# XXX this does not use an optional specified base URL
|
||||
url = strformat.unicode_safe(value)
|
||||
url_data.add_url(url, page=pageno)
|
||||
else:
|
||||
|
|
@ -78,7 +81,7 @@ class PdfParser(_ParserPlugin):
|
|||
try:
|
||||
parser = PDFParser(fp)
|
||||
doc = PDFDocument(parser, password=password)
|
||||
for (pageno, page) in enumerate(PDFPage.create_pages(doc)):
|
||||
for (pageno, page) in enumerate(PDFPage.create_pages(doc), start=1):
|
||||
if "Contents" in page.attrs:
|
||||
search_url(page.attrs["Contents"], url_data, pageno, set())
|
||||
if "Annots" in page.attrs:
|
||||
|
|
|
|||
|
|
@ -249,6 +249,14 @@ def has_word():
|
|||
need_word = _need_func(has_word, 'Word')
|
||||
|
||||
|
||||
@memoized
|
||||
def has_pdflib():
|
||||
from linkcheck.plugins import parsepdf
|
||||
return parsepdf.has_pdflib
|
||||
|
||||
need_pdflib = _need_func(has_pdflib, 'pdflib')
|
||||
|
||||
|
||||
@contextmanager
|
||||
def _limit_time (seconds):
|
||||
"""Raises LinkCheckerInterrupt if given number of seconds have passed."""
|
||||
|
|
|
|||
BIN
tests/checker/data/file.pdf
Normal file
BIN
tests/checker/data/file.pdf
Normal file
Binary file not shown.
15
tests/checker/data/file.pdf.result
Normal file
15
tests/checker/data/file.pdf.result
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
url file://%(curdir)s/%(datadir)s/file.pdf
|
||||
cache key file://%(curdir)s/%(datadir)s/file.pdf
|
||||
real url file://%(curdir)s/%(datadir)s/file.pdf
|
||||
name %(datadir)s/file.pdf
|
||||
valid
|
||||
|
||||
url http://www.example.com/link1
|
||||
cache key http://www.example.com/link1
|
||||
real url http://www.example.com/link1
|
||||
error
|
||||
|
||||
url http://www.example.com/link2
|
||||
cache key http://www.example.com/link2
|
||||
real url http://www.example.com/link2
|
||||
error
|
||||
|
|
@ -20,7 +20,7 @@ Test file parsing.
|
|||
import os
|
||||
import sys
|
||||
import zipfile
|
||||
from tests import need_word
|
||||
from tests import need_word, need_pdflib
|
||||
from . import LinkCheckTest, get_file
|
||||
|
||||
|
||||
|
|
@ -65,7 +65,13 @@ class TestFile (LinkCheckTest):
|
|||
|
||||
@need_word
|
||||
def test_word (self):
|
||||
self.file_test("file.doc")
|
||||
confargs = dict(enabledplugins=["WordParser"])
|
||||
self.file_test("file.doc", confargs=confargs)
|
||||
|
||||
@need_pdflib
|
||||
def test_pdf(self):
|
||||
confargs = dict(enabledplugins=["PdfParser"])
|
||||
self.file_test("file.pdf", confargs=confargs)
|
||||
|
||||
def test_urllist (self):
|
||||
self.file_test("urllist.txt")
|
||||
|
|
|
|||
Loading…
Reference in a new issue