# Copyright (C) 2014 Bastian Kleineidam # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. """ Parse links in PDF files with pdfminer. """ from io import BytesIO from . import _ParserPlugin try: from pdfminer.pdfparser import PDFParser from pdfminer.pdfdocument import PDFDocument from pdfminer.pdftypes import PDFStream, PDFObjRef from pdfminer.pdfpage import PDFPage from pdfminer.psparser import PSException except ImportError: has_pdflib = False else: has_pdflib = True from .. import log, LOG_PLUGIN def search_url(obj, url_data, pageno, seen_objs): """Recurse through a PDF object, searching for URLs.""" if isinstance(obj, PDFObjRef): if obj.objid in seen_objs: # prevent recursive loops return seen_objs.add(obj.objid) obj = obj.resolve() if isinstance(obj, dict): for key, value in obj.items(): if key == 'URI': url_data.add_url(value.decode("ascii"), page=pageno) else: search_url(value, url_data, pageno, seen_objs) elif isinstance(obj, list): for elem in obj: search_url(elem, url_data, pageno, seen_objs) elif isinstance(obj, PDFStream): search_url(obj.attrs, url_data, pageno, seen_objs) class PdfParser(_ParserPlugin): """PDF parsing plugin.""" def __init__(self, config): """Check for pdfminer.""" if not has_pdflib: log.warn(LOG_PLUGIN, "pdfminer not found for PdfParser plugin") super().__init__(config) def applies_to(self, url_data, pagetype=None): """Check for PDF pagetype.""" return has_pdflib and pagetype == 'pdf' def check(self, url_data): """Parse PDF data.""" # XXX user authentication from url_data password = '' data = url_data.get_raw_content() # PDFParser needs a seekable file object fp = BytesIO(data) try: parser = PDFParser(fp) doc = PDFDocument(parser, password=password) for (pageno, page) in enumerate(PDFPage.create_pages(doc), start=1): if "Contents" in page.attrs: search_url(page.attrs["Contents"], url_data, pageno, set()) if "Annots" in page.attrs: search_url(page.attrs["Annots"], url_data, pageno, set()) except PSException as msg: if not msg.args: # at least show the class name msg = repr(msg) log.warn(LOG_PLUGIN, "Error parsing PDF file: %s", msg)