From b152ce7a6e47bd630d96a83cd4d98065f7e4e9e9 Mon Sep 17 00:00:00 2001 From: Bastian Kleineidam Date: Tue, 29 Apr 2014 18:53:24 +0200 Subject: [PATCH] Add PDF test and fix page number. --- linkcheck/plugins/parsepdf.py | 5 ++++- tests/__init__.py | 8 ++++++++ tests/checker/data/file.pdf | Bin 0 -> 7411 bytes tests/checker/data/file.pdf.result | 15 +++++++++++++++ tests/checker/test_file.py | 10 ++++++++-- 5 files changed, 35 insertions(+), 3 deletions(-) create mode 100644 tests/checker/data/file.pdf create mode 100644 tests/checker/data/file.pdf.result diff --git a/linkcheck/plugins/parsepdf.py b/linkcheck/plugins/parsepdf.py index 557810a7..d33c800a 100755 --- a/linkcheck/plugins/parsepdf.py +++ b/linkcheck/plugins/parsepdf.py @@ -44,6 +44,9 @@ def search_url(obj, url_data, pageno, seen_objs): if isinstance(obj, dict): for key, value in obj.items(): if key == 'URI' and isinstance(value, basestring): + # URIs should be 7bit ASCII encoded, but be safe and encode + # to unicode + # XXX this does not use an optional specified base URL url = strformat.unicode_safe(value) url_data.add_url(url, page=pageno) else: @@ -78,7 +81,7 @@ class PdfParser(_ParserPlugin): try: parser = PDFParser(fp) doc = PDFDocument(parser, password=password) - for (pageno, page) in enumerate(PDFPage.create_pages(doc)): + for (pageno, page) in enumerate(PDFPage.create_pages(doc), start=1): if "Contents" in page.attrs: search_url(page.attrs["Contents"], url_data, pageno, set()) if "Annots" in page.attrs: diff --git a/tests/__init__.py b/tests/__init__.py index 179b4a7b..41d3e10f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -249,6 +249,14 @@ def has_word(): need_word = _need_func(has_word, 'Word') +@memoized +def has_pdflib(): + from linkcheck.plugins import parsepdf + return parsepdf.has_pdflib + +need_pdflib = _need_func(has_pdflib, 'pdflib') + + @contextmanager def _limit_time (seconds): """Raises LinkCheckerInterrupt if given number of seconds have passed.""" diff --git a/tests/checker/data/file.pdf b/tests/checker/data/file.pdf new file mode 100644 index 0000000000000000000000000000000000000000..244d57ffa9dbdb3dee1333c3401d9f27f8a14a56 GIT binary patch literal 7411 zcmc&(c{o&m+b^4$&!6}&-7cKe)T-pbG_I5$9t~p%sJg+}s#;9Uqb5{^Wq*pif#KvbFsiQo#N zCmUX?1l8&C;MO-IvRMi$d)GlhyjMY99W1*G;ZDR|oukVZ+Q(5X&--QTb)au|bC&St zOemb?F@|24#MD!5@Fz5HLEk$*ZInc0e-(wW=72%iIy$hLn51>7<605QU6+`wco zjBkI)kiR8~`rjw{)t5yDq|8z|@t>1?G{oB<*&>inOCFYmr3E~U5SV!?!TUfleKb!` zP7=j*{^mfoo#Rux&18{F(XHiE4i;I+YnN3~^QWvX+QdV*w5Lj6Bt#A$OSDM-Gs!oK z!n*$^Nk;g;CJBT7mL&}KAJY7@*Dx>|fx`UU*v}@uy5`1Sr>;y%3KI{34nHZ4l2=V1-{CN+78nc0j2Ccs=!1^?R~>Dut~~yI5vpq^{{1G zqtfr)yK1p`!uB3<-~!jqh~Ae))4b;|PGNufXisdfc?11q*cmq#o*Sw`3OSy( zu=`ozImo+NvL5jnGI^61dW4_#v0pR1^}u}J=TeJU43@QGLp{YyWAlMRL3<^iS+O#~ zBc{--LxskFtsHk}7p`>b&JoRWX6Z%r-M9}z#X&_m5(GJUw&o-0qaG(w+OLwrRN6h3 zt2%jU<&wFt``_2znEEu2wO(jjfEE?xd9^?HQc7@=Ixqnsqa@NTf`Nam)Fx=@bm zGvN#+w&ZBUcC(Xkf-(ZNMYT?6Ye_omBis=OO1~--slP4~6R+2K(N|)u5fu35x{*xe z73wR=OUGv8SDy=%qh(Cbc~>TXEbfKqC#fCTEp#>IXsl;aztfb}+d1D`0*`r2xN@_* zUy1rTHFB)S#AH@%I9dj6jztd9`qoK@tYiB{G?WG7w4^j{w=(bZDynKqi8U-KxiQ*! zCoDyB!&F;gGB^`?2?l1<)Xv~4)mcK4S zDUCZ0M_ug$ryd?_k}7Au^Jdg!uN+T~1gSvr!GjCZ&&k_IvX5eax{Mn^;(!wiJ9X#b`e~aORd0!*vzgB?YglZx=$2y8Ri+55 zM}e*Fvo^1SrONVAW%^s~VJ;jA*s@MMuBKl?e!Z}EGxqfRHwO+GW!Z4mo8Ah8yH0*D zT~?pAVBT`QgL}&~rbD8B{EVMZ>Lj~0H_IjIEb7bjUXM^t@y{FuBK^#s**VuBeGLuucq8vTcSt3w zewPl%99H7sNt?h2m`FwTCGv0TnS6`%dL#cGJf!hmrlGS*ynnX!?7-9UQn|7|vZZ<`|wlmFN{4r^vk3 zqOg@N*Z;Lfox_y_v3xE*_4O6g7v8~F^+Mn`U_!)%OtLL4qI&hoe(SR_k9f}vhuX7? zCy+T#Kozi^0yRPBUl=&P&bFSaljgfdxHh*seE^$#`s1{qCd*p?p-`1vcE2#KbkW@C zE`)X#9%8RKY$vPcWVgZ~290 zG>eo9LX;1Jg^jSnS+Nb!~0!@eQHO3-0pUZ5fHFzJWOBr#BJ-i9XG@|TV!Oo_fDzjMAgNNY*AdGFQPk^#We!L8+w8grsl?MSrw@@K{$h`P5{dJbweR3lk1AKx9t zn0we~Y#6Rr*J9@eX1TeJC4Mx(YWWzLS!sVoqdO|z>AlK3`BLkaf{UaSMDo3f6XG^m zQc@CvwA;2$U8!@oG4?InrVst7$pzf}ISW|@z{ypqpUai0`hl2y}+!jzhI>&8tk`+Nm`s|>FS>Eddg%Y7FIqR%@-x(B9+I12!MiETJiUUb#^{)$AJ@ia? zesSfcOD!dwQ%@WCk73?*mzHD~T4GAvhT4xm7-za{>geaV5U?~bZeo~KW3Qgpa{fb` zm1)|wve3SR83~tLnzsX%G^C~xoCh7mIEO3lm~On$G%dFr;p~}_74~RZh#H(;^V+mj zELF@@w5M>{w+S70)my)Lo9g6i_UJ9S=Pi$jS)Mcm(tE$wswiE))G!k{NqeV|cU!E$ zDc@nR+wG7x*+7kcH!G*ZowY>jwGQ>U79@cqK{efX%yc2ylqyKM-Fg$<3+92=*jE#J zcErLKOkddVZBH~sud!9VDyxJj_MpYnWIk3mJMXAQ>Ylmp58>I3u?Tn~8_YyePAw3) zLY}`6@MedP8DD&y)&1j2ZTBU6Vj?)Qm&W}W}Ytmxb1^toQ^(wCiL2b74e`ud+!&wjhf_TIgi|4C}I4|l=xWcE41 z&X%>q7x}a>KFFQL`qTPWx-Cw{I(c_r_4v!x@}1N_b0zzDcKpM%#v{wu(onOZ{iVD^@0L9(C?^4#uZqb$#EQ znK{2VF5FI4v!?)ZG7xTm(DOFSE9sl9x3W>@7Di5awZ_~JO`UV?`%c{A&jL^E7wfr! zJmhp&c(RMNpXlB$ju_G}U{ zO!0H@MQpD@Dg>{LP7vQr8EzFQM+w?%)(nMkUP}oS@y*zpByGJhIwmrAtzIWl?s7>6 zAE9Mv2XYO4V^gWt zNoMh>-@mzjkKMkoa>iE)J6`LJC|6LNV6L_=~K z?y_-5M5w4lh3+3-vBeljc|Ao|R(da$cn(7F`{)}wje)7(=+G^Hfup9bANTePMzh7m zOa%4qk5;IJ=cBOyM`FIG2mQ&}FAu?YrYaMrwyU=E?1^-zN$likRbQod|5 zJC@>zHC&&nLMZTzAI^5pXg0kmEk%}h#9R5WTm(8#S@zGX z#pKsbSQy&$YfHeqPto;E^|eqLZm=@`k8)g>DUWNMx=g2iFe9aAsg36syo)tIafP4s zT7<~1* zxcghf$YI3@PwrFQ`9|&MNzr?!2)nXxtrkXKvwK`wWcrHq{qh!sNSzei8)+1^BGOAU z8rt)q0lmca*)KRNdDC_V<+F_1ONk?vhRq(hvokSrK1^9b~^6s*|#;+a1t z2fH99moPr=yAZJ2w-M~4T{67?LFo1Hxp?Y;q+pN5*Y*1|PJ$OsT&s50?;UkMvaX@z zrX4?0jxVduxsV#1!e{SnM=a7Wnr&@s9Zcxfu5w@6v*ukOFB#G+RiNMUuuZc`+C3mX zY*?g4vupYCkk7>>1z2)vvUl?M)@Zp?3pMZIZTmLYr?2l;x<|v8zmI!dyO)~TTqxG2 zpiI23t?0D`BY#=xA6e{fJ{5-;&UuUzo4Os|b4jwnQ`PsKT>iOwDTz?iUO)44G3e-1 z&eXSq0f$=7I0uW==uGGnpGX)W+zyc0dYe_tW#04YpI`LgYv5$o0V2|qoY|g{(jy3Yv(7axABsOo9WrJ?{{lc7^&bi?Fk;R*4 ziq2JI)pZqDm8#p?)i@So(W{#$7Fq z9bghM;_sGE-@jBq`8VXw57vgk?jR5-^xqs7>?fwqz{iaQ!BfaI9EnQwaCDQpOeK=YG%!XM27)jaV?0Q~VJHv;OK?*sIodnW76^z~;3OHri*8zBnAz(x`h=96VpAibzSdjyOD%hfh3~Iyw_oX_^z!2*xb?+4I z&o3IN8wb@l$EkM{ZE~Z+~Kou!N?cgjiecBPJ&!U z+=h!v_dX_a%PE@l72{4 z2vicI4gVPnjviDR)`8%`=&=^z&wHS&#*Vf$2dcRY3JFF5?SrGyU?dd!kJI9h7XJ%+ z;2+S`13?TZhGa(uRSf>o=YMwoCk_@51H%}J{8QmCP7{CzBX%&1VF7s6^`H>-Ni=f^ zKwd)(NOYRTKM?gxL39cBB#17-g8@`oh69KmiAwSGAd;wH*dMH~Hp$kJz`#EPo=90F z7=wUYK$OX33XN(GLNkoOK$2=66i+uG7=56w1Txi);gjeC!RiAW!OM|I(o<7`Xn<)z z&6SiOSRi6xNh+AJOZ9$2{I3oBKgWmpzs2{Xs6WNWs1ak<0Ezy94kJD)=(nPQAb(d4 zF!;M{j5=vL+EUHIKbU6lkFGL=Sc)gm0QgS}G3xsBLO<&LH*~ce$<7cJiia)9<40Sd z7C;C@nmG~$Mx$ZUK#x$;09%bgO3MJP`YTvvB4fS*9WvC@V4UI(G@6?n1mf-OElr{m zT-{tq(nN|Y#0BsUI|&5yQ#_11{(r^;|0N#SuLZ&qfKg1b2fz$;xnL4c<} z4=@UeltF^+z<vh#rLk3}*&ff#Iw{wxfW5AdB&U1Dk^tWUvT4 z5~_?t!|}4pa22Qu1_qUtMc`2?s#u`tcqPz(S7D409EFH=AQ7FZo~~dBN)?Agp=Fg- tpco`Z1%*(-qi_gS6dD1;z;FmGoB?zyjo?B10UHzsjsb~?sGh@v{s+2VVut_# literal 0 HcmV?d00001 diff --git a/tests/checker/data/file.pdf.result b/tests/checker/data/file.pdf.result new file mode 100644 index 00000000..752592a4 --- /dev/null +++ b/tests/checker/data/file.pdf.result @@ -0,0 +1,15 @@ +url file://%(curdir)s/%(datadir)s/file.pdf +cache key file://%(curdir)s/%(datadir)s/file.pdf +real url file://%(curdir)s/%(datadir)s/file.pdf +name %(datadir)s/file.pdf +valid + +url http://www.example.com/link1 +cache key http://www.example.com/link1 +real url http://www.example.com/link1 +error + +url http://www.example.com/link2 +cache key http://www.example.com/link2 +real url http://www.example.com/link2 +error diff --git a/tests/checker/test_file.py b/tests/checker/test_file.py index 481846bf..6580aaa8 100644 --- a/tests/checker/test_file.py +++ b/tests/checker/test_file.py @@ -20,7 +20,7 @@ Test file parsing. import os import sys import zipfile -from tests import need_word +from tests import need_word, need_pdflib from . import LinkCheckTest, get_file @@ -65,7 +65,13 @@ class TestFile (LinkCheckTest): @need_word def test_word (self): - self.file_test("file.doc") + confargs = dict(enabledplugins=["WordParser"]) + self.file_test("file.doc", confargs=confargs) + + @need_pdflib + def test_pdf(self): + confargs = dict(enabledplugins=["PdfParser"]) + self.file_test("file.pdf", confargs=confargs) def test_urllist (self): self.file_test("urllist.txt")