From 9ab895751fe8e030eed9a82bc7bfdb806bbeb5e0 Mon Sep 17 00:00:00 2001 From: calvin Date: Thu, 20 Nov 2008 07:51:22 +0000 Subject: [PATCH] Support parsing of Firefox 3 bookmark files git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3862 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- ChangeLog.txt | 4 ++++ debian/control | 2 +- linkcheck/checker/const.py | 11 ++++++++++- linkcheck/checker/fileurl.py | 33 ++++++++++++++++++++++++++++++++- tests/checker/test_file.py | 4 ++++ 5 files changed, 51 insertions(+), 3 deletions(-) diff --git a/ChangeLog.txt b/ChangeLog.txt index feb0ee2c..b5945725 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -71,6 +71,10 @@ Type: feature Changed: linkcheck/director/__init__.py + * Support reading Firefox 3 bookmark files in SQLite format. + Type: feature + Changed: linkcheck/checker/fileurl.py + 4.9 "Michael Clayton" (released 25.4.2008) * Parse Shockwave Flash (SWF) for URLs to check diff --git a/debian/control b/debian/control index a369f3fe..3afb5d37 100644 --- a/debian/control +++ b/debian/control @@ -16,7 +16,7 @@ XB-Python-Version: ${python:Versions} Provides: ${python:Provides} Suggests: apache | httpd, python-optcomplete (>= 1.2-5), python-geoip (>= 1.2.1-2), clamav-daemon, python-utidylib, - python-cssutils + python-cssutils, python-pysqlite2 Description: check websites and HTML documents for broken links Provides a command line program and web interface to check links of websites and HTML documents. 
diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index 2cca3bc3..f1243736 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -62,6 +62,14 @@ ExcNoCacheList = [ socket.timeout, ] +# firefox bookmark file needs sqlite3 for parsing +try: + import sqlite3 + ExcCacheList.append(sqlite3.Error) +except ImportError: + pass + + ExcList = ExcCacheList + ExcNoCacheList WARN_URL_EFFECTIVE_URL = "url-effective-url" @@ -134,7 +142,7 @@ Warnings = { # file extensions we can parse recursively PARSE_EXTENSIONS = { "html": re.compile(r'(?i)\.s?html?$'), - "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file + "opera": re.compile(r'/(?i)opera.adr$'), # opera bookmark file "css": re.compile(r'(?i)\.css$'), # CSS stylesheet "swf": re.compile(r'(?i)\.swf$'), # SWF file } @@ -157,3 +165,4 @@ PARSE_CONTENTS = { "opera" : re.compile(r'^Opera Hotlist'), "text" : re.compile(r'(?i)^# LinkChecker URL list'), } + diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 940f86e3..d65924b1 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -25,11 +25,16 @@ import urlparse import urllib import urllib2 -from . import urlbase, get_index_html, absolute_url +from . import urlbase, get_index_html, absolute_url, get_url_from from .. import log, LOG_CHECK, fileutil, strformat, url as urlutil from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, \ PARSE_EXTENSIONS, PARSE_CONTENTS +try: + import sqlite3 + has_sqlite = True +except ImportError: + has_sqlite = False def get_files (dirname): """ @@ -79,6 +84,8 @@ def is_absolute_path (path): return path.startswith("/") +firefox_extension = re.compile(r'(?i)/places\.sqlite$') + class FileUrl (urlbase.UrlBase): """ Url link with file scheme.
@@ -244,6 +251,8 @@ class FileUrl (urlbase.UrlBase): for ro in PARSE_EXTENSIONS.values(): if ro.search(self.url): return True + if has_sqlite and firefox_extension.search(self.url): + return True # try to read content (can fail, so catch error) try: for ro in PARSE_CONTENTS.values(): @@ -264,11 +273,33 @@ class FileUrl (urlbase.UrlBase): if ro.search(self.url): getattr(self, "parse_"+key)() return + if has_sqlite and firefox_extension.search(self.url): + self.parse_firefox() + return for key, ro in PARSE_CONTENTS.items(): if ro.search(self.get_content()[:30]): getattr(self, "parse_"+key)() return + def parse_firefox (self): + """Parse a Firefox3 bookmark file.""" + log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self) + conn = sqlite3.connect(self.get_os_filename(), timeout=0.5) + try: + c = conn.cursor() + try: + sql = """SELECT moz_places.url, moz_places.title + FROM moz_places WHERE hidden=0""" + c.execute(sql) + for url, name in c: + url_data = get_url_from(url, self.recursion_level+1, + self.aggregate, parent_url=self.url, name=name) + self.aggregate.urlqueue.put(url_data) + finally: + c.close() + finally: + conn.close() + def get_intern_pattern (self): """ Get pattern for intern URL matching. diff --git a/tests/checker/test_file.py b/tests/checker/test_file.py index b8304203..27048ed4 100644 --- a/tests/checker/test_file.py +++ b/tests/checker/test_file.py @@ -58,6 +58,10 @@ class TestFile (LinkCheckTest): """ self.file_test("urllist.txt") + def test_firefox_bookmarks (self): + """Test firefox 3 bookmark file parsing.""" + self.file_test("places.sqlite") + def test_good_file (self): url = u"file://%(curdir)s/%(datadir)s/file.txt" % self.get_attrs() nurl = self.norm(url)