Support parsing of Firefox 3 bookmark files

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3862 e7d03fd6-7b0d-0410-9947-9c21f3af8025
2026-04-15 20:01:03 +00:00 · 2008-11-20 07:51:22 +00:00 · 2008-11-20 07:51:22 +00:00 · 9ab895751f
commit 9ab895751f
parent 22d2784a58
5 changed files with 51 additions and 3 deletions
--- a/ChangeLog.txt
+++ b/ChangeLog.txt
@@ -71,6 +71,10 @@
    Type: feature
    Changed: linkcheck/director/__init__.py

+  * Support reading Firefox 3 bookmark files in SQLite format.
+    Type: feature
+    Changed: linkcheck/checker/fileurl.py
+
 4.9 "Michael Clayton" (released 25.4.2008)

  * Parse Shockwave Flash (SWF) for URLs to check
--- a/debian/control
+++ b/debian/control
@@ -16,7 +16,7 @@ XB-Python-Version: ${python:Versions}
 Provides: ${python:Provides}
 Suggests: apache | httpd, python-optcomplete (>= 1.2-5),
 python-geoip (>= 1.2.1-2), clamav-daemon, python-utidylib,
- python-cssutils
+ python-cssutils, python-pysqlite2
 Description: check websites and HTML documents for broken links
 Provides a command line program and web interface to check links
 of websites and HTML documents.
--- a/linkcheck/checker/const.py
+++ b/linkcheck/checker/const.py
@@ -62,6 +62,14 @@ ExcNoCacheList = [
    socket.timeout,
 ]

+# firefox bookmark file needs sqlite3 for parsing
+try:
+    import sqlite3
+    ExcCacheList.append(sqlite3.Error)
+except ImportError:
+    pass
+
+
 ExcList = ExcCacheList + ExcNoCacheList

 WARN_URL_EFFECTIVE_URL = "url-effective-url"
@@ -134,7 +142,7 @@ Warnings = {
 # file extensions we can parse recursively
 PARSE_EXTENSIONS = {
    "html": re.compile(r'(?i)\.s?html?$'),
-    "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file
+    "opera": re.compile(r'/(?i)opera.adr$'), # opera bookmark file
    "css": re.compile(r'(?i)\.css$'), # CSS stylesheet
    "swf": re.compile(r'(?i)\.swf$'), # SWF file
 }
@@ -157,3 +165,4 @@ PARSE_CONTENTS = {
    "opera" : re.compile(r'^Opera Hotlist'),
    "text" : re.compile(r'(?i)^# LinkChecker URL list'),
 }
+
--- a/linkcheck/checker/fileurl.py
+++ b/linkcheck/checker/fileurl.py
@@ -25,11 +25,16 @@ import urlparse
 import urllib
 import urllib2

-from . import urlbase, get_index_html, absolute_url
+from . import urlbase, get_index_html, absolute_url, get_url_from
 from .. import log, LOG_CHECK, fileutil, strformat, url as urlutil
 from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, \
    PARSE_EXTENSIONS, PARSE_CONTENTS

+try:
+    import sqlite3
+    has_sqlite = True
+except ImportError:
+    has_sqlite = False

 def get_files (dirname):
    """
@@ -79,6 +84,8 @@ def is_absolute_path (path):
    return path.startswith("/")


+firefox_extension = re.compile(r'/(?i)places.sqlite$')
+
 class FileUrl (urlbase.UrlBase):
    """
    Url link with file scheme.
@@ -244,6 +251,8 @@ class FileUrl (urlbase.UrlBase):
        for ro in PARSE_EXTENSIONS.values():
            if ro.search(self.url):
                return True
+        if firefox_extension.search(self.url):
+            return True
        # try to read content (can fail, so catch error)
        try:
            for ro in PARSE_CONTENTS.values():
@@ -264,11 +273,33 @@ class FileUrl (urlbase.UrlBase):
            if ro.search(self.url):
                getattr(self, "parse_"+key)()
                return
+        if has_sqlite and firefox_extension.search(self.url):
+            self.parse_firefox()
+            return
        for key, ro in PARSE_CONTENTS.items():
            if ro.search(self.get_content()[:30]):
                getattr(self, "parse_"+key)()
                return

+    def parse_firefox (self):
+        """Parse a Firefox3 bookmark file."""
+        log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self)
+        conn = sqlite3.connect(self.get_os_filename(), timeout=0.5)
+        try:
+            c = conn.cursor()
+            try:
+                sql = """SELECT moz_places.url, moz_places.title
+                FROM moz_places WHERE hidden=0"""
+                c.execute(sql)
+                for url, name in c:
+                    url_data = get_url_from(url, self.recursion_level+1,
+                        self.aggregate, parent_url=self.url, name=name)
+                    self.aggregate.urlqueue.put(url_data)
+            finally:
+                c.close()
+        finally:
+            conn.close()
+
    def get_intern_pattern (self):
        """
        Get pattern for intern URL matching.
--- a/tests/checker/test_file.py
+++ b/tests/checker/test_file.py
@@ -58,6 +58,10 @@ class TestFile (LinkCheckTest):
        """
        self.file_test("urllist.txt")

+    def test_firefox_bookmarks (self):
+        """Test firefox 3 bookmark file parsing."""
+        self.file_test("places.sqlite")
+
    def test_good_file (self):
        url = u"file://%(curdir)s/%(datadir)s/file.txt" % self.get_attrs()
        nurl = self.norm(url)