From 9ab895751fe8e030eed9a82bc7bfdb806bbeb5e0 Mon Sep 17 00:00:00 2001 From: calvin Date: Thu, 20 Nov 2008 07:51:22 +0000 Subject: [PATCH] Support parsing of Firefox 3 bookmark files git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3862 e7d03fd6-7b0d-0410-9947-9c21f3af8025 --- ChangeLog.txt | 4 ++++ debian/control | 2 +- linkcheck/checker/const.py | 11 ++++++++++- linkcheck/checker/fileurl.py | 33 ++++++++++++++++++++++++++++++++- tests/checker/test_file.py | 4 ++++ 5 files changed, 51 insertions(+), 3 deletions(-) diff --git a/ChangeLog.txt b/ChangeLog.txt index feb0ee2c..b5945725 100644 --- a/ChangeLog.txt +++ b/ChangeLog.txt @@ -71,6 +71,10 @@ Type: feature Changed: linkcheck/director/__init__.py + * Support reading Firefox 3 bookmark files in SQLite format. + Type: feature + Changed: linkcheck/checker/fileurl.py + 4.9 "Michael Clayton" (released 25.4.2008) * Parse Shockwave Flash (SWF) for URLs to check diff --git a/debian/control b/debian/control index a369f3fe..3afb5d37 100644 --- a/debian/control +++ b/debian/control @@ -16,7 +16,7 @@ XB-Python-Version: ${python:Versions} Provides: ${python:Provides} Suggests: apache | httpd, python-optcomplete (>= 1.2-5), python-geoip (>= 1.2.1-2), clamav-daemon, python-utidylib, - python-cssutils + python-cssutils, python-pysqlite2 Description: check websites and HTML documents for broken links Provides a command line program and web interface to check links of websites and HTML documents. 
diff --git a/linkcheck/checker/const.py b/linkcheck/checker/const.py index 2cca3bc3..f1243736 100644 --- a/linkcheck/checker/const.py +++ b/linkcheck/checker/const.py @@ -62,6 +62,14 @@ ExcNoCacheList = [ socket.timeout, ] +# firefox bookmark file needs sqlite3 for parsing +try: + import sqlite3 + ExcCacheList.append(sqlite3.Error) +except ImportError: + pass + + ExcList = ExcCacheList + ExcNoCacheList WARN_URL_EFFECTIVE_URL = "url-effective-url" @@ -134,7 +142,7 @@ Warnings = { # file extensions we can parse recursively PARSE_EXTENSIONS = { "html": re.compile(r'(?i)\.s?html?$'), - "opera": re.compile(r'^(?i)opera.adr$'), # opera bookmark file + "opera": re.compile(r'/(?i)opera.adr$'), # opera bookmark file "css": re.compile(r'(?i)\.css$'), # CSS stylesheet "swf": re.compile(r'(?i)\.swf$'), # SWF file } @@ -157,3 +165,4 @@ PARSE_CONTENTS = { "opera" : re.compile(r'^Opera Hotlist'), "text" : re.compile(r'(?i)^# LinkChecker URL list'), } + diff --git a/linkcheck/checker/fileurl.py b/linkcheck/checker/fileurl.py index 940f86e3..d65924b1 100644 --- a/linkcheck/checker/fileurl.py +++ b/linkcheck/checker/fileurl.py @@ -25,11 +25,16 @@ import urlparse import urllib import urllib2 -from . import urlbase, get_index_html, absolute_url +from . import urlbase, get_index_html, absolute_url, get_url_from from .. import log, LOG_CHECK, fileutil, strformat, url as urlutil from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, \ PARSE_EXTENSIONS, PARSE_CONTENTS +try: + import sqlite3 + has_sqlite = True +except ImportError: + has_sqlite = False def get_files (dirname): """ @@ -79,6 +84,8 @@ def is_absolute_path (path): return path.startswith("/") +firefox_extension = re.compile(r'(?i)/places\.sqlite$') + class FileUrl (urlbase.UrlBase): """ Url link with file scheme.
@@ -244,6 +251,8 @@ class FileUrl (urlbase.UrlBase): for ro in PARSE_EXTENSIONS.values(): if ro.search(self.url): return True + if has_sqlite and firefox_extension.search(self.url): + return True # try to read content (can fail, so catch error) try: for ro in PARSE_CONTENTS.values(): @@ -264,11 +273,33 @@ class FileUrl (urlbase.UrlBase): if ro.search(self.url): getattr(self, "parse_"+key)() return + if has_sqlite and firefox_extension.search(self.url): + self.parse_firefox() + return for key, ro in PARSE_CONTENTS.items(): if ro.search(self.get_content()[:30]): getattr(self, "parse_"+key)() return + def parse_firefox (self): + """Parse a Firefox3 bookmark file.""" + log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self) + conn = sqlite3.connect(self.get_os_filename(), timeout=0.5) + try: + c = conn.cursor() + try: + sql = """SELECT moz_places.url, moz_places.title + FROM moz_places WHERE hidden=0""" + c.execute(sql) + for url, name in c: + url_data = get_url_from(url, self.recursion_level+1, + self.aggregate, parent_url=self.url, name=name) + self.aggregate.urlqueue.put(url_data) + finally: + c.close() + finally: + conn.close() + def get_intern_pattern (self): """ Get pattern for intern URL matching. diff --git a/tests/checker/test_file.py b/tests/checker/test_file.py index b8304203..27048ed4 100644 --- a/tests/checker/test_file.py +++ b/tests/checker/test_file.py @@ -58,6 +58,10 @@ class TestFile (LinkCheckTest): """ self.file_test("urllist.txt") + def test_firefox_bookmarks (self): + """Test firefox 3 bookmark file parsing.""" + self.file_test("places.sqlite") + def test_good_file (self): url = u"file://%(curdir)s/%(datadir)s/file.txt" % self.get_attrs() nurl = self.norm(url)