Support parsing of Firefox 3 bookmark files

git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3862 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
calvin 2008-11-20 07:51:22 +00:00
parent 22d2784a58
commit 9ab895751f
5 changed files with 51 additions and 3 deletions

View file

@ -71,6 +71,10 @@
Type: feature
Changed: linkcheck/director/__init__.py
* Support reading Firefox 3 bookmark files in SQLite format.
Type: feature
Changed: linkcheck/checker/fileurl.py
4.9 "Michael Clayton" (released 25.4.2008)
* Parse Shockwave Flash (SWF) for URLs to check

2
debian/control vendored
View file

@ -16,7 +16,7 @@ XB-Python-Version: ${python:Versions}
Provides: ${python:Provides}
Suggests: apache | httpd, python-optcomplete (>= 1.2-5),
python-geoip (>= 1.2.1-2), clamav-daemon, python-utidylib,
python-cssutils
python-cssutils, python-pysqlite2
Description: check websites and HTML documents for broken links
Provides a command line program and web interface to check links
of websites and HTML documents.

View file

@ -62,6 +62,14 @@ ExcNoCacheList = [
socket.timeout,
]
# Firefox 3 bookmark files are SQLite databases, so parsing them needs the
# sqlite3 module.  The module is optional: its base error class is only
# registered as a cacheable exception when the import succeeds.
try:
    import sqlite3
except ImportError:
    # No SQLite support available; nothing to register.
    pass
else:
    ExcCacheList.append(sqlite3.Error)
# Built after the optional registration so it includes sqlite3.Error if present.
ExcList = ExcCacheList + ExcNoCacheList
WARN_URL_EFFECTIVE_URL = "url-effective-url"
@ -134,7 +142,7 @@ Warnings = {
# File name patterns of URLs whose content can be parsed recursively.
# Each key names the parser; each value is searched against the URL.
# The inline (?i) flag is global and therefore written at the very start of
# each pattern: placing it later is deprecated since Python 3.6 and rejected
# by re.compile() since Python 3.11.
PARSE_EXTENSIONS = {
    "html": re.compile(r'(?i)\.s?html?$'),
    # Opera bookmark file: the last path component must be exactly
    # "opera.adr" (the dot is escaped so it only matches a literal dot).
    "opera": re.compile(r'(?i)/opera\.adr$'),
    "css": re.compile(r'(?i)\.css$'),  # CSS stylesheet
    "swf": re.compile(r'(?i)\.swf$'),  # SWF file
}
@ -157,3 +165,4 @@ PARSE_CONTENTS = {
"opera" : re.compile(r'^Opera Hotlist'),
"text" : re.compile(r'(?i)^# LinkChecker URL list'),
}

View file

@ -25,11 +25,16 @@ import urlparse
import urllib
import urllib2
from . import urlbase, get_index_html, absolute_url
from . import urlbase, get_index_html, absolute_url, get_url_from
from .. import log, LOG_CHECK, fileutil, strformat, url as urlutil
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, \
PARSE_EXTENSIONS, PARSE_CONTENTS
try:
import sqlite3
has_sqlite = True
except ImportError:
has_sqlite = False
def get_files (dirname):
"""
@ -79,6 +84,8 @@ def is_absolute_path (path):
return path.startswith("/")
# Matches URLs whose last path component is the Firefox 3 bookmark database
# "places.sqlite", ignoring case.  The global (?i) flag is placed at the
# start of the pattern (required since Python 3.11, deprecated elsewhere
# since 3.6) and the dot is escaped so it only matches a literal ".".
firefox_extension = re.compile(r'(?i)/places\.sqlite$')
class FileUrl (urlbase.UrlBase):
"""
Url link with file scheme.
@ -244,6 +251,8 @@ class FileUrl (urlbase.UrlBase):
for ro in PARSE_EXTENSIONS.values():
if ro.search(self.url):
return True
if firefox_extension.search(self.url):
return True
# try to read content (can fail, so catch error)
try:
for ro in PARSE_CONTENTS.values():
@ -264,11 +273,33 @@ class FileUrl (urlbase.UrlBase):
if ro.search(self.url):
getattr(self, "parse_"+key)()
return
if has_sqlite and firefox_extension.search(self.url):
self.parse_firefox()
return
for key, ro in PARSE_CONTENTS.items():
if ro.search(self.get_content()[:30]):
getattr(self, "parse_"+key)()
return
def parse_firefox (self):
    """Parse a Firefox 3 bookmark file and queue the URLs it contains.

    Opens the SQLite database behind this URL, reads the URL and title of
    every non-hidden row of the ``moz_places`` table and puts one URL
    object per row on the aggregate's URL queue, one recursion level
    below this URL.

    Errors raised by sqlite3 are not caught here; the cursor and the
    connection are closed in all cases.
    """
    log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self)
    # NOTE(review): the short timeout presumably avoids hanging on a
    # database locked by a running browser -- confirm.
    conn = sqlite3.connect(self.get_os_filename(), timeout=0.5)
    try:
        c = conn.cursor()
        try:
            sql = """SELECT moz_places.url, moz_places.title
            FROM moz_places WHERE hidden=0"""
            c.execute(sql)
            for url, name in c:
                if not url:
                    # A row without a URL offers nothing to check.
                    continue
                if not name:
                    # SQL NULL titles arrive as None; fall back to the
                    # URL so the link always carries a string name.
                    # NOTE(review): assumes get_url_from expects a
                    # string for name -- confirm.
                    name = url
                url_data = get_url_from(url, self.recursion_level+1,
                    self.aggregate, parent_url=self.url, name=name)
                self.aggregate.urlqueue.put(url_data)
        finally:
            c.close()
    finally:
        conn.close()
def get_intern_pattern (self):
"""
Get pattern for intern URL matching.

View file

@ -58,6 +58,10 @@ class TestFile (LinkCheckTest):
"""
self.file_test("urllist.txt")
def test_firefox_bookmarks (self):
    """Run the file test on the Firefox 3 bookmark database."""
    bookmark_file = "places.sqlite"
    self.file_test(bookmark_file)
def test_good_file (self):
url = u"file://%(curdir)s/%(datadir)s/file.txt" % self.get_attrs()
nurl = self.norm(url)