mirror of
https://github.com/Hopiu/linkchecker.git
synced 2026-04-15 20:01:03 +00:00
Support parsing of Firefox 3 bookmark files
git-svn-id: https://linkchecker.svn.sourceforge.net/svnroot/linkchecker/trunk/linkchecker@3862 e7d03fd6-7b0d-0410-9947-9c21f3af8025
This commit is contained in:
parent
22d2784a58
commit
9ab895751f
5 changed files with 51 additions and 3 deletions
|
|
@ -71,6 +71,10 @@
|
|||
Type: feature
|
||||
Changed: linkcheck/director/__init__.py
|
||||
|
||||
* Support reading Firefox 3 bookmark files in SQLite format.
|
||||
Type: feature
|
||||
Changed: linkcheck/checker/fileurl.py
|
||||
|
||||
4.9 "Michael Clayton" (released 25.4.2008)
|
||||
|
||||
* Parse Shockwave Flash (SWF) for URLs to check
|
||||
|
|
|
|||
2
debian/control
vendored
2
debian/control
vendored
|
|
@ -16,7 +16,7 @@ XB-Python-Version: ${python:Versions}
|
|||
Provides: ${python:Provides}
|
||||
Suggests: apache | httpd, python-optcomplete (>= 1.2-5),
|
||||
python-geoip (>= 1.2.1-2), clamav-daemon, python-utidylib,
|
||||
python-cssutils
|
||||
python-cssutils, python-pysqlite2
|
||||
Description: check websites and HTML documents for broken links
|
||||
Provides a command line program and web interface to check links
|
||||
of websites and HTML documents.
|
||||
|
|
|
|||
|
|
@ -62,6 +62,14 @@ ExcNoCacheList = [
|
|||
socket.timeout,
|
||||
]
|
||||
|
||||
# Firefox 3 bookmark files are SQLite databases: when the sqlite3
# module is importable, treat sqlite errors as cacheable check results.
try:
    import sqlite3
except ImportError:
    # No sqlite3 — Firefox 3 bookmark parsing is simply unavailable.
    pass
else:
    ExcCacheList.append(sqlite3.Error)
|
||||
|
||||
|
||||
ExcList = ExcCacheList + ExcNoCacheList
|
||||
|
||||
WARN_URL_EFFECTIVE_URL = "url-effective-url"
|
||||
|
|
@ -134,7 +142,7 @@ Warnings = {
|
|||
# file extensions we can parse recursively
# NOTE: each inline (?i) flag is placed at the very start of its pattern —
# a global inline flag anywhere else is a re.error from Python 3.11 — and
# literal dots are escaped so e.g. "operaXadr" no longer matches.
PARSE_EXTENSIONS = {
    "html": re.compile(r'(?i)\.s?html?$'),
    "opera": re.compile(r'(?i)/opera\.adr$'),  # opera bookmark file
    "css": re.compile(r'(?i)\.css$'),  # CSS stylesheet
    "swf": re.compile(r'(?i)\.swf$'),  # SWF file
}
|
||||
|
|
@ -157,3 +165,4 @@ PARSE_CONTENTS = {
|
|||
"opera" : re.compile(r'^Opera Hotlist'),
|
||||
"text" : re.compile(r'(?i)^# LinkChecker URL list'),
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -25,11 +25,16 @@ import urlparse
|
|||
import urllib
|
||||
import urllib2
|
||||
|
||||
from . import urlbase, get_index_html, absolute_url
|
||||
from . import urlbase, get_index_html, absolute_url, get_url_from
|
||||
from .. import log, LOG_CHECK, fileutil, strformat, url as urlutil
|
||||
from .const import WARN_FILE_MISSING_SLASH, WARN_FILE_SYSTEM_PATH, \
|
||||
PARSE_EXTENSIONS, PARSE_CONTENTS
|
||||
|
||||
# sqlite3 is required to read Firefox 3 bookmark files (places.sqlite);
# record its availability so parsing is only attempted when possible.
has_sqlite = True
try:
    import sqlite3
except ImportError:
    has_sqlite = False
|
||||
|
||||
def get_files (dirname):
|
||||
"""
|
||||
|
|
@ -79,6 +84,8 @@ def is_absolute_path (path):
|
|||
return path.startswith("/")
|
||||
|
||||
|
||||
# File name pattern of a Firefox 3 bookmark database.  The (?i) flag must
# lead the pattern (a mid-pattern global inline flag is a re.error from
# Python 3.11) and the literal dot is escaped so "placesXsqlite" does not
# match.
firefox_extension = re.compile(r'(?i)/places\.sqlite$')
|
||||
|
||||
class FileUrl (urlbase.UrlBase):
|
||||
"""
|
||||
Url link with file scheme.
|
||||
|
|
@ -244,6 +251,8 @@ class FileUrl (urlbase.UrlBase):
|
|||
for ro in PARSE_EXTENSIONS.values():
|
||||
if ro.search(self.url):
|
||||
return True
|
||||
if firefox_extension.search(self.url):
|
||||
return True
|
||||
# try to read content (can fail, so catch error)
|
||||
try:
|
||||
for ro in PARSE_CONTENTS.values():
|
||||
|
|
@ -264,11 +273,33 @@ class FileUrl (urlbase.UrlBase):
|
|||
if ro.search(self.url):
|
||||
getattr(self, "parse_"+key)()
|
||||
return
|
||||
if has_sqlite and firefox_extension.search(self.url):
|
||||
self.parse_firefox()
|
||||
return
|
||||
for key, ro in PARSE_CONTENTS.items():
|
||||
if ro.search(self.get_content()[:30]):
|
||||
getattr(self, "parse_"+key)()
|
||||
return
|
||||
|
||||
def parse_firefox (self):
    """Parse a Firefox3 bookmark file."""
    log.debug(LOG_CHECK, "Parsing Firefox bookmarks %s", self)
    # places.sqlite is a SQLite database; a short timeout avoids
    # blocking on a database locked by a running Firefox instance.
    conn = sqlite3.connect(self.get_os_filename(), timeout=0.5)
    try:
        cursor = conn.cursor()
        try:
            # Non-hidden places entries hold the bookmarked URLs.
            cursor.execute("""SELECT moz_places.url, moz_places.title
                FROM moz_places WHERE hidden=0""")
            next_level = self.recursion_level + 1
            for bookmark_url, bookmark_name in cursor:
                # Queue each bookmarked URL for checking, with this
                # file as its parent.
                self.aggregate.urlqueue.put(
                    get_url_from(bookmark_url, next_level, self.aggregate,
                                 parent_url=self.url, name=bookmark_name))
        finally:
            cursor.close()
    finally:
        conn.close()
|
||||
|
||||
def get_intern_pattern (self):
|
||||
"""
|
||||
Get pattern for intern URL matching.
|
||||
|
|
|
|||
|
|
@ -58,6 +58,10 @@ class TestFile (LinkCheckTest):
|
|||
"""
|
||||
self.file_test("urllist.txt")
|
||||
|
||||
def test_firefox_bookmarks (self):
    """Check that a Firefox 3 bookmark database (places.sqlite) is parsed."""
    self.file_test("places.sqlite")
|
||||
|
||||
def test_good_file (self):
|
||||
url = u"file://%(curdir)s/%(datadir)s/file.txt" % self.get_attrs()
|
||||
nurl = self.norm(url)
|
||||
|
|
|
|||
Loading…
Reference in a new issue