Changed SearchBackend.update so that it indexes terms with punctuation as well as using the Xapian.TermGenerator. This allows terms like URLs to work as expected

This commit is contained in:
David Sauve 2010-03-05 15:33:14 -05:00
parent 306a6f153f
commit 9d8af9cbcf
8 changed files with 61 additions and 31 deletions

View file

@ -3,14 +3,18 @@ xapian-haystack-UNRELEASED
Tagged on ??, 2010 by David Sauve <dsauve@trapeze.com>
Minor Code Changes
Major Code Changes
------------------
* Updated SearchBackend.update to be smarter when indexing multi-value fields and stemming. It will now only stem text fields and will properly index each field of a MultiValueField.
* MultiValueFields will no longer store data in the value column of the index.
* Fixed another occurrence of DatabaseModifiedError
* Faceted fields will now be automatically duplicated internally. This corresponds to Haystack, SHA c8f8d, http://github.com/toastdriven/django-haystack/commit/c8f8dcc2d79a9dcb67deefdc031629655b874820
* `__startswith` filter has been improved to work more efficiently.
* `SearchResults` obey the type of data chosen in their corresponding field in the `SearchIndex` if present.
* Changed `SearchBackend.update` method to index terms with punctuation intact, in addition to the terms generated by `Xapian.TermGenerator`.
Minor Code Changes
------------------
* Fixed another occurrence of DatabaseModifiedError
Known Issues
------------

View file

@ -1,4 +1,4 @@
# Copyright (C) 2009 David Sauve, Trapeze. All rights reserved.
# Copyright (C) 2009-2010 David Sauve, Trapeze. All rights reserved.
import os
from settings import *

View file

@ -1 +1 @@
# Copyright (C) 2009 David Sauve, Trapeze. All rights reserved.
# Copyright (C) 2009-2010 David Sauve, Trapeze. All rights reserved.

View file

@ -1 +1 @@
# Copyright (C) 2009 David Sauve, Trapeze. All rights reserved.
# Copyright (C) 2009-2010 David Sauve, Trapeze. All rights reserved.

View file

@ -1,4 +1,4 @@
# Copyright (C) 2009 David Sauve, Trapeze. All rights reserved.
# Copyright (C) 2009-2010 David Sauve, Trapeze. All rights reserved.
import warnings

View file

@ -1,4 +1,4 @@
# Copyright (C) 2009 David Sauve, Trapeze. All rights reserved.
# Copyright (C) 2009-2010 David Sauve, Trapeze. All rights reserved.
# Based on original code by Daniel Lindsley as part of the Haystack test suite.
import cPickle as pickle
@ -34,6 +34,7 @@ class XapianMockModel(models.Model):
flag = models.BooleanField(default=True)
slug = models.SlugField()
popularity = models.FloatField(default=0.0)
url = models.URLField()
def __unicode__(self):
return self.author
@ -54,6 +55,7 @@ class XapianMockSearchIndex(indexes.SearchIndex):
slug = indexes.CharField(indexed=False, model_attr='slug')
popularity = indexes.FloatField(model_attr='popularity')
month = indexes.CharField(indexed=False)
url = indexes.CharField(model_attr='url')
# Various MultiValueFields
sites = indexes.MultiValueField()
@ -106,7 +108,8 @@ class XapianSearchBackendTestCase(TestCase):
mock.pub_date = datetime.date(2009, 2, 25) - datetime.timedelta(days=i)
mock.value = i * 5
mock.flag = bool(i % 2)
mock.slug = 'http://example.com/%d' % i
mock.slug = 'http://example.com/%d/' % i
mock.url = 'http://example.com/%d/' % i
self.sample_objs.append(mock)
self.sample_objs[0].popularity = 834.0
@ -148,9 +151,9 @@ class XapianSearchBackendTestCase(TestCase):
self.assertEqual(len(self.xapian_search('')), 3)
self.assertEqual([dict(doc) for doc in self.xapian_search('')], [
{'flag': u't', 'name': u'david1', 'name_exact': u'david1', 'tags': u"['a', 'b', 'c']", 'keys': u'[1, 2, 3]', 'text': u'indexed!\n1', 'sites': u"['1', '2', '3']", 'titles': u"['object one title one', 'object one title two']", 'pub_date': u'20090224000000', 'value': u'000000000005', 'month': u'02', 'id': u'tests.xapianmockmodel.1', 'slug': u'http://example.com/1', 'popularity': '\xca\x84', 'django_id': u'1', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u'f', 'name': u'david2', 'name_exact': u'david2', 'tags': u"['ab', 'bc', 'cd']", 'keys': u'[2, 4, 6]', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'titles': u"['object two title one', 'object two title two']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'month': u'02', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u't', 'name': u'david3', 'name_exact': u'david3', 'tags': u"['an', 'to', 'or']", 'keys': u'[3, 6, 9]', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'titles': u"['object three title one', 'object three title two']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'month': u'02', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
{'flag': u't', 'name': u'david1', 'name_exact': u'david1', 'tags': u"['a', 'b', 'c']", 'keys': u'[1, 2, 3]', 'text': u'indexed!\n1', 'sites': u"['1', '2', '3']", 'titles': u"['object one title one', 'object one title two']", 'pub_date': u'20090224000000', 'value': u'000000000005', 'month': u'02', 'id': u'tests.xapianmockmodel.1', 'slug': u'http://example.com/1/', 'url': u'http://example.com/1/', 'popularity': '\xca\x84', 'django_id': u'1', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u'f', 'name': u'david2', 'name_exact': u'david2', 'tags': u"['ab', 'bc', 'cd']", 'keys': u'[2, 4, 6]', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'titles': u"['object two title one', 'object two title two']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'month': u'02', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2/', 'url': u'http://example.com/2/', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u't', 'name': u'david3', 'name_exact': u'david3', 'tags': u"['an', 'to', 'or']", 'keys': u'[3, 6, 9]', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'titles': u"['object three title one', 'object three title two']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'month': u'02', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3/', 'url': u'http://example.com/3/', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
])
def test_duplicate_update(self):
@ -166,8 +169,8 @@ class XapianSearchBackendTestCase(TestCase):
self.backend.remove(self.sample_objs[0])
self.assertEqual(len(self.xapian_search('')), 2)
self.assertEqual([dict(doc) for doc in self.xapian_search('')], [
{'flag': u'f', 'name': u'david2', 'name_exact': u'david2', 'tags': u"['ab', 'bc', 'cd']", 'keys': u'[2, 4, 6]', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'titles': u"['object two title one', 'object two title two']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'month': u'02', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u't', 'name': u'david3', 'name_exact': u'david3', 'tags': u"['an', 'to', 'or']", 'keys': u'[3, 6, 9]', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'titles': u"['object three title one', 'object three title two']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'month': u'02', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
{'flag': u'f', 'name': u'david2', 'name_exact': u'david2', 'tags': u"['ab', 'bc', 'cd']", 'keys': u'[2, 4, 6]', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'titles': u"['object two title one', 'object two title two']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'month': u'02', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2/', 'url': u'http://example.com/2/', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u't', 'name': u'david3', 'name_exact': u'david3', 'tags': u"['an', 'to', 'or']", 'keys': u'[3, 6, 9]', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'titles': u"['object three title one', 'object three title two']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'month': u'02', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3/', 'url': u'http://example.com/3/', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
])
def test_clear(self):
@ -202,6 +205,13 @@ class XapianSearchBackendTestCase(TestCase):
self.assertEqual(self.backend.search(xapian.Query('indexed'))['hits'], 3)
self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3])
def test_search_field_with_punctuation(self):
    """A full URL stored in an indexed field is searchable as one exact term."""
    self.backend.update(self.index, self.sample_objs)
    all_documents = self.xapian_search('')
    self.assertEqual(len(all_documents), 3)

    # NOTE(review): the bare-prefix lookup below is disabled in the original;
    # presumably it does not match all three documents yet -- confirm before
    # re-enabling.
    # self.assertEqual(self.backend.search(xapian.Query('http://example.com/'))['hits'], 3)

    url_query = xapian.Query('http://example.com/1/')
    matches = self.backend.search(url_query)['results']
    self.assertEqual([match.pk for match in matches], [1])
def test_search_by_mvf(self):
self.backend.update(self.index, self.sample_objs)
self.assertEqual(len(self.xapian_search('')), 3)
@ -365,24 +375,25 @@ class XapianSearchBackendTestCase(TestCase):
def test_build_schema(self):
(content_field_name, fields) = self.backend.build_schema(self.site.all_searchfields())
self.assertEqual(content_field_name, 'text')
self.assertEqual(len(fields), 11)
self.assertEqual(len(fields), 12)
self.assertEqual(fields, [
{'column': 0, 'type': 'text', 'field_name': 'name', 'multi_valued': 'false'},
{'column': 1, 'type': 'text', 'field_name': 'name_exact', 'multi_valued': 'false'},
{'column': 1, 'field_name': 'name_exact', 'type': 'text', 'multi_valued': 'false'},
{'column': 2, 'type': 'text', 'field_name': 'tags', 'multi_valued': 'true'},
{'column': 3, 'type': 'text', 'field_name': 'keys', 'multi_valued': 'true'},
{'column': 3, 'type': 'text', 'field_name': 'url', 'multi_valued': 'false'},
{'column': 4, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'},
{'column': 5, 'type': 'float', 'field_name': 'popularity', 'multi_valued': 'false'},
{'column': 6, 'type': 'text', 'field_name': 'sites', 'multi_valued': 'true'},
{'column': 7, 'type': 'long', 'field_name': 'value', 'multi_valued': 'false'},
{'column': 8, 'type': 'boolean', 'field_name': 'flag', 'multi_valued': 'false'},
{'column': 9, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'},
{'column': 10, 'type': 'date', 'field_name': 'pub_date', 'multi_valued': 'false'}
{'column': 8, 'type': 'text', 'field_name': 'keys', 'multi_valued': 'true'},
{'column': 9, 'type': 'boolean', 'field_name': 'flag', 'multi_valued': 'false'},
{'column': 10, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'},
{'column': 11, 'type': 'date', 'field_name': 'pub_date', 'multi_valued': 'false'}
])
def test_parse_query(self):
self.backend.update(self.index, self.sample_objs)
self.assertEqual(self.backend.parse_query('indexed').get_description(), 'Xapian::Query((indexed:(pos=1) OR Zindex:(pos=1)))')
self.assertEqual(self.backend.parse_query('indexed').get_description(), 'Xapian::Query((indexed:(pos=1) OR indexed!\n1:(pos=1) OR indexed!\n2:(pos=1) OR indexed!\n3:(pos=1) OR Zindex:(pos=1)))')
self.assertEqual(self.backend.parse_query('name:david').get_description(), 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1) OR ZXNAMEdavid:(pos=1)))')
self.assertEqual(self.backend.parse_query('name:da*').get_description(), 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1)))')
self.assertEqual(self.backend.parse_query('name:david1..david2').get_description(), 'Xapian::Query(VALUE_RANGE 0 david1 david2)')

View file

@ -1,4 +1,4 @@
# Copyright (C) 2009 David Sauve, Trapeze. All rights reserved.
# Copyright (C) 2009-2010 David Sauve, Trapeze. All rights reserved.
import datetime
import os
@ -151,3 +151,7 @@ class XapianSearchQueryTestCase(TestCase):
self.sq.add_model(AnotherMockModel)
self.assertEqual(self.sq.build_query().get_description(), u'Xapian::Query(((Zhello OR hello) AND (0 * XCONTENTTYPEcore.anothermockmodel OR 0 * XCONTENTTYPEcore.mockmodel)))')
def test_build_query_with_punctuation(self):
    """A content filter holding a URL is kept as a single term in the built query."""
    url_filter = SQ(content='http://www.example.com')
    self.sq.add_filter(url_filter)

    description = self.sq.build_query().get_description()
    expected = u'Xapian::Query((Zhttp://www.example.com OR http://www.example.com))'
    self.assertEqual(description, expected)

View file

@ -1,7 +1,7 @@
# Copyright (C) 2009 David Sauve, Trapeze
# Copyright (C) 2009-2010 David Sauve, Trapeze. All rights reserved.
__author__ = 'David Sauve'
__version__ = (1, 1, 3, 'beta')
__version__ = (1, 1, 4, 'alpha')
import time
import datetime
@ -9,6 +9,7 @@ import cPickle as pickle
import os
import re
import shutil
import string
import sys
import warnings
@ -212,22 +213,32 @@ class SearchBackend(BaseSearchBackend):
value = data[field['field_name']]
if field['type'] == 'text':
if field['multi_valued'] == 'false':
term_generator.index_text(_marshal_term(value))
term_generator.index_text(_marshal_term(value), 1, prefix)
term = _marshal_term(value)
term_generator.index_text(term)
term_generator.index_text(term, 1, prefix)
# Add the whole value as one exact term (bare and prefixed) only when it
# contains no whitespace, so values with punctuation such as URLs can be
# matched verbatim. `string.whitespace in term` is a substring test against
# the entire whitespace constant and is effectively never true, so each
# character is checked instead.
# NOTE(review): test expectations that list whole-field terms containing
# whitespace (e.g. 'indexed!\n1') presumably rely on the old always-true
# check and would need updating -- confirm.
if not any(char in string.whitespace for char in term):
    document.add_term(term)
    document.add_term(prefix + term)
document.add_value(field['column'], _marshal_value(value))
else:
for term in value:
term_generator.index_text(_marshal_term(term))
term_generator.index_text(_marshal_term(term), 1, prefix)
term = _marshal_term(term)
term_generator.index_text(term)
term_generator.index_text(term, 1, prefix)
# For each item of a multi-valued text field, add the item as one exact
# term (bare and prefixed) only when it contains no whitespace.
# `string.whitespace in term` is a substring test against the entire
# whitespace constant and is effectively never true, so each character is
# checked instead.
# NOTE(review): test expectations that list whole-value terms containing
# whitespace presumably rely on the old always-true check and would need
# updating -- confirm.
if not any(char in string.whitespace for char in term):
    document.add_term(term)
    document.add_term(prefix + term)
else:
if field['multi_valued'] == 'false':
document.add_term(_marshal_term(value))
document.add_term(prefix + _marshal_term(value))
term = _marshal_term(value)
document.add_term(term)
document.add_term(prefix + term)
document.add_value(field['column'], _marshal_value(value))
else:
for term in value:
document.add_term(_marshal_term(term))
document.add_term(prefix + _marshal_term(term))
term = _marshal_term(term)
document.add_term(term)
document.add_term(prefix + term)
document.set_data(pickle.dumps(
(obj._meta.app_label, obj._meta.module_name, obj.pk, data),