Updated SearchBackend.update to be smarter about stemming and multi-value fields: only text fields are now stemmed, and each value of a MultiValueField is indexed individually.

This commit is contained in:
David Sauve 2010-01-27 19:37:49 -05:00
parent 8a7afd50ec
commit a4e60604a3
2 changed files with 73 additions and 28 deletions

View file

@ -53,12 +53,35 @@ class XapianMockSearchIndex(indexes.SearchIndex):
flag = indexes.BooleanField(model_attr='flag')
slug = indexes.CharField(indexed=False, model_attr='slug')
popularity = indexes.FloatField(model_attr='popularity')
# Various MultiValueFields
sites = indexes.MultiValueField()
tags = indexes.MultiValueField()
keys = indexes.MultiValueField()
titles = indexes.MultiValueField()
def prepare_sites(self, obj):
    """Return the values for the ``sites`` multi-value field of ``obj``.

    The result is a list of three decimal strings: ``obj.id`` multiplied
    by 1, 2 and 3 (e.g. ``['2', '4', '6']`` for an object whose id is 2).
    """
    # ``range`` rather than ``xrange``: identical for this three-element
    # iteration on Python 2, and it also exists on Python 3.
    return ['%d' % (multiplier * obj.id) for multiplier in range(1, 4)]
def prepare_tags(self, obj):
    """Return the fixed list of short tag strings for ``obj``.

    Objects whose id is 1 or 2 each get their own tag list; every other
    id falls through to the default list.
    """
    known_tags = (
        (1, ['a', 'b', 'c']),
        (2, ['ab', 'bc', 'cd']),
    )
    for object_id, tags in known_tags:
        if obj.id == object_id:
            return tags
    return ['an', 'to', 'or']
def prepare_keys(self, obj):
    """Return the values for the ``keys`` multi-value field of ``obj``.

    The result is a list of three numbers (not strings): ``obj.id``
    multiplied by 1, 2 and 3 (e.g. ``[2, 4, 6]`` for an object whose id
    is 2).
    """
    # ``range`` rather than ``xrange``: identical for this three-element
    # iteration on Python 2, and it also exists on Python 3.
    return [multiplier * obj.id for multiplier in range(1, 4)]
def prepare_titles(self, obj):
    """Return the two multi-word title strings for ``obj``.

    Ids 1 and 2 have dedicated titles; any other id receives the
    "object three" titles.
    """
    if obj.id == 1:
        return ['object one title one', 'object one title two']

    if obj.id == 2:
        return ['object two title one', 'object two title two']

    # Fallback for every remaining object.
    return ['object three title one', 'object three title two']
class XapianSearchBackendTestCase(TestCase):
def setUp(self):
@ -84,10 +107,6 @@ class XapianSearchBackendTestCase(TestCase):
self.sample_objs[0].popularity = 834.0
self.sample_objs[1].popularity = 35.5
self.sample_objs[2].popularity = 972.0
self.sample_objs[0].tags = ['a', 'b', 'c']
self.sample_objs[0].tags = ['ab', 'bc', 'cd']
self.sample_objs[0].tags = ['an', 'to', 'or']
def tearDown(self):
if os.path.exists(settings.HAYSTACK_XAPIAN_PATH):
@ -124,9 +143,9 @@ class XapianSearchBackendTestCase(TestCase):
self.assertEqual(len(self.xapian_search('')), 3)
self.assertEqual([dict(doc) for doc in self.xapian_search('')], [
{'flag': u't', 'name': u'david1', 'text': u'indexed!\n1', 'sites': u"['1', '2', '3']", 'pub_date': u'20090224000000', 'value': u'000000000005', 'id': u'tests.xapianmockmodel.1', 'slug': u'http://example.com/1', 'popularity': '\xca\x84', 'django_id': u'1', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u'f', 'name': u'david2', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u't', 'name': u'david3', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
{'flag': u't', 'name': u'david1', 'tags': u"['a', 'b', 'c']", 'keys': u'[1, 2, 3]', 'text': u'indexed!\n1', 'sites': u"['1', '2', '3']", 'titles': u"['object one title one', 'object one title two']", 'pub_date': u'20090224000000', 'value': u'000000000005', 'id': u'tests.xapianmockmodel.1', 'slug': u'http://example.com/1', 'popularity': '\xca\x84', 'django_id': u'1', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u'f', 'name': u'david2', 'tags': u"['ab', 'bc', 'cd']", 'keys': u'[2, 4, 6]', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'titles': u"['object two title one', 'object two title two']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u't', 'name': u'david3', 'tags': u"['an', 'to', 'or']", 'keys': u'[3, 6, 9]', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'titles': u"['object three title one', 'object three title two']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
])
def test_duplicate_update(self):
@ -142,8 +161,8 @@ class XapianSearchBackendTestCase(TestCase):
self.backend.remove(self.sample_objs[0])
self.assertEqual(len(self.xapian_search('')), 2)
self.assertEqual([dict(doc) for doc in self.xapian_search('')], [
{'flag': u'f', 'name': u'david2', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u't', 'name': u'david3', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
{'flag': u'f', 'name': u'david2', 'tags': u"['ab', 'bc', 'cd']", 'keys': u'[2, 4, 6]', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'titles': u"['object two title one', 'object two title two']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
{'flag': u't', 'name': u'david3', 'tags': u"['an', 'to', 'or']", 'keys': u'[3, 6, 9]', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'titles': u"['object three title one', 'object three title two']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
])
def test_clear(self):
@ -177,6 +196,15 @@ class XapianSearchBackendTestCase(TestCase):
self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3])
self.assertEqual(self.backend.search(xapian.Query('indexed'))['hits'], 3)
self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3])
def test_search_by_mvf(self):
    """Check that individual multi-value field entries are searchable.

    After indexing the sample objects, each query term must produce the
    expected number of hits.
    """
    self.backend.update(self.index, self.sample_objs)
    self.assertEqual(len(self.xapian_search('')), 3)

    # (query term, expected number of hits), checked in this order.
    expected_hits = (
        ('ab', 1),
        ('b', 1),
        ('to', 1),
        ('one', 3),
    )
    for term, hit_count in expected_hits:
        outcome = self.backend.search(xapian.Query(term))
        self.assertEqual(outcome['hits'], hit_count)
def test_field_facets(self):
self.backend.update(self.index, self.sample_objs)
@ -324,16 +352,18 @@ class XapianSearchBackendTestCase(TestCase):
def test_build_schema(self):
(content_field_name, fields) = self.backend.build_schema(self.site.all_searchfields())
self.assertEqual(content_field_name, 'text')
self.assertEqual(len(fields), 8)
self.assertEqual(len(fields), 10)
self.assertEqual(fields, [
{'column': 0, 'field_name': 'name', 'type': 'text', 'multi_valued': 'false'},
{'column': 0, 'type': 'text', 'field_name': 'name', 'multi_valued': 'false'},
{'column': 1, 'type': 'text', 'field_name': 'tags', 'multi_valued': 'true'},
{'column': 2, 'field_name': 'text', 'type': 'text', 'multi_valued': 'false'},
{'column': 3, 'field_name': 'popularity', 'type': 'float', 'multi_valued': 'false'},
{'column': 4, 'field_name': 'sites', 'type': 'text', 'multi_valued': 'true'},
{'column': 5, 'field_name': 'value', 'type': 'long', 'multi_valued': 'false'},
{'column': 6, 'field_name': 'flag', 'type': 'boolean', 'multi_valued': 'false'},
{'column': 7, 'field_name': 'pub_date', 'type': 'date', 'multi_valued': 'false'},
{'column': 2, 'type': 'text', 'field_name': 'keys', 'multi_valued': 'true'},
{'column': 3, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'},
{'column': 4, 'type': 'float', 'field_name': 'popularity', 'multi_valued': 'false'},
{'column': 5, 'type': 'text', 'field_name': 'sites', 'multi_valued': 'true'},
{'column': 6, 'type': 'long', 'field_name': 'value', 'multi_valued': 'false'},
{'column': 7, 'type': 'boolean', 'field_name': 'flag', 'multi_valued': 'false'},
{'column': 8, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'},
{'column': 9, 'type': 'date', 'field_name': 'pub_date', 'multi_valued': 'false'}
])
def test_parse_query(self):
@ -341,10 +371,10 @@ class XapianSearchBackendTestCase(TestCase):
self.assertEqual(self.backend.parse_query('indexed').get_description(), 'Xapian::Query((indexed:(pos=1) OR Zindex:(pos=1)))')
self.assertEqual(self.backend.parse_query('name:david').get_description(), 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1) OR ZXNAMEdavid:(pos=1)))')
self.assertEqual(self.backend.parse_query('name:david1..david2').get_description(), 'Xapian::Query(VALUE_RANGE 0 david1 david2)')
self.assertEqual(self.backend.parse_query('value:0..10').get_description(), 'Xapian::Query(VALUE_RANGE 5 000000000000 000000000010)')
self.assertEqual(self.backend.parse_query('value:..10').get_description(), 'Xapian::Query(VALUE_RANGE 5 -02147483648 000000000010)')
self.assertEqual(self.backend.parse_query('value:10..*').get_description(), 'Xapian::Query(VALUE_RANGE 5 000000000010 002147483647)')
self.assertEqual(self.backend.parse_query('popularity:25.5..100.0').get_description(), 'Xapian::Query(VALUE_RANGE 3 \xb2` \xba@)')
self.assertEqual(self.backend.parse_query('value:0..10').get_description(), 'Xapian::Query(VALUE_RANGE 6 000000000000 000000000010)')
self.assertEqual(self.backend.parse_query('value:..10').get_description(), 'Xapian::Query(VALUE_RANGE 6 -02147483648 000000000010)')
self.assertEqual(self.backend.parse_query('value:10..*').get_description(), 'Xapian::Query(VALUE_RANGE 6 000000000010 002147483647)')
self.assertEqual(self.backend.parse_query('popularity:25.5..100.0').get_description(), 'Xapian::Query(VALUE_RANGE 4 \xb2` \xba@)')
class LiveXapianMockSearchIndex(indexes.SearchIndex):

View file

@ -161,10 +161,11 @@ class SearchBackend(BaseSearchBackend):
`iterable` -- An iterable of model instances to index
For each object in `iterable`, a document is created containing all
of the terms extracted from `index.prepare(obj)` with stemming prefixes,
field prefixes, and 'as-is'.
of the terms extracted from `index.prepare(obj)` with field prefixes,
and 'as-is' as needed. Also, if the field type is 'text' it will be
stemmed and stored with the 'Z' prefix as well.
eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest`
eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtest`
Each document also contains an extra term in the format:
@ -207,10 +208,24 @@ class SearchBackend(BaseSearchBackend):
if field['field_name'] in data.keys():
prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
value = data[field['field_name']]
term_generator.index_text(_marshal_term(value))
term_generator.index_text(_marshal_term(value), 1, prefix)
if field['multi_valued'] == 'false':
document.add_value(field['column'], _marshal_value(value))
if field['type'] == 'text':
if field['multi_valued'] == 'false':
term_generator.index_text(_marshal_term(value))
term_generator.index_text(_marshal_term(value), 1, prefix)
document.add_value(field['column'], _marshal_value(value))
else:
for term in value:
term_generator.index_text(_marshal_term(term))
term_generator.index_text(_marshal_term(term), 1, prefix)
else:
if field['multi_valued'] == 'false':
document.add_term(_marshal_term(value))
document.add_term(prefix + _marshal_term(value))
document.add_value(field['column'], _marshal_value(value))
else:
for term in value:
document.add_term(_marshal_term(term))
document.add_term(prefix + _marshal_term(term))
document.set_data(pickle.dumps(
(obj._meta.app_label, obj._meta.module_name, obj.pk, data),