mirror of
https://github.com/Hopiu/xapian-haystack.git
synced 2026-03-16 22:20:31 +00:00
Updated SearchBackend.update to be smarter when indexing multi-value fields and stemming. Will now only stem text fields and will properly index each field of a MultiValueField.
This commit is contained in:
parent
8a7afd50ec
commit
a4e60604a3
2 changed files with 73 additions and 28 deletions
|
|
@ -53,12 +53,35 @@ class XapianMockSearchIndex(indexes.SearchIndex):
|
|||
flag = indexes.BooleanField(model_attr='flag')
|
||||
slug = indexes.CharField(indexed=False, model_attr='slug')
|
||||
popularity = indexes.FloatField(model_attr='popularity')
|
||||
|
||||
# Various MultiValueFields
|
||||
sites = indexes.MultiValueField()
|
||||
tags = indexes.MultiValueField()
|
||||
keys = indexes.MultiValueField()
|
||||
titles = indexes.MultiValueField()
|
||||
|
||||
def prepare_sites(self, obj):
    """
    Build the value list for the `sites` MultiValueField.

    Returns the first three multiples of `obj.id` (1x, 2x, 3x), each
    rendered as a decimal string, e.g. `['2', '4', '6']` for id 2.
    """
    # `range` yields the same three integers as `xrange` here and also
    # exists on Python 3, where `xrange` raises NameError.
    return ['%d' % (i * obj.id) for i in range(1, 4)]
|
||||
|
||||
def prepare_tags(self, obj):
    """
    Build the value list for the `tags` MultiValueField.

    Objects with id 1 and 2 each get their own fixed tag list; every
    other id falls through to the third list.
    """
    if obj.id == 1:
        return ['a', 'b', 'c']
    if obj.id == 2:
        return ['ab', 'bc', 'cd']
    # Fallback for any id other than 1 or 2.
    return ['an', 'to', 'or']
|
||||
|
||||
def prepare_keys(self, obj):
    """
    Build the value list for the `keys` MultiValueField.

    Returns the first three multiples of `obj.id` (1x, 2x, 3x) as
    numbers, e.g. `[2, 4, 6]` for id 2. Unlike `prepare_sites`, the
    values are not converted to strings.
    """
    # `range` yields the same three integers as `xrange` here and also
    # exists on Python 3, where `xrange` raises NameError.
    return [i * obj.id for i in range(1, 4)]
|
||||
|
||||
def prepare_titles(self, obj):
    """
    Build the value list for the `titles` MultiValueField.

    Each object gets two multi-word titles. Ids 1 and 2 have their own
    pair; every other id falls through to the "object three" pair.
    """
    if obj.id == 1:
        return ['object one title one', 'object one title two']
    if obj.id == 2:
        return ['object two title one', 'object two title two']
    # Fallback for any id other than 1 or 2.
    return ['object three title one', 'object three title two']
|
||||
|
||||
|
||||
class XapianSearchBackendTestCase(TestCase):
|
||||
def setUp(self):
|
||||
|
|
@ -84,10 +107,6 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
self.sample_objs[0].popularity = 834.0
|
||||
self.sample_objs[1].popularity = 35.5
|
||||
self.sample_objs[2].popularity = 972.0
|
||||
|
||||
self.sample_objs[0].tags = ['a', 'b', 'c']
|
||||
self.sample_objs[0].tags = ['ab', 'bc', 'cd']
|
||||
self.sample_objs[0].tags = ['an', 'to', 'or']
|
||||
|
||||
def tearDown(self):
|
||||
if os.path.exists(settings.HAYSTACK_XAPIAN_PATH):
|
||||
|
|
@ -124,9 +143,9 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
|
||||
self.assertEqual(len(self.xapian_search('')), 3)
|
||||
self.assertEqual([dict(doc) for doc in self.xapian_search('')], [
|
||||
{'flag': u't', 'name': u'david1', 'text': u'indexed!\n1', 'sites': u"['1', '2', '3']", 'pub_date': u'20090224000000', 'value': u'000000000005', 'id': u'tests.xapianmockmodel.1', 'slug': u'http://example.com/1', 'popularity': '\xca\x84', 'django_id': u'1', 'django_ct': u'tests.xapianmockmodel'},
|
||||
{'flag': u'f', 'name': u'david2', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
|
||||
{'flag': u't', 'name': u'david3', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
|
||||
{'flag': u't', 'name': u'david1', 'tags': u"['a', 'b', 'c']", 'keys': u'[1, 2, 3]', 'text': u'indexed!\n1', 'sites': u"['1', '2', '3']", 'titles': u"['object one title one', 'object one title two']", 'pub_date': u'20090224000000', 'value': u'000000000005', 'id': u'tests.xapianmockmodel.1', 'slug': u'http://example.com/1', 'popularity': '\xca\x84', 'django_id': u'1', 'django_ct': u'tests.xapianmockmodel'},
|
||||
{'flag': u'f', 'name': u'david2', 'tags': u"['ab', 'bc', 'cd']", 'keys': u'[2, 4, 6]', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'titles': u"['object two title one', 'object two title two']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
|
||||
{'flag': u't', 'name': u'david3', 'tags': u"['an', 'to', 'or']", 'keys': u'[3, 6, 9]', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'titles': u"['object three title one', 'object three title two']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
|
||||
])
|
||||
|
||||
def test_duplicate_update(self):
|
||||
|
|
@ -142,8 +161,8 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
self.backend.remove(self.sample_objs[0])
|
||||
self.assertEqual(len(self.xapian_search('')), 2)
|
||||
self.assertEqual([dict(doc) for doc in self.xapian_search('')], [
|
||||
{'flag': u'f', 'name': u'david2', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
|
||||
{'flag': u't', 'name': u'david3', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
|
||||
{'flag': u'f', 'name': u'david2', 'tags': u"['ab', 'bc', 'cd']", 'keys': u'[2, 4, 6]', 'text': u'indexed!\n2', 'sites': u"['2', '4', '6']", 'titles': u"['object two title one', 'object two title two']", 'pub_date': u'20090223000000', 'value': u'000000000010', 'id': u'tests.xapianmockmodel.2', 'slug': u'http://example.com/2', 'popularity': '\xb4p', 'django_id': u'2', 'django_ct': u'tests.xapianmockmodel'},
|
||||
{'flag': u't', 'name': u'david3', 'tags': u"['an', 'to', 'or']", 'keys': u'[3, 6, 9]', 'text': u'indexed!\n3', 'sites': u"['3', '6', '9']", 'titles': u"['object three title one', 'object three title two']", 'pub_date': u'20090222000000', 'value': u'000000000015', 'id': u'tests.xapianmockmodel.3', 'slug': u'http://example.com/3', 'popularity': '\xcb\x98', 'django_id': u'3', 'django_ct': u'tests.xapianmockmodel'}
|
||||
])
|
||||
|
||||
def test_clear(self):
|
||||
|
|
@ -177,6 +196,15 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3])
|
||||
self.assertEqual(self.backend.search(xapian.Query('indexed'))['hits'], 3)
|
||||
self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3])
|
||||
|
||||
def test_search_by_mvf(self):
    """
    Index the sample objects, then check the hit counts for single
    terms that appear in the values produced by `prepare_tags` and
    `prepare_titles`.
    """
    self.backend.update(self.index, self.sample_objs)
    self.assertEqual(len(self.xapian_search('')), 3)

    # (query term, expected number of hits), checked in this order.
    expectations = (
        ('ab', 1),
        ('b', 1),
        ('to', 1),
        ('one', 3),
    )
    for term, expected_hits in expectations:
        self.assertEqual(self.backend.search(xapian.Query(term))['hits'], expected_hits)
|
||||
|
||||
def test_field_facets(self):
|
||||
self.backend.update(self.index, self.sample_objs)
|
||||
|
|
@ -324,16 +352,18 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
def test_build_schema(self):
|
||||
(content_field_name, fields) = self.backend.build_schema(self.site.all_searchfields())
|
||||
self.assertEqual(content_field_name, 'text')
|
||||
self.assertEqual(len(fields), 8)
|
||||
self.assertEqual(len(fields), 10)
|
||||
self.assertEqual(fields, [
|
||||
{'column': 0, 'field_name': 'name', 'type': 'text', 'multi_valued': 'false'},
|
||||
{'column': 0, 'type': 'text', 'field_name': 'name', 'multi_valued': 'false'},
|
||||
{'column': 1, 'type': 'text', 'field_name': 'tags', 'multi_valued': 'true'},
|
||||
{'column': 2, 'field_name': 'text', 'type': 'text', 'multi_valued': 'false'},
|
||||
{'column': 3, 'field_name': 'popularity', 'type': 'float', 'multi_valued': 'false'},
|
||||
{'column': 4, 'field_name': 'sites', 'type': 'text', 'multi_valued': 'true'},
|
||||
{'column': 5, 'field_name': 'value', 'type': 'long', 'multi_valued': 'false'},
|
||||
{'column': 6, 'field_name': 'flag', 'type': 'boolean', 'multi_valued': 'false'},
|
||||
{'column': 7, 'field_name': 'pub_date', 'type': 'date', 'multi_valued': 'false'},
|
||||
{'column': 2, 'type': 'text', 'field_name': 'keys', 'multi_valued': 'true'},
|
||||
{'column': 3, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'},
|
||||
{'column': 4, 'type': 'float', 'field_name': 'popularity', 'multi_valued': 'false'},
|
||||
{'column': 5, 'type': 'text', 'field_name': 'sites', 'multi_valued': 'true'},
|
||||
{'column': 6, 'type': 'long', 'field_name': 'value', 'multi_valued': 'false'},
|
||||
{'column': 7, 'type': 'boolean', 'field_name': 'flag', 'multi_valued': 'false'},
|
||||
{'column': 8, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'},
|
||||
{'column': 9, 'type': 'date', 'field_name': 'pub_date', 'multi_valued': 'false'}
|
||||
])
|
||||
|
||||
def test_parse_query(self):
|
||||
|
|
@ -341,10 +371,10 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
self.assertEqual(self.backend.parse_query('indexed').get_description(), 'Xapian::Query((indexed:(pos=1) OR Zindex:(pos=1)))')
|
||||
self.assertEqual(self.backend.parse_query('name:david').get_description(), 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1) OR ZXNAMEdavid:(pos=1)))')
|
||||
self.assertEqual(self.backend.parse_query('name:david1..david2').get_description(), 'Xapian::Query(VALUE_RANGE 0 david1 david2)')
|
||||
self.assertEqual(self.backend.parse_query('value:0..10').get_description(), 'Xapian::Query(VALUE_RANGE 5 000000000000 000000000010)')
|
||||
self.assertEqual(self.backend.parse_query('value:..10').get_description(), 'Xapian::Query(VALUE_RANGE 5 -02147483648 000000000010)')
|
||||
self.assertEqual(self.backend.parse_query('value:10..*').get_description(), 'Xapian::Query(VALUE_RANGE 5 000000000010 002147483647)')
|
||||
self.assertEqual(self.backend.parse_query('popularity:25.5..100.0').get_description(), 'Xapian::Query(VALUE_RANGE 3 \xb2` \xba@)')
|
||||
self.assertEqual(self.backend.parse_query('value:0..10').get_description(), 'Xapian::Query(VALUE_RANGE 6 000000000000 000000000010)')
|
||||
self.assertEqual(self.backend.parse_query('value:..10').get_description(), 'Xapian::Query(VALUE_RANGE 6 -02147483648 000000000010)')
|
||||
self.assertEqual(self.backend.parse_query('value:10..*').get_description(), 'Xapian::Query(VALUE_RANGE 6 000000000010 002147483647)')
|
||||
self.assertEqual(self.backend.parse_query('popularity:25.5..100.0').get_description(), 'Xapian::Query(VALUE_RANGE 4 \xb2` \xba@)')
|
||||
|
||||
|
||||
class LiveXapianMockSearchIndex(indexes.SearchIndex):
|
||||
|
|
|
|||
|
|
@ -161,10 +161,11 @@ class SearchBackend(BaseSearchBackend):
|
|||
`iterable` -- An iterable of model instances to index
|
||||
|
||||
For each object in `iterable`, a document is created containing all
|
||||
of the terms extracted from `index.prepare(obj)` with stemming prefixes,
|
||||
field prefixes, and 'as-is'.
|
||||
of the terms extracted from `index.prepare(obj)` with field prefixes,
|
||||
and 'as-is' as needed. Also, if the field type is 'text' it will be
|
||||
stemmed and stored with the 'Z' prefix as well.
|
||||
|
||||
eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest`
|
||||
eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest, XCONTENTtesting`
|
||||
|
||||
Each document also contains an extra term in the format:
|
||||
|
||||
|
|
@ -207,10 +208,24 @@ class SearchBackend(BaseSearchBackend):
|
|||
if field['field_name'] in data.keys():
|
||||
prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
|
||||
value = data[field['field_name']]
|
||||
term_generator.index_text(_marshal_term(value))
|
||||
term_generator.index_text(_marshal_term(value), 1, prefix)
|
||||
if field['multi_valued'] == 'false':
|
||||
document.add_value(field['column'], _marshal_value(value))
|
||||
if field['type'] == 'text':
|
||||
if field['multi_valued'] == 'false':
|
||||
term_generator.index_text(_marshal_term(value))
|
||||
term_generator.index_text(_marshal_term(value), 1, prefix)
|
||||
document.add_value(field['column'], _marshal_value(value))
|
||||
else:
|
||||
for term in value:
|
||||
term_generator.index_text(_marshal_term(term))
|
||||
term_generator.index_text(_marshal_term(term), 1, prefix)
|
||||
else:
|
||||
if field['multi_valued'] == 'false':
|
||||
document.add_term(_marshal_term(value))
|
||||
document.add_term(prefix + _marshal_term(value))
|
||||
document.add_value(field['column'], _marshal_value(value))
|
||||
else:
|
||||
for term in value:
|
||||
document.add_term(_marshal_term(term))
|
||||
document.add_term(prefix + _marshal_term(term))
|
||||
|
||||
document.set_data(pickle.dumps(
|
||||
(obj._meta.app_label, obj._meta.module_name, obj.pk, data),
|
||||
|
|
|
|||
Loading…
Reference in a new issue