Fixed #49 - Improves support for exact matches on text.

This effectively increases the index size because it
now indexes literal words to improve __exact results.
This commit is contained in:
Jorge C. Leitão 2014-05-23 09:43:36 +02:00
parent 9d5e637ca0
commit fc0a9f71d2
3 changed files with 59 additions and 54 deletions

View file

@ -109,6 +109,7 @@ class XapianSimpleMockIndex(indexes.SearchIndex):
author = indexes.CharField(model_attr='author')
url = indexes.CharField()
non_anscii = indexes.CharField()
funny_text = indexes.CharField()
datetime = indexes.DateTimeField(model_attr='pub_date')
date = indexes.DateField()
@ -134,6 +135,9 @@ class XapianSimpleMockIndex(indexes.SearchIndex):
def prepare_non_anscii(self, obj):
# Test fixture: obj is ignored; the same literal, which contains the
# non-ASCII escapes \xe7 and \xe3, is returned for every object.
return 'thsi sdas das corrup\xe7\xe3o das'
def prepare_funny_text(self, obj):
# Test fixture: obj is ignored; the same literal is returned for every
# object. It mixes a hyphenated word ("this-text") with punctuation
# attached to words ("funny.words!!").
return 'this-text has funny.words!!'
def prepare_datetime(self, obj):
# Test fixture: obj is ignored; always returns 2009-02-25 01:01:01.
return datetime.datetime(2009, 2, 25, 1, 1, 1)
@ -243,30 +247,27 @@ class BackendIndexationTestCase(HaystackBackendTestCase, TestCase):
"""
Tests that text is correctly positioned in the document
"""
expected_order = ['this_is_a_word', 'inside', 'a', 'big', 'text']
expected_order = ['^', 'this_is_a_word', 'inside', 'a', 'big', 'text', '$']
def get_positions(term):
"""
Uses delve to get
the positions of the term in the first document.
"""
return [int(pos) for pos in get_terms(self.backend, '-r1', '-t%s' % term)]
return sorted([int(pos) for pos in get_terms(self.backend, '-r1', '-tXTEXT%s' % term)])
# (position of the first term) - 1 must be a position of "^"
self.assertTrue(get_positions(expected_order[0])[0] - 1 in
get_positions('^'))
# (position of the last term) + 1 must be a position of "$"
self.assertTrue(get_positions(expected_order[-1])[0] + 1 in
get_positions('$'))
previous_position = get_positions(expected_order[0])[0]
# confirms expected_order
previous_position = get_positions(expected_order[0])
for term in expected_order[1:]:
pos = get_positions(term)
# only one term for the word
self.assertEqual(len(pos), 1)
self.assertEqual(pos[0] - 1, previous_position)
previous_position += 1
# only two positions per term
# (one from term_generator, one from literal text)
self.assertEqual(len(pos), 2)
self.assertEqual(pos[0] - 1, previous_position[0])
self.assertEqual(pos[1] - 1, previous_position[1])
previous_position[0] += 1
previous_position[1] += 1
def test_author_field(self):
terms = get_terms(self.backend, '-a')
@ -276,6 +277,10 @@ class BackendIndexationTestCase(HaystackBackendTestCase, TestCase):
self.assertTrue('Zdavid' in terms)
self.assertTrue('david' in terms)
def test_funny_text_field(self):
# Collect the terms reported by get_terms for '-r1' (presumably the first
# indexed document - confirm against the get_terms helper).
terms = get_terms(self.backend, '-r1')
# The hyphenated word must be present as a single, unsplit term.
self.assertTrue('this-text' in terms)
def test_datetime_field(self):
terms = get_terms(self.backend, '-a')

View file

@ -161,9 +161,11 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase):
self.sq.add_filter(SQ(content='why'))
self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"]))
self.assertEqual(str(self.sq.build_query()),
'Xapian::Query(((Zwhi OR why) AND '
'(<alldocuments> AND_NOT (XTITLE^dune$ OR '
'XTITLE^jaws$))))')
'Xapian::Query('
'((Zwhi OR why) AND '
'(<alldocuments> AND_NOT ('
'(XTITLE^ PHRASE 3 XTITLEdune PHRASE 3 XTITLE$) OR '
'(XTITLE^ PHRASE 3 XTITLEjaws PHRASE 3 XTITLE$)))))')
def test_build_query_in_filter_multiple_words(self):
self.sq.add_filter(SQ(content='why'))
@ -238,8 +240,11 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase):
self.sq.add_filter(SQ(content='why'))
self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True)))
self.assertEqual(str(self.sq.build_query()),
'Xapian::Query(((Zwhi OR why) AND '
'(XTITLE^1$ OR XTITLE^2$ OR XTITLE^3$)))')
'Xapian::Query('
'((Zwhi OR why) AND ('
'(XTITLE^ PHRASE 3 XTITLE1 PHRASE 3 XTITLE$) OR '
'(XTITLE^ PHRASE 3 XTITLE2 PHRASE 3 XTITLE$) OR '
'(XTITLE^ PHRASE 3 XTITLE3 PHRASE 3 XTITLE$))))')
class MockSearchIndex(indexes.SearchIndex):

View file

@ -288,6 +288,21 @@ class XapianSearchBackend(BaseSearchBackend):
return term_generator.get_termpos()
def _add_literal_text(termpos, text, weight, prefix=''):
"""
Adds sentence to the document with positional information
but without processing.
The sentence is bounded by "^" "$" to allow exact matches.
"""
# Surround the sentence with the "^" and "$" markers, separated by spaces
# so that the split below turns each marker into its own word.
text = '^ %s $' % text
# Every whitespace-separated word is posted unchanged (only the prefix is
# prepended), one position after the previous one, starting at termpos.
for word in text.split():
term = '%s%s' % (prefix, word)
# `document` is not a parameter; it is taken from the enclosing scope.
document.add_posting(term, termpos, weight)
termpos += 1
# Skip TERMPOS_DISTANCE positions after the closing marker (presumably so
# that phrase matches cannot span two separately added texts - confirm).
termpos += TERMPOS_DISTANCE
# Hand the next position back so the caller can continue from it.
return termpos
def add_text(termpos, prefix, text, weight):
"""
Adds text to the document with positional information
@ -295,32 +310,24 @@ class XapianSearchBackend(BaseSearchBackend):
"""
termpos = _add_text(termpos, text, weight, prefix=prefix)
termpos = _add_text(termpos, text, weight, prefix='')
termpos = _add_literal_text(termpos, text, weight, prefix=prefix)
termpos = _add_literal_text(termpos, text, weight, prefix='')
return termpos
for obj in iterable:
document = xapian.Document()
term_generator.set_document(document)
def add_to_document(prefix, sentence, weight):
def add_non_text_to_document(prefix, term, weight):
"""
Adds sentence to the document without positional information
Adds term to the document without positional information
and without processing.
If the term is alone, also adds it as "^<term>$"
to allow exact matches on single terms.
"""
if ' ' in sentence:
# search will use PHRASE, no need to add ^$
for term in sentence.split():
document.add_term(term, weight)
document.add_term(prefix + term, weight)
else:
document.add_term(sentence, weight)
document.add_term(prefix + sentence, weight)
# single terms are constructed by XapianSearchQuery._term_query
# and require ^$.
document.add_term("^%s$" % sentence, weight)
document.add_term(prefix + "^%s$" % sentence, weight)
document.add_term(term, weight)
document.add_term(prefix + term, weight)
def add_datetime_to_document(termpos, prefix, term, weight):
"""
@ -377,7 +384,6 @@ class XapianSearchBackend(BaseSearchBackend):
# add the exact match of each value
term = _to_xapian_term(t)
termpos = add_text(termpos, prefix, term, weight)
add_to_document(prefix, term, weight)
continue
term = _to_xapian_term(value)
@ -391,8 +397,9 @@ class XapianSearchBackend(BaseSearchBackend):
termpos = add_text(termpos, prefix, term, weight)
elif field['type'] == 'datetime':
termpos = add_datetime_to_document(termpos, prefix, term, weight)
# all terms are added without positional information
add_to_document(prefix, term, weight)
else:
# all other terms are added without positional information
add_non_text_to_document(prefix, term, weight)
# store data without indexing it
document.set_data(pickle.dumps(
@ -1316,15 +1323,11 @@ class XapianSearchQuery(BaseSearchQuery):
Assumes term is not a list.
"""
# this is an hack:
# the ideal would be to use the same idea as in _filter_contains.
# However, it causes tests to fail.
if field_type == 'text' and ' ' in term:
if field_type == 'text':
term = '^ %s $' % term
query = self._phrase_query(term.split(), field_name, field_type)
else:
query = self._term_query(term, field_name, field_type, exact=True, stemmed=False)
query = self._term_query(term, field_name, field_type, stemmed=False)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
@ -1349,11 +1352,11 @@ class XapianSearchQuery(BaseSearchQuery):
return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
return query
def _or_query(self, term_list, field, field_type, exact=False):
def _or_query(self, term_list, field, field_type):
"""
Joins each item of term_list decorated by _term_query with an OR.
"""
term_list = [self._term_query(term, field, field_type, exact) for term in term_list]
term_list = [self._term_query(term, field, field_type) for term in term_list]
return xapian.Query(xapian.Query.OP_OR, term_list)
def _phrase_query(self, term_list, field_name, field_type):
@ -1370,22 +1373,14 @@ class XapianSearchQuery(BaseSearchQuery):
query = xapian.Query(xapian.Query.OP_PHRASE, term_list)
return query
def _term_query(self, term, field_name, field_type, exact=False, stemmed=True):
def _term_query(self, term, field_name, field_type, stemmed=True):
"""
Constructs a query of a single term.
If `field_name` is not `None`, the term is search on that field only.
If exact is `True`, the search is restricted to boolean matches.
"""
# using stemmed terms in exact query is not acceptable.
if stemmed:
assert not exact
constructor = '{prefix}{term}'
# "" is to do a boolean match, but only works on indexed terms
# (constraint on Xapian side)
if exact and field_type == 'text':
constructor = '{prefix}^{term}$'
# construct the prefix to be used.
prefix = ''