From ec8fb42e68ff84f04f4382154c26c31c91193855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Tue, 20 May 2014 23:01:49 +0200 Subject: [PATCH] Fixed #103 - improves support for exact search. --- tests/xapian_tests/tests/test_query.py | 6 +-- xapian_backend.py | 75 ++++++++++++++++++-------- 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/tests/xapian_tests/tests/test_query.py b/tests/xapian_tests/tests/test_query.py index e49fe1b..244cd41 100644 --- a/tests/xapian_tests/tests/test_query.py +++ b/tests/xapian_tests/tests/test_query.py @@ -162,8 +162,8 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"])) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' - '( AND_NOT ("XTITLEdune" OR ' - '"XTITLEjaws"))))') + '( AND_NOT (XTITLE^dune$ OR ' + 'XTITLE^jaws$))))') def test_build_query_in_filter_multiple_words(self): self.sq.add_filter(SQ(content='why')) @@ -233,7 +233,7 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True))) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' - '("XTITLE1" OR "XTITLE2" OR "XTITLE3")))') + '(XTITLE^1$ OR XTITLE^2$ OR XTITLE^3$)))') class MockSearchIndex(indexes.SearchIndex): diff --git a/xapian_backend.py b/xapian_backend.py index bf27503..7eb9799 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -261,13 +261,40 @@ class XapianSearchBackend(BaseSearchBackend): if self.include_spelling is True: term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING) + def add_text(termpos, prefix, term, weight): + term_generator.set_termpos(termpos + 1) + term_generator.index_text(term, weight) + term_generator.index_text(term, weight, prefix) + term_generator.increase_termpos() + return term_generator.get_termpos() + for obj in iterable: document = xapian.Document() term_generator.set_document(document) + def add_to_document(prefix, term, weight): + document.add_term('%s' % term, weight) + document.add_term(prefix + term, weight) + document.add_term(prefix + '^%s$' % term, weight) + + def add_datetime_to_document(termpos, prefix, term, weight): + date, time = term.split() + document.add_posting(date, termpos, weight) + termpos += 1 + document.add_posting(time, termpos, weight) + termpos += 1 + document.add_posting(prefix + date, termpos, weight) + termpos += 1 + document.add_posting(prefix + time, termpos, weight) + termpos += 1 + return termpos + data = index.full_prepare(obj) weights = index.get_field_weights() + + termpos = 0 for field in self.schema: + termpos += 1 # not supported fields are ignored. if field['field_name'] not in list(data.keys()): continue @@ -288,30 +315,36 @@ class XapianSearchBackend(BaseSearchBackend): document.add_term(TERM_PREFIXES[field['field_name']] + value, weight) document.add_value(field['column'], value) + continue else: prefix = TERM_PREFIXES['field'] + field['field_name'].upper() - # if not multi_valued, we add a value and construct a one-element list + # if not multi_valued, we add as a document value + # for sorting and facets if field['multi_valued'] == 'false': document.add_value(field['column'], _term_to_xapian_value(value, field['type'])) - value = [value] + else: + for t in value: + # add the exact match of each value + term = _to_xapian_term(t) + add_to_document(prefix, term, weight) + # index each value with positional information + if ' ' in term: + termpos = add_text(termpos, prefix, term, weight) + continue - for term in value: - term = _to_xapian_term(term) - # from here on term is a string; - # now decide how it is stored: + term = _to_xapian_term(value) + # from here on the term is a string; + # we now decide how it is indexed - # these are - if field['type'] == 'text': - term_generator.index_text(term, weight) - term_generator.index_text(term, weight, prefix) - elif ' ' in term: - for t in term.split(): - document.add_term(t, weight) - document.add_term(prefix + t, weight) - if term != "": - document.add_term(term, weight) - document.add_term(prefix + term, weight) + if field['type'] == 'text': + # text is indexed with positional information + termpos = add_text(termpos, prefix, term, weight) + elif field['type'] == 'datetime': + termpos = add_datetime_to_document(termpos, prefix, term, weight) + if term != "": + # all other terms are added without positional information + add_to_document(prefix, term, weight) # store data without indexing it document.set_data(pickle.dumps( @@ -1300,10 +1333,9 @@ class XapianSearchQuery(BaseSearchQuery): assert not exact constructor = '{prefix}{term}' - # "" is to do a boolean match, but only works on indexed terms - # (constraint on Xapian side) - if exact and field_type == 'text': - constructor = '"{prefix}{term}"' + # ^{term}$ is for boolean match of the term + if exact: + constructor = '{prefix}^{term}$' # construct the prefix to be used. prefix = '' @@ -1321,6 +1353,7 @@ class XapianSearchQuery(BaseSearchQuery): # we construct the query dates in a slightly different way if field_type == 'datetime': date, time = term.split() + constructor = '{prefix}{term}' return xapian.Query(xapian.Query.OP_AND_MAYBE, constructor.format(prefix=prefix, term=date), constructor.format(prefix=prefix, term=time)