Fixed #49 - Improves support for exact matches on text.

This effectively increases the index size because it now indexes literal words to improve __exact results.
2026-05-08 15:34:46 +00:00 · 2014-05-23 09:43:36 +02:00 · 2014-05-23 09:43:36 +02:00 · fc0a9f71d2
commit fc0a9f71d2
parent 9d5e637ca0
3 changed files with 59 additions and 54 deletions
--- a/tests/xapian_tests/tests/test_backend.py
+++ b/tests/xapian_tests/tests/test_backend.py
@ -109,6 +109,7 @@ class XapianSimpleMockIndex(indexes.SearchIndex):
    author = indexes.CharField(model_attr='author')
    url = indexes.CharField()
    non_anscii = indexes.CharField()
+    funny_text = indexes.CharField()

    datetime = indexes.DateTimeField(model_attr='pub_date')
    date = indexes.DateField()
@ -134,6 +135,9 @@ class XapianSimpleMockIndex(indexes.SearchIndex):
    def prepare_non_anscii(self, obj):
        return 'thsi sdas das corrup\xe7\xe3o das'

+    def prepare_funny_text(self, obj):
+        return 'this-text has funny.words!!'
+
    def prepare_datetime(self, obj):
        return datetime.datetime(2009, 2, 25, 1, 1, 1)

@ -243,30 +247,27 @@ class BackendIndexationTestCase(HaystackBackendTestCase, TestCase):
        """
        Tests that text is correctly positioned in the document
        """
-        expected_order = ['this_is_a_word', 'inside', 'a', 'big', 'text']
+        expected_order = ['^', 'this_is_a_word', 'inside', 'a', 'big', 'text', '$']

        def get_positions(term):
            """
            Uses delve to get
            the positions of the term in the first document.
            """
-            return [int(pos) for pos in get_terms(self.backend, '-r1', '-t%s' % term)]
+            return sorted([int(pos) for pos in get_terms(self.backend, '-r1', '-tXTEXT%s' % term)])

-        # (position of the first term) - 1 must be a position of "^"
-        self.assertTrue(get_positions(expected_order[0])[0] - 1 in
-                        get_positions('^'))
-
-        # (position of the last term) + 1 must be a position of "$"
-        self.assertTrue(get_positions(expected_order[-1])[0] + 1 in
-                        get_positions('$'))
-
-        previous_position = get_positions(expected_order[0])[0]
+        # confirms expected_order
+        previous_position = get_positions(expected_order[0])
        for term in expected_order[1:]:
            pos = get_positions(term)
-            # only one term for the word
-            self.assertEqual(len(pos), 1)
-            self.assertEqual(pos[0] - 1, previous_position)
-            previous_position += 1
+            # only two positions per term
+            # (one from term_generator, one from literal text)
+            self.assertEqual(len(pos), 2)
+
+            self.assertEqual(pos[0] - 1, previous_position[0])
+            self.assertEqual(pos[1] - 1, previous_position[1])
+            previous_position[0] += 1
+            previous_position[1] += 1

    def test_author_field(self):
        terms = get_terms(self.backend, '-a')
@ -276,6 +277,10 @@ class BackendIndexationTestCase(HaystackBackendTestCase, TestCase):
        self.assertTrue('Zdavid' in terms)
        self.assertTrue('david' in terms)

+    def test_funny_text_field(self):
+        terms = get_terms(self.backend, '-r1')
+        self.assertTrue('this-text' in terms)
+
    def test_datetime_field(self):
        terms = get_terms(self.backend, '-a')

--- a/tests/xapian_tests/tests/test_query.py
+++ b/tests/xapian_tests/tests/test_query.py
@ -161,9 +161,11 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase):
        self.sq.add_filter(SQ(content='why'))
        self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"]))
        self.assertEqual(str(self.sq.build_query()),
-                         'Xapian::Query(((Zwhi OR why) AND '
-                         '(<alldocuments> AND_NOT (XTITLE^dune$ OR '
-                         'XTITLE^jaws$))))')
+                         'Xapian::Query('
+                         '((Zwhi OR why) AND '
+                         '(<alldocuments> AND_NOT ('
+                         '(XTITLE^ PHRASE 3 XTITLEdune PHRASE 3 XTITLE$) OR '
+                         '(XTITLE^ PHRASE 3 XTITLEjaws PHRASE 3 XTITLE$)))))')

    def test_build_query_in_filter_multiple_words(self):
        self.sq.add_filter(SQ(content='why'))
@ -238,8 +240,11 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase):
        self.sq.add_filter(SQ(content='why'))
        self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True)))
        self.assertEqual(str(self.sq.build_query()),
-                         'Xapian::Query(((Zwhi OR why) AND '
-                         '(XTITLE^1$ OR XTITLE^2$ OR XTITLE^3$)))')
+                         'Xapian::Query('
+                         '((Zwhi OR why) AND ('
+                         '(XTITLE^ PHRASE 3 XTITLE1 PHRASE 3 XTITLE$) OR '
+                         '(XTITLE^ PHRASE 3 XTITLE2 PHRASE 3 XTITLE$) OR '
+                         '(XTITLE^ PHRASE 3 XTITLE3 PHRASE 3 XTITLE$))))')


 class MockSearchIndex(indexes.SearchIndex):
--- a/xapian_backend.py
+++ b/xapian_backend.py
@ -288,6 +288,21 @@ class XapianSearchBackend(BaseSearchBackend):

                return term_generator.get_termpos()

+            def _add_literal_text(termpos, text, weight, prefix=''):
+                """
+                Adds sentence to the document with positional information
+                but without processing.
+
+                The sentence is bounded by "^" "$" to allow exact matches.
+                """
+                text = '^ %s $' % text
+                for word in text.split():
+                    term = '%s%s' % (prefix, word)
+                    document.add_posting(term, termpos, weight)
+                    termpos += 1
+                termpos += TERMPOS_DISTANCE
+                return termpos
+
            def add_text(termpos, prefix, text, weight):
                """
                Adds text to the document with positional information
@ -295,32 +310,24 @@ class XapianSearchBackend(BaseSearchBackend):
                """
                termpos = _add_text(termpos, text, weight, prefix=prefix)
                termpos = _add_text(termpos, text, weight, prefix='')
+                termpos = _add_literal_text(termpos, text, weight, prefix=prefix)
+                termpos = _add_literal_text(termpos, text, weight, prefix='')
                return termpos

            for obj in iterable:
                document = xapian.Document()
                term_generator.set_document(document)

-                def add_to_document(prefix, sentence, weight):
+                def add_non_text_to_document(prefix, term, weight):
                    """
-                    Adds sentence to the document without positional information
+                    Adds term to the document without positional information
                    and without processing.

                    If the term is alone, also adds it as "^<term>$"
                    to allow exact matches on single terms.
                    """
-                    if ' ' in sentence:
-                        # search will use PHRASE, no need to add ^$
-                        for term in sentence.split():
-                            document.add_term(term, weight)
-                            document.add_term(prefix + term, weight)
-                    else:
-                        document.add_term(sentence, weight)
-                        document.add_term(prefix + sentence, weight)
-                        # single terms are constructed by XapianSearchQuery._term_query
-                        # and require ^$.
-                        document.add_term("^%s$" % sentence, weight)
-                        document.add_term(prefix + "^%s$" % sentence, weight)
+                    document.add_term(term, weight)
+                    document.add_term(prefix + term, weight)

                def add_datetime_to_document(termpos, prefix, term, weight):
                    """
@ -377,7 +384,6 @@ class XapianSearchBackend(BaseSearchBackend):
                                # add the exact match of each value
                                term = _to_xapian_term(t)
                                termpos = add_text(termpos, prefix, term, weight)
-                                add_to_document(prefix, term, weight)
                            continue

                        term = _to_xapian_term(value)
@ -391,8 +397,9 @@ class XapianSearchBackend(BaseSearchBackend):
                            termpos = add_text(termpos, prefix, term, weight)
                        elif field['type'] == 'datetime':
                            termpos = add_datetime_to_document(termpos, prefix, term, weight)
-                        # all terms are added without positional information
-                        add_to_document(prefix, term, weight)
+                        else:
+                            # all other terms are added without positional information
+                            add_non_text_to_document(prefix, term, weight)

                # store data without indexing it
                document.set_data(pickle.dumps(
@ -1316,15 +1323,11 @@ class XapianSearchQuery(BaseSearchQuery):

        Assumes term is not a list.
        """
-
-        # this is an hack:
-        # the ideal would be to use the same idea as in _filter_contains.
-        # However, it causes tests to fail.
-        if field_type == 'text' and ' ' in term:
+        if field_type == 'text':
            term = '^ %s $' % term
            query = self._phrase_query(term.split(), field_name, field_type)
        else:
-            query = self._term_query(term, field_name, field_type, exact=True, stemmed=False)
+            query = self._term_query(term, field_name, field_type, stemmed=False)

        if is_not:
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
@ -1349,11 +1352,11 @@ class XapianSearchQuery(BaseSearchQuery):
            return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
        return query

-    def _or_query(self, term_list, field, field_type, exact=False):
+    def _or_query(self, term_list, field, field_type):
        """
        Joins each item of term_list decorated by _term_query with an OR.
        """
-        term_list = [self._term_query(term, field, field_type, exact) for term in term_list]
+        term_list = [self._term_query(term, field, field_type) for term in term_list]
        return xapian.Query(xapian.Query.OP_OR, term_list)

    def _phrase_query(self, term_list, field_name, field_type):
@ -1370,22 +1373,14 @@ class XapianSearchQuery(BaseSearchQuery):
        query = xapian.Query(xapian.Query.OP_PHRASE, term_list)
        return query

-    def _term_query(self, term, field_name, field_type, exact=False, stemmed=True):
+    def _term_query(self, term, field_name, field_type, stemmed=True):
        """
        Constructs a query of a single term.

        If `field_name` is not `None`, the term is search on that field only.
        If exact is `True`, the search is restricted to boolean matches.
        """
-        # using stemmed terms in exact query is not acceptable.
-        if stemmed:
-            assert not exact
-
        constructor = '{prefix}{term}'
-        # "" is to do a boolean match, but only works on indexed terms
-        # (constraint on Xapian side)
-        if exact and field_type == 'text':
-            constructor = '{prefix}^{term}$'

        # construct the prefix to be used.
        prefix = ''