diff --git a/tests/xapian_tests/tests/test_backend.py b/tests/xapian_tests/tests/test_backend.py index 32ed6bb..0a9d8dd 100644 --- a/tests/xapian_tests/tests/test_backend.py +++ b/tests/xapian_tests/tests/test_backend.py @@ -109,6 +109,7 @@ class XapianSimpleMockIndex(indexes.SearchIndex): author = indexes.CharField(model_attr='author') url = indexes.CharField() non_anscii = indexes.CharField() + funny_text = indexes.CharField() datetime = indexes.DateTimeField(model_attr='pub_date') date = indexes.DateField() @@ -134,6 +135,9 @@ class XapianSimpleMockIndex(indexes.SearchIndex): def prepare_non_anscii(self, obj): return 'thsi sdas das corrup\xe7\xe3o das' + def prepare_funny_text(self, obj): + return 'this-text has funny.words!!' + def prepare_datetime(self, obj): return datetime.datetime(2009, 2, 25, 1, 1, 1) @@ -243,30 +247,27 @@ class BackendIndexationTestCase(HaystackBackendTestCase, TestCase): """ Tests that text is correctly positioned in the document """ - expected_order = ['this_is_a_word', 'inside', 'a', 'big', 'text'] + expected_order = ['^', 'this_is_a_word', 'inside', 'a', 'big', 'text', '$'] def get_positions(term): """ Uses delve to get the positions of the term in the first document. """ - return [int(pos) for pos in get_terms(self.backend, '-r1', '-t%s' % term)] + return sorted([int(pos) for pos in get_terms(self.backend, '-r1', '-tXTEXT%s' % term)]) - # (position of the first term) - 1 must be a position of "^" - self.assertTrue(get_positions(expected_order[0])[0] - 1 in - get_positions('^')) - - # (position of the last term) + 1 must be a position of "$" - self.assertTrue(get_positions(expected_order[-1])[0] + 1 in - get_positions('$')) - - previous_position = get_positions(expected_order[0])[0] + # confirms expected_order + previous_position = get_positions(expected_order[0]) for term in expected_order[1:]: pos = get_positions(term) - # only one term for the word - self.assertEqual(len(pos), 1) - self.assertEqual(pos[0] - 1, previous_position) - previous_position += 1 + # only two positions per term + # (one from term_generator, one from literal text) + self.assertEqual(len(pos), 2) + + self.assertEqual(pos[0] - 1, previous_position[0]) + self.assertEqual(pos[1] - 1, previous_position[1]) + previous_position[0] += 1 + previous_position[1] += 1 def test_author_field(self): terms = get_terms(self.backend, '-a') @@ -276,6 +277,10 @@ class BackendIndexationTestCase(HaystackBackendTestCase, TestCase): self.assertTrue('Zdavid' in terms) self.assertTrue('david' in terms) + def test_funny_text_field(self): + terms = get_terms(self.backend, '-r1') + self.assertTrue('this-text' in terms) + def test_datetime_field(self): terms = get_terms(self.backend, '-a') diff --git a/tests/xapian_tests/tests/test_query.py b/tests/xapian_tests/tests/test_query.py index 5baa9a6..5bfe361 100644 --- a/tests/xapian_tests/tests/test_query.py +++ b/tests/xapian_tests/tests/test_query.py @@ -161,9 +161,11 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"])) self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(((Zwhi OR why) AND ' - '( AND_NOT (XTITLE^dune$ OR ' - 'XTITLE^jaws$))))') + 'Xapian::Query(' + '((Zwhi OR why) AND ' + '( AND_NOT (' + '(XTITLE^ PHRASE 3 XTITLEdune PHRASE 3 XTITLE$) OR ' + '(XTITLE^ PHRASE 3 XTITLEjaws PHRASE 3 XTITLE$)))))') def test_build_query_in_filter_multiple_words(self): self.sq.add_filter(SQ(content='why')) @@ -238,8 +240,11 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True))) self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(((Zwhi OR why) AND ' - '(XTITLE^1$ OR XTITLE^2$ OR XTITLE^3$)))') + 'Xapian::Query(' + '((Zwhi OR why) AND (' + '(XTITLE^ PHRASE 3 XTITLE1 PHRASE 3 XTITLE$) OR ' + '(XTITLE^ PHRASE 3 XTITLE2 PHRASE 3 XTITLE$) OR ' + '(XTITLE^ PHRASE 3 XTITLE3 PHRASE 3 XTITLE$))))') class MockSearchIndex(indexes.SearchIndex): diff --git a/xapian_backend.py b/xapian_backend.py index 13e2998..7a8e4c1 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -288,6 +288,21 @@ class XapianSearchBackend(BaseSearchBackend): return term_generator.get_termpos() + def _add_literal_text(termpos, text, weight, prefix=''): + """ + Adds sentence to the document with positional information + but without processing. + + The sentence is bounded by "^" "$" to allow exact matches. + """ + text = '^ %s $' % text + for word in text.split(): + term = '%s%s' % (prefix, word) + document.add_posting(term, termpos, weight) + termpos += 1 + termpos += TERMPOS_DISTANCE + return termpos + def add_text(termpos, prefix, text, weight): """ Adds text to the document with positional information @@ -295,32 +310,24 @@ class XapianSearchBackend(BaseSearchBackend): """ termpos = _add_text(termpos, text, weight, prefix=prefix) termpos = _add_text(termpos, text, weight, prefix='') + termpos = _add_literal_text(termpos, text, weight, prefix=prefix) + termpos = _add_literal_text(termpos, text, weight, prefix='') return termpos for obj in iterable: document = xapian.Document() term_generator.set_document(document) - def add_to_document(prefix, sentence, weight): + def add_non_text_to_document(prefix, term, weight): """ - Adds sentence to the document without positional information + Adds term to the document without positional information and without processing. If the term is alone, also adds it as "^$" to allow exact matches on single terms. """ - if ' ' in sentence: - # search will use PHRASE, no need to add ^$ - for term in sentence.split(): - document.add_term(term, weight) - document.add_term(prefix + term, weight) - else: - document.add_term(sentence, weight) - document.add_term(prefix + sentence, weight) - # single terms are constructed by XapianSearchQuery._term_query - # and require ^$. - document.add_term("^%s$" % sentence, weight) - document.add_term(prefix + "^%s$" % sentence, weight) + document.add_term(term, weight) + document.add_term(prefix + term, weight) def add_datetime_to_document(termpos, prefix, term, weight): """ @@ -377,7 +384,6 @@ class XapianSearchBackend(BaseSearchBackend): # add the exact match of each value term = _to_xapian_term(t) termpos = add_text(termpos, prefix, term, weight) - add_to_document(prefix, term, weight) continue term = _to_xapian_term(value) @@ -391,8 +397,9 @@ class XapianSearchBackend(BaseSearchBackend): termpos = add_text(termpos, prefix, term, weight) elif field['type'] == 'datetime': termpos = add_datetime_to_document(termpos, prefix, term, weight) - # all terms are added without positional information - add_to_document(prefix, term, weight) + else: + # all other terms are added without positional information + add_non_text_to_document(prefix, term, weight) # store data without indexing it document.set_data(pickle.dumps( @@ -1316,15 +1323,11 @@ class XapianSearchQuery(BaseSearchQuery): Assumes term is not a list. """ - - # this is an hack: - # the ideal would be to use the same idea as in _filter_contains. - # However, it causes tests to fail. - if field_type == 'text' and ' ' in term: + if field_type == 'text': term = '^ %s $' % term query = self._phrase_query(term.split(), field_name, field_type) else: - query = self._term_query(term, field_name, field_type, exact=True, stemmed=False) + query = self._term_query(term, field_name, field_type, stemmed=False) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) @@ -1349,11 +1352,11 @@ class XapianSearchQuery(BaseSearchQuery): return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) return query - def _or_query(self, term_list, field, field_type, exact=False): + def _or_query(self, term_list, field, field_type): """ Joins each item of term_list decorated by _term_query with an OR. """ - term_list = [self._term_query(term, field, field_type, exact) for term in term_list] + term_list = [self._term_query(term, field, field_type) for term in term_list] return xapian.Query(xapian.Query.OP_OR, term_list) def _phrase_query(self, term_list, field_name, field_type): @@ -1370,22 +1373,14 @@ class XapianSearchQuery(BaseSearchQuery): query = xapian.Query(xapian.Query.OP_PHRASE, term_list) return query - def _term_query(self, term, field_name, field_type, exact=False, stemmed=True): + def _term_query(self, term, field_name, field_type, stemmed=True): """ Constructs a query of a single term. If `field_name` is not `None`, the term is search on that field only. If exact is `True`, the search is restricted to boolean matches. """ - # using stemmed terms in exact query is not acceptable. - if stemmed: - assert not exact - constructor = '{prefix}{term}' - # "" is to do a boolean match, but only works on indexed terms - # (constraint on Xapian side) - if exact and field_type == 'text': - constructor = '{prefix}^{term}$' # construct the prefix to be used. prefix = ''