From 8c805b72e55e337146c906aa4e0309f94fae0458 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 00:09:23 +0200 Subject: [PATCH 01/38] Added tests to address issue #123. --- tests/xapian_tests/models.py | 13 +- tests/xapian_tests/search_indexes.py | 22 +++ tests/xapian_tests/tests/test_live_xapian.py | 168 +++++++++++++++++++ 3 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 tests/xapian_tests/search_indexes.py create mode 100644 tests/xapian_tests/tests/test_live_xapian.py diff --git a/tests/xapian_tests/models.py b/tests/xapian_tests/models.py index 51c415e..21b3cc8 100644 --- a/tests/xapian_tests/models.py +++ b/tests/xapian_tests/models.py @@ -1,2 +1,11 @@ -# Copyright (C) 2009, 2010, 2011, 2012 David Sauve -# Copyright (C) 2009, 2010 Trapeze +from django.db import models + + +class Document(models.Model): + type_name = models.CharField(max_length=50) + number = models.IntegerField() + + date = models.DateField() + + summary = models.TextField() + text = models.TextField() diff --git a/tests/xapian_tests/search_indexes.py b/tests/xapian_tests/search_indexes.py new file mode 100644 index 0000000..357bd02 --- /dev/null +++ b/tests/xapian_tests/search_indexes.py @@ -0,0 +1,22 @@ +from haystack import indexes + +import models + + +class DocumentIndex(indexes.SearchIndex): + text = indexes.CharField(document=True) + summary = indexes.CharField(model_attr='summary') + + type_name = indexes.CharField(model_attr='type_name') + + number = indexes.IntegerField(model_attr='number') + + name = indexes.CharField() + date = indexes.DateField(model_attr='date') + + def get_model(self): + return models.Document() + + def prepare_name(self, obj): + return "%s %s" % (obj.type_name, str(obj.number)) + diff --git a/tests/xapian_tests/tests/test_live_xapian.py b/tests/xapian_tests/tests/test_live_xapian.py new file mode 100644 index 0000000..4f9a9d2 --- /dev/null +++ b/tests/xapian_tests/tests/test_live_xapian.py @@ -0,0 
+1,168 @@ +import datetime +from django.test import TestCase + +from haystack import connections +from haystack.inputs import AutoQuery +from haystack.query import SearchQuerySet + +from xapian_tests.models import Document +from xapian_tests.search_indexes import DocumentIndex + + +def pks(results): + return [result.pk for result in results] + + +class LiveXapianTestCase(TestCase): + + def setUp(self): + + types_names = ['book', 'magazine', 'article'] + texts = ['This is a huge text', + 'This is a medium text', + 'This is a small text'] + dates = [datetime.date(year=2010, month=1, day=1), + datetime.date(year=2010, month=2, day=1), + datetime.date(year=2010, month=3, day=1)] + + summaries = ['This is a huge summary', + 'This is a medium summary', + 'This is a small summary'] + + for i in range(1, 13): + doc = Document() + doc.type_name = types_names[i % 3] + doc.text = texts[i % 3] + doc.date = dates[i % 3] + doc.summary = summaries[i % 3] + doc.number = i * 2 + doc.save() + + self.index = DocumentIndex() + self.ui = connections['default'].get_unified_index() + self.ui.build(indexes=[self.index]) + + self.backend = connections['default'].get_backend() + self.backend.update(self.index, Document.objects.all()) + + self.queryset = SearchQuerySet() + + def tearDown(self): + Document.objects.all().delete() + self.backend.clear() + + def test_count(self): + self.assertEqual(self.queryset.count(), Document.objects.count()) + + def test_content_search(self): + result = self.queryset.filter(content='medium this') + self.assertEqual(sorted(pks(result)), + pks(Document.objects.all())) + + # documents with "medium" AND "this" have higher score + self.assertEqual(pks(result)[:4], [1, 4, 7, 10]) + + def test_field_search(self): + self.assertEqual(pks(self.queryset.filter(name='8')), [4]) + self.assertEqual(pks(self.queryset.filter(type_name='book')), + pks(Document.objects.filter(type_name='book'))) + + self.assertEqual(pks(self.queryset.filter(text='text huge')), + 
pks(Document.objects.filter(text__contains='text huge'))) + + def test_field_contains(self): + self.assertEqual(pks(self.queryset.filter(summary='huge')), + pks(Document.objects.filter(summary__contains='huge'))) + + result = self.queryset.filter(summary='huge summary') + self.assertEqual(sorted(pks(result)), + pks(Document.objects.all())) + + # documents with "huge" AND "summary" have higher score + self.assertEqual(pks(result)[:4], [3, 6, 9, 12]) + + def test_field_exact(self): + self.assertEqual(pks(self.queryset.filter(name__exact='8')), []) + self.assertEqual(pks(self.queryset.filter(name__exact='magazine 2')), [1]) + + def test_content_exact(self): + self.assertEqual(pks(self.queryset.filter(content__exact='huge')), []) + + def test_content_and(self): + self.assertEqual(pks(self.queryset.filter(content='huge').filter(summary='medium')), []) + + self.assertEqual(len(self.queryset.filter(content='huge this')), 12) + self.assertEqual(len(self.queryset.filter(content='huge this').filter(summary='huge')), 4) + + def test_content_or(self): + self.assertEqual(len(self.queryset.filter(content='huge medium')), 8) + self.assertEqual(len(self.queryset.filter(content='huge medium small')), 12) + + def test_field_and(self): + self.assertEqual(pks(self.queryset.filter(name='8').filter(name='4')), []) + + def test_field_or(self): + self.assertEqual(pks(self.queryset.filter(name='8 4')), [2, 4]) + + def test_field_in(self): + self.assertEqual(pks(self.queryset.filter(name__in=['magazine 2', 'article 4'])), [1, 2]) + + self.assertEqual(pks(self.queryset.filter(number__in=[4])), + pks(Document.objects.filter(number__in=[4]))) + + self.assertEqual(pks(self.queryset.filter(number__in=[4, 8])), + pks(Document.objects.filter(number__in=[4, 8]))) + + def test_private_fields(self): + self.assertEqual(pks(self.queryset.filter(django_id=4)), + pks(Document.objects.filter(id__in=[4]))) + self.assertEqual(pks(self.queryset.filter(django_id__in=[2, 4])), + 
pks(Document.objects.filter(id__in=[2, 4]))) + + self.assertEqual(pks(self.queryset.models(Document)), + pks(Document.objects.all())) + + def test_field_startswith(self): + self.assertEqual(len(self.queryset.filter(name__startswith='magaz')), 4) + self.assertEqual(len(self.queryset.filter(text__startswith='This is')), 12) + + def test_auto_query(self): + self.assertEqual(len(self.queryset.auto_query("huge OR medium")), 8) + self.assertEqual(len(self.queryset.auto_query("huge AND medium")), 0) + self.assertEqual(len(self.queryset.auto_query("huge -this")), 0) + self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 OR 4"))), 2) + self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 AND 4"))), 0) + + def test_value_range(self): + self.assertEqual(pks(self.queryset.filter(number__lt=3)), + pks(Document.objects.filter(number__lt=3))) + + self.assertEqual(pks(self.queryset.filter(django_id__gte=6)), + pks(Document.objects.filter(id__gte=6))) + + def test_date_range(self): + date = datetime.date(year=2010, month=2, day=1) + self.assertEqual(pks(self.queryset.filter(date__gte=date)), + pks(Document.objects.filter(date__gte=date))) + + date = datetime.date(year=2010, month=3, day=1) + self.assertEqual(pks(self.queryset.filter(date__lte=date)), + pks(Document.objects.filter(date__lte=date))) + + def test_order_by(self): + # private order + self.assertEqual(pks(self.queryset.order_by("-django_id")), + pks(Document.objects.order_by("-id"))) + + # value order + self.assertEqual(pks(self.queryset.order_by("number")), + pks(Document.objects.order_by("number"))) + + # text order + self.assertEqual(pks(self.queryset.order_by("summary")), + pks(Document.objects.order_by("summary"))) + + # date order + self.assertEqual(pks(self.queryset.order_by("-date")), + pks(Document.objects.order_by("-date"))) + From 9907b4522f142e4dba5b4b2c18697e4990af6f3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 19:58:32 +0200 Subject: [PATCH 
02/38] Refactored tests code. Mostly PEP8, but also redundant asserts. --- .../xapian_tests/tests/test_xapian_backend.py | 164 +++++++++++------- tests/xapian_tests/tests/test_xapian_query.py | 133 ++++++++++---- 2 files changed, 204 insertions(+), 93 deletions(-) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index 21f5ad7..3245ba2 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -27,6 +27,10 @@ def get_terms(backend, *args): return result.split(" ") +def pks(results): + return [result.pk for result in results] + + class XapianMockModel(models.Model): """ Same as tests.core.MockModel with a few extra fields for testing various @@ -225,19 +229,20 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.backend.update(self.index, self.sample_objs) def test_update(self): - self.assertEqual(self.backend.document_count(), 3) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3]) + self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), + [1, 2, 3]) def test_duplicate_update(self): - # Duplicates should be updated, not appended -- http://github.com/notanumber/xapian-haystack/issues/#issue/6 + """ + Regression test for #6. 
+ """ self.backend.update(self.index, self.sample_objs) - self.assertEqual(self.backend.document_count(), 3) def test_remove(self): self.backend.remove(self.sample_objs[0]) - self.assertEqual(self.backend.document_count(), 2) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [2, 3]) + self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), + [2, 3]) def test_clear(self): self.backend.clear() @@ -259,18 +264,21 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(self.backend.document_count(), 0) def test_search(self): + # no match query self.assertEqual(self.backend.search(xapian.Query()), {'hits': 0, 'results': []}) - self.assertEqual(self.backend.search(xapian.Query(''))['hits'], 3) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3]) - self.assertEqual(self.backend.search(xapian.Query('indexed'))['hits'], 3) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3]) + # all match query + self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), + [1, 2, 3]) - # Ensure that swapping the ``result_class`` works. 
- self.assertTrue(isinstance(self.backend.search(xapian.Query('indexed'), result_class=MockSearchResult)['results'][0], MockSearchResult)) + # Other `result_class` + self.assertTrue(isinstance(self.backend.search(xapian.Query('indexed'), + result_class=MockSearchResult)['results'][0], + MockSearchResult)) def test_search_field_with_punctuation(self): - # self.assertEqual(self.backend.search(xapian.Query('http://example.com/'))['hits'], 3) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query('http://example.com/1/'))['results']], [1]) + #self.assertEqual(self.backend.search(xapian.Query('http://example.com/'))['hits'], 3) + self.assertEqual(pks(self.backend.search(xapian.Query('http://example.com/1/'))['results']), + [1]) def test_search_by_mvf(self): self.assertEqual(self.backend.search(xapian.Query('ab'))['hits'], 1) @@ -279,22 +287,33 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(self.backend.search(xapian.Query('one'))['hits'], 3) def test_field_facets(self): - self.assertEqual(self.backend.search(xapian.Query(), facets=['name']), {'hits': 0, 'results': []}) + self.assertEqual(self.backend.search(xapian.Query(), facets=['name']), + {'hits': 0, 'results': []}) + results = self.backend.search(xapian.Query('indexed'), facets=['name']) self.assertEqual(results['hits'], 3) - self.assertEqual(results['facets']['fields']['name'], [('david1', 1), ('david2', 1), ('david3', 1)]) + self.assertEqual(results['facets']['fields']['name'], + [('david1', 1), ('david2', 1), ('david3', 1)]) results = self.backend.search(xapian.Query('indexed'), facets=['flag']) self.assertEqual(results['hits'], 3) - self.assertEqual(results['facets']['fields']['flag'], [(False, 1), (True, 2)]) + self.assertEqual(results['facets']['fields']['flag'], + [(False, 1), (True, 2)]) results = self.backend.search(xapian.Query('indexed'), facets=['sites']) self.assertEqual(results['hits'], 3) - 
self.assertEqual(results['facets']['fields']['sites'], [('1', 1), ('3', 2), ('2', 2), ('4', 1), ('6', 2), ('9', 1)]) + self.assertEqual(results['facets']['fields']['sites'], + [('1', 1), ('3', 2), ('2', 2), ('4', 1), ('6', 2), ('9', 1)]) def test_date_facets(self): - self.assertEqual(self.backend.search(xapian.Query(), date_facets={'pub_date': {'start_date': datetime.datetime(2008, 10, 26), 'end_date': datetime.datetime(2009, 3, 26), 'gap_by': 'month'}}), {'hits': 0, 'results': []}) - results = self.backend.search(xapian.Query('indexed'), date_facets={'pub_date': {'start_date': datetime.datetime(2008, 10, 26), 'end_date': datetime.datetime(2009, 3, 26), 'gap_by': 'month'}}) + facets = {'pub_date': {'start_date': datetime.datetime(2008, 10, 26), + 'end_date': datetime.datetime(2009, 3, 26), + 'gap_by': 'month'}} + + self.assertEqual(self.backend.search(xapian.Query(), date_facets=facets), + {'hits': 0, 'results': []}) + + results = self.backend.search(xapian.Query('indexed'), date_facets=facets) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['dates']['pub_date'], [ ('2009-02-26T00:00:00', 0), @@ -304,7 +323,11 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): ('2008-10-26T00:00:00', 0), ]) - results = self.backend.search(xapian.Query('indexed'), date_facets={'pub_date': {'start_date': datetime.datetime(2009, 02, 01), 'end_date': datetime.datetime(2009, 3, 15), 'gap_by': 'day', 'gap_amount': 15}}) + facets = {'pub_date': {'start_date': datetime.datetime(2009, 02, 01), + 'end_date': datetime.datetime(2009, 3, 15), + 'gap_by': 'day', + 'gap_amount': 15}} + results = self.backend.search(xapian.Query('indexed'), date_facets=facets) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['dates']['pub_date'], [ ('2009-03-03T00:00:00', 0), @@ -313,86 +336,101 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): ]) def test_query_facets(self): - 
self.assertEqual(self.backend.search(xapian.Query(), query_facets={'name': 'da*'}), {'hits': 0, 'results': []}) + self.assertEqual(self.backend.search(xapian.Query(), query_facets={'name': 'da*'}), + {'hits': 0, 'results': []}) + results = self.backend.search(xapian.Query('indexed'), query_facets={'name': 'da*'}) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['queries']['name'], ('da*', 3)) def test_narrow_queries(self): - self.assertEqual(self.backend.search(xapian.Query(), narrow_queries={'name:david1'}), {'hits': 0, 'results': []}) + self.assertEqual(self.backend.search(xapian.Query(), narrow_queries={'name:david1'}), + {'hits': 0, 'results': []}) results = self.backend.search(xapian.Query('indexed'), narrow_queries={'name:david1'}) self.assertEqual(results['hits'], 1) def test_highlight(self): - self.assertEqual(self.backend.search(xapian.Query(), highlight=True), {'hits': 0, 'results': []}) + self.assertEqual(self.backend.search(xapian.Query(), highlight=True), + {'hits': 0, 'results': []}) self.assertEqual(self.backend.search(xapian.Query('indexed'), highlight=True)['hits'], 3) - self.assertEqual([result.highlighted['text'] for result in self.backend.search(xapian.Query('indexed'), highlight=True)['results']], ['indexed!\n1', 'indexed!\n2', 'indexed!\n3']) + + results = self.backend.search(xapian.Query('indexed'), highlight=True)['results'] + self.assertEqual([result.highlighted['text'] for result in results], + ['indexed!\n1', 'indexed!\n2', 'indexed!\n3']) def test_spelling_suggestion(self): self.assertEqual(self.backend.search(xapian.Query('indxe'))['hits'], 0) - self.assertEqual(self.backend.search(xapian.Query('indxe'))['spelling_suggestion'], 'indexed') + self.assertEqual(self.backend.search(xapian.Query('indxe'))['spelling_suggestion'], + 'indexed') self.assertEqual(self.backend.search(xapian.Query('indxed'))['hits'], 0) - self.assertEqual(self.backend.search(xapian.Query('indxed'))['spelling_suggestion'], 'indexed') + 
self.assertEqual(self.backend.search(xapian.Query('indxed'))['spelling_suggestion'], + 'indexed') self.assertEqual(self.backend.search(xapian.Query('foo'))['hits'], 0) - self.assertEqual(self.backend.search(xapian.Query('foo'), spelling_query='indexy')['spelling_suggestion'], 'indexed') + self.assertEqual(self.backend.search(xapian.Query('foo'), spelling_query='indexy')['spelling_suggestion'], + 'indexed') self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['hits'], 0) - self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['spelling_suggestion'], 'david1') + self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['spelling_suggestion'], + 'david1') def test_more_like_this(self): results = self.backend.more_like_this(self.sample_objs[0]) - self.assertEqual(results['hits'], 2) - self.assertEqual([result.pk for result in results['results']], [3, 2]) - results = self.backend.more_like_this(self.sample_objs[0], additional_query=xapian.Query('david3')) - self.assertEqual(results['hits'], 1) - self.assertEqual([result.pk for result in results['results']], [3]) + self.assertEqual(pks(results['results']), [3, 2]) - results = self.backend.more_like_this(self.sample_objs[0], limit_to_registered_models=True) - self.assertEqual(results['hits'], 2) - self.assertEqual([result.pk for result in results['results']], [3, 2]) + results = self.backend.more_like_this(self.sample_objs[0], + additional_query=xapian.Query('david3')) - # Ensure that swapping the ``result_class`` works. 
- self.assertTrue(isinstance(self.backend.more_like_this(self.sample_objs[0], result_class=MockSearchResult)['results'][0], MockSearchResult)) + self.assertEqual(pks(results['results']), [3]) + + results = self.backend.more_like_this(self.sample_objs[0], + limit_to_registered_models=True) + + self.assertEqual(pks(results['results']), [3, 2]) + + # Other `result_class` + self.assertTrue(isinstance(self.backend.more_like_this(self.sample_objs[0], + result_class=MockSearchResult)['results'][0], + MockSearchResult)) def test_order_by(self): results = self.backend.search(xapian.Query(''), sort_by=['pub_date']) - self.assertEqual([result.pk for result in results['results']], [3, 2, 1]) + self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['-pub_date']) - self.assertEqual([result.pk for result in results['results']], [1, 2, 3]) + self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['exp_date']) - self.assertEqual([result.pk for result in results['results']], [1, 2, 3]) + self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-exp_date']) - self.assertEqual([result.pk for result in results['results']], [3, 2, 1]) + self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['id']) - self.assertEqual([result.pk for result in results['results']], [1, 2, 3]) + self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-id']) - self.assertEqual([result.pk for result in results['results']], [3, 2, 1]) + self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['value']) - self.assertEqual([result.pk for result in results['results']], [1, 2, 3]) + self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), 
sort_by=['-value']) - self.assertEqual([result.pk for result in results['results']], [3, 2, 1]) + self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['popularity']) - self.assertEqual([result.pk for result in results['results']], [2, 1, 3]) + self.assertEqual(pks(results['results']), [2, 1, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-popularity']) - self.assertEqual([result.pk for result in results['results']], [3, 1, 2]) + self.assertEqual(pks(results['results']), [3, 1, 2]) results = self.backend.search(xapian.Query(''), sort_by=['flag', 'id']) - self.assertEqual([result.pk for result in results['results']], [2, 1, 3]) + self.assertEqual(pks(results['results']), [2, 1, 3]) results = self.backend.search(xapian.Query(''), sort_by=['flag', '-id']) - self.assertEqual([result.pk for result in results['results']], [2, 3, 1]) + self.assertEqual(pks(results['results']), [2, 3, 1]) def test_verify_type(self): self.assertEqual([result.month for result in self.backend.search(xapian.Query(''))['results']], @@ -412,7 +450,9 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 18, 1, 16, 30, 250)), '20090518011630000250') def test_build_schema(self): - (content_field_name, fields) = self.backend.build_schema(connections['default'].get_unified_index().all_searchfields()) + search_fields = connections['default'].get_unified_index().all_searchfields() + (content_field_name, fields) = self.backend.build_schema(search_fields) + self.assertEqual(content_field_name, 'text') self.assertEqual(len(fields), 14 + 3) self.assertEqual(fields, [ @@ -436,13 +476,17 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): ]) def test_parse_query(self): - self.assertEqual(str(self.backend.parse_query('indexed')), 'Xapian::Query(Zindex:(pos=1))') - self.assertEqual(str(self.backend.parse_query('name:david')), 
'Xapian::Query(ZXNAMEdavid:(pos=1))') + self.assertEqual(str(self.backend.parse_query('indexed')), + 'Xapian::Query(Zindex:(pos=1))') + self.assertEqual(str(self.backend.parse_query('name:david')), + 'Xapian::Query(ZXNAMEdavid:(pos=1))') if xapian.minor_version() >= 2: - self.assertEqual(str(self.backend.parse_query('name:da*')), 'Xapian::Query((XNAMEdavid1:(pos=1) SYNONYM XNAMEdavid2:(pos=1) SYNONYM XNAMEdavid3:(pos=1)))') + self.assertEqual(str(self.backend.parse_query('name:da*')), + 'Xapian::Query((XNAMEdavid1:(pos=1) SYNONYM XNAMEdavid2:(pos=1) SYNONYM XNAMEdavid3:(pos=1)))') else: - self.assertEqual(str(self.backend.parse_query('name:da*')), 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1)))') + self.assertEqual(str(self.backend.parse_query('name:da*')), + 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1)))') self.assertEqual(str(self.backend.parse_query('name:david1..david2')), 'Xapian::Query(VALUE_RANGE 7 david1 david2)') @@ -456,7 +500,10 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): b'Xapian::Query(VALUE_RANGE 9 \xb2` \xba@)') def test_order_by_django_id(self): - self.backend.clear() + """ + We need this test because ordering on more than + 10 entries was not correct at some point. 
+ """ self.sample_objs = [] number_list = range(1, 101) for i in number_list: @@ -476,8 +523,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.backend.update(self.index, self.sample_objs) results = self.backend.search(xapian.Query(''), sort_by=['-django_id']) - self.assertEqual(results['hits'], len(number_list)) - self.assertEqual([result.pk for result in results['results']], list(reversed(number_list))) + self.assertEqual(pks(results['results']), list(reversed(number_list))) def test_more_like_this_with_unindexed_model(self): """ diff --git a/tests/xapian_tests/tests/test_xapian_query.py b/tests/xapian_tests/tests/test_xapian_query.py index adb3678..c2d71a1 100644 --- a/tests/xapian_tests/tests/test_xapian_query.py +++ b/tests/xapian_tests/tests/test_xapian_query.py @@ -25,145 +25,210 @@ class XapianSearchQueryTestCase(TestCase): super(XapianSearchQueryTestCase, self).tearDown() def test_build_query_all(self): - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query()') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query()') def test_build_query_single_word(self): self.sq.add_filter(SQ(content='hello')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Zhello OR hello))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Zhello OR hello))') def test_build_query_single_word_not(self): self.sq.add_filter(~SQ(content='hello')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (Zhello OR hello)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT (Zhello OR hello)))') def test_build_query_single_word_field_exact(self): self.sq.add_filter(SQ(foo='hello')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((ZXFOOhello OR XFOOhello))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((ZXFOOhello OR XFOOhello))') def test_build_query_single_word_field_exact_not(self): self.sq.add_filter(~SQ(foo='hello')) - 
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (ZXFOOhello OR XFOOhello)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT (ZXFOOhello OR XFOOhello)))') def test_build_query_boolean(self): self.sq.add_filter(SQ(content=True)) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Ztrue OR true))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Ztrue OR true))') def test_build_query_date(self): self.sq.add_filter(SQ(content=datetime.date(2009, 5, 8))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Z20090508000000 OR 20090508000000))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Z20090508000000 OR 20090508000000))') def test_build_query_date_not(self): self.sq.add_filter(~SQ(content=datetime.date(2009, 5, 8))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (Z20090508000000 OR 20090508000000)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT (Z20090508000000 OR 20090508000000)))') def test_build_query_datetime(self): self.sq.add_filter(SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Z20090508112800 OR 20090508112800))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Z20090508112800 OR 20090508112800))') def test_build_query_datetime_not(self): self.sq.add_filter(~SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (Z20090508112800 OR 20090508112800)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT (Z20090508112800 OR 20090508112800)))') def test_build_query_float(self): self.sq.add_filter(SQ(content=25.52)) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Z25.52 OR 25.52))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Z25.52 OR 25.52))') def test_build_query_multiple_words_and(self): 
self.sq.add_filter(SQ(content='hello')) self.sq.add_filter(SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zhello OR hello) AND (Zworld OR world)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zhello OR hello) AND (Zworld OR world)))') def test_build_query_multiple_words_not(self): self.sq.add_filter(~SQ(content='hello')) self.sq.add_filter(~SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((( AND_NOT (Zhello OR hello)) AND ( AND_NOT (Zworld OR world))))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '( AND_NOT (Zhello OR hello)) AND ' + '( AND_NOT (Zworld OR world))))') def test_build_query_multiple_words_or(self): self.sq.add_filter(SQ(content='hello') | SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Zhello OR hello OR Zworld OR world))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Zhello OR hello OR Zworld OR world))') def test_build_query_multiple_words_or_not(self): self.sq.add_filter(~SQ(content='hello') | ~SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((( AND_NOT (Zhello OR hello)) OR ( AND_NOT (Zworld OR world))))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '( AND_NOT (Zhello OR hello)) OR ' + '( AND_NOT (Zworld OR world))))') def test_build_query_multiple_words_mixed(self): self.sq.add_filter(SQ(content='why') | SQ(content='hello')) self.sq.add_filter(~SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why OR Zhello OR hello) AND ( AND_NOT (Zworld OR world))))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '(Zwhi OR why OR Zhello OR hello) AND ' + '( AND_NOT (Zworld OR world))))') def test_build_query_multiple_word_field_exact(self): self.sq.add_filter(SQ(foo='hello')) self.sq.add_filter(SQ(bar='world')) - self.assertEqual(str(self.sq.build_query()), 
'Xapian::Query(((ZXFOOhello OR XFOOhello) AND (ZXBARworld OR XBARworld)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '(ZXFOOhello OR XFOOhello) AND ' + '(ZXBARworld OR XBARworld)))') def test_build_query_multiple_word_field_exact_not(self): self.sq.add_filter(~SQ(foo='hello')) self.sq.add_filter(~SQ(bar='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((( AND_NOT (ZXFOOhello OR XFOOhello)) AND ( AND_NOT (ZXBARworld OR XBARworld))))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '( AND_NOT (ZXFOOhello OR XFOOhello)) AND ' + '( AND_NOT (ZXBARworld OR XBARworld))))') def test_build_query_phrase(self): self.sq.add_filter(SQ(content='hello world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((hello PHRASE 2 world))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((hello PHRASE 2 world))') def test_build_query_phrase_not(self): self.sq.add_filter(~SQ(content='hello world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (hello PHRASE 2 world)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(' + '( AND_NOT (hello PHRASE 2 world)))') def test_build_query_boost(self): self.sq.add_filter(SQ(content='hello')) self.sq.add_boost('world', 5) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zhello OR hello) AND_MAYBE 5 * (Zworld OR world)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '(Zhello OR hello) AND_MAYBE ' + '5 * (Zworld OR world)))') def test_build_query_in_filter_single_words(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(title__in=["Dune", "Jaws"])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND (ZXTITLEdune OR XTITLEdune OR ZXTITLEjaw OR XTITLEjaws)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '(ZXTITLEdune OR XTITLEdune OR ZXTITLEjaw OR XTITLEjaws)))') def 
test_build_query_not_in_filter_single_words(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ( AND_NOT (ZXTITLEdune OR XTITLEdune OR ZXTITLEjaw OR XTITLEjaws))))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '( AND_NOT (ZXTITLEdune OR XTITLEdune OR ' + 'ZXTITLEjaw OR XTITLEjaws))))') def test_build_query_in_filter_multiple_words(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article"])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ((XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR (XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle))))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '((XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR ' + '(XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle))))') def test_build_query_in_filter_multiple_words_with_punctuation(self): self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article", "My Store Inc."])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR (XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle) OR (XTITLEmy PHRASE 3 XTITLEstore PHRASE 3 XTITLEinc.)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '(XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR ' + '(XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle) OR ' + '(XTITLEmy PHRASE 3 XTITLEstore PHRASE 3 XTITLEinc.)))') def test_build_query_not_in_filter_multiple_words(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(~SQ(title__in=["A Famous Paper", "An Infamous Article"])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ( AND_NOT ((XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 
XTITLEpaper) OR (XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle)))))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '( AND_NOT ((XTITLEa PHRASE 3 ' + 'XTITLEfamous PHRASE 3 ' + 'XTITLEpaper) OR (XTITLEan PHRASE 3 ' + 'XTITLEinfamous PHRASE 3 XTITLEarticle)))))') def test_build_query_in_filter_datetime(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(pub_date__in=[datetime.datetime(2009, 7, 6, 1, 56, 21)])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND (ZXPUB_DATE20090706015621 OR XPUB_DATE20090706015621)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '(ZXPUB_DATE20090706015621 OR XPUB_DATE20090706015621)))') def test_clean(self): self.assertEqual(self.sq.clean('hello world'), 'hello world') self.assertEqual(self.sq.clean('hello AND world'), 'hello AND world') - self.assertEqual(self.sq.clean('hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \ world'), 'hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \ world') - self.assertEqual(self.sq.clean('so please NOTe i am in a bAND and bORed'), 'so please NOTe i am in a bAND and bORed') + self.assertEqual(self.sq.clean('hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \ world'), + 'hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? 
: \ world') + self.assertEqual(self.sq.clean('so please NOTe i am in a bAND and bORed'), + 'so please NOTe i am in a bAND and bORed') def test_build_query_with_models(self): self.sq.add_filter(SQ(content='hello')) self.sq.add_model(MockModel) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zhello OR hello) AND 0 * CONTENTTYPEcore.mockmodel))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zhello OR hello) AND ' + '0 * CONTENTTYPEcore.mockmodel))') self.sq.add_model(AnotherMockModel) self.assertTrue(str(self.sq.build_query()) in ( - 'Xapian::Query(((Zhello OR hello) AND (0 * CONTENTTYPEcore.anothermockmodel OR 0 * CONTENTTYPEcore.mockmodel)))', - 'Xapian::Query(((Zhello OR hello) AND (0 * CONTENTTYPEcore.mockmodel OR 0 * CONTENTTYPEcore.anothermockmodel)))')) + 'Xapian::Query(((Zhello OR hello) AND ' + '(0 * CONTENTTYPEcore.anothermockmodel OR ' + '0 * CONTENTTYPEcore.mockmodel)))', + 'Xapian::Query(((Zhello OR hello) AND ' + '(0 * CONTENTTYPEcore.mockmodel OR ' + '0 * CONTENTTYPEcore.anothermockmodel)))')) def test_build_query_with_punctuation(self): self.sq.add_filter(SQ(content='http://www.example.com')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Zhttp://www.example.com OR http://www.example.com))') + self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Zhttp://www.example.com OR ' + 'http://www.example.com))') def test_in_filter_values_list(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND (ZXTITLE1 OR XTITLE1 OR ZXTITLE2 OR XTITLE2 OR ZXTITLE3 OR XTITLE3)))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '(ZXTITLE1 OR XTITLE1 OR ZXTITLE2 OR ' + 'XTITLE2 OR ZXTITLE3 OR XTITLE3)))') From 93522bbd56d7d5f75b5a495f157cfbb1a226cdc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: 
Sat, 17 May 2014 11:15:20 +0200 Subject: [PATCH 03/38] Changed name of variable from "field" to "field_name". --- xapian_backend.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index 1196395..63f09c9 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1009,7 +1009,7 @@ class XapianSearchQuery(BaseSearchQuery): ) else: expression, term = child - field, filter_type = search_node.split_expression(expression) + field_name, filter_type = search_node.split_expression(expression) # Handle when we've got a ``ValuesListQuerySet``... if hasattr(term, 'values_list'): @@ -1020,25 +1020,25 @@ class XapianSearchQuery(BaseSearchQuery): else: term = _marshal_term(term) - if field == 'content': + if field_name == 'content': query_list.append(self._content_field(term, is_not)) else: if filter_type == 'contains': - query_list.append(self._filter_contains(term, field, is_not)) + query_list.append(self._filter_contains(term, field_name, is_not)) elif filter_type == 'exact': - query_list.append(self._filter_exact(term, field, is_not)) + query_list.append(self._filter_exact(term, field_name, is_not)) elif filter_type == 'gt': - query_list.append(self._filter_gt(term, field, is_not)) + query_list.append(self._filter_gt(term, field_name, is_not)) elif filter_type == 'gte': - query_list.append(self._filter_gte(term, field, is_not)) + query_list.append(self._filter_gte(term, field_name, is_not)) elif filter_type == 'lt': - query_list.append(self._filter_lt(term, field, is_not)) + query_list.append(self._filter_lt(term, field_name, is_not)) elif filter_type == 'lte': - query_list.append(self._filter_lte(term, field, is_not)) + query_list.append(self._filter_lte(term, field_name, is_not)) elif filter_type == 'startswith': - query_list.append(self._filter_startswith(term, field, is_not)) + query_list.append(self._filter_startswith(term, field_name, is_not)) elif filter_type == 'in': - 
query_list.append(self._filter_in(term, field, is_not)) + query_list.append(self._filter_in(term, field_name, is_not)) if search_node.connector == 'OR': return xapian.Query(xapian.Query.OP_OR, query_list) From 83bd8ee3e5f2acd66766f30c0bf82bd5cd21737f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 12:42:45 +0200 Subject: [PATCH 04/38] Reordered methods position in XapianSearchQuery. Also removed two @staticmethod. --- xapian_backend.py | 138 +++++++++++++++++++++++----------------------- 1 file changed, 68 insertions(+), 70 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index 63f09c9..f9123e1 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1045,6 +1045,15 @@ class XapianSearchQuery(BaseSearchQuery): else: return xapian.Query(xapian.Query.OP_AND, query_list) + def _all_query(self): + """ + Private method that returns a xapian.Query that returns all documents, + + Returns: + A xapian.Query + """ + return xapian.Query('') + def _content_field(self, term, is_not): """ Private method that returns a xapian.Query that searches for `value` @@ -1090,25 +1099,6 @@ class XapianSearchQuery(BaseSearchQuery): else: return query - def _filter_exact(self, term, field, is_not): - """ - Private method that returns a xapian.Query that searches for an exact - match for `term` in a specified `field`. 
- - Required arguments: - ``term`` -- The term to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query - """ - query = self._phrase_query(term.split(), field) - if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) - else: - return query - def _filter_in(self, term_list, field, is_not): """ Private method that returns a xapian.Query that searches for any term @@ -1138,6 +1128,25 @@ class XapianSearchQuery(BaseSearchQuery): else: return xapian.Query(xapian.Query.OP_OR, query_list) + def _filter_exact(self, term, field, is_not): + """ + Private method that returns a xapian.Query that searches for an exact + match for `term` in a specified `field`. + + Required arguments: + ``term`` -- The term to search for + ``field`` -- The field to search + ``is_not`` -- Invert the search results + + Returns: + A xapian.Query + """ + query = self._phrase_query(term.split(), field) + if is_not: + return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) + else: + return query + def _filter_startswith(self, term, field, is_not): """ Private method that returns a xapian.Query that searches for any term @@ -1159,49 +1168,21 @@ class XapianSearchQuery(BaseSearchQuery): ) return self.backend.parse_query('%s:%s*' % (field, term)) - def _filter_gt(self, term, field, is_not): - return self._filter_lte(term, field, is_not=not is_not) + def _phrase_query(self, term_list, field=None, is_content=False): + """ + Private method that returns a phrase based xapian.Query that searches + for terms in `term_list. - def _filter_lt(self, term, field, is_not): - return self._filter_gte(term, field, is_not=not is_not) - - def _filter_gte(self, term, field, is_not): - """ - Private method that returns a xapian.Query that searches for any term - that is greater than `term` in a specified `field`. 
- """ - vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:%s' % (field, _marshal_value(term)), '*') - if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, - self._all_query(), - xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - ) - return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - - def _filter_lte(self, term, field, is_not): - """ - Private method that returns a xapian.Query that searches for any term - that is less than `term` in a specified `field`. - """ - vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:' % field, '%s' % _marshal_value(term)) - if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, - self._all_query(), - xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - ) - return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - - @staticmethod - def _all_query(): - """ - Private method that returns a xapian.Query that returns all documents, + Required arguments: + ``term_list`` -- The terms to search for + ``field`` -- The field to search (If `None`, all fields) Returns: A xapian.Query """ - return xapian.Query('') + if field and not is_content: + term_list = ['%s%s%s' % (TERM_PREFIXES['field'], field.upper(), term) for term in term_list] + return xapian.Query(xapian.Query.OP_PHRASE, term_list) def _term_query(self, term, field=None): """ @@ -1236,22 +1217,39 @@ class XapianSearchQuery(BaseSearchQuery): xapian.Query(unstemmed) ) - @staticmethod - def _phrase_query(term_list, field=None, is_content=False): - """ - Private method that returns a phrase based xapian.Query that searches - for terms in `term_list. 
+ def _filter_gt(self, term, field, is_not): + return self._filter_lte(term, field, is_not=not is_not) - Required arguments: - ``term_list`` -- The terms to search for - ``field`` -- The field to search (If `None`, all fields) + def _filter_lt(self, term, field, is_not): + return self._filter_gte(term, field, is_not=not is_not) - Returns: - A xapian.Query + def _filter_gte(self, term, field, is_not): """ - if field and not is_content: - term_list = ['%s%s%s' % (TERM_PREFIXES['field'], field.upper(), term) for term in term_list] - return xapian.Query(xapian.Query.OP_PHRASE, term_list) + Private method that returns a xapian.Query that searches for any term + that is greater than `term` in a specified `field`. + """ + vrp = XHValueRangeProcessor(self.backend) + pos, begin, end = vrp('%s:%s' % (field, _marshal_value(term)), '*') + if is_not: + return xapian.Query(xapian.Query.OP_AND_NOT, + self._all_query(), + xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) + ) + return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) + + def _filter_lte(self, term, field, is_not): + """ + Private method that returns a xapian.Query that searches for any term + that is less than `term` in a specified `field`. + """ + vrp = XHValueRangeProcessor(self.backend) + pos, begin, end = vrp('%s:' % field, '%s' % _marshal_value(term)) + if is_not: + return xapian.Query(xapian.Query.OP_AND_NOT, + self._all_query(), + xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) + ) + return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) def _marshal_value(value): From f21e2c373c30ad60c7a4bc366fbe7799f6e2f54f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 12:53:58 +0200 Subject: [PATCH 05/38] Added new argument on methods to build queries. This allows more flexibility on constructing the queries. For now the argument is not used. 
--- xapian_backend.py | 50 +++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index f9123e1..5d7c7af 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1024,9 +1024,9 @@ class XapianSearchQuery(BaseSearchQuery): query_list.append(self._content_field(term, is_not)) else: if filter_type == 'contains': - query_list.append(self._filter_contains(term, field_name, is_not)) + query_list.append(self._filter_contains(term, field_name, None, is_not)) elif filter_type == 'exact': - query_list.append(self._filter_exact(term, field_name, is_not)) + query_list.append(self._filter_exact(term, field_name, None, is_not)) elif filter_type == 'gt': query_list.append(self._filter_gt(term, field_name, is_not)) elif filter_type == 'gte': @@ -1036,9 +1036,9 @@ class XapianSearchQuery(BaseSearchQuery): elif filter_type == 'lte': query_list.append(self._filter_lte(term, field_name, is_not)) elif filter_type == 'startswith': - query_list.append(self._filter_startswith(term, field_name, is_not)) + query_list.append(self._filter_startswith(term, field_name, None, is_not)) elif filter_type == 'in': - query_list.append(self._filter_in(term, field_name, is_not)) + query_list.append(self._filter_in(term, field_name, None, is_not)) if search_node.connector == 'OR': return xapian.Query(xapian.Query.OP_OR, query_list) @@ -1070,14 +1070,14 @@ class XapianSearchQuery(BaseSearchQuery): if ' ' in term: query = self._phrase_query(term.split(), self.backend.content_field_name, is_content=True) else: - query = self._term_query(term) + query = self._term_query(term, None, None) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) else: return query - def _filter_contains(self, term, field, is_not): + def _filter_contains(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for `term` in a specified `field`. 
@@ -1091,15 +1091,15 @@ class XapianSearchQuery(BaseSearchQuery): A xapian.Query """ if ' ' in term: - return self._filter_exact(term, field, is_not) + return self._filter_exact(term, field_name, field_type, is_not) else: - query = self._term_query(term, field) + query = self._term_query(term, field_name, field_type) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) else: return query - def _filter_in(self, term_list, field, is_not): + def _filter_in(self, term_list, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for any term of `value_list` in a specified `field`. @@ -1116,11 +1116,11 @@ class XapianSearchQuery(BaseSearchQuery): for term in term_list: if ' ' in term: query_list.append( - self._phrase_query(term.split(), field) + self._phrase_query(term.split(), field_name) ) else: query_list.append( - self._term_query(term, field) + self._term_query(term, field_name, field_type) ) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), @@ -1128,7 +1128,7 @@ class XapianSearchQuery(BaseSearchQuery): else: return xapian.Query(xapian.Query.OP_OR, query_list) - def _filter_exact(self, term, field, is_not): + def _filter_exact(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for an exact match for `term` in a specified `field`. @@ -1141,13 +1141,13 @@ class XapianSearchQuery(BaseSearchQuery): Returns: A xapian.Query """ - query = self._phrase_query(term.split(), field) + query = self._phrase_query(term.split(), field_name) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) else: return query - def _filter_startswith(self, term, field, is_not): + def _filter_startswith(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for any term that begins with `term` in a specified `field`. 
@@ -1164,11 +1164,11 @@ class XapianSearchQuery(BaseSearchQuery): return xapian.Query( xapian.Query.OP_AND_NOT, self._all_query(), - self.backend.parse_query('%s:%s*' % (field, term)), + self.backend.parse_query('%s:%s*' % (field_name, term)), ) - return self.backend.parse_query('%s:%s*' % (field, term)) + return self.backend.parse_query('%s:%s*' % (field_name, term)) - def _phrase_query(self, term_list, field=None, is_content=False): + def _phrase_query(self, term_list, field_name, is_content=False): """ Private method that returns a phrase based xapian.Query that searches for terms in `term_list. @@ -1180,11 +1180,11 @@ class XapianSearchQuery(BaseSearchQuery): Returns: A xapian.Query """ - if field and not is_content: - term_list = ['%s%s%s' % (TERM_PREFIXES['field'], field.upper(), term) for term in term_list] + if field_name and not is_content: + term_list = ['%s%s%s' % (TERM_PREFIXES['field'], field_name.upper(), term) for term in term_list] return xapian.Query(xapian.Query.OP_PHRASE, term_list) - def _term_query(self, term, field=None): + def _term_query(self, term, field_name, field_type, exact=False): """ Private method that returns a term based xapian.Query that searches for `term`. 
@@ -1198,14 +1198,14 @@ class XapianSearchQuery(BaseSearchQuery): """ stem = xapian.Stem(self.backend.language) - if field in ('id', 'django_id', 'django_ct'): - return xapian.Query('%s%s' % (TERM_PREFIXES[field], term)) - elif field: + if field_name in ('id', 'django_id', 'django_ct'): + return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term)) + elif field_name: stemmed = 'Z%s%s%s' % ( - TERM_PREFIXES['field'], field.upper(), stem(term) + TERM_PREFIXES['field'], field_name.upper(), stem(term) ) unstemmed = '%s%s%s' % ( - TERM_PREFIXES['field'], field.upper(), term + TERM_PREFIXES['field'], field_name.upper(), term ) else: stemmed = 'Z%s' % stem(term) From 98949009abd4f87f1c8355299b3d2798b2f65e6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 12:58:47 +0200 Subject: [PATCH 06/38] Modified how the query construction methods are chosen. - "content" is a special field with generic search, but now allows filters. - parameter field_type is now retrieved from the backend and passed to the construction methods. --- xapian_backend.py | 73 +++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 41 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index 5d7c7af..4484fa7 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1011,7 +1011,7 @@ class XapianSearchQuery(BaseSearchQuery): expression, term = child field_name, filter_type = search_node.split_expression(expression) - # Handle when we've got a ``ValuesListQuerySet``... + # Handle `ValuesListQuerySet`. if hasattr(term, 'values_list'): term = list(term) @@ -1021,24 +1021,38 @@ class XapianSearchQuery(BaseSearchQuery): term = _marshal_term(term) if field_name == 'content': - query_list.append(self._content_field(term, is_not)) - else: + # content is the generic search: + # force no field_name search + # and the field_type to be 'text'.
+ field_name = None + field_type = 'text' + + query_list.append(self._filter_contains(term, field_name, field_type, is_not)) + # when filter has no filter_type, haystack uses + # filter_type = 'contains'. Here we remove it + # since the above query is already doing this if filter_type == 'contains': - query_list.append(self._filter_contains(term, field_name, None, is_not)) - elif filter_type == 'exact': - query_list.append(self._filter_exact(term, field_name, None, is_not)) - elif filter_type == 'gt': - query_list.append(self._filter_gt(term, field_name, is_not)) - elif filter_type == 'gte': - query_list.append(self._filter_gte(term, field_name, is_not)) - elif filter_type == 'lt': - query_list.append(self._filter_lt(term, field_name, is_not)) - elif filter_type == 'lte': - query_list.append(self._filter_lte(term, field_name, is_not)) - elif filter_type == 'startswith': - query_list.append(self._filter_startswith(term, field_name, None, is_not)) - elif filter_type == 'in': - query_list.append(self._filter_in(term, field_name, None, is_not)) + filter_type = None + else: + # pick the field_type from the backend + field_type = self.backend.schema[self.backend.column(field_name)]['type'] + + if filter_type == 'contains': + query_list.append(self._filter_contains(term, field_name, field_type, is_not)) + elif filter_type == 'exact': + query_list.append(self._filter_exact(term, field_name, field_type, is_not)) + elif filter_type == 'in': + query_list.append(self._filter_in(term, field_name, field_type, is_not)) + elif filter_type == 'startswith': + query_list.append(self._filter_startswith(term, field_name, field_type, is_not)) + elif filter_type == 'gt': + query_list.append(self._filter_gt(term, field_name, is_not)) + elif filter_type == 'gte': + query_list.append(self._filter_gte(term, field_name, is_not)) + elif filter_type == 'lt': + query_list.append(self._filter_lt(term, field_name, is_not)) + elif filter_type == 'lte': + query_list.append(self._filter_lte(term, 
field_name, is_not)) if search_node.connector == 'OR': return xapian.Query(xapian.Query.OP_OR, query_list) @@ -1054,29 +1068,6 @@ class XapianSearchQuery(BaseSearchQuery): """ return xapian.Query('') - def _content_field(self, term, is_not): - """ - Private method that returns a xapian.Query that searches for `value` - in all fields. - - Required arguments: - ``term`` -- The term to search for - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query - """ - # it is more than one term, we build a PHRASE - if ' ' in term: - query = self._phrase_query(term.split(), self.backend.content_field_name, is_content=True) - else: - query = self._term_query(term, None, None) - - if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) - else: - return query - def _filter_contains(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for `term` From e7dfbae41f3501748c308d17c7565b305ad048ee Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 13:08:34 +0200 Subject: [PATCH 07/38] Refactored _term_query query constructor. Its interface is explained in the docstring. --- xapian_backend.py | 54 +++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 25 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index 4484fa7..a519805 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1177,36 +1177,40 @@ class XapianSearchQuery(BaseSearchQuery): def _term_query(self, term, field_name, field_type, exact=False): """ - Private method that returns a term based xapian.Query that searches - for `term`. + Constructs a query of a single term. - Required arguments: - ``term`` -- The term to search for - ``field`` -- The field to search (If `None`, all fields) - - Returns: - A xapian.Query + If `field_name` is not `None`, the term is search on that field only. 
+ If exact is `True`, the search is restricted to non-stemmed boolean match. """ - stem = xapian.Stem(self.backend.language) - if field_name in ('id', 'django_id', 'django_ct'): + # to ensure the value is serialized correctly. + if field_name == 'django_id': + term = int(term) + term = _marshal_value(term) return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term)) - elif field_name: - stemmed = 'Z%s%s%s' % ( - TERM_PREFIXES['field'], field_name.upper(), stem(term) - ) - unstemmed = '%s%s%s' % ( - TERM_PREFIXES['field'], field_name.upper(), term - ) - else: - stemmed = 'Z%s' % stem(term) - unstemmed = term - return xapian.Query( - xapian.Query.OP_OR, - xapian.Query(stemmed), - xapian.Query(unstemmed) - ) + constructor = '{prefix}{term}' + # "" is to do a boolean match, but only works on indexed terms + # (constraint on Xapian side) + if exact and field_type == 'text': + constructor = '"{prefix}{term}"' + + prefix = '' + if field_name: + prefix = TERM_PREFIXES['field'] + field_name.upper() + term = _marshal_value(term) + + unstemmed = constructor.format(prefix=prefix, term=term) + if exact: + return xapian.Query(unstemmed) + else: + stem = xapian.Stem(self.backend.language) + stemmed = 'Z' + constructor.format(prefix=prefix, term=stem(term)) + + return xapian.Query(xapian.Query.OP_OR, + xapian.Query(stemmed), + xapian.Query(unstemmed) + ) def _filter_gt(self, term, field, is_not): return self._filter_lte(term, field, is_not=not is_not) From 7e09d3d0b1f9c41d8f78721ed589bde70d8c9eda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 13:12:45 +0200 Subject: [PATCH 08/38] Refactored _phrase_query query constructor. Its interface is explained in the docstring. 
--- xapian_backend.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index a519805..f2a15b1 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1159,20 +1159,24 @@ class XapianSearchQuery(BaseSearchQuery): ) return self.backend.parse_query('%s:%s*' % (field_name, term)) - def _phrase_query(self, term_list, field_name, is_content=False): + def _phrase_query(self, term_list, field_name): """ - Private method that returns a phrase based xapian.Query that searches - for terms in `term_list. + Returns a query that matches exact terms and with + positional order (i.e. ["this", "thing"] != ["thing", "this"]) - Required arguments: - ``term_list`` -- The terms to search for - ``field`` -- The field to search (If `None`, all fields) - - Returns: - A xapian.Query + If `field_name` is given, this match is restricted to the field. """ - if field_name and not is_content: - term_list = ['%s%s%s' % (TERM_PREFIXES['field'], field_name.upper(), term) for term in term_list] + prefix = '' + if field_name in ('id', 'django_id', 'django_ct'): + prefix = TERM_PREFIXES[field_name] + elif field_name: + prefix = TERM_PREFIXES['field'] + + if field_name: + term_list = ['%s%s%s' % (prefix, + field_name.upper(), + term) for term in term_list] + return xapian.Query(xapian.Query.OP_PHRASE, term_list) def _term_query(self, term, field_name, field_type, exact=False): From 83d10b6cc1a627604fc394adb4e8b6dcaae369ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 13:14:43 +0200 Subject: [PATCH 09/38] Refactored _filter_startswith query constructor. Its interface is explained in the docstring. 
--- xapian_backend.py | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index f2a15b1..0d827a5 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1140,24 +1140,19 @@ class XapianSearchQuery(BaseSearchQuery): def _filter_startswith(self, term, field_name, field_type, is_not): """ - Private method that returns a xapian.Query that searches for any term - that begins with `term` in a specified `field`. - - Required arguments: - ``term`` -- The terms to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query + Returns a startswith query on the un-stemmed term. """ + # TODO: if field_type is of type long, we need to marsh the value. + if field_name: + query_string = '%s:%s*' % (field_name, term) + else: + query_string = '%s*' % term + + query = self.backend.parse_query(query_string) + if is_not: - return xapian.Query( - xapian.Query.OP_AND_NOT, - self._all_query(), - self.backend.parse_query('%s:%s*' % (field_name, term)), - ) - return self.backend.parse_query('%s:%s*' % (field_name, term)) + return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) + return query def _phrase_query(self, term_list, field_name): """ From 2969a749533c1214a2a7bcb80d7d1b60f67f3381 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 13:16:16 +0200 Subject: [PATCH 10/38] Refactored _filter_exact query constructor. Its interface is explained in the docstring. 
--- xapian_backend.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index 0d827a5..b057da8 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1121,18 +1121,14 @@ class XapianSearchQuery(BaseSearchQuery): def _filter_exact(self, term, field_name, field_type, is_not): """ - Private method that returns a xapian.Query that searches for an exact - match for `term` in a specified `field`. - - Required arguments: - ``term`` -- The term to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query + Returns a query that matches exactly the un-stemmed term + with positional order. """ - query = self._phrase_query(term.split(), field_name) + if ' ' in term: + query = self._phrase_query(term.split(), field_name) + else: + query = self._term_query(term, field_name, field_type, exact=True) + if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) else: From 99dc011a2560f26e30a691f65df90f46ae78976e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 13:19:17 +0200 Subject: [PATCH 11/38] Refactored _filter_in query constructor. --- xapian_backend.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index b057da8..c138513 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1092,27 +1092,16 @@ class XapianSearchQuery(BaseSearchQuery): def _filter_in(self, term_list, field_name, field_type, is_not): """ - Private method that returns a xapian.Query that searches for any term - of `value_list` in a specified `field`. + Returns a query that matches exactly ANY term in term_list. 
- Required arguments: - ``term_list`` -- The terms to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query + Notice that: + A in {B,C} <=> (A = B or A = C) + ~(A in {B,C}) <=> ~(A = B or A = C) + Because OP_AND_NOT(C, D) <=> (C and ~D), then D=(A in {B,C}) requires `is_not=False`. """ - query_list = [] - for term in term_list: - if ' ' in term: - query_list.append( - self._phrase_query(term.split(), field_name) - ) - else: - query_list.append( - self._term_query(term, field_name, field_type) - ) + query_list = [self._filter_exact(term, field_name, field_type, is_not=False) + for term in term_list] + if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), xapian.Query(xapian.Query.OP_OR, query_list)) From 89691ce86a279b109c65f750a7738ae0f432049f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 13:21:39 +0200 Subject: [PATCH 12/38] Added method to create OR queries from list of strings. --- xapian_backend.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/xapian_backend.py b/xapian_backend.py index c138513..a5da57e 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1139,6 +1139,13 @@ class XapianSearchQuery(BaseSearchQuery): return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) return query + def _or_query(self, term_list, field, field_type, exact=False): + """ + Joins each item of term_list decorated by _term_query with an OR. + """ + term_list = [self._term_query(term, field, field_type, exact) for term in term_list] + return xapian.Query(xapian.Query.OP_OR, term_list) + def _phrase_query(self, term_list, field_name): """ Returns a query that matches exact terms and with From 4d51f5e9af985c8172947e1600bcc9345210c822 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 13:31:34 +0200 Subject: [PATCH 13/38] Fixed #98 - queries are now consistent with Haystack 2.X. 
Refactored _filter_contains query constructor. --- xapian_backend.py | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index a5da57e..282a314 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1068,27 +1068,17 @@ class XapianSearchQuery(BaseSearchQuery): """ return xapian.Query('') - def _filter_contains(self, term, field_name, field_type, is_not): + def _filter_contains(self, sentence, field_name, field_type, is_not): """ - Private method that returns a xapian.Query that searches for `term` - in a specified `field`. - - Required arguments: - ``term`` -- The term to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query + Splits the sentence in terms and join them with OR, + using stemmed and un-stemmed. """ - if ' ' in term: - return self._filter_exact(term, field_name, field_type, is_not) + query = self._or_query(sentence.split(), field_name, field_type) + + if is_not: + return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) else: - query = self._term_query(term, field_name, field_type) - if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) - else: - return query + return query def _filter_in(self, term_list, field_name, field_type, is_not): """ From 5d16d1aca7ccb7261ae72317c51a1cdeddb7e27f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 13:40:59 +0200 Subject: [PATCH 14/38] Fixed #101 - Adds support for AutoQuery.
--- xapian_backend.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/xapian_backend.py b/xapian_backend.py index 282a314..16cb906 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -16,6 +16,7 @@ from haystack import connections from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, SearchNode, log_query from haystack.constants import ID, DJANGO_ID, DJANGO_CT from haystack.exceptions import HaystackError, MissingDependency +from haystack.inputs import AutoQuery from haystack.models import SearchResult from haystack.utils import get_identifier, get_model_ct @@ -1011,6 +1012,15 @@ class XapianSearchQuery(BaseSearchQuery): expression, term = child field_name, filter_type = search_node.split_expression(expression) + # Identify and parse AutoQuery + if isinstance(term, AutoQuery): + if field_name != 'content': + query = '%s:%s' % (field_name, term.prepare(self)) + else: + query = term.prepare(self) + query_list.append(self.backend.parse_query(query)) + continue + # Handle `ValuesListQuerySet`. if hasattr(term, 'values_list'): term = list(term) From d06da45d9dce0c003e974806ea06bd7408a73137 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 16:54:21 +0200 Subject: [PATCH 15/38] Simplified construction of Xapian queries. --- xapian_backend.py | 49 ++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index 16cb906..7cbbdbc 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1114,10 +1114,9 @@ class XapianSearchQuery(BaseSearchQuery): with positional order. 
""" if ' ' in term: - query = self._phrase_query(term.split(), field_name) + query = self._phrase_query(term.split(), field_name, field_type) else: - query = self._term_query(term, field_name, field_type, exact=True) - + query = self._term_query(term, field_name, field_type, exact=True, stemmed=False) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) else: @@ -1146,33 +1145,31 @@ class XapianSearchQuery(BaseSearchQuery): term_list = [self._term_query(term, field, field_type, exact) for term in term_list] return xapian.Query(xapian.Query.OP_OR, term_list) - def _phrase_query(self, term_list, field_name): + def _phrase_query(self, term_list, field_name, field_type): """ - Returns a query that matches exact terms and with + Returns a query that matches exact terms with positional order (i.e. ["this", "thing"] != ["thing", "this"]) + and no stem. - If `field_name` is given, this match is restricted to the field. + If `field_name` is not `None`, restrict to the field. """ - prefix = '' - if field_name in ('id', 'django_id', 'django_ct'): - prefix = TERM_PREFIXES[field_name] - elif field_name: - prefix = TERM_PREFIXES['field'] + term_list = [self._term_query(term, field_name, field_type, + stemmed=False) for term in term_list] - if field_name: - term_list = ['%s%s%s' % (prefix, - field_name.upper(), - term) for term in term_list] + query = xapian.Query(xapian.Query.OP_PHRASE, term_list) + return query - return xapian.Query(xapian.Query.OP_PHRASE, term_list) - - def _term_query(self, term, field_name, field_type, exact=False): + def _term_query(self, term, field_name, field_type, exact=False, stemmed=True): """ Constructs a query of a single term. If `field_name` is not `None`, the term is search on that field only. - If exact is `True`, the search is restricted to non-stemmed boolean match. + If exact is `True`, the search is restricted to boolean matches. """ + # using stemmed terms in exact query is not acceptable. 
+ if stemmed: + assert not exact + if field_name in ('id', 'django_id', 'django_ct'): # to ensure the value is serialized correctly. if field_name == 'django_id': @@ -1191,17 +1188,17 @@ class XapianSearchQuery(BaseSearchQuery): prefix = TERM_PREFIXES['field'] + field_name.upper() term = _marshal_value(term) - unstemmed = constructor.format(prefix=prefix, term=term) - if exact: - return xapian.Query(unstemmed) - else: + unstemmed_term = constructor.format(prefix=prefix, term=term) + if stemmed: stem = xapian.Stem(self.backend.language) - stemmed = 'Z' + constructor.format(prefix=prefix, term=stem(term)) + stemmed_term = 'Z' + constructor.format(prefix=prefix, term=stem(term)) return xapian.Query(xapian.Query.OP_OR, - xapian.Query(stemmed), - xapian.Query(unstemmed) + xapian.Query(stemmed_term), + xapian.Query(unstemmed_term) ) + else: + return unstemmed_term def _filter_gt(self, term, field, is_not): return self._filter_lte(term, field, is_not=not is_not) From 1b764d7ddf6b5243d18d6ee1ee909421e4a6e040 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 14:14:34 +0200 Subject: [PATCH 16/38] Removed dependency on type long; field_type can now be datetime. 
--- .../xapian_tests/tests/test_xapian_backend.py | 4 ++-- xapian_backend.py | 22 ++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index 3245ba2..ece61ee 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -457,7 +457,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(len(fields), 14 + 3) self.assertEqual(fields, [ {'column': 0, 'type': 'text', 'field_name': 'id', 'multi_valued': 'false'}, - {'column': 1, 'type': 'long', 'field_name': 'django_id', 'multi_valued': 'false'}, + {'column': 1, 'type': 'integer', 'field_name': 'django_id', 'multi_valued': 'false'}, {'column': 2, 'type': 'text', 'field_name': 'django_ct', 'multi_valued': 'false'}, {'column': 3, 'type': 'text', 'field_name': 'empty', 'multi_valued': 'false'}, {'column': 4, 'type': 'date', 'field_name': 'exp_date', 'multi_valued': 'false'}, @@ -472,7 +472,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): {'column': 13, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'}, {'column': 14, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'}, {'column': 15, 'type': 'text', 'field_name': 'url', 'multi_valued': 'false'}, - {'column': 16, 'type': 'long', 'field_name': 'value', 'multi_valued': 'false'} + {'column': 16, 'type': 'integer', 'field_name': 'value', 'multi_valued': 'false'} ]) def test_parse_query(self): diff --git a/xapian_backend.py b/xapian_backend.py index 7cbbdbc..626d00e 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -80,7 +80,7 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor): if not begin: if field_dict['type'] == 'text': begin = 'a' # TODO: A better way of getting a min text value? 
- elif field_dict['type'] == 'long': + elif field_dict['type'] == 'integer': begin = -sys.maxint - 1 elif field_dict['type'] == 'float': begin = float('-inf') @@ -89,7 +89,7 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor): elif end == '*': if field_dict['type'] == 'text': end = 'z' * 100 # TODO: A better way of getting a max text value? - elif field_dict['type'] == 'long': + elif field_dict['type'] == 'integer': end = sys.maxint elif field_dict['type'] == 'float': end = float('inf') @@ -98,9 +98,9 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor): if field_dict['type'] == 'float': begin = _marshal_value(float(begin)) end = _marshal_value(float(end)) - elif field_dict['type'] == 'long': - begin = _marshal_value(long(begin)) - end = _marshal_value(long(end)) + elif field_dict['type'] == 'integer': + begin = _marshal_value(int(begin)) + end = _marshal_value(int(end)) return field_dict['column'], str(begin), str(end) @@ -642,7 +642,7 @@ class XapianSearchBackend(BaseSearchBackend): 'multi_valued': 'false', 'column': 0}, {'field_name': DJANGO_ID, - 'type': 'long', + 'type': 'integer', 'multi_valued': 'false', 'column': 1}, {'field_name': DJANGO_CT, @@ -668,10 +668,12 @@ class XapianSearchBackend(BaseSearchBackend): 'column': column, } - if field_class.field_type in ['date', 'datetime']: + if field_class.field_type == 'date': field_data['type'] = 'date' + elif field_class.field_type == 'datetime': + field_data['type'] = 'datetime' elif field_class.field_type == 'integer': - field_data['type'] = 'long' + field_data['type'] = 'integer' elif field_class.field_type == 'float': field_data['type'] = 'float' elif field_class.field_type == 'boolean': @@ -1126,7 +1128,7 @@ class XapianSearchQuery(BaseSearchQuery): """ Returns a startswith query on the un-stemmed term. """ - # TODO: if field_type is of type long, we need to marsh the value. + # TODO: if field_type is of type integer, we need to marsh the value. 
if field_name: query_string = '%s:%s*' % (field_name, term) else: @@ -1250,7 +1252,7 @@ def _marshal_value(value): value = 'f' elif isinstance(value, float): value = xapian.sortable_serialise(value) - elif isinstance(value, (int, long)): + elif isinstance(value, int): value = '%012d' % value else: value = force_text(value).lower() From f2b9c062a6be44a7c240a7c0555c7c3d208b5a52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 23:18:22 +0200 Subject: [PATCH 17/38] Fixed #56 - Implements single valued field facets using Xapian. Replaces the brute force _do_field_facets with Xapian faceting for single valued facets. Xapian does not support multi valued facets yet; thus the brute force is used in this case. --- xapian_backend.py | 87 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 68 insertions(+), 19 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index 626d00e..28ed0f4 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -49,6 +49,10 @@ DEFAULT_XAPIAN_FLAGS = ( xapian.QueryParser.FLAG_PURE_NOT ) +# number of documents checked by default when building facets +# this must be improved to be relative to the total number of docs. 
+DEFAULT_CHECK_AT_LEAST = 1000 + class InvalidIndexError(HaystackError): """Raised when an index can not be opened.""" @@ -458,6 +462,12 @@ class XapianSearchBackend(BaseSearchBackend): if not end_offset: end_offset = database.get_doccount() - start_offset + ## prepare spies in case of facets + if facets: + facets_spies = self._prepare_facet_field_spies(facets) + for spy in facets_spies: + enquire.add_matchspy(spy) + matches = self._get_enquire_mset(database, enquire, start_offset, end_offset) for match in matches: @@ -473,9 +483,18 @@ class XapianSearchBackend(BaseSearchBackend): ) if facets: - facets_dict['fields'] = self._do_field_facets(results, facets) + # pick single valued facets from spies + single_facets_dict = self._process_facet_field_spies(facets_spies) + + # pick multivalued valued facets from results + multi_facets_dict = self._do_multivalued_field_facets(results, facets) + + # merge both results (http://stackoverflow.com/a/38990/931303) + facets_dict['fields'] = dict(list(single_facets_dict.items()) + list(multi_facets_dict.items())) + if date_facets: facets_dict['dates'] = self._do_date_facets(results, date_facets) + if query_facets: facets_dict['queries'] = self._do_query_facets(results, query_facets) @@ -708,33 +727,56 @@ class XapianSearchBackend(BaseSearchBackend): return content - def _do_field_facets(self, results, field_facets): + def _prepare_facet_field_spies(self, facets): """ - Private method that facets a document by field name. + Returns a list of spies based on the facets + used to count frequencies. + """ + spies = [] + for facet in facets: + slot = self.column(facet) + spy = xapian.ValueCountMatchSpy(slot) + # add attribute "slot" to know which column this spy is targeting. + spy.slot = slot + spies.append(spy) + return spies - Fields of type MultiValueField will be faceted on each item in the - (containing) list. 
+ def _process_facet_field_spies(self, spies): + """ + Returns a dict of facet names with lists of + tuples of the form (term, term_frequency) + from a list of spies that observed the enquire. + """ + facet_dict = {} + for spy in spies: + field = self.schema[spy.slot] + field_name = field['field_name'] + facet_dict[field_name] = [] + for facet in spy.values(): + facet_dict[field_name].append((_xapian_to_python(facet.term), facet.termfreq)) + return facet_dict - Required arguments: - `results` -- A list SearchResults to facet - `field_facets` -- A list of fields to facet on + def _do_multivalued_field_facets(self, results, field_facets): + """ + Implements a multivalued field facet on the results. + + This is implemented using brute force - O(N^2) - + because Xapian does not have it implemented yet + (see http://trac.xapian.org/ticket/199) """ facet_dict = {} - # DS_TODO: Improve this algorithm. Currently, runs in O(N^2), ouch. for field in field_facets: facet_list = {} + if not self._multi_value_field(field): + continue for result in results: field_value = getattr(result, field) - if self._multi_value_field(field): - for item in field_value: # Facet each item in a MultiValueField - facet_list[item] = facet_list.get(item, 0) + 1 - else: - facet_list[field_value] = facet_list.get(field_value, 0) + 1 + for item in field_value: # Facet each item in a MultiValueField + facet_list[item] = facet_list.get(item, 0) + 1 facet_dict[field] = facet_list.items() - return facet_dict @staticmethod @@ -834,7 +876,6 @@ class XapianSearchBackend(BaseSearchBackend): eg. 
{'name': ('a*', 5)} """ facet_dict = {} - for field, query in dict(query_facets).items(): facet_dict[field] = (query, self.search(self.parse_query(query))['hits']) @@ -890,7 +931,7 @@ class XapianSearchBackend(BaseSearchBackend): return database @staticmethod - def _get_enquire_mset(database, enquire, start_offset, end_offset): + def _get_enquire_mset(database, enquire, start_offset, end_offset, checkatleast=DEFAULT_CHECK_AT_LEAST): """ A safer version of Xapian.enquire.get_mset @@ -904,10 +945,10 @@ class XapianSearchBackend(BaseSearchBackend): `end_offset` -- The end offset to pass to `enquire.get_mset` """ try: - return enquire.get_mset(start_offset, end_offset) + return enquire.get_mset(start_offset, end_offset, checkatleast) except xapian.DatabaseModifiedError: database.reopen() - return enquire.get_mset(start_offset, end_offset) + return enquire.get_mset(start_offset, end_offset, checkatleast) @staticmethod def _get_document_data(database, document): @@ -1259,6 +1300,14 @@ def _marshal_value(value): return value +def _xapian_to_python(value): + if value == 't': + return True + elif value == 'f': + return False + return value + + def _marshal_term(term): """ Private utility method that converts Python terms to a string for Xapian terms. From b623ea2556ec5f1ea5ef578b01275be2d632ad09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 23:43:29 +0200 Subject: [PATCH 18/38] Fixed #109 - Raises InvalidIndexError when facet is not indexed. Also added regression test. 
--- tests/xapian_tests/tests/test_xapian_backend.py | 6 ++++++ xapian_backend.py | 16 ++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index ece61ee..868756b 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -305,6 +305,12 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(results['facets']['fields']['sites'], [('1', 1), ('3', 2), ('2', 2), ('4', 1), ('6', 2), ('9', 1)]) + def test_raise_index_error_on_wrong_field(self): + """ + Regression test for #109. + """ + self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas']) + def test_date_facets(self): facets = {'pub_date': {'start_date': datetime.datetime(2008, 10, 26), 'end_date': datetime.datetime(2009, 3, 26), diff --git a/xapian_backend.py b/xapian_backend.py index 28ed0f4..4736c4b 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -366,6 +366,18 @@ class XapianSearchBackend(BaseSearchBackend): return query + def _check_field_names(self, field_names): + """ + Raises InvalidIndexError if any of a field_name in field_names is + not indexed. 
+ """ + if field_names: + for field_name in field_names: + try: + self.column(field_name) + except KeyError: + raise InvalidIndexError('Trying to use non indexed field "%s"' % field_name) + @log_query def search(self, query, sort_by=None, start_offset=0, end_offset=None, fields='', highlight=False, facets=None, date_facets=None, @@ -414,6 +426,10 @@ class XapianSearchBackend(BaseSearchBackend): 'hits': 0, } + self._check_field_names(facets) + self._check_field_names(date_facets) + self._check_field_names(query_facets) + database = self._database() if result_class is None: From 1c3a7ff4a58f2acd62b13fd25f4eedb31e32db9f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 09:09:12 +0200 Subject: [PATCH 19/38] Fixed #126 - Deprecates microsecond indexing. --- tests/xapian_tests/tests/test_xapian_backend.py | 1 - xapian_backend.py | 14 ++++---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index 868756b..cde0f73 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -453,7 +453,6 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 9, 16, 14)), '20090509161400') self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 9, 0, 0)), '20090509000000') self.assertEqual(_marshal_value(datetime.datetime(1899, 5, 18, 0, 0)), '18990518000000') - self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 18, 1, 16, 30, 250)), '20090518011630000250') def test_build_schema(self): search_fields = connections['default'].get_unified_index().all_searchfields() diff --git a/xapian_backend.py b/xapian_backend.py index 4736c4b..8af9ee4 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1342,16 +1342,10 @@ def _marshal_date(d): def _marshal_datetime(dt): - if dt.microsecond: - return 
'%04d%02d%02d%02d%02d%02d%06d' % ( - dt.year, dt.month, dt.day, dt.hour, - dt.minute, dt.second, dt.microsecond - ) - else: - return '%04d%02d%02d%02d%02d%02d' % ( - dt.year, dt.month, dt.day, dt.hour, - dt.minute, dt.second - ) + return '%04d%02d%02d%02d%02d%02d' % ( + dt.year, dt.month, dt.day, dt.hour, + dt.minute, dt.second + ) class XapianEngine(BaseEngine): From 6576ee883e37c2e592e1ee2636a1a3a186d185ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sat, 17 May 2014 23:43:29 +0200 Subject: [PATCH 20/38] Fixed #109 - Raises InvalidIndexError when facet is not indexed. Also added regression test. --- tests/xapian_tests/tests/test_xapian_backend.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index cde0f73..bf2a118 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -311,6 +311,12 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): """ self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas']) + def test_raise_index_error_on_wrong_field(self): + """ + Regression test for #109. + """ + self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas']) + def test_date_facets(self): facets = {'pub_date': {'start_date': datetime.datetime(2008, 10, 26), 'end_date': datetime.datetime(2009, 3, 26), From e85b503aaaed64a2ecf6446bf663755383cb1bb5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 13:21:17 +0200 Subject: [PATCH 21/38] Changed variable name inside a function. 
--- xapian_backend.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index 8af9ee4..ea53546 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -81,24 +81,27 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor): begin = begin[colon + 1:len(begin)] for field_dict in self.backend.schema: if field_dict['field_name'] == field_name: + field_type = field_dict['type'] + if not begin: - if field_dict['type'] == 'text': + if field_type == 'text': begin = 'a' # TODO: A better way of getting a min text value? - elif field_dict['type'] == 'integer': + elif field_type == 'integer': begin = -sys.maxint - 1 - elif field_dict['type'] == 'float': + elif field_type == 'float': begin = float('-inf') - elif field_dict['type'] == 'date' or field_dict['type'] == 'datetime': + elif field_type == 'date' or field_type == 'datetime': begin = '00010101000000' elif end == '*': - if field_dict['type'] == 'text': + if field_type == 'text': end = 'z' * 100 # TODO: A better way of getting a max text value? - elif field_dict['type'] == 'integer': + elif field_type == 'integer': end = sys.maxint - elif field_dict['type'] == 'float': + elif field_type == 'float': end = float('inf') - elif field_dict['type'] == 'date' or field_dict['type'] == 'datetime': + elif field_type == 'date' or field_type == 'datetime': end = '99990101000000' + if field_dict['type'] == 'float': begin = _marshal_value(float(begin)) end = _marshal_value(float(end)) From 8c14898088641b5a27e61d4b94fe1f097d30b92b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 14:07:53 +0200 Subject: [PATCH 22/38] Made general improvement to the code. 
--- .../xapian_tests/tests/test_xapian_backend.py | 71 ++++--- xapian_backend.py | 194 +++++++++++------- 2 files changed, 160 insertions(+), 105 deletions(-) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index bf2a118..8ad7e2a 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -12,7 +12,7 @@ from django.test import TestCase from haystack import connections, reset_search_queries from haystack import indexes -from haystack.backends.xapian_backend import InvalidIndexError, _marshal_value +from haystack.backends.xapian_backend import InvalidIndexError, _term_to_xapian_value from haystack.models import SearchResult from haystack.query import SearchQuerySet, SQ from haystack.utils.loading import UnifiedIndex @@ -156,7 +156,7 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase): mock = XapianMockModel() mock.id = 1 mock.author = 'david' - mock.pub_date = datetime.date(2009, 2, 25) + mock.pub_date = datetime.datetime(2009, 2, 25) self.backend.update(self.index, [mock]) @@ -311,12 +311,6 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): """ self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas']) - def test_raise_index_error_on_wrong_field(self): - """ - Regression test for #109. 
- """ - self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas']) - def test_date_facets(self): facets = {'pub_date': {'start_date': datetime.datetime(2008, 10, 26), 'end_date': datetime.datetime(2009, 3, 26), @@ -448,17 +442,21 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual([result.month for result in self.backend.search(xapian.Query(''))['results']], ['02', '02', '02']) - def test__marshal_value(self): - self.assertEqual(_marshal_value('abc'), 'abc') - self.assertEqual(_marshal_value(1), '000000000001') - self.assertEqual(_marshal_value(2653), '000000002653') - self.assertEqual(_marshal_value(25.5), b'\xb2`') - self.assertEqual(_marshal_value([1, 2, 3]), '[1, 2, 3]') - self.assertEqual(_marshal_value((1, 2, 3)), '(1, 2, 3)') - self.assertEqual(_marshal_value({'a': 1, 'c': 3, 'b': 2}), "{u'a': 1, u'c': 3, u'b': 2}") - self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 9, 16, 14)), '20090509161400') - self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 9, 0, 0)), '20090509000000') - self.assertEqual(_marshal_value(datetime.datetime(1899, 5, 18, 0, 0)), '18990518000000') + def test_term_to_xapian_value(self): + self.assertEqual(_term_to_xapian_value('abc', 'text'), 'abc') + self.assertEqual(_term_to_xapian_value(1, 'integer'), '000000000001') + self.assertEqual(_term_to_xapian_value(2653, 'integer'), '000000002653') + self.assertEqual(_term_to_xapian_value(25.5, 'float'), b'\xb2`') + self.assertEqual(_term_to_xapian_value([1, 2, 3], 'text'), '[1, 2, 3]') + self.assertEqual(_term_to_xapian_value((1, 2, 3), 'text'), '(1, 2, 3)') + self.assertEqual(_term_to_xapian_value({'a': 1, 'c': 3, 'b': 2}, 'text'), + "{u'a': 1, u'c': 3, u'b': 2}") + self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 16, 14), 'datetime'), + '20090509161400') + self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 0, 0), 'date'), + '20090509000000') + 
self.assertEqual(_term_to_xapian_value(datetime.datetime(1899, 5, 18, 0, 0), 'date'), + '18990518000000') def test_build_schema(self): search_fields = connections['default'].get_unified_index().all_searchfields() @@ -558,8 +556,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): class LiveXapianMockSearchIndex(indexes.SearchIndex): text = indexes.CharField(document=True, use_template=True) name = indexes.CharField(model_attr='author', faceted=True) - pub_date = indexes.DateField(model_attr='pub_date') - created = indexes.DateField() + pub_date = indexes.DateTimeField(model_attr='pub_date') title = indexes.CharField() def get_model(self): @@ -593,35 +590,41 @@ class LiveXapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): def test_build_query_gt(self): self.sq.add_filter(SQ(name__gt='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT VALUE_RANGE 4 a m))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT VALUE_RANGE 3 a m))') def test_build_query_gte(self): self.sq.add_filter(SQ(name__gte='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 4 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzz)') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzz)') def test_build_query_lt(self): self.sq.add_filter(SQ(name__lt='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT VALUE_RANGE 4 m zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))') + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT ' + 'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))') def test_build_query_lte(self): 
self.sq.add_filter(SQ(name__lte='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 4 a m)') + self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 3 a m)') def test_build_query_multiple_filter_types(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(SQ(pub_date__lte=datetime.datetime(2009, 2, 10, 1, 59, 0))) self.sq.add_filter(SQ(name__gt='david')) - self.sq.add_filter(SQ(created__lt=datetime.datetime(2009, 2, 12, 12, 13, 0))) self.sq.add_filter(SQ(title__gte='B')) - self.sq.add_filter(SQ(id__in=[1, 2, 3])) + self.sq.add_filter(SQ(django_id__in=[1, 2, 3])) self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(((Zwhi OR why) AND VALUE_RANGE 6 00010101000000 20090210015900 AND ' - '( AND_NOT VALUE_RANGE 4 a david) AND ' - '( AND_NOT VALUE_RANGE 3 20090212121300 99990101000000) AND ' - 'VALUE_RANGE 8 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND ' - '(Q1 OR Q2 OR Q3)))') + 'Xapian::Query(((Zwhi OR why) AND ' + 'VALUE_RANGE 5 00010101000000 20090210015900 AND ' + '( AND_NOT VALUE_RANGE 3 a david) AND ' + 'VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND ' + '(QQ000000000001 OR QQ000000000002 OR QQ000000000003)))') def test_log_query(self): reset_search_queries() diff --git a/xapian_backend.py b/xapian_backend.py index ea53546..ed3ea97 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -53,6 +53,14 @@ DEFAULT_XAPIAN_FLAGS = ( # this must be improved to be relative to the total number of docs. 
DEFAULT_CHECK_AT_LEAST = 1000 +# field types accepted to be serialized as values in Xapian +FIELD_TYPES = {'text', 'integer', 'date', 'datetime', 'float', 'boolean'} + +# defines the format used to store types in Xapian +# this format ensures datetimes are sorted correctly +DATETIME_FORMAT = '%Y%m%d%H%M%S' +INTEGER_FORMAT = '%012d' + class InvalidIndexError(HaystackError): """Raised when an index can not be opened.""" @@ -102,12 +110,12 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor): elif field_type == 'date' or field_type == 'datetime': end = '99990101000000' - if field_dict['type'] == 'float': - begin = _marshal_value(float(begin)) - end = _marshal_value(float(end)) - elif field_dict['type'] == 'integer': - begin = _marshal_value(int(begin)) - end = _marshal_value(int(end)) + if field_type == 'float': + begin = _term_to_xapian_value(float(begin), field_type) + end = _term_to_xapian_value(float(end), field_type) + elif field_type == 'integer': + begin = _term_to_xapian_value(int(begin), field_type) + end = _term_to_xapian_value(int(end), field_type) return field_dict['column'], str(begin), str(end) @@ -262,28 +270,27 @@ class XapianSearchBackend(BaseSearchBackend): else: weight = 1 + value = data[field['field_name']] + # Private fields are indexed in a different way: + # `django_id` is an int and `django_ct` is text; + # besides, they are indexed by their (unstemmed) value. if field['field_name'] in ('id', 'django_id', 'django_ct'): - term = data[field['field_name']] - - # django_id is always an integer, thus we send - # it to _marshal_value as int to guarantee it - # is stored as a sortable number. 
if field['field_name'] == 'django_id': - term = int(term) - term = _marshal_value(term) + value = int(value) + value = _term_to_xapian_value(value, field['type']) - document.add_term(TERM_PREFIXES[field['field_name']] + term, weight) - document.add_value(field['column'], term) + document.add_term(TERM_PREFIXES[field['field_name']] + value, weight) + document.add_value(field['column'], value) else: - value = data[field['field_name']] prefix = TERM_PREFIXES['field'] + field['field_name'].upper() + # if not multi_valued, we add a value and construct a one-element list if field['multi_valued'] == 'false': - document.add_value(field['column'], _marshal_value(value)) + document.add_value(field['column'], _term_to_xapian_value(value, field['type'])) value = [value] for term in value: - term = _marshal_term(term) + term = _to_xapian_term(term) if field['type'] == 'text': term_generator.index_text(term, weight) term_generator.index_text(term, weight, prefix) @@ -769,10 +776,12 @@ class XapianSearchBackend(BaseSearchBackend): facet_dict = {} for spy in spies: field = self.schema[spy.slot] - field_name = field['field_name'] + field_name, field_type = field['field_name'], field['type'] + facet_dict[field_name] = [] for facet in spy.values(): - facet_dict[field_name].append((_xapian_to_python(facet.term), facet.termfreq)) + facet_dict[field_name].append((_from_xapian_value(facet.term, field_type), + facet.termfreq)) return facet_dict def _do_multivalued_field_facets(self, results, field_facets): @@ -1087,11 +1096,6 @@ class XapianSearchQuery(BaseSearchQuery): if hasattr(term, 'values_list'): term = list(term) - if isinstance(term, (list, tuple)): - term = [_marshal_term(t) for t in term] - else: - term = _marshal_term(term) - if field_name == 'content': # content is the generic search: # force no field_name search @@ -1106,9 +1110,14 @@ class XapianSearchQuery(BaseSearchQuery): if filter_type == 'contains': filter_type = None else: - # pick the field_type from the backend + # 
get the field_type from the backend field_type = self.backend.schema[self.backend.column(field_name)]['type'] + # private fields don't accept 'contains' or 'startswith' + # since they have no meaning. + if filter_type in ('contains', 'startswith') and field_name in ('id', 'django_id', 'django_ct'): + filter_type = 'exact' + if filter_type == 'contains': query_list.append(self._filter_contains(term, field_name, field_type, is_not)) elif filter_type == 'exact': @@ -1118,13 +1127,13 @@ class XapianSearchQuery(BaseSearchQuery): elif filter_type == 'startswith': query_list.append(self._filter_startswith(term, field_name, field_type, is_not)) elif filter_type == 'gt': - query_list.append(self._filter_gt(term, field_name, is_not)) + query_list.append(self._filter_gt(term, field_name, field_type, is_not)) elif filter_type == 'gte': - query_list.append(self._filter_gte(term, field_name, is_not)) + query_list.append(self._filter_gte(term, field_name, field_type, is_not)) elif filter_type == 'lt': - query_list.append(self._filter_lt(term, field_name, is_not)) + query_list.append(self._filter_lt(term, field_name, field_type, is_not)) elif filter_type == 'lte': - query_list.append(self._filter_lte(term, field_name, is_not)) + query_list.append(self._filter_lte(term, field_name, field_type, is_not)) if search_node.connector == 'OR': return xapian.Query(xapian.Query.OP_OR, query_list) @@ -1133,20 +1142,21 @@ class XapianSearchQuery(BaseSearchQuery): def _all_query(self): """ - Private method that returns a xapian.Query that returns all documents, - - Returns: - A xapian.Query + Returns a match all query. """ return xapian.Query('') - def _filter_contains(self, sentence, field_name, field_type, is_not): + def _filter_contains(self, term, field_name, field_type, is_not): """ Splits the sentence in terms and join them with OR, using stemmed and un-stemmed. 
""" - query = self._or_query(sentence.split(), field_name, field_type) + if field_type == 'text': + term_list = term.split() + else: + term_list = [term] + query = self._or_query(term_list, field_name, field_type) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) else: @@ -1175,10 +1185,15 @@ class XapianSearchQuery(BaseSearchQuery): Returns a query that matches exactly the un-stemmed term with positional order. """ - if ' ' in term: + + # this is an hack: + # the ideal would be to use the same idea as in _filter_contains. + # However, it causes tests to fail. + if field_type == 'text' and ' ' in term: query = self._phrase_query(term.split(), field_name, field_type) else: query = self._term_query(term, field_name, field_type, exact=True, stemmed=False) + if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) else: @@ -1236,7 +1251,7 @@ class XapianSearchQuery(BaseSearchQuery): # to ensure the value is serialized correctly. if field_name == 'django_id': term = int(term) - term = _marshal_value(term) + term = _term_to_xapian_value(term, field_type) return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term)) constructor = '{prefix}{term}' @@ -1248,7 +1263,7 @@ class XapianSearchQuery(BaseSearchQuery): prefix = '' if field_name: prefix = TERM_PREFIXES['field'] + field_name.upper() - term = _marshal_value(term) + term = _to_xapian_term(term) unstemmed_term = constructor.format(prefix=prefix, term=term) if stemmed: @@ -1262,19 +1277,19 @@ class XapianSearchQuery(BaseSearchQuery): else: return unstemmed_term - def _filter_gt(self, term, field, is_not): - return self._filter_lte(term, field, is_not=not is_not) + def _filter_gt(self, term, field_name, field_type, is_not): + return self._filter_lte(term, field_name, field_type, is_not=not is_not) - def _filter_lt(self, term, field, is_not): - return self._filter_gte(term, field, is_not=not is_not) + def _filter_lt(self, term, field_name, field_type, is_not): + 
return self._filter_gte(term, field_name, field_type, is_not=not is_not) - def _filter_gte(self, term, field, is_not): + def _filter_gte(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for any term that is greater than `term` in a specified `field`. """ vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:%s' % (field, _marshal_value(term)), '*') + pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term, field_type)), '*') if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), @@ -1282,13 +1297,13 @@ class XapianSearchQuery(BaseSearchQuery): ) return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - def _filter_lte(self, term, field, is_not): + def _filter_lte(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for any term that is less than `term` in a specified `field`. """ vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:' % field, '%s' % _marshal_value(term)) + pos, begin, end = vrp('%s:' % field_name, '%s' % _term_to_xapian_value(term, field_type)) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), @@ -1297,39 +1312,47 @@ class XapianSearchQuery(BaseSearchQuery): return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) -def _marshal_value(value): +def _term_to_xapian_value(term, field_type): """ - Private utility method that converts Python values to a string for Xapian values. + Converts a term to a serialized + Xapian value based on the field_type. 
""" - if isinstance(value, datetime.datetime): - value = _marshal_datetime(value) - elif isinstance(value, datetime.date): - value = _marshal_date(value) - elif isinstance(value, bool): - if value: + assert field_type in FIELD_TYPES + + def strf(dt): + """ + Equivalent to datetime.datetime.strptime(dt, DATETIME_FORMAT) + but accepts years below 1900 (see http://stackoverflow.com/q/10263956/931303) + """ + return '%04d%02d%02d%02d%02d%02d' % ( + dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second) + + if field_type == 'boolean': + assert isinstance(term, bool) + if term: value = 't' else: value = 'f' - elif isinstance(value, float): - value = xapian.sortable_serialise(value) - elif isinstance(value, int): - value = '%012d' % value - else: - value = force_text(value).lower() + + elif field_type == 'integer': + value = INTEGER_FORMAT % term + elif field_type == 'float': + value = xapian.sortable_serialise(term) + elif field_type == 'date' or field_type == 'datetime': + if field_type == 'date': + # http://stackoverflow.com/a/1937636/931303 and comments + term = datetime.datetime.combine(term, datetime.time()) + value = strf(term) + else: # field_type == 'text' + value = _to_xapian_term(term) + return value -def _xapian_to_python(value): - if value == 't': - return True - elif value == 'f': - return False - return value - - -def _marshal_term(term): +def _to_xapian_term(term): """ - Private utility method that converts Python terms to a string for Xapian terms. + Converts a Python type to a + Xapian term that can be indexed. """ if isinstance(term, datetime.datetime): term = _marshal_datetime(term) @@ -1340,6 +1363,35 @@ def _marshal_term(term): return term +def _from_xapian_value(value, field_type): + """ + Converts a serialized Xapian value + to Python equivalent based on the field_type. + + Doesn't accept multivalued fields. 
+ """ + assert field_type in FIELD_TYPES + if field_type == 'boolean': + if value == 't': + return True + elif value == 'f': + return False + else: + InvalidIndexError('Field type "%d" does not accept value "%s"' % (field_type, value)) + elif field_type == 'integer': + return int(value) + elif field_type == 'float': + return xapian.sortable_unserialise(value) + elif field_type == 'date' or field_type == 'datetime': + datetime_value = datetime.datetime.strptime(value, DATETIME_FORMAT) + if field_type == 'datetime': + return datetime_value + else: + return datetime_value.date() + else: # field_type == 'text' + return value + + def _marshal_date(d): return '%04d%02d%02d000000' % (d.year, d.month, d.day) From 552786b6ec6bb57cb372bfb881f773d9c6163cb1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 16:36:53 +0200 Subject: [PATCH 23/38] Changed backend column method to a property. --- xapian_backend.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index ed3ea97..fa18fc5 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -200,12 +200,13 @@ class XapianSearchBackend(BaseSearchBackend): self._update_cache() return self._content_field_name - def column(self, field_name): + @property + def column(self): """ Returns the column in the database of a given field name. 
""" self._update_cache() - return self._columns[field_name] + return self._columns def update(self, index, iterable): """ @@ -384,7 +385,7 @@ class XapianSearchBackend(BaseSearchBackend): if field_names: for field_name in field_names: try: - self.column(field_name) + self.column[field_name] except KeyError: raise InvalidIndexError('Trying to use non indexed field "%s"' % field_name) @@ -474,7 +475,7 @@ class XapianSearchBackend(BaseSearchBackend): sort_field = sort_field[1:] # Strip the '-' else: reverse = False # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311 - sorter.add(self.column(sort_field), reverse) + sorter.add(self.column[sort_field], reverse) enquire.set_sort_by_key_then_relevance(sorter, True) @@ -760,7 +761,7 @@ class XapianSearchBackend(BaseSearchBackend): """ spies = [] for facet in facets: - slot = self.column(facet) + slot = self.column[facet] spy = xapian.ValueCountMatchSpy(slot) # add attribute "slot" to know which column this spy is targeting. spy.slot = slot @@ -1111,7 +1112,7 @@ class XapianSearchQuery(BaseSearchQuery): filter_type = None else: # get the field_type from the backend - field_type = self.backend.schema[self.backend.column(field_name)]['type'] + field_type = self.backend.schema[self.backend.column[field_name]]['type'] # private fields don't accept 'contains' or 'startswith' # since they have no meaning. From bb74dae2c0728d002048ff980d212f89409e64fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 16:38:22 +0200 Subject: [PATCH 24/38] Improved how queries are constructed. 
--- xapian_backend.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/xapian_backend.py b/xapian_backend.py index fa18fc5..e825005 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1084,6 +1084,9 @@ class XapianSearchQuery(BaseSearchQuery): expression, term = child field_name, filter_type = search_node.split_expression(expression) + if field_name != 'content' and field_name not in self.backend.column: + raise InvalidIndexError('field "%s" not indexed' % field_name) + # Identify and parse AutoQuery if isinstance(term, AutoQuery): if field_name != 'content': @@ -1104,6 +1107,11 @@ class XapianSearchQuery(BaseSearchQuery): field_name = None field_type = 'text' + # we don't know what is the type(term), so we parse it. + # Ideally this would not be required, but + # some filters currently depend on the term to make decisions. + term = _to_xapian_term(term) + query_list.append(self._filter_contains(term, field_name, field_type, is_not)) # when filter has no filter_type, haystack uses # filter_type = 'contains'. Here we remove it @@ -1119,6 +1127,17 @@ class XapianSearchQuery(BaseSearchQuery): if filter_type in ('contains', 'startswith') and field_name in ('id', 'django_id', 'django_ct'): filter_type = 'exact' + if field_type == 'text': + # we don't know what type "term" is, but we know we are searching as text + # so we parse it like that. + # Ideally this would not be required since _term_query does it, but + # some filters currently depend on the term to make decisions. 
+ if isinstance(term, list): + term = [_to_xapian_term(term) for term in term] + else: + term = _to_xapian_term(term) + + # todo: we should check that the filter is valid for this field_type or raise InvalidIndexError if filter_type == 'contains': query_list.append(self._filter_contains(term, field_name, field_type, is_not)) elif filter_type == 'exact': @@ -1151,6 +1170,8 @@ class XapianSearchQuery(BaseSearchQuery): """ Splits the sentence in terms and join them with OR, using stemmed and un-stemmed. + + Assumes term is not a list. """ if field_type == 'text': term_list = term.split() @@ -1171,6 +1192,8 @@ class XapianSearchQuery(BaseSearchQuery): A in {B,C} <=> (A = B or A = C) ~(A in {B,C}) <=> ~(A = B or A = C) Because OP_AND_NOT(C, D) <=> (C and ~D), then D=(A in {B,C}) requires `is_not=False`. + + Assumes term is a list. """ query_list = [self._filter_exact(term, field_name, field_type, is_not=False) for term in term_list] @@ -1185,6 +1208,8 @@ class XapianSearchQuery(BaseSearchQuery): """ Returns a query that matches exactly the un-stemmed term with positional order. + + Assumes term is not a list. """ # this is an hack: @@ -1203,6 +1228,8 @@ class XapianSearchQuery(BaseSearchQuery): def _filter_startswith(self, term, field_name, field_type, is_not): """ Returns a startswith query on the un-stemmed term. + + Assumes term is not a list. """ # TODO: if field_type is of type integer, we need to marsh the value. if field_name: From 107e81be2d6aaa312b986b13505fcb9b38f82d0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 16:38:54 +0200 Subject: [PATCH 25/38] Fixed code error (tests were not passing). 
--- xapian_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/xapian_backend.py b/xapian_backend.py index e825005..d6e6c83 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1062,7 +1062,8 @@ class XapianSearchQuery(BaseSearchQuery): if self.boost: subqueries = [ xapian.Query( - xapian.Query.OP_SCALE_WEIGHT, self._content_field(term, False), value + xapian.Query.OP_SCALE_WEIGHT, + self._term_query(term, None, None), value ) for term, value in self.boost.iteritems() ] query = xapian.Query( From 453129d60e9b56ce9f912ef2d01fe0e5af5a706b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 16:42:33 +0200 Subject: [PATCH 26/38] Simplified code. --- xapian_backend.py | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index d6e6c83..408ed35 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1384,12 +1384,12 @@ def _to_xapian_term(term): Xapian term that can be indexed. 
""" if isinstance(term, datetime.datetime): - term = _marshal_datetime(term) + value = term.strftime(DATETIME_FORMAT) elif isinstance(term, datetime.date): - term = _marshal_date(term) + value = term.strftime(DATETIME_FORMAT) else: - term = force_text(term).lower() - return term + value = force_text(term).lower() + return value def _from_xapian_value(value, field_type): @@ -1421,17 +1421,6 @@ def _from_xapian_value(value, field_type): return value -def _marshal_date(d): - return '%04d%02d%02d000000' % (d.year, d.month, d.day) - - -def _marshal_datetime(dt): - return '%04d%02d%02d%02d%02d%02d' % ( - dt.year, dt.month, dt.day, dt.hour, - dt.minute, dt.second - ) - - class XapianEngine(BaseEngine): backend = XapianSearchBackend query = XapianSearchQuery From 61c6ac7a297ee7453a03e247b635fb356a691815 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 17:02:37 +0200 Subject: [PATCH 27/38] Fixed tests to account for the new query correspondence. --- tests/xapian_tests/tests/test_xapian_query.py | 58 +++++++++---------- 1 file changed, 28 insertions(+), 30 deletions(-) diff --git a/tests/xapian_tests/tests/test_xapian_query.py b/tests/xapian_tests/tests/test_xapian_query.py index c2d71a1..1098749 100644 --- a/tests/xapian_tests/tests/test_xapian_query.py +++ b/tests/xapian_tests/tests/test_xapian_query.py @@ -1,29 +1,35 @@ from __future__ import unicode_literals import datetime -import os -import shutil -from django.conf import settings from django.test import TestCase +from haystack import indexes from haystack import connections from haystack.query import SQ from core.models import MockModel, AnotherMockModel +from xapian_tests.tests.test_xapian_backend import HaystackBackendTestCase -class XapianSearchQueryTestCase(TestCase): +class XapianMockQueryIndex(indexes.SearchIndex): + text = indexes.CharField(document=True) + pub_date = indexes.DateTimeField() + title = indexes.CharField() + foo = indexes.CharField() + + def 
get_model(self): + return MockModel + + +class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): + def get_index(self): + return XapianMockQueryIndex() + def setUp(self): super(XapianSearchQueryTestCase, self).setUp() self.sq = connections['default'].get_query() - def tearDown(self): - if os.path.exists(settings.HAYSTACK_CONNECTIONS['default']['PATH']): - shutil.rmtree(settings.HAYSTACK_CONNECTIONS['default']['PATH']) - - super(XapianSearchQueryTestCase, self).tearDown() - def test_build_query_all(self): self.assertEqual(str(self.sq.build_query()), 'Xapian::Query()') @@ -114,30 +120,30 @@ class XapianSearchQueryTestCase(TestCase): def test_build_query_multiple_word_field_exact(self): self.sq.add_filter(SQ(foo='hello')) - self.sq.add_filter(SQ(bar='world')) + self.sq.add_filter(SQ(title='world')) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((' '(ZXFOOhello OR XFOOhello) AND ' - '(ZXBARworld OR XBARworld)))') + '(ZXTITLEworld OR XTITLEworld)))') def test_build_query_multiple_word_field_exact_not(self): self.sq.add_filter(~SQ(foo='hello')) - self.sq.add_filter(~SQ(bar='world')) + self.sq.add_filter(~SQ(title='world')) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((' '( AND_NOT (ZXFOOhello OR XFOOhello)) AND ' - '( AND_NOT (ZXBARworld OR XBARworld))))') + '( AND_NOT (ZXTITLEworld OR XTITLEworld))))') - def test_build_query_phrase(self): + def test_build_query_or(self): self.sq.add_filter(SQ(content='hello world')) self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query((hello PHRASE 2 world))') + 'Xapian::Query((Zhello OR hello OR Zworld OR world))') - def test_build_query_phrase_not(self): + def test_build_query_not_or(self): self.sq.add_filter(~SQ(content='hello world')) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(' - '( AND_NOT (hello PHRASE 2 world)))') + '( AND_NOT (Zhello OR hello OR Zworld OR world)))') def test_build_query_boost(self): self.sq.add_filter(SQ(content='hello')) @@ -147,20 +153,13 @@ 
class XapianSearchQueryTestCase(TestCase): '(Zhello OR hello) AND_MAYBE ' '5 * (Zworld OR world)))') - def test_build_query_in_filter_single_words(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(SQ(title__in=["Dune", "Jaws"])) - self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(((Zwhi OR why) AND ' - '(ZXTITLEdune OR XTITLEdune OR ZXTITLEjaw OR XTITLEjaws)))') - def test_build_query_not_in_filter_single_words(self): self.sq.add_filter(SQ(content='why')) self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"])) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' - '( AND_NOT (ZXTITLEdune OR XTITLEdune OR ' - 'ZXTITLEjaw OR XTITLEjaws))))') + '( AND_NOT ("XTITLEdune" OR ' + '"XTITLEjaws"))))') def test_build_query_in_filter_multiple_words(self): self.sq.add_filter(SQ(content='why')) @@ -193,7 +192,7 @@ class XapianSearchQueryTestCase(TestCase): self.sq.add_filter(SQ(pub_date__in=[datetime.datetime(2009, 7, 6, 1, 56, 21)])) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' - '(ZXPUB_DATE20090706015621 OR XPUB_DATE20090706015621)))') + 'XPUB_DATE20090706015621))') def test_clean(self): self.assertEqual(self.sq.clean('hello world'), 'hello world') @@ -230,5 +229,4 @@ class XapianSearchQueryTestCase(TestCase): self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True))) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' - '(ZXTITLE1 OR XTITLE1 OR ZXTITLE2 OR ' - 'XTITLE2 OR ZXTITLE3 OR XTITLE3)))') + '("XTITLE1" OR "XTITLE2" OR "XTITLE3")))') From 34a172199cbe335a916ba1396d1445d0e0b56cad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 18:25:27 +0200 Subject: [PATCH 28/38] Applied 2to3 to convert all code to Python 3.X. 
- Related to #128, but doesn't fix it since there are no tests for Python 3.X - It passes all tests in Python2.7 --- tests/xapian_tests/search_indexes.py | 2 +- .../xapian_tests/tests/test_xapian_backend.py | 20 +++++++++---------- xapian_backend.py | 18 ++++++++--------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/xapian_tests/search_indexes.py b/tests/xapian_tests/search_indexes.py index 357bd02..ad97415 100644 --- a/tests/xapian_tests/search_indexes.py +++ b/tests/xapian_tests/search_indexes.py @@ -1,6 +1,6 @@ from haystack import indexes -import models +from . import models class DocumentIndex(indexes.SearchIndex): diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index 8ad7e2a..6420cfe 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -23,8 +23,8 @@ from core.tests.mocks import MockSearchResult def get_terms(backend, *args): result = subprocess.check_output(['delve'] + list(args) + [backend.path], env=os.environ.copy()) - result = result.split(": ")[1].strip() - return result.split(" ") + result = result.split(b": ")[1].strip() + return result.split(b" ") def pks(results): @@ -78,7 +78,7 @@ class XapianMockSearchIndex(indexes.SearchIndex): return XapianMockModel def prepare_sites(self, obj): - return ['%d' % (i * obj.id) for i in xrange(1, 4)] + return ['%d' % (i * obj.id) for i in range(1, 4)] def prepare_tags(self, obj): if obj.id == 1: @@ -89,7 +89,7 @@ class XapianMockSearchIndex(indexes.SearchIndex): return ['an', 'to', 'or'] def prepare_keys(self, obj): - return [i * obj.id for i in xrange(1, 4)] + return [i * obj.id for i in range(1, 4)] def prepare_titles(self, obj): if obj.id == 1: @@ -210,7 +210,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.sample_objs = [] - for i in xrange(1, 4): + for i in range(1, 4): mock = XapianMockModel() mock.id = i mock.author = 'david%s' 
% i @@ -329,7 +329,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): ('2008-10-26T00:00:00', 0), ]) - facets = {'pub_date': {'start_date': datetime.datetime(2009, 02, 01), + facets = {'pub_date': {'start_date': datetime.datetime(2009, 2, 1), 'end_date': datetime.datetime(2009, 3, 15), 'gap_by': 'day', 'gap_amount': 15}} @@ -502,9 +502,9 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(str(self.backend.parse_query('value:0..10')), 'Xapian::Query(VALUE_RANGE 16 000000000000 000000000010)') self.assertEqual(str(self.backend.parse_query('value:..10')), - 'Xapian::Query(VALUE_RANGE 16 %012d 000000000010)' % (-sys.maxint - 1)) + 'Xapian::Query(VALUE_RANGE 16 %012d 000000000010)' % (-sys.maxsize - 1)) self.assertEqual(str(self.backend.parse_query('value:10..*')), - 'Xapian::Query(VALUE_RANGE 16 000000000010 %012d)' % sys.maxint) + 'Xapian::Query(VALUE_RANGE 16 000000000010 %012d)' % sys.maxsize) self.assertEqual(str(self.backend.parse_query('popularity:25.5..100.0')), b'Xapian::Query(VALUE_RANGE 9 \xb2` \xba@)') @@ -514,7 +514,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): 10 entries was not correct at some point. """ self.sample_objs = [] - number_list = range(1, 101) + number_list = list(range(1, 101)) for i in number_list: mock = XapianMockModel() mock.id = i @@ -713,7 +713,7 @@ class XapianBoostBackendTestCase(HaystackBackendTestCase, TestCase): super(XapianBoostBackendTestCase, self).setUp() self.sample_objs = [] - for i in xrange(1, 5): + for i in range(1, 5): mock = AFourthMockModel() mock.id = i if i % 2: diff --git a/xapian_backend.py b/xapian_backend.py index 408ed35..2ac070b 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -95,7 +95,7 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor): if field_type == 'text': begin = 'a' # TODO: A better way of getting a min text value? 
elif field_type == 'integer': - begin = -sys.maxint - 1 + begin = -sys.maxsize - 1 elif field_type == 'float': begin = float('-inf') elif field_type == 'date' or field_type == 'datetime': @@ -104,7 +104,7 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor): if field_type == 'text': end = 'z' * 100 # TODO: A better way of getting a max text value? elif field_type == 'integer': - end = sys.maxint + end = sys.maxsize elif field_type == 'float': end = float('inf') elif field_type == 'date' or field_type == 'datetime': @@ -263,7 +263,7 @@ class XapianSearchBackend(BaseSearchBackend): weights = index.get_field_weights() for field in self.schema: # not supported fields are ignored. - if field['field_name'] not in data.keys(): + if field['field_name'] not in list(data.keys()): continue if field['field_name'] in weights: @@ -702,7 +702,7 @@ class XapianSearchBackend(BaseSearchBackend): column = len(schema_fields) - for field_name, field_class in sorted(fields.items(), key=lambda n: n[0]): + for field_name, field_class in sorted(list(fields.items()), key=lambda n: n[0]): if field_class.document is True: content_field_name = field_class.index_fieldname @@ -780,7 +780,7 @@ class XapianSearchBackend(BaseSearchBackend): field_name, field_type = field['field_name'], field['type'] facet_dict[field_name] = [] - for facet in spy.values(): + for facet in list(spy.values()): facet_dict[field_name].append((_from_xapian_value(facet.term, field_type), facet.termfreq)) return facet_dict @@ -805,7 +805,7 @@ class XapianSearchBackend(BaseSearchBackend): for item in field_value: # Facet each item in a MultiValueField facet_list[item] = facet_list.get(item, 0) + 1 - facet_dict[field] = facet_list.items() + facet_dict[field] = list(facet_list.items()) return facet_dict @staticmethod @@ -839,7 +839,7 @@ class XapianSearchBackend(BaseSearchBackend): """ facet_dict = {} - for date_facet, facet_params in date_facets.iteritems(): + for date_facet, facet_params in list(date_facets.items()): 
gap_type = facet_params.get('gap_by') gap_value = facet_params.get('gap_amount', 1) date_range = facet_params['start_date'] @@ -905,7 +905,7 @@ class XapianSearchBackend(BaseSearchBackend): eg. {'name': ('a*', 5)} """ facet_dict = {} - for field, query in dict(query_facets).items(): + for field, query in list(dict(query_facets).items()): facet_dict[field] = (query, self.search(self.parse_query(query))['hits']) return facet_dict @@ -1064,7 +1064,7 @@ class XapianSearchQuery(BaseSearchQuery): xapian.Query( xapian.Query.OP_SCALE_WEIGHT, self._term_query(term, None, None), value - ) for term, value in self.boost.iteritems() + ) for term, value in list(self.boost.items()) ] query = xapian.Query( xapian.Query.OP_AND_MAYBE, query, From 9d29d592222fa54853ce8fc15c3c62d735667d42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 20:51:44 +0200 Subject: [PATCH 29/38] Improved how date and datetime is indexed. They can now be text-searched using UTC time format, on date, time or both. 
--- tests/xapian_tests/tests/test_xapian_query.py | 11 ++-- xapian_backend.py | 51 ++++++++++++------- 2 files changed, 39 insertions(+), 23 deletions(-) diff --git a/tests/xapian_tests/tests/test_xapian_query.py b/tests/xapian_tests/tests/test_xapian_query.py index 1098749..21b1a86 100644 --- a/tests/xapian_tests/tests/test_xapian_query.py +++ b/tests/xapian_tests/tests/test_xapian_query.py @@ -62,22 +62,23 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): def test_build_query_date(self): self.sq.add_filter(SQ(content=datetime.date(2009, 5, 8))) self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query((Z20090508000000 OR 20090508000000))') + 'Xapian::Query((Z2009-05-08 OR 2009-05-08))') def test_build_query_date_not(self): self.sq.add_filter(~SQ(content=datetime.date(2009, 5, 8))) self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(( AND_NOT (Z20090508000000 OR 20090508000000)))') + 'Xapian::Query(( AND_NOT (Z2009-05-08 OR 2009-05-08)))') def test_build_query_datetime(self): self.sq.add_filter(SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query((Z20090508112800 OR 20090508112800))') + 'Xapian::Query((Z2009-05-08 OR 2009-05-08 OR Z11:28:00 OR 11:28:00))') def test_build_query_datetime_not(self): self.sq.add_filter(~SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(( AND_NOT (Z20090508112800 OR 20090508112800)))') + 'Xapian::Query(( AND_NOT ' + '(Z2009-05-08 OR 2009-05-08 OR Z11:28:00 OR 11:28:00)))') def test_build_query_float(self): self.sq.add_filter(SQ(content=25.52)) @@ -192,7 +193,7 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): self.sq.add_filter(SQ(pub_date__in=[datetime.datetime(2009, 7, 6, 1, 56, 21)])) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' - 'XPUB_DATE20090706015621))') + '(XPUB_DATE2009-07-06 AND_MAYBE XPUB_DATE01:56:21)))') def 
test_clean(self): self.assertEqual(self.sq.clean('hello world'), 'hello world') diff --git a/xapian_backend.py b/xapian_backend.py index 2ac070b..494e0f9 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -292,10 +292,18 @@ class XapianSearchBackend(BaseSearchBackend): for term in value: term = _to_xapian_term(term) + # from here on term is a string; + # now decide how it is stored: + + # these are if field['type'] == 'text': term_generator.index_text(term, weight) term_generator.index_text(term, weight, prefix) - if len(term.split()) == 1: + elif ' ' in term: + for t in term.split(): + document.add_term(t, weight) + document.add_term(prefix + t, weight) + if term != "": document.add_term(term, weight) document.add_term(prefix + term, weight) @@ -1276,6 +1284,18 @@ class XapianSearchQuery(BaseSearchQuery): if stemmed: assert not exact + constructor = '{prefix}{term}' + # "" is to do a boolean match, but only works on indexed terms + # (constraint on Xapian side) + if exact and field_type == 'text': + constructor = '"{prefix}{term}"' + + # construct the prefix to be used. + prefix = '' + if field_name: + prefix = TERM_PREFIXES['field'] + field_name.upper() + term = _to_xapian_term(term) + if field_name in ('id', 'django_id', 'django_ct'): # to ensure the value is serialized correctly. 
if field_name == 'django_id': @@ -1283,16 +1303,17 @@ class XapianSearchQuery(BaseSearchQuery): term = _term_to_xapian_value(term, field_type) return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term)) - constructor = '{prefix}{term}' - # "" is to do a boolean match, but only works on indexed terms - # (constraint on Xapian side) - if exact and field_type == 'text': - constructor = '"{prefix}{term}"' + # we construct the query dates in a slightly different way + if field_type == 'datetime': + date, time = term.split() + return xapian.Query(xapian.Query.OP_AND_MAYBE, + constructor.format(prefix=prefix, term=date), + constructor.format(prefix=prefix, term=time) + ) - prefix = '' - if field_name: - prefix = TERM_PREFIXES['field'] + field_name.upper() - term = _to_xapian_term(term) + # only use stem if field is text or "None" + if field_type not in ('text', None): + stemmed = False unstemmed_term = constructor.format(prefix=prefix, term=term) if stemmed: @@ -1304,7 +1325,7 @@ class XapianSearchQuery(BaseSearchQuery): xapian.Query(unstemmed_term) ) else: - return unstemmed_term + return xapian.Query(unstemmed_term) def _filter_gt(self, term, field_name, field_type, is_not): return self._filter_lte(term, field_name, field_type, is_not=not is_not) @@ -1383,13 +1404,7 @@ def _to_xapian_term(term): Converts a Python type to a Xapian term that can be indexed. """ - if isinstance(term, datetime.datetime): - value = term.strftime(DATETIME_FORMAT) - elif isinstance(term, datetime.date): - value = term.strftime(DATETIME_FORMAT) - else: - value = force_text(term).lower() - return value + return force_text(term).lower() def _from_xapian_value(value, field_type): From e6d23507907da283917544143ad2816259fd4470 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 20:52:45 +0200 Subject: [PATCH 30/38] Added tests to check that every field type is correctly indexed. 
--- .../xapian_tests/tests/test_xapian_backend.py | 131 ++++++++++++++---- 1 file changed, 103 insertions(+), 28 deletions(-) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index 6420cfe..d962986 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -22,9 +22,10 @@ from core.tests.mocks import MockSearchResult def get_terms(backend, *args): - result = subprocess.check_output(['delve'] + list(args) + [backend.path], env=os.environ.copy()) - result = result.split(b": ")[1].strip() - return result.split(b" ") + result = subprocess.check_output(['delve'] + list(args) + [backend.path], + env=os.environ.copy()).decode('utf-8') + result = result.split(": ")[1].strip() + return result.split(" ") def pks(results): @@ -109,7 +110,14 @@ class XapianMockSearchIndex(indexes.SearchIndex): class XapianSimpleMockIndex(indexes.SearchIndex): text = indexes.CharField(document=True) author = indexes.CharField(model_attr='author') - pub_date = indexes.DateTimeField(model_attr='pub_date') + url = indexes.CharField() + + datetime = indexes.DateTimeField(model_attr='pub_date') + date = indexes.DateField() + + number = indexes.IntegerField() + float_number = indexes.FloatField() + decimal_number = indexes.DecimalField() def get_model(self): return MockModel @@ -117,6 +125,27 @@ class XapianSimpleMockIndex(indexes.SearchIndex): def prepare_text(self, obj): return 'this_is_a_word' + def prepare_author(self, obj): + return 'david' + + def prepare_url(self, obj): + return 'http://example.com/1/' + + def prepare_datetime(self, obj): + return datetime.datetime(2009, 2, 25, 1, 1, 1) + + def prepare_date(self, obj): + return datetime.date(2008, 8, 8) + + def prepare_number(self, obj): + return 123456789 + + def prepare_float_number(self, obj): + return 123.123456789 + + def prepare_decimal_number(self, obj): + return '22.34' + class HaystackBackendTestCase(object): """ @@ 
-152,33 +181,10 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase): def setUp(self): super(XapianBackendTestCase, self).setUp() - mock = XapianMockModel() mock.id = 1 - mock.author = 'david' - mock.pub_date = datetime.datetime(2009, 2, 25) - self.backend.update(self.index, [mock]) - def test_fields(self): - """ - Tests that all fields are in the database - """ - terms = get_terms(self.backend, '-a') - for field in ['author', 'pub_date', 'text']: - is_inside = False - for term in terms: - if "X%s" % field.upper() in term: - is_inside = True - break - self.assertTrue(is_inside, field) - - def test_text(self): - terms = get_terms(self.backend, '-a') - - self.assertTrue('this_is_a_word' in terms) - self.assertTrue('Zthis_is_a_word' in terms) - def test_app_is_not_split(self): """ Tests that the app path is not split @@ -199,6 +205,76 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase): self.assertFalse('xapianmockmodel' in terms) self.assertFalse('tests' in terms) + def test_fields_exist(self): + """ + Tests that all fields are in the database + """ + terms = get_terms(self.backend, '-a') + for field in ['author', 'datetime', 'text', 'url']: + is_inside = False + for term in terms: + if term.startswith("X%s" % field.upper()): + is_inside = True + break + self.assertTrue(is_inside, field) + + def test_text_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('this_is_a_word' in terms) + self.assertTrue('Zthis_is_a_word' in terms) + self.assertTrue('ZXTEXTthis_is_a_word' in terms) + self.assertTrue('XTEXTthis_is_a_word' in terms) + + def test_author_field(self): + terms = get_terms(self.backend, '-a') + + self.assertTrue('XAUTHORdavid' in terms) + self.assertTrue('ZXAUTHORdavid' in terms) + self.assertTrue('Zdavid' in terms) + self.assertTrue('david' in terms) + + def test_datetime_field(self): + terms = get_terms(self.backend, '-a') + + self.assertFalse('XDATETIME20090225000000' in terms) + 
self.assertFalse('ZXDATETIME20090225000000' in terms) + self.assertFalse('20090225000000' in terms) + + self.assertTrue('XDATETIME2009-02-25' in terms) + self.assertTrue('2009-02-25' in terms) + self.assertTrue('01:01:01' in terms) + self.assertTrue('XDATETIME01:01:01' in terms) + + def test_date_field(self): + terms = get_terms(self.backend, '-a') + + self.assertTrue('XDATE2008-08-08' in terms) + self.assertTrue('2008-08-08' in terms) + self.assertFalse('XDATE00:00:00' in terms) + self.assertFalse('00:00:00' in terms) + + def test_url_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('http://example.com/1/' in terms) + + def test_integer_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('123456789' in terms) + self.assertTrue('XNUMBER123456789' in terms) + self.assertFalse('ZXNUMBER123456789' in terms) + + def test_float_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('123.123456789' in terms) + self.assertTrue('XFLOAT_NUMBER123.123456789' in terms) + self.assertFalse('ZXFLOAT_NUMBER123.123456789' in terms) + + def test_decimal_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('22.34' in terms) + self.assertTrue('XDECIMAL_NUMBER22.34' in terms) + self.assertFalse('ZXDECIMAL_NUMBER22.34' in terms) + class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): @@ -276,7 +352,6 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): MockSearchResult)) def test_search_field_with_punctuation(self): - #self.assertEqual(self.backend.search(xapian.Query('http://example.com/'))['hits'], 3) self.assertEqual(pks(self.backend.search(xapian.Query('http://example.com/1/'))['results']), [1]) From ab26c30d90aafe3103f299760e672d3312f0c92a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Sun, 18 May 2014 21:38:34 +0200 Subject: [PATCH 31/38] Fixed #119 - Adds support to non-anscii indexing and search. 
--- tests/xapian_tests/tests/test_live_xapian.py | 31 ++++++++++++------- .../xapian_tests/tests/test_xapian_backend.py | 8 +++++ xapian_backend.py | 2 +- 3 files changed, 29 insertions(+), 12 deletions(-) diff --git a/tests/xapian_tests/tests/test_live_xapian.py b/tests/xapian_tests/tests/test_live_xapian.py index 4f9a9d2..d1d057b 100644 --- a/tests/xapian_tests/tests/test_live_xapian.py +++ b/tests/xapian_tests/tests/test_live_xapian.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import datetime from django.test import TestCase @@ -7,6 +9,7 @@ from haystack.query import SearchQuerySet from xapian_tests.models import Document from xapian_tests.search_indexes import DocumentIndex +from xapian_tests.tests.test_xapian_backend import get_terms def pks(results): @@ -25,7 +28,7 @@ class LiveXapianTestCase(TestCase): datetime.date(year=2010, month=2, day=1), datetime.date(year=2010, month=3, day=1)] - summaries = ['This is a huge summary', + summaries = ['This is a huge corrup\xe7\xe3o summary', 'This is a medium summary', 'This is a small summary'] @@ -119,8 +122,8 @@ class LiveXapianTestCase(TestCase): self.assertEqual(pks(self.queryset.filter(django_id__in=[2, 4])), pks(Document.objects.filter(id__in=[2, 4]))) - self.assertEqual(pks(self.queryset.models(Document)), - pks(Document.objects.all())) + self.assertEqual(set(pks(self.queryset.models(Document))), + set(pks(Document.objects.all()))) def test_field_startswith(self): self.assertEqual(len(self.queryset.filter(name__startswith='magaz')), 4) @@ -134,20 +137,20 @@ class LiveXapianTestCase(TestCase): self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 AND 4"))), 0) def test_value_range(self): - self.assertEqual(pks(self.queryset.filter(number__lt=3)), - pks(Document.objects.filter(number__lt=3))) + self.assertEqual(set(pks(self.queryset.filter(number__lt=3))), + set(pks(Document.objects.filter(number__lt=3)))) - self.assertEqual(pks(self.queryset.filter(django_id__gte=6)), - 
pks(Document.objects.filter(id__gte=6))) + self.assertEqual(set(pks(self.queryset.filter(django_id__gte=6))), + set(pks(Document.objects.filter(id__gte=6)))) def test_date_range(self): date = datetime.date(year=2010, month=2, day=1) - self.assertEqual(pks(self.queryset.filter(date__gte=date)), - pks(Document.objects.filter(date__gte=date))) + self.assertEqual(set(pks(self.queryset.filter(date__gte=date))), + set(pks(Document.objects.filter(date__gte=date)))) date = datetime.date(year=2010, month=3, day=1) - self.assertEqual(pks(self.queryset.filter(date__lte=date)), - pks(Document.objects.filter(date__lte=date))) + self.assertEqual(set(pks(self.queryset.filter(date__lte=date))), + set(pks(Document.objects.filter(date__lte=date)))) def test_order_by(self): # private order @@ -166,3 +169,9 @@ class LiveXapianTestCase(TestCase): self.assertEqual(pks(self.queryset.order_by("-date")), pks(Document.objects.order_by("-date"))) + def test_non_ascii_search(self): + """ + Regression test for #119. 
+ """ + self.assertEqual(pks(self.queryset.filter(content='corrup\xe7\xe3o')), + pks(Document.objects.filter(summary__contains='corrup\xe7\xe3o'))) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index d962986..9c7dc75 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -111,6 +111,7 @@ class XapianSimpleMockIndex(indexes.SearchIndex): text = indexes.CharField(document=True) author = indexes.CharField(model_attr='author') url = indexes.CharField() + non_anscii = indexes.CharField() datetime = indexes.DateTimeField(model_attr='pub_date') date = indexes.DateField() @@ -131,6 +132,9 @@ class XapianSimpleMockIndex(indexes.SearchIndex): def prepare_url(self, obj): return 'http://example.com/1/' + def prepare_non_anscii(self, obj): + return 'thsi sdas das corrup\xe7\xe3o das' + def prepare_datetime(self, obj): return datetime.datetime(2009, 2, 25, 1, 1, 1) @@ -275,6 +279,10 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase): self.assertTrue('XDECIMAL_NUMBER22.34' in terms) self.assertFalse('ZXDECIMAL_NUMBER22.34' in terms) + def test_non_ascii_chars(self): + terms = get_terms(self.backend, '-a') + self.assertIn('corrup\xe7\xe3o', terms) + class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): diff --git a/xapian_backend.py b/xapian_backend.py index 494e0f9..d1712eb 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -1318,7 +1318,7 @@ class XapianSearchQuery(BaseSearchQuery): unstemmed_term = constructor.format(prefix=prefix, term=term) if stemmed: stem = xapian.Stem(self.backend.language) - stemmed_term = 'Z' + constructor.format(prefix=prefix, term=stem(term)) + stemmed_term = 'Z' + constructor.format(prefix=prefix, term=stem(term).decode('utf-8')) return xapian.Query(xapian.Query.OP_OR, xapian.Query(stemmed_term), From 48cec5202deb44be4f77202c15693bf55afbea5d Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Tue, 20 May 2014 06:35:05 +0200 Subject: [PATCH 32/38] Refactored _query_from_search_node into two menthods. --- xapian_backend.py | 159 +++++++++++++++++++++++++--------------------- 1 file changed, 87 insertions(+), 72 deletions(-) diff --git a/xapian_backend.py b/xapian_backend.py index d1712eb..bf27503 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -179,12 +179,18 @@ class XapianSearchBackend(BaseSearchBackend): # these 4 attributes are caches populated in `build_schema` # they are checked in `_update_cache` - self._fields = None - self._schema = None + # use property to retrieve them + self._fields = {} + self._schema = [] self._content_field_name = None self._columns = {} def _update_cache(self): + """ + To avoid build_schema every time, we cache + some values: they only change when a SearchIndex + changes, which typically restarts the Python. + """ fields = connections[self.connection_alias].get_unified_index().all_searchfields() if self._fields != fields: self._fields = fields @@ -1093,82 +1099,91 @@ class XapianSearchQuery(BaseSearchQuery): expression, term = child field_name, filter_type = search_node.split_expression(expression) - if field_name != 'content' and field_name not in self.backend.column: - raise InvalidIndexError('field "%s" not indexed' % field_name) - - # Identify and parse AutoQuery - if isinstance(term, AutoQuery): - if field_name != 'content': - query = '%s:%s' % (field_name, term.prepare(self)) - else: - query = term.prepare(self) - query_list.append(self.backend.parse_query(query)) - continue - - # Handle `ValuesListQuerySet`. - if hasattr(term, 'values_list'): - term = list(term) - - if field_name == 'content': - # content is the generic search: - # force no field_name search - # and the field_type to be 'text'. - field_name = None - field_type = 'text' - - # we don't know what is the type(term), so we parse it. 
- # Ideally this would not be required, but - # some filters currently depend on the term to make decisions. - term = _to_xapian_term(term) - - query_list.append(self._filter_contains(term, field_name, field_type, is_not)) - # when filter has no filter_type, haystack uses - # filter_type = 'contains'. Here we remove it - # since the above query is already doing this - if filter_type == 'contains': - filter_type = None - else: - # get the field_type from the backend - field_type = self.backend.schema[self.backend.column[field_name]]['type'] - - # private fields don't accept 'contains' or 'startswith' - # since they have no meaning. - if filter_type in ('contains', 'startswith') and field_name in ('id', 'django_id', 'django_ct'): - filter_type = 'exact' - - if field_type == 'text': - # we don't know what type "term" is, but we know we are searching as text - # so we parse it like that. - # Ideally this would not be required since _term_query does it, but - # some filters currently depend on the term to make decisions. 
- if isinstance(term, list): - term = [_to_xapian_term(term) for term in term] - else: - term = _to_xapian_term(term) - - # todo: we should check that the filter is valid for this field_type or raise InvalidIndexError - if filter_type == 'contains': - query_list.append(self._filter_contains(term, field_name, field_type, is_not)) - elif filter_type == 'exact': - query_list.append(self._filter_exact(term, field_name, field_type, is_not)) - elif filter_type == 'in': - query_list.append(self._filter_in(term, field_name, field_type, is_not)) - elif filter_type == 'startswith': - query_list.append(self._filter_startswith(term, field_name, field_type, is_not)) - elif filter_type == 'gt': - query_list.append(self._filter_gt(term, field_name, field_type, is_not)) - elif filter_type == 'gte': - query_list.append(self._filter_gte(term, field_name, field_type, is_not)) - elif filter_type == 'lt': - query_list.append(self._filter_lt(term, field_name, field_type, is_not)) - elif filter_type == 'lte': - query_list.append(self._filter_lte(term, field_name, field_type, is_not)) + constructed_query_list = self._query_from_term(term, field_name, filter_type, is_not) + query_list.extend(constructed_query_list) if search_node.connector == 'OR': return xapian.Query(xapian.Query.OP_OR, query_list) else: return xapian.Query(xapian.Query.OP_AND, query_list) + def _query_from_term(self, term, field_name, filter_type, is_not): + """ + Uses arguments to construct a list of xapian.Query's. + """ + if field_name != 'content' and field_name not in self.backend.column: + raise InvalidIndexError('field "%s" not indexed' % field_name) + + # It it is an AutoQuery, it has no filters + # or others, thus we short-circuit the procedure. + if isinstance(term, AutoQuery): + if field_name != 'content': + query = '%s:%s' % (field_name, term.prepare(self)) + else: + query = term.prepare(self) + return [self.backend.parse_query(query)] + query_list = [] + + # Handle `ValuesListQuerySet`. 
+ if hasattr(term, 'values_list'): + term = list(term) + + if field_name == 'content': + # content is the generic search: + # force no field_name search + # and the field_type to be 'text'. + field_name = None + field_type = 'text' + + # we don't know what is the type(term), so we parse it. + # Ideally this would not be required, but + # some filters currently depend on the term to make decisions. + term = _to_xapian_term(term) + + query_list.append(self._filter_contains(term, field_name, field_type, is_not)) + # when filter has no filter_type, haystack uses + # filter_type = 'contains'. Here we remove it + # since the above query is already doing this + if filter_type == 'contains': + filter_type = None + else: + # get the field_type from the backend + field_type = self.backend.schema[self.backend.column[field_name]]['type'] + + # private fields don't accept 'contains' or 'startswith' + # since they have no meaning. + if filter_type in ('contains', 'startswith') and field_name in ('id', 'django_id', 'django_ct'): + filter_type = 'exact' + + if field_type == 'text': + # we don't know what type "term" is, but we know we are searching as text + # so we parse it like that. + # Ideally this would not be required since _term_query does it, but + # some filters currently depend on the term to make decisions. 
+ if isinstance(term, list): + term = [_to_xapian_term(term) for term in term] + else: + term = _to_xapian_term(term) + + # todo: we should check that the filter is valid for this field_type or raise InvalidIndexError + if filter_type == 'contains': + query_list.append(self._filter_contains(term, field_name, field_type, is_not)) + elif filter_type == 'exact': + query_list.append(self._filter_exact(term, field_name, field_type, is_not)) + elif filter_type == 'in': + query_list.append(self._filter_in(term, field_name, field_type, is_not)) + elif filter_type == 'startswith': + query_list.append(self._filter_startswith(term, field_name, field_type, is_not)) + elif filter_type == 'gt': + query_list.append(self._filter_gt(term, field_name, field_type, is_not)) + elif filter_type == 'gte': + query_list.append(self._filter_gte(term, field_name, field_type, is_not)) + elif filter_type == 'lt': + query_list.append(self._filter_lt(term, field_name, field_type, is_not)) + elif filter_type == 'lte': + query_list.append(self._filter_lte(term, field_name, field_type, is_not)) + return query_list + def _all_query(self): """ Returns a match all query. From b4a41565a5a8f2b619cc203dca37ee241b04861f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Tue, 20 May 2014 06:47:32 +0200 Subject: [PATCH 33/38] Added test for multivalue indexing. 
--- tests/xapian_tests/tests/test_xapian_backend.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_xapian_backend.py index 9c7dc75..134166d 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_xapian_backend.py @@ -120,6 +120,8 @@ class XapianSimpleMockIndex(indexes.SearchIndex): float_number = indexes.FloatField() decimal_number = indexes.DecimalField() + multi_value = indexes.MultiValueField() + def get_model(self): return MockModel @@ -150,6 +152,9 @@ class XapianSimpleMockIndex(indexes.SearchIndex): def prepare_decimal_number(self, obj): return '22.34' + def prepare_multi_value(self, obj): + return ['multi1', 'multi2'] + class HaystackBackendTestCase(object): """ @@ -279,6 +284,15 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase): self.assertTrue('XDECIMAL_NUMBER22.34' in terms) self.assertFalse('ZXDECIMAL_NUMBER22.34' in terms) + def test_multivalue_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('multi1' in terms) + self.assertTrue('multi2' in terms) + self.assertTrue('XMULTI_VALUEmulti1' in terms) + self.assertTrue('XMULTI_VALUEmulti2' in terms) + self.assertTrue('ZXMULTI_VALUEmulti2' in terms) + self.assertTrue('Zmulti2' in terms) + def test_non_ascii_chars(self): terms = get_terms(self.backend, '-a') self.assertIn('corrup\xe7\xe3o', terms) From 229752da87fda0118eb66e266808f0965247828f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Tue, 20 May 2014 07:35:29 +0200 Subject: [PATCH 34/38] Reorganized, renamed, and improved docstrings of TestCases. 
--- ...test_xapian_backend.py => test_backend.py} | 230 +++--------------- ...{test_live_xapian.py => test_interface.py} | 14 +- .../{test_xapian_query.py => test_query.py} | 215 +++++++++++++++- 3 files changed, 246 insertions(+), 213 deletions(-) rename tests/xapian_tests/tests/{test_xapian_backend.py => test_backend.py} (76%) rename tests/xapian_tests/tests/{test_live_xapian.py => test_interface.py} (95%) rename tests/xapian_tests/tests/{test_xapian_query.py => test_query.py} (58%) diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_backend.py similarity index 76% rename from tests/xapian_tests/tests/test_xapian_backend.py rename to tests/xapian_tests/tests/test_backend.py index 134166d..7ed4551 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_backend.py @@ -6,18 +6,15 @@ import xapian import subprocess import os -from django.conf import settings from django.db import models from django.test import TestCase -from haystack import connections, reset_search_queries +from haystack import connections from haystack import indexes from haystack.backends.xapian_backend import InvalidIndexError, _term_to_xapian_value -from haystack.models import SearchResult -from haystack.query import SearchQuerySet, SQ from haystack.utils.loading import UnifiedIndex -from core.models import MockTag, MockModel, AnotherMockModel, AFourthMockModel +from core.models import MockTag, MockModel, AnotherMockModel from core.tests.mocks import MockSearchResult @@ -158,8 +155,8 @@ class XapianSimpleMockIndex(indexes.SearchIndex): class HaystackBackendTestCase(object): """ - An abstract TestCase that implements a hack to ensure connections - has the mock index + Abstract TestCase that implements an hack to ensure `connections` + has the right index It has a method get_index() that returns a SearchIndex that must be overwritten. 
@@ -183,13 +180,19 @@ class HaystackBackendTestCase(object): connections['default']._index = self.old_ui -class XapianBackendTestCase(HaystackBackendTestCase, TestCase): +class BackendIndexationTestCase(HaystackBackendTestCase, TestCase): + """ + Tests indexation behavior. + + Tests related to how the backend indexes terms, + values, and others go here. + """ def get_index(self): return XapianSimpleMockIndex() def setUp(self): - super(XapianBackendTestCase, self).setUp() + super(BackendIndexationTestCase, self).setUp() mock = XapianMockModel() mock.id = 1 self.backend.update(self.index, [mock]) @@ -298,13 +301,19 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase): self.assertIn('corrup\xe7\xe3o', terms) -class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): +class BackendFeaturesTestCase(HaystackBackendTestCase, TestCase): + """ + Tests supported features on the backend side. + + Tests to features implemented on the backend + go here. + """ def get_index(self): return XapianMockSearchIndex() def setUp(self): - super(XapianSearchBackendTestCase, self).setUp() + super(BackendFeaturesTestCase, self).setUp() self.sample_objs = [] @@ -589,10 +598,16 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): if xapian.minor_version() >= 2: self.assertEqual(str(self.backend.parse_query('name:da*')), - 'Xapian::Query((XNAMEdavid1:(pos=1) SYNONYM XNAMEdavid2:(pos=1) SYNONYM XNAMEdavid3:(pos=1)))') + 'Xapian::Query((' + 'XNAMEdavid1:(pos=1) SYNONYM ' + 'XNAMEdavid2:(pos=1) SYNONYM ' + 'XNAMEdavid3:(pos=1)))') else: self.assertEqual(str(self.backend.parse_query('name:da*')), - 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1)))') + 'Xapian::Query((' + 'XNAMEdavid1:(pos=1) OR ' + 'XNAMEdavid2:(pos=1) OR ' + 'XNAMEdavid3:(pos=1)))') self.assertEqual(str(self.backend.parse_query('name:david1..david2')), 'Xapian::Query(VALUE_RANGE 7 david1 david2)') @@ -648,192 +663,3 @@ class 
XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.backend.silently_fail = False self.assertRaises(InvalidIndexError, self.backend.more_like_this, mock) - - -class LiveXapianMockSearchIndex(indexes.SearchIndex): - text = indexes.CharField(document=True, use_template=True) - name = indexes.CharField(model_attr='author', faceted=True) - pub_date = indexes.DateTimeField(model_attr='pub_date') - title = indexes.CharField() - - def get_model(self): - return MockModel - - -class LiveXapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): - """ - SearchQuery specific tests - """ - fixtures = ['initial_data.json'] - - def get_index(self): - return LiveXapianMockSearchIndex() - - def setUp(self): - super(LiveXapianSearchQueryTestCase, self).setUp() - - self.backend.update(self.index, MockModel.objects.all()) - - self.sq = connections['default'].get_query() - - def test_get_spelling(self): - self.sq.add_filter(SQ(content='indxd')) - self.assertEqual(self.sq.get_spelling_suggestion(), 'indexed') - self.assertEqual(self.sq.get_spelling_suggestion('indxd'), 'indexed') - - def test_startswith(self): - self.sq.add_filter(SQ(name__startswith='da')) - self.assertEqual([result.pk for result in self.sq.get_results()], [1, 2, 3]) - - def test_build_query_gt(self): - self.sq.add_filter(SQ(name__gt='m')) - self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(( AND_NOT VALUE_RANGE 3 a m))') - - def test_build_query_gte(self): - self.sq.add_filter(SQ(name__gte='m')) - self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzz)') - - def test_build_query_lt(self): - self.sq.add_filter(SQ(name__lt='m')) - self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(( AND_NOT ' - 'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))') - - def 
test_build_query_lte(self): - self.sq.add_filter(SQ(name__lte='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 3 a m)') - - def test_build_query_multiple_filter_types(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(SQ(pub_date__lte=datetime.datetime(2009, 2, 10, 1, 59, 0))) - self.sq.add_filter(SQ(name__gt='david')) - self.sq.add_filter(SQ(title__gte='B')) - self.sq.add_filter(SQ(django_id__in=[1, 2, 3])) - self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(((Zwhi OR why) AND ' - 'VALUE_RANGE 5 00010101000000 20090210015900 AND ' - '( AND_NOT VALUE_RANGE 3 a david) AND ' - 'VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND ' - '(QQ000000000001 OR QQ000000000002 OR QQ000000000003)))') - - def test_log_query(self): - reset_search_queries() - self.assertEqual(len(connections['default'].queries), 0) - - # Stow. - old_debug = settings.DEBUG - settings.DEBUG = False - - len(self.sq.get_results()) - self.assertEqual(len(connections['default'].queries), 0) - - settings.DEBUG = True - # Redefine it to clear out the cached results. - self.sq = connections['default'].get_query() - self.sq.add_filter(SQ(name='bar')) - len(self.sq.get_results()) - self.assertEqual(len(connections['default'].queries), 1) - self.assertEqual(str(connections['default'].queries[0]['query_string']), 'Xapian::Query((ZXNAMEbar OR XNAMEbar))') - - # And again, for good measure. - self.sq = connections['default'].get_query() - self.sq.add_filter(SQ(name='bar')) - self.sq.add_filter(SQ(text='moof')) - len(self.sq.get_results()) - self.assertEqual(len(connections['default'].queries), 2) - self.assertEqual(str(connections['default'].queries[0]['query_string']), 'Xapian::Query((ZXNAMEbar OR XNAMEbar))') - self.assertEqual(str(connections['default'].queries[1]['query_string']), 'Xapian::Query(((ZXNAMEbar OR XNAMEbar) AND (ZXTEXTmoof OR XTEXTmoof)))') - - # Restore. 
- settings.DEBUG = old_debug - - -class LiveXapianSearchQuerySetTestCase(HaystackBackendTestCase, TestCase): - """ - SearchQuerySet specific tests - """ - fixtures = ['initial_data.json'] - - def get_index(self): - return LiveXapianMockSearchIndex() - - def setUp(self): - super(LiveXapianSearchQuerySetTestCase, self).setUp() - - self.backend.update(self.index, MockModel.objects.all()) - self.sq = connections['default'].get_query() - self.sqs = SearchQuerySet() - - def test_result_class(self): - # Assert that we're defaulting to ``SearchResult``. - sqs = self.sqs.all() - self.assertTrue(isinstance(sqs[0], SearchResult)) - - # Custom class. - sqs = self.sqs.result_class(MockSearchResult).all() - self.assertTrue(isinstance(sqs[0], MockSearchResult)) - - # Reset to default. - sqs = self.sqs.result_class(None).all() - self.assertTrue(isinstance(sqs[0], SearchResult)) - - def test_facet(self): - self.assertEqual(len(self.sqs.facet('name').facet_counts()['fields']['name']), 3) - - -class XapianBoostMockSearchIndex(indexes.SearchIndex): - text = indexes.CharField( - document=True, use_template=True, - template_name='search/indexes/core/mockmodel_template.txt' - ) - author = indexes.CharField(model_attr='author', weight=2.0) - editor = indexes.CharField(model_attr='editor') - pub_date = indexes.DateField(model_attr='pub_date') - - def get_model(self): - return AFourthMockModel - - -class XapianBoostBackendTestCase(HaystackBackendTestCase, TestCase): - - def get_index(self): - return XapianBoostMockSearchIndex() - - def setUp(self): - super(XapianBoostBackendTestCase, self).setUp() - - self.sample_objs = [] - for i in range(1, 5): - mock = AFourthMockModel() - mock.id = i - if i % 2: - mock.author = 'daniel' - mock.editor = 'david' - else: - mock.author = 'david' - mock.editor = 'daniel' - mock.pub_date = datetime.date(2009, 2, 25) - datetime.timedelta(days=i) - self.sample_objs.append(mock) - - self.backend.update(self.index, self.sample_objs) - - def test_boost(self): - 
sqs = SearchQuerySet() - - self.assertEqual(len(sqs.all()), 4) - - results = sqs.filter(SQ(author='daniel') | SQ(editor='daniel')) - - self.assertEqual([result.id for result in results], [ - 'core.afourthmockmodel.1', - 'core.afourthmockmodel.3', - 'core.afourthmockmodel.2', - 'core.afourthmockmodel.4' - ]) diff --git a/tests/xapian_tests/tests/test_live_xapian.py b/tests/xapian_tests/tests/test_interface.py similarity index 95% rename from tests/xapian_tests/tests/test_live_xapian.py rename to tests/xapian_tests/tests/test_interface.py index d1d057b..824bcfd 100644 --- a/tests/xapian_tests/tests/test_live_xapian.py +++ b/tests/xapian_tests/tests/test_interface.py @@ -9,16 +9,19 @@ from haystack.query import SearchQuerySet from xapian_tests.models import Document from xapian_tests.search_indexes import DocumentIndex -from xapian_tests.tests.test_xapian_backend import get_terms +from xapian_tests.tests.test_backend import pks -def pks(results): - return [result.pk for result in results] +class InterfaceTestCase(TestCase): + """ + Tests the interface of Xapian-Haystack. - -class LiveXapianTestCase(TestCase): + Tests related to usability and expected behavior + go here. 
+ """ def setUp(self): + super(InterfaceTestCase, self).setUp() types_names = ['book', 'magazine', 'article'] texts = ['This is a huge text', @@ -53,6 +56,7 @@ class LiveXapianTestCase(TestCase): def tearDown(self): Document.objects.all().delete() self.backend.clear() + super(InterfaceTestCase, self).tearDown() def test_count(self): self.assertEqual(self.queryset.count(), Document.objects.count()) diff --git a/tests/xapian_tests/tests/test_xapian_query.py b/tests/xapian_tests/tests/test_query.py similarity index 58% rename from tests/xapian_tests/tests/test_xapian_query.py rename to tests/xapian_tests/tests/test_query.py index 21b1a86..e49fe1b 100644 --- a/tests/xapian_tests/tests/test_xapian_query.py +++ b/tests/xapian_tests/tests/test_query.py @@ -2,17 +2,20 @@ from __future__ import unicode_literals import datetime +from django.conf import settings from django.test import TestCase from haystack import indexes -from haystack import connections -from haystack.query import SQ +from haystack import connections, reset_search_queries +from haystack.models import SearchResult +from haystack.query import SearchQuerySet, SQ -from core.models import MockModel, AnotherMockModel -from xapian_tests.tests.test_xapian_backend import HaystackBackendTestCase +from core.models import MockModel, AnotherMockModel, AFourthMockModel +from core.tests.mocks import MockSearchResult +from xapian_tests.tests.test_backend import HaystackBackendTestCase -class XapianMockQueryIndex(indexes.SearchIndex): +class MockQueryIndex(indexes.SearchIndex): text = indexes.CharField(document=True) pub_date = indexes.DateTimeField() title = indexes.CharField() @@ -24,7 +27,7 @@ class XapianMockQueryIndex(indexes.SearchIndex): class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): def get_index(self): - return XapianMockQueryIndex() + return MockQueryIndex() def setUp(self): super(XapianSearchQueryTestCase, self).setUp() @@ -231,3 +234,203 @@ class 
XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' '("XTITLE1" OR "XTITLE2" OR "XTITLE3")))') + + +class MockSearchIndex(indexes.SearchIndex): + text = indexes.CharField(document=True, use_template=True) + name = indexes.CharField(model_attr='author', faceted=True) + pub_date = indexes.DateTimeField(model_attr='pub_date') + title = indexes.CharField() + + def get_model(self): + return MockModel + + +class SearchQueryTestCase(HaystackBackendTestCase, TestCase): + """ + Tests expected behavior of + SearchQuery. + """ + fixtures = ['initial_data.json'] + + def get_index(self): + return MockSearchIndex() + + def setUp(self): + super(SearchQueryTestCase, self).setUp() + + self.backend.update(self.index, MockModel.objects.all()) + + self.sq = connections['default'].get_query() + + def test_get_spelling(self): + self.sq.add_filter(SQ(content='indxd')) + self.assertEqual(self.sq.get_spelling_suggestion(), 'indexed') + self.assertEqual(self.sq.get_spelling_suggestion('indxd'), 'indexed') + + def test_startswith(self): + self.sq.add_filter(SQ(name__startswith='da')) + self.assertEqual([result.pk for result in self.sq.get_results()], [1, 2, 3]) + + def test_build_query_gt(self): + self.sq.add_filter(SQ(name__gt='m')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT VALUE_RANGE 3 a m))') + + def test_build_query_gte(self): + self.sq.add_filter(SQ(name__gte='m')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzz)') + + def test_build_query_lt(self): + self.sq.add_filter(SQ(name__lt='m')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT ' + 'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))') + + def 
test_build_query_lte(self): + self.sq.add_filter(SQ(name__lte='m')) + self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 3 a m)') + + def test_build_query_multiple_filter_types(self): + self.sq.add_filter(SQ(content='why')) + self.sq.add_filter(SQ(pub_date__lte=datetime.datetime(2009, 2, 10, 1, 59, 0))) + self.sq.add_filter(SQ(name__gt='david')) + self.sq.add_filter(SQ(title__gte='B')) + self.sq.add_filter(SQ(django_id__in=[1, 2, 3])) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + 'VALUE_RANGE 5 00010101000000 20090210015900 AND ' + '( AND_NOT VALUE_RANGE 3 a david) AND ' + 'VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND ' + '(QQ000000000001 OR QQ000000000002 OR QQ000000000003)))') + + def test_log_query(self): + reset_search_queries() + self.assertEqual(len(connections['default'].queries), 0) + + # Stow. + old_debug = settings.DEBUG + settings.DEBUG = False + + len(self.sq.get_results()) + self.assertEqual(len(connections['default'].queries), 0) + + settings.DEBUG = True + # Redefine it to clear out the cached results. + self.sq = connections['default'].get_query() + self.sq.add_filter(SQ(name='bar')) + len(self.sq.get_results()) + self.assertEqual(len(connections['default'].queries), 1) + self.assertEqual(str(connections['default'].queries[0]['query_string']), + 'Xapian::Query((ZXNAMEbar OR XNAMEbar))') + + # And again, for good measure. 
+ self.sq = connections['default'].get_query() + self.sq.add_filter(SQ(name='bar')) + self.sq.add_filter(SQ(text='moof')) + len(self.sq.get_results()) + self.assertEqual(len(connections['default'].queries), 2) + self.assertEqual(str(connections['default'].queries[0]['query_string']), + 'Xapian::Query((' + 'ZXNAMEbar OR ' + 'XNAMEbar))') + self.assertEqual(str(connections['default'].queries[1]['query_string']), + 'Xapian::Query((' + '(ZXNAMEbar OR XNAMEbar) AND ' + '(ZXTEXTmoof OR XTEXTmoof)))') + + # Restore. + settings.DEBUG = old_debug + + +class LiveSearchQuerySetTestCase(HaystackBackendTestCase, TestCase): + """ + SearchQuerySet specific tests + """ + fixtures = ['initial_data.json'] + + def get_index(self): + return MockSearchIndex() + + def setUp(self): + super(LiveSearchQuerySetTestCase, self).setUp() + + self.backend.update(self.index, MockModel.objects.all()) + self.sq = connections['default'].get_query() + self.sqs = SearchQuerySet() + + def test_result_class(self): + # Assert that we're defaulting to ``SearchResult``. + sqs = self.sqs.all() + self.assertTrue(isinstance(sqs[0], SearchResult)) + + # Custom class. + sqs = self.sqs.result_class(MockSearchResult).all() + self.assertTrue(isinstance(sqs[0], MockSearchResult)) + + # Reset to default. + sqs = self.sqs.result_class(None).all() + self.assertTrue(isinstance(sqs[0], SearchResult)) + + def test_facet(self): + self.assertEqual(len(self.sqs.facet('name').facet_counts()['fields']['name']), 3) + + +class BoostMockSearchIndex(indexes.SearchIndex): + text = indexes.CharField( + document=True, use_template=True, + template_name='search/indexes/core/mockmodel_template.txt' + ) + author = indexes.CharField(model_attr='author', weight=2.0) + editor = indexes.CharField(model_attr='editor') + pub_date = indexes.DateField(model_attr='pub_date') + + def get_model(self): + return AFourthMockModel + + +class BoostFieldTestCase(HaystackBackendTestCase, TestCase): + """ + Tests boosted fields. 
+ """ + + def get_index(self): + return BoostMockSearchIndex() + + def setUp(self): + super(BoostFieldTestCase, self).setUp() + + self.sample_objs = [] + for i in range(1, 5): + mock = AFourthMockModel() + mock.id = i + if i % 2: + mock.author = 'daniel' + mock.editor = 'david' + else: + mock.author = 'david' + mock.editor = 'daniel' + mock.pub_date = datetime.date(2009, 2, 25) - datetime.timedelta(days=i) + self.sample_objs.append(mock) + + self.backend.update(self.index, self.sample_objs) + + def test_boost(self): + sqs = SearchQuerySet() + + self.assertEqual(len(sqs.all()), 4) + + results = sqs.filter(SQ(author='daniel') | SQ(editor='daniel')) + + self.assertEqual([result.id for result in results], [ + 'core.afourthmockmodel.1', + 'core.afourthmockmodel.3', + 'core.afourthmockmodel.2', + 'core.afourthmockmodel.4' + ]) From 3c5f9d4a36113475d1b244b8c6666458254e4c3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Tue, 20 May 2014 19:01:05 +0200 Subject: [PATCH 35/38] Improved robustness of auto_query test. 
--- tests/xapian_tests/models.py | 1 + tests/xapian_tests/search_indexes.py | 6 +---- tests/xapian_tests/tests/test_interface.py | 27 ++++++++++++++++------ 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/tests/xapian_tests/models.py b/tests/xapian_tests/models.py index 21b3cc8..36a5931 100644 --- a/tests/xapian_tests/models.py +++ b/tests/xapian_tests/models.py @@ -4,6 +4,7 @@ from django.db import models class Document(models.Model): type_name = models.CharField(max_length=50) number = models.IntegerField() + name = models.CharField(max_length=200) date = models.DateField() diff --git a/tests/xapian_tests/search_indexes.py b/tests/xapian_tests/search_indexes.py index ad97415..3e87b27 100644 --- a/tests/xapian_tests/search_indexes.py +++ b/tests/xapian_tests/search_indexes.py @@ -11,12 +11,8 @@ class DocumentIndex(indexes.SearchIndex): number = indexes.IntegerField(model_attr='number') - name = indexes.CharField() + name = indexes.CharField(model_attr='name') date = indexes.DateField(model_attr='date') def get_model(self): return models.Document() - - def prepare_name(self, obj): - return "%s %s" % (obj.type_name, str(obj.number)) - diff --git a/tests/xapian_tests/tests/test_interface.py b/tests/xapian_tests/tests/test_interface.py index 824bcfd..e9bfb24 100644 --- a/tests/xapian_tests/tests/test_interface.py +++ b/tests/xapian_tests/tests/test_interface.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import datetime +from django.db.models import Q from django.test import TestCase from haystack import connections @@ -38,10 +39,12 @@ class InterfaceTestCase(TestCase): for i in range(1, 13): doc = Document() doc.type_name = types_names[i % 3] - doc.text = texts[i % 3] - doc.date = dates[i % 3] - doc.summary = summaries[i % 3] doc.number = i * 2 + doc.name = "%s %d" % (doc.type_name, doc.number) + doc.date = dates[i % 3] + + doc.summary = summaries[i % 3] + doc.text = texts[i % 3] doc.save() self.index = DocumentIndex() @@ -131,12 +134,22 
@@ class InterfaceTestCase(TestCase): def test_field_startswith(self): self.assertEqual(len(self.queryset.filter(name__startswith='magaz')), 4) - self.assertEqual(len(self.queryset.filter(text__startswith='This is')), 12) + self.assertEqual(set(pks(self.queryset.filter(text__startswith='This is'))), + set(pks(Document.objects.filter(text__startswith='This is')))) def test_auto_query(self): - self.assertEqual(len(self.queryset.auto_query("huge OR medium")), 8) - self.assertEqual(len(self.queryset.auto_query("huge AND medium")), 0) - self.assertEqual(len(self.queryset.auto_query("huge -this")), 0) + self.assertEqual(set(pks(self.queryset.auto_query("huge OR medium"))), + set(pks(Document.objects.filter(Q(text__contains="huge") | + Q(text__contains="medium"))))) + + self.assertEqual(set(pks(self.queryset.auto_query("huge AND medium"))), + set(pks(Document.objects.filter(Q(text__contains="huge") & + Q(text__contains="medium"))))) + + self.assertEqual(set(pks(self.queryset.auto_query("text:huge text:-this"))), + set(pks(Document.objects.filter(Q(text__contains="huge") & + ~Q(text__contains="this"))))) + self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 OR 4"))), 2) self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 AND 4"))), 0) From 39bc6d6c89b1d844658749b518caade7c4436d14 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Tue, 20 May 2014 22:54:49 +0200 Subject: [PATCH 36/38] Improved robustness of test. 
--- tests/xapian_tests/tests/test_interface.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/xapian_tests/tests/test_interface.py b/tests/xapian_tests/tests/test_interface.py index e9bfb24..60fe0fb 100644 --- a/tests/xapian_tests/tests/test_interface.py +++ b/tests/xapian_tests/tests/test_interface.py @@ -115,7 +115,8 @@ class InterfaceTestCase(TestCase): self.assertEqual(pks(self.queryset.filter(name='8 4')), [2, 4]) def test_field_in(self): - self.assertEqual(pks(self.queryset.filter(name__in=['magazine 2', 'article 4'])), [1, 2]) + self.assertEqual(set(pks(self.queryset.filter(name__in=['magazine 2', 'article 4']))), + set(pks(Document.objects.filter(name__in=['magazine 2', 'article 4'])))) self.assertEqual(pks(self.queryset.filter(number__in=[4])), pks(Document.objects.filter(number__in=[4]))) From ad0d67dc12684fe041af44f3e09f9637390f936d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Tue, 20 May 2014 22:57:52 +0200 Subject: [PATCH 37/38] Added regression test for #103. 
--- tests/xapian_tests/search_indexes.py | 8 ++++++++ tests/xapian_tests/tests/test_backend.py | 24 +++++++++++++++------- tests/xapian_tests/tests/test_interface.py | 8 ++++++++ 3 files changed, 33 insertions(+), 7 deletions(-) diff --git a/tests/xapian_tests/search_indexes.py b/tests/xapian_tests/search_indexes.py index 3e87b27..76c0df4 100644 --- a/tests/xapian_tests/search_indexes.py +++ b/tests/xapian_tests/search_indexes.py @@ -14,5 +14,13 @@ class DocumentIndex(indexes.SearchIndex): name = indexes.CharField(model_attr='name') date = indexes.DateField(model_attr='date') + tags = indexes.MultiValueField() + def get_model(self): return models.Document() + + def prepare_tags(self, obj): + l = [['tag', 'tag-test', 'tag-test-test'], + ['tag', 'tag-test'], + ['tag']] + return l[obj.id % 3] diff --git a/tests/xapian_tests/tests/test_backend.py b/tests/xapian_tests/tests/test_backend.py index 7ed4551..2e91f58 100644 --- a/tests/xapian_tests/tests/test_backend.py +++ b/tests/xapian_tests/tests/test_backend.py @@ -150,7 +150,7 @@ class XapianSimpleMockIndex(indexes.SearchIndex): return '22.34' def prepare_multi_value(self, obj): - return ['multi1', 'multi2'] + return ['tag', 'tag-test', 'tag-test-test'] class HaystackBackendTestCase(object): @@ -288,13 +288,23 @@ class BackendIndexationTestCase(HaystackBackendTestCase, TestCase): self.assertFalse('ZXDECIMAL_NUMBER22.34' in terms) def test_multivalue_field(self): + """ + Regression test for #103 + """ terms = get_terms(self.backend, '-a') - self.assertTrue('multi1' in terms) - self.assertTrue('multi2' in terms) - self.assertTrue('XMULTI_VALUEmulti1' in terms) - self.assertTrue('XMULTI_VALUEmulti2' in terms) - self.assertTrue('ZXMULTI_VALUEmulti2' in terms) - self.assertTrue('Zmulti2' in terms) + self.assertTrue('tag' in terms) + self.assertTrue('tag-test' in terms) + self.assertTrue('tag-test-test' in terms) + + self.assertTrue('XMULTI_VALUEtag' in terms) + self.assertTrue('XMULTI_VALUEtag-test' in terms) + 
self.assertTrue('XMULTI_VALUEtag-test-test' in terms) + + # these and only these terms + # 3 for the exact term (^{term}$) + self.assertEqual(len([term for term in terms if term.startswith('XMULTI_VALUE')]), 6) + # no stem for exact multivalues. + self.assertEqual(len([term for term in terms if term.startswith('ZXMULTI_VALUE')]), 0) def test_non_ascii_chars(self): terms = get_terms(self.backend, '-a') diff --git a/tests/xapian_tests/tests/test_interface.py b/tests/xapian_tests/tests/test_interface.py index 60fe0fb..3b16a22 100644 --- a/tests/xapian_tests/tests/test_interface.py +++ b/tests/xapian_tests/tests/test_interface.py @@ -193,3 +193,11 @@ class InterfaceTestCase(TestCase): """ self.assertEqual(pks(self.queryset.filter(content='corrup\xe7\xe3o')), pks(Document.objects.filter(summary__contains='corrup\xe7\xe3o'))) + + def test_multi_values_exact_search(self): + """ + Regression test for #103 + """ + self.assertEqual(len(self.queryset.filter(tags__exact='tag')), 12) + self.assertEqual(len(self.queryset.filter(tags__exact='tag-test')), 8) + self.assertEqual(len(self.queryset.filter(tags__exact='tag-test-test')), 4) From ec8fb42e68ff84f04f4382154c26c31c91193855 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20C=2E=20Leit=C3=A3o?= Date: Tue, 20 May 2014 23:01:49 +0200 Subject: [PATCH 38/38] Fixed #103 - improves support for exact search. 
--- tests/xapian_tests/tests/test_query.py | 6 +-- xapian_backend.py | 75 ++++++++++++++++++-------- 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/tests/xapian_tests/tests/test_query.py b/tests/xapian_tests/tests/test_query.py index e49fe1b..244cd41 100644 --- a/tests/xapian_tests/tests/test_query.py +++ b/tests/xapian_tests/tests/test_query.py @@ -162,8 +162,8 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"])) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' - '( AND_NOT ("XTITLEdune" OR ' - '"XTITLEjaws"))))') + '( AND_NOT (XTITLE^dune$ OR ' + 'XTITLE^jaws$))))') def test_build_query_in_filter_multiple_words(self): self.sq.add_filter(SQ(content='why')) @@ -233,7 +233,7 @@ class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True))) self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ' - '("XTITLE1" OR "XTITLE2" OR "XTITLE3")))') + '(XTITLE^1$ OR XTITLE^2$ OR XTITLE^3$)))') class MockSearchIndex(indexes.SearchIndex): diff --git a/xapian_backend.py b/xapian_backend.py index bf27503..7eb9799 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -261,13 +261,40 @@ class XapianSearchBackend(BaseSearchBackend): if self.include_spelling is True: term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING) + def add_text(termpos, prefix, term, weight): + term_generator.set_termpos(termpos + 1) + term_generator.index_text(term, weight) + term_generator.index_text(term, weight, prefix) + term_generator.increase_termpos() + return term_generator.get_termpos() + for obj in iterable: document = xapian.Document() term_generator.set_document(document) + def add_to_document(prefix, term, weight): + document.add_term('%s' % term, weight) + document.add_term(prefix + term, weight) + document.add_term(prefix + '^%s$' % term, weight) + + def 
add_datetime_to_document(termpos, prefix, term, weight): + date, time = term.split() + document.add_posting(date, termpos, weight) + termpos += 1 + document.add_posting(time, termpos, weight) + termpos += 1 + document.add_posting(prefix + date, termpos, weight) + termpos += 1 + document.add_posting(prefix + time, termpos, weight) + termpos += 1 + return termpos + data = index.full_prepare(obj) weights = index.get_field_weights() + + termpos = 0 for field in self.schema: + termpos += 1 # not supported fields are ignored. if field['field_name'] not in list(data.keys()): continue @@ -288,30 +315,36 @@ class XapianSearchBackend(BaseSearchBackend): document.add_term(TERM_PREFIXES[field['field_name']] + value, weight) document.add_value(field['column'], value) + continue else: prefix = TERM_PREFIXES['field'] + field['field_name'].upper() - # if not multi_valued, we add a value and construct a one-element list + # if not multi_valued, we add as a document value + # for sorting and facets if field['multi_valued'] == 'false': document.add_value(field['column'], _term_to_xapian_value(value, field['type'])) - value = [value] + else: + for t in value: + # add the exact match of each value + term = _to_xapian_term(t) + add_to_document(prefix, term, weight) + # index each value with positional information + if ' ' in term: + termpos = add_text(termpos, prefix, term, weight) + continue - for term in value: - term = _to_xapian_term(term) - # from here on term is a string; - # now decide how it is stored: + term = _to_xapian_term(value) + # from here on the term is a string; + # we now decide how it is indexed - # these are - if field['type'] == 'text': - term_generator.index_text(term, weight) - term_generator.index_text(term, weight, prefix) - elif ' ' in term: - for t in term.split(): - document.add_term(t, weight) - document.add_term(prefix + t, weight) - if term != "": - document.add_term(term, weight) - document.add_term(prefix + term, weight) + if field['type'] == 'text': + 
# text is indexed with positional information + termpos = add_text(termpos, prefix, term, weight) + elif field['type'] == 'datetime': + termpos = add_datetime_to_document(termpos, prefix, term, weight) + if term != "": + # all other terms are added without positional information + add_to_document(prefix, term, weight) # store data without indexing it document.set_data(pickle.dumps( @@ -1300,10 +1333,9 @@ class XapianSearchQuery(BaseSearchQuery): assert not exact constructor = '{prefix}{term}' - # "" is to do a boolean match, but only works on indexed terms - # (constraint on Xapian side) - if exact and field_type == 'text': - constructor = '"{prefix}{term}"' + # ^{term}$ is for boolean match of the term + if exact: + constructor = '{prefix}^{term}$' # construct the prefix to be used. prefix = '' @@ -1321,6 +1353,7 @@ class XapianSearchQuery(BaseSearchQuery): # we construct the query dates in a slightly different way if field_type == 'datetime': date, time = term.split() + constructor = '{prefix}{term}' return xapian.Query(xapian.Query.OP_AND_MAYBE, constructor.format(prefix=prefix, term=date), constructor.format(prefix=prefix, term=time)