diff --git a/tests/xapian_tests/models.py b/tests/xapian_tests/models.py index 51c415e..36a5931 100644 --- a/tests/xapian_tests/models.py +++ b/tests/xapian_tests/models.py @@ -1,2 +1,12 @@ -# Copyright (C) 2009, 2010, 2011, 2012 David Sauve -# Copyright (C) 2009, 2010 Trapeze +from django.db import models + + +class Document(models.Model): + type_name = models.CharField(max_length=50) + number = models.IntegerField() + name = models.CharField(max_length=200) + + date = models.DateField() + + summary = models.TextField() + text = models.TextField() diff --git a/tests/xapian_tests/search_indexes.py b/tests/xapian_tests/search_indexes.py new file mode 100644 index 0000000..76c0df4 --- /dev/null +++ b/tests/xapian_tests/search_indexes.py @@ -0,0 +1,26 @@ +from haystack import indexes + +from . import models + + +class DocumentIndex(indexes.SearchIndex): + text = indexes.CharField(document=True) + summary = indexes.CharField(model_attr='summary') + + type_name = indexes.CharField(model_attr='type_name') + + number = indexes.IntegerField(model_attr='number') + + name = indexes.CharField(model_attr='name') + date = indexes.DateField(model_attr='date') + + tags = indexes.MultiValueField() + + def get_model(self): + return models.Document() + + def prepare_tags(self, obj): + l = [['tag', 'tag-test', 'tag-test-test'], + ['tag', 'tag-test'], + ['tag']] + return l[obj.id % 3] diff --git a/tests/xapian_tests/tests/test_xapian_backend.py b/tests/xapian_tests/tests/test_backend.py similarity index 53% rename from tests/xapian_tests/tests/test_xapian_backend.py rename to tests/xapian_tests/tests/test_backend.py index 21f5ad7..2e91f58 100644 --- a/tests/xapian_tests/tests/test_xapian_backend.py +++ b/tests/xapian_tests/tests/test_backend.py @@ -6,27 +6,29 @@ import xapian import subprocess import os -from django.conf import settings from django.db import models from django.test import TestCase -from haystack import connections, reset_search_queries +from haystack import connections from haystack import indexes -from haystack.backends.xapian_backend import InvalidIndexError, _marshal_value -from haystack.models import SearchResult -from haystack.query import SearchQuerySet, SQ +from haystack.backends.xapian_backend import InvalidIndexError, _term_to_xapian_value from haystack.utils.loading import UnifiedIndex -from core.models import MockTag, MockModel, AnotherMockModel, AFourthMockModel +from core.models import MockTag, MockModel, AnotherMockModel from core.tests.mocks import MockSearchResult def get_terms(backend, *args): - result = subprocess.check_output(['delve'] + list(args) + [backend.path], env=os.environ.copy()) + result = subprocess.check_output(['delve'] + list(args) + [backend.path], + env=os.environ.copy()).decode('utf-8') result = result.split(": ")[1].strip() return result.split(" ") +def pks(results): + return [result.pk for result in results] + + class XapianMockModel(models.Model): """ Same as tests.core.MockModel with a few extra fields for testing various @@ -74,7 +76,7 @@ class XapianMockSearchIndex(indexes.SearchIndex): return XapianMockModel def prepare_sites(self, obj): - return ['%d' % (i * obj.id) for i in xrange(1, 4)] + return ['%d' % (i * obj.id) for i in range(1, 4)] def prepare_tags(self, obj): if obj.id == 1: @@ -85,7 +87,7 @@ class XapianMockSearchIndex(indexes.SearchIndex): return ['an', 'to', 'or'] def prepare_keys(self, obj): - return [i * obj.id for i in xrange(1, 4)] + return [i * obj.id for i in range(1, 4)] def prepare_titles(self, obj): if obj.id == 1: @@ -105,7 +107,17 @@ class XapianMockSearchIndex(indexes.SearchIndex): class XapianSimpleMockIndex(indexes.SearchIndex): text = indexes.CharField(document=True) author = indexes.CharField(model_attr='author') - pub_date = indexes.DateTimeField(model_attr='pub_date') + url = indexes.CharField() + non_anscii = indexes.CharField() + + datetime = indexes.DateTimeField(model_attr='pub_date') + date = indexes.DateField() + + number = indexes.IntegerField() + float_number = indexes.FloatField() + decimal_number = indexes.DecimalField() + + multi_value = indexes.MultiValueField() def get_model(self): return MockModel @@ -113,11 +125,38 @@ class XapianSimpleMockIndex(indexes.SearchIndex): def prepare_text(self, obj): return 'this_is_a_word' + def prepare_author(self, obj): + return 'david' + + def prepare_url(self, obj): + return 'http://example.com/1/' + + def prepare_non_anscii(self, obj): + return 'thsi sdas das corrup\xe7\xe3o das' + + def prepare_datetime(self, obj): + return datetime.datetime(2009, 2, 25, 1, 1, 1) + + def prepare_date(self, obj): + return datetime.date(2008, 8, 8) + + def prepare_number(self, obj): + return 123456789 + + def prepare_float_number(self, obj): + return 123.123456789 + + def prepare_decimal_number(self, obj): + return '22.34' + + def prepare_multi_value(self, obj): + return ['tag', 'tag-test', 'tag-test-test'] + class HaystackBackendTestCase(object): """ - An abstract TestCase that implements a hack to ensure connections - has the mock index + Abstract TestCase that implements an hack to ensure `connections` + has the right index It has a method get_index() that returns a SearchIndex that must be overwritten. @@ -141,40 +180,23 @@ class HaystackBackendTestCase(object): connections['default']._index = self.old_ui -class XapianBackendTestCase(HaystackBackendTestCase, TestCase): +class BackendIndexationTestCase(HaystackBackendTestCase, TestCase): + """ + Tests indexation behavior. + + Tests related to how the backend indexes terms, + values, and others go here. + """ def get_index(self): return XapianSimpleMockIndex() def setUp(self): - super(XapianBackendTestCase, self).setUp() - + super(BackendIndexationTestCase, self).setUp() mock = XapianMockModel() mock.id = 1 - mock.author = 'david' - mock.pub_date = datetime.date(2009, 2, 25) - self.backend.update(self.index, [mock]) - def test_fields(self): - """ - Tests that all fields are in the database - """ - terms = get_terms(self.backend, '-a') - for field in ['author', 'pub_date', 'text']: - is_inside = False - for term in terms: - if "X%s" % field.upper() in term: - is_inside = True - break - self.assertTrue(is_inside, field) - - def test_text(self): - terms = get_terms(self.backend, '-a') - - self.assertTrue('this_is_a_word' in terms) - self.assertTrue('Zthis_is_a_word' in terms) - def test_app_is_not_split(self): """ Tests that the app path is not split @@ -195,18 +217,117 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase): self.assertFalse('xapianmockmodel' in terms) self.assertFalse('tests' in terms) + def test_fields_exist(self): + """ + Tests that all fields are in the database + """ + terms = get_terms(self.backend, '-a') + for field in ['author', 'datetime', 'text', 'url']: + is_inside = False + for term in terms: + if term.startswith("X%s" % field.upper()): + is_inside = True + break + self.assertTrue(is_inside, field) -class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): + def test_text_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('this_is_a_word' in terms) + self.assertTrue('Zthis_is_a_word' in terms) + self.assertTrue('ZXTEXTthis_is_a_word' in terms) + self.assertTrue('XTEXTthis_is_a_word' in terms) + + def test_author_field(self): + terms = get_terms(self.backend, '-a') + + self.assertTrue('XAUTHORdavid' in terms) + self.assertTrue('ZXAUTHORdavid' in terms) + self.assertTrue('Zdavid' in terms) + self.assertTrue('david' in terms) + + def test_datetime_field(self): + terms = get_terms(self.backend, '-a') + + self.assertFalse('XDATETIME20090225000000' in terms) + self.assertFalse('ZXDATETIME20090225000000' in terms) + self.assertFalse('20090225000000' in terms) + + self.assertTrue('XDATETIME2009-02-25' in terms) + self.assertTrue('2009-02-25' in terms) + self.assertTrue('01:01:01' in terms) + self.assertTrue('XDATETIME01:01:01' in terms) + + def test_date_field(self): + terms = get_terms(self.backend, '-a') + + self.assertTrue('XDATE2008-08-08' in terms) + self.assertTrue('2008-08-08' in terms) + self.assertFalse('XDATE00:00:00' in terms) + self.assertFalse('00:00:00' in terms) + + def test_url_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('http://example.com/1/' in terms) + + def test_integer_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('123456789' in terms) + self.assertTrue('XNUMBER123456789' in terms) + self.assertFalse('ZXNUMBER123456789' in terms) + + def test_float_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('123.123456789' in terms) + self.assertTrue('XFLOAT_NUMBER123.123456789' in terms) + self.assertFalse('ZXFLOAT_NUMBER123.123456789' in terms) + + def test_decimal_field(self): + terms = get_terms(self.backend, '-a') + self.assertTrue('22.34' in terms) + self.assertTrue('XDECIMAL_NUMBER22.34' in terms) + self.assertFalse('ZXDECIMAL_NUMBER22.34' in terms) + + def test_multivalue_field(self): + """ + Regression test for #103 + """ + terms = get_terms(self.backend, '-a') + self.assertTrue('tag' in terms) + self.assertTrue('tag-test' in terms) + self.assertTrue('tag-test-test' in terms) + + self.assertTrue('XMULTI_VALUEtag' in terms) + self.assertTrue('XMULTI_VALUEtag-test' in terms) + self.assertTrue('XMULTI_VALUEtag-test-test' in terms) + + # these and only these terms + # 3 for the exact term (^{term}$) + self.assertEqual(len([term for term in terms if term.startswith('XMULTI_VALUE')]), 6) + # no stem for exact multivalues. + self.assertEqual(len([term for term in terms if term.startswith('ZXMULTI_VALUE')]), 0) + + def test_non_ascii_chars(self): + terms = get_terms(self.backend, '-a') + self.assertIn('corrup\xe7\xe3o', terms) + + +class BackendFeaturesTestCase(HaystackBackendTestCase, TestCase): + """ + Tests supported features on the backend side. + + Tests to features implemented on the backend + go here. + """ def get_index(self): return XapianMockSearchIndex() def setUp(self): - super(XapianSearchBackendTestCase, self).setUp() + super(BackendFeaturesTestCase, self).setUp() self.sample_objs = [] - for i in xrange(1, 4): + for i in range(1, 4): mock = XapianMockModel() mock.id = i mock.author = 'david%s' % i @@ -225,19 +346,20 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.backend.update(self.index, self.sample_objs) def test_update(self): - self.assertEqual(self.backend.document_count(), 3) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3]) + self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), + [1, 2, 3]) def test_duplicate_update(self): - # Duplicates should be updated, not appended -- http://github.com/notanumber/xapian-haystack/issues/#issue/6 + """ + Regression test for #6. + """ self.backend.update(self.index, self.sample_objs) - self.assertEqual(self.backend.document_count(), 3) def test_remove(self): self.backend.remove(self.sample_objs[0]) - self.assertEqual(self.backend.document_count(), 2) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [2, 3]) + self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), + [2, 3]) def test_clear(self): self.backend.clear() @@ -259,18 +381,20 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(self.backend.document_count(), 0) def test_search(self): + # no match query self.assertEqual(self.backend.search(xapian.Query()), {'hits': 0, 'results': []}) - self.assertEqual(self.backend.search(xapian.Query(''))['hits'], 3) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3]) - self.assertEqual(self.backend.search(xapian.Query('indexed'))['hits'], 3) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query(''))['results']], [1, 2, 3]) + # all match query + self.assertEqual(pks(self.backend.search(xapian.Query(''))['results']), + [1, 2, 3]) - # Ensure that swapping the ``result_class`` works. - self.assertTrue(isinstance(self.backend.search(xapian.Query('indexed'), result_class=MockSearchResult)['results'][0], MockSearchResult)) + # Other `result_class` + self.assertTrue(isinstance(self.backend.search(xapian.Query('indexed'), + result_class=MockSearchResult)['results'][0], + MockSearchResult)) def test_search_field_with_punctuation(self): - # self.assertEqual(self.backend.search(xapian.Query('http://example.com/'))['hits'], 3) - self.assertEqual([result.pk for result in self.backend.search(xapian.Query('http://example.com/1/'))['results']], [1]) + self.assertEqual(pks(self.backend.search(xapian.Query('http://example.com/1/'))['results']), + [1]) def test_search_by_mvf(self): self.assertEqual(self.backend.search(xapian.Query('ab'))['hits'], 1) @@ -279,22 +403,39 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.assertEqual(self.backend.search(xapian.Query('one'))['hits'], 3) def test_field_facets(self): - self.assertEqual(self.backend.search(xapian.Query(), facets=['name']), {'hits': 0, 'results': []}) + self.assertEqual(self.backend.search(xapian.Query(), facets=['name']), + {'hits': 0, 'results': []}) + results = self.backend.search(xapian.Query('indexed'), facets=['name']) self.assertEqual(results['hits'], 3) - self.assertEqual(results['facets']['fields']['name'], [('david1', 1), ('david2', 1), ('david3', 1)]) + self.assertEqual(results['facets']['fields']['name'], + [('david1', 1), ('david2', 1), ('david3', 1)]) results = self.backend.search(xapian.Query('indexed'), facets=['flag']) self.assertEqual(results['hits'], 3) - self.assertEqual(results['facets']['fields']['flag'], [(False, 1), (True, 2)]) + self.assertEqual(results['facets']['fields']['flag'], + [(False, 1), (True, 2)]) results = self.backend.search(xapian.Query('indexed'), facets=['sites']) self.assertEqual(results['hits'], 3) - self.assertEqual(results['facets']['fields']['sites'], [('1', 1), ('3', 2), ('2', 2), ('4', 1), ('6', 2), ('9', 1)]) + self.assertEqual(results['facets']['fields']['sites'], + [('1', 1), ('3', 2), ('2', 2), ('4', 1), ('6', 2), ('9', 1)]) + + def test_raise_index_error_on_wrong_field(self): + """ + Regression test for #109. + """ + self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas']) def test_date_facets(self): - self.assertEqual(self.backend.search(xapian.Query(), date_facets={'pub_date': {'start_date': datetime.datetime(2008, 10, 26), 'end_date': datetime.datetime(2009, 3, 26), 'gap_by': 'month'}}), {'hits': 0, 'results': []}) - results = self.backend.search(xapian.Query('indexed'), date_facets={'pub_date': {'start_date': datetime.datetime(2008, 10, 26), 'end_date': datetime.datetime(2009, 3, 26), 'gap_by': 'month'}}) + facets = {'pub_date': {'start_date': datetime.datetime(2008, 10, 26), + 'end_date': datetime.datetime(2009, 3, 26), + 'gap_by': 'month'}} + + self.assertEqual(self.backend.search(xapian.Query(), date_facets=facets), + {'hits': 0, 'results': []}) + + results = self.backend.search(xapian.Query('indexed'), date_facets=facets) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['dates']['pub_date'], [ ('2009-02-26T00:00:00', 0), @@ -304,7 +445,11 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): ('2008-10-26T00:00:00', 0), ]) - results = self.backend.search(xapian.Query('indexed'), date_facets={'pub_date': {'start_date': datetime.datetime(2009, 02, 01), 'end_date': datetime.datetime(2009, 3, 15), 'gap_by': 'day', 'gap_amount': 15}}) + facets = {'pub_date': {'start_date': datetime.datetime(2009, 2, 1), + 'end_date': datetime.datetime(2009, 3, 15), + 'gap_by': 'day', + 'gap_amount': 15}} + results = self.backend.search(xapian.Query('indexed'), date_facets=facets) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['dates']['pub_date'], [ ('2009-03-03T00:00:00', 0), @@ -313,111 +458,131 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): ]) def test_query_facets(self): - self.assertEqual(self.backend.search(xapian.Query(), query_facets={'name': 'da*'}), {'hits': 0, 'results': []}) + self.assertEqual(self.backend.search(xapian.Query(), query_facets={'name': 'da*'}), + {'hits': 0, 'results': []}) + results = self.backend.search(xapian.Query('indexed'), query_facets={'name': 'da*'}) self.assertEqual(results['hits'], 3) self.assertEqual(results['facets']['queries']['name'], ('da*', 3)) def test_narrow_queries(self): - self.assertEqual(self.backend.search(xapian.Query(), narrow_queries={'name:david1'}), {'hits': 0, 'results': []}) + self.assertEqual(self.backend.search(xapian.Query(), narrow_queries={'name:david1'}), + {'hits': 0, 'results': []}) results = self.backend.search(xapian.Query('indexed'), narrow_queries={'name:david1'}) self.assertEqual(results['hits'], 1) def test_highlight(self): - self.assertEqual(self.backend.search(xapian.Query(), highlight=True), {'hits': 0, 'results': []}) + self.assertEqual(self.backend.search(xapian.Query(), highlight=True), + {'hits': 0, 'results': []}) self.assertEqual(self.backend.search(xapian.Query('indexed'), highlight=True)['hits'], 3) - self.assertEqual([result.highlighted['text'] for result in self.backend.search(xapian.Query('indexed'), highlight=True)['results']], ['indexed!\n1', 'indexed!\n2', 'indexed!\n3']) + + results = self.backend.search(xapian.Query('indexed'), highlight=True)['results'] + self.assertEqual([result.highlighted['text'] for result in results], + ['indexed!\n1', 'indexed!\n2', 'indexed!\n3']) def test_spelling_suggestion(self): self.assertEqual(self.backend.search(xapian.Query('indxe'))['hits'], 0) - self.assertEqual(self.backend.search(xapian.Query('indxe'))['spelling_suggestion'], 'indexed') + self.assertEqual(self.backend.search(xapian.Query('indxe'))['spelling_suggestion'], + 'indexed') self.assertEqual(self.backend.search(xapian.Query('indxed'))['hits'], 0) - self.assertEqual(self.backend.search(xapian.Query('indxed'))['spelling_suggestion'], 'indexed') + self.assertEqual(self.backend.search(xapian.Query('indxed'))['spelling_suggestion'], + 'indexed') self.assertEqual(self.backend.search(xapian.Query('foo'))['hits'], 0) - self.assertEqual(self.backend.search(xapian.Query('foo'), spelling_query='indexy')['spelling_suggestion'], 'indexed') + self.assertEqual(self.backend.search(xapian.Query('foo'), spelling_query='indexy')['spelling_suggestion'], + 'indexed') self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['hits'], 0) - self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['spelling_suggestion'], 'david1') + self.assertEqual(self.backend.search(xapian.Query('XNAMEdavid'))['spelling_suggestion'], + 'david1') def test_more_like_this(self): results = self.backend.more_like_this(self.sample_objs[0]) - self.assertEqual(results['hits'], 2) - self.assertEqual([result.pk for result in results['results']], [3, 2]) - results = self.backend.more_like_this(self.sample_objs[0], additional_query=xapian.Query('david3')) - self.assertEqual(results['hits'], 1) - self.assertEqual([result.pk for result in results['results']], [3]) + self.assertEqual(pks(results['results']), [3, 2]) - results = self.backend.more_like_this(self.sample_objs[0], limit_to_registered_models=True) - self.assertEqual(results['hits'], 2) - self.assertEqual([result.pk for result in results['results']], [3, 2]) + results = self.backend.more_like_this(self.sample_objs[0], + additional_query=xapian.Query('david3')) - # Ensure that swapping the ``result_class`` works. - self.assertTrue(isinstance(self.backend.more_like_this(self.sample_objs[0], result_class=MockSearchResult)['results'][0], MockSearchResult)) + self.assertEqual(pks(results['results']), [3]) + + results = self.backend.more_like_this(self.sample_objs[0], + limit_to_registered_models=True) + + self.assertEqual(pks(results['results']), [3, 2]) + + # Other `result_class` + self.assertTrue(isinstance(self.backend.more_like_this(self.sample_objs[0], + result_class=MockSearchResult)['results'][0], + MockSearchResult)) def test_order_by(self): results = self.backend.search(xapian.Query(''), sort_by=['pub_date']) - self.assertEqual([result.pk for result in results['results']], [3, 2, 1]) + self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['-pub_date']) - self.assertEqual([result.pk for result in results['results']], [1, 2, 3]) + self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['exp_date']) - self.assertEqual([result.pk for result in results['results']], [1, 2, 3]) + self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-exp_date']) - self.assertEqual([result.pk for result in results['results']], [3, 2, 1]) + self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['id']) - self.assertEqual([result.pk for result in results['results']], [1, 2, 3]) + self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-id']) - self.assertEqual([result.pk for result in results['results']], [3, 2, 1]) + self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['value']) - self.assertEqual([result.pk for result in results['results']], [1, 2, 3]) + self.assertEqual(pks(results['results']), [1, 2, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-value']) - self.assertEqual([result.pk for result in results['results']], [3, 2, 1]) + self.assertEqual(pks(results['results']), [3, 2, 1]) results = self.backend.search(xapian.Query(''), sort_by=['popularity']) - self.assertEqual([result.pk for result in results['results']], [2, 1, 3]) + self.assertEqual(pks(results['results']), [2, 1, 3]) results = self.backend.search(xapian.Query(''), sort_by=['-popularity']) - self.assertEqual([result.pk for result in results['results']], [3, 1, 2]) + self.assertEqual(pks(results['results']), [3, 1, 2]) results = self.backend.search(xapian.Query(''), sort_by=['flag', 'id']) - self.assertEqual([result.pk for result in results['results']], [2, 1, 3]) + self.assertEqual(pks(results['results']), [2, 1, 3]) results = self.backend.search(xapian.Query(''), sort_by=['flag', '-id']) - self.assertEqual([result.pk for result in results['results']], [2, 3, 1]) + self.assertEqual(pks(results['results']), [2, 3, 1]) def test_verify_type(self): self.assertEqual([result.month for result in self.backend.search(xapian.Query(''))['results']], ['02', '02', '02']) - def test__marshal_value(self): - self.assertEqual(_marshal_value('abc'), 'abc') - self.assertEqual(_marshal_value(1), '000000000001') - self.assertEqual(_marshal_value(2653), '000000002653') - self.assertEqual(_marshal_value(25.5), b'\xb2`') - self.assertEqual(_marshal_value([1, 2, 3]), '[1, 2, 3]') - self.assertEqual(_marshal_value((1, 2, 3)), '(1, 2, 3)') - self.assertEqual(_marshal_value({'a': 1, 'c': 3, 'b': 2}), "{u'a': 1, u'c': 3, u'b': 2}") - self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 9, 16, 14)), '20090509161400') - self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 9, 0, 0)), '20090509000000') - self.assertEqual(_marshal_value(datetime.datetime(1899, 5, 18, 0, 0)), '18990518000000') - self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 18, 1, 16, 30, 250)), '20090518011630000250') + def test_term_to_xapian_value(self): + self.assertEqual(_term_to_xapian_value('abc', 'text'), 'abc') + self.assertEqual(_term_to_xapian_value(1, 'integer'), '000000000001') + self.assertEqual(_term_to_xapian_value(2653, 'integer'), '000000002653') + self.assertEqual(_term_to_xapian_value(25.5, 'float'), b'\xb2`') + self.assertEqual(_term_to_xapian_value([1, 2, 3], 'text'), '[1, 2, 3]') + self.assertEqual(_term_to_xapian_value((1, 2, 3), 'text'), '(1, 2, 3)') + self.assertEqual(_term_to_xapian_value({'a': 1, 'c': 3, 'b': 2}, 'text'), + "{u'a': 1, u'c': 3, u'b': 2}") + self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 16, 14), 'datetime'), + '20090509161400') + self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 0, 0), 'date'), + '20090509000000') + self.assertEqual(_term_to_xapian_value(datetime.datetime(1899, 5, 18, 0, 0), 'date'), + '18990518000000') def test_build_schema(self): - (content_field_name, fields) = self.backend.build_schema(connections['default'].get_unified_index().all_searchfields()) + search_fields = connections['default'].get_unified_index().all_searchfields() + (content_field_name, fields) = self.backend.build_schema(search_fields) + self.assertEqual(content_field_name, 'text') self.assertEqual(len(fields), 14 + 3) self.assertEqual(fields, [ {'column': 0, 'type': 'text', 'field_name': 'id', 'multi_valued': 'false'}, - {'column': 1, 'type': 'long', 'field_name': 'django_id', 'multi_valued': 'false'}, + {'column': 1, 'type': 'integer', 'field_name': 'django_id', 'multi_valued': 'false'}, {'column': 2, 'type': 'text', 'field_name': 'django_ct', 'multi_valued': 'false'}, {'column': 3, 'type': 'text', 'field_name': 'empty', 'multi_valued': 'false'}, {'column': 4, 'type': 'date', 'field_name': 'exp_date', 'multi_valued': 'false'}, @@ -432,33 +597,46 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): {'column': 13, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'}, {'column': 14, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'}, {'column': 15, 'type': 'text', 'field_name': 'url', 'multi_valued': 'false'}, - {'column': 16, 'type': 'long', 'field_name': 'value', 'multi_valued': 'false'} + {'column': 16, 'type': 'integer', 'field_name': 'value', 'multi_valued': 'false'} ]) def test_parse_query(self): - self.assertEqual(str(self.backend.parse_query('indexed')), 'Xapian::Query(Zindex:(pos=1))') - self.assertEqual(str(self.backend.parse_query('name:david')), 'Xapian::Query(ZXNAMEdavid:(pos=1))') + self.assertEqual(str(self.backend.parse_query('indexed')), + 'Xapian::Query(Zindex:(pos=1))') + self.assertEqual(str(self.backend.parse_query('name:david')), + 'Xapian::Query(ZXNAMEdavid:(pos=1))') if xapian.minor_version() >= 2: - self.assertEqual(str(self.backend.parse_query('name:da*')), 'Xapian::Query((XNAMEdavid1:(pos=1) SYNONYM XNAMEdavid2:(pos=1) SYNONYM XNAMEdavid3:(pos=1)))') + self.assertEqual(str(self.backend.parse_query('name:da*')), + 'Xapian::Query((' + 'XNAMEdavid1:(pos=1) SYNONYM ' + 'XNAMEdavid2:(pos=1) SYNONYM ' + 'XNAMEdavid3:(pos=1)))') else: - self.assertEqual(str(self.backend.parse_query('name:da*')), 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1)))') + self.assertEqual(str(self.backend.parse_query('name:da*')), + 'Xapian::Query((' + 'XNAMEdavid1:(pos=1) OR ' + 'XNAMEdavid2:(pos=1) OR ' + 'XNAMEdavid3:(pos=1)))') self.assertEqual(str(self.backend.parse_query('name:david1..david2')), 'Xapian::Query(VALUE_RANGE 7 david1 david2)') self.assertEqual(str(self.backend.parse_query('value:0..10')), 'Xapian::Query(VALUE_RANGE 16 000000000000 000000000010)') self.assertEqual(str(self.backend.parse_query('value:..10')), - 'Xapian::Query(VALUE_RANGE 16 %012d 000000000010)' % (-sys.maxint - 1)) + 'Xapian::Query(VALUE_RANGE 16 %012d 000000000010)' % (-sys.maxsize - 1)) self.assertEqual(str(self.backend.parse_query('value:10..*')), - 'Xapian::Query(VALUE_RANGE 16 000000000010 %012d)' % sys.maxint) + 'Xapian::Query(VALUE_RANGE 16 000000000010 %012d)' % sys.maxsize) self.assertEqual(str(self.backend.parse_query('popularity:25.5..100.0')), b'Xapian::Query(VALUE_RANGE 9 \xb2` \xba@)') def test_order_by_django_id(self): - self.backend.clear() + """ + We need this test because ordering on more than + 10 entries was not correct at some point. + """ self.sample_objs = [] - number_list = range(1, 101) + number_list = list(range(1, 101)) for i in number_list: mock = XapianMockModel() mock.id = i @@ -476,8 +654,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.backend.update(self.index, self.sample_objs) results = self.backend.search(xapian.Query(''), sort_by=['-django_id']) - self.assertEqual(results['hits'], len(number_list)) - self.assertEqual([result.pk for result in results['results']], list(reversed(number_list))) + self.assertEqual(pks(results['results']), list(reversed(number_list))) def test_more_like_this_with_unindexed_model(self): """ @@ -496,187 +673,3 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase): self.backend.silently_fail = False self.assertRaises(InvalidIndexError, self.backend.more_like_this, mock) - - -class LiveXapianMockSearchIndex(indexes.SearchIndex): - text = indexes.CharField(document=True, use_template=True) - name = indexes.CharField(model_attr='author', faceted=True) - pub_date = indexes.DateField(model_attr='pub_date') - created = indexes.DateField() - title = indexes.CharField() - - def get_model(self): - return MockModel - - -class LiveXapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): - """ - SearchQuery specific tests - """ - fixtures = ['initial_data.json'] - - def get_index(self): - return LiveXapianMockSearchIndex() - - def setUp(self): - super(LiveXapianSearchQueryTestCase, self).setUp() - - self.backend.update(self.index, MockModel.objects.all()) - - self.sq = connections['default'].get_query() - - def test_get_spelling(self): - self.sq.add_filter(SQ(content='indxd')) - self.assertEqual(self.sq.get_spelling_suggestion(), 'indexed') - self.assertEqual(self.sq.get_spelling_suggestion('indxd'), 'indexed') - - def test_startswith(self): - self.sq.add_filter(SQ(name__startswith='da')) - self.assertEqual([result.pk for result in self.sq.get_results()], [1, 2, 3]) - - def test_build_query_gt(self): - self.sq.add_filter(SQ(name__gt='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT VALUE_RANGE 4 a m))') - - def test_build_query_gte(self): - self.sq.add_filter(SQ(name__gte='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 4 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' - 'zzzzzzzzzzzzzz)') - - def test_build_query_lt(self): - self.sq.add_filter(SQ(name__lt='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT VALUE_RANGE 4 m zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))') - - def test_build_query_lte(self): - self.sq.add_filter(SQ(name__lte='m')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 4 a m)') - - def test_build_query_multiple_filter_types(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(SQ(pub_date__lte=datetime.datetime(2009, 2, 10, 1, 59, 0))) - self.sq.add_filter(SQ(name__gt='david')) - self.sq.add_filter(SQ(created__lt=datetime.datetime(2009, 2, 12, 12, 13, 0))) - self.sq.add_filter(SQ(title__gte='B')) - self.sq.add_filter(SQ(id__in=[1, 2, 3])) - self.assertEqual(str(self.sq.build_query()), - 'Xapian::Query(((Zwhi OR why) AND VALUE_RANGE 6 00010101000000 20090210015900 AND ' - '( AND_NOT VALUE_RANGE 4 a david) AND ' - '( AND_NOT VALUE_RANGE 3 20090212121300 99990101000000) AND ' - 'VALUE_RANGE 8 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND ' - '(Q1 OR Q2 OR Q3)))') - - def test_log_query(self): - reset_search_queries() - self.assertEqual(len(connections['default'].queries), 0) - - # Stow. - old_debug = settings.DEBUG - settings.DEBUG = False - - len(self.sq.get_results()) - self.assertEqual(len(connections['default'].queries), 0) - - settings.DEBUG = True - # Redefine it to clear out the cached results. - self.sq = connections['default'].get_query() - self.sq.add_filter(SQ(name='bar')) - len(self.sq.get_results()) - self.assertEqual(len(connections['default'].queries), 1) - self.assertEqual(str(connections['default'].queries[0]['query_string']), 'Xapian::Query((ZXNAMEbar OR XNAMEbar))') - - # And again, for good measure. - self.sq = connections['default'].get_query() - self.sq.add_filter(SQ(name='bar')) - self.sq.add_filter(SQ(text='moof')) - len(self.sq.get_results()) - self.assertEqual(len(connections['default'].queries), 2) - self.assertEqual(str(connections['default'].queries[0]['query_string']), 'Xapian::Query((ZXNAMEbar OR XNAMEbar))') - self.assertEqual(str(connections['default'].queries[1]['query_string']), 'Xapian::Query(((ZXNAMEbar OR XNAMEbar) AND (ZXTEXTmoof OR XTEXTmoof)))') - - # Restore. - settings.DEBUG = old_debug - - -class LiveXapianSearchQuerySetTestCase(HaystackBackendTestCase, TestCase): - """ - SearchQuerySet specific tests - """ - fixtures = ['initial_data.json'] - - def get_index(self): - return LiveXapianMockSearchIndex() - - def setUp(self): - super(LiveXapianSearchQuerySetTestCase, self).setUp() - - self.backend.update(self.index, MockModel.objects.all()) - self.sq = connections['default'].get_query() - self.sqs = SearchQuerySet() - - def test_result_class(self): - # Assert that we're defaulting to ``SearchResult``. - sqs = self.sqs.all() - self.assertTrue(isinstance(sqs[0], SearchResult)) - - # Custom class. - sqs = self.sqs.result_class(MockSearchResult).all() - self.assertTrue(isinstance(sqs[0], MockSearchResult)) - - # Reset to default. - sqs = self.sqs.result_class(None).all() - self.assertTrue(isinstance(sqs[0], SearchResult)) - - def test_facet(self): - self.assertEqual(len(self.sqs.facet('name').facet_counts()['fields']['name']), 3) - - -class XapianBoostMockSearchIndex(indexes.SearchIndex): - text = indexes.CharField( - document=True, use_template=True, - template_name='search/indexes/core/mockmodel_template.txt' - ) - author = indexes.CharField(model_attr='author', weight=2.0) - editor = indexes.CharField(model_attr='editor') - pub_date = indexes.DateField(model_attr='pub_date') - - def get_model(self): - return AFourthMockModel - - -class XapianBoostBackendTestCase(HaystackBackendTestCase, TestCase): - - def get_index(self): - return XapianBoostMockSearchIndex() - - def setUp(self): - super(XapianBoostBackendTestCase, self).setUp() - - self.sample_objs = [] - for i in xrange(1, 5): - mock = AFourthMockModel() - mock.id = i - if i % 2: - mock.author = 'daniel' - mock.editor = 'david' - else: - mock.author = 'david' - mock.editor = 'daniel' - mock.pub_date = datetime.date(2009, 2, 25) - datetime.timedelta(days=i) - self.sample_objs.append(mock) - - self.backend.update(self.index, self.sample_objs) - - def test_boost(self): - sqs = SearchQuerySet() - - self.assertEqual(len(sqs.all()), 4) - - results = sqs.filter(SQ(author='daniel') | SQ(editor='daniel')) - - self.assertEqual([result.id for result in results], [ - 'core.afourthmockmodel.1', - 'core.afourthmockmodel.3', - 'core.afourthmockmodel.2', - 'core.afourthmockmodel.4' - ]) diff --git a/tests/xapian_tests/tests/test_interface.py b/tests/xapian_tests/tests/test_interface.py new file mode 100644 index 0000000..3b16a22 --- /dev/null +++ b/tests/xapian_tests/tests/test_interface.py @@ -0,0 +1,203 @@ +from __future__ import unicode_literals + +import datetime +from django.db.models import Q +from django.test import TestCase + +from haystack import connections +from haystack.inputs import AutoQuery +from haystack.query import SearchQuerySet + +from xapian_tests.models import Document +from xapian_tests.search_indexes import DocumentIndex +from xapian_tests.tests.test_backend import pks + + +class InterfaceTestCase(TestCase): + """ + Tests the interface of Xapian-Haystack. + + Tests related to usability and expected behavior + go here. + """ + + def setUp(self): + super(InterfaceTestCase, self).setUp() + + types_names = ['book', 'magazine', 'article'] + texts = ['This is a huge text', + 'This is a medium text', + 'This is a small text'] + dates = [datetime.date(year=2010, month=1, day=1), + datetime.date(year=2010, month=2, day=1), + datetime.date(year=2010, month=3, day=1)] + + summaries = ['This is a huge corrup\xe7\xe3o summary', + 'This is a medium summary', + 'This is a small summary'] + + for i in range(1, 13): + doc = Document() + doc.type_name = types_names[i % 3] + doc.number = i * 2 + doc.name = "%s %d" % (doc.type_name, doc.number) + doc.date = dates[i % 3] + + doc.summary = summaries[i % 3] + doc.text = texts[i % 3] + doc.save() + + self.index = DocumentIndex() + self.ui = connections['default'].get_unified_index() + self.ui.build(indexes=[self.index]) + + self.backend = connections['default'].get_backend() + self.backend.update(self.index, Document.objects.all()) + + self.queryset = SearchQuerySet() + + def tearDown(self): + Document.objects.all().delete() + self.backend.clear() + super(InterfaceTestCase, self).tearDown() + + def test_count(self): + self.assertEqual(self.queryset.count(), Document.objects.count()) + + def test_content_search(self): + result = self.queryset.filter(content='medium this') + self.assertEqual(sorted(pks(result)), + pks(Document.objects.all())) + + # documents with "medium" AND "this" have higher score + self.assertEqual(pks(result)[:4], [1, 4, 7, 10]) + + def test_field_search(self): + self.assertEqual(pks(self.queryset.filter(name='8')), [4]) + self.assertEqual(pks(self.queryset.filter(type_name='book')), + pks(Document.objects.filter(type_name='book'))) + + self.assertEqual(pks(self.queryset.filter(text='text huge')), + pks(Document.objects.filter(text__contains='text huge'))) + + def test_field_contains(self): + self.assertEqual(pks(self.queryset.filter(summary='huge')), + pks(Document.objects.filter(summary__contains='huge'))) + + result = self.queryset.filter(summary='huge summary') + self.assertEqual(sorted(pks(result)), + pks(Document.objects.all())) + + # documents with "huge" AND "summary" have higher score + self.assertEqual(pks(result)[:4], [3, 6, 9, 12]) + + def test_field_exact(self): + self.assertEqual(pks(self.queryset.filter(name__exact='8')), []) + self.assertEqual(pks(self.queryset.filter(name__exact='magazine 2')), [1]) + + def test_content_exact(self): + self.assertEqual(pks(self.queryset.filter(content__exact='huge')), []) + + def test_content_and(self): + self.assertEqual(pks(self.queryset.filter(content='huge').filter(summary='medium')), []) + + self.assertEqual(len(self.queryset.filter(content='huge this')), 12) + self.assertEqual(len(self.queryset.filter(content='huge this').filter(summary='huge')), 4) + + def test_content_or(self): + self.assertEqual(len(self.queryset.filter(content='huge medium')), 8) + self.assertEqual(len(self.queryset.filter(content='huge medium small')), 12) + + def test_field_and(self): + self.assertEqual(pks(self.queryset.filter(name='8').filter(name='4')), []) + + def test_field_or(self): + self.assertEqual(pks(self.queryset.filter(name='8 4')), [2, 4]) + + def test_field_in(self): + self.assertEqual(set(pks(self.queryset.filter(name__in=['magazine 2', 'article 4']))), + set(pks(Document.objects.filter(name__in=['magazine 2', 'article 4'])))) + + self.assertEqual(pks(self.queryset.filter(number__in=[4])), + pks(Document.objects.filter(number__in=[4]))) + + self.assertEqual(pks(self.queryset.filter(number__in=[4, 8])), + pks(Document.objects.filter(number__in=[4, 8]))) + + def test_private_fields(self): + self.assertEqual(pks(self.queryset.filter(django_id=4)), + pks(Document.objects.filter(id__in=[4]))) + self.assertEqual(pks(self.queryset.filter(django_id__in=[2, 4])), + pks(Document.objects.filter(id__in=[2, 4]))) + + self.assertEqual(set(pks(self.queryset.models(Document))), + set(pks(Document.objects.all()))) + + def test_field_startswith(self): + self.assertEqual(len(self.queryset.filter(name__startswith='magaz')), 4) + self.assertEqual(set(pks(self.queryset.filter(text__startswith='This is'))), + set(pks(Document.objects.filter(text__startswith='This is')))) + + def test_auto_query(self): + self.assertEqual(set(pks(self.queryset.auto_query("huge OR medium"))), + set(pks(Document.objects.filter(Q(text__contains="huge") | + Q(text__contains="medium"))))) + + self.assertEqual(set(pks(self.queryset.auto_query("huge AND medium"))), + set(pks(Document.objects.filter(Q(text__contains="huge") & + Q(text__contains="medium"))))) + + self.assertEqual(set(pks(self.queryset.auto_query("text:huge text:-this"))), + set(pks(Document.objects.filter(Q(text__contains="huge") & + ~Q(text__contains="this"))))) + + self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 OR 4"))), 2) + self.assertEqual(len(self.queryset.filter(name=AutoQuery("8 AND 4"))), 0) + + def test_value_range(self): + self.assertEqual(set(pks(self.queryset.filter(number__lt=3))), + set(pks(Document.objects.filter(number__lt=3)))) + + self.assertEqual(set(pks(self.queryset.filter(django_id__gte=6))), + set(pks(Document.objects.filter(id__gte=6)))) + + def test_date_range(self): + date = datetime.date(year=2010, month=2, day=1) + self.assertEqual(set(pks(self.queryset.filter(date__gte=date))), + set(pks(Document.objects.filter(date__gte=date)))) + + date = datetime.date(year=2010, month=3, day=1) + self.assertEqual(set(pks(self.queryset.filter(date__lte=date))), + set(pks(Document.objects.filter(date__lte=date)))) + + def test_order_by(self): + # private order + self.assertEqual(pks(self.queryset.order_by("-django_id")), + pks(Document.objects.order_by("-id"))) + + # value order + self.assertEqual(pks(self.queryset.order_by("number")), + pks(Document.objects.order_by("number"))) + + # text order + self.assertEqual(pks(self.queryset.order_by("summary")), + pks(Document.objects.order_by("summary"))) + + # date order + self.assertEqual(pks(self.queryset.order_by("-date")), + pks(Document.objects.order_by("-date"))) + + def test_non_ascii_search(self): + """ + Regression test for #119. + """ + self.assertEqual(pks(self.queryset.filter(content='corrup\xe7\xe3o')), + pks(Document.objects.filter(summary__contains='corrup\xe7\xe3o'))) + + def test_multi_values_exact_search(self): + """ + Regression test for #103 + """ + self.assertEqual(len(self.queryset.filter(tags__exact='tag')), 12) + self.assertEqual(len(self.queryset.filter(tags__exact='tag-test')), 8) + self.assertEqual(len(self.queryset.filter(tags__exact='tag-test-test')), 4) diff --git a/tests/xapian_tests/tests/test_query.py b/tests/xapian_tests/tests/test_query.py new file mode 100644 index 0000000..244cd41 --- /dev/null +++ b/tests/xapian_tests/tests/test_query.py @@ -0,0 +1,436 @@ +from __future__ import unicode_literals + +import datetime + +from django.conf import settings +from django.test import TestCase + +from haystack import indexes +from haystack import connections, reset_search_queries +from haystack.models import SearchResult +from haystack.query import SearchQuerySet, SQ + +from core.models import MockModel, AnotherMockModel, AFourthMockModel +from core.tests.mocks import MockSearchResult +from xapian_tests.tests.test_backend import HaystackBackendTestCase + + +class MockQueryIndex(indexes.SearchIndex): + text = indexes.CharField(document=True) + pub_date = indexes.DateTimeField() + title = indexes.CharField() + foo = indexes.CharField() + + def get_model(self): + return MockModel + + +class XapianSearchQueryTestCase(HaystackBackendTestCase, TestCase): + def get_index(self): + return MockQueryIndex() + + def setUp(self): + super(XapianSearchQueryTestCase, self).setUp() + self.sq = connections['default'].get_query() + + def test_build_query_all(self): + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query()') + + def test_build_query_single_word(self): + self.sq.add_filter(SQ(content='hello')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Zhello OR hello))') + + def test_build_query_single_word_not(self): + self.sq.add_filter(~SQ(content='hello')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT (Zhello OR hello)))') + + def test_build_query_single_word_field_exact(self): + self.sq.add_filter(SQ(foo='hello')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((ZXFOOhello OR XFOOhello))') + + def test_build_query_single_word_field_exact_not(self): + self.sq.add_filter(~SQ(foo='hello')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT (ZXFOOhello OR XFOOhello)))') + + def test_build_query_boolean(self): + self.sq.add_filter(SQ(content=True)) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Ztrue OR true))') + + def test_build_query_date(self): + self.sq.add_filter(SQ(content=datetime.date(2009, 5, 8))) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Z2009-05-08 OR 2009-05-08))') + + def test_build_query_date_not(self): + self.sq.add_filter(~SQ(content=datetime.date(2009, 5, 8))) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT (Z2009-05-08 OR 2009-05-08)))') + + def test_build_query_datetime(self): + self.sq.add_filter(SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Z2009-05-08 OR 2009-05-08 OR Z11:28:00 OR 11:28:00))') + + def test_build_query_datetime_not(self): + self.sq.add_filter(~SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT ' + '(Z2009-05-08 OR 2009-05-08 OR Z11:28:00 OR 11:28:00)))') + + def test_build_query_float(self): + self.sq.add_filter(SQ(content=25.52)) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Z25.52 OR 25.52))') + + def test_build_query_multiple_words_and(self): + self.sq.add_filter(SQ(content='hello')) + self.sq.add_filter(SQ(content='world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zhello OR hello) AND (Zworld OR world)))') + + def test_build_query_multiple_words_not(self): + self.sq.add_filter(~SQ(content='hello')) + self.sq.add_filter(~SQ(content='world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '( AND_NOT (Zhello OR hello)) AND ' + '( AND_NOT (Zworld OR world))))') + + def test_build_query_multiple_words_or(self): + self.sq.add_filter(SQ(content='hello') | SQ(content='world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Zhello OR hello OR Zworld OR world))') + + def test_build_query_multiple_words_or_not(self): + self.sq.add_filter(~SQ(content='hello') | ~SQ(content='world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '( AND_NOT (Zhello OR hello)) OR ' + '( AND_NOT (Zworld OR world))))') + + def test_build_query_multiple_words_mixed(self): + self.sq.add_filter(SQ(content='why') | SQ(content='hello')) + self.sq.add_filter(~SQ(content='world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '(Zwhi OR why OR Zhello OR hello) AND ' + '( AND_NOT (Zworld OR world))))') + + def test_build_query_multiple_word_field_exact(self): + self.sq.add_filter(SQ(foo='hello')) + self.sq.add_filter(SQ(title='world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '(ZXFOOhello OR XFOOhello) AND ' + '(ZXTITLEworld OR XTITLEworld)))') + + def test_build_query_multiple_word_field_exact_not(self): + self.sq.add_filter(~SQ(foo='hello')) + self.sq.add_filter(~SQ(title='world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '( AND_NOT (ZXFOOhello OR XFOOhello)) AND ' + '( AND_NOT (ZXTITLEworld OR XTITLEworld))))') + + def test_build_query_or(self): + self.sq.add_filter(SQ(content='hello world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((Zhello OR hello OR Zworld OR world))') + + def test_build_query_not_or(self): + self.sq.add_filter(~SQ(content='hello world')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(' + '( AND_NOT (Zhello OR hello OR Zworld OR world)))') + + def test_build_query_boost(self): + self.sq.add_filter(SQ(content='hello')) + self.sq.add_boost('world', 5) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '(Zhello OR hello) AND_MAYBE ' + '5 * (Zworld OR world)))') + + def test_build_query_not_in_filter_single_words(self): + self.sq.add_filter(SQ(content='why')) + self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"])) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '( AND_NOT (XTITLE^dune$ OR ' + 'XTITLE^jaws$))))') + + def test_build_query_in_filter_multiple_words(self): + self.sq.add_filter(SQ(content='why')) + self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article"])) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '((XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR ' + '(XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle))))') + + def test_build_query_in_filter_multiple_words_with_punctuation(self): + self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article", "My Store Inc."])) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query((' + '(XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR ' + '(XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle) OR ' + '(XTITLEmy PHRASE 3 XTITLEstore PHRASE 3 XTITLEinc.)))') + + def test_build_query_not_in_filter_multiple_words(self): + self.sq.add_filter(SQ(content='why')) + self.sq.add_filter(~SQ(title__in=["A Famous Paper", "An Infamous Article"])) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '( AND_NOT ((XTITLEa PHRASE 3 ' + 'XTITLEfamous PHRASE 3 ' + 'XTITLEpaper) OR (XTITLEan PHRASE 3 ' + 'XTITLEinfamous PHRASE 3 XTITLEarticle)))))') + + def test_build_query_in_filter_datetime(self): + self.sq.add_filter(SQ(content='why')) + self.sq.add_filter(SQ(pub_date__in=[datetime.datetime(2009, 7, 6, 1, 56, 21)])) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '(XPUB_DATE2009-07-06 AND_MAYBE XPUB_DATE01:56:21)))') + + def test_clean(self): + self.assertEqual(self.sq.clean('hello world'), 'hello world') + self.assertEqual(self.sq.clean('hello AND world'), 'hello AND world') + self.assertEqual(self.sq.clean('hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \ world'), + 'hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \ world') + self.assertEqual(self.sq.clean('so please NOTe i am in a bAND and bORed'), + 'so please NOTe i am in a bAND and bORed') + + def test_build_query_with_models(self): + self.sq.add_filter(SQ(content='hello')) + self.sq.add_model(MockModel) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zhello OR hello) AND ' + '0 * CONTENTTYPEcore.mockmodel))') + + self.sq.add_model(AnotherMockModel) + + self.assertTrue(str(self.sq.build_query()) in ( + 'Xapian::Query(((Zhello OR hello) AND ' + '(0 * CONTENTTYPEcore.anothermockmodel OR ' + '0 * CONTENTTYPEcore.mockmodel)))', + 'Xapian::Query(((Zhello OR hello) AND ' + '(0 * CONTENTTYPEcore.mockmodel OR ' + '0 * CONTENTTYPEcore.anothermockmodel)))')) + + def test_build_query_with_punctuation(self): + self.sq.add_filter(SQ(content='http://www.example.com')) + self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Zhttp://www.example.com OR ' + 'http://www.example.com))') + + def test_in_filter_values_list(self): + self.sq.add_filter(SQ(content='why')) + self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True))) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + '(XTITLE^1$ OR XTITLE^2$ OR XTITLE^3$)))') + + +class MockSearchIndex(indexes.SearchIndex): + text = indexes.CharField(document=True, use_template=True) + name = indexes.CharField(model_attr='author', faceted=True) + pub_date = indexes.DateTimeField(model_attr='pub_date') + title = indexes.CharField() + + def get_model(self): + return MockModel + + +class SearchQueryTestCase(HaystackBackendTestCase, TestCase): + """ + Tests expected behavior of + SearchQuery. + """ + fixtures = ['initial_data.json'] + + def get_index(self): + return MockSearchIndex() + + def setUp(self): + super(SearchQueryTestCase, self).setUp() + + self.backend.update(self.index, MockModel.objects.all()) + + self.sq = connections['default'].get_query() + + def test_get_spelling(self): + self.sq.add_filter(SQ(content='indxd')) + self.assertEqual(self.sq.get_spelling_suggestion(), 'indexed') + self.assertEqual(self.sq.get_spelling_suggestion('indxd'), 'indexed') + + def test_startswith(self): + self.sq.add_filter(SQ(name__startswith='da')) + self.assertEqual([result.pk for result in self.sq.get_results()], [1, 2, 3]) + + def test_build_query_gt(self): + self.sq.add_filter(SQ(name__gt='m')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT VALUE_RANGE 3 a m))') + + def test_build_query_gte(self): + self.sq.add_filter(SQ(name__gte='m')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzz)') + + def test_build_query_lt(self): + self.sq.add_filter(SQ(name__lt='m')) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(( AND_NOT ' + 'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))') + + def test_build_query_lte(self): + self.sq.add_filter(SQ(name__lte='m')) + self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 3 a m)') + + def test_build_query_multiple_filter_types(self): + self.sq.add_filter(SQ(content='why')) + self.sq.add_filter(SQ(pub_date__lte=datetime.datetime(2009, 2, 10, 1, 59, 0))) + self.sq.add_filter(SQ(name__gt='david')) + self.sq.add_filter(SQ(title__gte='B')) + self.sq.add_filter(SQ(django_id__in=[1, 2, 3])) + self.assertEqual(str(self.sq.build_query()), + 'Xapian::Query(((Zwhi OR why) AND ' + 'VALUE_RANGE 5 00010101000000 20090210015900 AND ' + '( AND_NOT VALUE_RANGE 3 a david) AND ' + 'VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz' + 'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND ' + '(QQ000000000001 OR QQ000000000002 OR QQ000000000003)))') + + def test_log_query(self): + reset_search_queries() + self.assertEqual(len(connections['default'].queries), 0) + + # Stow. + old_debug = settings.DEBUG + settings.DEBUG = False + + len(self.sq.get_results()) + self.assertEqual(len(connections['default'].queries), 0) + + settings.DEBUG = True + # Redefine it to clear out the cached results. + self.sq = connections['default'].get_query() + self.sq.add_filter(SQ(name='bar')) + len(self.sq.get_results()) + self.assertEqual(len(connections['default'].queries), 1) + self.assertEqual(str(connections['default'].queries[0]['query_string']), + 'Xapian::Query((ZXNAMEbar OR XNAMEbar))') + + # And again, for good measure. + self.sq = connections['default'].get_query() + self.sq.add_filter(SQ(name='bar')) + self.sq.add_filter(SQ(text='moof')) + len(self.sq.get_results()) + self.assertEqual(len(connections['default'].queries), 2) + self.assertEqual(str(connections['default'].queries[0]['query_string']), + 'Xapian::Query((' + 'ZXNAMEbar OR ' + 'XNAMEbar))') + self.assertEqual(str(connections['default'].queries[1]['query_string']), + 'Xapian::Query((' + '(ZXNAMEbar OR XNAMEbar) AND ' + '(ZXTEXTmoof OR XTEXTmoof)))') + + # Restore. + settings.DEBUG = old_debug + + +class LiveSearchQuerySetTestCase(HaystackBackendTestCase, TestCase): + """ + SearchQuerySet specific tests + """ + fixtures = ['initial_data.json'] + + def get_index(self): + return MockSearchIndex() + + def setUp(self): + super(LiveSearchQuerySetTestCase, self).setUp() + + self.backend.update(self.index, MockModel.objects.all()) + self.sq = connections['default'].get_query() + self.sqs = SearchQuerySet() + + def test_result_class(self): + # Assert that we're defaulting to ``SearchResult``. + sqs = self.sqs.all() + self.assertTrue(isinstance(sqs[0], SearchResult)) + + # Custom class. + sqs = self.sqs.result_class(MockSearchResult).all() + self.assertTrue(isinstance(sqs[0], MockSearchResult)) + + # Reset to default. + sqs = self.sqs.result_class(None).all() + self.assertTrue(isinstance(sqs[0], SearchResult)) + + def test_facet(self): + self.assertEqual(len(self.sqs.facet('name').facet_counts()['fields']['name']), 3) + + +class BoostMockSearchIndex(indexes.SearchIndex): + text = indexes.CharField( + document=True, use_template=True, + template_name='search/indexes/core/mockmodel_template.txt' + ) + author = indexes.CharField(model_attr='author', weight=2.0) + editor = indexes.CharField(model_attr='editor') + pub_date = indexes.DateField(model_attr='pub_date') + + def get_model(self): + return AFourthMockModel + + +class BoostFieldTestCase(HaystackBackendTestCase, TestCase): + """ + Tests boosted fields. + """ + + def get_index(self): + return BoostMockSearchIndex() + + def setUp(self): + super(BoostFieldTestCase, self).setUp() + + self.sample_objs = [] + for i in range(1, 5): + mock = AFourthMockModel() + mock.id = i + if i % 2: + mock.author = 'daniel' + mock.editor = 'david' + else: + mock.author = 'david' + mock.editor = 'daniel' + mock.pub_date = datetime.date(2009, 2, 25) - datetime.timedelta(days=i) + self.sample_objs.append(mock) + + self.backend.update(self.index, self.sample_objs) + + def test_boost(self): + sqs = SearchQuerySet() + + self.assertEqual(len(sqs.all()), 4) + + results = sqs.filter(SQ(author='daniel') | SQ(editor='daniel')) + + self.assertEqual([result.id for result in results], [ + 'core.afourthmockmodel.1', + 'core.afourthmockmodel.3', + 'core.afourthmockmodel.2', + 'core.afourthmockmodel.4' + ]) diff --git a/tests/xapian_tests/tests/test_xapian_query.py b/tests/xapian_tests/tests/test_xapian_query.py deleted file mode 100644 index adb3678..0000000 --- a/tests/xapian_tests/tests/test_xapian_query.py +++ /dev/null @@ -1,169 +0,0 @@ -from __future__ import unicode_literals - -import datetime -import os -import shutil - -from django.conf import settings -from django.test import TestCase - -from haystack import connections -from haystack.query import SQ - -from core.models import MockModel, AnotherMockModel - - -class XapianSearchQueryTestCase(TestCase): - def setUp(self): - super(XapianSearchQueryTestCase, self).setUp() - self.sq = connections['default'].get_query() - - def tearDown(self): - if os.path.exists(settings.HAYSTACK_CONNECTIONS['default']['PATH']): - shutil.rmtree(settings.HAYSTACK_CONNECTIONS['default']['PATH']) - - super(XapianSearchQueryTestCase, self).tearDown() - - def test_build_query_all(self): - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query()') - - def test_build_query_single_word(self): - self.sq.add_filter(SQ(content='hello')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Zhello OR hello))') - - def test_build_query_single_word_not(self): - self.sq.add_filter(~SQ(content='hello')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (Zhello OR hello)))') - - def test_build_query_single_word_field_exact(self): - self.sq.add_filter(SQ(foo='hello')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((ZXFOOhello OR XFOOhello))') - - def test_build_query_single_word_field_exact_not(self): - self.sq.add_filter(~SQ(foo='hello')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (ZXFOOhello OR XFOOhello)))') - - def test_build_query_boolean(self): - self.sq.add_filter(SQ(content=True)) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Ztrue OR true))') - - def test_build_query_date(self): - self.sq.add_filter(SQ(content=datetime.date(2009, 5, 8))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Z20090508000000 OR 20090508000000))') - - def test_build_query_date_not(self): - self.sq.add_filter(~SQ(content=datetime.date(2009, 5, 8))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (Z20090508000000 OR 20090508000000)))') - - def test_build_query_datetime(self): - self.sq.add_filter(SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Z20090508112800 OR 20090508112800))') - - def test_build_query_datetime_not(self): - self.sq.add_filter(~SQ(content=datetime.datetime(2009, 5, 8, 11, 28))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (Z20090508112800 OR 20090508112800)))') - - def test_build_query_float(self): - self.sq.add_filter(SQ(content=25.52)) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Z25.52 OR 25.52))') - - def test_build_query_multiple_words_and(self): - self.sq.add_filter(SQ(content='hello')) - self.sq.add_filter(SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zhello OR hello) AND (Zworld OR world)))') - - def test_build_query_multiple_words_not(self): - self.sq.add_filter(~SQ(content='hello')) - self.sq.add_filter(~SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((( AND_NOT (Zhello OR hello)) AND ( AND_NOT (Zworld OR world))))') - - def test_build_query_multiple_words_or(self): - self.sq.add_filter(SQ(content='hello') | SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Zhello OR hello OR Zworld OR world))') - - def test_build_query_multiple_words_or_not(self): - self.sq.add_filter(~SQ(content='hello') | ~SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((( AND_NOT (Zhello OR hello)) OR ( AND_NOT (Zworld OR world))))') - - def test_build_query_multiple_words_mixed(self): - self.sq.add_filter(SQ(content='why') | SQ(content='hello')) - self.sq.add_filter(~SQ(content='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why OR Zhello OR hello) AND ( AND_NOT (Zworld OR world))))') - - def test_build_query_multiple_word_field_exact(self): - self.sq.add_filter(SQ(foo='hello')) - self.sq.add_filter(SQ(bar='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((ZXFOOhello OR XFOOhello) AND (ZXBARworld OR XBARworld)))') - - def test_build_query_multiple_word_field_exact_not(self): - self.sq.add_filter(~SQ(foo='hello')) - self.sq.add_filter(~SQ(bar='world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((( AND_NOT (ZXFOOhello OR XFOOhello)) AND ( AND_NOT (ZXBARworld OR XBARworld))))') - - def test_build_query_phrase(self): - self.sq.add_filter(SQ(content='hello world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((hello PHRASE 2 world))') - - def test_build_query_phrase_not(self): - self.sq.add_filter(~SQ(content='hello world')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(( AND_NOT (hello PHRASE 2 world)))') - - def test_build_query_boost(self): - self.sq.add_filter(SQ(content='hello')) - self.sq.add_boost('world', 5) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zhello OR hello) AND_MAYBE 5 * (Zworld OR world)))') - - def test_build_query_in_filter_single_words(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(SQ(title__in=["Dune", "Jaws"])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND (ZXTITLEdune OR XTITLEdune OR ZXTITLEjaw OR XTITLEjaws)))') - - def test_build_query_not_in_filter_single_words(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(~SQ(title__in=["Dune", "Jaws"])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ( AND_NOT (ZXTITLEdune OR XTITLEdune OR ZXTITLEjaw OR XTITLEjaws))))') - - def test_build_query_in_filter_multiple_words(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article"])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ((XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR (XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle))))') - - def test_build_query_in_filter_multiple_words_with_punctuation(self): - self.sq.add_filter(SQ(title__in=["A Famous Paper", "An Infamous Article", "My Store Inc."])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR (XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle) OR (XTITLEmy PHRASE 3 XTITLEstore PHRASE 3 XTITLEinc.)))') - - def test_build_query_not_in_filter_multiple_words(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(~SQ(title__in=["A Famous Paper", "An Infamous Article"])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND ( AND_NOT ((XTITLEa PHRASE 3 XTITLEfamous PHRASE 3 XTITLEpaper) OR (XTITLEan PHRASE 3 XTITLEinfamous PHRASE 3 XTITLEarticle)))))') - - def test_build_query_in_filter_datetime(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(SQ(pub_date__in=[datetime.datetime(2009, 7, 6, 1, 56, 21)])) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND (ZXPUB_DATE20090706015621 OR XPUB_DATE20090706015621)))') - - def test_clean(self): - self.assertEqual(self.sq.clean('hello world'), 'hello world') - self.assertEqual(self.sq.clean('hello AND world'), 'hello AND world') - self.assertEqual(self.sq.clean('hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \ world'), 'hello AND OR NOT TO + - && || ! ( ) { } [ ] ^ " ~ * ? : \ world') - self.assertEqual(self.sq.clean('so please NOTe i am in a bAND and bORed'), 'so please NOTe i am in a bAND and bORed') - - def test_build_query_with_models(self): - self.sq.add_filter(SQ(content='hello')) - self.sq.add_model(MockModel) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zhello OR hello) AND 0 * CONTENTTYPEcore.mockmodel))') - - self.sq.add_model(AnotherMockModel) - - self.assertTrue(str(self.sq.build_query()) in ( - 'Xapian::Query(((Zhello OR hello) AND (0 * CONTENTTYPEcore.anothermockmodel OR 0 * CONTENTTYPEcore.mockmodel)))', - 'Xapian::Query(((Zhello OR hello) AND (0 * CONTENTTYPEcore.mockmodel OR 0 * CONTENTTYPEcore.anothermockmodel)))')) - - def test_build_query_with_punctuation(self): - self.sq.add_filter(SQ(content='http://www.example.com')) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((Zhttp://www.example.com OR http://www.example.com))') - - def test_in_filter_values_list(self): - self.sq.add_filter(SQ(content='why')) - self.sq.add_filter(SQ(title__in=MockModel.objects.values_list('id', flat=True))) - self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND (ZXTITLE1 OR XTITLE1 OR ZXTITLE2 OR XTITLE2 OR ZXTITLE3 OR XTITLE3)))') diff --git a/xapian_backend.py b/xapian_backend.py index 1196395..7eb9799 100755 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -16,6 +16,7 @@ from haystack import connections from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, SearchNode, log_query from haystack.constants import ID, DJANGO_ID, DJANGO_CT from haystack.exceptions import HaystackError, MissingDependency +from haystack.inputs import AutoQuery from haystack.models import SearchResult from haystack.utils import get_identifier, get_model_ct @@ -48,6 +49,18 @@ DEFAULT_XAPIAN_FLAGS = ( xapian.QueryParser.FLAG_PURE_NOT ) +# number of documents checked by default when building facets +# this must be improved to be relative to the total number of docs. +DEFAULT_CHECK_AT_LEAST = 1000 + +# field types accepted to be serialized as values in Xapian +FIELD_TYPES = {'text', 'integer', 'date', 'datetime', 'float', 'boolean'} + +# defines the format used to store types in Xapian +# this format ensures datetimes are sorted correctly +DATETIME_FORMAT = '%Y%m%d%H%M%S' +INTEGER_FORMAT = '%012d' + class InvalidIndexError(HaystackError): """Raised when an index can not be opened.""" @@ -76,30 +89,33 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor): begin = begin[colon + 1:len(begin)] for field_dict in self.backend.schema: if field_dict['field_name'] == field_name: + field_type = field_dict['type'] + if not begin: - if field_dict['type'] == 'text': + if field_type == 'text': begin = 'a' # TODO: A better way of getting a min text value? - elif field_dict['type'] == 'long': - begin = -sys.maxint - 1 - elif field_dict['type'] == 'float': + elif field_type == 'integer': + begin = -sys.maxsize - 1 + elif field_type == 'float': begin = float('-inf') - elif field_dict['type'] == 'date' or field_dict['type'] == 'datetime': + elif field_type == 'date' or field_type == 'datetime': begin = '00010101000000' elif end == '*': - if field_dict['type'] == 'text': + if field_type == 'text': end = 'z' * 100 # TODO: A better way of getting a max text value? - elif field_dict['type'] == 'long': - end = sys.maxint - elif field_dict['type'] == 'float': + elif field_type == 'integer': + end = sys.maxsize + elif field_type == 'float': end = float('inf') - elif field_dict['type'] == 'date' or field_dict['type'] == 'datetime': + elif field_type == 'date' or field_type == 'datetime': end = '99990101000000' - if field_dict['type'] == 'float': - begin = _marshal_value(float(begin)) - end = _marshal_value(float(end)) - elif field_dict['type'] == 'long': - begin = _marshal_value(long(begin)) - end = _marshal_value(long(end)) + + if field_type == 'float': + begin = _term_to_xapian_value(float(begin), field_type) + end = _term_to_xapian_value(float(end), field_type) + elif field_type == 'integer': + begin = _term_to_xapian_value(int(begin), field_type) + end = _term_to_xapian_value(int(end), field_type) return field_dict['column'], str(begin), str(end) @@ -163,12 +179,18 @@ class XapianSearchBackend(BaseSearchBackend): # these 4 attributes are caches populated in `build_schema` # they are checked in `_update_cache` - self._fields = None - self._schema = None + # use property to retrieve them + self._fields = {} + self._schema = [] self._content_field_name = None self._columns = {} def _update_cache(self): + """ + To avoid build_schema every time, we cache + some values: they only change when a SearchIndex + changes, which typically restarts the Python. + """ fields = connections[self.connection_alias].get_unified_index().all_searchfields() if self._fields != fields: self._fields = fields @@ -184,12 +206,13 @@ class XapianSearchBackend(BaseSearchBackend): self._update_cache() return self._content_field_name - def column(self, field_name): + @property + def column(self): """ Returns the column in the database of a given field name. """ self._update_cache() - return self._columns[field_name] + return self._columns def update(self, index, iterable): """ @@ -238,15 +261,42 @@ class XapianSearchBackend(BaseSearchBackend): if self.include_spelling is True: term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING) + def add_text(termpos, prefix, term, weight): + term_generator.set_termpos(termpos + 1) + term_generator.index_text(term, weight) + term_generator.index_text(term, weight, prefix) + term_generator.increase_termpos() + return term_generator.get_termpos() + for obj in iterable: document = xapian.Document() term_generator.set_document(document) + def add_to_document(prefix, term, weight): + document.add_term('%s' % term, weight) + document.add_term(prefix + term, weight) + document.add_term(prefix + '^%s$' % term, weight) + + def add_datetime_to_document(termpos, prefix, term, weight): + date, time = term.split() + document.add_posting(date, termpos, weight) + termpos += 1 + document.add_posting(time, termpos, weight) + termpos += 1 + document.add_posting(prefix + date, termpos, weight) + termpos += 1 + document.add_posting(prefix + time, termpos, weight) + termpos += 1 + return termpos + data = index.full_prepare(obj) weights = index.get_field_weights() + + termpos = 0 for field in self.schema: + termpos += 1 # not supported fields are ignored. - if field['field_name'] not in data.keys(): + if field['field_name'] not in list(data.keys()): continue if field['field_name'] in weights: @@ -254,34 +304,47 @@ class XapianSearchBackend(BaseSearchBackend): else: weight = 1 + value = data[field['field_name']] + # Private fields are indexed in a different way: + # `django_id` is an int and `django_ct` is text; + # besides, they are indexed by their (unstemmed) value. if field['field_name'] in ('id', 'django_id', 'django_ct'): - term = data[field['field_name']] - - # django_id is always an integer, thus we send - # it to _marshal_value as int to guarantee it - # is stored as a sortable number. if field['field_name'] == 'django_id': - term = int(term) - term = _marshal_value(term) + value = int(value) + value = _term_to_xapian_value(value, field['type']) - document.add_term(TERM_PREFIXES[field['field_name']] + term, weight) - document.add_value(field['column'], term) + document.add_term(TERM_PREFIXES[field['field_name']] + value, weight) + document.add_value(field['column'], value) + continue else: - value = data[field['field_name']] prefix = TERM_PREFIXES['field'] + field['field_name'].upper() + # if not multi_valued, we add as a document value + # for sorting and facets if field['multi_valued'] == 'false': - document.add_value(field['column'], _marshal_value(value)) - value = [value] + document.add_value(field['column'], _term_to_xapian_value(value, field['type'])) + else: + for t in value: + # add the exact match of each value + term = _to_xapian_term(t) + add_to_document(prefix, term, weight) + # index each value with positional information + if ' ' in term: + termpos = add_text(termpos, prefix, term, weight) + continue - for term in value: - term = _marshal_term(term) - if field['type'] == 'text': - term_generator.index_text(term, weight) - term_generator.index_text(term, weight, prefix) - if len(term.split()) == 1: - document.add_term(term, weight) - document.add_term(prefix + term, weight) + term = _to_xapian_term(value) + # from here on the term is a string; + # we now decide how it is indexed + + if field['type'] == 'text': + # text is indexed with positional information + termpos = add_text(termpos, prefix, term, weight) + elif field['type'] == 'datetime': + termpos = add_datetime_to_document(termpos, prefix, term, weight) + if term != "": + # all other terms are added without positional information + add_to_document(prefix, term, weight) # store data without indexing it document.set_data(pickle.dumps( @@ -361,6 +424,18 @@ class XapianSearchBackend(BaseSearchBackend): return query + def _check_field_names(self, field_names): + """ + Raises InvalidIndexError if any of a field_name in field_names is + not indexed. + """ + if field_names: + for field_name in field_names: + try: + self.column[field_name] + except KeyError: + raise InvalidIndexError('Trying to use non indexed field "%s"' % field_name) + @log_query def search(self, query, sort_by=None, start_offset=0, end_offset=None, fields='', highlight=False, facets=None, date_facets=None, @@ -409,6 +484,10 @@ class XapianSearchBackend(BaseSearchBackend): 'hits': 0, } + self._check_field_names(facets) + self._check_field_names(date_facets) + self._check_field_names(query_facets) + database = self._database() if result_class is None: @@ -443,7 +522,7 @@ class XapianSearchBackend(BaseSearchBackend): sort_field = sort_field[1:] # Strip the '-' else: reverse = False # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311 - sorter.add(self.column(sort_field), reverse) + sorter.add(self.column[sort_field], reverse) enquire.set_sort_by_key_then_relevance(sorter, True) @@ -457,6 +536,12 @@ class XapianSearchBackend(BaseSearchBackend): if not end_offset: end_offset = database.get_doccount() - start_offset + ## prepare spies in case of facets + if facets: + facets_spies = self._prepare_facet_field_spies(facets) + for spy in facets_spies: + enquire.add_matchspy(spy) + matches = self._get_enquire_mset(database, enquire, start_offset, end_offset) for match in matches: @@ -472,9 +557,18 @@ class XapianSearchBackend(BaseSearchBackend): ) if facets: - facets_dict['fields'] = self._do_field_facets(results, facets) + # pick single valued facets from spies + single_facets_dict = self._process_facet_field_spies(facets_spies) + + # pick multivalued valued facets from results + multi_facets_dict = self._do_multivalued_field_facets(results, facets) + + # merge both results (http://stackoverflow.com/a/38990/931303) + facets_dict['fields'] = dict(list(single_facets_dict.items()) + list(multi_facets_dict.items())) + if date_facets: facets_dict['dates'] = self._do_date_facets(results, date_facets) + if query_facets: facets_dict['queries'] = self._do_query_facets(results, query_facets) @@ -641,7 +735,7 @@ class XapianSearchBackend(BaseSearchBackend): 'multi_valued': 'false', 'column': 0}, {'field_name': DJANGO_ID, - 'type': 'long', + 'type': 'integer', 'multi_valued': 'false', 'column': 1}, {'field_name': DJANGO_CT, @@ -655,7 +749,7 @@ class XapianSearchBackend(BaseSearchBackend): column = len(schema_fields) - for field_name, field_class in sorted(fields.items(), key=lambda n: n[0]): + for field_name, field_class in sorted(list(fields.items()), key=lambda n: n[0]): if field_class.document is True: content_field_name = field_class.index_fieldname @@ -667,10 +761,12 @@ class XapianSearchBackend(BaseSearchBackend): 'column': column, } - if field_class.field_type in ['date', 'datetime']: + if field_class.field_type == 'date': field_data['type'] = 'date' + elif field_class.field_type == 'datetime': + field_data['type'] = 'datetime' elif field_class.field_type == 'integer': - field_data['type'] = 'long' + field_data['type'] = 'integer' elif field_class.field_type == 'float': field_data['type'] = 'float' elif field_class.field_type == 'boolean': @@ -705,33 +801,58 @@ class XapianSearchBackend(BaseSearchBackend): return content - def _do_field_facets(self, results, field_facets): + def _prepare_facet_field_spies(self, facets): """ - Private method that facets a document by field name. + Returns a list of spies based on the facets + used to count frequencies. + """ + spies = [] + for facet in facets: + slot = self.column[facet] + spy = xapian.ValueCountMatchSpy(slot) + # add attribute "slot" to know which column this spy is targeting. + spy.slot = slot + spies.append(spy) + return spies - Fields of type MultiValueField will be faceted on each item in the - (containing) list. + def _process_facet_field_spies(self, spies): + """ + Returns a dict of facet names with lists of + tuples of the form (term, term_frequency) + from a list of spies that observed the enquire. + """ + facet_dict = {} + for spy in spies: + field = self.schema[spy.slot] + field_name, field_type = field['field_name'], field['type'] - Required arguments: - `results` -- A list SearchResults to facet - `field_facets` -- A list of fields to facet on + facet_dict[field_name] = [] + for facet in list(spy.values()): + facet_dict[field_name].append((_from_xapian_value(facet.term, field_type), + facet.termfreq)) + return facet_dict + + def _do_multivalued_field_facets(self, results, field_facets): + """ + Implements a multivalued field facet on the results. + + This is implemented using brute force - O(N^2) - + because Xapian does not have it implemented yet + (see http://trac.xapian.org/ticket/199) """ facet_dict = {} - # DS_TODO: Improve this algorithm. Currently, runs in O(N^2), ouch. for field in field_facets: facet_list = {} + if not self._multi_value_field(field): + continue for result in results: field_value = getattr(result, field) - if self._multi_value_field(field): - for item in field_value: # Facet each item in a MultiValueField - facet_list[item] = facet_list.get(item, 0) + 1 - else: - facet_list[field_value] = facet_list.get(field_value, 0) + 1 - - facet_dict[field] = facet_list.items() + for item in field_value: # Facet each item in a MultiValueField + facet_list[item] = facet_list.get(item, 0) + 1 + facet_dict[field] = list(facet_list.items()) return facet_dict @staticmethod @@ -765,7 +886,7 @@ class XapianSearchBackend(BaseSearchBackend): """ facet_dict = {} - for date_facet, facet_params in date_facets.iteritems(): + for date_facet, facet_params in list(date_facets.items()): gap_type = facet_params.get('gap_by') gap_value = facet_params.get('gap_amount', 1) date_range = facet_params['start_date'] @@ -831,8 +952,7 @@ class XapianSearchBackend(BaseSearchBackend): eg. {'name': ('a*', 5)} """ facet_dict = {} - - for field, query in dict(query_facets).items(): + for field, query in list(dict(query_facets).items()): facet_dict[field] = (query, self.search(self.parse_query(query))['hits']) return facet_dict @@ -887,7 +1007,7 @@ class XapianSearchBackend(BaseSearchBackend): return database @staticmethod - def _get_enquire_mset(database, enquire, start_offset, end_offset): + def _get_enquire_mset(database, enquire, start_offset, end_offset, checkatleast=DEFAULT_CHECK_AT_LEAST): """ A safer version of Xapian.enquire.get_mset @@ -901,10 +1021,10 @@ class XapianSearchBackend(BaseSearchBackend): `end_offset` -- The end offset to pass to `enquire.get_mset` """ try: - return enquire.get_mset(start_offset, end_offset) + return enquire.get_mset(start_offset, end_offset, checkatleast) except xapian.DatabaseModifiedError: database.reopen() - return enquire.get_mset(start_offset, end_offset) + return enquire.get_mset(start_offset, end_offset, checkatleast) @staticmethod def _get_document_data(database, document): @@ -989,8 +1109,9 @@ class XapianSearchQuery(BaseSearchQuery): if self.boost: subqueries = [ xapian.Query( - xapian.Query.OP_SCALE_WEIGHT, self._content_field(term, False), value - ) for term, value in self.boost.iteritems() + xapian.Query.OP_SCALE_WEIGHT, + self._term_query(term, None, None), value + ) for term, value in list(self.boost.items()) ] query = xapian.Query( xapian.Query.OP_AND_MAYBE, query, @@ -1009,169 +1130,264 @@ class XapianSearchQuery(BaseSearchQuery): ) else: expression, term = child - field, filter_type = search_node.split_expression(expression) + field_name, filter_type = search_node.split_expression(expression) - # Handle when we've got a ``ValuesListQuerySet``... - if hasattr(term, 'values_list'): - term = list(term) - - if isinstance(term, (list, tuple)): - term = [_marshal_term(t) for t in term] - else: - term = _marshal_term(term) - - if field == 'content': - query_list.append(self._content_field(term, is_not)) - else: - if filter_type == 'contains': - query_list.append(self._filter_contains(term, field, is_not)) - elif filter_type == 'exact': - query_list.append(self._filter_exact(term, field, is_not)) - elif filter_type == 'gt': - query_list.append(self._filter_gt(term, field, is_not)) - elif filter_type == 'gte': - query_list.append(self._filter_gte(term, field, is_not)) - elif filter_type == 'lt': - query_list.append(self._filter_lt(term, field, is_not)) - elif filter_type == 'lte': - query_list.append(self._filter_lte(term, field, is_not)) - elif filter_type == 'startswith': - query_list.append(self._filter_startswith(term, field, is_not)) - elif filter_type == 'in': - query_list.append(self._filter_in(term, field, is_not)) + constructed_query_list = self._query_from_term(term, field_name, filter_type, is_not) + query_list.extend(constructed_query_list) if search_node.connector == 'OR': return xapian.Query(xapian.Query.OP_OR, query_list) else: return xapian.Query(xapian.Query.OP_AND, query_list) - def _content_field(self, term, is_not): + def _query_from_term(self, term, field_name, filter_type, is_not): """ - Private method that returns a xapian.Query that searches for `value` - in all fields. - - Required arguments: - ``term`` -- The term to search for - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query + Uses arguments to construct a list of xapian.Query's. """ - # it is more than one term, we build a PHRASE - if ' ' in term: - query = self._phrase_query(term.split(), self.backend.content_field_name, is_content=True) - else: - query = self._term_query(term) + if field_name != 'content' and field_name not in self.backend.column: + raise InvalidIndexError('field "%s" not indexed' % field_name) - if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) - else: - return query - - def _filter_contains(self, term, field, is_not): - """ - Private method that returns a xapian.Query that searches for `term` - in a specified `field`. - - Required arguments: - ``term`` -- The term to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query - """ - if ' ' in term: - return self._filter_exact(term, field, is_not) - else: - query = self._term_query(term, field) - if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) + # It it is an AutoQuery, it has no filters + # or others, thus we short-circuit the procedure. + if isinstance(term, AutoQuery): + if field_name != 'content': + query = '%s:%s' % (field_name, term.prepare(self)) else: - return query - - def _filter_exact(self, term, field, is_not): - """ - Private method that returns a xapian.Query that searches for an exact - match for `term` in a specified `field`. - - Required arguments: - ``term`` -- The term to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query - """ - query = self._phrase_query(term.split(), field) - if is_not: - return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) - else: - return query - - def _filter_in(self, term_list, field, is_not): - """ - Private method that returns a xapian.Query that searches for any term - of `value_list` in a specified `field`. - - Required arguments: - ``term_list`` -- The terms to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query - """ + query = term.prepare(self) + return [self.backend.parse_query(query)] query_list = [] - for term in term_list: - if ' ' in term: - query_list.append( - self._phrase_query(term.split(), field) - ) + + # Handle `ValuesListQuerySet`. + if hasattr(term, 'values_list'): + term = list(term) + + if field_name == 'content': + # content is the generic search: + # force no field_name search + # and the field_type to be 'text'. + field_name = None + field_type = 'text' + + # we don't know what is the type(term), so we parse it. + # Ideally this would not be required, but + # some filters currently depend on the term to make decisions. + term = _to_xapian_term(term) + + query_list.append(self._filter_contains(term, field_name, field_type, is_not)) + # when filter has no filter_type, haystack uses + # filter_type = 'contains'. Here we remove it + # since the above query is already doing this + if filter_type == 'contains': + filter_type = None + else: + # get the field_type from the backend + field_type = self.backend.schema[self.backend.column[field_name]]['type'] + + # private fields don't accept 'contains' or 'startswith' + # since they have no meaning. + if filter_type in ('contains', 'startswith') and field_name in ('id', 'django_id', 'django_ct'): + filter_type = 'exact' + + if field_type == 'text': + # we don't know what type "term" is, but we know we are searching as text + # so we parse it like that. + # Ideally this would not be required since _term_query does it, but + # some filters currently depend on the term to make decisions. + if isinstance(term, list): + term = [_to_xapian_term(term) for term in term] else: - query_list.append( - self._term_query(term, field) - ) + term = _to_xapian_term(term) + + # todo: we should check that the filter is valid for this field_type or raise InvalidIndexError + if filter_type == 'contains': + query_list.append(self._filter_contains(term, field_name, field_type, is_not)) + elif filter_type == 'exact': + query_list.append(self._filter_exact(term, field_name, field_type, is_not)) + elif filter_type == 'in': + query_list.append(self._filter_in(term, field_name, field_type, is_not)) + elif filter_type == 'startswith': + query_list.append(self._filter_startswith(term, field_name, field_type, is_not)) + elif filter_type == 'gt': + query_list.append(self._filter_gt(term, field_name, field_type, is_not)) + elif filter_type == 'gte': + query_list.append(self._filter_gte(term, field_name, field_type, is_not)) + elif filter_type == 'lt': + query_list.append(self._filter_lt(term, field_name, field_type, is_not)) + elif filter_type == 'lte': + query_list.append(self._filter_lte(term, field_name, field_type, is_not)) + return query_list + + def _all_query(self): + """ + Returns a match all query. + """ + return xapian.Query('') + + def _filter_contains(self, term, field_name, field_type, is_not): + """ + Splits the sentence in terms and join them with OR, + using stemmed and un-stemmed. + + Assumes term is not a list. + """ + if field_type == 'text': + term_list = term.split() + else: + term_list = [term] + + query = self._or_query(term_list, field_name, field_type) + if is_not: + return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) + else: + return query + + def _filter_in(self, term_list, field_name, field_type, is_not): + """ + Returns a query that matches exactly ANY term in term_list. + + Notice that: + A in {B,C} <=> (A = B or A = C) + ~(A in {B,C}) <=> ~(A = B or A = C) + Because OP_AND_NOT(C, D) <=> (C and ~D), then D=(A in {B,C}) requires `is_not=False`. + + Assumes term is a list. + """ + query_list = [self._filter_exact(term, field_name, field_type, is_not=False) + for term in term_list] + if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), xapian.Query(xapian.Query.OP_OR, query_list)) else: return xapian.Query(xapian.Query.OP_OR, query_list) - def _filter_startswith(self, term, field, is_not): + def _filter_exact(self, term, field_name, field_type, is_not): """ - Private method that returns a xapian.Query that searches for any term - that begins with `term` in a specified `field`. + Returns a query that matches exactly the un-stemmed term + with positional order. - Required arguments: - ``term`` -- The terms to search for - ``field`` -- The field to search - ``is_not`` -- Invert the search results - - Returns: - A xapian.Query + Assumes term is not a list. """ + + # this is an hack: + # the ideal would be to use the same idea as in _filter_contains. + # However, it causes tests to fail. + if field_type == 'text' and ' ' in term: + query = self._phrase_query(term.split(), field_name, field_type) + else: + query = self._term_query(term, field_name, field_type, exact=True, stemmed=False) + if is_not: - return xapian.Query( - xapian.Query.OP_AND_NOT, - self._all_query(), - self.backend.parse_query('%s:%s*' % (field, term)), - ) - return self.backend.parse_query('%s:%s*' % (field, term)) + return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) + else: + return query - def _filter_gt(self, term, field, is_not): - return self._filter_lte(term, field, is_not=not is_not) + def _filter_startswith(self, term, field_name, field_type, is_not): + """ + Returns a startswith query on the un-stemmed term. - def _filter_lt(self, term, field, is_not): - return self._filter_gte(term, field, is_not=not is_not) + Assumes term is not a list. + """ + # TODO: if field_type is of type integer, we need to marsh the value. + if field_name: + query_string = '%s:%s*' % (field_name, term) + else: + query_string = '%s*' % term - def _filter_gte(self, term, field, is_not): + query = self.backend.parse_query(query_string) + + if is_not: + return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query) + return query + + def _or_query(self, term_list, field, field_type, exact=False): + """ + Joins each item of term_list decorated by _term_query with an OR. + """ + term_list = [self._term_query(term, field, field_type, exact) for term in term_list] + return xapian.Query(xapian.Query.OP_OR, term_list) + + def _phrase_query(self, term_list, field_name, field_type): + """ + Returns a query that matches exact terms with + positional order (i.e. ["this", "thing"] != ["thing", "this"]) + and no stem. + + If `field_name` is not `None`, restrict to the field. + """ + term_list = [self._term_query(term, field_name, field_type, + stemmed=False) for term in term_list] + + query = xapian.Query(xapian.Query.OP_PHRASE, term_list) + return query + + def _term_query(self, term, field_name, field_type, exact=False, stemmed=True): + """ + Constructs a query of a single term. + + If `field_name` is not `None`, the term is search on that field only. + If exact is `True`, the search is restricted to boolean matches. + """ + # using stemmed terms in exact query is not acceptable. + if stemmed: + assert not exact + + constructor = '{prefix}{term}' + # ^{term}$ is for boolean match of the term + if exact: + constructor = '{prefix}^{term}$' + + # construct the prefix to be used. + prefix = '' + if field_name: + prefix = TERM_PREFIXES['field'] + field_name.upper() + term = _to_xapian_term(term) + + if field_name in ('id', 'django_id', 'django_ct'): + # to ensure the value is serialized correctly. + if field_name == 'django_id': + term = int(term) + term = _term_to_xapian_value(term, field_type) + return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term)) + + # we construct the query dates in a slightly different way + if field_type == 'datetime': + date, time = term.split() + constructor = '{prefix}{term}' + return xapian.Query(xapian.Query.OP_AND_MAYBE, + constructor.format(prefix=prefix, term=date), + constructor.format(prefix=prefix, term=time) + ) + + # only use stem if field is text or "None" + if field_type not in ('text', None): + stemmed = False + + unstemmed_term = constructor.format(prefix=prefix, term=term) + if stemmed: + stem = xapian.Stem(self.backend.language) + stemmed_term = 'Z' + constructor.format(prefix=prefix, term=stem(term).decode('utf-8')) + + return xapian.Query(xapian.Query.OP_OR, + xapian.Query(stemmed_term), + xapian.Query(unstemmed_term) + ) + else: + return xapian.Query(unstemmed_term) + + def _filter_gt(self, term, field_name, field_type, is_not): + return self._filter_lte(term, field_name, field_type, is_not=not is_not) + + def _filter_lt(self, term, field_name, field_type, is_not): + return self._filter_gte(term, field_name, field_type, is_not=not is_not) + + def _filter_gte(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for any term that is greater than `term` in a specified `field`. """ vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:%s' % (field, _marshal_value(term)), '*') + pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term, field_type)), '*') if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), @@ -1179,13 +1395,13 @@ class XapianSearchQuery(BaseSearchQuery): ) return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - def _filter_lte(self, term, field, is_not): + def _filter_lte(self, term, field_name, field_type, is_not): """ Private method that returns a xapian.Query that searches for any term that is less than `term` in a specified `field`. """ vrp = XHValueRangeProcessor(self.backend) - pos, begin, end = vrp('%s:' % field, '%s' % _marshal_value(term)) + pos, begin, end = vrp('%s:' % field_name, '%s' % _term_to_xapian_value(term, field_type)) if is_not: return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), @@ -1193,117 +1409,79 @@ class XapianSearchQuery(BaseSearchQuery): ) return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end) - @staticmethod - def _all_query(): - """ - Private method that returns a xapian.Query that returns all documents, - Returns: - A xapian.Query - """ - return xapian.Query('') - - def _term_query(self, term, field=None): - """ - Private method that returns a term based xapian.Query that searches - for `term`. - - Required arguments: - ``term`` -- The term to search for - ``field`` -- The field to search (If `None`, all fields) - - Returns: - A xapian.Query - """ - stem = xapian.Stem(self.backend.language) - - if field in ('id', 'django_id', 'django_ct'): - return xapian.Query('%s%s' % (TERM_PREFIXES[field], term)) - elif field: - stemmed = 'Z%s%s%s' % ( - TERM_PREFIXES['field'], field.upper(), stem(term) - ) - unstemmed = '%s%s%s' % ( - TERM_PREFIXES['field'], field.upper(), term - ) - else: - stemmed = 'Z%s' % stem(term) - unstemmed = term - - return xapian.Query( - xapian.Query.OP_OR, - xapian.Query(stemmed), - xapian.Query(unstemmed) - ) - - @staticmethod - def _phrase_query(term_list, field=None, is_content=False): - """ - Private method that returns a phrase based xapian.Query that searches - for terms in `term_list. - - Required arguments: - ``term_list`` -- The terms to search for - ``field`` -- The field to search (If `None`, all fields) - - Returns: - A xapian.Query - """ - if field and not is_content: - term_list = ['%s%s%s' % (TERM_PREFIXES['field'], field.upper(), term) for term in term_list] - return xapian.Query(xapian.Query.OP_PHRASE, term_list) - - -def _marshal_value(value): +def _term_to_xapian_value(term, field_type): """ - Private utility method that converts Python values to a string for Xapian values. + Converts a term to a serialized + Xapian value based on the field_type. """ - if isinstance(value, datetime.datetime): - value = _marshal_datetime(value) - elif isinstance(value, datetime.date): - value = _marshal_date(value) - elif isinstance(value, bool): - if value: + assert field_type in FIELD_TYPES + + def strf(dt): + """ + Equivalent to datetime.datetime.strptime(dt, DATETIME_FORMAT) + but accepts years below 1900 (see http://stackoverflow.com/q/10263956/931303) + """ + return '%04d%02d%02d%02d%02d%02d' % ( + dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second) + + if field_type == 'boolean': + assert isinstance(term, bool) + if term: value = 't' else: value = 'f' - elif isinstance(value, float): - value = xapian.sortable_serialise(value) - elif isinstance(value, (int, long)): - value = '%012d' % value - else: - value = force_text(value).lower() + + elif field_type == 'integer': + value = INTEGER_FORMAT % term + elif field_type == 'float': + value = xapian.sortable_serialise(term) + elif field_type == 'date' or field_type == 'datetime': + if field_type == 'date': + # http://stackoverflow.com/a/1937636/931303 and comments + term = datetime.datetime.combine(term, datetime.time()) + value = strf(term) + else: # field_type == 'text' + value = _to_xapian_term(term) + return value -def _marshal_term(term): +def _to_xapian_term(term): """ - Private utility method that converts Python terms to a string for Xapian terms. + Converts a Python type to a + Xapian term that can be indexed. """ - if isinstance(term, datetime.datetime): - term = _marshal_datetime(term) - elif isinstance(term, datetime.date): - term = _marshal_date(term) - else: - term = force_text(term).lower() - return term + return force_text(term).lower() -def _marshal_date(d): - return '%04d%02d%02d000000' % (d.year, d.month, d.day) +def _from_xapian_value(value, field_type): + """ + Converts a serialized Xapian value + to Python equivalent based on the field_type. - -def _marshal_datetime(dt): - if dt.microsecond: - return '%04d%02d%02d%02d%02d%02d%06d' % ( - dt.year, dt.month, dt.day, dt.hour, - dt.minute, dt.second, dt.microsecond - ) - else: - return '%04d%02d%02d%02d%02d%02d' % ( - dt.year, dt.month, dt.day, dt.hour, - dt.minute, dt.second - ) + Doesn't accept multivalued fields. + """ + assert field_type in FIELD_TYPES + if field_type == 'boolean': + if value == 't': + return True + elif value == 'f': + return False + else: + InvalidIndexError('Field type "%d" does not accept value "%s"' % (field_type, value)) + elif field_type == 'integer': + return int(value) + elif field_type == 'float': + return xapian.sortable_unserialise(value) + elif field_type == 'date' or field_type == 'datetime': + datetime_value = datetime.datetime.strptime(value, DATETIME_FORMAT) + if field_type == 'datetime': + return datetime_value + else: + return datetime_value.date() + else: # field_type == 'text' + return value class XapianEngine(BaseEngine):