Fixed #90 - Added fields django_id and django_ct.

- Fields are not fully indexed because they are not text.
- Added test of order_by django_id.
- Simplified code for updating index.
This commit is contained in:
Jorge C. Leitão 2014-05-14 21:34:39 +02:00
parent 65c6e9a71b
commit afb9958c4f
3 changed files with 118 additions and 66 deletions

View file

@ -161,8 +161,7 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase):
Tests that all fields are in the database
"""
terms = get_terms(self.backend, '-a')
for field in ['id', 'author', 'pub_date', 'text']:
for field in ['author', 'pub_date', 'text']:
is_inside = False
for term in terms:
if "X%s" % field.upper() in term:
@ -415,23 +414,25 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase):
def test_build_schema(self):
(content_field_name, fields) = self.backend.build_schema(connections['default'].get_unified_index().all_searchfields())
self.assertEqual(content_field_name, 'text')
self.assertEqual(len(fields), 15)
self.assertEqual(len(fields), 14 + 3)
self.assertEqual(fields, [
{'column': 0, 'type': 'text', 'field_name': 'id', 'multi_valued': 'false'},
{'column': 1, 'type': 'text', 'field_name': 'empty', 'multi_valued': 'false'},
{'column': 2, 'type': 'date', 'field_name': 'exp_date', 'multi_valued': 'false'},
{'column': 3, 'type': 'boolean', 'field_name': 'flag', 'multi_valued': 'false'},
{'column': 4, 'type': 'text', 'field_name': 'keys', 'multi_valued': 'true'},
{'column': 5, 'type': 'text', 'field_name': 'name', 'multi_valued': 'false'},
{'column': 6, 'type': 'text', 'field_name': 'name_exact', 'multi_valued': 'false'},
{'column': 7, 'type': 'float', 'field_name': 'popularity', 'multi_valued': 'false'},
{'column': 8, 'type': 'date', 'field_name': 'pub_date', 'multi_valued': 'false'},
{'column': 9, 'type': 'text', 'field_name': 'sites', 'multi_valued': 'true'},
{'column': 10, 'type': 'text', 'field_name': 'tags', 'multi_valued': 'true'},
{'column': 11, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'},
{'column': 12, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'},
{'column': 13, 'type': 'text', 'field_name': 'url', 'multi_valued': 'false'},
{'column': 14, 'type': 'long', 'field_name': 'value', 'multi_valued': 'false'}
{'column': 1, 'type': 'long', 'field_name': 'django_id', 'multi_valued': 'false'},
{'column': 2, 'type': 'text', 'field_name': 'django_ct', 'multi_valued': 'false'},
{'column': 3, 'type': 'text', 'field_name': 'empty', 'multi_valued': 'false'},
{'column': 4, 'type': 'date', 'field_name': 'exp_date', 'multi_valued': 'false'},
{'column': 5, 'type': 'boolean', 'field_name': 'flag', 'multi_valued': 'false'},
{'column': 6, 'type': 'text', 'field_name': 'keys', 'multi_valued': 'true'},
{'column': 7, 'type': 'text', 'field_name': 'name', 'multi_valued': 'false'},
{'column': 8, 'type': 'text', 'field_name': 'name_exact', 'multi_valued': 'false'},
{'column': 9, 'type': 'float', 'field_name': 'popularity', 'multi_valued': 'false'},
{'column': 10, 'type': 'date', 'field_name': 'pub_date', 'multi_valued': 'false'},
{'column': 11, 'type': 'text', 'field_name': 'sites', 'multi_valued': 'true'},
{'column': 12, 'type': 'text', 'field_name': 'tags', 'multi_valued': 'true'},
{'column': 13, 'type': 'text', 'field_name': 'text', 'multi_valued': 'false'},
{'column': 14, 'type': 'text', 'field_name': 'titles', 'multi_valued': 'true'},
{'column': 15, 'type': 'text', 'field_name': 'url', 'multi_valued': 'false'},
{'column': 16, 'type': 'long', 'field_name': 'value', 'multi_valued': 'false'}
])
def test_parse_query(self):
@ -444,15 +445,39 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase):
self.assertEqual(str(self.backend.parse_query('name:da*')), 'Xapian::Query((XNAMEdavid1:(pos=1) OR XNAMEdavid2:(pos=1) OR XNAMEdavid3:(pos=1)))')
self.assertEqual(str(self.backend.parse_query('name:david1..david2')),
'Xapian::Query(VALUE_RANGE 5 david1 david2)')
'Xapian::Query(VALUE_RANGE 7 david1 david2)')
self.assertEqual(str(self.backend.parse_query('value:0..10')),
'Xapian::Query(VALUE_RANGE 14 000000000000 000000000010)')
'Xapian::Query(VALUE_RANGE 16 000000000000 000000000010)')
self.assertEqual(str(self.backend.parse_query('value:..10')),
'Xapian::Query(VALUE_RANGE 14 %012d 000000000010)' % (-sys.maxint - 1))
'Xapian::Query(VALUE_RANGE 16 %012d 000000000010)' % (-sys.maxint - 1))
self.assertEqual(str(self.backend.parse_query('value:10..*')),
'Xapian::Query(VALUE_RANGE 14 000000000010 %012d)' % sys.maxint)
'Xapian::Query(VALUE_RANGE 16 000000000010 %012d)' % sys.maxint)
self.assertEqual(str(self.backend.parse_query('popularity:25.5..100.0')),
b'Xapian::Query(VALUE_RANGE 7 \xb2` \xba@)')
b'Xapian::Query(VALUE_RANGE 9 \xb2` \xba@)')
def test_order_by_django_id(self):
    """
    Tests that results can be sorted by `django_id` as a number.

    The ids 1..100 span one to three digits, so an ordering that compared
    them as plain strings would not be numerically descending.
    """
    number_list = list(range(1, 101))

    self.sample_objs = []
    for i in number_list:
        mock = XapianMockModel()
        mock.id = i
        mock.author = 'david%s' % i
        mock.pub_date = datetime.date(2009, 2, 25) - datetime.timedelta(days=i)
        mock.exp_date = datetime.date(2009, 2, 23) + datetime.timedelta(days=i)
        mock.value = i * 5
        mock.flag = bool(i % 2)
        mock.slug = 'http://example.com/%d/' % i
        mock.url = 'http://example.com/%d/' % i
        mock.popularity = i * 2
        self.sample_objs.append(mock)

    # start from an empty index so that the hit count below only reflects
    # the objects created in this test
    self.backend.clear()
    self.backend.update(self.index, self.sample_objs)

    results = self.backend.search(xapian.Query(''), sort_by=['-django_id'])
    self.assertEqual(results['hits'], len(number_list))
    self.assertEqual([result.pk for result in results['results']],
                     list(reversed(number_list)))
class LiveXapianMockSearchIndex(indexes.SearchIndex):
@ -493,21 +518,21 @@ class LiveXapianSearchQueryTestCase(HaystackBackendTestCase, TestCase):
def test_build_query_gt(self):
self.sq.add_filter(SQ(name__gt='m'))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((<alldocuments> AND_NOT VALUE_RANGE 2 a m))')
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((<alldocuments> AND_NOT VALUE_RANGE 4 a m))')
def test_build_query_gte(self):
self.sq.add_filter(SQ(name__gte='m'))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 2 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz'
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 4 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzz)')
def test_build_query_lt(self):
self.sq.add_filter(SQ(name__lt='m'))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((<alldocuments> AND_NOT VALUE_RANGE 2 m zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))')
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((<alldocuments> AND_NOT VALUE_RANGE 4 m zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))')
def test_build_query_lte(self):
self.sq.add_filter(SQ(name__lte='m'))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 2 a m)')
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 4 a m)')
def test_build_query_multiple_filter_types(self):
self.sq.add_filter(SQ(content='why'))
@ -516,7 +541,7 @@ class LiveXapianSearchQueryTestCase(HaystackBackendTestCase, TestCase):
self.sq.add_filter(SQ(created__lt=datetime.datetime(2009, 2, 12, 12, 13, 0)))
self.sq.add_filter(SQ(title__gte='B'))
self.sq.add_filter(SQ(id__in=[1, 2, 3]))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND VALUE_RANGE 3 00010101000000 20090210015900 AND (<alldocuments> AND_NOT VALUE_RANGE 2 a david) AND (<alldocuments> AND_NOT VALUE_RANGE 1 20090212121300 99990101000000) AND VALUE_RANGE 5 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND (Q1 OR Q2 OR Q3)))')
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zwhi OR why) AND VALUE_RANGE 5 00010101000000 20090210015900 AND (<alldocuments> AND_NOT VALUE_RANGE 4 a david) AND (<alldocuments> AND_NOT VALUE_RANGE 3 20090212121300 99990101000000) AND VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND (Q1 OR Q2 OR Q3)))')
def test_log_query(self):
reset_search_queries()

View file

@ -151,13 +151,13 @@ class XapianSearchQueryTestCase(TestCase):
def test_build_query_with_models(self):
self.sq.add_filter(SQ(content='hello'))
self.sq.add_model(MockModel)
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zhello OR hello) AND 0 * XCONTENTTYPEcore.mockmodel))')
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(((Zhello OR hello) AND 0 * CONTENTTYPEcore.mockmodel))')
self.sq.add_model(AnotherMockModel)
self.assertTrue(str(self.sq.build_query()) in (
'Xapian::Query(((Zhello OR hello) AND (0 * XCONTENTTYPEcore.anothermockmodel OR 0 * XCONTENTTYPEcore.mockmodel)))',
'Xapian::Query(((Zhello OR hello) AND (0 * XCONTENTTYPEcore.mockmodel OR 0 * XCONTENTTYPEcore.anothermockmodel)))'))
'Xapian::Query(((Zhello OR hello) AND (0 * CONTENTTYPEcore.anothermockmodel OR 0 * CONTENTTYPEcore.mockmodel)))',
'Xapian::Query(((Zhello OR hello) AND (0 * CONTENTTYPEcore.mockmodel OR 0 * CONTENTTYPEcore.anothermockmodel)))'))
def test_build_query_with_punctuation(self):
self.sq.add_filter(SQ(content='http://www.example.com'))

View file

@ -14,7 +14,7 @@ from django.utils.encoding import force_text
from haystack import connections
from haystack.backends import BaseEngine, BaseSearchBackend, BaseSearchQuery, SearchNode, log_query
from haystack.constants import ID
from haystack.constants import ID, DJANGO_ID, DJANGO_CT
from haystack.exceptions import HaystackError, MissingDependency
from haystack.models import SearchResult
from haystack.utils import get_identifier, get_model_ct
@ -26,12 +26,17 @@ except ImportError:
"Please refer to the documentation.")
# The prefix we add to identify that the term refers to a specific ID.
DOCUMENT_ID_TERM_PREFIX = 'Q'
# The prefix we add to identify that the term refers to a specific Field.
DOCUMENT_CUSTOM_TERM_PREFIX = 'X'
# The prefix we add to identify that the term refers to specific ContentType.
DOCUMENT_CT_TERM_PREFIX = DOCUMENT_CUSTOM_TERM_PREFIX + 'CONTENTTYPE'
# this maps the different reserved fields to prefixes used to
# create the database:
# id str: unique document id.
# django_id int: id of the django model instance.
# django_ct str: content type of the django model.
# field str: name of the field of the index.
TERM_PREFIXES = {'id': 'Q',
'django_id': 'QQ',
'django_ct': 'CONTENTTYPE',
'field': 'X'
}
MEMORY_DB_NAME = ':memory:'
@ -104,9 +109,9 @@ class XHExpandDecider(xapian.ExpandDecider):
Return True if the term should be used for expanding the search
query, False otherwise.
Currently, we only want to ignore terms beginning with `DOCUMENT_CT_TERM_PREFIX`
Ignore terms related to the content type of objects.
"""
if term.startswith(DOCUMENT_CT_TERM_PREFIX):
if term.startswith(TERM_PREFIXES['django_ct']):
return False
return True
@ -236,21 +241,26 @@ class XapianSearchBackend(BaseSearchBackend):
term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
term_generator.set_document(document)
document_id = DOCUMENT_ID_TERM_PREFIX + get_identifier(obj)
data = index.full_prepare(obj)
weights = index.get_field_weights()
for field in self.schema:
if field['field_name'] in data.keys():
prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
prefix = TERM_PREFIXES['field'] + field['field_name'].upper()
value = data[field['field_name']]
try:
weight = int(weights[field['field_name']])
except KeyError:
weight = 1
if field['field_name'] == 'id':
term = _marshal_term(value)
document.add_term(prefix + term, weight)
document.add_value(field['column'], _marshal_value(value))
if field['field_name'] in ('id', 'django_id', 'django_ct'):
term = value
# django_id is always an integer, thus we send it to
# _marshal_value to guarantee it can be sorted as a number.
if field['field_name'] == 'django_id':
term = int(term)
term = _marshal_value(term)
document.add_term(TERM_PREFIXES[field['field_name']] + term, weight)
document.add_value(field['column'], term)
elif field['type'] == 'text':
if field['multi_valued'] == 'false':
term = _marshal_term(value)
@ -282,14 +292,17 @@ class XapianSearchBackend(BaseSearchBackend):
document.add_term(term, weight)
document.add_term(prefix + term, weight)
# store data without indexing it
document.set_data(pickle.dumps(
(obj._meta.app_label, obj._meta.module_name, obj.pk, data),
pickle.HIGHEST_PROTOCOL
))
# add the id of the document
document_id = TERM_PREFIXES['id'] + get_identifier(obj)
document.add_term(document_id)
document.add_term(
DOCUMENT_CT_TERM_PREFIX + get_model_ct(obj)
)
# finally, replace or add the document to the database
database.replace_document(document_id, document)
except UnicodeDecodeError:
@ -307,7 +320,7 @@ class XapianSearchBackend(BaseSearchBackend):
should be unique to this object.
"""
database = self._database(writable=True)
database.delete_document(DOCUMENT_ID_TERM_PREFIX + get_identifier(obj))
database.delete_document(TERM_PREFIXES['id'] + get_identifier(obj))
database.close()
def clear(self, models=(), commit=True):
@ -334,9 +347,7 @@ class XapianSearchBackend(BaseSearchBackend):
else:
database = self._database(writable=True)
for model in models:
database.delete_document(
DOCUMENT_CT_TERM_PREFIX + get_model_ct(model)
)
database.delete_document(TERM_PREFIXES['django_ct'] + get_model_ct(model))
database.close()
def document_count(self):
@ -351,7 +362,7 @@ class XapianSearchBackend(BaseSearchBackend):
"""
registered_models_ct = self.build_models_list()
if registered_models_ct:
restrictions = [xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, model_ct))
restrictions = [xapian.Query('%s%s' % (TERM_PREFIXES['django_ct'], model_ct))
for model_ct in registered_models_ct]
limit_query = xapian.Query(xapian.Query.OP_OR, restrictions)
@ -520,7 +531,7 @@ class XapianSearchBackend(BaseSearchBackend):
if result_class is None:
result_class = SearchResult
query = xapian.Query(DOCUMENT_ID_TERM_PREFIX + get_identifier(model_instance))
query = xapian.Query(TERM_PREFIXES['id'] + get_identifier(model_instance))
enquire = xapian.Enquire(database)
enquire.set_query(query)
@ -539,7 +550,7 @@ class XapianSearchBackend(BaseSearchBackend):
match.document.termlist_count()
)
query = xapian.Query(
xapian.Query.OP_AND_NOT, [query, DOCUMENT_ID_TERM_PREFIX + get_identifier(model_instance)]
xapian.Query.OP_AND_NOT, [query, TERM_PREFIXES['id'] + get_identifier(model_instance)]
)
if limit_to_registered_models:
@ -590,12 +601,17 @@ class XapianSearchBackend(BaseSearchBackend):
qp.set_database(self._database())
qp.set_stemmer(xapian.Stem(self.language))
qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
qp.add_boolean_prefix('django_ct', DOCUMENT_CT_TERM_PREFIX)
qp.add_boolean_prefix('django_ct', TERM_PREFIXES['django_ct'])
for field_dict in self.schema:
# since 'django_ct' has a boolean_prefix,
# we ignore it here.
if field_dict['field_name'] == 'django_ct':
continue
qp.add_prefix(
field_dict['field_name'],
DOCUMENT_CUSTOM_TERM_PREFIX + field_dict['field_name'].upper()
TERM_PREFIXES['field'] + field_dict['field_name'].upper()
)
vrp = XHValueRangeProcessor(self)
@ -620,9 +636,22 @@ class XapianSearchBackend(BaseSearchBackend):
"""
content_field_name = ''
schema_fields = [
{'field_name': ID, 'type': 'text', 'multi_valued': 'false', 'column': 0},
{'field_name': ID,
'type': 'text',
'multi_valued': 'false',
'column': 0},
{'field_name': DJANGO_ID,
'type': 'long',
'multi_valued': 'false',
'column': 1},
{'field_name': DJANGO_CT,
'type': 'text',
'multi_valued': 'false',
'column': 2},
]
self._columns[ID] = 0
self._columns[DJANGO_ID] = 1
self._columns[DJANGO_CT] = 2
column = len(schema_fields)
@ -948,7 +977,7 @@ class XapianSearchQuery(BaseSearchQuery):
subqueries = [
xapian.Query(
xapian.Query.OP_SCALE_WEIGHT,
xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, get_model_ct(model))),
xapian.Query('%s%s' % (TERM_PREFIXES['django_ct'], get_model_ct(model))),
0 # Pure boolean sub-query
) for model in self.models
]
@ -1188,16 +1217,14 @@ class XapianSearchQuery(BaseSearchQuery):
"""
stem = xapian.Stem(self.backend.language)
if field == 'id':
return xapian.Query('%s%s' % (DOCUMENT_ID_TERM_PREFIX, term))
elif field == 'django_ct':
return xapian.Query('%s%s' % (DOCUMENT_CT_TERM_PREFIX, term))
if field in ('id', 'django_id', 'django_ct'):
return xapian.Query('%s%s' % (TERM_PREFIXES[field], term))
elif field:
stemmed = 'Z%s%s%s' % (
DOCUMENT_CUSTOM_TERM_PREFIX, field.upper(), stem(term)
TERM_PREFIXES['field'], field.upper(), stem(term)
)
unstemmed = '%s%s%s' % (
DOCUMENT_CUSTOM_TERM_PREFIX, field.upper(), term
TERM_PREFIXES['field'], field.upper(), term
)
else:
stemmed = 'Z%s' % stem(term)
@ -1223,7 +1250,7 @@ class XapianSearchQuery(BaseSearchQuery):
A xapian.Query
"""
if field and not is_content:
term_list = ['%s%s%s' % (DOCUMENT_CUSTOM_TERM_PREFIX, field.upper(), term) for term in term_list]
term_list = ['%s%s%s' % (TERM_PREFIXES['field'], field.upper(), term) for term in term_list]
return xapian.Query(xapian.Query.OP_PHRASE, term_list)