Made general improvements to the code.

This commit is contained in:
Jorge C. Leitão 2014-05-18 14:07:53 +02:00
parent e85b503aaa
commit 8c14898088
2 changed files with 160 additions and 105 deletions

View file

@ -12,7 +12,7 @@ from django.test import TestCase
from haystack import connections, reset_search_queries
from haystack import indexes
from haystack.backends.xapian_backend import InvalidIndexError, _marshal_value
from haystack.backends.xapian_backend import InvalidIndexError, _term_to_xapian_value
from haystack.models import SearchResult
from haystack.query import SearchQuerySet, SQ
from haystack.utils.loading import UnifiedIndex
@ -156,7 +156,7 @@ class XapianBackendTestCase(HaystackBackendTestCase, TestCase):
mock = XapianMockModel()
mock.id = 1
mock.author = 'david'
mock.pub_date = datetime.date(2009, 2, 25)
mock.pub_date = datetime.datetime(2009, 2, 25)
self.backend.update(self.index, [mock])
@ -311,12 +311,6 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase):
"""
self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas'])
def test_raise_index_error_on_wrong_field(self):
"""
Regression test for #109.
"""
self.assertRaises(InvalidIndexError, self.backend.search, xapian.Query(''), facets=['dsdas'])
def test_date_facets(self):
facets = {'pub_date': {'start_date': datetime.datetime(2008, 10, 26),
'end_date': datetime.datetime(2009, 3, 26),
@ -448,17 +442,21 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase):
self.assertEqual([result.month for result in self.backend.search(xapian.Query(''))['results']],
['02', '02', '02'])
def test__marshal_value(self):
self.assertEqual(_marshal_value('abc'), 'abc')
self.assertEqual(_marshal_value(1), '000000000001')
self.assertEqual(_marshal_value(2653), '000000002653')
self.assertEqual(_marshal_value(25.5), b'\xb2`')
self.assertEqual(_marshal_value([1, 2, 3]), '[1, 2, 3]')
self.assertEqual(_marshal_value((1, 2, 3)), '(1, 2, 3)')
self.assertEqual(_marshal_value({'a': 1, 'c': 3, 'b': 2}), "{u'a': 1, u'c': 3, u'b': 2}")
self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 9, 16, 14)), '20090509161400')
self.assertEqual(_marshal_value(datetime.datetime(2009, 5, 9, 0, 0)), '20090509000000')
self.assertEqual(_marshal_value(datetime.datetime(1899, 5, 18, 0, 0)), '18990518000000')
def test_term_to_xapian_value(self):
self.assertEqual(_term_to_xapian_value('abc', 'text'), 'abc')
self.assertEqual(_term_to_xapian_value(1, 'integer'), '000000000001')
self.assertEqual(_term_to_xapian_value(2653, 'integer'), '000000002653')
self.assertEqual(_term_to_xapian_value(25.5, 'float'), b'\xb2`')
self.assertEqual(_term_to_xapian_value([1, 2, 3], 'text'), '[1, 2, 3]')
self.assertEqual(_term_to_xapian_value((1, 2, 3), 'text'), '(1, 2, 3)')
self.assertEqual(_term_to_xapian_value({'a': 1, 'c': 3, 'b': 2}, 'text'),
"{u'a': 1, u'c': 3, u'b': 2}")
self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 16, 14), 'datetime'),
'20090509161400')
self.assertEqual(_term_to_xapian_value(datetime.datetime(2009, 5, 9, 0, 0), 'date'),
'20090509000000')
self.assertEqual(_term_to_xapian_value(datetime.datetime(1899, 5, 18, 0, 0), 'date'),
'18990518000000')
def test_build_schema(self):
search_fields = connections['default'].get_unified_index().all_searchfields()
@ -558,8 +556,7 @@ class XapianSearchBackendTestCase(HaystackBackendTestCase, TestCase):
class LiveXapianMockSearchIndex(indexes.SearchIndex):
text = indexes.CharField(document=True, use_template=True)
name = indexes.CharField(model_attr='author', faceted=True)
pub_date = indexes.DateField(model_attr='pub_date')
created = indexes.DateField()
pub_date = indexes.DateTimeField(model_attr='pub_date')
title = indexes.CharField()
def get_model(self):
@ -593,35 +590,41 @@ class LiveXapianSearchQueryTestCase(HaystackBackendTestCase, TestCase):
def test_build_query_gt(self):
self.sq.add_filter(SQ(name__gt='m'))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((<alldocuments> AND_NOT VALUE_RANGE 4 a m))')
self.assertEqual(str(self.sq.build_query()),
'Xapian::Query((<alldocuments> AND_NOT VALUE_RANGE 3 a m))')
def test_build_query_gte(self):
self.sq.add_filter(SQ(name__gte='m'))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 4 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzz)')
self.assertEqual(str(self.sq.build_query()),
'Xapian::Query(VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzz)')
def test_build_query_lt(self):
self.sq.add_filter(SQ(name__lt='m'))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query((<alldocuments> AND_NOT VALUE_RANGE 4 m zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))')
self.assertEqual(str(self.sq.build_query()),
'Xapian::Query((<alldocuments> AND_NOT '
'VALUE_RANGE 3 m zzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz))')
def test_build_query_lte(self):
self.sq.add_filter(SQ(name__lte='m'))
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 4 a m)')
self.assertEqual(str(self.sq.build_query()), 'Xapian::Query(VALUE_RANGE 3 a m)')
def test_build_query_multiple_filter_types(self):
self.sq.add_filter(SQ(content='why'))
self.sq.add_filter(SQ(pub_date__lte=datetime.datetime(2009, 2, 10, 1, 59, 0)))
self.sq.add_filter(SQ(name__gt='david'))
self.sq.add_filter(SQ(created__lt=datetime.datetime(2009, 2, 12, 12, 13, 0)))
self.sq.add_filter(SQ(title__gte='B'))
self.sq.add_filter(SQ(id__in=[1, 2, 3]))
self.sq.add_filter(SQ(django_id__in=[1, 2, 3]))
self.assertEqual(str(self.sq.build_query()),
'Xapian::Query(((Zwhi OR why) AND VALUE_RANGE 6 00010101000000 20090210015900 AND '
'(<alldocuments> AND_NOT VALUE_RANGE 4 a david) AND '
'(<alldocuments> AND_NOT VALUE_RANGE 3 20090212121300 99990101000000) AND '
'VALUE_RANGE 8 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND '
'(Q1 OR Q2 OR Q3)))')
'Xapian::Query(((Zwhi OR why) AND '
'VALUE_RANGE 5 00010101000000 20090210015900 AND '
'(<alldocuments> AND_NOT VALUE_RANGE 3 a david) AND '
'VALUE_RANGE 7 b zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz'
'zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz AND '
'(QQ000000000001 OR QQ000000000002 OR QQ000000000003)))')
def test_log_query(self):
reset_search_queries()

View file

@ -53,6 +53,14 @@ DEFAULT_XAPIAN_FLAGS = (
# this must be improved to be relative to the total number of docs.
DEFAULT_CHECK_AT_LEAST = 1000
# field types accepted to be serialized as values in Xapian
FIELD_TYPES = {'text', 'integer', 'date', 'datetime', 'float', 'boolean'}
# defines the format used to store types in Xapian
# this format ensures datetimes are sorted correctly
DATETIME_FORMAT = '%Y%m%d%H%M%S'
INTEGER_FORMAT = '%012d'
class InvalidIndexError(HaystackError):
"""Raised when an index can not be opened."""
@ -102,12 +110,12 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor):
elif field_type == 'date' or field_type == 'datetime':
end = '99990101000000'
if field_dict['type'] == 'float':
begin = _marshal_value(float(begin))
end = _marshal_value(float(end))
elif field_dict['type'] == 'integer':
begin = _marshal_value(int(begin))
end = _marshal_value(int(end))
if field_type == 'float':
begin = _term_to_xapian_value(float(begin), field_type)
end = _term_to_xapian_value(float(end), field_type)
elif field_type == 'integer':
begin = _term_to_xapian_value(int(begin), field_type)
end = _term_to_xapian_value(int(end), field_type)
return field_dict['column'], str(begin), str(end)
@ -262,28 +270,27 @@ class XapianSearchBackend(BaseSearchBackend):
else:
weight = 1
value = data[field['field_name']]
# Private fields are indexed in a different way:
# `django_id` is an int and `django_ct` is text;
# besides, they are indexed by their (unstemmed) value.
if field['field_name'] in ('id', 'django_id', 'django_ct'):
term = data[field['field_name']]
# django_id is always an integer, thus we send
# it to _marshal_value as int to guarantee it
# is stored as a sortable number.
if field['field_name'] == 'django_id':
term = int(term)
term = _marshal_value(term)
value = int(value)
value = _term_to_xapian_value(value, field['type'])
document.add_term(TERM_PREFIXES[field['field_name']] + term, weight)
document.add_value(field['column'], term)
document.add_term(TERM_PREFIXES[field['field_name']] + value, weight)
document.add_value(field['column'], value)
else:
value = data[field['field_name']]
prefix = TERM_PREFIXES['field'] + field['field_name'].upper()
# if not multi_valued, we add a value and construct a one-element list
if field['multi_valued'] == 'false':
document.add_value(field['column'], _marshal_value(value))
document.add_value(field['column'], _term_to_xapian_value(value, field['type']))
value = [value]
for term in value:
term = _marshal_term(term)
term = _to_xapian_term(term)
if field['type'] == 'text':
term_generator.index_text(term, weight)
term_generator.index_text(term, weight, prefix)
@ -769,10 +776,12 @@ class XapianSearchBackend(BaseSearchBackend):
facet_dict = {}
for spy in spies:
field = self.schema[spy.slot]
field_name = field['field_name']
field_name, field_type = field['field_name'], field['type']
facet_dict[field_name] = []
for facet in spy.values():
facet_dict[field_name].append((_xapian_to_python(facet.term), facet.termfreq))
facet_dict[field_name].append((_from_xapian_value(facet.term, field_type),
facet.termfreq))
return facet_dict
def _do_multivalued_field_facets(self, results, field_facets):
@ -1087,11 +1096,6 @@ class XapianSearchQuery(BaseSearchQuery):
if hasattr(term, 'values_list'):
term = list(term)
if isinstance(term, (list, tuple)):
term = [_marshal_term(t) for t in term]
else:
term = _marshal_term(term)
if field_name == 'content':
# content is the generic search:
# force no field_name search
@ -1106,9 +1110,14 @@ class XapianSearchQuery(BaseSearchQuery):
if filter_type == 'contains':
filter_type = None
else:
# pick the field_type from the backend
# get the field_type from the backend
field_type = self.backend.schema[self.backend.column(field_name)]['type']
# private fields don't accept 'contains' or 'startswith'
# since they have no meaning.
if filter_type in ('contains', 'startswith') and field_name in ('id', 'django_id', 'django_ct'):
filter_type = 'exact'
if filter_type == 'contains':
query_list.append(self._filter_contains(term, field_name, field_type, is_not))
elif filter_type == 'exact':
@ -1118,13 +1127,13 @@ class XapianSearchQuery(BaseSearchQuery):
elif filter_type == 'startswith':
query_list.append(self._filter_startswith(term, field_name, field_type, is_not))
elif filter_type == 'gt':
query_list.append(self._filter_gt(term, field_name, is_not))
query_list.append(self._filter_gt(term, field_name, field_type, is_not))
elif filter_type == 'gte':
query_list.append(self._filter_gte(term, field_name, is_not))
query_list.append(self._filter_gte(term, field_name, field_type, is_not))
elif filter_type == 'lt':
query_list.append(self._filter_lt(term, field_name, is_not))
query_list.append(self._filter_lt(term, field_name, field_type, is_not))
elif filter_type == 'lte':
query_list.append(self._filter_lte(term, field_name, is_not))
query_list.append(self._filter_lte(term, field_name, field_type, is_not))
if search_node.connector == 'OR':
return xapian.Query(xapian.Query.OP_OR, query_list)
@ -1133,20 +1142,21 @@ class XapianSearchQuery(BaseSearchQuery):
def _all_query(self):
"""
Private method that returns a xapian.Query that returns all documents,
Returns:
A xapian.Query
Returns a match all query.
"""
return xapian.Query('')
def _filter_contains(self, sentence, field_name, field_type, is_not):
def _filter_contains(self, term, field_name, field_type, is_not):
"""
Splits the sentence in terms and join them with OR,
using stemmed and un-stemmed.
"""
query = self._or_query(sentence.split(), field_name, field_type)
if field_type == 'text':
term_list = term.split()
else:
term_list = [term]
query = self._or_query(term_list, field_name, field_type)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
else:
@ -1175,10 +1185,15 @@ class XapianSearchQuery(BaseSearchQuery):
Returns a query that matches exactly the un-stemmed term
with positional order.
"""
if ' ' in term:
# this is a hack:
# the ideal would be to use the same idea as in _filter_contains.
# However, it causes tests to fail.
if field_type == 'text' and ' ' in term:
query = self._phrase_query(term.split(), field_name, field_type)
else:
query = self._term_query(term, field_name, field_type, exact=True, stemmed=False)
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT, self._all_query(), query)
else:
@ -1236,7 +1251,7 @@ class XapianSearchQuery(BaseSearchQuery):
# to ensure the value is serialized correctly.
if field_name == 'django_id':
term = int(term)
term = _marshal_value(term)
term = _term_to_xapian_value(term, field_type)
return xapian.Query('%s%s' % (TERM_PREFIXES[field_name], term))
constructor = '{prefix}{term}'
@ -1248,7 +1263,7 @@ class XapianSearchQuery(BaseSearchQuery):
prefix = ''
if field_name:
prefix = TERM_PREFIXES['field'] + field_name.upper()
term = _marshal_value(term)
term = _to_xapian_term(term)
unstemmed_term = constructor.format(prefix=prefix, term=term)
if stemmed:
@ -1262,19 +1277,19 @@ class XapianSearchQuery(BaseSearchQuery):
else:
return unstemmed_term
def _filter_gt(self, term, field, is_not):
return self._filter_lte(term, field, is_not=not is_not)
def _filter_gt(self, term, field_name, field_type, is_not):
return self._filter_lte(term, field_name, field_type, is_not=not is_not)
def _filter_lt(self, term, field, is_not):
return self._filter_gte(term, field, is_not=not is_not)
def _filter_lt(self, term, field_name, field_type, is_not):
return self._filter_gte(term, field_name, field_type, is_not=not is_not)
def _filter_gte(self, term, field, is_not):
def _filter_gte(self, term, field_name, field_type, is_not):
"""
Private method that returns a xapian.Query that searches for any term
that is greater than `term` in a specified `field`.
"""
vrp = XHValueRangeProcessor(self.backend)
pos, begin, end = vrp('%s:%s' % (field, _marshal_value(term)), '*')
pos, begin, end = vrp('%s:%s' % (field_name, _term_to_xapian_value(term, field_type)), '*')
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT,
self._all_query(),
@ -1282,13 +1297,13 @@ class XapianSearchQuery(BaseSearchQuery):
)
return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
def _filter_lte(self, term, field, is_not):
def _filter_lte(self, term, field_name, field_type, is_not):
"""
Private method that returns a xapian.Query that searches for any term
that is less than `term` in a specified `field`.
"""
vrp = XHValueRangeProcessor(self.backend)
pos, begin, end = vrp('%s:' % field, '%s' % _marshal_value(term))
pos, begin, end = vrp('%s:' % field_name, '%s' % _term_to_xapian_value(term, field_type))
if is_not:
return xapian.Query(xapian.Query.OP_AND_NOT,
self._all_query(),
@ -1297,39 +1312,47 @@ class XapianSearchQuery(BaseSearchQuery):
return xapian.Query(xapian.Query.OP_VALUE_RANGE, pos, begin, end)
def _marshal_value(value):
def _term_to_xapian_value(term, field_type):
"""
Private utility method that converts Python values to a string for Xapian values.
Converts a term to a serialized
Xapian value based on the field_type.
"""
if isinstance(value, datetime.datetime):
value = _marshal_datetime(value)
elif isinstance(value, datetime.date):
value = _marshal_date(value)
elif isinstance(value, bool):
if value:
assert field_type in FIELD_TYPES
def strf(dt):
"""
Equivalent to dt.strftime(DATETIME_FORMAT)
but accepts years below 1900 (see http://stackoverflow.com/q/10263956/931303)
"""
# Fields are zero-padded by hand (4-digit year, 2 digits for the rest),
# matching the layout of DATETIME_FORMAT ('%Y%m%d%H%M%S').
return '%04d%02d%02d%02d%02d%02d' % (
dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second)
if field_type == 'boolean':
assert isinstance(term, bool)
if term:
value = 't'
else:
value = 'f'
elif isinstance(value, float):
value = xapian.sortable_serialise(value)
elif isinstance(value, int):
value = '%012d' % value
else:
value = force_text(value).lower()
elif field_type == 'integer':
value = INTEGER_FORMAT % term
elif field_type == 'float':
value = xapian.sortable_serialise(term)
elif field_type == 'date' or field_type == 'datetime':
if field_type == 'date':
# http://stackoverflow.com/a/1937636/931303 and comments
term = datetime.datetime.combine(term, datetime.time())
value = strf(term)
else: # field_type == 'text'
value = _to_xapian_term(term)
return value
def _xapian_to_python(value):
if value == 't':
return True
elif value == 'f':
return False
return value
def _marshal_term(term):
def _to_xapian_term(term):
"""
Private utility method that converts Python terms to a string for Xapian terms.
Converts a Python type to a
Xapian term that can be indexed.
"""
if isinstance(term, datetime.datetime):
term = _marshal_datetime(term)
@ -1340,6 +1363,35 @@ def _marshal_term(term):
return term
def _from_xapian_value(value, field_type):
    """
    Converts a serialized Xapian value
    to Python equivalent based on the field_type.

    Doesn't accept multivalued fields.

    `value` is the serialized string read from Xapian and `field_type`
    must be one of FIELD_TYPES.

    Returns a bool for 'boolean', an int for 'integer', the result of
    `xapian.sortable_unserialise` for 'float', a `datetime.datetime` for
    'datetime', a `datetime.date` for 'date', and `value` unchanged for
    'text'.

    Raises InvalidIndexError when a 'boolean' value is neither 't' nor 'f'.
    """
    assert field_type in FIELD_TYPES
    if field_type == 'boolean':
        if value == 't':
            return True
        elif value == 'f':
            return False
        else:
            # The error must actually be raised; both placeholders are `%s`
            # because `field_type` is a string, not a number.
            raise InvalidIndexError('Field type "%s" does not accept value "%s"' % (field_type, value))
    elif field_type == 'integer':
        return int(value)
    elif field_type == 'float':
        return xapian.sortable_unserialise(value)
    elif field_type == 'date' or field_type == 'datetime':
        # Dates and datetimes share the same serialized layout (DATETIME_FORMAT);
        # a 'date' field simply drops the time part after parsing.
        datetime_value = datetime.datetime.strptime(value, DATETIME_FORMAT)
        if field_type == 'datetime':
            return datetime_value
        else:
            return datetime_value.date()
    else:  # field_type == 'text'
        return value
def _marshal_date(d):
"""
Formats `d` as a 14-character string: zero-padded year (4 digits),
month and day (2 digits each), followed by the literal '000000'.
"""
return '%04d%02d%02d000000' % (d.year, d.month, d.day)