mirror of
https://github.com/Hopiu/xapian-haystack.git
synced 2026-03-16 22:20:31 +00:00
Continued refactoring. 23 of 27 tests passing now. Code is much cleaner and leaner
This commit is contained in:
parent
7752a6e743
commit
8ee12dd68d
2 changed files with 143 additions and 95 deletions
|
|
@ -94,12 +94,12 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
|
||||
for match in matches:
|
||||
document = match.get_document()
|
||||
object_data = pickle.loads(document.get_data())
|
||||
for key, value in object_data.iteritems():
|
||||
object_data[key] = self.sb._from_python(value)
|
||||
object_data['id'] = force_unicode(document.get_value(0))
|
||||
document_list.append(object_data)
|
||||
|
||||
app_label, module_name, pk, model_data = pickle.loads(document.get_data())
|
||||
for key, value in model_data.iteritems():
|
||||
model_data[key] = self.sb._from_python(value)
|
||||
model_data['id'] = u'%s.%s.%d' % (app_label, module_name, pk)
|
||||
document_list.append(model_data)
|
||||
|
||||
return document_list
|
||||
|
||||
def test_update(self):
|
||||
|
|
@ -148,10 +148,10 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
|
||||
# Wildcard -- All
|
||||
self.assertEqual(self.sb.search('*')['hits'], 3)
|
||||
self.assertEqual([result.pk for result in self.sb.search('*')['results']], [u'1', u'2', u'3'])
|
||||
self.assertEqual([result.pk for result in self.sb.search('*')['results']], [1, 2, 3])
|
||||
|
||||
# NOT operator
|
||||
self.assertEqual([result.pk for result in self.sb.search('NOT author:david1')['results']], [u'1', u'2', u'3'])
|
||||
self.assertEqual([result.pk for result in self.sb.search('NOT author:david1')['results']], [1, 2, 3])
|
||||
|
||||
def test_field_facets(self):
|
||||
self.sb.update(self.msi, self.sample_objs)
|
||||
|
|
@ -231,29 +231,29 @@ class XapianSearchBackendTestCase(TestCase):
|
|||
self.sb.update(self.msi, self.sample_objs)
|
||||
|
||||
results = self.sb.search('*', sort_by=['pub_date'])
|
||||
self.assertEqual([result.pk for result in results['results']], [u'1', u'2', u'3'])
|
||||
|
||||
self.assertEqual([result.pk for result in results['results']], [1, 2, 3])
|
||||
|
||||
results = self.sb.search('*', sort_by=['-pub_date'])
|
||||
self.assertEqual([result.pk for result in results['results']], [u'3', u'2', u'1'])
|
||||
|
||||
self.assertEqual([result.pk for result in results['results']], [3, 2, 1])
|
||||
|
||||
results = self.sb.search('*', sort_by=['id'])
|
||||
self.assertEqual([result.pk for result in results['results']], [u'3', u'2', u'1'])
|
||||
|
||||
self.assertEqual([result.pk for result in results['results']], [3, 2, 1])
|
||||
|
||||
results = self.sb.search('*', sort_by=['-id'])
|
||||
self.assertEqual([result.pk for result in results['results']], [u'1', u'2', u'3'])
|
||||
|
||||
self.assertEqual([result.pk for result in results['results']], [1, 2, 3])
|
||||
|
||||
results = self.sb.search('*', sort_by=['value'])
|
||||
self.assertEqual([result.pk for result in results['results']], [u'3', u'2', u'1'])
|
||||
|
||||
self.assertEqual([result.pk for result in results['results']], [3, 2, 1])
|
||||
|
||||
results = self.sb.search('*', sort_by=['-value'])
|
||||
self.assertEqual([result.pk for result in results['results']], [u'1', u'2', u'3'])
|
||||
|
||||
self.assertEqual([result.pk for result in results['results']], [1, 2, 3])
|
||||
|
||||
results = self.sb.search('*', sort_by=['flag', 'id'])
|
||||
self.assertEqual([result.pk for result in results['results']], [u'3', u'1', u'2'])
|
||||
|
||||
self.assertEqual([result.pk for result in results['results']], [3, 1, 2])
|
||||
|
||||
results = self.sb.search('*', sort_by=['flag', '-id'])
|
||||
self.assertEqual([result.pk for result in results['results']], [u'1', u'3', u'2'])
|
||||
|
||||
self.assertEqual([result.pk for result in results['results']], [1, 3, 2])
|
||||
|
||||
def test__from_python(self):
|
||||
self.assertEqual(self.sb._from_python('abc'), u'abc')
|
||||
self.assertEqual(self.sb._from_python(1), u'1')
|
||||
|
|
|
|||
|
|
@ -40,7 +40,6 @@ DOCUMENT_ID_TERM_PREFIX = 'Q'
|
|||
DOCUMENT_CUSTOM_TERM_PREFIX = 'X'
|
||||
DOCUMENT_CT_TERM_PREFIX = DOCUMENT_CUSTOM_TERM_PREFIX + 'CONTENTTYPE'
|
||||
|
||||
|
||||
field_re = re.compile(r'(?<=(?<!Z)X)([A-Z_]+)(\w+)')
|
||||
|
||||
|
||||
|
|
@ -94,7 +93,9 @@ class SearchBackend(BaseSearchBackend):
|
|||
os.makedirs(settings.HAYSTACK_XAPIAN_PATH)
|
||||
|
||||
self.stemmer = xapian.Stem(stemming_language)
|
||||
self.open = False
|
||||
|
||||
def get_identifier(self, obj_or_string):
    """
    Returns the identifier for `obj_or_string` as computed by the parent
    backend, prefixed with `DOCUMENT_ID_TERM_PREFIX`.

    Required arguments:
        `obj_or_string` -- Passed through unchanged to the parent's
                           `get_identifier`
    """
    base_identifier = super(SearchBackend, self).get_identifier(obj_or_string)
    return DOCUMENT_ID_TERM_PREFIX + base_identifier
|
||||
|
||||
def update(self, index, iterable):
|
||||
"""
|
||||
|
|
@ -153,13 +154,16 @@ class SearchBackend(BaseSearchBackend):
|
|||
else:
|
||||
document.add_value(field['column'], data)
|
||||
|
||||
document.set_data(self._dump_document_data(document_id, model_data))
|
||||
document.add_term(DOCUMENT_ID_TERM_PREFIX + document_id)
|
||||
document.set_data(pickle.dumps(
|
||||
(obj._meta.app_label, obj._meta.module_name, obj.pk, model_data),
|
||||
pickle.HIGHEST_PROTOCOL
|
||||
))
|
||||
document.add_term(document_id)
|
||||
document.add_term(
|
||||
DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
|
||||
(obj._meta.app_label, obj._meta.module_name)
|
||||
)
|
||||
database.replace_document(DOCUMENT_ID_TERM_PREFIX + document_id, document)
|
||||
database.replace_document(document_id, document)
|
||||
|
||||
except UnicodeDecodeError:
|
||||
sys.stderr.write('Chunk failed.\n')
|
||||
|
|
@ -172,8 +176,8 @@ class SearchBackend(BaseSearchBackend):
|
|||
We delete all instances of `Q<app_name>.<model_name>.<pk>` which
|
||||
should be unique to this object.
|
||||
"""
|
||||
database = self._open_database(writable=True)
|
||||
database.delete_document(DOCUMENT_ID_TERM_PREFIX + self.get_identifier(obj))
|
||||
database = self._database(writable=True)
|
||||
database.delete_document(self.get_identifier(obj))
|
||||
|
||||
def clear(self, models=[]):
|
||||
"""
|
||||
|
|
@ -190,15 +194,15 @@ class SearchBackend(BaseSearchBackend):
|
|||
the term `XCONTENTTYPE<app_name>.<model_name>`. This will delete
|
||||
all documents with the specified model type.
|
||||
"""
|
||||
self._prepare(rw=True)
|
||||
database = self._database(writable=True)
|
||||
if not models:
|
||||
query = xapian.Query('') # Empty query matches all
|
||||
enquire = self._get_enquire(query)
|
||||
query, __unused__ = self._query(database, '*')
|
||||
enquire = self._enquire(database, query)
|
||||
for match in enquire.get_mset(0, DEFAULT_MAX_RESULTS):
|
||||
self.database.delete_document(match.get_docid())
|
||||
database.delete_document(match.get_docid())
|
||||
else:
|
||||
for model in models:
|
||||
self.database.delete_document(
|
||||
database.delete_document(
|
||||
DOCUMENT_CT_TERM_PREFIX + '%s.%s' %
|
||||
(model._meta.app_label, model._meta.module_name)
|
||||
)
|
||||
|
|
@ -262,38 +266,45 @@ class SearchBackend(BaseSearchBackend):
|
|||
if query_facets is not None:
|
||||
warnings.warn("Query faceting has not been implemented yet.", Warning, stacklevel=2)
|
||||
|
||||
spelling_suggestion = None
|
||||
|
||||
self._prepare()
|
||||
|
||||
if query_string == '*':
|
||||
query = xapian.Query('') # Make '*' match everything
|
||||
else:
|
||||
flags = self._get_flags()
|
||||
qp = self._get_query_parser()
|
||||
query = qp.parse_query(query_string, flags)
|
||||
if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
|
||||
spelling_suggestion = qp.get_corrected_query_string()
|
||||
|
||||
if narrow_queries:
|
||||
subqueries = [qp.parse_query(narrow_query, flags) for narrow_query in narrow_queries]
|
||||
query = xapian.Query(xapian.Query.OP_FILTER, query, xapian.Query(xapian.Query.OP_AND, subqueries))
|
||||
|
||||
enquire = self._get_enquire(query)
|
||||
database = self._database()
|
||||
query, spelling_suggestion = self._query(database, query_string, narrow_queries)
|
||||
enquire = self._enquire(database, query)
|
||||
|
||||
if sort_by:
|
||||
sorter = self._get_sorter(sort_by)
|
||||
sorter = self._sorter(sort_by)
|
||||
enquire.set_sort_by_key_then_relevance(sorter, True)
|
||||
|
||||
results = []
|
||||
facets_dict = {
|
||||
'fields': {},
|
||||
'dates': {},
|
||||
'queries': {},
|
||||
}
|
||||
matches = enquire.get_mset(start_offset, end_offset)
|
||||
results = self._process_results(
|
||||
matches, query_string=query_string, highlight=highlight, facets=facets
|
||||
)
|
||||
|
||||
if spelling_suggestion:
|
||||
results['spelling_suggestion'] = spelling_suggestion
|
||||
for match in matches:
|
||||
document = match.get_document()
|
||||
app_label, module_name, pk, model_data = pickle.loads(document.get_data())
|
||||
results.append(
|
||||
SearchResult(app_label, module_name, pk, match.weight, **model_data)
|
||||
)
|
||||
if facets:
|
||||
facets_dict['fields'] = self._do_field_facets(
|
||||
document, facets, facets_dict['fields']
|
||||
)
|
||||
if highlight and (len(query_string) > 0):
|
||||
model_data['highlighted'] = {
|
||||
self.content_field_name: self._do_highlight(
|
||||
model_data.get(self.content_field_name), query_string
|
||||
)
|
||||
}
|
||||
|
||||
return results
|
||||
return {
|
||||
'results': results,
|
||||
'hits': matches.get_matches_estimated(),
|
||||
'facets': facets_dict,
|
||||
'spelling_suggestion': spelling_suggestion,
|
||||
}
|
||||
|
||||
def delete_index(self):
|
||||
"""
|
||||
|
|
@ -303,10 +314,8 @@ class SearchBackend(BaseSearchBackend):
|
|||
"""
|
||||
if os.path.exists(settings.HAYSTACK_XAPIAN_PATH):
|
||||
index_files = os.listdir(settings.HAYSTACK_XAPIAN_PATH)
|
||||
|
||||
for index_file in index_files:
|
||||
os.remove(os.path.join(settings.HAYSTACK_XAPIAN_PATH, index_file))
|
||||
|
||||
os.removedirs(settings.HAYSTACK_XAPIAN_PATH)
|
||||
|
||||
def document_count(self):
|
||||
|
|
@ -314,11 +323,10 @@ class SearchBackend(BaseSearchBackend):
|
|||
Retrieves the total document count for the search index.
|
||||
"""
|
||||
try:
|
||||
if not self.loaded:
|
||||
self._load()
|
||||
database = self._database()
|
||||
except xapian.DatabaseOpeningError:
|
||||
return 0
|
||||
return self.database.get_doccount()
|
||||
return database.get_doccount()
|
||||
|
||||
def more_like_this(self, model_instance):
|
||||
"""
|
||||
|
|
@ -344,13 +352,11 @@ class SearchBackend(BaseSearchBackend):
|
|||
|
||||
Finally, processes the resulting matches and returns.
|
||||
"""
|
||||
if not self.loaded:
|
||||
self._load()
|
||||
|
||||
database = self._database()
|
||||
query = xapian.Query(
|
||||
DOCUMENT_ID_TERM_PREFIX + self.get_identifier(model_instance)
|
||||
)
|
||||
enquire = self._get_enquire(query)
|
||||
enquire = self._enquire(database, query)
|
||||
rset = xapian.RSet()
|
||||
for match in enquire.get_mset(0, DEFAULT_MAX_RESULTS):
|
||||
rset.add_document(match.get_docid())
|
||||
|
|
@ -472,7 +478,7 @@ class SearchBackend(BaseSearchBackend):
|
|||
|
||||
def _from_python(self, value):
|
||||
"""
|
||||
Converts Python values to a string for Xapian.
|
||||
Private method that converts Python values to a string for Xapian.
|
||||
"""
|
||||
if isinstance(value, datetime.datetime):
|
||||
if value.microsecond:
|
||||
|
|
@ -496,9 +502,6 @@ class SearchBackend(BaseSearchBackend):
|
|||
value = force_unicode(value)
|
||||
return value
|
||||
|
||||
def _dump_document_data(self, document_id, model_data):
|
||||
return pickle.dumps((document_id, model_data), pickle.HIGHEST_PROTOCOL)
|
||||
|
||||
def _database(self, writable=False):
|
||||
"""
|
||||
Private method that returns a xapian.Database for use and sets up
|
||||
|
|
@ -531,9 +534,8 @@ class SearchBackend(BaseSearchBackend):
|
|||
Required Argument:
|
||||
`document` -- The document to be indexed
|
||||
|
||||
Returns a Xapian.TermGenerator instance ready for use. If
|
||||
`HAYSTACK_INCLUDE_SPELLING` is True, then the term generator
|
||||
will have spell-checking enabled.
|
||||
Returns a Xapian.TermGenerator instance. If `HAYSTACK_INCLUDE_SPELLING`
|
||||
is True, then the term generator will have spell-checking enabled.
|
||||
"""
|
||||
term_generator = xapian.TermGenerator()
|
||||
term_generator.set_database(database)
|
||||
|
|
@ -543,9 +545,39 @@ class SearchBackend(BaseSearchBackend):
|
|||
term_generator.set_document(document)
|
||||
return term_generator
|
||||
|
||||
def _get_sorter(self, sort_by):
|
||||
def _query(self, database, query_string, narrow_queries=None):
    """
    Private method that takes a query string and returns a xapian.Query
    together with an optional spelling suggestion.

    Required arguments:
        `database` -- The database to be queried
        `query_string` -- The query string to parse

    Optional arguments:
        `narrow_queries` -- A list of queries to narrow the query with

    Returns a `(query, spelling_suggestion)` tuple. `spelling_suggestion`
    is None unless `HAYSTACK_INCLUDE_SPELLING` is True and `query_string`
    was actually parsed (i.e. it is not the match-all `'*'`).
    """
    spelling_suggestion = None
    flags = None
    qp = None

    if query_string == '*':
        query = xapian.Query('') # Make '*' match everything
    else:
        flags = self._flags()
        qp = self._query_parser(database)
        query = qp.parse_query(query_string, flags)
        if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
            spelling_suggestion = qp.get_corrected_query_string()

    if narrow_queries:
        if qp is None:
            # The match-all branch never builds a parser, but the narrow
            # queries still have to be parsed; build one on demand so that
            # '*' combined with narrow_queries does not hit unbound names.
            flags = self._flags()
            qp = self._query_parser(database)
        subqueries = [qp.parse_query(narrow_query, flags) for narrow_query in narrow_queries]
        query = xapian.Query(xapian.Query.OP_FILTER, query, xapian.Query(xapian.Query.OP_AND, subqueries))

    return query, spelling_suggestion
|
||||
|
||||
def _sorter(self, sort_by):
|
||||
"""
|
||||
Private method that takes a list of fields to sort by and returns a
|
||||
xapian.MultiValueSorter
|
||||
|
||||
Required Arguments:
|
||||
`sort_by` -- A list of fields to sort by
|
||||
|
|
@ -553,22 +585,18 @@ class SearchBackend(BaseSearchBackend):
|
|||
Returns a xapian.MultiValueSorter instance
|
||||
"""
|
||||
sorter = xapian.MultiValueSorter()
|
||||
|
||||
for sort_field in sort_by:
|
||||
if sort_field.startswith('-'):
|
||||
reverse = False
|
||||
sort_field = sort_field[1:] # Strip the '-'
|
||||
else:
|
||||
reverse = True # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311
|
||||
sorter.add(self._get_value_column(sort_field), reverse)
|
||||
sorter.add(self._value_column(sort_field), reverse)
|
||||
|
||||
return sorter
|
||||
|
||||
def _get_value_column(self, field):
|
||||
for field_dict in self.schema:
|
||||
if field_dict['field_name'] == field:
|
||||
return field_dict['column']
|
||||
return 0
|
||||
|
||||
def _get_flags(self):
|
||||
def _flags(self):
|
||||
"""
|
||||
Returns the commonly used Xapian.QueryParser flags
|
||||
"""
|
||||
|
|
@ -582,16 +610,18 @@ class SearchBackend(BaseSearchBackend):
|
|||
flags = flags | xapian.QueryParser.FLAG_SPELLING_CORRECTION
|
||||
return flags
|
||||
|
||||
def _get_query_parser(self):
|
||||
def _query_parser(self, database):
|
||||
"""
|
||||
Returns a Xapian.QueryParser instance.
|
||||
Private method that returns a Xapian.QueryParser instance.
|
||||
|
||||
Required arguments:
|
||||
`database` -- The database to be queried
|
||||
|
||||
The query parser returned will have stemming enabled, a boolean prefix
|
||||
for `django_ct`, and prefixes for all of the fields in the designated
|
||||
`schema`.
|
||||
for `django_ct`, and prefixes for all of the fields in the `self.schema`.
|
||||
"""
|
||||
qp = xapian.QueryParser()
|
||||
qp.set_database(self.database)
|
||||
qp.set_database(database)
|
||||
qp.set_stemmer(self.stemmer)
|
||||
qp.set_stemming_strategy(xapian.QueryParser.STEM_SOME)
|
||||
qp.add_boolean_prefix('django_ct', DOCUMENT_CT_TERM_PREFIX)
|
||||
|
|
@ -599,18 +629,20 @@ class SearchBackend(BaseSearchBackend):
|
|||
qp.add_prefix(field_dict['field_name'], DOCUMENT_CUSTOM_TERM_PREFIX + field_dict['field_name'].upper())
|
||||
return qp
|
||||
|
||||
def _get_enquire(self, query):
|
||||
def _enquire(self, database, query):
    """
    Private method that builds a Xapian.Enquire instance on `database`
    and loads it with the specified `query`.

    Required Arguments:
        `database` -- The database the enquire is created on
        `query` -- The query to run

    Returns a xapian.Enquire instance
    """
    enquirer = xapian.Enquire(database)
    enquirer.set_query(query)
    enquirer.set_docid_order(enquirer.ASCENDING)
    return enquirer
|
||||
|
||||
def _build_schema(self, fields):
|
||||
|
|
@ -630,6 +662,22 @@ class SearchBackend(BaseSearchBackend):
|
|||
del field
|
||||
return fields
|
||||
|
||||
def _value_column(self, field):
|
||||
"""
|
||||
Private method that returns the column value slot in the database
|
||||
for a given field.
|
||||
|
||||
Required arguemnts:
|
||||
`field` -- The field to lookup
|
||||
|
||||
Returns an integer with the column location (0 indexed).
|
||||
"""
|
||||
for field_dict in self.schema:
|
||||
if field_dict['field_name'] == field:
|
||||
return field_dict['column']
|
||||
return 0
|
||||
|
||||
|
||||
|
||||
class SearchQuery(BaseSearchQuery):
|
||||
"""
|
||||
|
|
|
|||
Loading…
Reference in a new issue