Added docstring for _do_query_facets

This commit is contained in:
David Sauve 2009-08-11 08:49:45 -04:00
parent 1a8e6d24b6
commit 648ca3a3d3

View file

@ -49,7 +49,7 @@ class XHValueRangeProcessor(xapian.ValueRangeProcessor):
def __init__(self, sb):
    """
    Store the backend and initialise the Xapian base class.

    Required arguments:
        `sb` -- The `SearchBackend` this value-range processor delegates to
    """
    self.sb = sb
    # Explicit old-style base-class call, as required by the Xapian bindings.
    xapian.ValueRangeProcessor.__init__(self)
def __call__(self, begin, end):
"""
Construct a tuple for value range processing.
@ -90,15 +90,15 @@ class SearchBackend(BaseSearchBackend):
"""
`SearchBackend` defines the Xapian search backend for use with the Haystack
API for Django search.
It uses the Xapian Python bindings to interface with Xapian, and as
such is subject to this bug: <http://trac.xapian.org/ticket/364> when
Django is running with mod_python or mod_wsgi under Apache.
Until this issue has been fixed by Xapian, it is necessary to set
`WSGIApplicationGroup to %{GLOBAL}` when using mod_wsgi, or
`PythonInterpreter main_interpreter` when using mod_python.
In order to use this backend, `HAYSTACK_XAPIAN_PATH` must be set in
your settings. This should point to a location where you would like your
indexes to reside.
@ -111,66 +111,66 @@ class SearchBackend(BaseSearchBackend):
'NEAR',
'ADJ',
)
RESERVED_CHARACTERS = (
'\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
'\\', '+', '-', '&&', '||', '!', '(', ')', '{', '}',
'[', ']', '^', '"', '~', '*', '?', ':',
)
def __init__(self, site=None, stemming_language='english'):
    """
    Set up the Xapian search backend.

    Optional arguments:
        `site` -- The site to associate the backend with (default = None)
        `stemming_language` -- The stemming language (default = 'english')

    Raises `ImproperlyConfigured` when `HAYSTACK_XAPIAN_PATH` is missing
    from the Django settings. Creates the index directory if it does not
    already exist, and builds the stemmer for `stemming_language`.
    """
    super(SearchBackend, self).__init__(site)

    if not hasattr(settings, 'HAYSTACK_XAPIAN_PATH'):
        raise ImproperlyConfigured('You must specify a HAYSTACK_XAPIAN_PATH in your settings.')

    index_path = settings.HAYSTACK_XAPIAN_PATH
    if not os.path.exists(index_path):
        os.makedirs(index_path)

    self.stemmer = xapian.Stem(stemming_language)
def get_identifier(self, obj_or_string):
    """
    Return the Xapian document-ID term for `obj_or_string`.

    Prepends `DOCUMENT_ID_TERM_PREFIX` to the identifier produced by the
    base backend, yielding the unique term used to address a document.
    """
    base_identifier = super(SearchBackend, self).get_identifier(obj_or_string)
    return DOCUMENT_ID_TERM_PREFIX + base_identifier
def update(self, index, iterable):
"""
Updates the `index` with any objects in `iterable` by adding/updating
the database as needed.
Required arguments:
`index` -- The `SearchIndex` to process
`iterable` -- An iterable of model instances to index
For each object in `iterable`, a document is created containing all
of the terms extracted from `index.prepare(obj)` with stemming prefixes,
field prefixes, and 'as-is'.
eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest`
Each document also contains an extra term in the format:
`XCONTENTTYPE<app_name>.<model_name>`
As well as a unique identifier in the format:
`Q<app_name>.<model_name>.<pk>`
eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar`
This is useful for querying for a specific document corresponding to
a model instance.
The document also contains a pickled version of the object itself and
the document ID in the document data field.
Finally, we also store field values to be used for sorting data. We
store these in the document value slots (position zero is reserved
for the document ID). All values are stored as unicode strings with
@ -184,7 +184,7 @@ class SearchBackend(BaseSearchBackend):
term_generator = self._term_generator(database, document)
document_id = self.get_identifier(obj)
model_data = index.prepare(obj)
for field in self.schema:
if field['field_name'] in model_data.keys():
prefix = DOCUMENT_CUSTOM_TERM_PREFIX + field['field_name'].upper()
@ -192,40 +192,40 @@ class SearchBackend(BaseSearchBackend):
term_generator.index_text(force_unicode(value))
term_generator.index_text(force_unicode(value), 1, prefix)
document.add_value(field['column'], self._marshal_value(value))
document.set_data(pickle.dumps(
(obj._meta.app_label, obj._meta.module_name, obj.pk, model_data),
(obj._meta.app_label, obj._meta.module_name, obj.pk, model_data),
pickle.HIGHEST_PROTOCOL
))
document.add_term(document_id)
document.add_term(
DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
DOCUMENT_CT_TERM_PREFIX + u'%s.%s' %
(obj._meta.app_label, obj._meta.module_name)
)
database.replace_document(document_id, document)
except UnicodeDecodeError:
sys.stderr.write('Chunk failed.\n')
pass
def remove(self, obj):
    """
    Remove indexes for `obj` from the database.

    Opens the database in writable mode and deletes every document carrying
    the object's unique `Q<app_name>.<model_name>.<pk>` identifier term,
    which should match exactly this object.
    """
    writable_db = self._database(writable=True)
    writable_db.delete_document(self.get_identifier(obj))
def clear(self, models=[]):
"""
Clear all instances of `models` from the database or all models, if
not specified.
Optional Arguments:
`models` -- Models to clear from the database (default = [])
If `models` is empty, an empty query is executed which matches all
documents in the database. Afterwards, each match is deleted.
@ -242,19 +242,19 @@ class SearchBackend(BaseSearchBackend):
else:
for model in models:
database.delete_document(
DOCUMENT_CT_TERM_PREFIX + '%s.%s' %
DOCUMENT_CT_TERM_PREFIX + '%s.%s' %
(model._meta.app_label, model._meta.module_name)
)
def search(self, query_string, sort_by=None, start_offset=0, end_offset=DEFAULT_MAX_RESULTS,
fields='', highlight=False, facets=None, date_facets=None, query_facets=None,
narrow_queries=None, boost=None, **kwargs):
"""
Executes the search as defined in `query_string`.
Required arguments:
`query_string` -- Search query to execute
Optional arguments:
`sort_by` -- Sort results by specified field (default = None)
`start_offset` -- Slice results from `start_offset` (default = 0)
@ -266,7 +266,7 @@ class SearchBackend(BaseSearchBackend):
`query_facets` -- Facet results on queries (default = None)
`narrow_queries` -- Narrow queries (default = None)
`boost` -- Dictionary of terms and weights to boost results
Returns:
A dictionary with the following keys:
`results` -- A list of `SearchResult`
@ -276,13 +276,13 @@ class SearchBackend(BaseSearchBackend):
`dates` -- A list of date facets
`queries` -- A list of query facets
If faceting was not used, the `facets` key will not be present
If `query_string` is empty, returns no results.
Otherwise, loads the available fields from the database meta data schema
and sets up prefixes for each one along with a prefix for `django_ct`,
used to filter by model, and loads the current stemmer instance.
Afterwards, executes the Xapian query parser to create a query from
`query_string` that is then passed to a new `enquire` instance.
@ -299,20 +299,20 @@ class SearchBackend(BaseSearchBackend):
'results': [],
'hits': 0,
}
if query_facets is not None:
warnings.warn("Query faceting has not been implemented yet.", Warning, stacklevel=2)
database = self._database()
query, spelling_suggestion = self._query(
database, query_string, narrow_queries, boost
)
enquire = self._enquire(database, query)
if sort_by:
sorter = self._sorter(sort_by)
enquire.set_sort_by_key_then_relevance(sorter, True)
results = []
facets_dict = {
'fields': {},
@ -320,7 +320,7 @@ class SearchBackend(BaseSearchBackend):
'queries': {},
}
matches = enquire.get_mset(start_offset, end_offset)
for match in matches:
document = match.get_document()
app_label, module_name, pk, model_data = pickle.loads(document.get_data())
@ -337,19 +337,19 @@ class SearchBackend(BaseSearchBackend):
results.append(
SearchResult(app_label, module_name, pk, match.weight, **model_data)
)
if date_facets:
facets_dict['dates'] = self._do_date_facets(results, date_facets)
if query_facets:
facets_dict['queries'] = self._do_query_facets(results, query_facets)
return {
'results': results,
'hits': matches.get_matches_estimated(),
'facets': facets_dict,
'spelling_suggestion': spelling_suggestion,
}
def delete_index(self):
"""
Delete the index.
@ -358,7 +358,7 @@ class SearchBackend(BaseSearchBackend):
"""
if os.path.exists(settings.HAYSTACK_XAPIAN_PATH):
shutil.rmtree(settings.HAYSTACK_XAPIAN_PATH)
def document_count(self):
"""
Retrieves the total document count for the search index.
@ -368,7 +368,7 @@ class SearchBackend(BaseSearchBackend):
except xapian.DatabaseOpeningError:
return 0
return database.get_doccount()
def more_like_this(self, model_instance):
"""
Given a model instance, returns a result set of similar documents.
@ -376,21 +376,21 @@ class SearchBackend(BaseSearchBackend):
Required arguments:
`model_instance` -- The model instance to use as a basis for
retrieving similar documents.
Returns:
A dictionary with the following keys:
`results` -- A list of `SearchResult`
`hits` -- The total available results
Opens a database connection, then builds a simple query using the
`model_instance` to build the unique identifier.
For each document retrieved(should always be one), adds an entry into
an RSet (relevance set) with the document id, then, uses the RSet
to query for an ESet (A set of terms that can be used to suggest
expansions to the original query), omitting any document that was in
the original query.
Finally, processes the resulting matches and returns.
"""
database = self._database()
@ -406,7 +406,7 @@ class SearchBackend(BaseSearchBackend):
xapian.Query.OP_AND_NOT, [query, self.get_identifier(model_instance)]
)
enquire.set_query(query)
results = []
matches = enquire.get_mset(0, DEFAULT_MAX_RESULTS)
@ -416,7 +416,7 @@ class SearchBackend(BaseSearchBackend):
results.append(
SearchResult(app_label, module_name, pk, match.weight, **model_data)
)
return {
'results': results,
'hits': matches.get_matches_estimated(),
@ -427,7 +427,7 @@ class SearchBackend(BaseSearchBackend):
},
'spelling_suggestion': None,
}
def _do_highlight(self, content, text, tag='em'):
"""
Highlight `text` in `content` with html `tag`.
@ -444,18 +444,18 @@ class SearchBackend(BaseSearchBackend):
term_re = re.compile(re.escape(term), re.IGNORECASE)
content = term_re.sub('<%s>%s</%s>' % (tag, term, tag), content)
return content
def _do_field_facets(self, document, facets, fields):
"""
Private method that facets a document by field name.
Required arguments:
`document` -- The document to parse
`facets` -- A list of facets to use when faceting
`fields` -- A list of fields that have already been faceted. This
will be extended with any new field names and counts
found in the `document`.
For each term in the document, extract the field name and determine
if it is one of the `facets` we want. If so, verify if it already in
the `fields` list. If it is, update the count, otherwise, add it and
@ -469,16 +469,16 @@ class SearchBackend(BaseSearchBackend):
else:
fields[match.group(1).lower()] = [(match.group(2), term[1])]
return fields
def _do_date_facets(self, results, date_facets):
"""
Private method that facets a document by date ranges
Required arguments:
`results` -- A list of SearchResults to facet
`date_facets` -- A dictionary containg facet parameters:
`date_facets` -- A dictionary containing facet parameters:
{'field': {'start_date': ..., 'end_date': ...: 'gap': '...'}}
nb., gap must satisfy the regex:
nb., gap must satisfy the regex:
(?P<type>year|month|day|hour|minute|second+)s?=?(?P<value>\d*)
For each date facet field in `date_facets`, generates a list
@ -529,9 +529,9 @@ class SearchBackend(BaseSearchBackend):
date_range += datetime.timedelta(minutes=int(gap_value))
elif gap_type == 'second':
date_range += datetime.timedelta(seconds=int(gap_value))
facet_list = sorted(facet_list, key=lambda n:n[0], reverse=True)
for result in results:
result_date = getattr(result, date_facet)
if result_date:
@ -545,19 +545,33 @@ class SearchBackend(BaseSearchBackend):
if result_date > datetime.datetime.strptime(facet_date[0], '%Y-%m-%dT%H:%M:%S'):
facet_list[n] = (facet_list[n][0], (facet_list[n][1] + 1))
break
facet_dict[date_facet] = facet_list
return facet_dict
def _do_query_facets(self, results, query_facets):
    """
    Private method that facets a document by query.

    Required arguments:
        `results` -- A list of SearchResults to facet
        `query_facets` -- A dictionary containing facet parameters:
            {'field': 'query', [...]}

    Runs each facet query through the backend and returns a dictionary
    keyed by field name whose value is a `(query, hit_count)` tuple.

    eg. {'name': ('a*', 5)}
    """
    facets = {}

    for facet_field, facet_query in query_facets.iteritems():
        hit_count = self.search(facet_query)['hits']
        facets[facet_field] = (facet_query, hit_count)

    return facets
def _marshal_value(self, value):
"""
Private method that converts Python values to a string for Xapian values.
@ -565,12 +579,12 @@ class SearchBackend(BaseSearchBackend):
if isinstance(value, datetime.datetime):
if value.microsecond:
value = u'%04d%02d%02d%02d%02d%02d%06d' % (
value.year, value.month, value.day, value.hour,
value.year, value.month, value.day, value.hour,
value.minute, value.second, value.microsecond
)
else:
value = u'%04d%02d%02d%02d%02d%02d' % (
value.year, value.month, value.day, value.hour,
value.year, value.month, value.day, value.hour,
value.minute, value.second
)
elif isinstance(value, datetime.date):
@ -585,39 +599,39 @@ class SearchBackend(BaseSearchBackend):
else:
value = force_unicode(value)
return value
def _database(self, writable=False):
    """
    Private method that opens the Xapian database and synchronises the
    schema and content-field definitions with it.

    Optional arguments:
        ``writable`` -- Open the database in read/write mode (default=False)

    In writable mode the schema is rebuilt from the site and persisted into
    the database metadata; in read-only mode the previously stored schema
    and content-field name are loaded back out.

    Returns an instance of a xapian.Database or xapian.WritableDatabase.
    """
    db_path = settings.HAYSTACK_XAPIAN_PATH

    if not writable:
        database = xapian.Database(db_path)
        self.schema = pickle.loads(database.get_metadata('schema'))
        self.content_field_name = pickle.loads(database.get_metadata('content'))
        return database

    self.content_field_name, fields = self.site.build_unified_schema()
    self.schema = self._build_schema(fields)
    database = xapian.WritableDatabase(db_path, xapian.DB_CREATE_OR_OPEN)
    database.set_metadata('schema', pickle.dumps(self.schema, pickle.HIGHEST_PROTOCOL))
    database.set_metadata('content', pickle.dumps(self.content_field_name, pickle.HIGHEST_PROTOCOL))
    return database
def _term_generator(self, database, document):
"""
Private method that returns a Xapian.TermGenerator
Required Argument:
`document` -- The document to be indexed
Returns a Xapian.TermGenerator instance. If `HAYSTACK_INCLUDE_SPELLING`
is True, then the term generator will have spell-checking enabled.
"""
@ -628,7 +642,7 @@ class SearchBackend(BaseSearchBackend):
term_generator.set_flags(xapian.TermGenerator.FLAG_SPELLING)
term_generator.set_document(document)
return term_generator
def _query(self, database, query_string, narrow_queries=None, boost=None):
"""
Private method that takes a query string and returns a xapian.Query.
@ -644,7 +658,7 @@ class SearchBackend(BaseSearchBackend):
setup as pulled from the `query_string`.
"""
spelling_suggestion = None
if query_string == '*':
query = xapian.Query('') # Make '*' match everything
else:
@ -655,13 +669,13 @@ class SearchBackend(BaseSearchBackend):
query = qp.parse_query(query_string, flags)
if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
spelling_suggestion = qp.get_corrected_query_string()
if narrow_queries:
subqueries = [
qp.parse_query(narrow_query, flags) for narrow_query in narrow_queries
]
query = xapian.Query(
xapian.Query.OP_FILTER,
xapian.Query.OP_FILTER,
query, xapian.Query(xapian.Query.OP_AND, subqueries)
)
if boost:
@ -674,17 +688,17 @@ class SearchBackend(BaseSearchBackend):
xapian.Query.OP_OR, query,
xapian.Query(xapian.Query.OP_AND, subqueries)
)
return query, spelling_suggestion
def _sorter(self, sort_by):
"""
Private method that takes a list of fields to sort by and returns a
Private method that takes a list of fields to sort by and returns a
xapian.MultiValueSorter
Required Arguments:
`sort_by` -- A list of fields to sort by
Returns a xapian.MultiValueSorter instance
"""
sorter = xapian.MultiValueSorter()
@ -696,9 +710,9 @@ class SearchBackend(BaseSearchBackend):
else:
reverse = False # Reverse is inverted in Xapian -- http://trac.xapian.org/ticket/311
sorter.add(self._value_column(sort_field), reverse)
return sorter
def _flags(self):
"""
Returns the commonly used Xapian.QueryParser flags
@ -712,14 +726,14 @@ class SearchBackend(BaseSearchBackend):
if getattr(settings, 'HAYSTACK_INCLUDE_SPELLING', False) is True:
flags = flags | xapian.QueryParser.FLAG_SPELLING_CORRECTION
return flags
def _query_parser(self, database):
"""
Private method that returns a Xapian.QueryParser instance.
Required arguments:
`database` -- The database to be queried
The query parser returned will have stemming enabled, a boolean prefix
for `django_ct`, and prefixes for all of the fields in the `self.schema`.
"""
@ -730,19 +744,19 @@ class SearchBackend(BaseSearchBackend):
qp.add_boolean_prefix('django_ct', DOCUMENT_CT_TERM_PREFIX)
for field_dict in self.schema:
qp.add_prefix(
field_dict['field_name'],
field_dict['field_name'],
DOCUMENT_CUSTOM_TERM_PREFIX + field_dict['field_name'].upper()
)
return qp
def _enquire(self, database, query):
"""
Private method that returns a Xapian.Enquire instance for use with
the specified `query`.
Required Arguments:
`query` -- The query to run
Returns a xapian.Enquire instance
"""
enquire = xapian.Enquire(database)
@ -750,14 +764,14 @@ class SearchBackend(BaseSearchBackend):
enquire.set_docid_order(enquire.ASCENDING)
return enquire
def _build_schema(self, fields):
"""
Private method to build a schema.
Required arguments:
``fields`` -- A list of fields in the index
Returns a list of fields in dictionary format ready for inclusion in
an indexed meta-data.
"""
@ -769,15 +783,15 @@ class SearchBackend(BaseSearchBackend):
n += 1
schema.append(field)
return schema
def _value_column(self, field):
"""
Private method that returns the column value slot in the database
for a given field.
Required arguments:
`field` -- The field to lookup
Returns an integer with the column location (0 indexed).
"""
for field_dict in self.schema:
@ -790,55 +804,55 @@ class SearchQuery(BaseSearchQuery):
"""
`SearchQuery` is responsible for converting search queries into a format
that Xapian can understand.
Most of the work is done by the :method:`build_query`.
"""
def __init__(self, backend=None):
    """
    Create a new `SearchQuery`, wiring in a search backend.

    Optional arguments:
        `backend` -- The `SearchBackend` to use (default = None); when
            omitted, a new Xapian `SearchBackend` is instantiated.
    """
    super(SearchQuery, self).__init__(backend=backend)
    # Preserve or-semantics: any falsy backend triggers the default.
    self.backend = backend if backend else SearchBackend()
def build_query(self):
"""
Builds a search query from previously set values, returning a query
string in a format ready for use by the Xapian `SearchBackend`.
Returns:
A query string suitable for parsing by Xapian.
"""
query = ''
if not self.query_filters:
query = '*'
else:
query_chunks = []
for the_filter in self.query_filters:
if the_filter.is_and():
query_chunks.append('AND')
if the_filter.is_not():
query_chunks.append('NOT')
if the_filter.is_or():
query_chunks.append('OR')
value = the_filter.value
if not isinstance(value, (list, tuple)):
# Convert whatever we find to what xapian wants.
value = self.backend._marshal_value(value)
# Check to see if it's a phrase for an exact match.
if ' ' in value:
value = '"%s"' % value
# 'content' is a special reserved word, much like 'pk' in
# Django's ORM layer. It indicates 'no special field'.
if the_filter.field == 'content':
@ -852,33 +866,33 @@ class SearchQuery(BaseSearchQuery):
'lt': "NOT %s:%s..*",
'startswith': "%s:%s*",
}
if the_filter.filter_type != 'in':
query_chunks.append(filter_types[the_filter.filter_type] % (the_filter.field, value))
else:
in_options = []
for possible_value in value:
in_options.append("%s:%s" % (the_filter.field, possible_value))
query_chunks.append("(%s)" % " OR ".join(in_options))
if query_chunks[0] in ('AND', 'OR'):
# Pull off an undesirable leading "AND" or "OR".
del(query_chunks[0])
query = " ".join(query_chunks)
if len(self.models):
models = ['django_ct:%s.%s' % (model._meta.app_label, model._meta.module_name) for model in self.models]
models_clause = ' '.join(models)
final_query = '(%s) %s' % (query, models_clause)
else:
final_query = query
return final_query
return final_query
def run(self):
"""
Builds and executes the query. Returns a list of search results.
@ -887,52 +901,52 @@ class SearchQuery(BaseSearchQuery):
kwargs = {
'start_offset': self.start_offset,
}
if self.order_by:
kwargs['sort_by'] = self.order_by
if self.end_offset is not None:
kwargs['end_offset'] = self.end_offset - self.start_offset
if self.highlight:
kwargs['highlight'] = self.highlight
if self.facets:
kwargs['facets'] = list(self.facets)
if self.date_facets:
kwargs['date_facets'] = self.date_facets
if self.query_facets:
kwargs['query_facets'] = self.query_facets
if self.narrow_queries:
kwargs['narrow_queries'] = self.narrow_queries
if self.boost:
kwargs['boost'] = self.boost
results = self.backend.search(final_query, **kwargs)
self._results = results.get('results', [])
self._hit_count = results.get('hits', 0)
self._facet_counts = results.get('facets', {})
self._spelling_suggestion = results.get('spelling_suggestion', None)
def run_mlt(self):
"""
Builds and executes the query. Returns a list of search results.
"""
if self._more_like_this is False or self._mlt_instance is None:
raise MoreLikeThisError("No instance was provided to determine 'More Like This' results.")
additional_query_string = self.build_query()
kwargs = {
'start_offset': self.start_offset,
}
if self.end_offset is not None:
kwargs['end_offset'] = self.end_offset - self.start_offset
results = self.backend.more_like_this(self._mlt_instance, additional_query_string, **kwargs)
self._results = results.get('results', [])
self._hit_count = results.get('hits', 0)