diff --git a/xapian_backend.py b/xapian_backend.py index b651a1b..9672887 100644 --- a/xapian_backend.py +++ b/xapian_backend.py @@ -32,8 +32,6 @@ RESERVED_CHARACTERS = ( '[', ']', '^', '"', '~', '*', '?', ':', ) -DATETIME_REGEX = re.compile('^(?P\d{4})-(?P\d{2})-(?P\d{2})T(?P\d{2}):(?P\d{2}):(?P\d{2})(\.\d{3,6}Z?)?$') - DEFAULT_MAX_RESULTS = 100000 DOCUMENT_ID_TERM_PREFIX = 'Q' @@ -41,8 +39,39 @@ DOCUMENT_CUSTOM_TERM_PREFIX = 'X' DOCUMENT_CT_TERM_PREFIX = DOCUMENT_CUSTOM_TERM_PREFIX + 'CONTENTTYPE' +field_re = re.compile(r'(?<=(? when + Django is running with mod_python or mod_wsgi under Apache. + + Until this issue has been fixed by Xapian, it is necessary to set + `WSGIApplicationGroup to %{GLOBAL}` when using mod_wsgi, or + `PythonInterpreter main_interpreter` when using mod_python. + + In order to use this backend, `HAYSTACK_XAPIAN_PATH` must be set in + your settings. This should point to a location where you would like your + indexes to reside. + """ def __init__(self, site=None, stem_lang='en'): + """ + Instantiates an instance of `SearchBackend`. + + Optional arguments: + `site` -- The site to associate the backend with (default = None) + `stem_lang` -- The stemming language (default = 'en') + + Verifies `HAYSTACK_XAPIAN_PATH` has been properly set and that the path + specified is readable. If it is not, tries to create the folder. + + Also sets the stemming language to be used to `stem_lang`. + """ super(SearchBackend, self).__init__(site) if not hasattr(settings, 'HAYSTACK_XAPIAN_PATH'): raise ImproperlyConfigured('You must specify a HAYSTACK_XAPIAN_PATH in your settings.') @@ -54,6 +83,47 @@ class SearchBackend(BaseSearchBackend): os.makedirs(self.path) def update(self, index, iterable): + """ + Updates the `index` with any objects in `iterable` by adding/updating + the database as needed. 
+ + Required arguments: + `index` -- The `SearchIndex` to process + `iterable` -- An iterable of model instances to index + + For each object in `iterable`, a document is created containing all + of the terms extracted from `index.prepare(obj)` with stemming prefixes, + field prefixes, and 'as-is'. + + eg. `content:Testing` ==> `testing, Ztest, ZXCONTENTtest` + + Each document also contains two extra terms; a term in the format: + + `XCONTENTTYPE<app_label>.<module_name>` + + As well as a unique identifier in the format: + + `Q<app_label>.<module_name>.<pk>` + + eg.: foo.bar (pk=1) ==> `Qfoo.bar.1`, `XCONTENTTYPEfoo.bar` + + This is useful for querying for a specific document corresponding to + a model instance and is also stored in the document value field at + position 0 for easy extraction. + + The document also contains a pickled version of the object itself in + the document data field. + + Finally, the database itself maintains a list of all index field names + in use through the database meta data field. This is a pickled set + of strings that can be loaded on demand and used to assign prefixes + to query parsers so that a user can perform field name filtering by + simply querying as follows: + + `<field_name>:<value>` + + eg.: `'foo:bar'` will filter based on the `foo` field for `bar`. + """ database = xapian.WritableDatabase(self.path, xapian.DB_CREATE_OR_OPEN) indexer = xapian.TermGenerator() indexer.set_database(database) @@ -83,7 +153,10 @@ class SearchBackend(BaseSearchBackend): document.set_data(pickle.dumps(document_data, pickle.HIGHEST_PROTOCOL)) document.add_term(DOCUMENT_ID_TERM_PREFIX + document_id) - document.add_term(DOCUMENT_CT_TERM_PREFIX + u'%s.%s' % (obj._meta.app_label, obj._meta.module_name)) + document.add_term( + DOCUMENT_CT_TERM_PREFIX + u'%s.%s' % + (obj._meta.app_label, obj._meta.module_name) + ) database.replace_document(document_id, document) @@ -94,10 +167,30 @@ class SearchBackend(BaseSearchBackend): pass def remove(self, obj): + """ + Remove indexes for `obj` from the database. 
+ + We delete all instances of `Q<app_label>.<module_name>.<pk>` which + should be unique to this object. + """ database = xapian.WritableDatabase(self.path, xapian.DB_CREATE_OR_OPEN) database.delete_document(DOCUMENT_ID_TERM_PREFIX + self.get_identifier(obj)) def clear(self, models=[]): + """ + Clear all instances of `models` from the database or all models, if + not specified. + + Optional arguments: + `models` -- Models to clear from the database (default = []) + + If `models` is empty, an empty query is executed which matches all + documents in the database. Afterwards, each match is deleted. + + Otherwise, for each model, a `delete_document` call is issued with + the term `XCONTENTTYPE<app_label>.<module_name>`. This will delete + all documents with the specified model type. + """ database = xapian.WritableDatabase(self.path, xapian.DB_CREATE_OR_OPEN) if not models: query = xapian.Query('') # Empty query matches all @@ -107,11 +200,53 @@ class SearchBackend(BaseSearchBackend): database.delete_document(match.get_docid()) else: for model in models: - database.delete_document(DOCUMENT_CT_TERM_PREFIX + '%s.%s' % (model._meta.app_label, model._meta.module_name)) + database.delete_document( + DOCUMENT_CT_TERM_PREFIX + '%s.%s' % + (model._meta.app_label, model._meta.module_name) + ) def search(self, query_string, sort_by=None, start_offset=0, end_offset=DEFAULT_MAX_RESULTS, fields='', highlight=False, facets=None, date_facets=None, query_facets=None, narrow_queries=None, **kwargs): + """ + Executes the search as defined in `query_string`. 
+ + Required arguments: + `query_string` -- Search query to execute + + Optional arguments: + `sort_by` -- Sort results by specified field (default = None) + `start_offset` -- Slice results from `start_offset` (default = 0) + `end_offset` -- Slice results at `end_offset` (default = 100,000) + `fields` -- Filter results on `fields` (default = '') + `highlight` -- Highlight terms in results (default = False) + `facets` -- Facet results on fields (default = None) + `date_facets` -- Facet results on date ranges (default = None) + `query_facets` -- Facet results on queries (default = None) + `narrow_queries` -- Narrow queries (default = None) + + Returns: + A dictionary with the following keys: + `results` -- A list of `SearchResult` + `hits` -- The total available results + `facets` -- A dictionary of facets with the following keys: + `fields` -- A list of field facets + `dates` -- A list of date facets + `queries` -- A list of query facets + If faceting was not used, the `facets` key will not be present + + If `query_string` is empty, returns no results. + + Otherwise, loads the available fields from the database meta data + and sets up prefixes for each one along with a prefix for `django_ct`, + used to filter by model, and loads the current stemmer instance. + + Afterwards, executes the Xapian query parser to create a query from + `query_string` that is then passed to a new `enquire` instance. + + The resulting match set is passed to :method:`_process_results` for + further processing prior to returning a dictionary with the results. + """ if not query_string: return { 'results': [], @@ -155,6 +290,11 @@ class SearchBackend(BaseSearchBackend): return self._process_results(matches, facets) def delete_index(self): + """ + Delete the index. + + This removes all index files and the `HAYSTACK_XAPIAN_PATH` folder. 
+ """ if os.path.exists(self.path): index_files = os.listdir(self.path) @@ -164,6 +304,9 @@ class SearchBackend(BaseSearchBackend): os.removedirs(self.path) def document_count(self): + """ + Retrieves the total document count for the search index. + """ try: database = xapian.Database(self.path) except xapian.DatabaseOpeningError: @@ -171,8 +314,33 @@ class SearchBackend(BaseSearchBackend): return database.get_doccount() def more_like_this(self, model_instance): + """ + Given a model instance, returns a result set of similar documents. + + Required arguments: + `model_instance` -- The model instance to use as a basis for + retrieving similar documents. + + Returns: + A dictionary with the following keys: + `results` -- A list of `SearchResult` + `hits` -- The total available results + + Opens a database connection, then builds a simple query using the + `model_instance` to build the unique identifier. + + For each document retrieved (should always be one), adds an entry into + an RSet (relevance set) with the document id, then uses the RSet + to query for an ESet (a set of terms that can be used to suggest + expansions to the original query), omitting any document that was in + the original query. + + Finally, processes the resulting matches and returns. + """ database = xapian.Database(self.path) - query = xapian.Query(DOCUMENT_ID_TERM_PREFIX + self.get_identifier(model_instance)) + query = xapian.Query( + DOCUMENT_ID_TERM_PREFIX + self.get_identifier(model_instance) + ) enquire = xapian.Enquire(database) enquire.set_query(query) rset = xapian.RSet() @@ -188,7 +356,34 @@ class SearchBackend(BaseSearchBackend): matches = enquire.get_mset(0, DEFAULT_MAX_RESULTS) return self._process_results(matches) - def _process_results(self, matches, facets=None, highlights=[]): + def _process_results(self, matches, facets=None): + """ + Private method for processing an MSet (match set). 
+ + Required arguments: + `matches` -- An MSet of matches + + Optional arguments: + `facets` -- Fields to facet (default = None) + + Returns: + A dictionary with the following keys: + `results` -- A list of `SearchResult` + `hits` -- The total available results + `facets` -- A dictionary of facets with the following keys: + `fields` -- A list of field facets + `dates` -- A list of date facets + `queries` -- A list of query facets + If faceting was not used, the `facets` key will not be present + + For each match in `matches`, retrieves the corresponding document + and extracts the `app_name`, `model_name`, and `pk` from the information + at value position 0, and uses :method:`pickle.loads` to load the remaining + model values from the document data area. + + For each match, one `SearchResult` will be appended to the `results` + list. + """ facets_dict = { 'fields': {}, 'dates': {}, @@ -207,7 +402,9 @@ class SearchBackend(BaseSearchBackend): results.append(result) if facets: - facets_dict['fields'] = self._do_field_facets(document, facets, facets_dict['fields']) + facets_dict['fields'] = self._do_field_facets( + document, facets, facets_dict['fields'] + ) return { 'results': results, @@ -216,9 +413,22 @@ class SearchBackend(BaseSearchBackend): } def _do_field_facets(self, document, facets, fields): - field_re = re.compile(r'(?<=(?