Implemented highlighting

This commit is contained in:
David Sauve 2009-07-27 15:12:20 -04:00
parent a238bafe80
commit c087348a48
2 changed files with 60 additions and 23 deletions

View file

@ -85,7 +85,7 @@ class XapianSearchBackendTestCase(TestCase):
document_list.append(object_data)
return document_list
def test_update(self):
self.sb.update(self.msi, self.sample_objs)
self.sb.update(self.msi, self.sample_objs) # Duplicates should be updated, not appended -- http://github.com/notanumber/xapian-haystack/issues/#issue/6
@ -104,37 +104,37 @@ class XapianSearchBackendTestCase(TestCase):
def test_clear(self):
    # Exercises `clear()` with no arguments, with a model list that leaves
    # the index untouched, with the indexed model, and with both together.
    def indexed_count():
        # Number of documents the raw xapian search currently reports.
        return len(self.xapian_search(''))

    self.sb.update(self.msi, self.sample_objs)
    self.assertEqual(indexed_count(), 3)

    # No arguments: the index is expected to end up empty.
    self.sb.clear()
    self.assertEqual(indexed_count(), 0)

    self.sb.update(self.msi, self.sample_objs)
    self.assertEqual(indexed_count(), 3)

    # Clearing only AnotherMockModel is expected to leave all 3 documents.
    self.sb.clear([AnotherMockModel])
    self.assertEqual(indexed_count(), 3)

    # Clearing MockModel is expected to remove them.
    self.sb.clear([MockModel])
    self.assertEqual(indexed_count(), 0)

    self.sb.update(self.msi, self.sample_objs)
    self.assertEqual(indexed_count(), 3)

    # Passing both models at once is expected to empty the index as well.
    self.sb.clear([AnotherMockModel, MockModel])
    self.assertEqual(indexed_count(), 0)
def test_search(self):
    # Index the sample objects and confirm all three are stored.
    self.sb.update(self.msi, self.sample_objs)
    stored = self.xapian_search('')
    self.assertEqual(len(stored), 3)

    # An empty query string is expected to produce an empty result set.
    empty_response = self.sb.search('')
    self.assertEqual(empty_response, {'hits': 0, 'results': []})

    # The bare wildcard is expected to match every indexed document.
    wildcard_hits = self.sb.search('*')['hits']
    self.assertEqual(wildcard_hits, 3)

    found_pks = [hit.pk for hit in self.sb.search('*')['results']]
    self.assertEqual(found_pks, [u'1', u'2', u'3'])

    # Highlighting assertions live in `test_highlight`.
def test_field_facets(self):
self.sb.update(self.msi, self.sample_objs)
self.assertEqual(len(self.xapian_search('')), 3)
self.assertEqual(self.sb.search('', facets=['name']), {'hits': 0, 'results': []})
results = self.sb.search('index', facets=['name'])
self.assertEqual(results['hits'], 3)
@ -150,10 +150,22 @@ class XapianSearchBackendTestCase(TestCase):
# self.assertEqual(results['hits'], 3)
# self.assertEqual(results['facets'], {})
def test_narrow_queries(self):
    # Index the sample objects and confirm all three are stored.
    self.sb.update(self.msi, self.sample_objs)
    stored = self.xapian_search('')
    self.assertEqual(len(stored), 3)

    # An empty query string is expected to return nothing, even when narrowed.
    empty_response = self.sb.search('', narrow_queries=['name:david1'])
    self.assertEqual(empty_response, {'hits': 0, 'results': []})

    # NOTE(review): the wildcard search is issued but its result is never
    # inspected; only the plain 'index' search below is asserted on.
    # Confirm whether the wildcard call is still wanted.
    self.sb.search('index*', narrow_queries=['name:david1'])

    results = self.sb.search('index', narrow_queries=['name:david1'])
    self.assertEqual(results['hits'], 1)
def test_highlight(self):
    # Index the sample objects and confirm all three are stored.
    self.sb.update(self.msi, self.sample_objs)
    stored = self.xapian_search('')
    self.assertEqual(len(stored), 3)

    # An empty query string is expected to return nothing, highlighted or not.
    empty_response = self.sb.search('', highlight=True)
    self.assertEqual(empty_response, {'hits': 0, 'results': []})

    hit_count = self.sb.search('Index', highlight=True)['hits']
    self.assertEqual(hit_count, 3)

    # Each result is expected to carry its text with the query term
    # wrapped in <em> tags.
    highlighted_texts = [
        hit.highlighted['text']
        for hit in self.sb.search('Index', highlight=True)['results']
    ]
    self.assertEqual(
        highlighted_texts,
        ['<em>Index</em>ed!\n1', '<em>Index</em>ed!\n2', '<em>Index</em>ed!\n3'],
    )
def test_spelling_suggestion(self):
self.sb.update(self.msi, self.sample_objs)
self.assertEqual(len(self.xapian_search('')), 3)
@ -168,10 +180,10 @@ class XapianSearchBackendTestCase(TestCase):
self.sb.update(self.msi, self.sample_objs)
self.assertEqual(len(self.xapian_search('')), 3)
results = self.sb.search('index', facets=['name'])
results = self.sb.search('index')
self.assertEqual(results['hits'], 3)
results = self.sb.search('indexing', facets=['name'])
results = self.sb.search('indexing')
self.assertEqual(results['hits'], 3)
def test_more_like_this(self):

View file

@ -78,7 +78,7 @@ class SearchBackend(BaseSearchBackend):
raise ImproperlyConfigured('You must specify a HAYSTACK_XAPIAN_PATH in your settings.')
self.path = settings.HAYSTACK_XAPIAN_PATH
self.stemmer = xapian.Stem('english')
self.stemmer = xapian.Stem(stem_lang)
if not os.path.exists(self.path):
os.makedirs(self.path)
@ -264,9 +264,6 @@ class SearchBackend(BaseSearchBackend):
if query_facets is not None:
warnings.warn("Query faceting has not been implemented yet.", Warning, stacklevel=2)
if highlight is not False:
warnings.warn("Highlight has not been implemented yet.", Warning, stacklevel=2)
database = self._open_database()
schema = pickle.loads(database.get_metadata('schema'))
spelling_suggestion = None
@ -291,7 +288,9 @@ class SearchBackend(BaseSearchBackend):
enquire.set_sort_by_key_then_relevance(sorter, True)
matches = enquire.get_mset(start_offset, end_offset)
results = self._process_results(matches, facets)
results = self._process_results(
matches, query_string=query_string, highlight=highlight, facets=facets
)
if spelling_suggestion:
results['spelling_suggestion'] = spelling_suggestion
@ -364,7 +363,7 @@ class SearchBackend(BaseSearchBackend):
matches = enquire.get_mset(0, DEFAULT_MAX_RESULTS)
return self._process_results(matches)
def _process_results(self, matches, facets=None):
def _process_results(self, matches, query_string='', highlight=False, facets=None):
"""
Private method for processing an MSet (match set).
@ -372,6 +371,8 @@ class SearchBackend(BaseSearchBackend):
`matches` -- An MSet of matches
Optional arguments:
`query_string` -- The query string that generated the matches
`highlight` -- Add highlighting to results? (default=False)
`facets` -- Fields to facet (default = None)
Returns:
@ -404,6 +405,12 @@ class SearchBackend(BaseSearchBackend):
document = match.get_document()
app_label, module_name, pk = document.get_value(0).split('.')
additional_fields = pickle.loads(document.get_data())
if highlight and (len(query_string) > 0):
additional_fields['highlighted'] = {
self.content_field_name: self._do_highlight(
additional_fields.get(self.content_field_name), query_string
)
}
result = SearchResult(
app_label, module_name, pk, match.weight, **additional_fields
)
@ -420,6 +427,23 @@ class SearchBackend(BaseSearchBackend):
'facets': facets_dict,
}
def _do_highlight(self, content, text, tag='em'):
    """
    Highlight `text` in `content` with html `tag`.

    This method assumes that the input text (`content`) does not contain
    any special formatting.  That is, it does not contain any html tags
    or similar markup that could be screwed up by the highlighting.

    `text` is split on whitespace into terms and any `*` wildcard
    markers are stripped from them.  Matching is case-insensitive, and
    the matched text keeps the casing it has in `content`.

    Required arguments:
        `content` -- Content to search for instances of `text`
        `text` -- The text to be highlighted

    Optional arguments:
        `tag` -- Name of the html tag wrapped around each match (default = 'em')

    Returns:
        `content` with every match wrapped in `<tag>...</tag>`.  `content`
        is returned unchanged when it is empty/None or when `text` yields
        no usable terms.
    """
    if not content:
        return content

    # Strip wildcard markers and drop terms that end up empty: an empty
    # pattern would match at every position of `content`.
    terms = set(term.replace('*', '') for term in text.split())
    terms.discard('')
    if not terms:
        return content

    # Use one alternation, longest term first, so 'indexed' wins over
    # 'index' at the same position.  A single pass also means the tags
    # inserted for one term are never rescanned for another term
    # (e.g. a query term 'em' matching inside '<em>').
    ordered_terms = sorted(terms, key=lambda term: (-len(term), term))
    pattern = re.compile(
        '|'.join(re.escape(term) for term in ordered_terms), re.IGNORECASE
    )

    def wrap(match):
        # Re-emit the matched text itself rather than the query term, so
        # the content's casing is preserved.  A callable replacement also
        # keeps backslashes in the query from being read as escapes.
        return '<%s>%s</%s>' % (tag, match.group(0), tag)

    return pattern.sub(wrap, content)
def _do_field_facets(self, document, facets, fields):
"""
Private method that facets a document by field name.
@ -477,7 +501,7 @@ class SearchBackend(BaseSearchBackend):
Returns a dictionary that can be stored in the database ('schema') metadata.
"""
content_field_name, fields = self.site.build_unified_schema()
self.content_field_name, fields = self.site.build_unified_schema()
schema_fields = {}
for i, field in enumerate(fields):
if field['indexed'] == 'true':
@ -586,6 +610,7 @@ class SearchBackend(BaseSearchBackend):
enquire.set_docid_order(enquire.ASCENDING)
return enquire
class SearchQuery(BaseSearchQuery):
"""
`SearchQuery` is responsible for converting search queries into a format