Implemented highlighting

2026-05-16 02:53:13 +00:00 · 2009-07-27 15:12:20 -04:00 · 2009-07-27 15:12:20 -04:00 · c087348a48
commit c087348a48
parent a238bafe80
2 changed files with 60 additions and 23 deletions
--- a/tests/xapian_backend.py
+++ b/tests/xapian_backend.py
@ -85,7 +85,7 @@ class XapianSearchBackendTestCase(TestCase):
            document_list.append(object_data)

        return document_list
-    
+
    def test_update(self):
        self.sb.update(self.msi, self.sample_objs)
        self.sb.update(self.msi, self.sample_objs) # Duplicates should be updated, not appended -- http://github.com/notanumber/xapian-haystack/issues/#issue/6
@ -104,37 +104,37 @@ class XapianSearchBackendTestCase(TestCase):
    def test_clear(self):
        self.sb.update(self.msi, self.sample_objs)
        self.assertEqual(len(self.xapian_search('')), 3)
-        
+
        self.sb.clear()
        self.assertEqual(len(self.xapian_search('')), 0)
-        
+
        self.sb.update(self.msi, self.sample_objs)
        self.assertEqual(len(self.xapian_search('')), 3)
-        
+
        self.sb.clear([AnotherMockModel])
        self.assertEqual(len(self.xapian_search('')), 3)
-        
+
        self.sb.clear([MockModel])
        self.assertEqual(len(self.xapian_search('')), 0)
-        
+
        self.sb.update(self.msi, self.sample_objs)
        self.assertEqual(len(self.xapian_search('')), 3)
-        
+
        self.sb.clear([AnotherMockModel, MockModel])
        self.assertEqual(len(self.xapian_search('')), 0)
    
    def test_search(self):
        self.sb.update(self.msi, self.sample_objs)
        self.assertEqual(len(self.xapian_search('')), 3)
-        
+
        self.assertEqual(self.sb.search(''), {'hits': 0, 'results': []})
        self.assertEqual(self.sb.search('*')['hits'], 3)
        self.assertEqual([result.pk for result in self.sb.search('*')['results']], [u'1', u'2', u'3'])
-        
-        # self.assertEqual(self.sb.search('', highlight=True), {'hits': 0, 'results': []})
-        # self.assertEqual(self.sb.search('Index*', highlight=True)['hits'], 3)
-        # self.assertEqual([result.highlighted['text'][0] for result in self.sb.search('Index*', highlight=True)['results']], ['<em>Indexed</em>!\n3', '<em>Indexed</em>!\n2', '<em>Indexed</em>!\n1'])
-        # 
+
+    def test_field_facets(self):
+        self.sb.update(self.msi, self.sample_objs)
+        self.assertEqual(len(self.xapian_search('')), 3)
+
        self.assertEqual(self.sb.search('', facets=['name']), {'hits': 0, 'results': []})
        results = self.sb.search('index', facets=['name'])
        self.assertEqual(results['hits'], 3)
@ -150,10 +150,22 @@ class XapianSearchBackendTestCase(TestCase):
    #     self.assertEqual(results['hits'], 3)
    #     self.assertEqual(results['facets'], {})

+    def test_narrow_queries(self):
+        self.sb.update(self.msi, self.sample_objs)
+        self.assertEqual(len(self.xapian_search('')), 3)
+
        self.assertEqual(self.sb.search('', narrow_queries=['name:david1']), {'hits': 0, 'results': []})
-        results = self.sb.search('index*', narrow_queries=['name:david1'])
+        results = self.sb.search('index', narrow_queries=['name:david1'])
        self.assertEqual(results['hits'], 1)

+    def test_highlight(self):
+        self.sb.update(self.msi, self.sample_objs)
+        self.assertEqual(len(self.xapian_search('')), 3)
+        # Empty query: no hits. 'Index': each result's text has the match wrapped in <em>.
+        self.assertEqual(self.sb.search('', highlight=True), {'hits': 0, 'results': []})
+        self.assertEqual(self.sb.search('Index', highlight=True)['hits'], 3)
+        self.assertEqual([result.highlighted['text'] for result in self.sb.search('Index', highlight=True)['results']], ['<em>Index</em>ed!\n1', '<em>Index</em>ed!\n2', '<em>Index</em>ed!\n3'])
+
    def test_spelling_suggestion(self):
        self.sb.update(self.msi, self.sample_objs)
        self.assertEqual(len(self.xapian_search('')), 3)
@ -168,10 +180,10 @@ class XapianSearchBackendTestCase(TestCase):
        self.sb.update(self.msi, self.sample_objs)
        self.assertEqual(len(self.xapian_search('')), 3)

-        results = self.sb.search('index', facets=['name'])
+        results = self.sb.search('index')
        self.assertEqual(results['hits'], 3)

-        results = self.sb.search('indexing', facets=['name'])
+        results = self.sb.search('indexing')
        self.assertEqual(results['hits'], 3)

    def test_more_like_this(self):
--- a/xapian_backend.py
+++ b/xapian_backend.py
@ -78,7 +78,7 @@ class SearchBackend(BaseSearchBackend):
            raise ImproperlyConfigured('You must specify a HAYSTACK_XAPIAN_PATH in your settings.')

        self.path = settings.HAYSTACK_XAPIAN_PATH
-        self.stemmer = xapian.Stem('english')
+        self.stemmer = xapian.Stem(stem_lang)

        if not os.path.exists(self.path):
            os.makedirs(self.path)
@ -264,9 +264,6 @@ class SearchBackend(BaseSearchBackend):
        if query_facets is not None:
            warnings.warn("Query faceting has not been implemented yet.", Warning, stacklevel=2)

-        if highlight is not False:
-            warnings.warn("Highlight has not been implemented yet.", Warning, stacklevel=2)
-
        database = self._open_database()
        schema = pickle.loads(database.get_metadata('schema'))
        spelling_suggestion = None
@ -291,7 +288,9 @@ class SearchBackend(BaseSearchBackend):
            enquire.set_sort_by_key_then_relevance(sorter, True)

        matches = enquire.get_mset(start_offset, end_offset)
-        results = self._process_results(matches, facets)
+        results = self._process_results(
+            matches, query_string=query_string, highlight=highlight, facets=facets
+        )

        if spelling_suggestion:
            results['spelling_suggestion'] = spelling_suggestion
@ -364,7 +363,7 @@ class SearchBackend(BaseSearchBackend):
        matches = enquire.get_mset(0, DEFAULT_MAX_RESULTS)
        return self._process_results(matches)

-    def _process_results(self, matches, facets=None):
+    def _process_results(self, matches, query_string='', highlight=False, facets=None):
        """
        Private method for processing an MSet (match set).

@ -372,6 +371,8 @@ class SearchBackend(BaseSearchBackend):
            `matches` -- An MSet of matches

        Optional arguments:
+            `query_string` -- The query string that generated the matches
+            `highlight` -- Add highlighting to results? (default=False)
            `facets` -- Fields to facet (default = None)

        Returns:
@ -404,6 +405,12 @@ class SearchBackend(BaseSearchBackend):
            document = match.get_document()
            app_label, module_name, pk = document.get_value(0).split('.')
            additional_fields = pickle.loads(document.get_data())
+            if highlight and (len(query_string) > 0):
+                additional_fields['highlighted'] = {
+                    self.content_field_name: self._do_highlight(
+                        additional_fields.get(self.content_field_name), query_string
+                    )
+                }
            result = SearchResult(
                app_label, module_name, pk, match.weight, **additional_fields
            )
@ -420,6 +427,23 @@ class SearchBackend(BaseSearchBackend):
            'facets': facets_dict,
        }

+    def _do_highlight(self, content, text, tag='em'):
+        """
+        Wrap every case-insensitive match of a `text` term in html `tag`.
+
+        Assumes `content` is plain text without html tags or similar markup.
+        Matches keep their own case and all terms are applied in one pass.
+
+        Required arguments:
+            `content` -- Content to search for instances of `text`
+            `text` -- Whitespace separated terms (any `*` is ignored)
+        """
+        terms = [re.escape(term) for term in text.replace('*', '').split()]
+        if not terms or not content:  # bare `*` query or missing field
+            return content
+        term_re = re.compile('|'.join(terms), re.IGNORECASE)
+        return term_re.sub(lambda m: '<%s>%s</%s>' % (tag, m.group(0), tag), content)
+
    def _do_field_facets(self, document, facets, fields):
        """
        Private method that facets a document by field name.
@ -477,7 +501,7 @@ class SearchBackend(BaseSearchBackend):

        Returns a dictionary that can be stored in the database ('schema') metdata.
        """
-        content_field_name, fields = self.site.build_unified_schema()
+        self.content_field_name, fields = self.site.build_unified_schema()
        schema_fields = {}
        for i, field in enumerate(fields):
            if field['indexed'] == 'true':
@ -586,6 +610,7 @@ class SearchBackend(BaseSearchBackend):
        enquire.set_docid_order(enquire.ASCENDING)
        return enquire

+
 class SearchQuery(BaseSearchQuery):
    """
    `SearchQuery` is responsible for converting search queries into a format