From 7af321a2e9a9c5902eb755216168074497b5fe13 Mon Sep 17 00:00:00 2001 From: Karl Hobley Date: Thu, 30 Oct 2014 16:45:52 +0000 Subject: [PATCH 1/2] Don't use edgengram as query analyser When a field uses partial matching, the edgengram_analyzer is added to that field. This breaks down the field data into "ngrams" like so: Hello -> "H", "He", "Hel", "Hell", "Hello" This allows a user's query for "Hel" to match the above text. The issue that this commit solves is that this was accidentally set as both the index analyser (as described above) and also the query analyser. Setting this as the query analyser will instruct Elasticsearch to perform the above transformation on the user's input to the search box as well. So if, for example, there was a document with the word "Horse" in it, a user's query for "Hello" will match this simply because they both start with the letter "H". The solution is to simply set the "index_analyzer" field instead of the "analyzer" field (which sets the query analyser as well). 
--- .../wagtailsearch/backends/elasticsearch.py | 4 +-- .../tests/test_elasticsearch_backend.py | 34 ++++++++++++++++++- 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/wagtail/wagtailsearch/backends/elasticsearch.py b/wagtail/wagtailsearch/backends/elasticsearch.py index 7a1eaf6f1..a4a89aeb9 100644 --- a/wagtail/wagtailsearch/backends/elasticsearch.py +++ b/wagtail/wagtailsearch/backends/elasticsearch.py @@ -62,7 +62,7 @@ class ElasticSearchMapping(object): mapping['boost'] = field.boost if field.partial_match: - mapping['analyzer'] = 'edgengram_analyzer' + mapping['index_analyzer'] = 'edgengram_analyzer' mapping['include_in_all'] = True elif isinstance(field, FilterField): @@ -80,7 +80,7 @@ class ElasticSearchMapping(object): fields = { 'pk': dict(type='string', index='not_analyzed', store='yes', include_in_all=False), 'content_type': dict(type='string', index='not_analyzed', include_in_all=False), - '_partials': dict(type='string', analyzer='edgengram_analyzer', include_in_all=False), + '_partials': dict(type='string', index_analyzer='edgengram_analyzer', include_in_all=False), } fields.update(dict( diff --git a/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py b/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py index 8a8b71bdd..583939c01 100644 --- a/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py +++ b/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py @@ -89,7 +89,7 @@ class TestElasticSearchBackend(BackendTests, TestCase): # Add some test data obj = models.SearchTest() - obj.title = "Ĥéỻø" + obj.title = "Ĥéllø" obj.live = True obj.save() self.backend.add(obj) @@ -103,6 +103,38 @@ class TestElasticSearchBackend(BackendTests, TestCase): self.assertEqual(len(results), 1) self.assertEqual(results[0].id, obj.id) + def test_query_analyser(self): + """ + This is testing that fields that use edgengram_analyzer as their index analyser do not + have it also as their query analyser + """ + # Reset the index + 
self.backend.reset_index() + self.backend.add_type(models.SearchTest) + self.backend.add_type(models.SearchTestChild) + + # Add some test data + obj = models.SearchTest() + obj.title = "Hello" + obj.live = True + obj.save() + self.backend.add(obj) + + # Refresh the index + self.backend.refresh_index() + + # Test search for "Hello" + results = self.backend.search("Hello", models.SearchTest.objects.all()) + + # Should find the result + self.assertEqual(len(results), 1) + + # Test search for "Horse" + results = self.backend.search("Horse", models.SearchTest.objects.all()) + + # Even though they both start with the letter "H". This should not be considered a match + self.assertEqual(len(results), 0) + class TestElasticSearchQuery(TestCase): def assertDictEqual(self, a, b): From 57ae33e5a573485bdff7691e3752d693e9c9dcd7 Mon Sep 17 00:00:00 2001 From: Karl Hobley Date: Fri, 31 Oct 2014 09:35:33 +0000 Subject: [PATCH 2/2] Updated tests --- .../wagtailsearch/tests/test_elasticsearch_backend.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py b/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py index 583939c01..2ab755c51 100644 --- a/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py +++ b/wagtail/wagtailsearch/tests/test_elasticsearch_backend.py @@ -342,10 +342,10 @@ class TestElasticSearchMapping(TestCase): 'properties': { 'pk': {'index': 'not_analyzed', 'type': 'string', 'store': 'yes', 'include_in_all': False}, 'content_type': {'index': 'not_analyzed', 'type': 'string', 'include_in_all': False}, - '_partials': {'analyzer': 'edgengram_analyzer', 'include_in_all': False, 'type': 'string'}, + '_partials': {'index_analyzer': 'edgengram_analyzer', 'include_in_all': False, 'type': 'string'}, 'live_filter': {'index': 'not_analyzed', 'type': 'boolean', 'include_in_all': False}, 'published_date_filter': {'index': 'not_analyzed', 'type': 'date', 'include_in_all': False}, - 'title': 
{'type': 'string', 'include_in_all': True, 'analyzer': 'edgengram_analyzer'}, + 'title': {'type': 'string', 'include_in_all': True, 'index_analyzer': 'edgengram_analyzer'}, 'title_filter': {'index': 'not_analyzed', 'type': 'string', 'include_in_all': False}, 'content': {'type': 'string', 'include_in_all': True}, 'callable_indexed_field': {'type': 'string', 'include_in_all': True} @@ -414,15 +414,15 @@ class TestElasticSearchMappingInheritance(TestCase): 'properties': { # New 'extra_content': {'type': 'string', 'include_in_all': True}, - 'subtitle': {'type': 'string', 'include_in_all': True, 'analyzer': 'edgengram_analyzer'}, + 'subtitle': {'type': 'string', 'include_in_all': True, 'index_analyzer': 'edgengram_analyzer'}, # Inherited 'pk': {'index': 'not_analyzed', 'type': 'string', 'store': 'yes', 'include_in_all': False}, 'content_type': {'index': 'not_analyzed', 'type': 'string', 'include_in_all': False}, - '_partials': {'analyzer': 'edgengram_analyzer', 'include_in_all': False, 'type': 'string'}, + '_partials': {'index_analyzer': 'edgengram_analyzer', 'include_in_all': False, 'type': 'string'}, 'live_filter': {'index': 'not_analyzed', 'type': 'boolean', 'include_in_all': False}, 'published_date_filter': {'index': 'not_analyzed', 'type': 'date', 'include_in_all': False}, - 'title': {'type': 'string', 'include_in_all': True, 'analyzer': 'edgengram_analyzer'}, + 'title': {'type': 'string', 'include_in_all': True, 'index_analyzer': 'edgengram_analyzer'}, 'title_filter': {'index': 'not_analyzed', 'type': 'string', 'include_in_all': False}, 'content': {'type': 'string', 'include_in_all': True}, 'callable_indexed_field': {'type': 'string', 'include_in_all': True}