From 9bf6c1872624201435b34c7b492733f882d9b245 Mon Sep 17 00:00:00 2001 From: Karl Hobley Date: Fri, 17 Oct 2014 12:16:56 +0100 Subject: [PATCH 1/5] Added get_indexed_objects method to Page --- wagtail/wagtailcore/models.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/wagtail/wagtailcore/models.py b/wagtail/wagtailcore/models.py index a4d0bed92..11e3947fa 100644 --- a/wagtail/wagtailcore/models.py +++ b/wagtail/wagtailcore/models.py @@ -509,6 +509,11 @@ class Page(six.with_metaclass(PageBase, MP_Node, ClusterableModel, index.Indexed if self.url_path.startswith(root_path): return ('' if current_site.id == id else root_url) + reverse('wagtail_serve', args=(self.url_path[len(root_path):],)) + @classmethod + def get_indexed_objects(cls): + content_type = ContentType.objects.get_for_model(cls) + return super(Page, cls).get_indexed_objects().filter(content_type=content_type) + @classmethod def search(cls, query_string, show_unpublished=False, search_title_only=False, extra_filters={}, prefetch_related=[], path=None): # Filters From b4fdec74ac1af2b50ab5c79f6127d87033a9d297 Mon Sep 17 00:00:00 2001 From: Karl Hobley Date: Fri, 17 Oct 2014 12:17:20 +0100 Subject: [PATCH 2/5] Make search signal handlers use get_indexed_objects --- wagtail/wagtailsearch/signal_handlers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/wagtail/wagtailsearch/signal_handlers.py b/wagtail/wagtailsearch/signal_handlers.py index decbfdb51..3e90d0c74 100644 --- a/wagtail/wagtailsearch/signal_handlers.py +++ b/wagtail/wagtailsearch/signal_handlers.py @@ -6,11 +6,18 @@ from wagtail.wagtailsearch.backends import get_search_backends def post_save_signal_handler(instance, **kwargs): + if instance not in type(instance).get_indexed_objects(): + return + + for backend in get_search_backends(): backend.add(instance) def post_delete_signal_handler(instance, **kwargs): + if instance not in type(instance).get_indexed_objects(): + return + for backend in get_search_backends(): backend.delete(instance) From 7a4651ad64e947b4e1984d5cbce85de3edf6176a Mon Sep 17 00:00:00 2001 From: Karl Hobley Date: Fri, 17 Oct 2014 12:43:29 +0100 Subject: [PATCH 3/5] Search: Added new deduplication method Fixes #710 --- wagtail/tests/models.py | 11 ++++ wagtail/wagtailsearch/backends/base.py | 2 +- wagtail/wagtailsearch/backends/db.py | 4 +- .../wagtailsearch/backends/elasticsearch.py | 47 ++++++--------- .../management/commands/update_index.py | 59 ++++++------------- 5 files changed, 48 insertions(+), 75 deletions(-) diff --git a/wagtail/tests/models.py b/wagtail/tests/models.py index 53a9b2796..e2a695f03 100644 --- a/wagtail/tests/models.py +++ b/wagtail/tests/models.py @@ -425,6 +425,17 @@ class SearchTest(models.Model, index.Indexed): def callable_indexed_field(self): return "Callable" + @classmethod + def get_indexed_objects(cls): + indexed_objects = super(SearchTest, cls).get_indexed_objects() + + # Exclude SearchTests that have a SearchTestChild to prevent duplicates + if cls is SearchTest: + indexed_objects = indexed_objects.exclude( + id__in=SearchTestChild.objects.all().values_list('searchtest_ptr_id', flat=True) + ) + + return indexed_objects class SearchTestChild(SearchTest): subtitle = models.CharField(max_length=255, null=True, blank=True) diff --git a/wagtail/wagtailsearch/backends/base.py b/wagtail/wagtailsearch/backends/base.py index 1928a7bbe..e3a3a9bd2 100644 --- a/wagtail/wagtailsearch/backends/base.py +++ b/wagtail/wagtailsearch/backends/base.py @@ -29,7 +29,7 @@ class BaseSearch(object): def add(self, obj): return NotImplemented - def add_bulk(self, obj_list): + def add_bulk(self, model, obj_list): return NotImplemented def delete(self, obj): diff --git a/wagtail/wagtailsearch/backends/db.py b/wagtail/wagtailsearch/backends/db.py index 3f7e64f42..d190e29e8 100644 --- a/wagtail/wagtailsearch/backends/db.py +++ b/wagtail/wagtailsearch/backends/db.py @@ -19,8 +19,8 @@ class DBSearch(BaseSearch): def add(self, obj): pass # Not needed - def add_bulk(self, obj_list): - return [] # Not needed + def add_bulk(self, model, obj_list): + pass # Not needed def delete(self, obj): pass # Not needed diff --git a/wagtail/wagtailsearch/backends/elasticsearch.py b/wagtail/wagtailsearch/backends/elasticsearch.py index 80830827a..ba6dc35a3 100644 --- a/wagtail/wagtailsearch/backends/elasticsearch.py +++ b/wagtail/wagtailsearch/backends/elasticsearch.py @@ -572,42 +572,29 @@ class ElasticSearch(BaseSearch): # Add document to index self.es.index(self.es_index, mapping.get_document_type(), mapping.get_document(obj), id=mapping.get_document_id(obj)) - def add_bulk(self, obj_list): - # Group all objects by their type - type_set = {} + def add_bulk(self, model, obj_list): + # Get mapping + mapping = ElasticSearchMapping(model) + doc_type = mapping.get_document_type() + + # Create list of actions + actions = [] for obj in obj_list: # Object must be a decendant of Indexed and be a django model if not self.object_can_be_indexed(obj): continue - # Get mapping - mapping = ElasticSearchMapping(obj.__class__) + # Create the action + action = { + '_index': self.es_index, + '_type': doc_type, + '_id': mapping.get_document_id(obj), + } + action.update(mapping.get_document(obj)) + actions.append(action) - # Get document type - doc_type = mapping.get_document_type() - - # If type is currently not in set, add it - if doc_type not in type_set: - type_set[doc_type] = [] - - # Add document to set - type_set[doc_type].append((mapping.get_document_id(obj), mapping.get_document(obj))) - - # Loop through each type and bulk add them - for type_name, type_documents in type_set.items(): - # Get list of actions - actions = [] - for doc_id, doc in type_documents: - action = { - '_index': self.es_index, - '_type': type_name, - '_id': doc_id, - } - action.update(doc) - actions.append(action) - - yield type_name, len(type_documents) - bulk(self.es, actions) + # Run the actions + bulk(self.es, actions) def delete(self, obj): # Object must be a decendant of Indexed and be a django model diff --git a/wagtail/wagtailsearch/management/commands/update_index.py b/wagtail/wagtailsearch/management/commands/update_index.py index 2c9002c4a..0d7900dfc 100644 --- a/wagtail/wagtailsearch/management/commands/update_index.py +++ b/wagtail/wagtailsearch/management/commands/update_index.py @@ -10,43 +10,16 @@ from wagtail.wagtailsearch.backends import get_search_backend class Command(BaseCommand): def get_object_list(self): - # Print info - self.stdout.write("Getting object list") - # Get list of indexed models indexed_models = [model for model in models.get_models() if issubclass(model, Indexed)] - # Object set - object_set = {} + # Return list of (model_name, queryset) tuples + return [ + (model, model.get_indexed_objects()) + for model in indexed_models + ] - # Add all objects to object set and detect any duplicates - # Duplicates are caused when both a model and a derived model are indexed - # Eg, if BlogPost inherits from Page and both of these models are indexed - # If we were to add all objects from both models into the index, all the BlogPosts will have two entries - for model in indexed_models: - # Get toplevel content type - toplevel_content_type = model.indexed_get_toplevel_content_type() - - # Loop through objects - for obj in model.get_indexed_objects(): - # Get key for this object - key = toplevel_content_type + ':' + str(obj.pk) - - # Check if this key already exists - if key in object_set: - # Conflict, work out who should get this space - # The object with the longest content type string gets the space - # Eg, "wagtailcore.Page-myapp.BlogPost" kicks out "wagtailcore.Page" - if len(obj.indexed_get_content_type()) > len(object_set[key].indexed_get_content_type()): - # Take the spot - object_set[key] = obj - else: - # Space free, take it - object_set[key] = obj - - return indexed_models, object_set.values() - - def update_backend(self, backend_name, models, object_list): + def update_backend(self, backend_name, object_list): # Print info self.stdout.write("Updating backend: " + backend_name) @@ -57,15 +30,17 @@ class Command(BaseCommand): self.stdout.write(backend_name + ": Reseting index") backend.reset_index() - # Add types - self.stdout.write(backend_name + ": Adding types") - for model in models: + for model, queryset in object_list: + self.stdout.write(backend_name + ": Indexing model '%s.%s'" % ( + model._meta.app_label, + model.__name__, + )) + + # Add type backend.add_type(model) - # Add objects to index - self.stdout.write(backend_name + ": Adding objects") - for result in backend.add_bulk(object_list): - self.stdout.write(result[0] + ' ' + str(result[1])) + # Add objects + backend.add_bulk(model, queryset) # Refresh index self.stdout.write(backend_name + ": Refreshing index") @@ -82,7 +57,7 @@ class Command(BaseCommand): def handle(self, **options): # Get object list - models, object_list = self.get_object_list() + object_list = self.get_object_list() # Get list of backends to index if options['backend_name']: @@ -97,4 +72,4 @@ class Command(BaseCommand): # Update backends for backend_name in backend_names: - self.update_backend(backend_name, models, object_list) + self.update_backend(backend_name, object_list) From 3c3c3b6557d2c99b3b96cd0a85131d3cc712d331 Mon Sep 17 00:00:00 2001 From: Karl Hobley Date: Tue, 28 Oct 2014 15:39:57 +0000 Subject: [PATCH 4/5] Improved performance of signal handlers --- wagtail/wagtailsearch/signal_handlers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wagtail/wagtailsearch/signal_handlers.py b/wagtail/wagtailsearch/signal_handlers.py index 3e90d0c74..a828c64c2 100644 --- a/wagtail/wagtailsearch/signal_handlers.py +++ b/wagtail/wagtailsearch/signal_handlers.py @@ -6,7 +6,7 @@ from wagtail.wagtailsearch.backends import get_search_backends def post_save_signal_handler(instance, **kwargs): - if instance not in type(instance).get_indexed_objects(): + if not type(instance).get_indexed_objects().filter(id=instance.id).exists(): return @@ -15,7 +15,7 @@ def post_save_signal_handler(instance, **kwargs): def post_delete_signal_handler(instance, **kwargs): - if instance not in type(instance).get_indexed_objects(): + if not type(instance).get_indexed_objects().filter(id=instance.id).exists(): return for backend in get_search_backends(): From 47e32a3cd16784af97eabbb13251c327e6b5dcf9 Mon Sep 17 00:00:00 2001 From: Matt Westcott Date: Tue, 28 Oct 2014 15:48:16 +0000 Subject: [PATCH 5/5] Release note for #714 --- CHANGELOG.txt | 1 + docs/releases/0.8.rst | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.txt b/CHANGELOG.txt index cb9376131..285bba5c4 100644 --- a/CHANGELOG.txt +++ b/CHANGELOG.txt @@ -16,6 +16,7 @@ Changelog * Fix: When copying pages recursively, revisions of sub-pages were being copied regardless of the copy_revisions flag * Fix: Updated the migration dependencies within the project template to ensure that Wagtail's own migrations consistently apply first. * Fix: The cache of site root paths is now cleared when a site is deleted. + * Fix: Search indexing now prevents pages from being indexed multiple times, as both the base Page model and the specific subclass 0.7 (09.10.2014) ~~~~~~~~~~~~~~~~ diff --git a/docs/releases/0.8.rst b/docs/releases/0.8.rst index 9a6de865d..ec1e82b8c 100644 --- a/docs/releases/0.8.rst +++ b/docs/releases/0.8.rst @@ -32,7 +32,7 @@ Bug fixes * When copying pages recursively, revisions of sub-pages were being copied regardless of the ``copy_revisions`` flag * Updated the migration dependencies within the project template to ensure that Wagtail's own migrations consistently apply first * The cache of site root paths is now cleared when a site is deleted - + * Search indexing now prevents pages from being indexed multiple times, as both the base Page model and the specific subclass Upgrade considerations ======================