Added splitting up of email addresses when preprocessing content.

This commit is contained in:
Dave Hall 2011-08-24 11:22:51 +01:00
parent b20ad0c56f
commit ffda21b33e
3 changed files with 33 additions and 6 deletions

View file

@ -1,5 +1,7 @@
"""Search backends used by django-watson."""
import re
from django.conf import settings
from django.core.exceptions import ImproperlyConfigured
from django.contrib.contenttypes.models import ContentType
@ -14,7 +16,7 @@ def regex_from_search_text(search_text):
words = search_text.split()
return u"|".join(
u"(\s{word}\s)|(^{word}\s)|(\s{word}$)|(^{word}$)".format(
word = word,
word = re.escape(word),
)
for word in words
)

View file

@ -1,6 +1,6 @@
"""Adapters for registering models with django-watson."""
import operator
import operator, re
from threading import local
from contextlib import contextmanager
from functools import wraps
@ -24,6 +24,10 @@ class SearchAdapterError(Exception):
"""Something went wrong with a search adapter."""
# Used for splitting up email addresses into their local part and domain.
# Group 1 captures the text before the "@" and group 2 the text after it.
# Hyphens and underscores are accepted so that an address such as
# "first-last@my-site.com" is matched whole rather than from the middle.
# A "." needs no backslash inside a character class; leaving it out also
# avoids an invalid "\." escape sequence in this non-raw string literal.
RE_EMAIL = re.compile(u"([a-z0-9][a-z0-9._+-]*)@([a-z0-9.+-]*[a-z])", re.IGNORECASE)
class SearchAdapter(object):
"""An adapter for performing a full-text search on a model."""
@ -66,6 +70,20 @@ class SearchAdapter(object):
# Resolution complete!
return value
def prepare_content(self, content):
    """
    Cleans up a content string before it is parsed by the search engine.

    HTML tags are removed, and every email address is followed by the part
    before its "@" and the part after it, separated by spaces.
    """
    def expand_email(match):
        # Whole address first, then the text before the "@", then the text after it.
        return u" ".join(match.group(0, 1, 2))

    # Drop the HTML tags first, then expand each email address in what is left.
    text_only = strip_tags(content)
    return RE_EMAIL.sub(expand_email, text_only)
def get_title(self, obj):
    """Gives the search title for obj, which is its unicode representation."""
    title = unicode(obj)
    return title
@ -84,10 +102,10 @@ class SearchAdapter(object):
# Exclude named fields.
field_names = (field_name for field_name in field_names if field_name not in self.exclude)
# Create the text.
return u" ".join(
strip_tags(self._resolve_field(obj, field_name))
return self.prepare_content(u" ".join(
self._resolve_field(obj, field_name)
for field_name in field_names
)
))
def get_url(self, obj):
"""Return the URL of the given obj."""

View file

@ -168,7 +168,14 @@ class InternalsTest(SearchTestBase):
call_command("buildwatson")
# Make sure that we have four again (including duplicates).
self.assertEqual(SearchEntry.objects.count(), 4)
def testSearchEmailParts(self):
"""Checks that content holding an email address is found by its local part, its domain, and the whole address."""
with watson.context():
# Store a bare email address as the object's content and save it.
self.test11.content = "foo@bar.com"
self.test11.save()
# Searching for the part before the "@" should give exactly one result.
self.assertEqual(watson.search("foo").count(), 1)
# So should searching for the part after the "@".
self.assertEqual(watson.search("bar.com").count(), 1)
# And so should searching for the complete address.
self.assertEqual(watson.search("foo@bar.com").count(), 1)
class SearchTest(SearchTestBase):