diff --git a/src/tests/test_watson/tests.py b/src/tests/test_watson/tests.py index 6f85527..68f9f1d 100644 --- a/src/tests/test_watson/tests.py +++ b/src/tests/test_watson/tests.py @@ -1,3 +1,4 @@ +# coding=utf-8 """ Tests for django-watson. @@ -284,6 +285,14 @@ class SearchTest(SearchTestBase): self.assertEqual(watson.search("FOOO INSTANCE11").count(), 0) self.assertEqual(watson.search("MODEL2 INSTANCE11").count(), 0) + def testSearchWithAccent(self): + WatsonTestModel1.objects.create( + title = "title model1 instance12", + content = "content model1 instance13 café", + description = "description model1 instance13", + ) + self.assertEqual(watson.search("café").count(), 1) + def testSearchWithApostrophe(self): WatsonTestModel1.objects.create( title = "title model1 instance12", @@ -298,7 +307,7 @@ class SearchTest(SearchTestBase): content = "'content model1 instance13", description = "description model1 instance13", ) - self.assertEqual(watson.search("'content").count(), 1) + self.assertTrue(watson.search("'content").exists()) # Some database engines ignore leading apostrophes, some count them. @skipUnless(get_backend().supports_prefix_matching, "Search backend does not support prefix matching.") def testMultiTablePrefixSearch(self): diff --git a/src/watson/backends.py b/src/watson/backends.py index cd31c24..cfb95b5 100644 --- a/src/watson/backends.py +++ b/src/watson/backends.py @@ -18,15 +18,17 @@ def regex_from_word(word): return "(\s{word})|(^{word})".format( word = re.escape(word), ) - - -def make_escaper(badchars): - """Creates an efficient escape function that strips the given characters from the string.""" - translation_table = dict((ord(c), None) for c in badchars) - translation_table[ord("'")] = "''" - def escaper(text): - return force_text(text, errors="ignore").translate(translation_table) - return escaper + + +RE_SPACE = re.compile(r"[\s]+", re.UNICODE) +RE_NON_WORD = re.compile(r"[^ \w\-']", re.UNICODE) + + +def escape_query(text): + text = force_text(text) + text = RE_SPACE.sub(" ", text) # Standardize spacing. + text = RE_NON_WORD.sub("", text) # Remove non-word characters. + return text class SearchBackend(six.with_metaclass(abc.ABCMeta)): @@ -154,9 +156,6 @@ class RegexSearchBackend(RegexSearchMixin, SearchBackend): """A search backend that works with SQLite3.""" -escape_postgres_query_chars = make_escaper("():|!&*") - - class PostgresSearchBackend(SearchBackend): """A search backend that uses native PostgreSQL full text indices.""" @@ -167,9 +166,9 @@ class PostgresSearchBackend(SearchBackend): def escape_postgres_query(self, text): """Escapes the given text to become a valid ts_query.""" return " & ".join( - "{0}:*".format(word) + "$${0}$$:*".format(word) for word - in escape_postgres_query_chars(text).split() + in escape_query(text).split() ) def is_installed(self): @@ -310,7 +309,11 @@ class PostgresLegacySearchBackend(PostgresSearchBackend): def escape_postgres_query(self, text): """Escapes the given text to become a valid ts_query.""" - return " & ".join(escape_postgres_query_chars(text).split()) + return " & ".join( + "$${0}$$".format(word) + for word + in escape_query(text).split() + ) class PostgresPrefixLegacySearchBackend(RegexSearchMixin, PostgresLegacySearchBackend): @@ -322,16 +325,14 @@ class PostgresPrefixLegacySearchBackend(RegexSearchMixin, PostgresLegacySearchBa Use if your postgres vesion is less than 8.3, and you absolutely can't live without prefix matching. Beware, this backend can get slow with large datasets! """ - -escape_mysql_boolean_query_chars = make_escaper("+-<>()*\".!:,;") def escape_mysql_boolean_query(search_text): return " ".join( '+{word}*'.format( word = word, ) - for word in escape_mysql_boolean_query_chars(search_text).split() + for word in escape_query(search_text).split() )