Improving postgres query escaping

2026-05-09 06:04:42 +00:00 · 2015-06-01 11:01:50 +01:00 · 2015-06-01 11:01:50 +01:00 · 30b59a00a9
commit 30b59a00a9
parent 4eeb9c73d5
2 changed files with 29 additions and 19 deletions
--- a/src/tests/test_watson/tests.py
+++ b/src/tests/test_watson/tests.py
@ -1,3 +1,4 @@
+# coding=utf-8
 """
 Tests for django-watson.

@ -284,6 +285,14 @@ class SearchTest(SearchTestBase):
        self.assertEqual(watson.search("FOOO INSTANCE11").count(), 0)
        self.assertEqual(watson.search("MODEL2 INSTANCE11").count(), 0)

+    def testSearchWithAccent(self):  # A query containing a non-ASCII character should still find the matching record.
+        WatsonTestModel1.objects.create(
+            title = "title model1 instance12",
+            content = "content model1 instance13 café",  # Only this field of the new record contains the accented word.
+            description = "description model1 instance13",
+        )
+        self.assertEqual(watson.search("café").count(), 1)  # Exactly one result is expected for the accented query.
+
    def testSearchWithApostrophe(self):
        WatsonTestModel1.objects.create(
            title = "title model1 instance12",
@ -298,7 +307,7 @@ class SearchTest(SearchTestBase):
            content = "'content model1 instance13",
            description = "description model1 instance13",
        )
-        self.assertEqual(watson.search("'content").count(), 1)
+        self.assertTrue(watson.search("'content").exists())  # Some database engines ignore leading apostrophes, some count them.
        
    @skipUnless(get_backend().supports_prefix_matching, "Search backend does not support prefix matching.")
    def testMultiTablePrefixSearch(self):
--- a/src/watson/backends.py
+++ b/src/watson/backends.py
@ -18,15 +18,17 @@ def regex_from_word(word):
    return "(\s{word})|(^{word})".format(
        word = re.escape(word),
    )
-    
-    
-def make_escaper(badchars):
-    """Creates an efficient escape function that strips the given characters from the string."""
-    translation_table = dict((ord(c), None) for c in badchars)
-    translation_table[ord("'")] = "''"
-    def escaper(text):
-        return force_text(text, errors="ignore").translate(translation_table)
-    return escaper
+
+
+RE_SPACE = re.compile(r"[\s]+", re.UNICODE)
+RE_NON_WORD = re.compile(r"[^ \w\-']", re.UNICODE)
+
+
+def escape_query(text):  # Normalise raw search text into space-separated terms made only of word characters, "-" and "'".
+    text = force_text(text)  # Coerce the input to text via force_text (imported outside this view).
+    text = RE_SPACE.sub(" ", text)  # Standardize spacing: collapse each run of whitespace into a single space.
+    text = RE_NON_WORD.sub("", text)  # Remove non-word characters: everything except spaces, word characters, "-" and "'".
+    return text  # Callers in this file split the result on whitespace to obtain the individual terms.


 class SearchBackend(six.with_metaclass(abc.ABCMeta)):
@ -154,9 +156,6 @@ class RegexSearchBackend(RegexSearchMixin, SearchBackend):
    """A search backend that works with SQLite3."""


-escape_postgres_query_chars = make_escaper("():|!&*")
-
-
 class PostgresSearchBackend(SearchBackend):

    """A search backend that uses native PostgreSQL full text indices."""
@ -167,9 +166,9 @@ class PostgresSearchBackend(SearchBackend):
    def escape_postgres_query(self, text):
        """Escapes the given text to become a valid ts_query."""
        return " & ".join(
-            "{0}:*".format(word)
+            "$${0}$$:*".format(word)
            for word
-            in escape_postgres_query_chars(text).split()
+            in escape_query(text).split()
        )
    
    def is_installed(self):
@ -310,7 +309,11 @@ class PostgresLegacySearchBackend(PostgresSearchBackend):
    
    def escape_postgres_query(self, text):
        """Escapes the given text to become a valid ts_query."""
-        return " & ".join(escape_postgres_query_chars(text).split())
+        return " & ".join(
+            "$${0}$$".format(word)
+            for word
+            in escape_query(text).split()
+        )


 class PostgresPrefixLegacySearchBackend(RegexSearchMixin, PostgresLegacySearchBackend):
@ -322,16 +325,14 @@ class PostgresPrefixLegacySearchBackend(RegexSearchMixin, PostgresLegacySearchBa
    Use if your postgres version is less than 8.3, and you absolutely can't live without
    prefix matching. Beware, this backend can get slow with large datasets! 
    """
-        

-escape_mysql_boolean_query_chars = make_escaper("+-<>()*\".!:,;")

 def escape_mysql_boolean_query(search_text):
    return " ".join(
        '+{word}*'.format(
            word = word,
        )
-        for word in escape_mysql_boolean_query_chars(search_text).split()
+        for word in escape_query(search_text).split()
    )