Fix text substring search for non-deterministic collations.

author Tom Lane <tgl@sss.pgh.pa.us>
Sat, 6 Dec 2025 01:10:33 +0000 (20:10 -0500)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sat, 6 Dec 2025 01:10:33 +0000 (20:10 -0500)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c

index 3894457ab404f7dc16738f675cb4ddc152f3fd9c..f202b8df4e2b4bc17bd3308312103a87091b182e 100644 (file)
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1111,6 +1111,7 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
     const char *hptr;
  
     Assert(start_ptr >= haystack && start_ptr <= haystack_end);
+   Assert(needle_len > 0);
  
     state->last_match_len_tmp = needle_len;
  
@@ -1123,19 +1124,26 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
          * needle under the given collation.
          *
          * Note, the found substring could have a different length than the
-        * needle, including being empty.  Callers that want to skip over the
-        * found string need to read the length of the found substring from
-        * last_match_len rather than just using the length of their needle.
+        * needle.  Callers that want to skip over the found string need to
+        * read the length of the found substring from last_match_len rather
+        * than just using the length of their needle.
          *
          * Most callers will require "greedy" semantics, meaning that we need
          * to find the longest such substring, not the shortest.  For callers
          * that don't need greedy semantics, we can finish on the first match.
+        *
+        * This loop depends on the assumption that the needle is nonempty and
+        * any matching substring must also be nonempty.  (Even if the
+        * collation would accept an empty match, returning one would send
+        * callers that search for successive matches into an infinite loop.)
          */
         const char *result_hptr = NULL;
  
         hptr = start_ptr;
         while (hptr < haystack_end)
         {
+           const char *test_end;
+
             /*
              * First check the common case that there is a match in the
              * haystack of exactly the length of the needle.
@@ -1146,11 +1154,13 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
                 return (char *) hptr;
  
             /*
-            * Else check if any of the possible substrings starting at hptr
-            * are equal to the needle.
+            * Else check if any of the non-empty substrings starting at hptr
+            * compare equal to the needle.
              */
-           for (const char *test_end = hptr; test_end < haystack_end; test_end += pg_mblen(test_end))
+           test_end = hptr;
+           do
             {
+               test_end += pg_mblen(test_end);
                 if (pg_strncoll(hptr, (test_end - hptr), needle, needle_len, state->locale) == 0)
                 {
                     state->last_match_len_tmp = (test_end - hptr);
@@ -1158,7 +1168,8 @@ text_position_next_internal(char *start_ptr, TextPositionState *state)
                     if (!state->greedy)
                         break;
                 }
-           }
+           } while (test_end < haystack_end);
+
             if (result_hptr)
                 break;
  
diff --git a/src/test/regress/expected/collate.icu.utf8.out b/src/test/regress/expected/collate.icu.utf8.out

index b8579a1efc63d07211e7cd290db2dbb1bc40f910..8023014fe63764e63ea3b4745cfd9d1eb885081a 100644 (file)
--- a/src/test/regress/expected/collate.icu.utf8.out
+++ b/src/test/regress/expected/collate.icu.utf8.out
@@ -1484,6 +1484,13 @@ SELECT array_sort('{a,B}'::text[] COLLATE "C");
   {B,a}
  (1 row)
  
+-- test replace() at the end of the string (bug #19341)
+SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'er');
+ replace 
+---------
+ tester
+(1 row)
+
  -- test language tags
  CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
  SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
diff --git a/src/test/regress/sql/collate.icu.utf8.sql b/src/test/regress/sql/collate.icu.utf8.sql

index 6f5abac0dc0407002a3b02257a3c478494e07ce7..b6c54503d219a28750305c4914420f1b24993f25 100644 (file)
--- a/src/test/regress/sql/collate.icu.utf8.sql
+++ b/src/test/regress/sql/collate.icu.utf8.sql
@@ -568,6 +568,9 @@ SELECT 'abc' <= 'ABC' COLLATE case_insensitive, 'abc' >= 'ABC' COLLATE case_inse
  SELECT array_sort('{a,B}'::text[] COLLATE case_insensitive);
  SELECT array_sort('{a,B}'::text[] COLLATE "C");
  
+-- test replace() at the end of the string (bug #19341)
+SELECT replace('testX' COLLATE case_insensitive, 'x' COLLATE case_insensitive, 'er');
+
  -- test language tags
  CREATE COLLATION lt_insensitive (provider = icu, locale = 'en-u-ks-level1', deterministic = false);
  SELECT 'aBcD' COLLATE lt_insensitive = 'AbCd' COLLATE lt_insensitive;
author	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 6 Dec 2025 01:10:33 +0000 (20:10 -0500)
committer	Tom Lane <tgl@sss.pgh.pa.us>
	Sat, 6 Dec 2025 01:10:33 +0000 (20:10 -0500)
src/backend/utils/adt/varlena.c		patch | blob | blame | history
src/test/regress/expected/collate.icu.utf8.out		patch | blob | blame | history
src/test/regress/sql/collate.icu.utf8.sql		patch | blob | blame | history