The regex mechanism scans through the first "max_chr" character values
to cache character property ranges (isalpha, etc.). For single-byte
encodings, there's no sense in scanning beyond UCHAR_MAX; but for
UTF-8 it makes sense to cache higher code point values (though not all
of them; only up to MAX_SIMPLE_CHR).
Prior to 5a38104b36, the logic about how many character values to scan
was based on the pg_regex_strategy, which was dependent on the
provider. Commit 5a38104b36 preserved that logic exactly, allowing
different providers to define the "max_chr".
Now, change it to depend only on the database encoding and whether
ctype_is_c is set. For this specific calculation, distinguishing
between providers creates more complexity than it's worth.
Discussion: https://postgr.es/m/450ceb6260cad30d7afdf155d991a9caafee7c0d.camel@j-davis.com
Reviewed-by: Chao Li <li.evan.chao@gmail.com>
max_chr = (pg_wchar) MAX_SIMPLE_CHR;
#endif
}
+ else if (GetDatabaseEncoding() == PG_UTF8)
+ {
+ max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+ }
else
{
- if (pg_regex_locale->ctype->max_chr != 0 &&
- pg_regex_locale->ctype->max_chr <= MAX_SIMPLE_CHR)
- {
- max_chr = pg_regex_locale->ctype->max_chr;
- pcc->cv.cclasscode = -1;
- }
- else
- max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+#if MAX_SIMPLE_CHR >= UCHAR_MAX
+ max_chr = (pg_wchar) UCHAR_MAX;
+ pcc->cv.cclasscode = -1;
+#else
+ max_chr = (pg_wchar) MAX_SIMPLE_CHR;
+#endif
}
/*
.char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_sb,
.wc_tolower = tolower_libc_sb,
- .max_chr = UCHAR_MAX,
};
/*
.char_tolower = char_tolower_libc,
.wc_toupper = toupper_libc_sb,
.wc_tolower = tolower_libc_sb,
- .max_chr = UCHAR_MAX,
};
static const struct ctype_methods ctype_methods_libc_utf8 = {
* pg_strlower().
*/
char (*char_tolower) (unsigned char ch, pg_locale_t locale);
-
- /*
- * For regex and pattern matching efficiency, the maximum char value
- * supported by the above methods. If zero, limit is set by regex code.
- */
- pg_wchar max_chr;
};
/*