Fix usage of char2wchar/wchar2char. Changes:
author Teodor Sigaev <teodor@sigaev.ru>
Mon, 2 Mar 2009 15:11:25 +0000 (15:11 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Mon, 2 Mar 2009 15:11:25 +0000 (15:11 +0000)
- pg_wchar and wchar_t can have different sizes, so char2wchar
  no longer calls pg_mb2wchar_with_len; this prevents an
  out-of-bounds memory access bug
- make char2wchar/wchar2char symmetric; they should no longer be
  called with the C locale, because mbstowcs/wcstombs often do not
  work correctly with the C locale.
- The text parser calls pg_mb2wchar_with_len directly in the case of
  the C locale with a multibyte encoding

Per bug report by Hiroshi Inoue <inoue@tpf.co.jp> and
following discussion.

Backpatch to 8.2, when multibyte support was implemented in tsearch.

src/backend/tsearch/ts_locale.c
src/backend/tsearch/wparser_def.c

index 73d8ed51de7452a68e9988a18085faf56929ceba..71f99febdbc76a4a970633358ea3b3b0c6d3757a 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.7.2.1 2008/06/18 20:55:49 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/tsearch/ts_locale.c,v 1.7.2.2 2009/03/02 15:11:25 teodor Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,7 @@ wchar2char(char *to, const wchar_t *from, size_t tolen)
    }
 #endif   /* WIN32 */
 
+   Assert( !lc_ctype_is_c() );
    return wcstombs(to, from, tolen);
 }
 
@@ -99,17 +100,8 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
 
        return r;
    }
-#endif   /* WIN32 */
-
-   if (lc_ctype_is_c())
-   {
-       /*
-        * pg_mb2wchar_with_len always adds trailing '\0', so 'to' should be
-        * allocated with sufficient space
-        */
-       return pg_mb2wchar_with_len(from, (pg_wchar *) to, fromlen);
-   }
    else
+#endif   /* WIN32 */
    {
        /*
         * mbstowcs requires ending '\0'
@@ -117,6 +109,7 @@ char2wchar(wchar_t *to, size_t tolen, const char *from, size_t fromlen)
        char       *str = pnstrdup(from, fromlen);
        size_t      result;
 
+       Assert( !lc_ctype_is_c() );
        result = mbstowcs(to, str, tolen);
 
        pfree(str);
index 0ab06ad0f15bf57986381539ef122e40f6474e52..61e367b650cebac902ffb7b560372f396cd690d8 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.14.2.3 2009/01/15 17:06:03 teodor Exp $
+ *   $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.14.2.4 2009/03/02 15:11:25 teodor Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -240,12 +240,12 @@ typedef struct TParser
    int         lenstr;         /* length of mbstring */
 #ifdef TS_USE_WIDE
    wchar_t    *wstr;           /* wide character string */
-   int         lenwstr;        /* length of wsting */
+   pg_wchar   *pgwstr;         /* wide character string for C-locale */
+   bool        usewide;
 #endif
 
    /* State of parse */
    int         charmaxlen;
-   bool        usewide;
    TParserPosition *state;
    bool        ignore;
    bool        wanthost;
@@ -299,13 +299,24 @@ TParserInit(char *str, int len)
    if (prs->charmaxlen > 1)
    {
        prs->usewide = true;
-       prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
-       prs->lenwstr = char2wchar(prs->wstr, prs->lenstr + 1,
-                                 prs->str, prs->lenstr);
+       if ( lc_ctype_is_c() )
+       {
+           /*
+            * char2wchar doesn't work for C-locale and
+            * sizeof(pg_wchar) could be not equal to sizeof(wchar_t)
+            */
+           prs->pgwstr = (pg_wchar*) palloc(sizeof(pg_wchar) * (prs->lenstr + 1));
+           pg_mb2wchar_with_len(prs->str, prs->pgwstr, prs->lenstr);
+       }
+       else
+       {
+           prs->wstr = (wchar_t *) palloc(sizeof(wchar_t) * (prs->lenstr + 1));
+           char2wchar(prs->wstr, prs->lenstr + 1, prs->str, prs->lenstr);
+       }
    }
    else
-#endif
        prs->usewide = false;
+#endif
 
    prs->state = newTParserPosition(NULL);
    prs->state->state = TPS_Base;
@@ -331,6 +342,8 @@ TParserClose(TParser *prs)
 #ifdef TS_USE_WIDE
    if (prs->wstr)
        pfree(prs->wstr);
+   if (prs->pgwstr)
+       pfree(prs->pgwstr);
 #endif
 
    pfree(prs);
@@ -338,10 +351,12 @@ TParserClose(TParser *prs)
 
 /*
  * Character-type support functions, equivalent to is* macros, but
- * working with any possible encodings and locales. Note,
- * that with multibyte encoding and C-locale isw* function may fail
- * or give wrong result. Note 2: multibyte encoding and C-locale
- * often are used for Asian languages
+ * working with any possible encodings and locales. Notes:
+ *  - with multibyte encoding and C-locale isw* function may fail
+ *    or give wrong result. 
+ *  - multibyte encoding and C-locale often are used for 
+ *    Asian languages.
+ *  - if locale is C then we use pgwstr instead of wstr
  */
 
 #ifdef TS_USE_WIDE
@@ -352,14 +367,14 @@ p_is##type(TParser *prs) {                                                    \
    Assert( prs->state );                                                   \
    if ( prs->usewide )                                                     \
    {                                                                       \
-       if ( lc_ctype_is_c() )                                              \
-           return is##type( 0xff & *( prs->wstr + prs->state->poschar) );  \
+       if ( prs->pgwstr )                                                  \
+           return is##type( 0xff & *( prs->pgwstr + prs->state->poschar) );\
                                                                            \
        return isw##type( *(wint_t*)( prs->wstr + prs->state->poschar ) );  \
    }                                                                       \
                                                                            \
    return is##type( *(unsigned char*)( prs->str + prs->state->posbyte ) ); \
-}  \
+}                                                                          \
                                                                            \
 static int                                                                 \
 p_isnot##type(TParser *prs) {                                              \
@@ -373,9 +388,9 @@ p_isalnum(TParser *prs)
 
    if (prs->usewide)
    {
-       if (lc_ctype_is_c())
+       if (prs->pgwstr)
        {
-           unsigned int c = *(prs->wstr + prs->state->poschar);
+           unsigned int c = *(prs->pgwstr + prs->state->poschar);
 
            /*
             * any non-ascii symbol with multibyte encoding with C-locale is
@@ -405,9 +420,9 @@ p_isalpha(TParser *prs)
 
    if (prs->usewide)
    {
-       if (lc_ctype_is_c())
+       if (prs->pgwstr)
        {
-           unsigned int c = *(prs->wstr + prs->state->poschar);
+           unsigned int c = *(prs->pgwstr + prs->state->poschar);
 
            /*
             * any non-ascii symbol with multibyte encoding with C-locale is