Change the backend to reject strings containing invalidly-encoded multibyte
author Tom Lane <tgl@sss.pgh.pa.us>
Sun, 21 May 2006 20:05:50 +0000 (20:05 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Sun, 21 May 2006 20:05:50 +0000 (20:05 +0000)
characters in all cases.  Formerly we mostly just threw warnings for invalid
input, and failed to detect it at all if no encoding conversion was required.
The tighter check is needed to defend against SQL-injection attacks as per
CVE-2006-2313 (further details will be published after release).  Embedded
zero (null) bytes will be rejected as well.  The checks are applied during
input to the backend (receipt from client or COPY IN), so it no longer seems
necessary to check in textin() and related routines; any string arriving at
those functions will already have been validated.  Conversion failure
reporting (for characters with no equivalent in the destination encoding)
has been cleaned up and made consistent while at it.

Also, fix a few longstanding errors in little-used encoding conversion
routines: win1251_to_iso, win866_to_iso, euc_tw_to_big5, euc_tw_to_mic,
mic_to_euc_tw were all broken to varying extents.

Patches by Tatsuo Ishii and Tom Lane.  Thanks to Akio Ishida and Yasuo Ohgaki
for identifying the security issues.

35 files changed:
src/backend/commands/copy.c
src/backend/utils/adt/name.c
src/backend/utils/adt/varchar.c
src/backend/utils/adt/varlena.c
src/backend/utils/mb/conv.c
src/backend/utils/mb/conversion_procs/cyrillic_and_mic/cyrillic_and_mic.c
src/backend/utils/mb/conversion_procs/euc_cn_and_mic/euc_cn_and_mic.c
src/backend/utils/mb/conversion_procs/euc_jp_and_sjis/euc_jp_and_sjis.c
src/backend/utils/mb/conversion_procs/euc_kr_and_mic/euc_kr_and_mic.c
src/backend/utils/mb/conversion_procs/euc_tw_and_big5/euc_tw_and_big5.c
src/backend/utils/mb/conversion_procs/latin2_and_win1250/latin2_and_win1250.c
src/backend/utils/mb/conversion_procs/latin_and_mic/latin_and_mic.c
src/backend/utils/mb/conversion_procs/utf8_and_ascii/utf8_and_ascii.c
src/backend/utils/mb/conversion_procs/utf8_and_big5/utf8_and_big5.c
src/backend/utils/mb/conversion_procs/utf8_and_cyrillic/utf8_and_cyrillic.c
src/backend/utils/mb/conversion_procs/utf8_and_euc_cn/utf8_and_euc_cn.c
src/backend/utils/mb/conversion_procs/utf8_and_euc_jp/utf8_and_euc_jp.c
src/backend/utils/mb/conversion_procs/utf8_and_euc_kr/utf8_and_euc_kr.c
src/backend/utils/mb/conversion_procs/utf8_and_euc_tw/utf8_and_euc_tw.c
src/backend/utils/mb/conversion_procs/utf8_and_gb18030/utf8_and_gb18030.c
src/backend/utils/mb/conversion_procs/utf8_and_gbk/utf8_and_gbk.c
src/backend/utils/mb/conversion_procs/utf8_and_iso8859/utf8_and_iso8859.c
src/backend/utils/mb/conversion_procs/utf8_and_iso8859_1/utf8_and_iso8859_1.c
src/backend/utils/mb/conversion_procs/utf8_and_johab/utf8_and_johab.c
src/backend/utils/mb/conversion_procs/utf8_and_sjis/utf8_and_sjis.c
src/backend/utils/mb/conversion_procs/utf8_and_uhc/utf8_and_uhc.c
src/backend/utils/mb/conversion_procs/utf8_and_win1250/utf8_and_win1250.c
src/backend/utils/mb/conversion_procs/utf8_and_win1252/utf8_and_win1252.c
src/backend/utils/mb/conversion_procs/utf8_and_win1256/utf8_and_win1256.c
src/backend/utils/mb/conversion_procs/utf8_and_win1258/utf8_and_win1258.c [new file with mode: 0644]
src/backend/utils/mb/conversion_procs/utf8_and_win874/utf8_and_win874.c
src/backend/utils/mb/mbutils.c
src/backend/utils/mb/wchar.c
src/include/c.h
src/include/mb/pg_wchar.h

index 0748f27a51d6d37be56cb25ad6a986792ff20913..dd130b13078ff76434f9e84b2e8dd0f8c1cddeb2 100644 (file)
@@ -937,9 +937,15 @@ DoCopy(const CopyStmt *stmt)
        cstate->raw_buf = (char *) palloc(RAW_BUF_SIZE + 1);
        cstate->raw_buf_index = cstate->raw_buf_len = 0;
 
-       /* Set up encoding conversion info */
+       /*
+        * Set up encoding conversion info.  Even if the client and server
+        * encodings are the same, we must apply pg_client_to_server() to
+        * validate data in multibyte encodings.
+        */
        cstate->client_encoding = pg_get_client_encoding();
-       cstate->need_transcoding = (cstate->client_encoding != GetDatabaseEncoding());
+       cstate->need_transcoding =
+               (cstate->client_encoding != GetDatabaseEncoding() ||
+                pg_database_encoding_max_length() > 1);
        cstate->client_only_encoding = PG_ENCODING_IS_CLIENT_ONLY(cstate->client_encoding);
 
        cstate->copy_dest = COPY_FILE;          /* default */
index b4934f5eef42d28318857e74ea6e1b5b0bd8272b..14da93d212854bad1d6bd4acd6582fc0b1238604 100644 (file)
@@ -49,10 +49,7 @@ namein(PG_FUNCTION_ARGS)
        NameData   *result;
        int                     len;
 
-       /* verify encoding */
        len = strlen(s);
-       pg_verifymbstr(s, len, false);
-
        len = pg_mbcliplen(s, len, NAMEDATALEN - 1);
 
        result = (NameData *) palloc0(NAMEDATALEN);
index 224156b25168cafe928c85370c4a8c6a6713a1ed..29a992ce1dbd7dc3b2fb9ad30fcca9bcb9eeb9ca 100644 (file)
@@ -73,9 +73,6 @@ bpchar_input(const char *s, size_t len, int32 atttypmod)
        char       *r;
        size_t          maxlen;
 
-       /* verify encoding */
-       pg_verifymbstr(s, len, false);
-
        /* If typmod is -1 (or invalid), use the actual string length */
        if (atttypmod < (int32) VARHDRSZ)
                maxlen = len;
@@ -393,9 +390,6 @@ varchar_input(const char *s, size_t len, int32 atttypmod)
        VarChar    *result;
        size_t          maxlen;
 
-       /* verify encoding */
-       pg_verifymbstr(s, len, false);
-
        maxlen = atttypmod - VARHDRSZ;
 
        if (atttypmod >= (int32) VARHDRSZ && len > maxlen)
index 41c3a6356a8e2ad922ac2f79ecccf8e922b4777d..4523e883701a09f8049962598f060b074d640ed1 100644 (file)
@@ -256,10 +256,7 @@ textin(PG_FUNCTION_ARGS)
        text       *result;
        int                     len;
 
-       /* verify encoding */
        len = strlen(inputText);
-       pg_verifymbstr(inputText, len, false);
-
        result = (text *) palloc(len + VARHDRSZ);
        VARATT_SIZEP(result) = len + VARHDRSZ;
 
@@ -299,9 +296,6 @@ textrecv(PG_FUNCTION_ARGS)
 
        str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
 
-       /* verify encoding */
-       pg_verifymbstr(str, nbytes, false);
-
        result = (text *) palloc(nbytes + VARHDRSZ);
        VARATT_SIZEP(result) = nbytes + VARHDRSZ;
        memcpy(VARDATA(result), str, nbytes);
index ff871780f185d201df864a271088a39d1963c2e2..96c60ffd4d209c01e83cf3bdfc3d54bf8e358055 100644 (file)
 #include "postgres.h"
 #include "mb/pg_wchar.h"
 
-/*
- * convert bogus chars that cannot be represented in the current
- * encoding system.
- */
-void
-pg_print_bogus_char(unsigned char **mic, unsigned char **p)
-{
-       char            strbuf[16];
-       int                     l = pg_mic_mblen(*mic);
-
-       *(*p)++ = '(';
-       while (l--)
-       {
-               sprintf(strbuf, "%02x", *(*mic)++);
-               *(*p)++ = strbuf[0];
-               *(*p)++ = strbuf[1];
-       }
-       *(*p)++ = ')';
-}
-
-#ifdef NOT_USED
 
 /*
- * GB18030 ---> MIC
- * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
- */
-static void
-gb180302mic(unsigned char *gb18030, unsigned char *p, int len)
-{
-       int                     c1;
-       int                     c2;
-
-       while (len > 0 && (c1 = *gb18030++))
-       {
-               if (c1 < 0x80)
-               {                                               /* should be ASCII */
-                       len--;
-                       *p++ = c1;
-               }
-               else if (c1 >= 0x81 && c1 <= 0xfe)
-               {
-                       c2 = *gb18030++;
-
-                       if (c2 >= 0x30 && c2 <= 0x69)
-                       {
-                               len -= 4;
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *gb18030++;
-                               *p++ = *gb18030++;
-                               *p++ = *gb18030++;
-                       }
-                       else if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
-                       {
-                               len -= 2;
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *gb18030++;
-                       }
-                       else
-                       {                                       /* throw the strange code */
-                               len--;
-                       }
-               }
-       }
-       *p = '\0';
-}
-
-/*
- * MIC ---> GB18030
- * Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
- */
-static void
-mic2gb18030(unsigned char *mic, unsigned char *p, int len)
-{
-       int                     c1;
-       int                     c2;
-
-       while (len > 0 && (c1 = *mic))
-       {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 <= 0x7f)                 /* ASCII */
-                       *p++ = c1;
-               else if (c1 >= 0x81 && c1 <= 0xfe)
-               {
-                       c2 = *mic++;
-
-                       if ((c2 >= 0x40 && c2 <= 0x7e) || (c2 >= 0x80 && c2 <= 0xfe))
-                       {
-                               *p++ = c1;
-                               *p++ = c2;
-                       }
-                       else if (c2 >= 0x30 && c2 <= 0x39)
-                       {
-                               *p++ = c1;
-                               *p++ = c2;
-                               *p++ = *mic++;
-                               *p++ = *mic++;
-                       }
-                       else
-                       {
-                               mic--;
-                               pg_print_bogus_char(&mic, &p);
-                               mic--;
-                               pg_print_bogus_char(&mic, &p);
-                       }
-               }
-               else
-               {
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
-               }
-       }
-       *p = '\0';
-}
-#endif
-
-/*
- * LATINn ---> MIC
+ * LATINn ---> MIC when the charset's local codes map directly to MIC
+ *
+ * l points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
  */
 void
-latin2mic(unsigned char *l, unsigned char *p, int len, int lc)
+latin2mic(const unsigned char *l, unsigned char *p, int len,
+                 int lc, int encoding)
 {
        int                     c1;
 
-       while (len-- > 0 && (c1 = *l++))
+       while (len > 0)
        {
-               if (c1 > 0x7f)
-               {                                               /* Latin? */
+               c1 = *l;
+               if (c1 == 0)
+                       report_invalid_encoding(encoding, (const char *) l, len);
+               if (IS_HIGHBIT_SET(c1))
                        *p++ = lc;
-               }
                *p++ = c1;
+               l++;
+               len--;
        }
        *p = '\0';
 }
 
 /*
- * MIC ---> LATINn
+ * MIC ---> LATINn when the charset's local codes map directly to MIC
+ *
+ * mic points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
  */
 void
-mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
+mic2latin(const unsigned char *mic, unsigned char *p, int len,
+                 int lc, int encoding)
 {
        int                     c1;
 
-       while (len > 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == lc)
-                       *p++ = *mic++;
-               else if (c1 > 0x7f)
+               c1 = *mic;
+               if (c1 == 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (!IS_HIGHBIT_SET(c1))
                {
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       /* easy for ASCII */
+                       *p++ = c1;
+                       mic++;
+                       len--;
                }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
+               {
+                       int             l = pg_mic_mblen(mic);
+
+                       if (len < l)
+                               report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+                                                                               len);
+                       if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]))
+                               report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+                                                                                  (const char *) mic, len);
+                       *p++ = mic[1];
+                       mic += 2;
+                       len -= 2;
                }
        }
        *p = '\0';
@@ -180,14 +89,25 @@ mic2latin(unsigned char *mic, unsigned char *p, int len, int lc)
 
 /*
  * ASCII ---> MIC
+ *
+ * While ordinarily SQL_ASCII encoding is forgiving of high-bit-set
+ * characters, here we must take a hard line because we don't know
+ * the appropriate MIC equivalent.
  */
 void
-pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
+pg_ascii2mic(const unsigned char *l, unsigned char *p, int len)
 {
        int                     c1;
 
-       while (len-- > 0 && (c1 = *l++))
-               *p++ = (c1 & 0x7f);
+       while (len > 0)
+       {
+               c1 = *l;
+               if (c1 == 0 || IS_HIGHBIT_SET(c1))
+                       report_invalid_encoding(PG_SQL_ASCII, (const char *) l, len);
+               *p++ = c1;
+               l++;
+               len--;
+       }
        *p = '\0';
 }
 
@@ -195,19 +115,19 @@ pg_ascii2mic(unsigned char *l, unsigned char *p, int len)
  * MIC ---> ASCII
  */
 void
-pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
+pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
 
-       while (len-- > 0 && (c1 = *mic))
+       while (len > 0)
        {
-               if (c1 > 0x7f)
-                       pg_print_bogus_char(&mic, &p);
-               else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-                       mic++;
-               }
+               c1 = *mic;
+               if (c1 == 0 || IS_HIGHBIT_SET(c1))
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_SQL_ASCII,
+                                                                          (const char *) mic, len);
+               *p++ = c1;
+               mic++;
+               len--;
        }
        *p = '\0';
 }
@@ -215,86 +135,103 @@ pg_mic2ascii(unsigned char *mic, unsigned char *p, int len)
 /*
  * latin2mic_with_table: a generic single byte charset encoding
  * conversion from a local charset to the mule internal code.
- * with a encoding conversion table.
- * the table is ordered according to the local charset,
+ *
+ * l points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ * tab holds conversion entries for the local charset
  * starting from 128 (0x80). each entry in the table
  * holds the corresponding code point for the mule internal code.
  */
 void
-latin2mic_with_table(
-                                        unsigned char *l,      /* local charset string (source) */
-                                        unsigned char *p,      /* pointer to store mule internal code
-                                                                                * (destination) */
-                                        int len,       /* length of l */
-                                        int lc,        /* leading character of p */
-                                        unsigned char *tab /* code conversion table */
-)
+latin2mic_with_table(const unsigned char *l,
+                                        unsigned char *p,
+                                        int len,
+                                        int lc,
+                                        int encoding,
+                                        const unsigned char *tab)
 {
        unsigned char c1,
                                c2;
 
-       while (len-- > 0 && (c1 = *l++))
+       while (len > 0)
        {
-               if (c1 < 128)
+               c1 = *l;
+               if (c1 == 0)
+                       report_invalid_encoding(encoding, (const char *) l, len);
+               if (!IS_HIGHBIT_SET(c1))
                        *p++ = c1;
                else
                {
-                       c2 = tab[c1 - 128];
+                       c2 = tab[c1 - HIGHBIT];
                        if (c2)
                        {
                                *p++ = lc;
                                *p++ = c2;
                        }
                        else
-                       {
-                               *p++ = ' ';             /* cannot convert */
-                       }
+                               report_untranslatable_char(encoding, PG_MULE_INTERNAL,
+                                                                                  (const char *) l, len);
                }
+               l++;
+               len--;
        }
        *p = '\0';
 }
 
 /*
  * mic2latin_with_table: a generic single byte charset encoding
- * conversion from the mule internal code to a local charset
- * with a encoding conversion table.
- * the table is ordered according to the second byte of the mule
- * internal code starting from 128 (0x80).
- * each entry in the table
- * holds the corresponding code point for the local code.
+ * conversion from the mule internal code to a local charset.
+ *
+ * mic points to the source string of length len
+ * p is the output area (must be large enough!)
+ * lc is the mule character set id for the local encoding
+ * encoding is the PG identifier for the local encoding
+ * tab holds conversion entries for the mule internal code's
+ * second byte, starting from 128 (0x80). each entry in the table
+ * holds the corresponding code point for the local charset.
  */
 void
-mic2latin_with_table(
-                                        unsigned char *mic,            /* mule internal code (source) */
-                                        unsigned char *p,      /* local code (destination) */
-                                        int len,       /* length of p */
-                                        int lc,        /* leading character */
-                                        unsigned char *tab /* code conversion table */
-)
+mic2latin_with_table(const unsigned char *mic,
+                                        unsigned char *p,
+                                        int len,
+                                        int lc,
+                                        int encoding,
+                                        const unsigned char *tab)
 {
-
        unsigned char c1,
                                c2;
 
-       while (len-- > 0 && (c1 = *mic++))
+       while (len > 0)
        {
-               if (c1 < 128)
-                       *p++ = c1;
-               else if (c1 == lc)
+               c1 = *mic;
+               if (c1 == 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (!IS_HIGHBIT_SET(c1))
                {
-                       c1 = *mic++;
+                       /* easy for ASCII */
+                       *p++ = c1;
+                       mic++;
                        len--;
-                       c2 = tab[c1 - 128];
-                       if (c2)
-                               *p++ = c2;
-                       else
-                       {
-                               *p++ = ' ';             /* cannot convert */
-                       }
                }
                else
                {
-                       *p++ = ' ';                     /* bogus character */
+                       int             l = pg_mic_mblen(mic);
+
+                       if (len < l)
+                               report_invalid_encoding(PG_MULE_INTERNAL, (const char *) mic,
+                                                                               len);
+                       if (l != 2 || c1 != lc || !IS_HIGHBIT_SET(mic[1]) ||
+                               (c2 = tab[mic[1] - HIGHBIT]) == 0)
+                       {
+                               report_untranslatable_char(PG_MULE_INTERNAL, encoding,
+                                                                                  (const char *) mic, len);
+                               break;                  /* keep compiler quiet */
+                       }
+                       *p++ = c2;
+                       mic += 2;
+                       len -= 2;
                }
        }
        *p = '\0';
@@ -333,25 +270,38 @@ compare2(const void *p1, const void *p2)
 /*
  * UTF8 ---> local code
  *
- * utf: input UTF8 string. Its length is limited by "len" parameter
- *             or a null terminator.
- * iso: pointer to the output.
+ * utf: input UTF8 string (need not be null-terminated).
+ * iso: pointer to the output area (must be large enough!)
  * map: the conversion map.
  * size: the size of the conversion map.
+ * encoding: the PG identifier for the local encoding.
+ * len: length of input string.
  */
 void
-UtfToLocal(unsigned char *utf, unsigned char *iso,
-                  pg_utf_to_local *map, int size, int len)
+UtfToLocal(const unsigned char *utf, unsigned char *iso,
+                  const pg_utf_to_local *map, int size, int encoding, int len)
 {
        unsigned int iutf;
        int                     l;
        pg_utf_to_local *p;
 
-       for (; len > 0 && *utf; len -= l)
+       for (; len > 0; len -= l)
        {
+               /* "break" cases all represent errors */
+               if (*utf == '\0')
+                       break;
+
                l = pg_utf_mblen(utf);
+
+               if (len < l)
+                       break;
+
+               if (!pg_utf8_islegal(utf, l))
+                       break;
+
                if (l == 1)
                {
+                       /* ASCII case is easy */
                        *iso++ = *utf++;
                        continue;
                }
@@ -373,16 +323,14 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
                        iutf |= *utf++ << 8;
                        iutf |= *utf++;
                }
+
                p = bsearch(&iutf, map, size,
                                        sizeof(pg_utf_to_local), compare1);
+
                if (p == NULL)
-               {
-                       ereport(WARNING,
-                                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
-                                        errmsg("ignoring unconvertible UTF-8 character 0x%04x",
-                                                       iutf)));
-                       continue;
-               }
+                       report_untranslatable_char(PG_UTF8, encoding,
+                                                                          (const char *) (utf - l), len);
+
                if (p->code & 0xff000000)
                        *iso++ = p->code >> 24;
                if (p->code & 0x00ff0000)
@@ -392,15 +340,26 @@ UtfToLocal(unsigned char *utf, unsigned char *iso,
                if (p->code & 0x000000ff)
                        *iso++ = p->code & 0x000000ff;
        }
+
+       if (len > 0)
+               report_invalid_encoding(PG_UTF8, (const char *) utf, len);
+
        *iso = '\0';
 }
 
 /*
  * local code ---> UTF8
+ *
+ * iso: input local string (need not be null-terminated).
+ * utf: pointer to the output area (must be large enough!)
+ * map: the conversion map.
+ * size: the size of the conversion map.
+ * encoding: the PG identifier for the local encoding.
+ * len: length of input string.
  */
 void
-LocalToUtf(unsigned char *iso, unsigned char *utf,
-                  pg_local_to_utf *map, int size, int encoding, int len)
+LocalToUtf(const unsigned char *iso, unsigned char *utf,
+                  const pg_local_to_utf *map, int size, int encoding, int len)
 {
        unsigned int iiso;
        int                     l;
@@ -411,16 +370,23 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                                 errmsg("invalid encoding number: %d", encoding)));
 
-       for (; len > 0 && *iso; len -= l)
+       for (; len > 0; len -= l)
        {
-               if (*iso < 0x80)
+               /* "break" cases all represent errors */
+               if (*iso == '\0')
+                       break;
+
+               if (!IS_HIGHBIT_SET(*iso))
                {
+                       /* ASCII case is easy */
                        *utf++ = *iso++;
                        l = 1;
                        continue;
                }
 
-               l = pg_encoding_mblen(encoding, (char *) iso);
+               l = pg_encoding_verifymb(encoding, (const char *) iso, len);
+               if (l < 0)
+                       break;
 
                if (l == 1)
                        iiso = *iso++;
@@ -442,16 +408,13 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
                        iiso |= *iso++ << 8;
                        iiso |= *iso++;
                }
+
                p = bsearch(&iiso, map, size,
                                        sizeof(pg_local_to_utf), compare2);
                if (p == NULL)
-               {
-                       ereport(WARNING,
-                                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
-                                        errmsg("ignoring unconvertible %s character 0x%04x",
-                                                       (&pg_enc2name_tbl[encoding])->name, iiso)));
-                       continue;
-               }
+                       report_untranslatable_char(encoding, PG_UTF8,
+                                                                          (const char *) (iso - l), len);
+
                if (p->utf & 0xff000000)
                        *utf++ = p->utf >> 24;
                if (p->utf & 0x00ff0000)
@@ -461,5 +424,9 @@ LocalToUtf(unsigned char *iso, unsigned char *utf,
                if (p->utf & 0x000000ff)
                        *utf++ = p->utf & 0x000000ff;
        }
+
+       if (len > 0)
+               report_invalid_encoding(encoding, (const char *) iso, len);
+
        *utf = '\0';
 }
index 2a93ab52051e3ceb0870595227712f6765981eeb..d6440ceac443161193738e0621ddf724002b8f46 100644 (file)
@@ -70,14 +70,14 @@ extern Datum win866_to_iso(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void koi8r2mic(unsigned char *l, unsigned char *p, int len);
-static void mic2koi8r(unsigned char *mic, unsigned char *p, int len);
-static void iso2mic(unsigned char *l, unsigned char *p, int len);
-static void mic2iso(unsigned char *mic, unsigned char *p, int len);
-static void win12512mic(unsigned char *l, unsigned char *p, int len);
-static void mic2win1251(unsigned char *mic, unsigned char *p, int len);
-static void win8662mic(unsigned char *l, unsigned char *p, int len);
-static void mic2win866(unsigned char *mic, unsigned char *p, int len);
+static void koi8r2mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2koi8r(const unsigned char *mic, unsigned char *p, int len);
+static void iso2mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2iso(const unsigned char *mic, unsigned char *p, int len);
+static void win12512mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2win1251(const unsigned char *mic, unsigned char *p, int len);
+static void win8662mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2win866(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 koi8r_to_mic(PG_FUNCTION_ARGS)
@@ -401,7 +401,7 @@ win1251_to_iso(PG_FUNCTION_ARGS)
 
        buf = palloc(len * ENCODING_GROWTH_RATE);
        win12512mic(src, buf, len);
-       mic2win1251(buf, dest, strlen((char *) buf));
+       mic2iso(buf, dest, strlen((char *) buf));
        pfree(buf);
 
        PG_RETURN_VOID();
@@ -441,7 +441,7 @@ win866_to_iso(PG_FUNCTION_ARGS)
 
        buf = palloc(len * ENCODING_GROWTH_RATE);
        win8662mic(src, buf, len);
-       mic2win866(buf, dest, strlen((char *) buf));
+       mic2iso(buf, dest, strlen((char *) buf));
        pfree(buf);
 
        PG_RETURN_VOID();
@@ -460,23 +460,23 @@ win866_to_iso(PG_FUNCTION_ARGS)
 
 /* koi8r2mic: KOI8-R to Mule internal code */
 static void
-koi8r2mic(unsigned char *l, unsigned char *p, int len)
+koi8r2mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_KOI8_R);
+       latin2mic(l, p, len, LC_KOI8_R, PG_KOI8R);
 }
 
 /* mic2koi8r: Mule internal code to KOI8-R */
 static void
-mic2koi8r(unsigned char *mic, unsigned char *p, int len)
+mic2koi8r(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_KOI8_R);
+       mic2latin(mic, p, len, LC_KOI8_R, PG_KOI8R);
 }
 
 /* iso2mic: ISO-8859-5 to Mule internal code */
 static void
-iso2mic(unsigned char *l, unsigned char *p, int len)
+iso2mic(const unsigned char *l, unsigned char *p, int len)
 {
-       static unsigned char iso2koi[] = {
+       static const unsigned char iso2koi[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -495,14 +495,14 @@ iso2mic(unsigned char *l, unsigned char *p, int len)
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        };
 
-       latin2mic_with_table(l, p, len, LC_KOI8_R, iso2koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_ISO_8859_5, iso2koi);
 }
 
 /* mic2iso: Mule internal code to ISO8859-5 */
 static void
-mic2iso(unsigned char *mic, unsigned char *p, int len)
+mic2iso(const unsigned char *mic, unsigned char *p, int len)
 {
-       static unsigned char koi2iso[] = {
+       static const unsigned char koi2iso[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -521,14 +521,14 @@ mic2iso(unsigned char *mic, unsigned char *p, int len)
                0xcc, 0xcb, 0xb7, 0xc8, 0xcd, 0xc9, 0xc7, 0xca
        };
 
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2iso);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_ISO_8859_5, koi2iso);
 }
 
 /* win2mic: CP1251 to Mule internal code */
 static void
-win12512mic(unsigned char *l, unsigned char *p, int len)
+win12512mic(const unsigned char *l, unsigned char *p, int len)
 {
-       static unsigned char win2koi[] = {
+       static const unsigned char win2koi[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -547,14 +547,14 @@ win12512mic(unsigned char *l, unsigned char *p, int len)
                0xdb, 0xdd, 0xdf, 0xd9, 0xd8, 0xdc, 0xc0, 0xd1
        };
 
-       latin2mic_with_table(l, p, len, LC_KOI8_R, win2koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN1251, win2koi);
 }
 
 /* mic2win: Mule internal code to CP1251 */
 static void
-mic2win1251(unsigned char *mic, unsigned char *p, int len)
+mic2win1251(const unsigned char *mic, unsigned char *p, int len)
 {
-       static unsigned char koi2win[] = {
+       static const unsigned char koi2win[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -573,14 +573,14 @@ mic2win1251(unsigned char *mic, unsigned char *p, int len)
                0xdc, 0xdb, 0xc7, 0xd8, 0xdd, 0xd9, 0xd7, 0xda
        };
 
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2win);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_WIN1251, koi2win);
 }
 
 /* win8662mic: CP866 to Mule internal code */
 static void
-win8662mic(unsigned char *l, unsigned char *p, int len)
+win8662mic(const unsigned char *l, unsigned char *p, int len)
 {
-       static unsigned char win8662koi[] = {
+       static const unsigned char win8662koi[] = {
                0xe1, 0xe2, 0xf7, 0xe7, 0xe4, 0xe5, 0xf6, 0xfa,
                0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0,
                0xf2, 0xf3, 0xf4, 0xf5, 0xe6, 0xe8, 0xe3, 0xfe,
@@ -599,14 +599,14 @@ win8662mic(unsigned char *l, unsigned char *p, int len)
                0xb6, 0xa6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        };
 
-       latin2mic_with_table(l, p, len, LC_KOI8_R, win8662koi);
+       latin2mic_with_table(l, p, len, LC_KOI8_R, PG_WIN866, win8662koi);
 }
 
 /* mic2win866: Mule internal code to CP866 */
 static void
-mic2win866(unsigned char *mic, unsigned char *p, int len)
+mic2win866(const unsigned char *mic, unsigned char *p, int len)
 {
-       static unsigned char koi2win866[] = {
+       static const unsigned char koi2win866[] = {
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
                0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -625,5 +625,5 @@ mic2win866(unsigned char *mic, unsigned char *p, int len)
                0x9c, 0x9b, 0x87, 0x98, 0x9d, 0x99, 0x97, 0x9a
        };
 
-       mic2latin_with_table(mic, p, len, LC_KOI8_R, koi2win866);
+       mic2latin_with_table(mic, p, len, LC_KOI8_R, PG_WIN866, koi2win866);
 }
index 72d70e853152692f4df1b61a5711e47381103182..26b68d63ea3b77f69c4e07b1689223a3c511023d 100644 (file)
@@ -32,8 +32,8 @@ extern Datum mic_to_euc_cn(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void euc_cn2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_cn(unsigned char *mic, unsigned char *p, int len);
+static void euc_cn2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_cn(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_cn_to_mic(PG_FUNCTION_ARGS)
@@ -71,23 +71,30 @@ mic_to_euc_cn(PG_FUNCTION_ARGS)
  * EUC_CN ---> MIC
  */
 static void
-euc_cn2mic(unsigned char *euc, unsigned char *p, int len)
+euc_cn2mic(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1;
 
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
        {
-               if (c1 & 0x80)
+               c1 = *euc;
+               if (IS_HIGHBIT_SET(c1))
                {
-                       len -= 2;
+                       if (len < 2 || !IS_HIGHBIT_SET(euc[1]))
+                               report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
                        *p++ = LC_GB2312_80;
                        *p++ = c1;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       euc += 2;
+                       len -= 2;
                }
                else
                {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_CN, (const char *) euc, len);
                        *p++ = c1;
+                       euc++;
+                       len--;
                }
        }
        *p = '\0';
@@ -97,27 +104,34 @@ euc_cn2mic(unsigned char *euc, unsigned char *p, int len)
  * MIC ---> EUC_CN
  */
 static void
-mic2euc_cn(unsigned char *mic, unsigned char *p, int len)
+mic2euc_cn(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == LC_GB2312_80)
+               c1 = *mic;
+               if (IS_HIGHBIT_SET(c1))
                {
+                       if (c1 != LC_GB2312_80)
+                               report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_CN,
+                                                                                  (const char *) mic, len);
+                       if (len < 3 || !IS_HIGHBIT_SET(mic[1]) || !IS_HIGHBIT_SET(mic[2]))
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       mic++;
                        *p++ = *mic++;
                        *p++ = *mic++;
-               }
-               else if (c1 > 0x7f)
-               {                                               /* cannot convert to EUC_CN! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       len -= 3;
                }
                else
                {                                               /* should be ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
                        *p++ = c1;
+                       mic++;
+                       len--;
                }
        }
        *p = '\0';
index ed0d3e894998bfc1e82634bf78bc9a622d5e9212..51cf69de6aa9c4d944fb3f8ee784cc0d444fbd38 100644 (file)
@@ -22,9 +22,6 @@
 #define PGSJISALTCODE 0x81ac
 #define PGEUCALTCODE 0xa2ae
 
-#define ISSJISHEAD(c) ((c >= 0x81 && c <= 0x9f) || (c >= 0xe0 && c <= 0xfc))
-#define ISSJISTAIL(c) ((c >= 0x40 && c <= 0x7e) || (c >= 0x80 && c <= 0xfc))
-
 /*
  * conversion table between SJIS UDC (IBM kanji) and EUC_JP
  */
@@ -57,12 +54,12 @@ extern Datum mic_to_sjis(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void sjis2mic(unsigned char *sjis, unsigned char *p, int len);
-static void mic2sjis(unsigned char *mic, unsigned char *p, int len);
-static void euc_jp2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_jp(unsigned char *mic, unsigned char *p, int len);
-static void euc_jp2sjis(unsigned char *mic, unsigned char *p, int len);
-static void sjis2euc_jp(unsigned char *mic, unsigned char *p, int len);
+static void sjis2mic(const unsigned char *sjis, unsigned char *p, int len);
+static void mic2sjis(const unsigned char *mic, unsigned char *p, int len);
+static void euc_jp2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_jp(const unsigned char *mic, unsigned char *p, int len);
+static void euc_jp2sjis(const unsigned char *mic, unsigned char *p, int len);
+static void sjis2euc_jp(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_jp_to_sjis(PG_FUNCTION_ARGS)
@@ -164,38 +161,34 @@ mic_to_sjis(PG_FUNCTION_ARGS)
  * SJIS ---> MIC
  */
 static void
-sjis2mic(unsigned char *sjis, unsigned char *p, int len)
+sjis2mic(const unsigned char *sjis, unsigned char *p, int len)
 {
        int                     c1,
                                c2,
-/* Eiji Tokuya patched begin */
                                i,
                                k,
                                k2;
 
-/* Eiji Tokuya patched end */
-       while (len >= 0 && (c1 = *sjis++))
+       while (len > 0)
        {
+               c1 = *sjis;
                if (c1 >= 0xa1 && c1 <= 0xdf)
                {
                        /* JIS X0201 (1 byte kana) */
-                       len--;
                        *p++ = LC_JISX0201K;
                        *p++ = c1;
+                       sjis++;
+                       len--;
                }
-               else if (c1 > 0x7f)
+               else if (IS_HIGHBIT_SET(c1))
                {
                        /*
                         * JIS X0208, X0212, user defined extended characters
                         */
-                       c2 = *sjis++;
-                       if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                       errmsg("invalid byte sequence for encoding \"SJIS\": 0x%02x%02x",
-                                                   c1, c2)));
+                       if (len < 2 || !ISSJISHEAD(c1) || !ISSJISTAIL(sjis[1]))
+                               report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
+                       c2 = sjis[1];
                        k = (c1 << 8) + c2;
-/* Eiji Tokuya patched begin */
                        if (k >= 0xed40 && k < 0xf040)
                        {
                                /* NEC selection IBM kanji */
@@ -214,19 +207,15 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                        }
 
                        if (k < 0xeb3f)
-/* Eiji Tokuya patched end */
                        {
                                /* JIS X0208 */
-                               len -= 2;
                                *p++ = LC_JISX0208;
                                *p++ = ((c1 & 0x3f) << 1) + 0x9f + (c2 > 0x9e);
                                *p++ = c2 + ((c2 > 0x9e) ? 2 : 0x60) + (c2 < 0x80);
                        }
-/* Eiji Tokuya patched begin */
                        else if ((k >= 0xeb40 && k < 0xf040) || (k >= 0xfc4c && k <= 0xfcfc))
                        {
                                /* NEC selection IBM kanji - Other undecided justice */
-/* Eiji Tokuya patched end */
                                *p++ = LC_JISX0208;
                                *p++ = PGEUCALTCODE >> 8;
                                *p++ = PGEUCALTCODE & 0xff;
@@ -237,7 +226,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                 * UDC1 mapping to X0208 85 ku - 94 ku JIS code 0x7521 -
                                 * 0x7e7e EUC 0xf5a1 - 0xfefe
                                 */
-                               len -= 2;
                                *p++ = LC_JISX0208;
                                c1 -= 0x6f;
                                *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
@@ -249,7 +237,6 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                 * UDC2 mapping to X0212 85 ku - 94 ku JIS code 0x7521 -
                                 * 0x7e7e EUC 0x8ff5a1 - 0x8ffefe
                                 */
-                               len -= 2;
                                *p++ = LC_JISX0212;
                                c1 -= 0x74;
                                *p++ = ((c1 & 0x3f) << 1) + 0xf3 + (c2 > 0x9e);
@@ -259,9 +246,7 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                        {
                                /*
                                 * mapping IBM kanji to X0208 and X0212
-                                *
                                 */
-                               len -= 2;
                                for (i = 0;; i++)
                                {
                                        k2 = ibmkanji[i].sjis;
@@ -285,11 +270,16 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
                                        }
                                }
                        }
+                       sjis += 2;
+                       len -= 2;
                }
                else
                {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_SJIS, (const char *) sjis, len);
                        *p++ = c1;
+                       sjis++;
+                       len--;
                }
        }
        *p = '\0';
@@ -299,22 +289,37 @@ sjis2mic(unsigned char *sjis, unsigned char *p, int len)
  * MIC ---> SJIS
  */
 static void
-mic2sjis(unsigned char *mic, unsigned char *p, int len)
+mic2sjis(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1,
                                c2,
-                               k;
+                               k,
+                               l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                if (c1 == LC_JISX0201K)
-                       *p++ = *mic++;
+                       *p++ = mic[1];
                else if (c1 == LC_JISX0208)
                {
-                       c1 = *mic++;
-                       c2 = *mic++;
+                       c1 = mic[1];
+                       c2 = mic[2];
                        k = (c1 << 8) | (c2 & 0xff);
                        if (k >= 0xf5a1)
                        {
@@ -331,8 +336,8 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
                        int                     i,
                                                k2;
 
-                       c1 = *mic++;
-                       c2 = *mic++;
+                       c1 = mic[1];
+                       c2 = mic[2];
                        k = c1 << 8 | c2;
                        if (k >= 0xf5a1)
                        {
@@ -363,16 +368,11 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
                                }
                        }
                }
-               else if (c1 > 0x7f)
-               {
-                       /* cannot convert to SJIS! */
-                       *p++ = PGSJISALTCODE >> 8;
-                       *p++ = PGSJISALTCODE & 0xff;
-               }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_SJIS,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -381,37 +381,48 @@ mic2sjis(unsigned char *mic, unsigned char *p, int len)
  * EUC_JP ---> MIC
  */
 static void
-euc_jp2mic(unsigned char *euc, unsigned char *p, int len)
+euc_jp2mic(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
        {
+               c1 = *euc;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_JP,
+                                                                               (const char *) euc, len);
+                       *p++ = c1;
+                       euc++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_EUC_JP,
+                                                                       (const char *) euc, len);
                if (c1 == SS2)
                {                                               /* 1 byte kana? */
-                       len -= 2;
                        *p++ = LC_JISX0201K;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
                }
                else if (c1 == SS3)
                {                                               /* JIS X0212 kanji? */
-                       len -= 3;
                        *p++ = LC_JISX0212;
-                       *p++ = *euc++;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       *p++ = euc[2];
                }
-               else if (c1 & 0x80)
+               else
                {                                               /* kanji? */
-                       len -= 2;
                        *p++ = LC_JISX0208;
                        *p++ = c1;
-                       *p++ = *euc++;
-               }
-               else
-               {                                               /* should be ASCII */
-                       len--;
-                       *p++ = c1;
+                       *p++ = euc[1];
                }
+               euc += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -420,39 +431,50 @@ euc_jp2mic(unsigned char *euc, unsigned char *p, int len)
  * MIC ---> EUC_JP
  */
 static void
-mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
+mic2euc_jp(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                if (c1 == LC_JISX0201K)
                {
                        *p++ = SS2;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
                }
                else if (c1 == LC_JISX0212)
                {
                        *p++ = SS3;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
                else if (c1 == LC_JISX0208)
                {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
-               }
-               else if (c1 > 0x7f)
-               {                                               /* cannot convert to EUC_JP! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_JP,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -461,30 +483,41 @@ mic2euc_jp(unsigned char *mic, unsigned char *p, int len)
  * EUC_JP -> SJIS
  */
 static void
-euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
+euc_jp2sjis(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1,
                                c2,
                                k;
-       unsigned char *euc_end = euc + len;
+       int                     l;
 
-       while (euc_end >= euc && (c1 = *euc++))
+       while (len > 0)
        {
-               if (c1 < 0x80)
+               c1 = *euc;
+               if (!IS_HIGHBIT_SET(c1))
                {
-                       /* should be ASCII */
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_JP,
+                                                                               (const char *) euc, len);
                        *p++ = c1;
+                       euc++;
+                       len--;
+                       continue;
                }
-               else if (c1 == SS2)
+               l = pg_encoding_verifymb(PG_EUC_JP, (const char *) euc, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_EUC_JP,
+                                                                       (const char *) euc, len);
+               if (c1 == SS2)
                {
                        /* hankaku kana? */
-                       *p++ = *euc++;
+                       *p++ = euc[1];
                }
                else if (c1 == SS3)
                {
                        /* JIS X0212 kanji? */
-                       c1 = *euc++;
-                       c2 = *euc++;
+                       c1 = euc[1];
+                       c2 = euc[2];
                        k = c1 << 8 | c2;
                        if (k >= 0xf5a1)
                        {
@@ -521,7 +554,7 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
                else
                {
                        /* JIS X0208 kanji? */
-                       c2 = *euc++;
+                       c2 = euc[1];
                        k = (c1 << 8) | (c2 & 0xff);
                        if (k >= 0xf5a1)
                        {
@@ -533,6 +566,8 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
                                *p++ = ((c1 - 0xa1) >> 1) + ((c1 < 0xdf) ? 0x81 : 0xc1);
                        *p++ = c2 - ((c1 & 1) ? ((c2 < 0xe0) ? 0x61 : 0x60) : 2);
                }
+               euc += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -541,23 +576,34 @@ euc_jp2sjis(unsigned char *euc, unsigned char *p, int len)
  * SJIS ---> EUC_JP
  */
 static void
-sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
+sjis2euc_jp(const unsigned char *sjis, unsigned char *p, int len)
 {
        int                     c1,
                                c2,
                                i,
                                k,
                                k2;
-       unsigned char *sjis_end = sjis + len;
+       int                     l;
 
-       while (sjis_end >= sjis && (c1 = *sjis++))
+       while (len > 0)
        {
-               if (c1 < 0x80)
+               c1 = *sjis;
+               if (!IS_HIGHBIT_SET(c1))
                {
-                       /* should be ASCII */
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_SJIS,
+                                                                               (const char *) sjis, len);
                        *p++ = c1;
+                       sjis++;
+                       len--;
+                       continue;
                }
-               else if (c1 >= 0xa1 && c1 <= 0xdf)
+               l = pg_encoding_verifymb(PG_SJIS, (const char *) sjis, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_SJIS,
+                                                                       (const char *) sjis, len);
+               if (c1 >= 0xa1 && c1 <= 0xdf)
                {
                        /* JIS X0201 (1 byte kana) */
                        *p++ = SS2;
@@ -568,12 +614,7 @@ sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
                        /*
                         * JIS X0208, X0212, user defined extended characters
                         */
-                       c2 = *sjis++;
-                       if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                       errmsg("invalid byte sequence for encoding \"SJIS\": 0x%02x%02x",
-                                                   c1, c2)));
+                       c2 = sjis[1];
                        k = (c1 << 8) + c2;
                        if (k >= 0xed40 && k < 0xf040)
                        {
@@ -654,6 +695,8 @@ sjis2euc_jp(unsigned char *sjis, unsigned char *p, int len)
                                }
                        }
                }
+               sjis += l;
+               len -= l;
        }
        *p = '\0';
 }
index e4f70b66a58bf9ccb6b839f22d6358f747be28c7..0aa0cd7f947cde7c6ff005de30b50eb7c8db9d67 100644 (file)
@@ -32,8 +32,8 @@ extern Datum mic_to_euc_kr(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void euc_kr2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_kr(unsigned char *mic, unsigned char *p, int len);
+static void euc_kr2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_kr(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_kr_to_mic(PG_FUNCTION_ARGS)
@@ -71,23 +71,34 @@ mic_to_euc_kr(PG_FUNCTION_ARGS)
  * EUC_KR ---> MIC
  */
 static void
-euc_kr2mic(unsigned char *euc, unsigned char *p, int len)
+euc_kr2mic(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
        {
-               if (c1 & 0x80)
+               c1 = *euc;
+               if (IS_HIGHBIT_SET(c1))
                {
-                       len -= 2;
+                       l = pg_encoding_verifymb(PG_EUC_KR, (const char *) euc, len);
+                       if (l != 2)
+                               report_invalid_encoding(PG_EUC_KR,
+                                                                               (const char *) euc, len);
                        *p++ = LC_KS5601;
                        *p++ = c1;
-                       *p++ = *euc++;
+                       *p++ = euc[1];
+                       euc += 2;
+                       len -= 2;
                }
                else
                {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_KR,
+                                                                               (const char *) euc, len);
                        *p++ = c1;
+                       euc++;
+                       len--;
                }
        }
        *p = '\0';
@@ -97,28 +108,39 @@ euc_kr2mic(unsigned char *euc, unsigned char *p, int len)
  * MIC ---> EUC_KR
  */
 static void
-mic2euc_kr(unsigned char *mic, unsigned char *p, int len)
+mic2euc_kr(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
-               if (c1 == LC_KS5601)
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
                {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
                }
-               else if (c1 > 0x7f)
-               {                                               /* cannot convert to EUC_KR! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
+               if (c1 == LC_KS5601)
+               {
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_KR,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
index 3063bdfd852486bf08573d673009a3a5018d5a03..2ece3d51c236f3ac8eeac0b2117d8de2e7c78034 100644 (file)
@@ -42,10 +42,10 @@ extern Datum mic_to_big5(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void big52mic(unsigned char *big5, unsigned char *p, int len);
-static void mic2big5(unsigned char *mic, unsigned char *p, int len);
-static void euc_tw2mic(unsigned char *euc, unsigned char *p, int len);
-static void mic2euc_tw(unsigned char *mic, unsigned char *p, int len);
+static void big52mic(const unsigned char *big5, unsigned char *p, int len);
+static void mic2big5(const unsigned char *mic, unsigned char *p, int len);
+static void euc_tw2mic(const unsigned char *euc, unsigned char *p, int len);
+static void mic2euc_tw(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 euc_tw_to_big5(PG_FUNCTION_ARGS)
@@ -114,7 +114,7 @@ mic_to_euc_tw(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(1) == PG_EUC_TW);
        Assert(len >= 0);
 
-       mic2big5(src, dest, len);
+       mic2euc_tw(src, dest, len);
 
        PG_RETURN_VOID();
 }
@@ -155,39 +155,52 @@ mic_to_big5(PG_FUNCTION_ARGS)
  * EUC_TW ---> MIC
  */
 static void
-euc_tw2mic(unsigned char *euc, unsigned char *p, int len)
+euc_tw2mic(const unsigned char *euc, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *euc++))
+       while (len > 0)
        {
-               if (c1 == SS2)
+               c1 = *euc;
+               if (IS_HIGHBIT_SET(c1))
                {
-                       len -= 4;
-                       c1 = *euc++;            /* plane No. */
-                       if (c1 == 0xa1)
-                               *p++ = LC_CNS11643_1;
-                       else if (c1 == 0xa2)
-                               *p++ = LC_CNS11643_2;
-                       else
+                       l = pg_encoding_verifymb(PG_EUC_TW, (const char *) euc, len);
+                       if (l < 0)
+                               report_invalid_encoding(PG_EUC_TW,
+                                                                               (const char *) euc, len);
+                       if (c1 == SS2)
                        {
-                               *p++ = 0x9d;    /* LCPRV2 */
-                               *p++ = 0xa3 - c1 + LC_CNS11643_3;
+                               c1 = euc[1];            /* plane No. */
+                               if (c1 == 0xa1)
+                                       *p++ = LC_CNS11643_1;
+                               else if (c1 == 0xa2)
+                                       *p++ = LC_CNS11643_2;
+                               else
+                               {
+                                       *p++ = 0x9d;    /* LCPRV2 */
+                                       *p++ = c1 - 0xa3 + LC_CNS11643_3;
+                               }
+                               *p++ = euc[2];
+                               *p++ = euc[3];
                        }
-                       *p++ = *euc++;
-                       *p++ = *euc++;
-               }
-               else if (c1 & 0x80)
-               {                                               /* CNS11643-1 */
-                       len -= 2;
-                       *p++ = LC_CNS11643_1;
-                       *p++ = c1;
-                       *p++ = *euc++;
+                       else
+                       {                                               /* CNS11643-1 */
+                               *p++ = LC_CNS11643_1;
+                               *p++ = c1;
+                               *p++ = euc[1];
+                       }
+                       euc += l;
+                       len -= l;
                }
                else
                {                                               /* should be ASCII */
-                       len--;
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_EUC_TW,
+                                                                               (const char *) euc, len);
                        *p++ = c1;
+                       euc++;
+                       len--;
                }
        }
        *p = '\0';
@@ -197,42 +210,54 @@ euc_tw2mic(unsigned char *euc, unsigned char *p, int len)
  * MIC ---> EUC_TW
  */
 static void
-mic2euc_tw(unsigned char *mic, unsigned char *p, int len)
+mic2euc_tw(const unsigned char *mic, unsigned char *p, int len)
 {
        int                     c1;
+       int                     l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               len -= pg_mic_mblen(mic++);
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                if (c1 == LC_CNS11643_1)
                {
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
                else if (c1 == LC_CNS11643_2)
                {
                        *p++ = SS2;
                        *p++ = 0xa2;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
+                       *p++ = mic[1];
+                       *p++ = mic[2];
                }
-               else if (c1 == 0x9d)
+               else if (c1 == 0x9d &&
+                                mic[1] >= LC_CNS11643_3 && mic[1] <= LC_CNS11643_7)
                {                                               /* LCPRV2? */
                        *p++ = SS2;
-                       *p++ = *mic++ - LC_CNS11643_3 + 0xa3;
-                       *p++ = *mic++;
-                       *p++ = *mic++;
-               }
-               else if (c1 > 0x7f)
-               {                                               /* cannot convert to EUC_TW! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
+                       *p++ = mic[1] - LC_CNS11643_3 + 0xa3;
+                       *p++ = mic[2];
+                       *p++ = mic[3];
                }
                else
-               {                                               /* should be ASCII */
-                       *p++ = c1;
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_EUC_TW,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -241,52 +266,49 @@ mic2euc_tw(unsigned char *mic, unsigned char *p, int len)
  * Big5 ---> MIC
  */
 static void
-big52mic(unsigned char *big5, unsigned char *p, int len)
+big52mic(const unsigned char *big5, unsigned char *p, int len)
 {
        unsigned short c1;
        unsigned short big5buf,
                                cnsBuf;
        unsigned char lc;
-       char            bogusBuf[3];
-       int                     i;
+       int                     l;
 
-       while (len >= 0 && (c1 = *big5++))
+       while (len > 0)
        {
-               if (c1 <= 0x7fU)
-               {                                               /* ASCII */
-                       len--;
+               c1 = *big5;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_BIG5,
+                                                                               (const char *) big5, len);
                        *p++ = c1;
+                       big5++;
+                       len--;
+                       continue;
                }
-               else
+               l = pg_encoding_verifymb(PG_BIG5, (const char *) big5, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_BIG5,
+                                                                       (const char *) big5, len);
+               big5buf = (c1 << 8) | big5[1];
+               cnsBuf = BIG5toCNS(big5buf, &lc);
+               if (lc != 0)
                {
-                       len -= 2;
-                       big5buf = c1 << 8;
-                       c1 = *big5++;
-                       big5buf |= c1;
-                       cnsBuf = BIG5toCNS(big5buf, &lc);
-                       if (lc != 0)
+                       if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
                        {
-                               if (lc == LC_CNS11643_3 || lc == LC_CNS11643_4)
-                               {
-                                       *p++ = 0x9d;    /* LCPRV2 */
-                               }
-                               *p++ = lc;              /* Plane No. */
-                               *p++ = (cnsBuf >> 8) & 0x00ff;
-                               *p++ = cnsBuf & 0x00ff;
-                       }
-                       else
-                       {                                       /* cannot convert */
-                               big5 -= 2;
-                               *p++ = '(';
-                               for (i = 0; i < 2; i++)
-                               {
-                                       sprintf(bogusBuf, "%02x", *big5++);
-                                       *p++ = bogusBuf[0];
-                                       *p++ = bogusBuf[1];
-                               }
-                               *p++ = ')';
+                               *p++ = 0x9d;    /* LCPRV2 */
                        }
+                       *p++ = lc;              /* Plane No. */
+                       *p++ = (cnsBuf >> 8) & 0x00ff;
+                       *p++ = cnsBuf & 0x00ff;
                }
+               else
+                       report_untranslatable_char(PG_BIG5, PG_MULE_INTERNAL,
+                                                                          (const char *) big5, len);
+               big5 += l;
+               len -= l;
        }
        *p = '\0';
 }
@@ -295,46 +317,55 @@ big52mic(unsigned char *big5, unsigned char *p, int len)
  * MIC ---> Big5
  */
 static void
-mic2big5(unsigned char *mic, unsigned char *p, int len)
+mic2big5(const unsigned char *mic, unsigned char *p, int len)
 {
-       int                     l;
        unsigned short c1;
        unsigned short big5buf,
                                cnsBuf;
+       int                     l;
 
-       while (len >= 0 && (c1 = *mic))
+       while (len > 0)
        {
-               l = pg_mic_mblen(mic++);
-               len -= l;
-
+               c1 = *mic;
+               if (!IS_HIGHBIT_SET(c1))
+               {
+                       /* ASCII */
+                       if (c1 == 0)
+                               report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                               (const char *) mic, len);
+                       *p++ = c1;
+                       mic++;
+                       len--;
+                       continue;
+               }
+               l = pg_encoding_verifymb(PG_MULE_INTERNAL, (const char *) mic, len);
+               if (l < 0)
+                       report_invalid_encoding(PG_MULE_INTERNAL,
+                                                                       (const char *) mic, len);
                /* 0x9d means LCPRV2 */
                if (c1 == LC_CNS11643_1 || c1 == LC_CNS11643_2 || c1 == 0x9d)
                {
                        if (c1 == 0x9d)
                        {
-                               c1 = *mic++;    /* get plane no. */
-                       }
-                       cnsBuf = (*mic++) << 8;
-                       cnsBuf |= (*mic++) & 0x00ff;
-                       big5buf = CNStoBIG5(cnsBuf, c1);
-                       if (big5buf == 0)
-                       {                                       /* cannot convert to Big5! */
-                               mic -= l;
-                               pg_print_bogus_char(&mic, &p);
+                               c1 = mic[1];    /* get plane no. */
+                               cnsBuf = (mic[2] << 8) | mic[3];
                        }
                        else
                        {
-                               *p++ = (big5buf >> 8) & 0x00ff;
-                               *p++ = big5buf & 0x00ff;
+                               cnsBuf = (mic[1] << 8) | mic[2];
                        }
+                       big5buf = CNStoBIG5(cnsBuf, c1);
+                       if (big5buf == 0)
+                               report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
+                                                                                  (const char *) mic, len);
+                       *p++ = (big5buf >> 8) & 0x00ff;
+                       *p++ = big5buf & 0x00ff;
                }
-               else if (c1 <= 0x7f)    /* ASCII */
-                       *p++ = c1;
                else
-               {                                               /* cannot convert to Big5! */
-                       mic--;
-                       pg_print_bogus_char(&mic, &p);
-               }
+                       report_untranslatable_char(PG_MULE_INTERNAL, PG_BIG5,
+                                                                          (const char *) mic, len);
+               mic += l;
+               len -= l;
        }
        *p = '\0';
 }
index 4de1724c5b839ab87dba450d2138d4e338fbbe72..b4bee522decdb1991c664ef741aa1aade8f8e770 100644 (file)
@@ -42,10 +42,10 @@ extern Datum win1250_to_latin2(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void latin22mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin2(unsigned char *mic, unsigned char *p, int len);
-static void win12502mic(unsigned char *l, unsigned char *p, int len);
-static void mic2win1250(unsigned char *mic, unsigned char *p, int len);
+static void latin22mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin2(const unsigned char *mic, unsigned char *p, int len);
+static void win12502mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2win1250(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 latin2_to_mic(PG_FUNCTION_ARGS)
@@ -152,14 +152,15 @@ win1250_to_latin2(PG_FUNCTION_ARGS)
 }
 
 static void
-latin22mic(unsigned char *l, unsigned char *p, int len)
+latin22mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_ISO8859_2);
+       latin2mic(l, p, len, LC_ISO8859_2, PG_LATIN2);
 }
+
 static void
-mic2latin2(unsigned char *mic, unsigned char *p, int len)
+mic2latin2(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_ISO8859_2);
+       mic2latin(mic, p, len, LC_ISO8859_2, PG_LATIN2);
 }
 
 /*-----------------------------------------------------------------
@@ -167,9 +168,9 @@ mic2latin2(unsigned char *mic, unsigned char *p, int len)
  * Microsoft's CP1250(windows-1250)
  *-----------------------------------------------------------------*/
 static void
-win12502mic(unsigned char *l, unsigned char *p, int len)
+win12502mic(const unsigned char *l, unsigned char *p, int len)
 {
-       static unsigned char win1250_2_iso88592[] = {
+       static const unsigned char win1250_2_iso88592[] = {
                0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                0x88, 0x89, 0xA9, 0x8B, 0xA6, 0xAB, 0xAE, 0xAC,
                0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
@@ -188,12 +189,14 @@ win12502mic(unsigned char *l, unsigned char *p, int len)
                0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
        };
 
-       latin2mic_with_table(l, p, len, LC_ISO8859_2, win1250_2_iso88592);
+       latin2mic_with_table(l, p, len, LC_ISO8859_2, PG_WIN1250,
+                                                win1250_2_iso88592);
 }
+
 static void
-mic2win1250(unsigned char *mic, unsigned char *p, int len)
+mic2win1250(const unsigned char *mic, unsigned char *p, int len)
 {
-       static unsigned char iso88592_2_win1250[] = {
+       static const unsigned char iso88592_2_win1250[] = {
                0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
                0x88, 0x89, 0x00, 0x8B, 0x00, 0x00, 0x00, 0x00,
                0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
@@ -212,5 +215,6 @@ mic2win1250(unsigned char *mic, unsigned char *p, int len)
                0xF8, 0xF9, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE, 0xFF
        };
 
-       mic2latin_with_table(mic, p, len, LC_ISO8859_2, iso88592_2_win1250);
+       mic2latin_with_table(mic, p, len, LC_ISO8859_2, PG_WIN1250,
+                                                iso88592_2_win1250);
 }
index a2f7f45f3ebde5f4a11e1af738b857fd18f61097..3fb72a727e0ba9954fb0645786cbcac127429a3a 100644 (file)
@@ -40,12 +40,12 @@ extern Datum mic_to_latin4(PG_FUNCTION_ARGS);
  * ----------
  */
 
-static void latin12mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin1(unsigned char *mic, unsigned char *p, int len);
-static void latin32mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin3(unsigned char *mic, unsigned char *p, int len);
-static void latin42mic(unsigned char *l, unsigned char *p, int len);
-static void mic2latin4(unsigned char *mic, unsigned char *p, int len);
+static void latin12mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin1(const unsigned char *mic, unsigned char *p, int len);
+static void latin32mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin3(const unsigned char *mic, unsigned char *p, int len);
+static void latin42mic(const unsigned char *l, unsigned char *p, int len);
+static void mic2latin4(const unsigned char *mic, unsigned char *p, int len);
 
 Datum
 latin1_to_mic(PG_FUNCTION_ARGS)
@@ -144,32 +144,37 @@ mic_to_latin4(PG_FUNCTION_ARGS)
 }
 
 static void
-latin12mic(unsigned char *l, unsigned char *p, int len)
+latin12mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_ISO8859_1);
+       latin2mic(l, p, len, LC_ISO8859_1, PG_LATIN1);
 }
+
 static void
-mic2latin1(unsigned char *mic, unsigned char *p, int len)
+mic2latin1(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_ISO8859_1);
+       mic2latin(mic, p, len, LC_ISO8859_1, PG_LATIN1);
 }
+
 static void
-latin32mic(unsigned char *l, unsigned char *p, int len)
+latin32mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_ISO8859_3);
+       latin2mic(l, p, len, LC_ISO8859_3, PG_LATIN3);
 }
+
 static void
-mic2latin3(unsigned char *mic, unsigned char *p, int len)
+mic2latin3(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_ISO8859_3);
+       mic2latin(mic, p, len, LC_ISO8859_3, PG_LATIN3);
 }
+
 static void
-latin42mic(unsigned char *l, unsigned char *p, int len)
+latin42mic(const unsigned char *l, unsigned char *p, int len)
 {
-       latin2mic(l, p, len, LC_ISO8859_4);
+       latin2mic(l, p, len, LC_ISO8859_4, PG_LATIN4);
 }
+
 static void
-mic2latin4(unsigned char *mic, unsigned char *p, int len)
+mic2latin4(const unsigned char *mic, unsigned char *p, int len)
 {
-       mic2latin(mic, p, len, LC_ISO8859_4);
+       mic2latin(mic, p, len, LC_ISO8859_4, PG_LATIN4);
 }
index 7212717221d1c028ee7d5a99f42fb3c33dd0f9c9..14369b4cdf65528660f0d8f4725b35a70637d325 100644 (file)
@@ -43,6 +43,7 @@ ascii_to_utf8(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(1) == PG_UTF8);
        Assert(len >= 0);
 
+       /* this looks wrong, but basically we're just rejecting high-bit-set */
        pg_ascii2mic(src, dest, len);
 
        PG_RETURN_VOID();
@@ -59,6 +60,7 @@ utf8_to_ascii(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(1) == PG_SQL_ASCII);
        Assert(len >= 0);
 
+       /* this looks wrong, but basically we're just rejecting high-bit-set */
        pg_mic2ascii(src, dest, len);
 
        PG_RETURN_VOID();
index 8774f9dc438a1c2d93e06495ee66a424bf0a1204..42a5bd4411f2439f0fc5a15dadaa998b99ababa3 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_big5(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapBIG5,
-                          sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapBIG5) / sizeof(pg_utf_to_local), PG_BIG5, len);
 
        PG_RETURN_VOID();
 }
index f5179cf206dbfc59bb8c3bb899746dd395cf6ef7..98715e075bf6bec72bf8116cf4c241d147a0fe00 100644 (file)
@@ -58,7 +58,7 @@ utf8_to_koi8r(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmap_KOI8R,
-                          sizeof(ULmap_KOI8R) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmap_KOI8R) / sizeof(pg_utf_to_local), PG_KOI8R, len);
 
        PG_RETURN_VOID();
 }
@@ -92,7 +92,7 @@ utf8_to_win1251(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmap_WIN1251,
-                          sizeof(ULmap_WIN1251) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmap_WIN1251) / sizeof(pg_utf_to_local), PG_WIN1251, len);
 
        PG_RETURN_VOID();
 }
@@ -126,7 +126,7 @@ utf8_to_win866(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmap_WIN866,
-                          sizeof(ULmap_WIN866) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmap_WIN866) / sizeof(pg_utf_to_local), PG_WIN866, len);
 
        PG_RETURN_VOID();
 }
index 14b073b908f078b2db1da3a3c31077f9b2f9adf8..ff5a995ff94fb5dad11b9fc8b309469689e088cb 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_euc_cn(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapEUC_CN,
-                          sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_CN) / sizeof(pg_utf_to_local), PG_EUC_CN, len);
 
        PG_RETURN_VOID();
 }
index 9100e6f671fe061e32d5bfca9d61e9438043d104..bd07c9c3da1f1dd71d2d6572c5e32de3b3081cd5 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_euc_jp(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapEUC_JP,
-                          sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_JP) / sizeof(pg_utf_to_local), PG_EUC_JP, len);
 
        PG_RETURN_VOID();
 }
index abef0066560e3388e5b8d86948d99787687c5737..bee3fd99f7d204f9221df8e178f380a600e6c071 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_euc_kr(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapEUC_KR,
-                          sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_KR) / sizeof(pg_utf_to_local), PG_EUC_KR, len);
 
        PG_RETURN_VOID();
 }
index 5c16b97fae8cc2aa94e4d96c25ebfcab2903fdf8..175bede0c6ca12c7cf4f6796e0e391bd063a1a4e 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_euc_tw(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapEUC_TW,
-                          sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapEUC_TW) / sizeof(pg_utf_to_local), PG_EUC_TW, len);
 
        PG_RETURN_VOID();
 }
index 64f3d4fa10b967cdb8f053368878a7dd3ed8d75a..a675cddabf5017df3f3cfcd47a31c78c49e5649f 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_gb18030(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapGB18030,
-                          sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapGB18030) / sizeof(pg_utf_to_local), PG_GB18030, len);
 
        PG_RETURN_VOID();
 }
index cb9791c4697fd60c3beaa0ebb5fa6ddf1da15b6c..3cbbaa985bf7c6102cf71caa158271511ac5a0f1 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_gbk(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapGBK,
-                          sizeof(ULmapGBK) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapGBK) / sizeof(pg_utf_to_local), PG_GBK, len);
 
        PG_RETURN_VOID();
 }
index 409775a2994334f5a5569aced81f9d0350083314..f4cc007daecf18f466608c615367844f1563ca13 100644 (file)
@@ -153,7 +153,7 @@ utf8_to_iso8859(PG_FUNCTION_ARGS)
        {
                if (encoding == maps[i].encoding)
                {
-                       UtfToLocal(src, dest, maps[i].map2, maps[i].size2, len);
+                       UtfToLocal(src, dest, maps[i].map2, maps[i].size2, encoding, len);
                        PG_RETURN_VOID();
                }
        }
index 034b657055811e7184f37b1c8eff44a6bfff4a68..ae38859a7e046449537fc45d78fbd1080f993536 100644 (file)
@@ -44,15 +44,20 @@ iso8859_1_to_utf8(PG_FUNCTION_ARGS)
        Assert(PG_GETARG_INT32(1) == PG_UTF8);
        Assert(len >= 0);
 
-       while (len-- > 0 && (c = *src++))
+       while (len > 0)
        {
-               if (c < 0x80)
+               c = *src;
+               if (c == 0)
+                       report_invalid_encoding(PG_LATIN1, (const char *) src, len);
+               if (!IS_HIGHBIT_SET(c))
                        *dest++ = c;
                else
                {
                        *dest++ = (c >> 6) | 0xc0;
                        *dest++ = (c & 0x003f) | 0x80;
                }
+               src++;
+               len--;
        }
        *dest = '\0';
 
@@ -66,30 +71,44 @@ utf8_to_iso8859_1(PG_FUNCTION_ARGS)
        unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
        int                     len = PG_GETARG_INT32(4);
        unsigned short c,
-                               c1,
-                               c2;
+                               c1;
 
        Assert(PG_GETARG_INT32(0) == PG_UTF8);
        Assert(PG_GETARG_INT32(1) == PG_LATIN1);
        Assert(len >= 0);
 
-       while (len >= 0 && (c = *src++))
+       while (len > 0)
        {
-               if ((c & 0xe0) == 0xc0)
+               c = *src;
+               if (c == 0)
+                       report_invalid_encoding(PG_UTF8, (const char *) src, len);
+               /* fast path for ASCII-subset characters */
+               if (!IS_HIGHBIT_SET(c))
                {
-                       c1 = c & 0x1f;
-                       c2 = *src++ & 0x3f;
-                       *dest = c1 << 6;
-                       *dest++ |= c2;
-                       len -= 2;
+                       *dest++ = c;
+                       src++;
+                       len--;
                }
-               else if ((c & 0xe0) == 0xe0)
-                       elog(ERROR, "could not convert UTF8 character 0x%04x to ISO8859-1",
-                                c);
                else
                {
-                       *dest++ = c;
-                       len--;
+                       int             l = pg_utf_mblen(src);
+
+                       if (l > len || !pg_utf8_islegal(src, l))
+                               report_invalid_encoding(PG_UTF8, (const char *) src, len);
+                       if (l != 2)
+                               report_untranslatable_char(PG_UTF8, PG_LATIN1,
+                                                                                  (const char *) src, len);
+                       c1 = src[1] & 0x3f;
+                       c = ((c & 0x1f) << 6) | c1;
+                       if (c >= 0x80 && c <= 0xff)
+                       {
+                               *dest++ = (unsigned char) c;
+                               src += 2;
+                               len -= 2;
+                       }
+                       else
+                               report_untranslatable_char(PG_UTF8, PG_LATIN1,
+                                                                                  (const char *) src, len);
                }
        }
        *dest = '\0';
index 6e78e9101ef3badeb25555376ce9c386bd21290e..2ad3319d3cc34c445b47f75e2263007c544c66d5 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_johab(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapJOHAB,
-                          sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapJOHAB) / sizeof(pg_utf_to_local), PG_JOHAB, len);
 
        PG_RETURN_VOID();
 }
index c592c0ec50fa37a5e67fbc7411022589b5a2cc3e..864d9daddc345ea15e4328042885c0ceb26e7292 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_sjis(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapSJIS,
-                          sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapSJIS) / sizeof(pg_utf_to_local), PG_SJIS, len);
 
        PG_RETURN_VOID();
 }
index e64349c593f4978531d4a20aa62e292018400148..84ca5c0abe38ff96b04a768443d2ed691b05cc3a 100644 (file)
@@ -62,7 +62,7 @@ utf8_to_uhc(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapUHC,
-                          sizeof(ULmapUHC) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapUHC) / sizeof(pg_utf_to_local), PG_UHC, len);
 
        PG_RETURN_VOID();
 }
index 6270234eb0f9f29a22cff93ad021c8e1a487dd20..ca35da82ca6cce7a74ef1c10f58d933ca07f311f 100644 (file)
@@ -46,7 +46,7 @@ utf8_to_win1250(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapWIN1250,
-                          sizeof(ULmapWIN1250) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapWIN1250) / sizeof(pg_utf_to_local), PG_WIN1250, len);
 
        PG_RETURN_VOID();
 }
index 6afeb6a1487210b3d5fe8bacccc57b8e75afab63..61ecbb481c5cf30ee3b582b317b441bb3e5451d5 100644 (file)
@@ -46,7 +46,7 @@ utf8_to_win1252(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapWIN1252,
-                          sizeof(ULmapWIN1252) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapWIN1252) / sizeof(pg_utf_to_local), PG_WIN1252, len);
 
        PG_RETURN_VOID();
 }
index 7260a638ba941de724aee4094cebcc99affaeea6..e704634a2c81f2d4cadc5d3d10e2ba9fab07106b 100644 (file)
@@ -46,7 +46,7 @@ utf8_to_win1256(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapWIN1256,
-                          sizeof(ULmapWIN1256) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapWIN1256) / sizeof(pg_utf_to_local), PG_WIN1256, len);
 
        PG_RETURN_VOID();
 }
diff --git a/src/backend/utils/mb/conversion_procs/utf8_and_win1258/utf8_and_win1258.c b/src/backend/utils/mb/conversion_procs/utf8_and_win1258/utf8_and_win1258.c
new file mode 100644 (file)
index 0000000..238c5ec
--- /dev/null
@@ -0,0 +1,68 @@
+/*-------------------------------------------------------------------------
+ *
+ *       WIN1258 <--> UTF8
+ *
+ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * IDENTIFICATION
+ *       $PostgreSQL$
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+#include "fmgr.h"
+#include "mb/pg_wchar.h"
+#include "../../Unicode/win1258_to_utf8.map"
+#include "../../Unicode/utf8_to_win1258.map"
+
+PG_FUNCTION_INFO_V1(win1258_to_utf8);
+PG_FUNCTION_INFO_V1(utf8_to_win1258);
+
+extern Datum win1258_to_utf8(PG_FUNCTION_ARGS);
+extern Datum utf8_to_win1258(PG_FUNCTION_ARGS);
+
+/* ----------
+ * conv_proc(
+ *             INTEGER,        -- source encoding id
+ *             INTEGER,        -- destination encoding id
+ *             CSTRING,        -- source string (null terminated C string)
+ *             CSTRING,        -- destination string (null terminated C string)
+ *             INTEGER         -- source string length
+ * ) returns VOID;
+ * ----------
+ */
+Datum
+win1258_to_utf8(PG_FUNCTION_ARGS)
+{
+       unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
+       unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
+       int                     len = PG_GETARG_INT32(4);
+
+       Assert(PG_GETARG_INT32(0) == PG_WIN1258);
+       Assert(PG_GETARG_INT32(1) == PG_UTF8);
+       Assert(len >= 0);
+
+       LocalToUtf(src, dest, LUmapWIN1258,
+                       sizeof(LUmapWIN1258) / sizeof(pg_local_to_utf), PG_WIN1258, len);
+
+       PG_RETURN_VOID();
+}
+
+Datum
+utf8_to_win1258(PG_FUNCTION_ARGS)
+{
+       unsigned char *src = (unsigned char *) PG_GETARG_CSTRING(2);
+       unsigned char *dest = (unsigned char *) PG_GETARG_CSTRING(3);
+       int                     len = PG_GETARG_INT32(4);
+
+       Assert(PG_GETARG_INT32(0) == PG_UTF8);
+       Assert(PG_GETARG_INT32(1) == PG_WIN1258);
+       Assert(len >= 0);
+
+       UtfToLocal(src, dest, ULmapWIN1258,
+                          sizeof(ULmapWIN1258) / sizeof(pg_utf_to_local), PG_WIN1258, len);
+
+       PG_RETURN_VOID();
+}
index 428588b4685d5d119d76af7de0b83c8e9efdd5ad..b75c6c52037908ba37ee6769bb8622b22adf6e7f 100644 (file)
@@ -46,7 +46,7 @@ utf8_to_win874(PG_FUNCTION_ARGS)
        Assert(len >= 0);
 
        UtfToLocal(src, dest, ULmapWIN874,
-                          sizeof(ULmapWIN874) / sizeof(pg_utf_to_local), len);
+                          sizeof(ULmapWIN874) / sizeof(pg_utf_to_local), PG_WIN874, len);
 
        PG_RETURN_VOID();
 }
index 5846fc5716da4e86a367509e31ab51a78c437f2b..d11e9eab7adf052ad16601cafdbcc5589008c1d9 100644 (file)
@@ -368,8 +368,49 @@ pg_client_to_server(const char *s, int len)
        Assert(DatabaseEncoding);
        Assert(ClientEncoding);
 
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding)
+       if (len <= 0)
+               return (char *) s;
+
+       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
+               ClientEncoding->encoding == PG_SQL_ASCII)
+       {
+               /*
+                * No conversion is needed, but we must still validate the data.
+                */
+               (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
                return (char *) s;
+       }
+
+       if (DatabaseEncoding->encoding == PG_SQL_ASCII)
+       {
+               /*
+                * No conversion is possible, but we must still validate the data,
+                * because the client-side code might have done string escaping
+                * using the selected client_encoding.  If the client encoding is
+                * ASCII-safe then we just do a straight validation under that
+                * encoding.  For an ASCII-unsafe encoding we have a problem:
+                * we dare not pass such data to the parser but we have no way
+                * to convert it.  We compromise by rejecting the data if it
+                * contains any non-ASCII characters.
+                */
+               if (PG_VALID_BE_ENCODING(ClientEncoding->encoding))
+                       (void) pg_verify_mbstr(ClientEncoding->encoding, s, len, false);
+               else
+               {
+                       int             i;
+
+                       for (i = 0; i < len; i++)
+                       {
+                               if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
+                                       ereport(ERROR,
+                                                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                                                        errmsg("invalid byte value for encoding \"%s\": 0x%02x",
+                                                                       pg_enc2name_tbl[PG_SQL_ASCII].name,
+                                                                       (unsigned char) s[i])));
+                       }
+               }
+               return (char *) s;
+       }
 
        return perform_default_encoding_conversion(s, len, true);
 }
@@ -383,9 +424,14 @@ pg_server_to_client(const char *s, int len)
        Assert(DatabaseEncoding);
        Assert(ClientEncoding);
 
-       if (ClientEncoding->encoding == DatabaseEncoding->encoding)
+       if (len <= 0)
                return (char *) s;
 
+       if (ClientEncoding->encoding == DatabaseEncoding->encoding ||
+               ClientEncoding->encoding == PG_SQL_ASCII ||
+               DatabaseEncoding->encoding == PG_SQL_ASCII)
+               return (char *) s;              /* assume data is valid */
+
        return perform_default_encoding_conversion(s, len, false);
 }
 
@@ -404,9 +450,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_
                                dest_encoding;
        FmgrInfo   *flinfo;
 
-       if (len <= 0)
-               return (char *) src;
-
        if (is_client_to_server)
        {
                src_encoding = ClientEncoding->encoding;
@@ -423,12 +466,6 @@ perform_default_encoding_conversion(const char *src, int len, bool is_client_to_
        if (flinfo == NULL)
                return (char *) src;
 
-       if (src_encoding == dest_encoding)
-               return (char *) src;
-
-       if (src_encoding == PG_SQL_ASCII || dest_encoding == PG_SQL_ASCII)
-               return (char *) src;
-
        result = palloc(len * 4 + 1);
 
        FunctionCall5(flinfo,
index 9d5b116c9a2e562cf4ea4d4dfdba44ac06a391fa..7cf4473d96f72afa585fdbefd7150c83af49ccaf 100644 (file)
@@ -96,7 +96,7 @@ static int    pg_euc2wchar_with_len
        return (cnt);
 }
 
-static int
+static inline int
 pg_euc_mblen(const unsigned char *s)
 {
        int                     len;
@@ -112,7 +112,7 @@ pg_euc_mblen(const unsigned char *s)
        return (len);
 }
 
-static int
+static inline int
 pg_euc_dsplen(const unsigned char *s)
 {
        int                     len;
@@ -396,7 +396,7 @@ pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
 }
 
 /*
- * returns the byte length of a UTF8 word pointed to by s
+ * returns the byte length of a UTF8 character pointed to by s
  */
 int
 pg_utf_mblen(const unsigned char *s)
@@ -720,229 +720,642 @@ pg_gb18030_dsplen(const unsigned char *s)
        return (len);
 }
 
+/*
+ *-------------------------------------------------------------------
+ * multibyte sequence validators
+ *
+ * These functions accept "s", a pointer to the first byte of a string,
+ * and "len", the remaining length of the string.  If there is a validly
+ * encoded character beginning at *s, return its length in bytes; else
+ * return -1.
+ *
+ * The functions can assume that len > 0 and that *s != '\0', but they must
+ * test for and reject zeroes in any additional bytes of a multibyte character.
+ *
+ * Note that this definition allows the function for a single-byte
+ * encoding to be just "return 1".
+ *-------------------------------------------------------------------
+ */
 
-pg_wchar_tbl pg_wchar_table[] = {
-       {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, 1},          /* 0; PG_SQL_ASCII      */
-       {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, 3},          /* 1; PG_EUC_JP */
-       {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, 3},          /* 2; PG_EUC_CN */
-       {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, 3},          /* 3; PG_EUC_KR */
-       {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, 3},          /* 4; PG_EUC_TW */
-       {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, 3},          /* 5; PG_JOHAB */
-       {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, 4},        /* 6; PG_UTF8 */
-       {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, 3}, /* 7; PG_MULE_INTERNAL */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 8; PG_LATIN1 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 9; PG_LATIN2 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 10; PG_LATIN3 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 11; PG_LATIN4 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 12; PG_LATIN5 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 13; PG_LATIN6 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 14; PG_LATIN7 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 15; PG_LATIN8 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 16; PG_LATIN9 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 17; PG_LATIN10 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 18; PG_WIN1256 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 19; PG_WIN1258 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 20; PG_WIN874 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 21; PG_KOI8 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 22; PG_WIN1251 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 22; PG_WIN1252 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 23; PG_WIN866 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 24; ISO-8859-5 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 25; ISO-8859-6 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 26; ISO-8859-7 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 27; ISO-8859-8 */
-       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, 1},       /* 28; PG_WIN1250 */
-       {0, pg_sjis_mblen, pg_sjis_dsplen, 2},          /* 29; PG_SJIS */
-       {0, pg_big5_mblen, pg_big5_dsplen, 2},          /* 30; PG_BIG5 */
-       {0, pg_gbk_mblen, pg_gbk_dsplen, 2},            /* 31; PG_GBK */
-       {0, pg_uhc_mblen, pg_uhc_dsplen, 2},            /* 32; PG_UHC */
-       {0, pg_gb18030_mblen, pg_gb18030_dsplen, 2} /* 33; PG_GB18030 */
-};
+/*
+ * Verifier used for PG_SQL_ASCII: every character is one byte long and is
+ * accepted unconditionally, so s and len are not examined.
+ */
+static int
+pg_ascii_verifier(const unsigned char *s, int len)
+{
+       return 1;
+}
 
-/* returns the byte length of a word for mule internal code */
-int
-pg_mic_mblen(const unsigned char *mbstr)
+/*
+ * True if c lies in 0xa1..0xfe.  The argument is evaluated twice, so do
+ * not pass an expression with side effects.
+ */
+#define IS_EUC_RANGE_VALID(c)  ((c) >= 0xa1 && (c) <= 0xfe)
+
+/*
+ * Verifier for EUC_JP.
+ *
+ * Returns the byte length (1, 2 or 3) of the character starting at *s, or
+ * -1 if the character would extend past len bytes or contains a byte
+ * outside the range accepted for its position.
+ */
+static int
+pg_eucjp_verifier(const unsigned char *s, int len)
 {
-       return (pg_mule_mblen(mbstr));
+       int                     l;
+       unsigned char c1, c2;
+
+       c1 = *s++;
+
+       switch (c1)
+       {
+               case SS2:               /* JIS X 0201 */
+                       l = 2;
+                       if (l > len)
+                               return -1;
+                       c2 = *s++;
+                       /* second byte is restricted to 0xa1..0xdf here */
+                       if (c2 < 0xa1 || c2 > 0xdf)
+                               return -1;
+                       break;
+
+               case SS3:               /* JIS X 0212 */
+                       l = 3;
+                       if (l > len)
+                               return -1;
+                       /* c2 is reused to check both trailing bytes in turn */
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       break;
+
+               default:
+                       if (IS_HIGHBIT_SET(c1))         /* JIS X 0208? */
+                       {
+                               l = 2;
+                               if (l > len)
+                                       return -1;
+                               if (!IS_EUC_RANGE_VALID(c1))
+                                       return -1;
+                               c2 = *s++;
+                               if (!IS_EUC_RANGE_VALID(c2))
+                                       return -1;
+                       }
+                       else            /* must be ASCII */
+                       {
+                               l = 1;
+                       }
+                       break;
+       }
+
+       return l;
 }
 
-/*
- * Returns the byte length of a multibyte word.
- */
-int
-pg_encoding_mblen(int encoding, const char *mbstr)
+/*
+ * Verifier for EUC_KR (also used for EUC_CN through the pg_euccn_verifier
+ * macro).
+ *
+ * A byte without the high bit is a one-byte character.  Otherwise the
+ * character is two bytes long and both bytes must satisfy
+ * IS_EUC_RANGE_VALID.  Returns the byte length, or -1 if the character is
+ * truncated by len or fails the range checks.
+ */
+static int
+pg_euckr_verifier(const unsigned char *s, int len)
 {
-       Assert(PG_VALID_ENCODING(encoding));
+       int                     l;
+       unsigned char c1, c2;
 
-       return ((encoding >= 0 &&
-                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
-               ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) :
-       ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr)));
+       c1 = *s++;
+
+       if (IS_HIGHBIT_SET(c1))
+       {
+               l = 2;
+               if (l > len)
+                       return -1;
+               if (!IS_EUC_RANGE_VALID(c1))
+                       return -1;
+               c2 = *s++;
+               if (!IS_EUC_RANGE_VALID(c2))
+                       return -1;
+       }
+       else            /* must be ASCII */
+       {
+               l = 1;
+       }
+
+       return l;
 }
 
-/*
- * Returns the display length of a multibyte word.
- */
-int
-pg_encoding_dsplen(int encoding, const char *mbstr)
+/* EUC-CN byte sequences are exactly the same as EUC-KR */
+#define pg_euccn_verifier      pg_euckr_verifier
+
+/*
+ * Verifier for EUC_TW.
+ *
+ * Returns the byte length (1, 2 or 4) of the character starting at *s, or
+ * -1 if the character is truncated by len, starts with SS3, or contains a
+ * trailing byte outside the accepted range.
+ */
+static int
+pg_euctw_verifier(const unsigned char *s, int len)
 {
-       Assert(PG_VALID_ENCODING(encoding));
+       int                     l;
+       unsigned char c1, c2;
 
-       return ((encoding >= 0 &&
-                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
-          ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) :
-       ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr)));
+       c1 = *s++;
+
+       switch (c1)
+       {
+               case SS2:               /* CNS 11643 Plane 1-7 */
+                       l = 4;
+                       if (l > len)
+                               return -1;
+                       c2 = *s++;
+                       /* second byte must be 0xa1..0xa7 */
+                       if (c2 < 0xa1 || c2 > 0xa7)
+                               return -1;
+                       /* c2 is reused to check the third and fourth bytes */
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       c2 = *s++;
+                       if (!IS_EUC_RANGE_VALID(c2))
+                               return -1;
+                       break;
+
+               case SS3:               /* unused */
+                       return -1;
+
+               default:
+                       if (IS_HIGHBIT_SET(c1))         /* CNS 11643 Plane 1 */
+                       {
+                               l = 2;
+                               if (l > len)
+                                       return -1;
+                               /* no further range check on c1? */
+                               c2 = *s++;
+                               if (!IS_EUC_RANGE_VALID(c2))
+                                       return -1;
+                       }
+                       else            /* must be ASCII */
+                       {
+                               l = 1;
+                       }
+                       break;
+       }
+       return l;
 }
 
-/*
- * fetch maximum length of a char encoding
- */
-int
-pg_encoding_max_length(int encoding)
+/*
+ * Verifier for JOHAB.
+ *
+ * The character length comes from pg_johab_mblen().  Returns that length,
+ * or -1 if it exceeds len or if, for a character whose first byte has the
+ * high bit set, any following byte fails IS_EUC_RANGE_VALID.
+ */
+static int
+pg_johab_verifier(const unsigned char *s, int len)
 {
-       Assert(PG_VALID_ENCODING(encoding));
+       int l, mbl;
+       unsigned char c;
 
-       return pg_wchar_table[encoding].maxmblen;
+       l = mbl = pg_johab_mblen(s);
+
+       if (len < l)
+               return -1;
+
+       /* first byte without the high bit: accepted without further checks */
+       if (!IS_HIGHBIT_SET(*s))
+               return mbl;
+
+       /* l counts down over the bytes after the first one */
+       while (--l > 0)
+       {
+               c = *++s;
+               if (!IS_EUC_RANGE_VALID(c))
+                       return -1;
+       }
+       return mbl;
 }
 
-#ifndef FRONTEND
+/*
+ * Verifier for MULE_INTERNAL.
+ *
+ * The character length is taken from pg_mule_mblen().  Returns that length,
+ * or -1 if it exceeds len or if any byte after the first lacks the high bit.
+ */
+static int
+pg_mule_verifier(const unsigned char *s, int len)
+{
+       const int       mbl = pg_mule_mblen(s);
+       int                     i;
+
+       /* character would run past the end of the input */
+       if (mbl > len)
+               return -1;
+
+       /* every byte after the first must have its high bit set */
+       for (i = 1; i < mbl; i++)
+       {
+               if (!IS_HIGHBIT_SET(s[i]))
+                       return -1;
+       }
+
+       return mbl;
+}
+
+/*
+ * Verifier shared by the single-byte encodings in pg_wchar_table: every
+ * character is one byte long and is accepted unconditionally, so s and len
+ * are not examined.
+ */
+static int
+pg_latin1_verifier(const unsigned char *s, int len)
+{
+       return 1;
+}
+
+/*
+ * Verifier for SJIS.
+ *
+ * The character length is taken from pg_sjis_mblen().  Returns that length,
+ * or -1 if it exceeds len or if a multibyte character does not consist of
+ * an ISSJISHEAD byte followed by an ISSJISTAIL byte.
+ */
+static int
+pg_sjis_verifier(const unsigned char *s, int len)
+{
+       const int       mbl = pg_sjis_mblen(s);
+       unsigned char lead,
+                               trail;
+
+       /* character would run past the end of the input */
+       if (mbl > len)
+               return -1;
+
+       /* pg_sjis_mblen already verified it */
+       if (mbl == 1)
+               return mbl;
+
+       lead = s[0];
+       trail = s[1];
+
+       if (ISSJISHEAD(lead) && ISSJISTAIL(trail))
+               return mbl;
+
+       return -1;
+}
+
+/*
+ * Verifier for BIG5.
+ *
+ * The character length is taken from pg_big5_mblen().  Returns that length,
+ * or -1 if it exceeds len or if any byte after the first is zero.
+ */
+static int
+pg_big5_verifier(const unsigned char *s, int len)
+{
+       const int       mbl = pg_big5_mblen(s);
+       int                     i;
+
+       /* character would run past the end of the input */
+       if (mbl > len)
+               return -1;
+
+       /* reject embedded zero bytes in the trailing bytes */
+       for (i = 1; i < mbl; i++)
+       {
+               if (s[i] == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+/*
+ * Verifier for GBK.
+ *
+ * The character length is taken from pg_gbk_mblen().  Returns that length,
+ * or -1 if it exceeds len or if any byte after the first is zero.
+ */
+static int
+pg_gbk_verifier(const unsigned char *s, int len)
+{
+       const int       mbl = pg_gbk_mblen(s);
+       int                     i;
+
+       /* character would run past the end of the input */
+       if (mbl > len)
+               return -1;
+
+       /* reject embedded zero bytes in the trailing bytes */
+       for (i = 1; i < mbl; i++)
+       {
+               if (s[i] == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+/*
+ * Verifier for UHC.
+ *
+ * The character length is taken from pg_uhc_mblen().  Returns that length,
+ * or -1 if it exceeds len or if any byte after the first is zero.
+ */
+static int
+pg_uhc_verifier(const unsigned char *s, int len)
+{
+       const int       mbl = pg_uhc_mblen(s);
+       int                     i;
+
+       /* character would run past the end of the input */
+       if (mbl > len)
+               return -1;
+
+       /* reject embedded zero bytes in the trailing bytes */
+       for (i = 1; i < mbl; i++)
+       {
+               if (s[i] == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
 
+/*
+ * Verifier for GB18030.
+ *
+ * The character length is taken from pg_gb18030_mblen().  Returns that
+ * length, or -1 if it exceeds len or if any byte after the first is zero.
+ */
+static int
+pg_gb18030_verifier(const unsigned char *s, int len)
+{
+       const int       mbl = pg_gb18030_mblen(s);
+       int                     i;
+
+       /* character would run past the end of the input */
+       if (mbl > len)
+               return -1;
+
+       /* reject embedded zero bytes in the trailing bytes */
+       for (i = 1; i < mbl; i++)
+       {
+               if (s[i] == '\0')
+                       return -1;
+       }
+
+       return mbl;
+}
+
+/*
+ * Verifier for UTF8.
+ *
+ * The character length is taken from pg_utf_mblen().  Returns that length,
+ * or -1 if it exceeds len or pg_utf8_islegal() rejects the sequence.
+ */
+static int
+pg_utf8_verifier(const unsigned char *s, int len)
+{
+       const int       mbl = pg_utf_mblen(s);
+
+       /* the legality check is only reached when all mbl bytes are present */
+       if (mbl > len || !pg_utf8_islegal(s, mbl))
+               return -1;
+
+       return mbl;
+}
+
+/*
+ * Check for validity of a single UTF-8 encoded character
+ *
+ * This directly implements the rules in RFC3629.  The bizarre-looking
+ * restrictions on the second byte are meant to ensure that there isn't
+ * more than one encoding of a given Unicode character point; that is,
+ * you may not use a longer-than-necessary byte sequence with high order
+ * zero bits to represent a character that would fit in fewer bytes.
+ * To do otherwise is to create security hazards (eg, create an apparent
+ * non-ASCII character that decodes to plain ASCII).
+ *
+ * length is assumed to have been obtained by pg_utf_mblen(), and the
+ * caller must have checked that that many bytes are present in the buffer.
+ *
+ * Returns true if the length-byte sequence at source passes every check,
+ * false otherwise.  The switch is entered at the case matching length and
+ * falls through the shorter cases, so trailing bytes are checked from the
+ * last one backwards and the first byte is checked last.
+ */
 bool
 pg_utf8_islegal(const unsigned char *source, int length)
 {
        unsigned char a;
-       const unsigned char *srcptr = source + length;
 
        switch (length)
        {
                default:
+                       /* reject lengths 5 and 6 for now */
                        return false;
-                       /* Everything else falls through when "true"... */
                case 4:
-                       if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
+                       a = source[3];
+                       if (a < 0x80 || a > 0xBF)
                                return false;
+                       /* FALL THRU */
                case 3:
-                       if ((a = (*--srcptr)) < 0x80 || a > 0xBF)
+                       a = source[2];
+                       if (a < 0x80 || a > 0xBF)
                                return false;
+                       /* FALL THRU */
                case 2:
-                       if ((a = (*--srcptr)) > 0xBF)
-                               return false;
+                       a = source[1];
+                       /* allowed range of the second byte depends on the first */
                        switch (*source)
                        {
-                                       /* no fall-through in this inner switch */
                                case 0xE0:
-                                       if (a < 0xA0)
+                                       if (a < 0xA0 || a > 0xBF)
                                                return false;
                                        break;
                                case 0xED:
-                                       if (a > 0x9F)
+                                       if (a < 0x80 || a > 0x9F)
                                                return false;
                                        break;
                                case 0xF0:
-                                       if (a < 0x90)
+                                       if (a < 0x90 || a > 0xBF)
                                                return false;
                                        break;
                                case 0xF4:
-                                       if (a > 0x8F)
+                                       if (a < 0x80 || a > 0x8F)
                                                return false;
                                        break;
                                default:
-                                       if (a < 0x80)
+                                       if (a < 0x80 || a > 0xBF)
                                                return false;
+                                       break;
                        }
-
+                       /* FALL THRU */
                case 1:
-                       if (*source >= 0x80 && *source < 0xC2)
+                       a = *source;
+                       /* first byte: 0x80..0xC1 and anything above 0xF4 are rejected */
+                       if (a >= 0x80 && a < 0xC2)
+                               return false;
+                       if (a > 0xF4)
                                return false;
+                       break;
        }
-       if (*source > 0xF4)
-               return false;
        return true;
 }
 
+/*
+ *-------------------------------------------------------------------
+ * encoding info table
+ *-------------------------------------------------------------------
+ */
+pg_wchar_tbl pg_wchar_table[] = {
+       /* columns: to-wchar converter, mblen, dsplen, mbverify, maxmblen */
+       {pg_ascii2wchar_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1},               /* 0; PG_SQL_ASCII      */
+       {pg_eucjp2wchar_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},               /* 1; PG_EUC_JP */
+       {pg_euccn2wchar_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 3},               /* 2; PG_EUC_CN */
+       {pg_euckr2wchar_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},               /* 3; PG_EUC_KR */
+       {pg_euctw2wchar_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 3},               /* 4; PG_EUC_TW */
+       {pg_johab2wchar_with_len, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},               /* 5; PG_JOHAB */
+       {pg_utf2wchar_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},      /* 6; PG_UTF8 */
+       {pg_mule2wchar_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 3}, /* 7; PG_MULE_INTERNAL */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 8; PG_LATIN1 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 9; PG_LATIN2 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 10; PG_LATIN3 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 11; PG_LATIN4 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 12; PG_LATIN5 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 13; PG_LATIN6 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 14; PG_LATIN7 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 15; PG_LATIN8 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 16; PG_LATIN9 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 17; PG_LATIN10 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 18; PG_WIN1256 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 19; PG_WIN1258 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 20; PG_WIN874 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 21; PG_KOI8 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 22; PG_WIN1251 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 23; PG_WIN1252 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 24; PG_WIN866 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 25; ISO-8859-5 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 26; ISO-8859-6 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 27; ISO-8859-7 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 28; ISO-8859-8 */
+       {pg_latin12wchar_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1},   /* 29; PG_WIN1250 */
+       {0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2},                /* 30; PG_SJIS */
+       {0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2},                /* 31; PG_BIG5 */
+       {0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},           /* 32; PG_GBK */
+       {0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},           /* 33; PG_UHC */
+       {0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 2} /* 34; PG_GB18030 */
+};
+
+/*
+ * returns the byte length of a character for mule internal code
+ * (simply forwards to pg_mule_mblen)
+ */
+int
+pg_mic_mblen(const unsigned char *mbstr)
+{
+       int                     mblen = pg_mule_mblen(mbstr);
+
+       return mblen;
+}
+
+/*
+ * Returns the byte length of the multibyte character at mbstr, as computed
+ * by the mblen function registered for the encoding in pg_wchar_table.
+ */
+int
+pg_encoding_mblen(int encoding, const char *mbstr)
+{
+       const unsigned char *ustr = (const unsigned char *) mbstr;
+       int                     enc = encoding;
+
+       Assert(PG_VALID_ENCODING(encoding));
+
+       /* ids outside the table are handled with the PG_SQL_ASCII entry */
+       if (enc < 0 || enc >= sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl))
+               enc = PG_SQL_ASCII;
+
+       return (*pg_wchar_table[enc].mblen) (ustr);
+}
+
+/*
+ * Returns the display length of the multibyte character at mbstr, as
+ * computed by the dsplen function registered for the encoding in
+ * pg_wchar_table.
+ */
+int
+pg_encoding_dsplen(int encoding, const char *mbstr)
+{
+       const unsigned char *ustr = (const unsigned char *) mbstr;
+       int                     enc = encoding;
+
+       Assert(PG_VALID_ENCODING(encoding));
+
+       /* ids outside the table are handled with the PG_SQL_ASCII entry */
+       if (enc < 0 || enc >= sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl))
+               enc = PG_SQL_ASCII;
+
+       return (*pg_wchar_table[enc].dsplen) (ustr);
+}
 
 /*
- * Verify mbstr to make sure that it has a valid character sequence.
- * mbstr is not necessarily NULL terminated; length of mbstr is
+ * Verify the first multibyte character of the given string.
+ * Return its byte length if good, -1 if bad.  (See comments above for
+ * full details of the mbverify API.)
+ */
+int
+pg_encoding_verifymb(int encoding, const char *mbstr, int len)
+{
+       const unsigned char *ustr = (const unsigned char *) mbstr;
+       int                     enc = encoding;
+
+       Assert(PG_VALID_ENCODING(encoding));
+
+       /* ids outside the table are handled with the PG_SQL_ASCII entry */
+       if (enc < 0 || enc >= sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl))
+               enc = PG_SQL_ASCII;
+
+       /* dispatch to the verifier registered for the chosen encoding */
+       return (*pg_wchar_table[enc].mbverify) (ustr, len);
+}
+
+/*
+ * fetch maximum length of a given encoding
+ *
+ * Returns the maxmblen value stored in pg_wchar_table for the encoding.
+ * The Assert states the expectation that the id is valid; the explicit
+ * range test additionally keeps an out-of-range id from indexing past the
+ * table, falling back to the PG_SQL_ASCII entry in the same way as
+ * pg_encoding_mblen() and pg_encoding_dsplen().
+ */
+int
+pg_encoding_max_length(int encoding)
+{
+       Assert(PG_VALID_ENCODING(encoding));
+
+       return ((encoding >= 0 &&
+                        encoding < sizeof(pg_wchar_table) / sizeof(pg_wchar_tbl)) ?
+                       pg_wchar_table[encoding].maxmblen :
+                       pg_wchar_table[PG_SQL_ASCII].maxmblen);
+}
+
+#ifndef FRONTEND
+
+/*
+ * fetch maximum length of the encoding for the current database
+ * (the maxmblen entry of pg_wchar_table for GetDatabaseEncoding())
+ */
+int
+pg_database_encoding_max_length(void)
+{
+       int                     dbenc = GetDatabaseEncoding();
+
+       return pg_wchar_table[dbenc].maxmblen;
+}
+
+/*
+ * Check that mbstr (len bytes) is validly encoded in the encoding of the
+ * current database.  This is pg_verify_mbstr() with the encoding argument
+ * filled in from GetDatabaseEncoding(); noError is passed through.
+ */
+bool
+pg_verifymbstr(const char *mbstr, int len, bool noError)
+{
+       int                     dbenc = GetDatabaseEncoding();
+
+       return pg_verify_mbstr(dbenc, mbstr, len, noError);
+}
+
+/*
+ * Verify mbstr to make sure that it is validly encoded in the specified
+ * encoding.
+ *
+ * mbstr is not necessarily zero terminated; length of mbstr is
  * specified by len.
  *
  * If OK, return TRUE. If a problem is found, return FALSE when noError is
  * true; when noError is false, ereport() a descriptive message.
+ *
+ * A zero byte anywhere in the first len bytes counts as a problem.  For
+ * encodings whose maximum character length is 1 that is the only check;
+ * otherwise bytes without the high bit are skipped one at a time and
+ * everything else is handed to the encoding's mbverify function.
  */
 bool
-pg_verifymbstr(const char *mbstr, int len, bool noError)
+pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
 {
-       int                     l;
-       int                     i;
-       int                     encoding;
+       mbverifier      mbverify;
+
+       Assert(PG_VALID_ENCODING(encoding));
+
+       /*
+        * In single-byte encodings, we need only reject nulls (\0).
+        */
+       if (pg_encoding_max_length(encoding) <= 1)
+       {
+               const char *nullpos = memchr(mbstr, 0, len);
 
-       /* we do not need any check in single-byte encodings */
-       if (pg_database_encoding_max_length() <= 1)
-               return true;
+               if (nullpos == NULL)
+                       return true;
+               if (noError)
+                       return false;
+               /* NOTE(review): relies on the ereport(ERROR) inside not returning */
+               report_invalid_encoding(encoding, nullpos, 1);
+       }
 
-       encoding = GetDatabaseEncoding();
+       /* fetch function pointer just once */
+       mbverify = pg_wchar_table[encoding].mbverify;
 
-       while (len > 0 && *mbstr)
+       while (len > 0)
        {
-               l = pg_mblen(mbstr);
+               int                     l;
 
-               /* special UTF-8 check */
-               if (encoding == PG_UTF8)
+               /* fast path for ASCII-subset characters */
+               if (!IS_HIGHBIT_SET(*mbstr))
                {
-                       if (!pg_utf8_islegal((const unsigned char *) mbstr, l))
+                       if (*mbstr != '\0')
                        {
-                               if (noError)
-                                       return false;
-                               ereport(ERROR,
-                                               (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                                errmsg("invalid UTF-8 byte sequence detected near byte 0x%02x",
-                                                               (unsigned char) *mbstr)));
+                               mbstr++;
+                               len--;
+                               continue;
                        }
+                       /* embedded zero byte */
+                       if (noError)
+                               return false;
+                       report_invalid_encoding(encoding, mbstr, len);
                }
-               else
-               {
-                       for (i = 1; i < l; i++)
-                       {
-                               /*
-                                * we expect that every multibyte char consists of bytes
-                                * having the 8th bit set
-                                */
-                               if (i >= len || (mbstr[i] & 0x80) == 0)
-                               {
-                                       char            buf[8 * 2 + 1];
-                                       char       *p = buf;
-                                       int                     j,
-                                                               jlimit;
-
-                                       if (noError)
-                                               return false;
-
-                                       jlimit = Min(l, len);
-                                       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
 
-                                       for (j = 0; j < jlimit; j++)
-                                               p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+               /* returns the character's byte length, or a negative value if bad */
+               l = (*mbverify) ((const unsigned char *) mbstr, len);
 
-                                       ereport(ERROR,
-                                                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                                       errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
-                                                  GetDatabaseEncodingName(), buf)));
-                               }
-                       }
+               if (l < 0)
+               {
+                       if (noError)
+                               return false;
+                       report_invalid_encoding(encoding, mbstr, len);
                }
-               len -= l;
+
                mbstr += l;
+               len -= l;
        }
        return true;
 }
 
 /*
- * fetch maximum length of a char encoding for the current database
+ * report_invalid_encoding: complain about invalid multibyte character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ *
+ * The message shows, in hex, the bytes of the character at mbstr: as many
+ * as pg_encoding_mblen() reports, but no more than len and no more than 8.
+ * The error is raised with ereport(ERROR) using
+ * ERRCODE_CHARACTER_NOT_IN_REPERTOIRE.
  */
-int
-pg_database_encoding_max_length(void)
+void
+report_invalid_encoding(int encoding, const char *mbstr, int len)
 {
-       return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
+       int                     l = pg_encoding_mblen(encoding, mbstr);
+       char            buf[8 * 2 + 1];
+       char       *p = buf;
+       int                     j,
+                               jlimit;
+
+       jlimit = Min(l, len);
+       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
+
+       /* two hex digits per byte, appended at p */
+       for (j = 0; j < jlimit; j++)
+               p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+
+       ereport(ERROR,
+                       (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                        errmsg("invalid byte sequence for encoding \"%s\": 0x%s",
+                                       pg_enc2name_tbl[encoding].name,
+                                       buf)));
+}
+
+/*
+ * report_untranslatable_char: complain about untranslatable character
+ *
+ * note: len is remaining length of string, not length of character;
+ * len must be greater than zero, as we always examine the first byte.
+ *
+ * The message shows, in hex, the bytes of the character at mbstr: as many
+ * as pg_encoding_mblen() reports for src_encoding, but no more than len and
+ * no more than 8.  The error is raised with ereport(ERROR) using
+ * ERRCODE_UNTRANSLATABLE_CHARACTER and names both encodings.
+ */
+void
+report_untranslatable_char(int src_encoding, int dest_encoding,
+                                                  const char *mbstr, int len)
+{
+       int                     l = pg_encoding_mblen(src_encoding, mbstr);
+       char            buf[8 * 2 + 1];
+       char       *p = buf;
+       int                     j,
+                               jlimit;
+
+       jlimit = Min(l, len);
+       jlimit = Min(jlimit, 8);        /* prevent buffer overrun */
+
+       /* two hex digits per byte, appended at p */
+       for (j = 0; j < jlimit; j++)
+               p += sprintf(p, "%02x", (unsigned char) mbstr[j]);
+
+       ereport(ERROR,
+                       (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
+                        errmsg("character 0x%s of encoding \"%s\" has no equivalent in \"%s\"",
+                                       buf,
+                                       pg_enc2name_tbl[src_encoding].name,
+                                       pg_enc2name_tbl[dest_encoding].name)));
 }
 
 #endif
index fd9410c15ef58e2b05619c04a8354c9f0dd27334..e95595fd6b72fd1b0233973e4b273e940948bc86 100644 (file)
@@ -705,6 +705,8 @@ typedef NameData *Name;
 
 /* msb for char */
 #define CSIGNBIT (0x80)
+#define HIGHBIT                                        (0x80)        /* mask for the top bit of a byte */
+#define IS_HIGHBIT_SET(ch)             ((unsigned char)(ch) & HIGHBIT)        /* yields 0x80 or 0, not 1 or 0 */
 
 #define STATUS_OK                              (0)
 #define STATUS_ERROR                   (-1)
index 7e21e5b02e4caf55c450c2d3446447bd4bb5374a..cb2c66a53ed60636b52b4d8f4a88e462a5558f8c 100644 (file)
@@ -23,11 +23,17 @@ typedef unsigned int pg_wchar;
 #define SS2 0x8e                               /* single shift 2 (JIS0201) */
 #define SS3 0x8f                               /* single shift 3 (JIS0212) */
 
+/*
+ * SJIS validation macros
+ *
+ * ISSJISHEAD(c) is true when c lies in 0x81..0x9f or 0xe0..0xfc.
+ * ISSJISTAIL(c) is true when c lies in 0x40..0x7e or 0x80..0xfc.
+ * Presumably these are the first- and second-byte ranges of a two-byte
+ * SJIS character (going by the names; confirm against the callers).
+ *
+ * The argument is expanded up to four times, so it must not have side
+ * effects, and it should be an unsigned byte value: a negative (signed
+ * char) argument fails every range test.
+ */
+#define ISSJISHEAD(c) (((c) >= 0x81 && (c) <= 0x9f) || ((c) >= 0xe0 && (c) <= 0xfc))
+#define ISSJISTAIL(c) (((c) >= 0x40 && (c) <= 0x7e) || ((c) >= 0x80 && (c) <= 0xfc))
+
 /*
  * Leading byte types or leading prefix byte for MULE internal code.
  * See http://www.xemacs.org for more details. (there is a doc titled
  * "XEmacs Internals Manual", "MULE Character Sets and Encodings"
- * section.
+ * section.)
  */
 /*
  * Is a leading byte for "official" single byte encodings?
@@ -64,7 +70,7 @@ typedef unsigned int pg_wchar;
 #define LC_ISO8859_8   0x88    /* Hebrew (not supported yet) */
 #define LC_JISX0201K   0x89    /* Japanese 1 byte kana */
 #define LC_JISX0201R   0x8a    /* Japanese 1 byte Roman */
-/* Note that 0x8b seems to be unused in as of Emacs 20.7.
+/* Note that 0x8b seems to be unused as of Emacs 20.7.
  * However, there might be a chance that 0x8b could be used
  * in later version of Emacs.
  */
@@ -135,13 +141,13 @@ typedef unsigned int pg_wchar;
 /* #define FREE                0xff    free (unused) */
 
 /*
- * Encoding numeral identificators
+ * PostgreSQL encoding identifiers
  *
  * WARNING: the order of this table must be same as order
  *                     in the pg_enc2name[] (mb/encnames.c) array!
  *
- *                     If you add some encoding don'y forget check
- *                     PG_ENCODING_[BE|FE]_LAST macros.
+ *                     If you add an encoding, don't forget to check
+ *                     the PG_ENCODING_BE_LAST macro.
  *
  *             The PG_SQL_ASCII is default encoding and must be = 0.
  */
@@ -203,8 +209,7 @@ typedef enum pg_enc
 #define PG_VALID_ENCODING(_enc) \
                ((_enc) >= 0 && (_enc) < _PG_LAST_ENCODING_)
 
-/* On FE are possible all encodings
- */
+/* All encodings are possible on the FE */
 #define PG_VALID_FE_ENCODING(_enc)     PG_VALID_ENCODING(_enc)
 
 /*
@@ -244,18 +249,21 @@ extern const char *pg_encoding_to_char(int encoding);
 typedef int (*mb2wchar_with_len_converter) (const unsigned char *from,
                                                                                                                pg_wchar *to,
                                                                                                                int len);
+
 typedef int (*mblen_converter) (const unsigned char *mbstr);
 
 typedef int (*mbdisplaylen_converter) (const unsigned char *mbstr);
 
+typedef int (*mbverifier) (const unsigned char *mbstr, int len);
+
 typedef struct
 {
        mb2wchar_with_len_converter mb2wchar_with_len;          /* convert a multibyte
                                                                                                                 * string to a wchar */
-       mblen_converter mblen;          /* returns the length of a multibyte char */
-       mbdisplaylen_converter dsplen;          /* returns the lenghth of a display
-                                                                                * length */
-       int                     maxmblen;               /* max bytes for a char in this charset */
+       mblen_converter mblen;          /* get byte length of a char */
+       mbdisplaylen_converter dsplen;          /* get display width of a char */
+       mbverifier      mbverify;               /* verify multibyte sequence */
+       int                     maxmblen;               /* max bytes for a char in this encoding */
 } pg_wchar_tbl;
 
 extern pg_wchar_tbl pg_wchar_table[];
@@ -288,6 +296,7 @@ extern int  pg_mblen(const char *mbstr);
 extern int     pg_dsplen(const char *mbstr);
 extern int     pg_encoding_mblen(int encoding, const char *mbstr);
 extern int     pg_encoding_dsplen(int encoding, const char *mbstr);
+extern int     pg_encoding_verifymb(int encoding, const char *mbstr, int len);
 extern int     pg_mule_mblen(const unsigned char *mbstr);
 extern int     pg_mic_mblen(const unsigned char *mbstr);
 extern int     pg_mbstrlen(const char *mbstr);
@@ -321,21 +330,32 @@ extern char *pg_server_to_client(const char *s, int len);
 extern unsigned short BIG5toCNS(unsigned short big5, unsigned char *lc);
 extern unsigned short CNStoBIG5(unsigned short cns, unsigned char lc);
 
-extern void LocalToUtf(unsigned char *iso, unsigned char *utf,
-                  pg_local_to_utf *map, int size, int encoding, int len);
+extern void LocalToUtf(const unsigned char *iso, unsigned char *utf,
+                  const pg_local_to_utf *map, int size, int encoding, int len);
 
-extern void UtfToLocal(unsigned char *utf, unsigned char *iso,
-                  pg_utf_to_local *map, int size, int len);
+extern void UtfToLocal(const unsigned char *utf, unsigned char *iso,
+                  const pg_utf_to_local *map, int size, int encoding, int len);
 
 extern bool pg_verifymbstr(const char *mbstr, int len, bool noError);
-
-extern void pg_ascii2mic(unsigned char *src, unsigned char *dest, int len);
-extern void pg_mic2ascii(unsigned char *src, unsigned char *dest, int len);
-extern void pg_print_bogus_char(unsigned char **mic, unsigned char **p);
-extern void latin2mic(unsigned char *l, unsigned char *p, int len, int lc);
-extern void mic2latin(unsigned char *mic, unsigned char *p, int len, int lc);
-extern void latin2mic_with_table(unsigned char *l, unsigned char *p, int len, int lc, unsigned char *tab);
-extern void mic2latin_with_table(unsigned char *mic, unsigned char *p, int len, int lc, unsigned char *tab);
+extern bool pg_verify_mbstr(int encoding, const char *mbstr, int len,
+                                                       bool noError);
+
+extern void report_invalid_encoding(int encoding, const char *mbstr, int len);
+extern void report_untranslatable_char(int src_encoding, int dest_encoding,
+                                                                          const char *mbstr, int len);
+
+extern void pg_ascii2mic(const unsigned char *l, unsigned char *p, int len);
+extern void pg_mic2ascii(const unsigned char *mic, unsigned char *p, int len);
+extern void latin2mic(const unsigned char *l, unsigned char *p, int len,
+                                         int lc, int encoding);
+extern void mic2latin(const unsigned char *mic, unsigned char *p, int len,
+                                         int lc, int encoding);
+extern void latin2mic_with_table(const unsigned char *l, unsigned char *p,
+                                                                int len, int lc, int encoding,
+                                                                const unsigned char *tab);
+extern void mic2latin_with_table(const unsigned char *mic, unsigned char *p,
+                                                                int len, int lc, int encoding,
+                                                                const unsigned char *tab);
 
 extern bool pg_utf8_islegal(const unsigned char *source, int length);