BEGIN(xe);
                                }
-<xeu>.                 |
-<xeu>\n                        |
+<xeu>.                 { yyerror("invalid Unicode surrogate pair"); }
+<xeu>\n                        { yyerror("invalid Unicode surrogate pair"); }
 <xeu><<EOF>>   { yyerror("invalid Unicode surrogate pair"); }
-
 <xe,xeu>{xeunicodefail}        {
                                                ereport(ERROR,
                                                                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                                                                 errmsg("invalid Unicode escape"),
                                                                 errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
                                                                 lexer_errposition()));
-                                       }
-
+                               }
 <xe>{xeescape}  {
                                        if (yytext[1] == '\'')
                                        {
 
 static YY_BUFFER_STATE prepare_buffer(const char *txt, int len,
                                                                          char **txtcopy);
 static void emit(const char *txt, int len);
+static bool is_utf16_surrogate_first(uint32 c);
 
 #define ECHO emit(yytext, yyleng)
 
  *  <xdolq> $foo$ quoted strings
  *  <xui> quoted identifier with Unicode escapes
  *  <xus> quoted string with Unicode escapes
+ *  <xeu> Unicode surrogate pair in extended quoted string
  */
 
 %x xb
 %x xdolq
 %x xui
 %x xus
+%x xeu
 /* Additional exclusive states for psql only: lex backslash commands */
 %x xslashcmd
 %x xslasharg
  * did not end with a newline.
  *
  * XXX perhaps \f (formfeed) should be treated as a newline as well?
+ *
+ * XXX if you change the set of whitespace characters, fix scanner_isspace()
+ * to agree, and see also the plpgsql lexer.
  */
 
 space                  [ \t\n\r\f]
 xeescape               [\\][^0-7]
 xeoctesc               [\\][0-7]{1,3}
 xehexesc               [\\]x[0-9A-Fa-f]{1,2}
+/*
+ * Unicode escapes inside extended (E'...') strings.
+ * xeunicode matches a complete escape: a backslash followed by either
+ * "u" and exactly 4 hex digits or "U" and exactly 8 hex digits.
+ * xeunicodefail matches the truncated forms (0-3 digits after "u",
+ * 0-7 digits after "U") so that they can be handled separately.
+ */
+xeunicode              [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
+xeunicodefail  [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
 
 /* Extended quote
  * xqdouble implements embedded quote, ''''
 
 typecast               "::"
 
+/* these two token types are used by PL/pgsql, though not in core SQL */
+dot_dot                        \.\.
+colon_equals   ":="
+
 /*
  * "self" is the set of chars that should be returned as single-character
  * tokens.  "op_chars" is the set of chars that can make up "Op" tokens,
 <xe>{xeinside}  {
                                        ECHO;
                                }
+<xe>{xeunicode} {
+                                       /*
+                                        * yytext is the whole escape; skip the two-character
+                                        * backslash-u / backslash-U prefix and parse the hex
+                                        * digits that follow.
+                                        */
+                                       uint32 c = strtoul(yytext+2, NULL, 16);
+
+                                       /*
+                                        * A first surrogate means a second escape is expected
+                                        * next, so track that in the xeu state.  The text
+                                        * itself is always passed through unchanged.
+                                        */
+                                       if (is_utf16_surrogate_first(c))
+                                               BEGIN(xeu);
+                                       ECHO;
+                               }
+<xeu>{xeunicode} {
+                                       /*
+                                        * Second Unicode escape after a first surrogate:
+                                        * return to the xe state and echo the text.  The
+                                        * escape's value is not examined in this rule.
+                                        */
+                                       BEGIN(xe);
+                                       ECHO;
+                               }
+        /*
+         * Any other character seen while in xeu is echoed as-is.  These two
+         * rules do not call BEGIN, so the scanner stays in xeu afterwards.
+         * NOTE(review): "." also matches a quote character; confirm against
+         * the rest of the lexer that a closing quote seen while in xeu is
+         * handled as intended.
+         */
+<xeu>.                 { ECHO; }
+<xeu>\n                        { ECHO; }
+<xe,xeu>{xeunicodefail}        {
+                                       /*
+                                        * Truncated Unicode escape (too few hex digits).
+                                        * This rule only echoes the text; it reports no
+                                        * error and does not change the start state.
+                                        */
+                                       ECHO;
+                               }
 <xe>{xeescape}  {
                                        ECHO;
                                }
                                        ECHO;
                                }
 
+        /*
+         * ".." and ":=" are matched as single tokens and echoed unchanged.
+         */
+{dot_dot}              {
+                                       ECHO;
+                               }
+
+{colon_equals} {
+                                       ECHO;
+                               }
+
        /*
         * These rules are specific to psql --- they implement parenthesis
         * counting and detection of command-ending semicolon.  These must
                }
        }
 }
+
+/*
+ * is_utf16_surrogate_first
+ *
+ * Returns true if c lies in the range 0xD800..0xDBFF (inclusive), i.e. it
+ * is the first half of a UTF-16 surrogate pair; returns false otherwise.
+ */
+static bool
+is_utf16_surrogate_first(uint32 c)
+{
+       return (c >= 0xD800 && c <= 0xDBFF);
+}