Avoid confusion start_parse_str function with tsearch V1

author Teodor Sigaev <teodor@sigaev.ru>

Fri, 5 Dec 2003 14:28:21 +0000 (14:28 +0000)

committer Teodor Sigaev <teodor@sigaev.ru>

Fri, 5 Dec 2003 14:28:21 +0000 (14:28 +0000)
author Teodor Sigaev <teodor@sigaev.ru>
Fri, 5 Dec 2003 14:28:21 +0000 (14:28 +0000)
committer Teodor Sigaev <teodor@sigaev.ru>
Fri, 5 Dec 2003 14:28:21 +0000 (14:28 +0000)
diff --git a/contrib/tsearch2/wordparser/parser.h b/contrib/tsearch2/wordparser/parser.h

new file mode 100644 (file)

index 0000000..7fb2d0a
--- /dev/null
+++ b/contrib/tsearch2/wordparser/parser.h
@@ -0,0 +1,10 @@
+#ifndef __PARSER_H__  /* include guard */
+#define __PARSER_H__
+
+extern char       *token;  /* text of the most recently scanned token; set by the scanner rules in parser.l */
+extern int                     tokenlen;  /* length the scanner recorded for 'token' */
+int                    tsearch2_yylex(void);  /* scan the next token; the return value is its token type code */
+void           tsearch2_start_parse_str(char *, int);  /* begin scanning a buffer given as (pointer, byte count) */
+void           tsearch2_end_parse(void);  /* release the scanner state set up by tsearch2_start_parse_str() */
+
+#endif
diff --git a/contrib/tsearch2/wordparser/parser.l b/contrib/tsearch2/wordparser/parser.l

new file mode 100644 (file)

index 0000000..fd82448
--- /dev/null
+++ b/contrib/tsearch2/wordparser/parser.l
@@ -0,0 +1,296 @@
+%{
+#include "postgres.h"
+
+#include "deflex.h"
+#include "parser.h"
+#include "common.h"
+
+/* Avoid exit() on fatal scanner errors */
+#define fprintf(file, fmt, msg)  ts_error(ERROR, fmt, msg)
+
+char *token = NULL;  /* pointer to the token most recently returned by the scanner */
+int tokenlen;  /* length recorded for that token */
+char *s     = NULL;  /* strdup()'d copy of a WHOLE hyphenated word or URL, returned before its parts are rescanned */
+
+YY_BUFFER_STATE buf = NULL; /* buffer being parsed; it is needed for parsing from a string */
+
+%}
+
+%option 8bit
+%option never-interactive
+%option nounput
+%option noyywrap
+
+/* parser's state for parsing hyphenated-word */
+%x DELIM  
+/* parser's state for parsing URL*/
+%x URL  
+%x SERVER  
+
+/* parser's state for parsing TAGS */
+%x INTAG
+%x QINTAG
+%x INCOMMENT
+%x INSCRIPT
+
+/* cyrillic koi8 char */
+CYRALNUM       [0-9\200-\377]
+CYRALPHA       [\200-\377]
+ALPHA          [a-zA-Z\200-\377]
+ALNUM          [0-9a-zA-Z\200-\377]
+
+
+HOSTNAME       ([-_[:alnum:]]+\.)+[[:alpha:]]+
+URI            [-_[:alnum:]/%,\.;=&?#]+
+
+%%
+
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
+
+<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
+       BEGIN INITIAL; 
+       *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; 
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return SPACE;
+}
+
+"<!--" { BEGIN INCOMMENT; }
+
+<INCOMMENT>"-->"       { 
+       BEGIN INITIAL;
+       *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0'; 
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return SPACE;
+}
+
+
+"<"[\![:alpha:]]       { BEGIN INTAG; }
+
+"</"[[:alpha:]]        { BEGIN INTAG; }
+
+<INTAG>"\""    { BEGIN QINTAG; }
+
+<QINTAG>"\\\"" ;
+
+<QINTAG>"\""   { BEGIN INTAG; }
+
+<INTAG>">"     { 
+       BEGIN INITIAL;
+       token = tsearch2_yytext;
+       *tsearch2_yytext=' '; 
+       token = tsearch2_yytext;
+       tokenlen = 1;
+       return TAG;
+}
+
+<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n  ;
+
+\&(quot|amp|nbsp|lt|gt)\;   {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return HTMLENTITY;
+}
+
+\&\#[0-9][0-9]?[0-9]?\; {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return HTMLENTITY;
+}
+ 
+[-_\.[:alnum:]]+@{HOSTNAME}  /* Emails */ { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return EMAIL; 
+}
+
+[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+  /* float */   { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return SCIENTIFIC; 
+}
+
+[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return VERSIONNUMBER;
+}
+
+[+-]?[0-9]+\.[0-9]+ {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return DECIMAL;
+}
+
+[+-][0-9]+ { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return SIGNEDINT; 
+}
+
+<DELIM,INITIAL>[0-9]+ { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return UNSIGNEDINT; 
+}
+
+http"://"        { 
+       BEGIN URL; 
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return HTTP;
+}
+
+ftp"://"        { 
+       BEGIN URL; 
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return HTTP;
+}
+
+<URL,INITIAL>{HOSTNAME}[/:]{URI} { 
+       BEGIN SERVER;
+       if (s) { free(s); s=NULL; } 
+       s = strdup( tsearch2_yytext ); 
+       tokenlen = tsearch2_yyleng;
+       yyless( 0 ); 
+       token = s;
+       return FURL;
+}
+
+<SERVER,URL,INITIAL>{HOSTNAME} {
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return HOST;
+}
+
+<SERVER>[/:]{URI}      {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return URI;
+}
+
+[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return FILEPATH;
+}
+
+({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */        {
+       BEGIN DELIM;
+       if (s) { free(s); s=NULL; } 
+       s = strdup( tsearch2_yytext );
+       tokenlen = tsearch2_yyleng;
+       yyless( 0 );
+       token = s;
+       return CYRHYPHENWORD;
+}
+
+([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */      {
+        BEGIN DELIM;
+       if (s) { free(s); s=NULL; } 
+       s = strdup( tsearch2_yytext );
+       tokenlen = tsearch2_yyleng;
+       yyless( 0 );
+       token = s;
+       return LATHYPHENWORD;
+}
+
+({ALNUM}+-)+{ALNUM}+ /* composite-word */      {
+       BEGIN DELIM;
+       if (s) { free(s); s=NULL; } 
+       s = strdup( tsearch2_yytext );
+       tokenlen = tsearch2_yyleng;
+       yyless( 0 );
+       token = s;
+       return HYPHENWORD;
+}
+
+<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return VERSIONNUMBER;
+}
+
+<DELIM>\+?[0-9]+\.[0-9]+ {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return DECIMAL;
+}
+
+<DELIM>{CYRALPHA}+  /* one word in composite-word */   { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return CYRPARTHYPHENWORD; 
+}
+
+<DELIM>[[:alpha:]]+  /* one word in composite-word */  { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return LATPARTHYPHENWORD; 
+}
+
+<DELIM>{ALNUM}+  /* one word in composite-word */      { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return PARTHYPHENWORD; 
+}
+
+<DELIM>-  { 
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return SPACE;
+}
+
+<DELIM,SERVER,URL>.|\n /* return in basic state */     {
+       BEGIN INITIAL;
+       yyless( 0 );
+}
+
+{CYRALPHA}+ /* normal word */  { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return CYRWORD; 
+}
+
+[[:alpha:]]+ /* normal word */ { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return LATWORD; 
+}
+
+{ALNUM}+ /* normal word */     { 
+       token = tsearch2_yytext; 
+       tokenlen = tsearch2_yyleng;
+       return UWORD; 
+}
+
+[ \r\n\t]+ {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return SPACE;
+}
+
+. {
+       token = tsearch2_yytext;
+       tokenlen = tsearch2_yyleng;
+       return SPACE;
+} 
+
+%%
+
+/* Clean up after parsing from a string: free the saved copy held in 's' and delete the scan buffer. */
+void tsearch2_end_parse() {
+       if (s) { free(s); s=NULL; } /* drop the saved whole-word/URL copy, if any */
+       tsearch2_yy_delete_buffer( buf );
+       buf = NULL;  /* tsearch2_start_parse_str() tests 'buf' to decide whether a cleanup is pending */
+} 
+
+/* Start parsing the first 'limit' bytes of 'str', releasing any scan that is still in progress first. */
+void tsearch2_start_parse_str(char* str, int limit) {
+       if (buf) tsearch2_end_parse();  /* must be the tsearch2_-prefixed cleanup: a bare end_parse() is undeclared here and can bind to tsearch V1's function */
+       buf = tsearch2_yy_scan_bytes( str, limit );
+       tsearch2_yy_switch_to_buffer( buf );
+       BEGIN INITIAL;  /* every new string starts in the default scanner state */
+}
diff --git a/contrib/tsearch2/wparser_def.c b/contrib/tsearch2/wparser_def.c

new file mode 100644 (file)

index 0000000..6e87750
--- /dev/null
+++ b/contrib/tsearch2/wparser_def.c
@@ -0,0 +1,357 @@
+/*
+ * default word parser
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "postgres.h"
+#include "utils/builtins.h"
+
+#include "dict.h"
+#include "wparser.h"
+#include "common.h"
+#include "ts_cfg.h"
+#include "wordparser/parser.h"
+#include "wordparser/deflex.h"
+/* prsd_lextype: return a palloc'd LexDescr array for token types 1..LASTNUM, ended by an entry whose lexid is 0. */
+PG_FUNCTION_INFO_V1(prsd_lextype);
+Datum          prsd_lextype(PG_FUNCTION_ARGS);
+
+Datum
+prsd_lextype(PG_FUNCTION_ARGS)
+{
+       LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));  /* +1 slot for the terminating entry */
+       int                     i;
+
+       for (i = 1; i <= LASTNUM; i++)  /* token ids are 1-based, array slots are 0-based */
+       {
+               descr[i - 1].lexid = i;
+               descr[i - 1].alias = pstrdup(tok_alias[i]);
+               descr[i - 1].descr = pstrdup(lex_descr[i]);
+       }
+
+       descr[LASTNUM].lexid = 0;  /* terminator: only lexid is set, alias and descr are left unassigned */
+
+       PG_RETURN_POINTER(descr);
+}
+/* prsd_start: hand the text (argument 0, pointer) and its length (argument 1, int32) to the scanner. */
+PG_FUNCTION_INFO_V1(prsd_start);
+Datum          prsd_start(PG_FUNCTION_ARGS);
+Datum
+prsd_start(PG_FUNCTION_ARGS)
+{
+       tsearch2_start_parse_str((char *) PG_GETARG_POINTER(0), PG_GETARG_INT32(1));
+       PG_RETURN_POINTER(NULL);  /* no parser-state object is returned */
+}
+/* prsd_getlexeme: scan the next token; store its text and length through arguments 1 and 2 and return its type. */
+PG_FUNCTION_INFO_V1(prsd_getlexeme);
+Datum          prsd_getlexeme(PG_FUNCTION_ARGS);
+Datum
+prsd_getlexeme(PG_FUNCTION_ARGS)
+{
+       /* argument 0 is not read here; the token is taken from the globals 'token' and 'tokenlen' */
+       char      **t = (char **) PG_GETARG_POINTER(1);
+       int                *tlen = (int *) PG_GETARG_POINTER(2);
+       int                     type = tsearch2_yylex();  /* updates the globals 'token' and 'tokenlen' */
+
+       *t = token;
+       *tlen = tokenlen;
+       PG_RETURN_INT32(type);
+}
+/* prsd_end: finish parsing by releasing the scanner's buffer and saved token copy. */
+PG_FUNCTION_INFO_V1(prsd_end);
+Datum          prsd_end(PG_FUNCTION_ARGS);
+Datum
+prsd_end(PG_FUNCTION_ARGS)
+{
+       /* argument 0 is not read here */
+       tsearch2_end_parse();
+       PG_RETURN_VOID();
+}
+/* Token-class predicates over numeric token type ids; the numbers presumably match the ids in wordparser/deflex.h -- TODO confirm. */
+#define LEAVETOKEN(x)  ( (x)==12 )  /* not referenced in the code visible here */
+#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )  /* not referenced in the code visible here */
+#define ENDPUNCTOKEN(x) ( (x)==12 )  /* not referenced in the code visible here */
+
+
+#define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )  /* used only as part of NOENDTOKEN */
+#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )  /* headline words of these types get 'replace' set */
+#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )  /* types that are not counted as words when measuring a headline */
+#define NOENDTOKEN(x)  ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )  /* types a headline should not end on */
+
+typedef struct  /* window of headline words handed to checkcondition_HL through TS_execute() */
+{
+       HLWORD     *words;  /* first word of the window */
+       int                     len;  /* number of words in the window */
+}      hlCheck;
+/* TS_execute() callback: true if any word in the hlCheck window 'checkval' is tagged with query item 'val'. */
+static bool
+checkcondition_HL(void *checkval, ITEM * val)
+{
+       int                     i;
+
+       for (i = 0; i < ((hlCheck *) checkval)->len; i++)
+       {
+               if (((hlCheck *) checkval)->words[i].item == val)  /* pointer identity, not a text comparison */
+                       return true;
+       }
+       return false;
+}
+
+/* hlCover: search from word index *p for a range [*p, *q] that TS_execute() accepts for 'query'; false if none is found. */
+static bool
+hlCover(HLPRSTEXT * prs, QUERYTYPE * query, int *p, int *q)
+{
+       int                     i,
+                               j;
+       ITEM       *item = GETQUERY(query);
+       int                     pos = *p;  /* search start; *p itself is reused as an output below */
+
+       *q = 0;
+       *p = 0x7fffffff;  /* "no start found yet" sentinel, larger than any word index */
+
+       for (j = 0; j < query->size; j++)  /* pass 1: *q = furthest of the first occurrences (at or after pos) of each value item */
+       {
+               if (item->type != VAL)  /* skip query items that are not values */
+               {
+                       item++;
+                       continue;
+               }
+               for (i = pos; i < prs->curwords; i++)
+               {
+                       if (prs->words[i].item == item)
+                       {
+                               if (i > *q)
+                                       *q = i;
+                               break;
+                       }
+               }
+               item++;
+       }
+
+       if (*q == 0)  /* NOTE(review): 0 doubles as "nothing found", so a match only at index 0 is also treated as no cover */
+               return false;
+
+       item = GETQUERY(query);
+       for (j = 0; j < query->size; j++)  /* pass 2: *p = earliest of the last occurrences (within pos..*q) of each value item */
+       {
+               if (item->type != VAL)
+               {
+                       item++;
+                       continue;
+               }
+               for (i = *q; i >= pos; i--)
+               {
+                       if (prs->words[i].item == item)
+                       {
+                               if (i < *p)
+                                       *p = i;
+                               break;
+                       }
+               }
+               item++;
+       }
+
+       if (*p <= *q)
+       {
+               hlCheck         ch = {&(prs->words[*p]), *q - *p + 1};  /* window covering words[*p..*q] */
+
+               if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
+                       return true;
+               else
+               {
+                       (*p)++;  /* window rejected: retry starting one word later */
+                       return hlCover(prs, query, p, q);
+               }
+       }
+
+       return false;
+}
+/* prsd_headline: choose the word range of 'prs' to show as a headline for 'query', flag those words, and set the selection tags. */
+PG_FUNCTION_INFO_V1(prsd_headline);
+Datum          prsd_headline(PG_FUNCTION_ARGS);
+Datum
+prsd_headline(PG_FUNCTION_ARGS)
+{
+       HLPRSTEXT  *prs = (HLPRSTEXT *) PG_GETARG_POINTER(0);
+       text       *opt = (text *) PG_GETARG_POINTER(1);        /* can't be toasted */
+       QUERYTYPE  *query = (QUERYTYPE *) PG_GETARG_POINTER(2);         /* can't be toasted */
+
+       /* defaults; 'opt' may override these as well as the start and stop tags */
+       int                     min_words = 15;
+       int                     max_words = 35;
+       int                     shortword = 3;
+
+       int                     p = 0,
+                               q = 0;  /* current cover is words[p..q] */
+       int                     bestb = -1,
+                               beste = -1;  /* begin and end index of the best headline found so far */
+       int                     bestlen = -1;  /* poslen of the best headline; negative means none chosen yet */
+       int                     pose = 0, posb,
+                               poslen,
+                               curlen;  /* candidate end/begin index, query-hit count, word count */
+
+       int                     i;
+
+       /* config */
+       prs->startsel = NULL;
+       prs->stopsel = NULL;
+       if (opt)
+       {
+               Map                *map,
+                                  *mptr;
+
+               parse_cfgdict(opt, &map);
+               mptr = map;
+
+               while (mptr && mptr->key)  /* the map ends at an entry whose key is NULL */
+               {
+                       if (strcasecmp(mptr->key, "MaxWords") == 0)  /* option names are matched case-insensitively */
+                               max_words = pg_atoi(mptr->value, 4, 1);
+                       else if (strcasecmp(mptr->key, "MinWords") == 0)
+                               min_words = pg_atoi(mptr->value, 4, 1);
+                       else if (strcasecmp(mptr->key, "ShortWord") == 0)
+                               shortword = pg_atoi(mptr->value, 4, 1);
+                       else if (strcasecmp(mptr->key, "StartSel") == 0)
+                               prs->startsel = pstrdup(mptr->value);
+                       else if (strcasecmp(mptr->key, "StopSel") == 0)
+                               prs->stopsel = pstrdup(mptr->value);
+
+                       pfree(mptr->key);  /* each entry's key and value are freed once consumed */
+                       pfree(mptr->value);
+
+                       mptr++;
+               }
+               pfree(map);
+
+               if (min_words >= max_words)  /* these checks run only when options were supplied */
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("MinWords should be less than MaxWords")));
+               if (min_words <= 0)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("MinWords should be positive")));
+               if (shortword < 0)
+                       ereport(ERROR,
+                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                        errmsg("ShortWord should be >= 0")));
+       }
+
+       while (hlCover(prs, query, &p, &q))  /* examine each cover words[p..q] in turn */
+       {
+               /* find cover len in words */
+               curlen = 0;
+               poslen = 0;
+               for (i = p; i <= q && curlen < max_words; i++)
+               {
+                       if (!NONWORDTOKEN(prs->words[i].type))
+                               curlen++;
+                       if (prs->words[i].item && !prs->words[i].repeated)  /* query hit that is not flagged as repeated */
+                               poslen++;
+                       pose = i;
+               }
+
+               if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+               {
+                       /* a better cover with an acceptable end was already found, so try the next cover */
+                       p++;
+                       continue;
+               }
+
+               posb=p;
+               if (curlen < max_words)
+               {                                               /* find good end */
+                       for (i = i - 1; i < prs->curwords && curlen < max_words; i++)  /* resume at the last word examined above */
+                       {
+                               if (i != q)  /* word q was already counted by the loop above */
+                               {
+                                       if (!NONWORDTOKEN(prs->words[i].type))
+                                               curlen++;
+                                       if (prs->words[i].item && !prs->words[i].repeated)
+                                               poslen++;
+                               }
+                               pose = i;
+                               if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)  /* not an acceptable last word: keep extending */
+                                       continue;
+                               if (curlen >= min_words)
+                                       break;
+                       }
+                       if ( curlen < min_words && i>=prs->curwords ) { /* got end of text and our cover is shorter than min_words */
+                               for(i=p; i>= 0; i--) {  /* extend backwards; NOTE(review): word p was already counted above, looks like a double count -- confirm */
+                                       if (!NONWORDTOKEN(prs->words[i].type))
+                                               curlen++;
+                                       if (prs->words[i].item && !prs->words[i].repeated)
+                                               poslen++;
+                                       if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                                               continue;
+                                       if (curlen >= min_words)
+                                               break;
+                               }
+                               posb=(i>=0) ? i : 0;  /* clamp when the loop ran past the first word */
+                       }
+               }
+               else
+               {                                               /* cover reached max_words: trim it back to an acceptable end */
+                       for (; curlen > min_words; i--)  /* NOTE(review): i is still one past the last counted word here (possibly curwords) -- confirm */
+                       {
+                               if (!NONWORDTOKEN(prs->words[i].type))
+                                       curlen--;
+                               if (prs->words[i].item && !prs->words[i].repeated)
+                                       poslen--;
+                               pose = i;
+                               if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                                       continue;
+                               break;
+                       }
+               }
+
+               if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||  /* first candidate, or more hits with an acceptable end */
+                       (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
+                        (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))  /* or this end is acceptable while the best one's is not */
+               {
+                       bestb = posb;
+                       beste = pose;
+                       bestlen = poslen;
+               }
+
+               p++;  /* continue the search after this cover's start */
+       }
+
+       if (bestlen < 0)  /* no cover chosen: fall back to the leading words of the text */
+       {
+               curlen = 0;
+               poslen = 0;
+               for (i = 0; i < prs->curwords && curlen < min_words; i++)
+               {
+                       if (!NONWORDTOKEN(prs->words[i].type))
+                               curlen++;
+                       pose = i;
+               }
+               bestb = 0;
+               beste = pose;  /* NOTE(review): with curwords == 0 this stays 0 and the loop below still touches words[0] -- confirm */
+       }
+
+       for (i = bestb; i <= beste; i++)  /* flag every word of the chosen range */
+       {
+               if (prs->words[i].item)
+                       prs->words[i].selected = 1;
+               if (prs->words[i].repeated)
+                       prs->words[i].skip = 1;
+               if (HLIDIGNORE(prs->words[i].type))
+                       prs->words[i].replace = 1;
+
+               prs->words[i].in = 1;
+       }
+
+       if (!prs->startsel)  /* default tags when StartSel/StopSel were not supplied */
+               prs->startsel = pstrdup("<b>");
+       if (!prs->stopsel)
+               prs->stopsel = pstrdup("</b>");
+       prs->startsellen = strlen(prs->startsel);
+       prs->stopsellen = strlen(prs->stopsel);
+
+       PG_RETURN_POINTER(prs);
+}
author	Teodor Sigaev <teodor@sigaev.ru>
	Fri, 5 Dec 2003 14:28:21 +0000 (14:28 +0000)
committer	Teodor Sigaev <teodor@sigaev.ru>
	Fri, 5 Dec 2003 14:28:21 +0000 (14:28 +0000)
contrib/tsearch2/wordparser/parser.h	[new file with mode: 0644]	patch \| blob
contrib/tsearch2/wordparser/parser.l	[new file with mode: 0644]	patch \| blob
contrib/tsearch2/wparser_def.c	[new file with mode: 0644]	patch \| blob