--- /dev/null
+%{
+#include "postgres.h"
+
+#include "deflex.h"
+#include "parser.h"
+#include "common.h"
+
+/*
+ * Avoid exit() on fatal scanner errors: every three-argument
+ * fprintf(file, fmt, msg) call that follows in this file is rewritten to
+ * ts_error(ERROR, fmt, msg); the stream argument is dropped.
+ * NOTE(review): presumably this targets the fprintf() in flex's generated
+ * fatal-error routine -- confirm against the generated scanner.
+ */
+#define fprintf(file, fmt, msg) ts_error(ERROR, fmt, msg)
+
+char *token = NULL; /* pointer to the current token; set by every rule that returns a token type */
+int tokenlen; /* length of the current token, as recorded by the rule that matched it */
+char *s = NULL; /* strdup'ed copy used to return the WHOLE hyphenated word or full URL */
+
+YY_BUFFER_STATE buf = NULL; /* flex buffer being scanned; needed for parsing from a string */
+
+%}
+
+/* Scanner options: 8-bit input, non-interactive buffering, no unput(), no yywrap(). */
+%option 8bit
+%option never-interactive
+%option nounput
+%option noyywrap
+
+/*
+ * All start conditions below are exclusive (%x): while one of them is
+ * active, only rules explicitly prefixed with it can match.
+ */
+
+/* parser's state for parsing a hyphenated word part by part */
+%x DELIM
+/* parser's states for parsing a URL: after the scheme (URL) and after a full URL was returned (SERVER) */
+%x URL
+%x SERVER
+
+/* parser's states for parsing TAGS: inside a tag, inside a quoted value, inside a comment, inside a script */
+%x INTAG
+%x QINTAG
+%x INCOMMENT
+%x INSCRIPT
+
+/*
+ * Character classes that treat every byte in the range \200-\377 as a
+ * letter (cyrillic koi8 chars).
+ * NOTE(review): CYRALNUM and ALPHA are not referenced by the rules visible
+ * in this file.
+ */
+CYRALNUM [0-9\200-\377]
+CYRALPHA [\200-\377]
+ALPHA [a-zA-Z\200-\377]
+ALNUM [0-9a-zA-Z\200-\377]
+
+
+/* HOSTNAME: one or more dot-terminated labels followed by an alphabetic last label. */
+HOSTNAME ([-_[:alnum:]]+\.)+[[:alpha:]]+
+/* URI: the characters accepted after the host part of a URL. */
+URI [-_[:alnum:]/%,\.;=&?#]+
+
+%%
+
+ /* "<script" in any letter case: switch to INSCRIPT; no token is returned. */
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
+
+<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
+ /*
+  * Closing "</script>": go back to INITIAL and report a SPACE token.
+  * The first two bytes of the match are overwritten in place with
+  * ' ' and '\0'.
+  * NOTE(review): tokenlen is still the full match length (yyleng), not 1
+  * as in the <INTAG>">" rule -- confirm that callers expect this.
+  */
+ BEGIN INITIAL;
+ *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0';
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return SPACE;
+}
+
+ /* "<!--": switch to INCOMMENT; no token is returned. */
+"<!--" { BEGIN INCOMMENT; }
+
+<INCOMMENT>"-->" {
+ /*
+  * End of comment: same handling as the closing script tag above --
+  * back to INITIAL, match rewritten to " \0", SPACE returned with
+  * tokenlen set to the full match length.
+  */
+ BEGIN INITIAL;
+ *tsearch2_yytext=' '; *(tsearch2_yytext+1) = '\0';
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return SPACE;
+}
+
+
+ /* "<" followed by "!" or a letter opens a tag; no token is returned yet. */
+"<"[\![:alpha:]] { BEGIN INTAG; }
+
+ /* "</" followed by a letter opens a closing tag. */
+"</"[[:alpha:]] { BEGIN INTAG; }
+
+ /* A double quote inside a tag starts a quoted value. */
+<INTAG>"\"" { BEGIN QINTAG; }
+
+ /* A backslash-escaped quote inside a quoted value is skipped. */
+<QINTAG>"\\\"" ;
+
+ /* An unescaped quote ends the quoted value. */
+<QINTAG>"\"" { BEGIN INTAG; }
+
+<INTAG>">" {
+    /*
+     * End of tag: return to INITIAL and report the whole tag as a single
+     * TAG token consisting of one space (the ">" is overwritten in place).
+     */
+    BEGIN INITIAL;
+    *tsearch2_yytext = ' ';
+    token = tsearch2_yytext;
+    tokenlen = 1;
+    return TAG;
+}
+
+ /* Any other character inside a tag, quoted value, comment or script is discarded. */
+<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ;
+
+\&(quot|amp|nbsp|lt|gt)\; {
+ /* One of the named entities &quot; &amp; &nbsp; &lt; &gt; */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return HTMLENTITY;
+}
+
+\&\#[0-9][0-9]?[0-9]?\; {
+ /* Numeric entity with one to three decimal digits, e.g. &#38; */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return HTMLENTITY;
+}
+
+[-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ {
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return EMAIL;
+}
+
+[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ {
+ /* Optional sign, mantissa, exponent marker e/E/d/D and signed exponent. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return SCIENTIFIC;
+}
+
+[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
+ /* At least three dot-separated digit groups, e.g. 1.2.3 */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return VERSIONNUMBER;
+}
+
+[+-]?[0-9]+\.[0-9]+ {
+ /* Optionally signed number with a fractional part. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return DECIMAL;
+}
+
+[+-][0-9]+ {
+ /* Integer with an explicit sign. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return SIGNEDINT;
+}
+
+<DELIM,INITIAL>[0-9]+ {
+ /* Bare digit run; also accepted while splitting a hyphenated word (DELIM). */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return UNSIGNEDINT;
+}
+
+http"://" {
+ /* Scheme prefix: report it and expect a host name next (URL state). */
+ BEGIN URL;
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return HTTP;
+}
+
+ftp"://" {
+ /* NOTE(review): the ftp scheme is reported with the same HTTP token type. */
+ BEGIN URL;
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return HTTP;
+}
+
+<URL,INITIAL>{HOSTNAME}[/:]{URI} {
+ /*
+  * Host plus path: return the whole match as FURL from a private copy
+  * in s, then push the entire match back with yyless(0) so that it is
+  * rescanned in SERVER state by the HOST and URI rules below.
+  * NOTE(review): the result of strdup() is not checked for NULL.
+  */
+ BEGIN SERVER;
+ if (s) { free(s); s=NULL; }
+ s = strdup( tsearch2_yytext );
+ tokenlen = tsearch2_yyleng;
+ yyless( 0 );
+ token = s;
+ return FURL;
+}
+
+<SERVER,URL,INITIAL>{HOSTNAME} {
+ /* Host name alone; the start condition is left unchanged. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return HOST;
+}
+
+<SERVER>[/:]{URI} {
+ /* Path part following a host while in SERVER state. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return URI;
+}
+
+[[:alnum:]\./_-]+"/"[[:alnum:]\./_-]+ {
+ /* Two path-character runs joined by a slash. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return FILEPATH;
+}
+
+({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ {
+ /*
+  * Hyphenated word made only of \200-\377 bytes: return the whole word
+  * from a private copy in s, then push the match back with yyless(0)
+  * so that the DELIM rules below return its parts one by one.
+  * NOTE(review): the result of strdup() is not checked for NULL.
+  */
+ BEGIN DELIM;
+ if (s) { free(s); s=NULL; }
+ s = strdup( tsearch2_yytext );
+ tokenlen = tsearch2_yyleng;
+ yyless( 0 );
+ token = s;
+ return CYRHYPHENWORD;
+}
+
+([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ {
+ /* Same handling for a hyphenated word made of [[:alpha:]] characters. */
+ BEGIN DELIM;
+ if (s) { free(s); s=NULL; }
+ s = strdup( tsearch2_yytext );
+ tokenlen = tsearch2_yyleng;
+ yyless( 0 );
+ token = s;
+ return LATHYPHENWORD;
+}
+
+({ALNUM}+-)+{ALNUM}+ /* composite-word */ {
+ /* Same handling for a hyphenated word of mixed ALNUM characters. */
+ BEGIN DELIM;
+ if (s) { free(s); s=NULL; }
+ s = strdup( tsearch2_yytext );
+ tokenlen = tsearch2_yyleng;
+ yyless( 0 );
+ token = s;
+ return HYPHENWORD;
+}
+
+<DELIM>[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
+ /* Version number appearing as a part of a hyphenated word. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return VERSIONNUMBER;
+}
+
+<DELIM>\+?[0-9]+\.[0-9]+ {
+ /* Decimal part; only a leading '+' is accepted here, '-' is the delimiter. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return DECIMAL;
+}
+
+<DELIM>{CYRALPHA}+ /* one word in composite-word */ {
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return CYRPARTHYPHENWORD;
+}
+
+<DELIM>[[:alpha:]]+ /* one word in composite-word */ {
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return LATPARTHYPHENWORD;
+}
+
+<DELIM>{ALNUM}+ /* one word in composite-word */ {
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return PARTHYPHENWORD;
+}
+
+<DELIM>- {
+ /* The hyphen between parts is reported as SPACE. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return SPACE;
+}
+
+<DELIM,SERVER,URL>.|\n /* return in basic state */ {
+ /*
+  * Anything the DELIM/SERVER/URL rules above did not match: switch
+  * back to INITIAL and push the character back so it is rescanned
+  * there. No token is returned by this rule.
+  */
+ BEGIN INITIAL;
+ yyless( 0 );
+}
+
+{CYRALPHA}+ /* normal word */ {
+ /* Run of \200-\377 bytes only. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return CYRWORD;
+}
+
+[[:alpha:]]+ /* normal word */ {
+ /* Run of [[:alpha:]] characters. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return LATWORD;
+}
+
+{ALNUM}+ /* normal word */ {
+ /* Any other run of ALNUM characters (mixed letters and digits). */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return UWORD;
+}
+
+[ \r\n\t]+ {
+ /* A run of whitespace is returned as one SPACE token. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return SPACE;
+}
+
+. {
+ /* Any remaining single character is also reported as SPACE. */
+ token = tsearch2_yytext;
+ tokenlen = tsearch2_yyleng;
+ return SPACE;
+}
+
+%%
+
+/*
+ * Clean up after parsing from a string: release the saved whole-token copy
+ * (if any) and delete the flex buffer, leaving both globals reset to NULL.
+ */
+void tsearch2_end_parse() {
+    /* free(NULL) is a no-op, so no guard is needed */
+    free( s );
+    s = NULL;
+
+    tsearch2_yy_delete_buffer( buf );
+    buf = NULL;
+}
+
+/*
+ * Start parsing from a string.
+ *
+ * str   - bytes to scan
+ * limit - number of bytes of str handed to the scanner
+ *
+ * Any parse still in progress is cleaned up first, then a new flex buffer
+ * is created over the input, made current, and the scanner is put back
+ * into the INITIAL start condition.
+ */
+void tsearch2_start_parse_str(char* str, int limit) {
+    /* use this file's own cleanup routine; no end_parse() is defined here */
+    if (buf)
+        tsearch2_end_parse();
+
+    buf = tsearch2_yy_scan_bytes( str, limit );
+    tsearch2_yy_switch_to_buffer( buf );
+    BEGIN INITIAL;
+}
--- /dev/null
+/*
+ * default word parser
+ * Teodor Sigaev <teodor@sigaev.ru>
+ */
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "postgres.h"
+#include "utils/builtins.h"
+
+#include "dict.h"
+#include "wparser.h"
+#include "common.h"
+#include "ts_cfg.h"
+#include "wordparser/parser.h"
+#include "wordparser/deflex.h"
+
+PG_FUNCTION_INFO_V1(prsd_lextype);
+Datum prsd_lextype(PG_FUNCTION_ARGS);
+
+/*
+ * Return a palloc'ed array describing lexeme types 1..LASTNUM.
+ *
+ * Each entry carries the type id plus pstrdup'ed copies of its alias
+ * (tok_alias[]) and description (lex_descr[]). The array holds LASTNUM + 1
+ * entries; the last one has lexid 0 and marks the end of the list.
+ */
+Datum
+prsd_lextype(PG_FUNCTION_ARGS)
+{
+    LexDescr   *list = (LexDescr *) palloc(sizeof(LexDescr) * (LASTNUM + 1));
+    LexDescr   *entry = list;
+    int         lexid;
+
+    for (lexid = 1; lexid <= LASTNUM; lexid++)
+    {
+        entry->lexid = lexid;
+        entry->alias = pstrdup(tok_alias[lexid]);
+        entry->descr = pstrdup(lex_descr[lexid]);
+        entry++;
+    }
+
+    /* entry now points at slot LASTNUM: write the terminator there */
+    entry->lexid = 0;
+
+    PG_RETURN_POINTER(list);
+}
+
+PG_FUNCTION_INFO_V1(prsd_start);
+Datum prsd_start(PG_FUNCTION_ARGS);
+
+/*
+ * Begin parsing a text buffer.
+ *
+ * Argument 0 is a pointer to the text, argument 1 its length as int32.
+ * Both are handed to the scanner; the function always returns a NULL
+ * pointer.
+ */
+Datum
+prsd_start(PG_FUNCTION_ARGS)
+{
+    char       *input = (char *) PG_GETARG_POINTER(0);
+    int         inputlen = PG_GETARG_INT32(1);
+
+    tsearch2_start_parse_str(input, inputlen);
+
+    PG_RETURN_POINTER(NULL);
+}
+
+PG_FUNCTION_INFO_V1(prsd_getlexeme);
+Datum prsd_getlexeme(PG_FUNCTION_ARGS);
+
+/*
+ * Fetch the next lexeme from the scanner.
+ *
+ * Argument 0 is not read. Argument 1 (char **) receives the pointer to the
+ * token and argument 2 (int *) its length, both taken from the scanner's
+ * globals after tsearch2_yylex() has run. The value returned by
+ * tsearch2_yylex() is passed back as the int32 result.
+ */
+Datum
+prsd_getlexeme(PG_FUNCTION_ARGS)
+{
+    char      **tokptr = (char **) PG_GETARG_POINTER(1);
+    int        *toklen = (int *) PG_GETARG_POINTER(2);
+    int         lextype;
+
+    /* run the scanner first: it updates token and tokenlen */
+    lextype = tsearch2_yylex();
+
+    *tokptr = token;
+    *toklen = tokenlen;
+
+    PG_RETURN_INT32(lextype);
+}
+
+PG_FUNCTION_INFO_V1(prsd_end);
+Datum prsd_end(PG_FUNCTION_ARGS);
+
+/*
+ * Finish parsing: let the scanner release what it holds for the current
+ * parse. No argument is read and nothing is returned.
+ */
+Datum
+prsd_end(PG_FUNCTION_ARGS)
+{
+    tsearch2_end_parse();
+
+    PG_RETURN_VOID();
+}
+
+/*
+ * Token-class predicates over numeric lexeme type ids.
+ * NOTE(review): the ids are hard-coded; presumably they match the token
+ * numbering in wordparser/parser.h (e.g. 12 looks like SPACE, 13 like TAG)
+ * -- verify against that header before changing either side.
+ * LEAVETOKEN, COMPLEXTOKEN and ENDPUNCTOKEN are not used in the code visible
+ * in this file.
+ */
+#define LEAVETOKEN(x) ( (x)==12 )
+#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
+#define ENDPUNCTOKEN(x) ( (x)==12 )
+
+
+/* IDIGNORE is only used as part of NOENDTOKEN below. */
+#define IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
+/* HLIDIGNORE: types whose words prsd_headline flags with replace = 1. */
+#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
+/* NONWORDTOKEN: types that prsd_headline does not count as words. */
+#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
+/* NOENDTOKEN: types that prsd_headline will not accept as the last word of a headline. */
+#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || IDIGNORE(x) )
+
+/*
+ * Window of headline words handed to TS_execute() as the opaque check
+ * value; checkcondition_HL() searches it for a given query item.
+ */
+typedef struct
+{
+ HLWORD *words; /* first word of the window */
+ int len; /* number of words in the window */
+} hlCheck;
+
+/*
+ * TS_execute() callback: report whether the query item val is attached to
+ * any word of the hlCheck window passed in checkval.
+ */
+static bool
+checkcondition_HL(void *checkval, ITEM * val)
+{
+    hlCheck    *window = (hlCheck *) checkval;
+    int         idx = 0;
+
+    while (idx < window->len)
+    {
+        if (window->words[idx].item == val)
+            return true;
+        idx++;
+    }
+
+    return false;
+}
+
+
+/*
+ * Find the next "cover": a range of words [*p, *q] in prs->words that
+ * satisfies the query, searching from word index *p (on entry).
+ *
+ * On entry *p is the index to start from. On success the function returns
+ * true with *p and *q set to the first and last word of the cover. It
+ * returns false when no VAL item of the query is found or no candidate
+ * range exists; *p and *q are overwritten in either case.
+ */
+static bool
+hlCover(HLPRSTEXT * prs, QUERYTYPE * query, int *p, int *q)
+{
+ int i,
+ j;
+ ITEM *item = GETQUERY(query);
+ int pos = *p;
+
+ *q = 0;
+ *p = 0x7fffffff;
+
+ /*
+  * Pass 1: for every VAL item take its first occurrence at or after pos;
+  * *q becomes the largest of those indexes.
+  */
+ for (j = 0; j < query->size; j++)
+ {
+ if (item->type != VAL)
+ {
+ item++;
+ continue;
+ }
+ for (i = pos; i < prs->curwords; i++)
+ {
+ if (prs->words[i].item == item)
+ {
+ if (i > *q)
+ *q = i;
+ break;
+ }
+ }
+ item++;
+ }
+
+ /*
+  * NOTE(review): *q == 0 is used as "nothing found", so a match located
+  * only at word index 0 is treated the same way -- confirm this is
+  * intended.
+  */
+ if (*q == 0)
+ return false;
+
+ /*
+  * Pass 2: for every VAL item take its last occurrence in [pos, *q],
+  * scanning backwards from *q; *p becomes the smallest of those indexes.
+  */
+ item = GETQUERY(query);
+ for (j = 0; j < query->size; j++)
+ {
+ if (item->type != VAL)
+ {
+ item++;
+ continue;
+ }
+ for (i = *q; i >= pos; i--)
+ {
+ if (prs->words[i].item == item)
+ {
+ if (i < *p)
+ *p = i;
+ break;
+ }
+ }
+ item++;
+ }
+
+ if (*p <= *q)
+ {
+ /* evaluate the whole query against just the words of the candidate range */
+ hlCheck ch = {&(prs->words[*p]), *q - *p + 1};
+
+ if (TS_execute(GETQUERY(query), &ch, false, checkcondition_HL))
+ return true;
+ else
+ {
+ /* candidate does not satisfy the query: retry one word further */
+ (*p)++;
+ return hlCover(prs, query, p, q);
+ }
+ }
+
+ return false;
+}
+
+PG_FUNCTION_INFO_V1(prsd_headline);
+Datum prsd_headline(PG_FUNCTION_ARGS);
+
+/*
+ * Choose the fragment of a parsed text to show as a headline and mark its
+ * words.
+ *
+ * Arguments:
+ *   0 - HLPRSTEXT *: parsed text; its words[] flags and startsel/stopsel
+ *       fields are filled in
+ *   1 - text *: option string, may be NULL. Recognized keys (compared
+ *       case-insensitively): MaxWords, MinWords, ShortWord, StartSel,
+ *       StopSel
+ *   2 - QUERYTYPE *: the query whose items are to be covered
+ *
+ * Defaults are MinWords = 15, MaxWords = 35, ShortWord = 3, StartSel =
+ * "<b>" and StopSel = "</b>". When options are given, an error is raised
+ * if MinWords >= MaxWords, MinWords <= 0 or ShortWord < 0.
+ *
+ * Every cover reported by hlCover() is stretched or shrunk towards the
+ * MinWords..MaxWords range, preferring an end word that is neither a
+ * NOENDTOKEN type nor ShortWord bytes or shorter; the candidate with the
+ * most distinct query items wins. Without any cover, the first MinWords
+ * words of the text are used.
+ *
+ * Returns the same HLPRSTEXT pointer it was given.
+ */
+Datum
+prsd_headline(PG_FUNCTION_ARGS)
+{
+    HLPRSTEXT  *prs = (HLPRSTEXT *) PG_GETARG_POINTER(0);
+    text       *opt = (text *) PG_GETARG_POINTER(1);    /* can't be toasted */
+    QUERYTYPE  *query = (QUERYTYPE *) PG_GETARG_POINTER(2);    /* can't be toasted */
+
+    /* defaults; opt may override them, as well as the start and stop tags */
+    int         min_words = 15;
+    int         max_words = 35;
+    int         shortword = 3;
+
+    int         p = 0,
+                q = 0;
+    int         bestb = -1,
+                beste = -1;
+    int         bestlen = -1;
+    int         pose = 0,
+                posb,
+                poslen,
+                curlen;
+
+    int         i;
+
+    /* config */
+    prs->startsel = NULL;
+    prs->stopsel = NULL;
+    if (opt)
+    {
+        Map        *map,
+                   *mptr;
+
+        parse_cfgdict(opt, &map);
+        mptr = map;
+
+        while (mptr && mptr->key)
+        {
+            if (strcasecmp(mptr->key, "MaxWords") == 0)
+                max_words = pg_atoi(mptr->value, 4, 1);
+            else if (strcasecmp(mptr->key, "MinWords") == 0)
+                min_words = pg_atoi(mptr->value, 4, 1);
+            else if (strcasecmp(mptr->key, "ShortWord") == 0)
+                shortword = pg_atoi(mptr->value, 4, 1);
+            else if (strcasecmp(mptr->key, "StartSel") == 0)
+                prs->startsel = pstrdup(mptr->value);
+            else if (strcasecmp(mptr->key, "StopSel") == 0)
+                prs->stopsel = pstrdup(mptr->value);
+
+            pfree(mptr->key);
+            pfree(mptr->value);
+
+            mptr++;
+        }
+        pfree(map);
+
+        if (min_words >= max_words)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("MinWords should be less than MaxWords")));
+        if (min_words <= 0)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("MinWords should be positive")));
+        if (shortword < 0)
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("ShortWord should be >= 0")));
+    }
+
+    while (hlCover(prs, query, &p, &q))
+    {
+        /* find cover len in words */
+        curlen = 0;
+        poslen = 0;
+        for (i = p; i <= q && curlen < max_words; i++)
+        {
+            if (!NONWORDTOKEN(prs->words[i].type))
+                curlen++;
+            if (prs->words[i].item && !prs->words[i].repeated)
+                poslen++;
+            pose = i;
+        }
+
+        if (poslen < bestlen && !(NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword))
+        {
+            /* best already found, so try one more cover */
+            p++;
+            continue;
+        }
+
+        posb = p;
+        if (curlen < max_words)
+        {                        /* find good end */
+            /* restart at the last word scanned; word q itself is not counted twice */
+            for (i = i - 1; i < prs->curwords && curlen < max_words; i++)
+            {
+                if (i != q)
+                {
+                    if (!NONWORDTOKEN(prs->words[i].type))
+                        curlen++;
+                    if (prs->words[i].item && !prs->words[i].repeated)
+                        poslen++;
+                }
+                pose = i;
+                if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                    continue;
+                if (curlen >= min_words)
+                    break;
+            }
+            if (curlen < min_words && i >= prs->curwords)
+            {
+                /*
+                 * Got end of text and our cover is shorter than min_words:
+                 * extend it backwards. Word p was already counted by the
+                 * cover scan above, so start at p - 1 to avoid counting it
+                 * twice in curlen and poslen.
+                 */
+                for (i = p - 1; i >= 0; i--)
+                {
+                    if (!NONWORDTOKEN(prs->words[i].type))
+                        curlen++;
+                    if (prs->words[i].item && !prs->words[i].repeated)
+                        poslen++;
+                    if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                        continue;
+                    if (curlen >= min_words)
+                        break;
+                }
+                posb = (i >= 0) ? i : 0;
+            }
+        }
+        else
+        {                        /* cover is too long: shorten it */
+            for (; curlen > min_words; i--)
+            {
+                if (!NONWORDTOKEN(prs->words[i].type))
+                    curlen--;
+                if (prs->words[i].item && !prs->words[i].repeated)
+                    poslen--;
+                pose = i;
+                if (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword)
+                    continue;
+                break;
+            }
+        }
+
+        /*
+         * Take this candidate if it is the first one, if it holds more
+         * query items and ends on an acceptable word, or if it ends on an
+         * acceptable word while the current best does not.
+         */
+        if (bestlen < 0 || (poslen > bestlen && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword)) ||
+            (bestlen >= 0 && !(NOENDTOKEN(prs->words[pose].type) || prs->words[pose].len <= shortword) &&
+             (NOENDTOKEN(prs->words[beste].type) || prs->words[beste].len <= shortword)))
+        {
+            bestb = posb;
+            beste = pose;
+            bestlen = poslen;
+        }
+
+        p++;
+    }
+
+    if (bestlen < 0)
+    {
+        /* no cover at all: fall back to the first min_words words of the text */
+        curlen = 0;
+        poslen = 0;
+        for (i = 0; i < prs->curwords && curlen < min_words; i++)
+        {
+            if (!NONWORDTOKEN(prs->words[i].type))
+                curlen++;
+            pose = i;
+        }
+        bestb = 0;
+        beste = pose;
+    }
+
+    /* flag every word of the chosen fragment */
+    for (i = bestb; i <= beste; i++)
+    {
+        if (prs->words[i].item)
+            prs->words[i].selected = 1;
+        if (prs->words[i].repeated)
+            prs->words[i].skip = 1;
+        if (HLIDIGNORE(prs->words[i].type))
+            prs->words[i].replace = 1;
+
+        prs->words[i].in = 1;
+    }
+
+    if (!prs->startsel)
+        prs->startsel = pstrdup("<b>");
+    if (!prs->stopsel)
+        prs->stopsel = pstrdup("</b>");
+    prs->startsellen = strlen(prs->startsel);
+    prs->stopsellen = strlen(prs->stopsel);
+
+    PG_RETURN_POINTER(prs);
+}