This module generates a template for a tsearch2 dictionary. It has built-in support for Snowball stemmers.
Read http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/README.gendict for more details.
Here is how to add a Portuguese stemmer:
0. cd PGSQL_SRC/contrib/tsearch2/gendict 1. Obtain stem.{c,h} files for Portuguese wget http://snowball.tartarus.org/portuguese/stem.c wget http://snowball.tartarus.org/portuguese/stem.h 2. Create template files for Portuguese ./config.sh -n pt -s -p portuguese -v -C'Snowball stemmer for Portuguese' Note, that argument for -p option should be *the same* as name of stemming function in stem.c (without _stem) A bunch of files will be generated and placed in PGSQL_SRC/contrib/dict_pt directory. 3. Compile and install dictionary cd ../../dict_pt make make install 4. Test it Sample portuguese words with the stemmed forms are available from http://snowball.tartarus.org/portuguese/stemmer.html createdb testdict psql testdict < /usr/local/pgsql/share/contrib/tsearch.sql psql testdict < /usr/local/pgsql/share/contrib/dict_pt.sql psql -d testdict -c "select lexize('pt','bobagem');" lexize --------- {bobag}
The motivation for this dictionary is to control the indexing of integers (signed and unsigned) and, consequently, to minimize the number of unique words, which in turn greatly affects search performance.
The dictionary accepts two init options: *MAXLEN parameter specifies the maximum length of a number considered a 'good' integer. The default value is 6. *REJECTLONG parameter specifies whether a 'long' integer should be indexed or treated as a stop word. **If REJECTLONG=false (default), then the dictionary returns the leading part of the integer, cut to length MAXLEN. **If REJECTLONG=true, then the dictionary treats the integer as a stop word.
Examples:
cd tsearch2/gendict/ ./config.sh -n intdict -v -i -C 'dictionary for integers' cd ../../dict_intdict
Now I could edit file dict_tmpl.c generated by gendict. (See explanatory notes below)
/* * example of dictionary * Teodor Sigaev <teodor@sigaev.ru> */ #include <errno.h> #include <stdlib.h> #include <string.h> #include "postgres.h" #include "fmgr.h" /* needed for 8.2+ */ #ifdef PG_MODULE_MAGIC PG_MODULE_MAGIC; #endif #include "dict.h" #include "common.h" #include "subinclude.h" typedef struct { int maxlen; bool rejectlong; } DictInt; PG_FUNCTION_INFO_V1(dinit_intdict); Datum dinit_intdict(PG_FUNCTION_ARGS); Datum dinit_intdict(PG_FUNCTION_ARGS) { DictInt *d = (DictInt*)malloc( sizeof(DictInt) ); Map *cfg, *pcfg; text *in; if ( !d ) elog(ERROR, "No memory"); memset(d,0,sizeof(DictInt)); /* Your INIT code */ /* defaults */ d->maxlen = 6; d->rejectlong = false; if ( PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL ) { /* no options */ PG_RETURN_POINTER(d); } in = PG_GETARG_TEXT_P(0); parse_cfgdict(in,&cfg); PG_FREE_IF_COPY(in, 0); pcfg=cfg; while (pcfg->key) { if ( strcasecmp("MAXLEN", pcfg->key) == 0 ) { d->maxlen=atoi(pcfg->value); } else if ( strcasecmp("REJECTLONG", pcfg->key) == 0 ) { if ( strcasecmp("true", pcfg->value) == 0 ) { d->rejectlong=true; } else if ( strcasecmp("false", pcfg->value) == 0 ) { d->rejectlong=false; } else { elog(ERROR,"Unknown value: %s => %s", pcfg->key, pcfg->value); } } else { elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->value); } pfree(pcfg->key); pfree(pcfg->value); pcfg++; } pfree(cfg); PG_RETURN_POINTER(d); } PG_FUNCTION_INFO_V1(dlexize_intdict); Datum dlexize_intdict(PG_FUNCTION_ARGS); Datum dlexize_intdict(PG_FUNCTION_ARGS) { DictInt *d = (DictInt*)PG_GETARG_POINTER(0); char *in = (char*)PG_GETARG_POINTER(1); char *txt = pnstrdup(in, PG_GETARG_INT32(2)); TSLexeme *res = palloc(sizeof(TSLexeme)*2); /* Your INIT dictionary code */ res[1].lexeme = NULL; if ( PG_GETARG_INT32(2) > d->maxlen ) { if ( d->rejectlong ) { /* stop, return void array */ pfree(txt); res[0].lexeme = NULL; } else { /* cut integer */ txt[d->maxlen] = '\0'; res[0].lexeme = txt; } } else { res[0].lexeme = txt; } PG_RETURN_POINTER(res); }
Specify default options in dict_intdict.sql.in
insert into pg_ts_dict select 'intdict', (select oid from pg_proc where proname='dinit_intdict'), 'MAXLEN=6,REJECTLONG=false', (select oid from pg_proc where proname='dlexize_intdict'), 'dictionary for integers';
After that, I compile and install it.
make make install
Test it
createdb qq psql qq < /usr/local/pgsql/share/contrib/tsearch2.sql psql qq < /usr/local/pgsql/share/contrib/dict_intdict.sql qq=# select dict_name, dict_initoption from pg_ts_dict where dict_name='intdict'; dict_name | dict_initoption -----------+--------------------------- intdict | MAXLEN=6,REJECTLONG=false qq=# select lexize('intdict','12345678'); lexize ---------- {123456} qq=# select lexize('intdict','123'); lexize -------- {123}
Now, change initoption:
qq=# update pg_ts_dict set dict_initoption='MAXLEN=6,REJECTLONG=true' where dict_name = 'intdict'; UPDATE 1 qq=# select dict_name, dict_initoption from pg_ts_dict where dict_name='intdict'; dict_name | dict_initoption -----------+-------------------------- intdict | MAXLEN=6,REJECTLONG=true qq=# select lexize('intdict','12345678'); lexize ---------- {123456}
It appears that changing the init options doesn't work :) For performance reasons, the init function of a dictionary is called only once per session, so you need to end the session and begin a new one, or use the function reset_tsearch().
qq=# select reset_tsearch(); NOTICE: TSearch cache cleaned reset_tsearch --------------- (1 row) qq=# select lexize('intdict','12345678'); lexize -------- {}
Now, it works as expected - returns stop word.
Specify intdict dictionary to process int and uint tokens ( for simplicity, I did that for all configurations)
qq=# select * from pg_ts_cfgmap where tok_alias ~ 'int'; ts_name | tok_alias | dict_name -----------------+-----------+----------- default | int | {simple} default | uint | {simple} default_russian | int | {simple} default_russian | uint | {simple} simple | int | {simple} simple | uint | {simple} qq=# update pg_ts_cfgmap set dict_name='{intdict}' where tok_alias ~ 'int'; UPDATE 6 qq=# select * from pg_ts_cfgmap where tok_alias ~ 'int'; ts_name | tok_alias | dict_name -----------------+-----------+----------- default | int | {intdict} default | uint | {intdict} default_russian | int | {intdict} default_russian | uint | {intdict} simple | int | {intdict} simple | uint | {intdict}
That's all.
The motivation for this dictionary is to control the indexing of decimal numbers and, consequently, to minimize the number of unique words, which in turn greatly affects search performance.
The dictionary accepts two init options: *MAXLENFRAC parameter specifies the maximum length of the fraction part of a number considered a 'good' decimal. The default value is 3. *REJECTLONG parameter specifies whether a decimal number with a 'long' fraction part should be indexed or treated as a stop word. **If REJECTLONG=false (default), then the dictionary returns the decimal number with its fraction part shortened to length MAXLENFRAC. **If REJECTLONG=true, then the dictionary treats the number as a stop word.
Examples:
Note that REJECTLONG=false allows indexing of 'shortened' numbers, so search results will contain documents with the original 'garbage' numbers.
Implementation of this dictionary I leave to readers :)
typedef struct { int maxlen; bool rejectlong; } DictInt; ............................. DictInt *d = (DictInt*)malloc( sizeof(DictInt) );
in = PG_GETARG_TEXT_P(0); parse_cfgdict(in,&cfg); PG_FREE_IF_COPY(in, 0);
res[0].lexeme = NULL;
PG_RETURN_POINTER(NULL)
txt[d->maxlen] = '\0'; res[0].lexeme = txt;
res[0].lexeme = txt;