Appendix C. FTS Dictionary Example

The motivation for this dictionary is to control the indexing of integers (signed and unsigned) and, consequently, to minimize the number of unique words, which in turn greatly affects search performance.

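The dictionary accepts two init options:

  * The MAXLEN parameter specifies the maximum length of a number considered a 'good' integer; the default value is 6.

  * The REJECTLONG parameter specifies whether a 'long' integer should be indexed or treated as a stop word. If REJECTLONG=FALSE (default), the dictionary returns the leading MAXLEN characters of the integer. If REJECTLONG=TRUE, the dictionary treats the integer as a stop word.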

A similar idea can be applied to the indexing of decimal numbers, for example with a DecDict dictionary. That dictionary accepts two init options: the MAXLENFRAC parameter specifies the maximum length of the fraction part considered a 'good' decimal; the default value is 3. The REJECTLONG parameter specifies whether a decimal number with a 'long' fraction part should be indexed or treated as a stop word. If REJECTLONG=FALSE (default), then the dictionary returns the decimal number with its fraction part cut to MAXLENFRAC digits. If REJECTLONG=TRUE, then the dictionary considers the number a stop word. Notice that REJECTLONG=FALSE allows indexing of 'shortened' numbers, so search results will contain documents with the original 'garbage' numbers.

Examples:

=# select lexize('intdict', 11234567890);
  lexize
----------
 {112345}

Now, we want to ignore long integers.


=# ALTER FULLTEXT DICTIONARY intdict SET OPTION 'MAXLEN=6, REJECTLONG=TRUE';
=# select lexize('intdict', 11234567890);
 lexize
--------
 {}

Create the contrib/dict_intdict directory containing the files dict_tmpl.c, Makefile and dict_intdict.sql.in, then run:

make && make install
psql DBNAME < dict_intdict.sql

This is a dict_tmpl.c file.

#include "postgres.h"
#include "utils/builtins.h"
#include "fmgr.h"

#ifdef PG_MODULE_MAGIC
PG_MODULE_MAGIC;
#endif

#include "utils/ts_locale.h"
#include "utils/ts_public.h"
#include "utils/ts_utils.h"

 /*
  * Per-dictionary settings of the integer dictionary.
  * Filled in by dinit_intdict() and consulted by dlexize_intdict().
  */
 typedef struct
 {
         /* longest token (in bytes) that is passed through unchanged */
         int     maxlen;
         /* true: longer tokens become stop words; false: they are cut to maxlen */
         bool    rejectlong;
 } DictInt;


 PG_FUNCTION_INFO_V1(dinit_intdict);
 Datum dinit_intdict(PG_FUNCTION_ARGS);

 Datum
 dinit_intdict(PG_FUNCTION_ARGS) {
        DictInt *d = (DictInt*)malloc( sizeof(DictInt) );
        Map *cfg, *pcfg;
        text *in;

        if ( !d )
                elog(ERROR, "No memory");
        memset(d,0,sizeof(DictInt));

        /* Your INIT code */
/* defaults */
        d->maxlen = 6;
        d->rejectlong = false;

if ( PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL ) { /* no options */
        PG_RETURN_POINTER(d);
}
        in = PG_GETARG_TEXT_P(0);
        parse_keyvalpairs(in,&cfg);
        PG_FREE_IF_COPY(in, 0);
        pcfg=cfg;

        while (pcfg->key) {
                if ( strcasecmp("MAXLEN", pcfg->key) == 0 ) {
                        d->maxlen=atoi(pcfg->value);
                } else if ( strcasecmp("REJECTLONG", pcfg->key) == 0 ) {
                        if ( strcasecmp("true", pcfg->value) == 0 ) {
                                d->rejectlong=true;
                        } else if ( strcasecmp("false", pcfg->value) == 0 ) {
                                d->rejectlong=false;
                        } else {
                                elog(ERROR,"Unknown value: %s => %s", pcfg->key,
 pcfg->value);
                        }
                } else {
                        elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->
value);
                }
                pfree(pcfg->key);
                pfree(pcfg->value);
                pcfg++;
        }
        pfree(cfg);

        PG_RETURN_POINTER(d);
 }

PG_FUNCTION_INFO_V1(dlexize_intdict);
Datum dlexize_intdict(PG_FUNCTION_ARGS);

/*
 * Lexize function of the integer dictionary.
 *
 * Arguments: 0 = DictInt settings, 1 = pointer to the token bytes,
 * 2 = token length.
 *
 * Returns a palloc'ed array of two TSLexeme entries, the second one
 * always being the terminator (lexeme == NULL):
 *   - token not longer than maxlen: the token itself;
 *   - longer token, rejectlong set: an empty array (first lexeme is NULL),
 *     i.e. the token is a stop word;
 *   - longer token, rejectlong not set: the first maxlen bytes of the token.
 */
Datum
dlexize_intdict(PG_FUNCTION_ARGS) {
        DictInt    *d = (DictInt*)PG_GETARG_POINTER(0);
        char       *in = (char*)PG_GETARG_POINTER(1);
        int         len = PG_GETARG_INT32(2);
        /* never index the token buffer with a negative length */
        int         keep = ( d->maxlen < 0 ) ? 0 : d->maxlen;
        /*
         * Zero the whole array so that TSLexeme members this function does
         * not set explicitly are not handed back uninitialised.
         */
        TSLexeme   *res = palloc0(sizeof(TSLexeme)*2);
        char       *txt;

        res[0].lexeme = NULL;
        res[1].lexeme = NULL;

        if ( len > keep && d->rejectlong ) {
                /* stop, return void array; no need to copy the token */
                PG_RETURN_POINTER(res);
        }

        /* NUL-terminated private copy of the token; holds len + 1 bytes */
        txt = pnstrdup(in, len);
        if ( len > keep )
                txt[keep] = '\0';       /* cut integer */
        res[0].lexeme = txt;

        PG_RETURN_POINTER(res);
}

This is a Makefile:

# Makefile for the intdict example dictionary.

# Location of this directory and of the top of the build tree.
subdir = contrib/dict_intdict
top_builddir = ../..
# Global build settings; must come before the contrib rules included below
# (presumably this is also what defines top_srcdir -- confirm).
include $(top_builddir)/src/Makefile.global

# Module name, the object file built from dict_tmpl.c, and the SQL script
# generated from dict_intdict.sql.in.
MODULE_big = dict_intdict
OBJS =  dict_tmpl.o
DATA_built = dict_intdict.sql
# No documentation files are listed.
DOCS =

include $(top_srcdir)/contrib/contrib-global.mk

This is a dict_intdict.sql.in:

-- Installation script for the intdict example dictionary.
-- Everything is created in schema public, inside one transaction.
SET search_path = public;
BEGIN;

-- Init function: takes the option string.  It is not declared strict;
-- the C implementation checks for a NULL argument itself and then uses
-- the default settings.
CREATE OR REPLACE FUNCTION dinit_intdict(internal)
         returns internal
         as 'MODULE_PATHNAME'
         language 'C';

-- Lexize function: declared with four internal arguments, of which the
-- C implementation reads the first three (settings, token, token length).
CREATE OR REPLACE FUNCTION dlexize_intdict(internal,internal,internal,internal)
        returns internal
        as 'MODULE_PATHNAME'
        language 'C'
        with (isstrict);

-- Register the dictionary; the OPTION string spells out the same values
-- the init function uses as defaults.
CREATE FULLTEXT DICTIONARY intdict
        LEXIZE  'dlexize_intdict' INIT  'dinit_intdict'
        OPTION  'MAXLEN=6,REJECTLONG=false'
;

COMMENT ON FULLTEXT DICTIONARY intdict IS 'Dictionary for Integers';

END;