如何为Postgresql数据库全文搜索(full text search)编写解析器(parser)

youngfan007 · 发表于 2016-11-21 06:36:36

　　用英文写的，主要是命令和代码，就不翻译了，偷懒一下。
　　This article shows you how to create a parser that handles camel case [3]
strings in PostgreSQL full text search. The parser was tested on PostgreSQL
version 8.3.9. The OS is Ubuntu 8.10.
　　[1,2] give you the basic background and a sample for the old version. I updated the
code to fit version 8.3.9; in particular, there are many changes in the SQL statements.

Create parser
　　First, create a directory to hold all the files, for example: /home/user/Desktop/test_parser
　　Create file "test_parser.c". I use vim.

#include "postgres.h"

#include "utils/builtins.h"

/* Emit the module magic block only when the included headers define it. */
#ifdef PG_MODULE_MAGIC

PG_MODULE_MAGIC;

#endif

/*

* types

*/

/* self-defined type */

/*
 * Per-parse working state: created by testprs_start, advanced by
 * testprs_getlexeme and released by testprs_end.
 */
typedef struct
{
    char *buffer;   /* the text being parsed */
    int   len;      /* length of the text held in buffer */
    int   pos;      /* current position of the parser within buffer */
} ParserState;

/* copy-paste from wparser.h of tsearch2 */

/* One entry of the token-type table returned by testprs_lextype. */
typedef struct
{
    int   lexid;    /* numeric id of the token type */
    char *alias;    /* short name of the token type */
    char *descr;    /* human-readable description of the token type */
} LexDescr;

/*

* prototypes

*/

/*
 * Each entry point is announced with PG_FUNCTION_INFO_V1 and then given a
 * prototype; the definitions follow further down in this file.
 */

/* START callback of the parser: builds the ParserState. */

PG_FUNCTION_INFO_V1(testprs_start);

Datum testprs_start(PG_FUNCTION_ARGS);

/* GETTOKEN callback of the parser: hands back one token per call. */

PG_FUNCTION_INFO_V1(testprs_getlexeme);

Datum testprs_getlexeme(PG_FUNCTION_ARGS);

/* END callback of the parser: frees the ParserState. */

PG_FUNCTION_INFO_V1(testprs_end);

Datum testprs_end(PG_FUNCTION_ARGS);

/* LEXTYPES callback of the parser: describes the token types. */

PG_FUNCTION_INFO_V1(testprs_lextype);

Datum testprs_lextype(PG_FUNCTION_ARGS);

/*

* functions

*/

/*
 * testprs_start - set up a fresh parser state.
 *
 * Argument 0 is the text to parse and argument 1 is its length.  The
 * returned ParserState is palloc'd here and later released by testprs_end.
 */
Datum
testprs_start(PG_FUNCTION_ARGS)
{
    ParserState *state = (ParserState *) palloc(sizeof(ParserState));

    /* Parsing always begins at the first byte of the input. */
    state->pos = 0;
    state->len = PG_GETARG_INT32(1);
    state->buffer = (char *) PG_GETARG_POINTER(0);

    PG_RETURN_POINTER(state);
}

/* Token type ids; they match the ids listed by testprs_lextype. */
enum
{
    TESTPRS_LEX_END = 0,     /* no more input */
    TESTPRS_LEX_WORD = 3,    /* run of ASCII letters forming one camel-case part */
    TESTPRS_LEX_BLANK = 12   /* run of anything that is not an ASCII letter */
};

/* True when c is an ASCII upper-case letter. */
static int
testprs_is_upper(char c)
{
    return c >= 'A' && c <= 'Z';
}

/* True when c is an ASCII lower-case letter. */
static int
testprs_is_lower(char c)
{
    return c >= 'a' && c <= 'z';
}

/*
 * testprs_getlexeme - return the next token of the input.
 *
 * Argument 0 is the ParserState, argument 1 receives a pointer to the start
 * of the token and argument 2 receives the token length.  The result is the
 * token type: 3 for a word, 12 for a blank, 0 once the input is exhausted.
 *
 * Words are split at camel-case boundaries:
 *   "itemValue" => "item" "Value"
 *   "ItemURL"   => "Item" "URL"
 * A word is either a run of lower-case letters, or one or more upper-case
 * letters followed by any number of lower-case letters.
 *
 * Every other byte (spaces, digits, punctuation, non-ASCII bytes) is
 * collected into a blank token so that parsing continues past it instead of
 * stopping at the first non-letter.
 *
 * The position is always checked against the length before the buffer is
 * read, so no byte beyond the input is ever touched.
 */
Datum
testprs_getlexeme(PG_FUNCTION_ARGS)
{
    ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
    char       **t = (char **) PG_GETARG_POINTER(1);
    int         *tlen = (int *) PG_GETARG_POINTER(2);
    const char  *buf = pst->buffer;
    const int    len = pst->len;
    const int    start = pst->pos;
    int          pos = start;
    int          type = TESTPRS_LEX_END;

    *t = pst->buffer + start;

    if (pos < len && testprs_is_upper(buf[pos]))
    {
        type = TESTPRS_LEX_WORD;
        pos++;

        /* for cases like: ItemURL => Item URL */
        while (pos < len && testprs_is_upper(buf[pos]))
            pos++;

        /* advance to the next upper-case character */
        while (pos < len && testprs_is_lower(buf[pos]))
            pos++;
    }
    else if (pos < len && testprs_is_lower(buf[pos]))
    {
        type = TESTPRS_LEX_WORD;
        pos++;

        /* advance to the next upper-case character */
        while (pos < len && testprs_is_lower(buf[pos]))
            pos++;
    }
    else if (pos < len)
    {
        type = TESTPRS_LEX_BLANK;

        /* swallow everything up to the next ASCII letter */
        while (pos < len &&
               !testprs_is_upper(buf[pos]) &&
               !testprs_is_lower(buf[pos]))
            pos++;
    }

    pst->pos = pos;

    /* a zero length together with type 0 tells the caller we are finished */
    *tlen = pos - start;

    PG_RETURN_INT32(type);
}

/*
 * testprs_end - release the parser state.
 *
 * Argument 0 is the ParserState allocated by testprs_start; it is handed
 * straight to pfree.
 */
Datum
testprs_end(PG_FUNCTION_ARGS)
{
    ParserState *state = (ParserState *) PG_GETARG_POINTER(0);

    pfree(state);

    PG_RETURN_VOID();
}

Datum testprs_lextype

(PG_FUNCTION_ARGS)

{

/*

Remarks:

- we have to return the blanks for headline reason

- we use the same lexids like Teodor in the default

word parser; in this way we can reuse the headline

function of the default word parser.

*/

LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));

/* there are only two types in this parser */

descr[0].lexid = 3;

descr[0].alias = pstrdup("word");

descr[0].descr = pstrdup("Word");

descr[1].lexid = 12;

descr[1].alias = pstrdup("blank");

descr[1].descr = pstrdup("Space symbols");

descr[2].lexid = 0;

PG_RETURN_POINTER(descr);

}

　　Create "Makefile" file under the same directory.

# Build description for the test_parser text search parser module.
# Build outside the PostgreSQL source tree with USE_PGXS=1, or inside it
# from contrib/test_parser.

override CPPFLAGS := -I. $(CPPFLAGS)

MODULE_big = test_parser
OBJS = test_parser.o

# test_parser.sql is written by hand (there is no test_parser.sql.in to
# generate it from), so it belongs in DATA.  Files listed in DATA_built are
# treated as build products and are deleted by "make clean".
DATA = test_parser.sql

REGRESS = test_parser

ifdef USE_PGXS
PGXS := $(shell pg_config --pgxs)
include $(PGXS)
else
subdir = contrib/test_parser
top_builddir = ../..
include $(top_builddir)/src/Makefile.global
include $(top_srcdir)/contrib/contrib-global.mk
endif

　　Create "test_parser.sql" file under the same directory.

-- Installation script for the camel-case text search parser.
-- Registers the four C entry points, builds the parser "testparser" from
-- them and creates the configuration "testcfg" that uses it.

SET search_path = public;

BEGIN;

-- Uncomment these two statements when reloading the script after a change.
--DROP TEXT SEARCH CONFIGURATION testcfg;
--DROP TEXT SEARCH PARSER testparser;

-- C entry points exported by the test_parser shared library.
CREATE OR REPLACE FUNCTION testprs_start(internal,int4)
    RETURNS internal
    AS '$libdir/test_parser'
    LANGUAGE 'C';

CREATE OR REPLACE FUNCTION testprs_getlexeme(internal,internal,internal)
    RETURNS internal
    AS '$libdir/test_parser'
    LANGUAGE 'C';

CREATE OR REPLACE FUNCTION testprs_end(internal)
    RETURNS void
    AS '$libdir/test_parser'
    LANGUAGE 'C';

CREATE OR REPLACE FUNCTION testprs_lextype(internal)
    RETURNS internal
    AS '$libdir/test_parser'
    LANGUAGE 'C';

-- The parser ties the four functions together.
CREATE TEXT SEARCH PARSER testparser(
    START = 'testprs_start',
    GETTOKEN = 'testprs_getlexeme',
    END = 'testprs_end',
    LEXTYPES = 'testprs_lextype'
);

-- A configuration that tokenises with the new parser.
CREATE TEXT SEARCH CONFIGURATION testcfg (
    PARSER ='testparser'
);

-- Tokens of type "word" are normalised by the english_stem dictionary.
ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH
    english_stem;

END;

　　Note: the part shown in bold has changed completely for version 8.3.9.

The commented-out "DROP TEXT SEARCH ..." statements are very useful if you
modify the script and load it again.

Install the parser:

Open a terminal and switch to the directory you just created. For example:
cd /home/user/Desktop/test_parser
type: sudo PATH=/usr/local/pgsql/bin:$PATH USE_PGXS=1 make install

Switch to user "postgres", then type: /usr/local/pgsql/bin/psql
-f /usr/local/pgsql/share/contrib/test_parser.sql your-database

SET

BEGIN

CREATE FUNCTION

CREATE FUNCTION

CREATE FUNCTION

CREATE FUNCTION

DROP TEXT SEARCH CONFIGURATION

DROP TEXT SEARCH PARSER

CREATE TEXT SEARCH PARSER

CREATE TEXT SEARCH CONFIGURATION

ALTER TEXT SEARCH CONFIGURATION

COMMIT

type: /usr/local/pgsql/bin/psql postgres

test the parser:

postgres=# SELECT
to_tsvector('testcfg','itemValue');

to_tsvector

--------------------

'item':1 'value':2

(1 row)

postgres=# SELECT to_tsvector('testcfg','ItemValue');

to_tsvector

--------------------

'item':1 'value':2

(1 row)

postgres=# SELECT to_tsquery('testcfg','itemsValue');

to_tsquery

-------------------

'items' & 'value'

(1 row)

　　
Reference:

[1]
http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/HOWTO-parser-tsearch2.html

[2]
http://www.pgcon.org/2007/schedule/attachments/12-fts.pdf

[3]
http://en.wikipedia.org/wiki/CamelCase

[4]
http://www.sai.msu.su/~megera/postgres/fts/doc/sql-fts-createmap.html

[5]
http://www.postgresql.org/docs/8.3/static/textsearch-configuration.html

[6]
http://developer.postgresql.org/pgdocs/postgres/sql-createtsconfig.html

[7]
http://www.postgresql.org/docs/7.4/interactive/sql-createfunction.html

账号		自动登录	找回密码
密码			立即注册

大疆运维招人啦，

C++ :try 语句块和异常处理

C++的多态

Red Hat RHCE 8 (EX294) Cert Guide

Java/C++ 区别：看完这一篇，就够用！

别再用过时库了！这 13 个顶级 C++ 库才是

c++ size_t 和 int 的区别

[经验分享] 如何为Postgresql数据库全文搜索(full text search)编写解析器(parser)

浏览过的版块

扫码加入运维网微信交流群