/*------------------------------------------------------------------------- * * tsvector2_json.c * * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group * Portions Copyright (c) 2018, PostgresPro * *------------------------------------------------------------------------- */ #include "postgres.h" #include "tsearch/ts_cache.h" #include "utils/builtins.h" #include "utils/jsonb.h" #include "utils/jsonapi.h" #include "tsearch/ts_utils.h" #include "tsvector2.h" typedef struct TSVectorBuildState { ParsedText *prs; Oid cfgId; } TSVectorBuildState; typedef void (*action_function) (void *state, char *elem_value, int elem_len); static void add_to_tsvector2(void *_state, char *elem_value, int elem_len); #ifdef __GNUC__ __attribute__((unused)) #endif static void collect_jsonb_strings(Jsonb *jb, void *state, action_function action) { JsonbIterator *it; JsonbValue v; JsonbIteratorToken type; it = JsonbIteratorInit(&jb->root); while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE) { if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString) { action(state, v.val.string.val, v.val.string.len); } } } struct parseState { action_function action; void *action_state; }; static void iterate_string_values_scalar(void *state, char *token, JsonTokenType tokentype) { struct parseState *pstate = (struct parseState *) state; if (tokentype == JSON_TOKEN_STRING) (pstate->action) (pstate->action_state, token, strlen(token)); } #ifdef __GNUC__ __attribute__((unused)) #endif static void collect_json_strings(text *json, void *action_state, action_function action) { JsonLexContext *lex = makeJsonLexContext(json, true); JsonSemAction *sem = palloc0(sizeof(JsonSemAction)); struct parseState *state = palloc0(sizeof(struct parseState)); state->action = action; state->action_state = action_state; sem->semstate = (void *) state; sem->scalar = iterate_string_values_scalar; pg_parse_json(lex, sem); } /* * Worker function for jsonb(_string)_to_tsvector2(_byid) */ static TSVector2 jsonb_to_tsvector2_worker(Oid cfgId, Jsonb *jb, uint32 flags) { TSVectorBuildState state; ParsedText prs; prs.words = NULL; prs.curwords = 0; state.prs = &prs; state.cfgId = cfgId; #if PG_VERSION_NUM >= 110000 iterate_jsonb_values(jb, flags, &state, add_to_tsvector2); #else collect_jsonb_strings(jb, &state, add_to_tsvector2); #endif return make_tsvector2(&prs); } PG_FUNCTION_INFO_V1(jsonb_string_to_tsvector2_byid); Datum jsonb_string_to_tsvector2_byid(PG_FUNCTION_ARGS) { Oid cfgId = PG_GETARG_OID(0); Jsonb *jb = PG_GETARG_JSONB_P(1); TSVector2 result; result = jsonb_to_tsvector2_worker(cfgId, jb, jtiString); PG_FREE_IF_COPY(jb, 1); PG_RETURN_TSVECTOR2(result); } PG_FUNCTION_INFO_V1(jsonb_string_to_tsvector2); Datum jsonb_string_to_tsvector2(PG_FUNCTION_ARGS) { Jsonb *jb = PG_GETARG_JSONB_P(0); Oid cfgId; TSVector2 result; cfgId = getTSCurrentConfig(true); result = jsonb_to_tsvector2_worker(cfgId, jb, jtiString); PG_FREE_IF_COPY(jb, 0); PG_RETURN_TSVECTOR2(result); } PG_FUNCTION_INFO_V1(jsonb_to_tsvector2_byid); Datum jsonb_to_tsvector2_byid(PG_FUNCTION_ARGS) { Oid cfgId = PG_GETARG_OID(0); Jsonb *jb = PG_GETARG_JSONB_P(1); Jsonb *jbFlags = PG_GETARG_JSONB_P(2); TSVector2 result; uint32 flags = 0; #if PG_VERSION_NUM >= 110000 flags = parse_jsonb_index_flags(jbFlags); #endif result = jsonb_to_tsvector2_worker(cfgId, jb, flags); PG_FREE_IF_COPY(jb, 1); PG_FREE_IF_COPY(jbFlags, 2); PG_RETURN_TSVECTOR2(result); } PG_FUNCTION_INFO_V1(jsonb_to_tsvector2); Datum jsonb_to_tsvector2(PG_FUNCTION_ARGS) { Jsonb *jb = PG_GETARG_JSONB_P(0); Jsonb *jbFlags = PG_GETARG_JSONB_P(1); Oid cfgId; TSVector2 result; uint32 flags = 0; #if PG_VERSION_NUM >= 110000 flags = parse_jsonb_index_flags(jbFlags); #endif cfgId = getTSCurrentConfig(true); result = jsonb_to_tsvector2_worker(cfgId, jb, flags); PG_FREE_IF_COPY(jb, 0); PG_FREE_IF_COPY(jbFlags, 1); PG_RETURN_TSVECTOR2(result); } /* * Worker function for json(_string)_to_tsvector2(_byid) */ static TSVector2 json_to_tsvector2_worker(Oid cfgId, text *json, uint32 flags) { TSVectorBuildState state; ParsedText prs; prs.words = NULL; prs.curwords = 0; state.prs = &prs; state.cfgId = cfgId; #if PG_VERSION_NUM >= 110000 iterate_json_values(json, flags, &state, add_to_tsvector2); #else collect_json_strings(json, &state, add_to_tsvector2); #endif return make_tsvector2(&prs); } PG_FUNCTION_INFO_V1(json_string_to_tsvector2_byid); Datum json_string_to_tsvector2_byid(PG_FUNCTION_ARGS) { Oid cfgId = PG_GETARG_OID(0); text *json = PG_GETARG_TEXT_P(1); TSVector2 result; result = json_to_tsvector2_worker(cfgId, json, jtiString); PG_FREE_IF_COPY(json, 1); PG_RETURN_TSVECTOR2(result); } PG_FUNCTION_INFO_V1(json_string_to_tsvector2); Datum json_string_to_tsvector2(PG_FUNCTION_ARGS) { text *json = PG_GETARG_TEXT_P(0); Oid cfgId; TSVector2 result; cfgId = getTSCurrentConfig(true); result = json_to_tsvector2_worker(cfgId, json, jtiString); PG_FREE_IF_COPY(json, 0); PG_RETURN_TSVECTOR2(result); } PG_FUNCTION_INFO_V1(json_to_tsvector2_byid); Datum json_to_tsvector2_byid(PG_FUNCTION_ARGS) { Oid cfgId = PG_GETARG_OID(0); text *json = PG_GETARG_TEXT_P(1); Jsonb *jbFlags = PG_GETARG_JSONB_P(2); TSVector2 result; uint32 flags = 0; #if PG_VERSION_NUM >= 110000 flags = parse_jsonb_index_flags(jbFlags); #endif result = json_to_tsvector2_worker(cfgId, json, flags); PG_FREE_IF_COPY(json, 1); PG_FREE_IF_COPY(jbFlags, 2); PG_RETURN_TSVECTOR2(result); } PG_FUNCTION_INFO_V1(json_to_tsvector2); Datum json_to_tsvector2(PG_FUNCTION_ARGS) { text *json = PG_GETARG_TEXT_P(0); Jsonb *jbFlags = PG_GETARG_JSONB_P(1); Oid cfgId; TSVector2 result; uint32 flags = 0; #if PG_VERSION_NUM >= 110000 flags = parse_jsonb_index_flags(jbFlags); #endif cfgId = getTSCurrentConfig(true); result = json_to_tsvector2_worker(cfgId, json, flags); PG_FREE_IF_COPY(json, 0); PG_FREE_IF_COPY(jbFlags, 1); PG_RETURN_TSVECTOR2(result); } /* * to_tsvector */ static int compareWORD(const void *a, const void *b) { int res; res = ts2_compare_string( ((const ParsedWord *) a)->word, ((const ParsedWord *) a)->len, ((const ParsedWord *) b)->word, ((const ParsedWord *) b)->len, false); if (res == 0) { if (((const ParsedWord *) a)->pos.pos == ((const ParsedWord *) b)->pos.pos) return 0; res = (((const ParsedWord *) a)->pos.pos > ((const ParsedWord *) b)->pos.pos) ? 1 : -1; } return res; } static int uniqueWORD(ParsedWord *a, int32 l) { ParsedWord *ptr, *res; int tmppos; if (l == 1) { tmppos = LIMITPOS(a->pos.pos); a->alen = 2; a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); a->pos.apos[0] = 1; a->pos.apos[1] = tmppos; return l; } res = a; ptr = a + 1; /* * Sort words with its positions */ qsort((void *) a, l, sizeof(ParsedWord), compareWORD); /* * Initialize first word and its first position */ tmppos = LIMITPOS(a->pos.pos); a->alen = 2; a->pos.apos = (uint16 *) palloc(sizeof(uint16) * a->alen); a->pos.apos[0] = 1; a->pos.apos[1] = tmppos; /* * Summarize position information for each word */ while (ptr - a < l) { if (!(ptr->len == res->len && strncmp(ptr->word, res->word, res->len) == 0)) { /* * Got a new word, so put it in result */ res++; res->len = ptr->len; res->word = ptr->word; tmppos = LIMITPOS(ptr->pos.pos); res->alen = 2; res->pos.apos = (uint16 *) palloc(sizeof(uint16) * res->alen); res->pos.apos[0] = 1; res->pos.apos[1] = tmppos; } else { /* * The word already exists, so adjust position information. But * before we should check size of position's array, max allowed * value for position and uniqueness of position */ pfree(ptr->word); if (res->pos.apos[0] < MAXNUMPOS - 1 && res->pos.apos[res->pos.apos[0]] != MAXENTRYPOS - 1 && res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) { if (res->pos.apos[0] + 1 >= res->alen) { res->alen *= 2; res->pos.apos = (uint16 *) repalloc(res->pos.apos, sizeof(uint16) * res->alen); } if (res->pos.apos[0] == 0 || res->pos.apos[res->pos.apos[0]] != LIMITPOS(ptr->pos.pos)) { res->pos.apos[res->pos.apos[0] + 1] = LIMITPOS(ptr->pos.pos); res->pos.apos[0]++; } } } ptr++; } return res + 1 - a; } /* * Parse lexemes in an element of a json(b) value, add to TSVectorBuildState. */ static void add_to_tsvector2(void *_state, char *elem_value, int elem_len) { TSVectorBuildState *state = (TSVectorBuildState *) _state; ParsedText *prs = state->prs; int32 prevwords; if (prs->words == NULL) { /* * First time through: initialize words array to a reasonable size. * (parsetext() will realloc it bigger as needed.) */ prs->lenwords = 16; prs->words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs->lenwords); prs->curwords = 0; prs->pos = 0; } prevwords = prs->curwords; parsetext(state->cfgId, prs, elem_value, elem_len); /* * If we extracted any words from this JSON element, advance pos to create * an artificial break between elements. This is because we don't want * phrase searches to think that the last word in this element is adjacent * to the first word in the next one. */ if (prs->curwords > prevwords) prs->pos += 1; } /* * make value of tsvector, given parsed text * * Note: frees prs->words and subsidiary data. */ TSVector2 make_tsvector2(void *prs1) { int i, lenstr = 0, totallen, stroff = 0; TSVector2 in; ParsedText *prs = (ParsedText *) prs1; /* Merge duplicate words */ if (prs->curwords > 0) prs->curwords = uniqueWORD(prs->words, prs->curwords); /* Determine space needed */ for (i = 0; i < prs->curwords; i++) { int npos = prs->words[i].alen ? prs->words[i].pos.apos[0] : 0; INCRSIZE(lenstr, i, prs->words[i].len, npos); } if (lenstr > MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("string is too long for tsvector (%d bytes, max %d bytes)", lenstr, MAXSTRPOS))); totallen = CALCDATASIZE(prs->curwords, lenstr); in = (TSVector2) palloc0(totallen); SET_VARSIZE(in, totallen); in->size = prs->curwords; for (i = 0; i < prs->curwords; i++) { int npos = 0; if (prs->words[i].alen) npos = prs->words[i].pos.apos[0]; tsvector2_addlexeme(in, i, &stroff, prs->words[i].word, prs->words[i].len, prs->words[i].pos.apos + 1, npos); pfree(prs->words[i].word); if (prs->words[i].alen) pfree(prs->words[i].pos.apos); } if (prs->words) pfree(prs->words); return in; } PG_FUNCTION_INFO_V1(to_tsvector2_byid); Datum to_tsvector2_byid(PG_FUNCTION_ARGS) { Oid cfgId = PG_GETARG_OID(0); text *in = PG_GETARG_TEXT_PP(1); ParsedText prs; TSVector2 out; prs.lenwords = VARSIZE_ANY_EXHDR(in) / 6; /* just estimation of word's * number */ if (prs.lenwords < 2) prs.lenwords = 2; prs.curwords = 0; prs.pos = 0; prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); parsetext(cfgId, &prs, VARDATA_ANY(in), VARSIZE_ANY_EXHDR(in)); PG_FREE_IF_COPY(in, 1); out = make_tsvector2(&prs); PG_RETURN_TSVECTOR2(out); } PG_FUNCTION_INFO_V1(to_tsvector2); Datum to_tsvector2(PG_FUNCTION_ARGS) { text *in = PG_GETARG_TEXT_PP(0); Oid cfgId; cfgId = getTSCurrentConfig(true); PG_RETURN_DATUM(DirectFunctionCall2(to_tsvector2_byid, ObjectIdGetDatum(cfgId), PointerGetDatum(in))); }