/*------------------------------------------------------------------------- * * tsvector2_op.c * operations over tsvector2 * * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group * Portions Copyright (c) 2018, PostgresPro * *------------------------------------------------------------------------- */ #include "postgres.h" #include #include "access/htup_details.h" #include "catalog/namespace.h" #include "catalog/pg_type.h" #include "commands/trigger.h" #include "executor/spi.h" #include "funcapi.h" #include "mb/pg_wchar.h" #include "miscadmin.h" #include "parser/parse_coerce.h" #include "utils/builtins.h" #include "utils/lsyscache.h" #if PG_VERSION_NUM >= 100000 #include "utils/regproc.h" #endif #include "utils/rel.h" #include "tsearch/ts_utils.h" #include "tsvector2.h" typedef struct { TSVector2 vec; int bidx; int eidx; char *operand; } CHKVAL; typedef struct StatEntry { uint32 ndoc; /* zero indicates that we were already here * while walking through the tree */ uint32 nentry; struct StatEntry *left; struct StatEntry *right; uint32 lenlexeme; char lexeme[FLEXIBLE_ARRAY_MEMBER]; } StatEntry; #define STATENTRYHDRSZ (offsetof(StatEntry, lexeme)) typedef struct { int32 weight; uint32 maxdepth; StatEntry **stack; uint32 stackpos; StatEntry *root; } TSVectorStat; #define STATHDRSIZE (offsetof(TSVectorStat, data)) static int tsvector2_bsearch(const TSVector2 tsv, char *lexeme, int lexeme_len); PG_FUNCTION_INFO_V1(tsvector2_match_tsquery); /* * Order: npos, len, word, for all positions (pos, weight) */ static int silly_cmp_tsvector2(const TSVector2 a, const TSVector2 b) { if (VARSIZE(a) < VARSIZE(b)) return -1; else if (VARSIZE(a) > VARSIZE(b)) return 1; else if (a->size < b->size) return -1; else if (a->size > b->size) return 1; else { WordEntry2 *aptr = tsvector2_entries(a); WordEntry2 *bptr = tsvector2_entries(b); int i = 0; int res; uint32 pos1, pos2; INITPOS(pos1); INITPOS(pos2); for (i = 0; i < a->size; i++) { char *lex1 = tsvector2_storage(a) + pos1, *lex2 = tsvector2_storage(b) + pos2; int npos1 = ENTRY_NPOS(a, aptr), npos2 = ENTRY_NPOS(b, bptr); int len1 = ENTRY_LEN(a, aptr), len2 = ENTRY_LEN(b, bptr); if ((npos1 == 0 || npos2 == 0) && npos1 != npos2) return npos1 > npos2 ? -1 : 1; else if ((res = ts2_compare_string(lex1, len1, lex2, len2, false)) != 0) { return res; } else if (npos1 > 0) { WordEntryPos *ap, *bp; int j; ap = get_lexeme_positions(lex1, len1); bp = get_lexeme_positions(lex2, len2); if (npos1 != npos2) return (npos1 > npos2) ? -1 : 1; for (j = 0; j < npos1; j++) { if (WEP_GETPOS(*ap) != WEP_GETPOS(*bp)) { return (WEP_GETPOS(*ap) > WEP_GETPOS(*bp)) ? -1 : 1; } else if (WEP_GETWEIGHT(*ap) != WEP_GETWEIGHT(*bp)) { return (WEP_GETWEIGHT(*ap) > WEP_GETWEIGHT(*bp)) ? -1 : 1; } ap++, bp++; } } INCRPTR(a, aptr, pos1); INCRPTR(b, bptr, pos2); } } return 0; } #define TSVECTOR2CMPFUNC( type, action, ret ) \ PG_FUNCTION_INFO_V1(tsvector2_##type); \ Datum \ tsvector2_##type(PG_FUNCTION_ARGS) \ { \ TSVector2 a = PG_GETARG_TSVECTOR2(0); \ TSVector2 b = PG_GETARG_TSVECTOR2(1); \ int res = silly_cmp_tsvector2(a, b); \ PG_FREE_IF_COPY(a,0); \ PG_FREE_IF_COPY(b,1); \ PG_RETURN_##ret( res action 0 ); \ } \ /* keep compiler quiet - no extra ; */ \ extern int no_such_variable TSVECTOR2CMPFUNC(lt, <, BOOL); TSVECTOR2CMPFUNC(le, <=, BOOL); TSVECTOR2CMPFUNC(eq, ==, BOOL); TSVECTOR2CMPFUNC(ge, >=, BOOL); TSVECTOR2CMPFUNC(gt, >, BOOL); TSVECTOR2CMPFUNC(ne, !=, BOOL); TSVECTOR2CMPFUNC(cmp, +, INT32); PG_FUNCTION_INFO_V1(tsvector2_strip); Datum tsvector2_strip(PG_FUNCTION_ARGS) { TSVector2 in = PG_GETARG_TSVECTOR2(0); TSVector2 out; int i, count, posout = 0, pos, len = 0; WordEntry2 *entryin = tsvector2_entries(in); count = in->size; for (i = 0; i < count; i++) INCRSIZE(len, i, ENTRY_LEN(in, tsvector2_entries(in) + i), 0); len = CALCDATASIZE(count, len); out = (TSVector2) palloc0(len); SET_VARSIZE(out, len); out->size = count; INITPOS(pos); for (i = 0; i < count; i++) { tsvector2_addlexeme(out, i, &posout, tsvector2_storage(in) + pos, ENTRY_LEN(in, entryin), NULL, 0); INCRPTR(in, entryin, pos); } PG_FREE_IF_COPY(in, 0); PG_RETURN_POINTER(out); } PG_FUNCTION_INFO_V1(tsvector2_length); Datum tsvector2_length(PG_FUNCTION_ARGS) { TSVector2 in = PG_GETARG_TSVECTOR2(0); int32 ret = in->size; PG_FREE_IF_COPY(in, 0); PG_RETURN_INT32(ret); } PG_FUNCTION_INFO_V1(tsvector2_setweight); Datum tsvector2_setweight(PG_FUNCTION_ARGS) { TSVector2 in = PG_GETARG_TSVECTOR2(0); char cw = PG_GETARG_CHAR(1); TSVector2 out; int i; WordEntry2 *weptr; int w = 0; uint32 pos; switch (cw) { case 'A': case 'a': w = 3; break; case 'B': case 'b': w = 2; break; case 'C': case 'c': w = 1; break; case 'D': case 'd': w = 0; break; default: /* internal error */ elog(ERROR, "unrecognized weight: %d", cw); } out = (TSVector2) palloc(VARSIZE(in)); memcpy(out, in, VARSIZE(in)); weptr = tsvector2_entries(out); INITPOS(pos); for (i = 0; i < out->size; i++) { int j, npos = ENTRY_NPOS(out, weptr); if (npos) { WordEntryPos *p = get_lexeme_positions(tsvector2_storage(out) + pos, ENTRY_LEN(out, weptr)); for (j = 0; j < npos; j++) WEP_SETWEIGHT(p[j], w); } INCRPTR(out, weptr, pos); } PG_FREE_IF_COPY(in, 0); PG_RETURN_POINTER(out); } /* * setweight(tsin tsvector2, char_weight "char", lexemes "text"[]) * * Assign weight w to elements of tsin that are listed in lexemes. */ PG_FUNCTION_INFO_V1(tsvector2_setweight_by_filter); Datum tsvector2_setweight_by_filter(PG_FUNCTION_ARGS) { TSVector2 tsin = PG_GETARG_TSVECTOR2(0); char char_weight = PG_GETARG_CHAR(1); ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(2); TSVector2 tsout; int i, nlexemes, weight; Datum *dlexemes; bool *nulls; switch (char_weight) { case 'A': case 'a': weight = 3; break; case 'B': case 'b': weight = 2; break; case 'C': case 'c': weight = 1; break; case 'D': case 'd': weight = 0; break; default: /* internal error */ elog(ERROR, "unrecognized weight: %c", char_weight); } tsout = (TSVector2) palloc(VARSIZE(tsin)); memcpy(tsout, tsin, VARSIZE(tsin)); deconstruct_array(lexemes, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nlexemes); /* * Assuming that lexemes array is significantly shorter than tsvector2 we * can iterate through lexemes performing binary search of each lexeme * from lexemes in tsvector2. */ for (i = 0; i < nlexemes; i++) { char *lex; int lex_len, lex_idx, npos; if (nulls[i]) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); lex = VARDATA(dlexemes[i]); lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; lex_idx = tsvector2_bsearch(tsin, lex, lex_len); npos = ENTRY_NPOS(tsin, tsvector2_entries(tsout) + lex_idx); if (lex_idx >= 0 && npos > 0) { int j; WordEntry2 *we; char *lexeme = tsvector2_getlexeme(tsout, lex_idx, &we); WordEntryPos *p = get_lexeme_positions(lexeme, we->len); for (j = 0; j < npos; j++) WEP_SETWEIGHT(p[j], weight); } } PG_FREE_IF_COPY(tsin, 0); PG_FREE_IF_COPY(lexemes, 2); PG_RETURN_POINTER(tsout); } #define compareEntry(pa, a, pb, b) \ ts2_compare_string((pa) + (a)->pos, (a)->len, \ (pb) + (b)->pos, (b)->len, \ false) /* * Add positions from src to dest after offsetting them by maxpos. * Return the number added (might be less than expected due to overflow) */ static int32 add_pos(char *src, WordEntry2 *srcptr, WordEntryPos *dest, int from, int32 maxpos) { uint16 clen = from; int i; uint16 slen = srcptr->npos; WordEntryPos *spos = get_lexeme_positions(src, srcptr->len); Assert(!srcptr->hasoff); for (i = 0; i < slen && clen < MAXNUMPOS && (clen == 0 || WEP_GETPOS(dest[clen - 1]) != MAXENTRYPOS - 1); i++) { WEP_SETWEIGHT(dest[clen], WEP_GETWEIGHT(spos[i])); WEP_SETPOS(dest[clen], LIMITPOS(WEP_GETPOS(spos[i]) + maxpos)); clen++; } return clen - from; } /* * Perform binary search of given lexeme in TSVector2. * Returns lexeme position in TSVector2's entry array or -1 if lexeme wasn't * found. */ static int tsvector2_bsearch(const TSVector2 tsv, char *lexeme, int lexeme_len) { int StopLow = 0, StopHigh = tsv->size, StopMiddle, cmp; while (StopLow < StopHigh) { WordEntry2 *entry = NULL; char *str; StopMiddle = (StopLow + StopHigh) / 2; str = tsvector2_getlexeme(tsv, StopMiddle, &entry); cmp = ts2_compare_string(lexeme, lexeme_len, str, entry->len, false); if (cmp < 0) StopHigh = StopMiddle; else if (cmp > 0) StopLow = StopMiddle + 1; else /* found it */ return StopMiddle; } return -1; } /* * qsort comparator functions */ static int compare_int(const void *va, const void *vb) { int a = *((const int *) va); int b = *((const int *) vb); if (a == b) return 0; return (a > b) ? 1 : -1; } static int compare_text_lexemes(const void *va, const void *vb) { Datum a = *((const Datum *) va); Datum b = *((const Datum *) vb); char *alex = VARDATA_ANY(a); int alex_len = VARSIZE_ANY_EXHDR(a); char *blex = VARDATA_ANY(b); int blex_len = VARSIZE_ANY_EXHDR(b); return ts2_compare_string(alex, alex_len, blex, blex_len, false); } /* * Internal routine to delete lexemes from TSVector2 by array of offsets. * * int *indices_to_delete -- array of lexeme offsets to delete (modified here!) * int indices_count -- size of that array * * Returns new TSVector2 without given lexemes along with their positions * and weights. */ static TSVector2 tsvector2_delete_by_indices(TSVector2 tsv, int *indices_to_delete, int indices_count) { TSVector2 tsout; WordEntry2 *ptr = tsvector2_entries(tsv); int i, /* index in input tsvector2 */ j, /* index in output tsvector2 */ k, /* index in indices_to_delete */ curoff = 0, /* index in data area of output */ pos; /* * Sort the filter array to simplify membership checks below. Also, get * rid of any duplicate entries, so that we can assume that indices_count * is exactly equal to the number of lexemes that will be removed. */ if (indices_count > 1) { int kp; qsort(indices_to_delete, indices_count, sizeof(int), compare_int); kp = 0; for (k = 1; k < indices_count; k++) { if (indices_to_delete[k] != indices_to_delete[kp]) indices_to_delete[++kp] = indices_to_delete[k]; } indices_count = ++kp; } /* * Here we overestimate tsout size, since we don't know how much space is * used by the deleted lexeme(s). We will set exact size below. */ tsout = (TSVector2) palloc0(VARSIZE(tsv)); /* This count must be correct because tsvector2_storage(tsout) relies on it. */ tsout->size = tsv->size - indices_count; /* * Copy tsv to tsout, skipping lexemes listed in indices_to_delete. */ INITPOS(pos); for (i = j = k = 0; i < tsv->size; i++) { char *lex = tsvector2_storage(tsv) + pos; int lex_len = ENTRY_LEN(tsv, ptr); /* * If current i is present in indices_to_delete, skip this lexeme. * Since indices_to_delete is already sorted, we only need to check * the current (k'th) entry. */ if (k < indices_count && i == indices_to_delete[k]) { k++; goto next; } tsvector2_addlexeme(tsout, j++, &curoff, lex, lex_len, get_lexeme_positions(lex, lex_len), ENTRY_NPOS(tsv, ptr)); next: INCRPTR(tsv, ptr, pos); } /* * k should now be exactly equal to indices_count. If it isn't then the * caller provided us with indices outside of [0, tsv->size) range and * estimation of tsout's size is wrong. */ Assert(k == indices_count); SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, curoff)); return tsout; } /* * Delete given lexeme from tsvector2. * Implementation of user-level ts_delete(tsvector2, text). */ PG_FUNCTION_INFO_V1(tsvector2_delete_str); Datum tsvector2_delete_str(PG_FUNCTION_ARGS) { TSVector2 tsin = PG_GETARG_TSVECTOR2(0), tsout; text *tlexeme = PG_GETARG_TEXT_PP(1); char *lexeme = VARDATA_ANY(tlexeme); int lexeme_len = VARSIZE_ANY_EXHDR(tlexeme), skip_index; if ((skip_index = tsvector2_bsearch(tsin, lexeme, lexeme_len)) == -1) PG_RETURN_POINTER(tsin); tsout = tsvector2_delete_by_indices(tsin, &skip_index, 1); PG_FREE_IF_COPY(tsin, 0); PG_FREE_IF_COPY(tlexeme, 1); PG_RETURN_POINTER(tsout); } /* * Delete given array of lexemes from tsvector2. * Implementation of user-level ts_delete(tsvector2, text[]). */ PG_FUNCTION_INFO_V1(tsvector2_delete_arr); Datum tsvector2_delete_arr(PG_FUNCTION_ARGS) { TSVector2 tsin = PG_GETARG_TSVECTOR2(0), tsout; ArrayType *lexemes = PG_GETARG_ARRAYTYPE_P(1); int i, nlex, skip_count, *skip_indices; Datum *dlexemes; bool *nulls; deconstruct_array(lexemes, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nlex); /* * In typical use case array of lexemes to delete is relatively small. So * here we optimize things for that scenario: iterate through lexarr * performing binary search of each lexeme from lexarr in tsvector2. */ skip_indices = palloc0(nlex * sizeof(int)); for (i = skip_count = 0; i < nlex; i++) { char *lex; int lex_len, lex_pos; if (nulls[i]) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); lex = VARDATA(dlexemes[i]); lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; lex_pos = tsvector2_bsearch(tsin, lex, lex_len); if (lex_pos >= 0) skip_indices[skip_count++] = lex_pos; } tsout = tsvector2_delete_by_indices(tsin, skip_indices, skip_count); pfree(skip_indices); PG_FREE_IF_COPY(tsin, 0); PG_FREE_IF_COPY(lexemes, 1); PG_RETURN_POINTER(tsout); } /* * Expand tsvector2 as table with following columns: * lexeme: lexeme text * positions: integer array of lexeme positions * weights: char array of weights corresponding to positions */ PG_FUNCTION_INFO_V1(tsvector2_unnest); Datum tsvector2_unnest(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; TSVector2 tsin; uint32 pos; if (SRF_IS_FIRSTCALL()) { MemoryContext oldcontext; TupleDesc tupdesc; funcctx = SRF_FIRSTCALL_INIT(); oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); tupdesc = CreateTemplateTupleDesc(3, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "lexeme", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "positions", INT2ARRAYOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "weights", TEXTARRAYOID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); INITPOS(pos); funcctx->user_fctx = list_make2(PG_GETARG_TSVECTOR2(0), makeInteger(pos)); MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); tsin = (TSVector2) linitial(funcctx->user_fctx); pos = intVal(lsecond(funcctx->user_fctx)); if (funcctx->call_cntr < tsin->size) { WordEntry2 *entry = tsvector2_entries(tsin) + funcctx->call_cntr; char *data = tsvector2_storage(tsin); HeapTuple tuple; int j, npos = ENTRY_NPOS(tsin, entry), lex_len = ENTRY_LEN(tsin, entry); bool nulls[] = {false, false, false}; Datum values[3]; values[0] = PointerGetDatum( cstring_to_text_with_len(data + pos, lex_len)); if (npos) { WordEntryPos *apos = get_lexeme_positions(data + pos, lex_len); Datum *positions; Datum *weights; char weight; /* * Internally tsvector2 stores position and weight in the same * uint16 (2 bits for weight, 14 for position). Here we extract * that in two separate arrays. */ positions = palloc(npos * sizeof(Datum)); weights = palloc(npos * sizeof(Datum)); for (j = 0; j < npos; j++) { positions[j] = Int16GetDatum(WEP_GETPOS(apos[j])); weight = 'D' - WEP_GETWEIGHT(apos[j]); weights[j] = PointerGetDatum( cstring_to_text_with_len(&weight, 1) ); } values[1] = PointerGetDatum( construct_array(positions, npos, INT2OID, 2, true, 's')); values[2] = PointerGetDatum( construct_array(weights, npos, TEXTOID, -1, false, 'i')); } else { nulls[1] = nulls[2] = true; } INCRPTR(tsin, entry, intVal(lsecond(funcctx->user_fctx))); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); SRF_RETURN_NEXT(funcctx, HeapTupleGetDatum(tuple)); } else { pfree(tsin); SRF_RETURN_DONE(funcctx); } } /* * Convert tsvector2 to array of lexemes. */ PG_FUNCTION_INFO_V1(tsvector2_to_array); Datum tsvector2_to_array(PG_FUNCTION_ARGS) { TSVector2 tsin = PG_GETARG_TSVECTOR2(0); WordEntry2 *entry = tsvector2_entries(tsin); Datum *elements; int i; ArrayType *array; long pos; elements = palloc(tsin->size * sizeof(Datum)); INITPOS(pos); for (i = 0; i < tsin->size; i++) { elements[i] = PointerGetDatum( cstring_to_text_with_len(tsvector2_storage(tsin) + pos, ENTRY_LEN(tsin, entry))); INCRPTR(tsin, entry, pos); } array = construct_array(elements, tsin->size, TEXTOID, -1, false, 'i'); pfree(elements); PG_FREE_IF_COPY(tsin, 0); PG_RETURN_POINTER(array); } /* * Returns offset by given index in TSVector2, * this function used when we need random access */ int tsvector2_getoffset(TSVector2 vec, int idx, WordEntry2 **we) { int offset = 0; WordEntry2 *entry; entry = tsvector2_entries(vec) + idx; if (we) *we = entry; while (!entry->hasoff) { entry--; if (!entry->hasoff) offset += SHORTALIGN(entry->len) + entry->npos * sizeof(WordEntryPos); } Assert(entry >= tsvector2_entries(vec)); if (idx % TS_OFFSET_STRIDE) { /* if idx is by offset */ WordEntry2 *offset_entry = (WordEntry2 *) (tsvector2_storage(vec) + entry->offset); offset += entry->offset + sizeof(WordEntry2); offset += SHORTALIGN(offset_entry->len) + offset_entry->npos * sizeof(WordEntryPos); } else { Assert(entry == tsvector2_entries(vec) + idx); if (we) *we = (WordEntry2 *) (tsvector2_storage(vec) + entry->offset); offset = entry->offset + sizeof(WordEntry2); } return offset; } /* * Add lexeme and its positions to tsvector2 and move dataoff (offset where * data should be added) to new position. * Returns pointer to lexeme start */ char * tsvector2_addlexeme(TSVector2 tsv, int idx, int *dataoff, char *lexeme, int lexeme_len, WordEntryPos *pos, int npos) { int stroff; WordEntry2 *entry; char *result; /* when idx is 0, dataoff should be 0 too, and otherwise */ Assert(!((idx == 0) ^ (*dataoff == 0))); stroff = *dataoff; entry = tsvector2_entries(tsv) + idx; if (idx % TS_OFFSET_STRIDE == 0) { /* WordEntry2 with offset */ WordEntry2 offentry; stroff = INTALIGN(stroff); entry->hasoff = 1; entry->offset = stroff; /* fill WordEntry2 for offset */ offentry.hasoff = 0; offentry.len = lexeme_len; offentry.npos = npos; memcpy(tsvector2_storage(tsv) + stroff, &offentry, sizeof(WordEntry2)); stroff += sizeof(WordEntry2); } else { stroff = SHORTALIGN(stroff); entry->hasoff = 0; entry->len = lexeme_len; entry->npos = npos; } memcpy(tsvector2_storage(tsv) + stroff, lexeme, lexeme_len); result = tsvector2_storage(tsv) + stroff; stroff += lexeme_len; if (npos) { if (npos > 0xFFFF) elog(ERROR, "positions array too long"); /* * Pad to 2-byte alignment if necessary. We don't know how memory was * allocated, so in case of aligning we need to make sure that unused * is zero. */ if (stroff != SHORTALIGN(stroff)) { *(tsvector2_storage(tsv) + stroff) = '\0'; stroff = SHORTALIGN(stroff); } /* Copy positions */ if (pos) memcpy(tsvector2_storage(tsv) + stroff, pos, npos * sizeof(WordEntryPos)); stroff += npos * sizeof(WordEntryPos); } *dataoff = stroff; return result; } /* * Build tsvector2 from array of lexemes. */ PG_FUNCTION_INFO_V1(array_to_tsvector2); Datum array_to_tsvector2(PG_FUNCTION_ARGS) { ArrayType *v = PG_GETARG_ARRAYTYPE_P(0); TSVector2 tsout; Datum *dlexemes; bool *nulls; int nitems, i, j, tslen, cur = 0, datalen = 0; deconstruct_array(v, TEXTOID, -1, false, 'i', &dlexemes, &nulls, &nitems); /* Reject nulls (maybe we should just ignore them, instead?) */ for (i = 0; i < nitems; i++) { if (nulls[i]) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("lexeme array may not contain nulls"))); } /* Sort and de-dup, because this is required for a valid tsvector2. */ if (nitems > 1) { qsort(dlexemes, nitems, sizeof(Datum), compare_text_lexemes); j = 0; for (i = 1; i < nitems; i++) { if (compare_text_lexemes(&dlexemes[j], &dlexemes[i]) < 0) dlexemes[++j] = dlexemes[i]; } nitems = ++j; } /* Calculate space needed for surviving lexemes. */ for (i = 0; i < nitems; i++) { int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; INCRSIZE(datalen, i, lex_len, 0); } tslen = CALCDATASIZE(nitems, datalen); /* Allocate and fill tsvector2. */ tsout = (TSVector2) palloc0(tslen); SET_VARSIZE(tsout, tslen); tsout->size = nitems; for (i = 0; i < nitems; i++) { char *lex = VARDATA(dlexemes[i]); int lex_len = VARSIZE(dlexemes[i]) - VARHDRSZ; tsvector2_addlexeme(tsout, i, &cur, lex, lex_len, NULL, 0); } PG_FREE_IF_COPY(v, 0); PG_RETURN_POINTER(tsout); } /* * ts_filter(): keep only lexemes with given weights in tsvector2. */ PG_FUNCTION_INFO_V1(tsvector2_filter); Datum tsvector2_filter(PG_FUNCTION_ARGS) { TSVector2 tsin = PG_GETARG_TSVECTOR2(0), tsout; ArrayType *weights = PG_GETARG_ARRAYTYPE_P(1); char *dataout; Datum *dweights; bool *nulls; int nweights; int i, j, dataoff = 0, pos; char mask = 0; WordEntry2 *ptr = tsvector2_entries(tsin); deconstruct_array(weights, CHAROID, 1, true, 'c', &dweights, &nulls, &nweights); for (i = 0; i < nweights; i++) { char char_weight; if (nulls[i]) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("weight array may not contain nulls"))); char_weight = DatumGetChar(dweights[i]); switch (char_weight) { case 'A': case 'a': mask = mask | 8; break; case 'B': case 'b': mask = mask | 4; break; case 'C': case 'c': mask = mask | 2; break; case 'D': case 'd': mask = mask | 1; break; default: ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("unrecognized weight: \"%c\"", char_weight))); } } tsout = (TSVector2) palloc0(VARSIZE(tsin)); tsout->size = tsin->size; dataout = tsvector2_storage(tsout); INITPOS(pos); for (i = j = 0; i < tsin->size; i++) { WordEntryPos *posin, *posout; int k, npos = 0, lex_len = ENTRY_LEN(tsin, ptr); char *lex = tsvector2_storage(tsin) + pos, *lexout; posin = get_lexeme_positions(lex, lex_len); for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++) { if (mask & (1 << WEP_GETWEIGHT(posin[k]))) npos++; } if (!npos) goto next; lexout = tsvector2_addlexeme(tsout, j++, &dataoff, lex, lex_len, NULL, npos); posout = get_lexeme_positions(lexout, lex_len); npos = 0; for (k = 0; k < ENTRY_NPOS(tsin, ptr); k++) { if (mask & (1 << WEP_GETWEIGHT(posin[k]))) posout[npos++] = posin[k]; } next: INCRPTR(tsin, ptr, pos); } tsout->size = j; if (dataout != tsvector2_storage(tsout)) memmove(tsvector2_storage(tsout), dataout, dataoff); SET_VARSIZE(tsout, CALCDATASIZE(tsout->size, dataoff)); PG_FREE_IF_COPY(tsin, 0); PG_RETURN_POINTER(tsout); } /* Get max position in in1; we'll need this to offset in2's positions */ static int get_maxpos(TSVector2 tsv) { int i, j, maxpos = 0; WordEntry2 *ptr = tsvector2_entries(tsv); uint32 pos; WordEntryPos *apos; INITPOS(pos); for (i = 0; i < tsv->size; i++) { apos = get_lexeme_positions(tsvector2_storage(tsv) + pos, ENTRY_LEN(tsv, ptr)); for (j = 0; j < ENTRY_NPOS(tsv, ptr); j++) { if (WEP_GETPOS(apos[j]) > maxpos) maxpos = WEP_GETPOS(apos[j]); } INCRPTR(tsv, ptr, pos); } return maxpos; } PG_FUNCTION_INFO_V1(tsvector2_concat); Datum tsvector2_concat(PG_FUNCTION_ARGS) { TSVector2 in1 = PG_GETARG_TSVECTOR2(0), in2 = PG_GETARG_TSVECTOR2(1), out; WordEntry2 *ptr, *ptr1, *ptr2; int maxpos = 0, i, i1, i2, output_bytes, pos1, pos2, dataoff; char *data; ptr1 = tsvector2_entries(in1); ptr2 = tsvector2_entries(in2); i1 = in1->size; i2 = in2->size; /* * Conservative estimate of space needed. We might need all the data in * both inputs, and conceivably add a pad bytes before lexeme and position * data, and pad bytes before WordEntry2 for offset entry. */ output_bytes = VARSIZE(in1) + VARSIZE(in2) + i1 * 2 + i2 * 2; output_bytes += 4 * (i1 + i2) / TS_OFFSET_STRIDE; out = (TSVector2) palloc0(output_bytes); SET_VARSIZE(out, output_bytes); /* * We must make out->size valid so that tsvector2_storage(out) is sensible. We'll * collapse out any unused space at the end. */ out->size = i1 + i2; ptr = NULL; data = tsvector2_storage(out); i = 0; dataoff = 0; INITPOS(pos1); INITPOS(pos2); /* * we will need max position from first tsvector2 to add it positions of * second tsvector2 */ maxpos = get_maxpos(in1); while (i1 && i2) { char *lex = tsvector2_storage(in1) + pos1, *lex2 = tsvector2_storage(in2) + pos2; int lex_len = ENTRY_LEN(in1, ptr1), lex2_len = ENTRY_LEN(in2, ptr2); int cmp = ts2_compare_string(lex, lex_len, lex2, lex2_len, false); if (cmp < 0) { /* in1 first */ tsvector2_addlexeme(out, i, &dataoff, lex, lex_len, get_lexeme_positions(lex, lex_len), ENTRY_NPOS(in1, ptr1)); INCRPTR(in1, ptr1, pos1); i1--; i++; } else if (cmp > 0) { /* in2 first */ char *new_lex; WordEntry2 *we = UNWRAP_ENTRY(in2, ptr2); new_lex = tsvector2_addlexeme(out, i, &dataoff, lex2, lex2_len, NULL, 0); if (we->npos > 0) { int addlen; WordEntryPos *apos = get_lexeme_positions(new_lex, lex2_len); addlen = add_pos(lex2, we, apos, 0, maxpos); if (addlen > 0) { ptr = UNWRAP_ENTRY(out, tsvector2_entries(out) + i); ptr->npos = addlen; dataoff = SHORTALIGN(dataoff); dataoff += ptr->npos * sizeof(WordEntryPos); } } INCRPTR(in2, ptr2, pos2); i++; i2--; } else { char *new_lex; int npos1 = ENTRY_NPOS(in1, ptr1), npos2 = ENTRY_NPOS(in2, ptr2); WordEntryPos *apos; new_lex = tsvector2_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0); apos = get_lexeme_positions(new_lex, lex_len); if (npos1 || npos2) { int addlen; char *lex2 = tsvector2_storage(in2) + pos2; ptr = UNWRAP_ENTRY(out, tsvector2_entries(out) + i); if (npos1) { /* add positions from left tsvector2 */ addlen = add_pos(lex, UNWRAP_ENTRY(in1, ptr1), apos, 0, 0); ptr->npos = addlen; if (npos2) { /* add positions from right right tsvector2 */ addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, addlen, maxpos); ptr->npos += addlen; } } else /* npos in second should be > 0 */ { /* add positions from right tsvector2 */ addlen = add_pos(lex2, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos); ptr->npos = addlen; } dataoff = SHORTALIGN(dataoff); dataoff += ptr->npos * sizeof(WordEntryPos); } INCRPTR(in1, ptr1, pos1); INCRPTR(in2, ptr2, pos2); i++; i1--; i2--; } } while (i1) { char *lex = tsvector2_storage(in1) + pos1; int lex_len = ENTRY_LEN(in1, ptr1); tsvector2_addlexeme(out, i, &dataoff, lex, lex_len, get_lexeme_positions(lex, lex_len), ENTRY_NPOS(in1, ptr1)); INCRPTR(in1, ptr1, pos1); i++; i1--; } while (i2) { char *lex = tsvector2_storage(in2) + pos2, *new_lex; int lex_len = ENTRY_LEN(in2, ptr2), npos = ENTRY_NPOS(in2, ptr2); new_lex = tsvector2_addlexeme(out, i, &dataoff, lex, lex_len, NULL, 0); if (npos > 0) { int addlen; WordEntryPos *apos = get_lexeme_positions(new_lex, lex_len); addlen = add_pos(lex, UNWRAP_ENTRY(in2, ptr2), apos, 0, maxpos); if (addlen > 0) { WordEntry2 *ptr = UNWRAP_ENTRY(out, tsvector2_entries(out) + i); ptr->npos = addlen; dataoff = SHORTALIGN(dataoff); dataoff += npos * sizeof(WordEntryPos); } } INCRPTR(in2, ptr2, pos2); i++; i2--; } /* * Instead of checking each offset individually, we check for overflow of * pos fields once at the end. */ if (dataoff > MAXSTRPOS) ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("string is too long for tsvector2 (%d bytes, max %d bytes)", dataoff, MAXSTRPOS))); /* * Adjust sizes (asserting that we didn't overrun the original estimates) * and collapse out any unused array entries. */ out->size = i; if (data != tsvector2_storage(out)) memmove(tsvector2_storage(out), data, dataoff); output_bytes = CALCDATASIZE(out->size, dataoff); Assert(output_bytes <= VARSIZE(out)); SET_VARSIZE(out, output_bytes); PG_FREE_IF_COPY(in1, 0); PG_FREE_IF_COPY(in2, 1); PG_RETURN_POINTER(out); } /* * Compare two strings by tsvector2 rules. * * if isPrefix = true then it returns zero value iff b has prefix a */ int32 ts2_compare_string(char *a, int lena, char *b, int lenb, bool prefix) { int cmp; if (lena == 0) { if (prefix) cmp = 0; /* empty string is prefix of anything */ else cmp = (lenb > 0) ? -1 : 0; } else if (lenb == 0) { cmp = (lena > 0) ? 1 : 0; } else { cmp = memcmp(a, b, Min(lena, lenb)); if (prefix) { if (cmp == 0 && lena > lenb) cmp = 1; /* a is longer, so not a prefix of b */ } else if (cmp == 0 && lena != lenb) { cmp = (lena < lenb) ? -1 : 1; } } return cmp; } /* * Check weight info or/and fill 'data' with the required positions */ static bool checkclass_str(WordEntryPos *pv, int npos, QueryOperand *val, ExecPhraseData *data) { bool result = false; if (npos && (val->weight || data)) { if (val->weight && data) { WordEntryPos *posvec_iter = pv; WordEntryPos *dptr; /* * Filter position information by weights */ dptr = data->pos = palloc(sizeof(WordEntryPos) * npos); data->allocated = true; /* Is there a position with a matching weight? */ while (posvec_iter < (pv + npos)) { /* If true, append this position to the data->pos */ if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) { *dptr = WEP_GETPOS(*posvec_iter); dptr++; } posvec_iter++; } data->npos = dptr - data->pos; if (data->npos > 0) result = true; } else if (val->weight) { WordEntryPos *posvec_iter = pv; /* Is there a position with a matching weight? */ while (posvec_iter < (pv + npos)) { if (val->weight & (1 << WEP_GETWEIGHT(*posvec_iter))) { result = true; break; /* no need to go further */ } posvec_iter++; } } else if (data) /* data != NULL */ { data->npos = npos; data->pos = pv; data->allocated = false; result = true; } else /* should not happen */ elog(ERROR, "invalid parameters"); } else { result = true; } return result; } /* * Removes duplicate pos entries. We can't use uniquePos() from * tsvector2.c because array might be longer than MAXENTRYPOS * * Returns new length. */ static int uniqueLongPos(WordEntryPos *pos, int npos) { WordEntryPos *pos_iter, *result; if (npos <= 1) return npos; qsort((void *) pos, npos, sizeof(WordEntryPos), compareWordEntryPos); result = pos; pos_iter = pos + 1; while (pos_iter < pos + npos) { if (WEP_GETPOS(*pos_iter) != WEP_GETPOS(*result)) { result++; *result = WEP_GETPOS(*pos_iter); } pos_iter++; } return result + 1 - pos; } /* * is there value 'val' in array or not ? */ static bool checkcondition_str(void *checkval, QueryOperand *val, ExecPhraseData *data) { CHKVAL *chkval = (CHKVAL *) checkval; int StopLow = chkval->bidx; int StopHigh = chkval->eidx; int StopMiddle = StopHigh; int difference = -1; bool res = false; char *lexeme; WordEntry2 *entry; /* Loop invariant: StopLow <= val < StopHigh */ while (StopLow < StopHigh) { StopMiddle = StopLow + (StopHigh - StopLow) / 2; lexeme = tsvector2_getlexeme(chkval->vec, StopMiddle, &entry); Assert(!entry->hasoff); difference = ts2_compare_string(chkval->operand + val->distance, val->length, lexeme, entry->len, false); if (difference == 0) { /* Check weight info & fill 'data' with positions */ res = checkclass_str(get_lexeme_positions(lexeme, entry->len), entry->npos, val, data); break; } else if (difference > 0) StopLow = StopMiddle + 1; else StopHigh = StopMiddle; } if ((!res || data) && val->prefix) { WordEntryPos *allpos = NULL; int npos = 0, totalpos = 0; /* * there was a failed exact search, so we should scan further to find * a prefix match. We also need to do so if caller needs position info */ if (StopLow >= StopHigh) StopMiddle = StopHigh; while ((!res || data) && StopMiddle < chkval->eidx) { char *lexeme; int cmp; WordEntryPos *pv; lexeme = tsvector2_getlexeme(chkval->vec, StopMiddle, &entry); Assert(!entry->hasoff); pv = get_lexeme_positions(lexeme, entry->len); cmp = ts2_compare_string(chkval->operand + val->distance, val->length, lexeme, entry->len, true); if (cmp != 0) break; if (data) { /* * We need to join position information */ res = checkclass_str(pv, entry->npos, val, data); if (res) { while (npos + data->npos >= totalpos) { if (totalpos == 0) { totalpos = 256; allpos = palloc(sizeof(WordEntryPos) * totalpos); } else { totalpos *= 2; allpos = repalloc(allpos, sizeof(WordEntryPos) * totalpos); } } if (allpos) { memcpy(allpos + npos, data->pos, sizeof(WordEntryPos) * data->npos); npos += data->npos; } } } else { res = checkclass_str(pv, entry->npos, val, NULL); } StopMiddle++; } if (res && data) { /* Sort and make unique array of found positions */ data->pos = allpos; data->npos = uniqueLongPos(allpos, npos); data->allocated = true; } } return res; } /* * Compute output position list for a tsquery operator in phrase mode. * * Merge the position lists in Ldata and Rdata as specified by "emit", * returning the result list into *data. The input position lists must be * sorted and unique, and the output will be as well. * * data: pointer to initially-all-zeroes output struct, or NULL * Ldata, Rdata: input position lists * emit: bitmask of TSPO_XXX flags * Loffset: offset to be added to Ldata positions before comparing/outputting * Roffset: offset to be added to Rdata positions before comparing/outputting * max_npos: maximum possible required size of output position array * * Loffset and Roffset should not be negative, else we risk trying to output * negative positions, which won't fit into WordEntryPos. * * Returns true if any positions were emitted to *data; or if data is NULL, * returns true if any positions would have been emitted. */ #define TSPO_L_ONLY 0x01 /* emit positions appearing only in L */ #define TSPO_R_ONLY 0x02 /* emit positions appearing only in R */ #define TSPO_BOTH 0x04 /* emit positions appearing in both L&R */ static bool TS_phrase_output(ExecPhraseData *data, ExecPhraseData *Ldata, ExecPhraseData *Rdata, int emit, int Loffset, int Roffset, int max_npos) { int Lindex, Rindex; /* Loop until both inputs are exhausted */ Lindex = Rindex = 0; while (Lindex < Ldata->npos || Rindex < Rdata->npos) { int Lpos, Rpos; int output_pos = 0; /* * Fetch current values to compare. WEP_GETPOS() is needed because * ExecPhraseData->data can point to a tsvector2's WordEntryPosVector. */ if (Lindex < Ldata->npos) Lpos = WEP_GETPOS(Ldata->pos[Lindex]) + Loffset; else { /* L array exhausted, so we're done if R_ONLY isn't set */ if (!(emit & TSPO_R_ONLY)) break; Lpos = INT_MAX; } if (Rindex < Rdata->npos) Rpos = WEP_GETPOS(Rdata->pos[Rindex]) + Roffset; else { /* R array exhausted, so we're done if L_ONLY isn't set */ if (!(emit & TSPO_L_ONLY)) break; Rpos = INT_MAX; } /* Merge-join the two input lists */ if (Lpos < Rpos) { /* Lpos is not matched in Rdata, should we output it? */ if (emit & TSPO_L_ONLY) output_pos = Lpos; Lindex++; } else if (Lpos == Rpos) { /* Lpos and Rpos match ... should we output it? */ if (emit & TSPO_BOTH) output_pos = Rpos; Lindex++; Rindex++; } else /* Lpos > Rpos */ { /* Rpos is not matched in Ldata, should we output it? */ if (emit & TSPO_R_ONLY) output_pos = Rpos; Rindex++; } if (output_pos > 0) { if (data) { /* Store position, first allocating output array if needed */ if (data->pos == NULL) { data->pos = (WordEntryPos *) palloc(max_npos * sizeof(WordEntryPos)); data->allocated = true; } data->pos[data->npos++] = output_pos; } else { /* * Exact positions not needed, so return true as soon as we * know there is at least one. */ return true; } } } if (data && data->npos > 0) { /* Let's assert we didn't overrun the array */ Assert(data->npos <= max_npos); return true; } return false; } /* * Execute tsquery at or below an OP_PHRASE operator. * * This handles tsquery execution at recursion levels where we need to care * about match locations. * * In addition to the same arguments used for TS_execute, the caller may pass * a preinitialized-to-zeroes ExecPhraseData struct, to be filled with lexeme * match position info on success. data == NULL if no position data need be * returned. (In practice, outside callers pass NULL, and only the internal * recursion cases pass a data pointer.) * Note: the function assumes data != NULL for operators other than OP_PHRASE. * This is OK because an outside call always starts from an OP_PHRASE node. * * The detailed semantics of the match data, given that the function returned * "true" (successful match, or possible match), are: * * npos > 0, negate = false: * query is matched at specified position(s) (and only those positions) * npos > 0, negate = true: * query is matched at all positions *except* specified position(s) * npos = 0, negate = false: * query is possibly matched, matching position(s) are unknown * (this should only be returned when TS_EXEC_PHRASE_NO_POS flag is set) * npos = 0, negate = true: * query is matched at all positions * * Successful matches also return a "width" value which is the match width in * lexemes, less one. Hence, "width" is zero for simple one-lexeme matches, * and is the sum of the phrase operator distances for phrase matches. Note * that when width > 0, the listed positions represent the ends of matches not * the starts. (This unintuitive rule is needed to avoid possibly generating * negative positions, which wouldn't fit into the WordEntryPos arrays.) * * When the function returns "false" (no match), it must return npos = 0, * negate = false (which is the state initialized by the caller); but the * "width" output in such cases is undefined. */ static bool TS_phrase_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond, ExecPhraseData *data) { ExecPhraseData Ldata, Rdata; bool lmatch, rmatch; int Loffset, Roffset, maxwidth; /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); if (curitem->type == QI_VAL) return chkcond(arg, (QueryOperand *) curitem, data); if (!data) { /* keep clang checker quiet */ return false; } switch (curitem->qoperator.oper) { case OP_NOT: /* * Because a "true" result with no specific positions is taken as * uncertain, we need no special care here for !TS_EXEC_CALC_NOT. * If it's a false positive, the right things happen anyway. * * Also, we need not touch data->width, since a NOT operation does * not change the match width. */ if (TS_phrase_execute(curitem + 1, arg, flags, chkcond, data)) { if (data->npos > 0) { /* we have some positions, invert negate flag */ data->negate = !data->negate; return true; } else if (data->negate) { /* change "match everywhere" to "match nowhere" */ data->negate = false; return false; } /* match positions are, and remain, uncertain */ return true; } else { /* change "match nowhere" to "match everywhere" */ Assert(data->npos == 0 && !data->negate); data->negate = true; return true; } case OP_PHRASE: case OP_AND: memset(&Ldata, 0, sizeof(Ldata)); memset(&Rdata, 0, sizeof(Rdata)); if (!TS_phrase_execute(curitem + curitem->qoperator.left, arg, flags, chkcond, &Ldata)) return false; if (!TS_phrase_execute(curitem + 1, arg, flags, chkcond, &Rdata)) return false; /* * If either operand has no position information, then we can't * return position data, only a "possible match" result. "Possible * match" answers are only wanted when TS_EXEC_PHRASE_NO_POS flag * is set, otherwise return false. */ if ((Ldata.npos == 0 && !Ldata.negate) || (Rdata.npos == 0 && !Rdata.negate)) return (flags & TS_EXEC_PHRASE_NO_POS) ? true : false; if (curitem->qoperator.oper == OP_PHRASE) { /* * Compute Loffset and Roffset suitable for phrase match, and * compute overall width of whole phrase match. */ Loffset = curitem->qoperator.distance + Rdata.width; Roffset = 0; if (data) data->width = curitem->qoperator.distance + Ldata.width + Rdata.width; } else { /* * For OP_AND, set output width and alignment like OP_OR (see * comment below) */ maxwidth = Max(Ldata.width, Rdata.width); Loffset = maxwidth - Ldata.width; Roffset = maxwidth - Rdata.width; if (data) data->width = maxwidth; } if (Ldata.negate && Rdata.negate) { /* !L & !R: treat as !(L | R) */ (void) TS_phrase_output(data, &Ldata, &Rdata, TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, Loffset, Roffset, Ldata.npos + Rdata.npos); if (data) data->negate = true; return true; } else if (Ldata.negate) { /* !L & R */ return TS_phrase_output(data, &Ldata, &Rdata, TSPO_R_ONLY, Loffset, Roffset, Rdata.npos); } else if (Rdata.negate) { /* L & !R */ return TS_phrase_output(data, &Ldata, &Rdata, TSPO_L_ONLY, Loffset, Roffset, Ldata.npos); } else { /* straight AND */ return TS_phrase_output(data, &Ldata, &Rdata, TSPO_BOTH, Loffset, Roffset, Min(Ldata.npos, Rdata.npos)); } case OP_OR: memset(&Ldata, 0, sizeof(Ldata)); memset(&Rdata, 0, sizeof(Rdata)); lmatch = TS_phrase_execute(curitem + curitem->qoperator.left, arg, flags, chkcond, &Ldata); rmatch = TS_phrase_execute(curitem + 1, arg, flags, chkcond, &Rdata); if (!lmatch && !rmatch) return false; /* * If a valid operand has no position information, then we can't * return position data, only a "possible match" result. "Possible * match" answers are only wanted when TS_EXEC_PHRASE_NO_POS flag * is set, otherwise return false. */ if ((lmatch && Ldata.npos == 0 && !Ldata.negate) || (rmatch && Rdata.npos == 0 && !Rdata.negate)) return (flags & TS_EXEC_PHRASE_NO_POS) ? true : false; /* * Cope with undefined output width from failed submatch. (This * takes less code than trying to ensure that all failure returns * set data->width to zero.) */ if (!lmatch) Ldata.width = 0; if (!rmatch) Rdata.width = 0; /* * For OP_AND and OP_OR, report the width of the wider of the two * inputs, and align the narrower input's positions to the right * end of that width. This rule deals at least somewhat * reasonably with cases like "x <-> (y | z <-> q)". */ maxwidth = Max(Ldata.width, Rdata.width); Loffset = maxwidth - Ldata.width; Roffset = maxwidth - Rdata.width; data->width = maxwidth; if (Ldata.negate && Rdata.negate) { /* !L | !R: treat as !(L & R) */ (void) TS_phrase_output(data, &Ldata, &Rdata, TSPO_BOTH, Loffset, Roffset, Min(Ldata.npos, Rdata.npos)); data->negate = true; return true; } else if (Ldata.negate) { /* !L | R: treat as !(L & !R) */ (void) TS_phrase_output(data, &Ldata, &Rdata, TSPO_L_ONLY, Loffset, Roffset, Ldata.npos); data->negate = true; return true; } else if (Rdata.negate) { /* L | !R: treat as !(!L & R) */ (void) TS_phrase_output(data, &Ldata, &Rdata, TSPO_R_ONLY, Loffset, Roffset, Rdata.npos); data->negate = true; return true; } else { /* straight OR */ return TS_phrase_output(data, &Ldata, &Rdata, TSPO_BOTH | TSPO_L_ONLY | TSPO_R_ONLY, Loffset, Roffset, Ldata.npos + Rdata.npos); } default: elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); } /* not reachable, but keep compiler quiet */ return false; } /* * Evaluate tsquery boolean expression. * * curitem: current tsquery item (initially, the first one) * arg: opaque value to pass through to callback function * flags: bitmask of flag bits shown in ts_utils.h * chkcond: callback function to check whether a primitive value is present * * The logic here deals only with operators above any phrase operator, for * which we do not need to worry about lexeme positions. As soon as we hit an * OP_PHRASE operator, we pass it off to TS_phrase_execute which does worry. */ bool TS_execute(QueryItem *curitem, void *arg, uint32 flags, TSExecuteCallback chkcond) { /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); if (curitem->type == QI_VAL) return chkcond(arg, (QueryOperand *) curitem, NULL /* we don't need position info */ ); switch (curitem->qoperator.oper) { case OP_NOT: if (flags & TS_EXEC_CALC_NOT) return !TS_execute(curitem + 1, arg, flags, chkcond); else return true; case OP_AND: if (TS_execute(curitem + curitem->qoperator.left, arg, flags, chkcond)) return TS_execute(curitem + 1, arg, flags, chkcond); else return false; case OP_OR: if (TS_execute(curitem + curitem->qoperator.left, arg, flags, chkcond)) return true; else return TS_execute(curitem + 1, arg, flags, chkcond); case OP_PHRASE: return TS_phrase_execute(curitem, arg, flags, chkcond, NULL); default: elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); } /* not reachable, but keep compiler quiet */ return false; } /* * Detect whether a tsquery boolean expression requires any positive matches * to values shown in the tsquery. * * This is needed to know whether a GIN index search requires full index scan. * For example, 'x & !y' requires a match of x, so it's sufficient to scan * entries for x; but 'x | !y' could match rows containing neither x nor y. */ bool tsquery_requires_match(QueryItem *curitem) { /* since this function recurses, it could be driven to stack overflow */ check_stack_depth(); if (curitem->type == QI_VAL) return true; switch (curitem->qoperator.oper) { case OP_NOT: /* * Assume there are no required matches underneath a NOT. For * some cases with nested NOTs, we could prove there's a required * match, but it seems unlikely to be worth the trouble. */ return false; case OP_PHRASE: /* * Treat OP_PHRASE as OP_AND here */ case OP_AND: /* If either side requires a match, we're good */ if (tsquery_requires_match(curitem + curitem->qoperator.left)) return true; else return tsquery_requires_match(curitem + 1); case OP_OR: /* Both sides must require a match */ if (tsquery_requires_match(curitem + curitem->qoperator.left)) return tsquery_requires_match(curitem + 1); else return false; default: elog(ERROR, "unrecognized operator: %d", curitem->qoperator.oper); } /* not reachable, but keep compiler quiet */ return false; } /* * boolean operations */ PG_FUNCTION_INFO_V1(tsquery_match_tsvector2); Datum tsquery_match_tsvector2(PG_FUNCTION_ARGS) { PG_RETURN_DATUM(DirectFunctionCall2(tsvector2_match_tsquery, PG_GETARG_DATUM(1), PG_GETARG_DATUM(0))); } Datum tsvector2_match_tsquery(PG_FUNCTION_ARGS) { TSVector2 val = PG_GETARG_TSVECTOR2(0); TSQuery query = PG_GETARG_TSQUERY(1); CHKVAL chkval; bool result; /* empty query matches nothing */ if (!query->size) { PG_FREE_IF_COPY(val, 0); PG_FREE_IF_COPY(query, 1); PG_RETURN_BOOL(false); } chkval.bidx = 0; chkval.eidx = val->size; chkval.vec = val; chkval.operand = GETOPERAND(query); result = TS_execute(GETQUERY(query), &chkval, TS_EXEC_CALC_NOT, checkcondition_str); PG_FREE_IF_COPY(val, 0); PG_FREE_IF_COPY(query, 1); PG_RETURN_BOOL(result); } /* * ts_stat statistic function support */ /* * Returns the number of positions in value 'wptr' within tsvector2 'txt', * that have a weight equal to one of the weights in 'weight' bitmask. */ static int check_weight(char *lexeme, WordEntry2 *wptr, int8 weight) { int len; int num = 0; WordEntryPos *ptr; Assert(!wptr->hasoff); len = wptr->len; ptr = get_lexeme_positions(lexeme, len); while (len--) { if (weight & (1 << WEP_GETWEIGHT(*ptr))) num++; ptr++; } return num; } #define compareStatWord(a,e,t) \ (ts2_compare_string((a)->lexeme, (a)->lenlexeme, \ t, (e)->len, false)) static void insertStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector2 txt, uint32 off) { WordEntry2 *we; StatEntry *node = stat->root, *pnode = NULL; int n, res = 0; uint32 depth = 1; char *lexeme; lexeme = tsvector2_getlexeme(txt, off, &we); Assert(!we->hasoff); if (stat->weight == 0) n = (we->npos) ? we->npos : 1; else n = (we->npos) ? check_weight(lexeme, we, stat->weight) : 0; if (n == 0) return; /* nothing to insert */ while (node) { res = compareStatWord(node, we, lexeme); if (res == 0) { break; } else { pnode = node; node = (res < 0) ? node->left : node->right; } depth++; } if (depth > stat->maxdepth) stat->maxdepth = depth; if (node == NULL) { node = MemoryContextAlloc(persistentContext, STATENTRYHDRSZ + we->len); node->left = node->right = NULL; node->ndoc = 1; node->nentry = n; node->lenlexeme = we->len; memcpy(node->lexeme, lexeme, node->lenlexeme); if (pnode == NULL) { stat->root = node; } else { if (res < 0) pnode->left = node; else pnode->right = node; } } else { node->ndoc++; node->nentry += n; } } static void chooseNextStatEntry(MemoryContext persistentContext, TSVectorStat *stat, TSVector2 txt, uint32 low, uint32 high, uint32 offset) { uint32 pos; uint32 middle = (low + high) >> 1, count = txt->size; pos = (low + middle) >> 1; if (low != middle && pos >= offset && pos - offset < count) insertStatEntry(persistentContext, stat, txt, pos - offset); pos = (high + middle + 1) >> 1; if (middle + 1 != high && pos >= offset && pos - offset < count) insertStatEntry(persistentContext, stat, txt, pos - offset); if (low != middle) chooseNextStatEntry(persistentContext, stat, txt, low, middle, offset); if (high != middle + 1) chooseNextStatEntry(persistentContext, stat, txt, middle + 1, high, offset); } /* * This is written like a custom aggregate function, because the * original plan was to do just that. Unfortunately, an aggregate function * can't return a set, so that plan was abandoned. If that limitation is * lifted in the future, ts_stat could be a real aggregate function so that * you could use it like this: * * SELECT ts_stat(vector_column) FROM vector_table; * * where vector_column is a tsvector2-type column in vector_table. */ static TSVectorStat * ts_accum(MemoryContext persistentContext, TSVectorStat *stat, Datum data) { TSVector2 txt = DatumGetTSVector2(data); uint32 i, nbit = 0, offset; if (stat == NULL) { /* Init in first */ stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat)); stat->maxdepth = 1; } if (txt == NULL) return stat; /* simple check of correctness */ if (txt->size == 0) { if (txt != (TSVector2) DatumGetPointer(data)) pfree(txt); return stat; } i = txt->size - 1; for (; i > 0; i >>= 1) nbit++; nbit = 1 << nbit; offset = (nbit - txt->size) / 2; insertStatEntry(persistentContext, stat, txt, (nbit >> 1) - offset); chooseNextStatEntry(persistentContext, stat, txt, 0, nbit, offset); return stat; } static void ts_setup_firstcall(FunctionCallInfo fcinfo, FuncCallContext *funcctx, TSVectorStat *stat) { TupleDesc tupdesc; MemoryContext oldcontext; StatEntry *node; funcctx->user_fctx = (void *) stat; oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); stat->stack = palloc0(sizeof(StatEntry *) * (stat->maxdepth + 1)); stat->stackpos = 0; node = stat->root; /* find leftmost value */ if (node == NULL) stat->stack[stat->stackpos] = NULL; else for (;;) { stat->stack[stat->stackpos] = node; if (node->left) { stat->stackpos++; node = node->left; } else break; } Assert(stat->stackpos <= stat->maxdepth); tupdesc = CreateTemplateTupleDesc(3, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "word", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "ndoc", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "nentry", INT4OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); funcctx->attinmeta = TupleDescGetAttInMetadata(tupdesc); MemoryContextSwitchTo(oldcontext); } static StatEntry * walkStatEntryTree(TSVectorStat *stat) { StatEntry *node = stat->stack[stat->stackpos]; if (node == NULL) return NULL; if (node->ndoc != 0) { /* return entry itself: we already was at left sublink */ return node; } else if (node->right && node->right != stat->stack[stat->stackpos + 1]) { /* go on right sublink */ stat->stackpos++; node = node->right; /* find most-left value */ for (;;) { stat->stack[stat->stackpos] = node; if (node->left) { stat->stackpos++; node = node->left; } else break; } Assert(stat->stackpos <= stat->maxdepth); } else { /* we already return all left subtree, itself and right subtree */ if (stat->stackpos == 0) return NULL; stat->stackpos--; return walkStatEntryTree(stat); } return node; } static Datum ts_process_call(FuncCallContext *funcctx) { TSVectorStat *st; StatEntry *entry; st = (TSVectorStat *) funcctx->user_fctx; entry = walkStatEntryTree(st); if (entry != NULL) { Datum result; char *values[3]; char ndoc[16]; char nentry[16]; HeapTuple tuple; values[0] = palloc(entry->lenlexeme + 1); memcpy(values[0], entry->lexeme, entry->lenlexeme); (values[0])[entry->lenlexeme] = '\0'; sprintf(ndoc, "%d", entry->ndoc); values[1] = ndoc; sprintf(nentry, "%d", entry->nentry); values[2] = nentry; tuple = BuildTupleFromCStrings(funcctx->attinmeta, values); result = HeapTupleGetDatum(tuple); pfree(values[0]); /* mark entry as already visited */ entry->ndoc = 0; return result; } return (Datum) 0; } static TSVectorStat * ts_stat_sql(MemoryContext persistentContext, text *txt, text *ws) { char *query = text_to_cstring(txt); TSVectorStat *stat; bool isnull; Portal portal; SPIPlanPtr plan; Oid oid; if ((plan = SPI_prepare(query, 0, NULL)) == NULL) /* internal error */ elog(ERROR, "SPI_prepare(\"%s\") failed", query); if ((portal = SPI_cursor_open(NULL, plan, NULL, NULL, true)) == NULL) /* internal error */ elog(ERROR, "SPI_cursor_open(\"%s\") failed", query); SPI_cursor_fetch(portal, true, 100); oid = TypenameGetTypid("tsvector2"); if (SPI_tuptable == NULL || SPI_tuptable->tupdesc->natts != 1 || !IsBinaryCoercible(SPI_gettypeid(SPI_tuptable->tupdesc, 1), oid)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("ts_stat query must return one tsvector2 column"))); stat = MemoryContextAllocZero(persistentContext, sizeof(TSVectorStat)); stat->maxdepth = 1; if (ws) { char *buf; buf = VARDATA_ANY(ws); while (buf - VARDATA_ANY(ws) < VARSIZE_ANY_EXHDR(ws)) { if (pg_mblen(buf) == 1) { switch (*buf) { case 'A': case 'a': stat->weight |= 1 << 3; break; case 'B': case 'b': stat->weight |= 1 << 2; break; case 'C': case 'c': stat->weight |= 1 << 1; break; case 'D': case 'd': stat->weight |= 1; break; default: stat->weight |= 0; } } buf += pg_mblen(buf); } } while (SPI_processed > 0) { uint64 i; for (i = 0; i < SPI_processed; i++) { Datum data = SPI_getbinval(SPI_tuptable->vals[i], SPI_tuptable->tupdesc, 1, &isnull); if (!isnull) stat = ts_accum(persistentContext, stat, data); } SPI_freetuptable(SPI_tuptable); SPI_cursor_fetch(portal, true, 100); } SPI_freetuptable(SPI_tuptable); SPI_cursor_close(portal); SPI_freeplan(plan); pfree(query); return stat; } PG_FUNCTION_INFO_V1(tsvector2_stat1); Datum tsvector2_stat1(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; Datum result; if (SRF_IS_FIRSTCALL()) { TSVectorStat *stat; text *txt = PG_GETARG_TEXT_PP(0); funcctx = SRF_FIRSTCALL_INIT(); SPI_connect(); stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, NULL); PG_FREE_IF_COPY(txt, 0); ts_setup_firstcall(fcinfo, funcctx, stat); SPI_finish(); } funcctx = SRF_PERCALL_SETUP(); if ((result = ts_process_call(funcctx)) != (Datum) 0) SRF_RETURN_NEXT(funcctx, result); SRF_RETURN_DONE(funcctx); } PG_FUNCTION_INFO_V1(tsvector2_stat2); Datum tsvector2_stat2(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; Datum result; if (SRF_IS_FIRSTCALL()) { TSVectorStat *stat; text *txt = PG_GETARG_TEXT_PP(0); text *ws = PG_GETARG_TEXT_PP(1); funcctx = SRF_FIRSTCALL_INIT(); SPI_connect(); stat = ts_stat_sql(funcctx->multi_call_memory_ctx, txt, ws); PG_FREE_IF_COPY(txt, 0); PG_FREE_IF_COPY(ws, 1); ts_setup_firstcall(fcinfo, funcctx, stat); SPI_finish(); } funcctx = SRF_PERCALL_SETUP(); if ((result = ts_process_call(funcctx)) != (Datum) 0) SRF_RETURN_NEXT(funcctx, result); SRF_RETURN_DONE(funcctx); } /* * Triggers for automatic update of a tsvector2 column from text column(s) * * Trigger arguments are either * name of tsvector2 col, name of tsconfig to use, name(s) of text col(s) * name of tsvector2 col, name of regconfig col, name(s) of text col(s) * ie, tsconfig can either be specified by name, or indirectly as the * contents of a regconfig field in the row. If the name is used, it must * be explicitly schema-qualified. */ static Datum tsvector2_update_trigger(PG_FUNCTION_ARGS, bool config_column) { TriggerData *trigdata; Trigger *trigger; Relation rel; HeapTuple rettuple = NULL; int tsvector_attr_num, i; ParsedText prs; Datum datum; bool isnull; text *txt; Oid cfgId; /* Check call context */ if (!CALLED_AS_TRIGGER(fcinfo)) /* internal error */ elog(ERROR, "tsvector2_update_trigger: not fired by trigger manager"); trigdata = (TriggerData *) fcinfo->context; if (!TRIGGER_FIRED_FOR_ROW(trigdata->tg_event)) elog(ERROR, "tsvector2_update_trigger: must be fired for row"); if (!TRIGGER_FIRED_BEFORE(trigdata->tg_event)) elog(ERROR, "tsvector2_update_trigger: must be fired BEFORE event"); if (TRIGGER_FIRED_BY_INSERT(trigdata->tg_event)) rettuple = trigdata->tg_trigtuple; else if (TRIGGER_FIRED_BY_UPDATE(trigdata->tg_event)) rettuple = trigdata->tg_newtuple; else elog(ERROR, "tsvector2_update_trigger: must be fired for INSERT or UPDATE"); trigger = trigdata->tg_trigger; rel = trigdata->tg_relation; if (trigger->tgnargs < 3) elog(ERROR, "tsvector2_update_trigger: arguments must be tsvector_field, ts_config, text_field1, ...)"); /* Find the target tsvector2 column */ tsvector_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[0]); if (tsvector_attr_num == SPI_ERROR_NOATTRIBUTE) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("tsvector2 column \"%s\" does not exist", trigger->tgargs[0]))); /* This will effectively reject system columns, so no separate test: */ if (strcmp(SPI_gettype(rel->rd_att, tsvector_attr_num), "tsvector2") != 0) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("column \"%s\" is not of tsvector2 type", trigger->tgargs[0]))); /* Find the configuration to use */ if (config_column) { int config_attr_num; config_attr_num = SPI_fnumber(rel->rd_att, trigger->tgargs[1]); if (config_attr_num == SPI_ERROR_NOATTRIBUTE) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("configuration column \"%s\" does not exist", trigger->tgargs[1]))); if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, config_attr_num), REGCONFIGOID)) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("column \"%s\" is not of regconfig type", trigger->tgargs[1]))); datum = SPI_getbinval(rettuple, rel->rd_att, config_attr_num, &isnull); if (isnull) ereport(ERROR, (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED), errmsg("configuration column \"%s\" must not be null", trigger->tgargs[1]))); cfgId = DatumGetObjectId(datum); } else { List *names; names = stringToQualifiedNameList(trigger->tgargs[1]); /* require a schema so that results are not search path dependent */ if (list_length(names) < 2) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("text search configuration name \"%s\" must be schema-qualified", trigger->tgargs[1]))); cfgId = get_ts_config_oid(names, false); } /* initialize parse state */ prs.lenwords = 32; prs.curwords = 0; prs.pos = 0; prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords); /* find all words in indexable column(s) */ for (i = 2; i < trigger->tgnargs; i++) { int numattr; numattr = SPI_fnumber(rel->rd_att, trigger->tgargs[i]); if (numattr == SPI_ERROR_NOATTRIBUTE) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_COLUMN), errmsg("column \"%s\" does not exist", trigger->tgargs[i]))); if (!IsBinaryCoercible(SPI_gettypeid(rel->rd_att, numattr), TEXTOID)) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("column \"%s\" is not of a character type", trigger->tgargs[i]))); datum = SPI_getbinval(rettuple, rel->rd_att, numattr, &isnull); if (isnull) continue; txt = DatumGetTextPP(datum); parsetext(cfgId, &prs, VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt)); if (txt != (text *) DatumGetPointer(datum)) pfree(txt); } #if PG_VERSION_NUM >= 100000 /* make tsvector2 value */ if (prs.curwords) { datum = PointerGetDatum(make_tsvector2(&prs)); isnull = false; rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att, 1, &tsvector_attr_num, &datum, &isnull); pfree(DatumGetPointer(datum)); } else { TSVector2 out = palloc(tsvector2_calcsize(0, 0)); SET_VARSIZE(out, tsvector2_calcsize(0, 0)); out->size = 0; datum = PointerGetDatum(out); isnull = false; rettuple = heap_modify_tuple_by_cols(rettuple, rel->rd_att, 1, &tsvector_attr_num, &datum, &isnull); pfree(prs.words); } #else /* make tsvector2 value */ if (prs.curwords) { datum = PointerGetDatum(make_tsvector2(&prs)); rettuple = SPI_modifytuple(rel, rettuple, 1, &tsvector_attr_num, &datum, NULL); pfree(DatumGetPointer(datum)); } else { TSVector2 out = palloc(CALCDATASIZE(0, 0)); SET_VARSIZE(out, tsvector2_calcsize(0, 0)); out->size = 0; datum = PointerGetDatum(out); rettuple = SPI_modifytuple(rel, rettuple, 1, &tsvector_attr_num, &datum, NULL); pfree(prs.words); } if (rettuple == NULL) /* internal error */ elog(ERROR, "tsvector_update_trigger: %d returned by SPI_modifytuple", SPI_result); #endif return PointerGetDatum(rettuple); } PG_FUNCTION_INFO_V1(tsvector2_update_trigger_byid); Datum tsvector2_update_trigger_byid(PG_FUNCTION_ARGS) { return tsvector2_update_trigger(fcinfo, false); } PG_FUNCTION_INFO_V1(tsvector2_update_trigger_bycolumn); Datum tsvector2_update_trigger_bycolumn(PG_FUNCTION_ARGS) { return tsvector2_update_trigger(fcinfo, true); }