/*-------------------------------------------------------------------------
 *
 * tsvector2.c
 *	  I/O functions for tsvector2
 *
 * This file provides the text input/output functions (tsvector2in,
 * tsvector2out) and the binary send/receive functions (tsvector2send,
 * tsvector2recv) for the tsvector2 type, plus the sorting and
 * de-duplication helpers they rely on.
 *
 * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group
 * Portions Copyright (c) 2018, PostgresPro
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include "libpq/pqformat.h"
#include "utils/builtins.h"
#include "utils/memutils.h"
#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"

#include "tsvector2.h"

PG_MODULE_MAGIC;

/*
 * Working representation of one parsed lexeme while tsvector2in is still
 * collecting input.  The lexeme text lives in a separate buffer and is
 * referenced by offset; the positions live in their own allocation.
 */
typedef struct
{
	WordEntry2	entry;			/* must be first! */
	size_t		offset;			/* offset of lexeme in some buffer */
	WordEntryPos *pos;			/* positions array, or NULL if entry.npos
								 * is zero */
} WordEntryIN;


/*
 * Compare two WordEntryPos values for qsort.
 *
 * Only the position part (WEP_GETPOS) takes part in the comparison; the
 * weight is ignored.  Returns -1, 0 or 1.
 */
int
compareWordEntryPos(const void *a, const void *b)
{
	int			apos = WEP_GETPOS(*(const WordEntryPos *) a);
	int			bpos = WEP_GETPOS(*(const WordEntryPos *) b);

	if (apos == bpos)
		return 0;
	return (apos > bpos) ? 1 : -1;
}

/*
 * Removes duplicate pos entries. If there are two entries with the same pos
 * but different weight, the higher weight is retained.
 *
 * The array "a" of length "l" is sorted and compacted in place.
 * Compaction stops early once MAXNUMPOS entries have been kept, or once a
 * kept entry has position MAXENTRYPOS - 1; anything after that point is
 * dropped.
 *
 * Returns new length.
 */
static int
uniquePos(WordEntryPos *a, int l)
{
	WordEntryPos *ptr,
			   *res;

	/* zero or one element: nothing to sort or merge */
	if (l <= 1)
		return l;

	qsort((void *) a, l, sizeof(WordEntryPos), compareWordEntryPos);

	/* res: last kept element; ptr: next candidate to examine */
	res = a;
	ptr = a + 1;
	while (ptr - a < l)
	{
		if (WEP_GETPOS(*ptr) != WEP_GETPOS(*res))
		{
			/* new distinct position: keep it */
			res++;
			*res = *ptr;
			/* stop as soon as either limit is reached */
			if (res - a >= MAXNUMPOS - 1 ||
				WEP_GETPOS(*res) == MAXENTRYPOS - 1)
				break;
		}
		else if (WEP_GETWEIGHT(*ptr) > WEP_GETWEIGHT(*res))
			/* same position: remember the larger weight */
			WEP_SETWEIGHT(*res, WEP_GETWEIGHT(*ptr));
		ptr++;
	}

	return res + 1 - a;
}

/*
 * Compare two WordEntryIN values for qsort_arg.
 *
 * "arg" is the character buffer that the entries' offsets refer to; the
 * lexemes themselves are compared with tsCompareString.
 */
static int
compareentry_in(const void *va, const void *vb, void *arg)
{
	const WordEntryIN *a = (const WordEntryIN *) va;
	const WordEntryIN *b = (const WordEntryIN *) vb;
	char	   *BufferStr = (char *) arg;

	return tsCompareString(&BufferStr[a->offset], a->entry.len,
						   &BufferStr[b->offset], b->entry.len,
						   false);
}

/*
 * Compare two WordEntry2 values for qsort_arg.
 *
 * "arg" is the TSVector2 that owns the entries.  The lexeme location of
 * each entry is obtained from tsvector2_getoffset using the entry's index
 * within the vector's entries array.
 *
 * NOTE(review): the offset is looked up by the element's current array
 * index (pointer minus tsvector2_entries(tsv)).  Confirm that this stays
 * meaningful while qsort_arg is moving elements around.
 */
static int
compareentry(const void *va, const void *vb, void *arg)
{
	const WordEntry2 *a = (const WordEntry2 *) va;
	const WordEntry2 *b = (const WordEntry2 *) vb;
	TSVector2	tsv = (TSVector2) arg;

	uint32		offset1 = tsvector2_getoffset(tsv, a - tsvector2_entries(tsv), NULL),
				offset2 = tsvector2_getoffset(tsv, b - tsvector2_entries(tsv), NULL);

	return tsCompareString(tsvector2_storage(tsv) + offset1, ENTRY_LEN(tsv, a),
						   tsvector2_storage(tsv) + offset2, ENTRY_LEN(tsv, b),
						   false);
}

/*
 * Sort an array of WordEntryIN, remove duplicates.
 *
 * Entries with identical lexemes are merged into one; their position
 * arrays are concatenated and then de-duplicated with uniquePos.
 *
 * a			array to process (modified in place), l >= 1 elements
 * buf			buffer holding the lexeme text referenced by a[].offset
 * *outbuflen	receives the amount of space needed for strings and
 *				positions, including alignment padding and one extra
 *				WordEntry2 for every TS_OFFSET_STRIDE-th unique entry
 *
 * Returns the number of unique entries left at the front of "a".
 */
static int
uniqueentry(WordEntryIN *a, int l, char *buf, int *outbuflen)
{
	int			buflen,
				i = 0;			/* number of unique entries already counted */
	WordEntryIN *ptr,
			   *res;

	Assert(l >= 1);

	if (l > 1)
		qsort_arg((void *) a, l, sizeof(WordEntryIN), compareentry_in,
				  (void *) buf);

	buflen = 0;
	/* res: entry currently being accumulated; ptr: next input entry */
	res = a;
	ptr = a + 1;
	while (ptr - a < l)
	{
		Assert(!ptr->entry.hasoff);

		if (!(ptr->entry.len == res->entry.len &&
			  strncmp(&buf[ptr->offset], &buf[res->offset], res->entry.len) == 0))
		{
			/* done accumulating data into *res, count space needed */
			buflen = SHORTALIGN(buflen);
			/* every TS_OFFSET_STRIDE-th entry needs an aligned WordEntry2 */
			if (i++ % TS_OFFSET_STRIDE == 0)
			{
				buflen = INTALIGN(buflen);
				buflen += sizeof(WordEntry2);
			}

			buflen += res->entry.len;
			if (res->entry.npos)
			{
				res->entry.npos = uniquePos(res->pos, res->entry.npos);
				buflen = SHORTALIGN(buflen);
				buflen += res->entry.npos * sizeof(WordEntryPos);
			}
			/* start accumulating into the next output slot */
			res++;
			if (res != ptr)
				*res = *ptr;
		}
		else if (ptr->entry.npos)
		{
			/* same lexeme as *res: merge the position lists */
			if (res->entry.npos)
			{
				/*
				 * append ptr's positions to res's positions
				 *
				 * NOTE(review): newlen is stored back into entry.npos
				 * before uniquePos trims it; confirm that field is wide
				 * enough for the un-deduplicated sum.
				 */
				int			newlen = ptr->entry.npos + res->entry.npos;

				res->pos = (WordEntryPos *)
					repalloc(res->pos, newlen * sizeof(WordEntryPos));
				memcpy(&res->pos[res->entry.npos], ptr->pos,
					   ptr->entry.npos * sizeof(WordEntryPos));
				res->entry.npos = newlen;
				pfree(ptr->pos);
			}
			else
			{
				/* just give ptr's positions to res */
				res->entry.npos = ptr->entry.npos;
				res->pos = ptr->pos;
			}
		}
		ptr++;
	}

	/* count space needed for last item */
	if (i % TS_OFFSET_STRIDE == 0)
	{
		buflen = INTALIGN(buflen);
		buflen += sizeof(WordEntry2);
	}
	else
		buflen = SHORTALIGN(buflen);

	buflen += res->entry.len;

	if (res->entry.npos)
	{
		res->entry.npos = uniquePos(res->pos, res->entry.npos);
		buflen = SHORTALIGN(buflen);
		buflen += res->entry.npos * sizeof(WordEntryPos);
	}

	*outbuflen = buflen;
	return res + 1 - a;
}

/*
 * tsvector2in - text input function for tsvector2.
 *
 * Parses the C string argument token by token, collects the lexemes and
 * their positions into temporary arrays, sorts and de-duplicates them with
 * uniqueentry, and then builds the final TSVector2 value.
 *
 * Raises ERRCODE_PROGRAM_LIMIT_EXCEEDED if a single token is MAXSTRLEN
 * bytes or longer, or if the accumulated data exceeds MAXSTRPOS.
 */
PG_FUNCTION_INFO_V1(tsvector2in);
Datum
tsvector2in(PG_FUNCTION_ARGS)
{
	char	   *buf = PG_GETARG_CSTRING(0);
	TSVectorParseState state;
	WordEntryIN *arr;
	int			totallen;
	int			arrlen;			/* allocated size of arr */
	int			len = 0;		/* number of used elements in arr */
	TSVector2	in;
	int			i;
	char	   *token;
	int			toklen;
	WordEntryPos *pos;
	int			poslen;
	int			stroff;

	/*
	 * Tokens are appended to tmpbuf, cur is a pointer to the end of used
	 * space in tmpbuf.
	 */
	char	   *tmpbuf;
	char	   *cur;
	int			buflen = 256;	/* allocated size of tmpbuf */

	state = init_tsvector_parser_compat(buf, 0);

	arrlen = 64;
	arr = (WordEntryIN *) palloc(sizeof(WordEntryIN) * arrlen);
	cur = tmpbuf = (char *) palloc(buflen);

	while (gettoken_tsvector(state, &token, &toklen, &pos, &poslen, NULL))
	{
		if (toklen >= MAXSTRLEN)
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("word is too long (%ld bytes, max %ld bytes)",
							(long) toklen,
							(long) (MAXSTRLEN - 1))));

		if (cur - tmpbuf > MAXSTRPOS)
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("string is too long for tsvector2 (%ld bytes, max %ld bytes)",
							(long) (cur - tmpbuf), (long) MAXSTRPOS)));

		/*
		 * Enlarge buffers if needed
		 */
		if (len >= arrlen)
		{
			arrlen *= 2;
			arr = (WordEntryIN *)
				repalloc((void *) arr, sizeof(WordEntryIN) * arrlen);
		}
		while ((cur - tmpbuf) + toklen >= buflen)
		{
			/* repalloc may move tmpbuf, so re-derive cur from its offset */
			int			dist = cur - tmpbuf;

			buflen *= 2;
			tmpbuf = (char *) repalloc((void *) tmpbuf, buflen);
			cur = tmpbuf + dist;
		}

		/* record the token; its text is referenced by offset into tmpbuf */
		arr[len].entry.hasoff = 0;
		arr[len].entry.len = toklen;
		arr[len].offset = cur - tmpbuf;
		arr[len].entry.npos = poslen;
		arr[len].pos = (poslen != 0) ? pos : NULL;
		memcpy((void *) cur, (void *) token, toklen);
		cur += toklen;
		len++;
	}

	close_tsvector_parser(state);

	/* from here on buflen is the space needed for lexemes and positions */
	if (len > 0)
		len = uniqueentry(arr, len, tmpbuf, &buflen);
	else
		buflen = 0;

	if (buflen > MAXSTRPOS)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("string is too long for tsvector2 (%d bytes, max %d bytes)",
						buflen, MAXSTRPOS)));

	totallen = tsvector2_calcsize(len, buflen);
	in = (TSVector2) palloc0(totallen);
	SET_VARSIZE(in, totallen);
	in->size = len;

	/* copy each unique lexeme and its positions into the result */
	stroff = 0;
	for (i = 0; i < len; i++)
	{
		tsvector2_addlexeme(in, i, &stroff, &tmpbuf[arr[i].offset],
							arr[i].entry.len, arr[i].pos, arr[i].entry.npos);

		if (arr[i].entry.npos)
			pfree(arr[i].pos);
	}

	/* the space estimate from uniqueentry must match what was written */
	Assert((tsvector2_storage(in) + stroff - (char *) in) == totallen);
	PG_RETURN_TSVECTOR2(in);
}

/*
 * tsvector2out - text output function for tsvector2.
 *
 * Produces a space-separated list of lexemes.  Each lexeme is wrapped in
 * single quotes, with embedded single quotes and backslashes doubled.  If
 * the lexeme has positions, they follow after a ':' as a comma-separated
 * list; each position is printed as a decimal number followed by 'A', 'B'
 * or 'C' for weights 3, 2 and 1 respectively, and by nothing for weight 0.
 */
PG_FUNCTION_INFO_V1(tsvector2out);
Datum
tsvector2out(PG_FUNCTION_ARGS)
{
	TSVector2	out = PG_GETARG_TSVECTOR2(0);
	char	   *outbuf;
	int32		i,
				lenbuf = 0,
				pp,
				tscount = out->size;
	uint32		pos;
	WordEntry2 *ptr = tsvector2_entries(out);
	char	   *curbegin,
			   *curin,
			   *curout;

	/* first pass: compute an upper bound for the output buffer size */
	lenbuf = tscount * 2 /* '' */ + tscount - 1 /* space */ + 2 /* \0 */ ;
	for (i = 0; i < tscount; i++)
	{
		int			npos = ENTRY_NPOS(out, ptr + i);

		lenbuf += ENTRY_LEN(out, ptr + i) * 2 * pg_database_encoding_max_length() /* for escape */ ;
		if (npos)
			lenbuf += 1 /* : */ + 7 /* int2 + , + weight */ * npos;
	}

	curout = outbuf = (char *) palloc(lenbuf);

	/* second pass: walk the entries; INCRPTR advances both ptr and pos */
	INITPOS(pos);
	for (i = 0; i < tscount; i++)
	{
		int			lex_len = ENTRY_LEN(out, ptr),
					npos = ENTRY_NPOS(out, ptr);

		curbegin = curin = tsvector2_storage(out) + pos;
		if (i != 0)
			*curout++ = ' ';
		*curout++ = '\'';
		while (curin - curbegin < lex_len)
		{
			/* copy one (possibly multibyte) character at a time */
			int			len = pg_mblen(curin);

			/* double quotes and backslashes by emitting an extra one first */
			if (t_iseq(curin, '\''))
				*curout++ = '\'';
			else if (t_iseq(curin, '\\'))
				*curout++ = '\\';

			while (len--)
				*curout++ = *curin++;
		}

		*curout++ = '\'';
		if ((pp = npos) != 0)
		{
			WordEntryPos *wptr;

			*curout++ = ':';
			wptr = get_lexeme_positions(curbegin, lex_len);
			while (pp)
			{
				curout += sprintf(curout, "%d", WEP_GETPOS(*wptr));
				switch (WEP_GETWEIGHT(*wptr))
				{
					case 3:
						*curout++ = 'A';
						break;
					case 2:
						*curout++ = 'B';
						break;
					case 1:
						*curout++ = 'C';
						break;
					case 0:
					default:
						break;
				}

				/* comma between positions, but not after the last one */
				if (pp > 1)
					*curout++ = ',';
				pp--;
				wptr++;
			}
		}

		INCRPTR(out, ptr, pos);
	}

	*curout = '\0';
	PG_FREE_IF_COPY(out, 0);
	PG_RETURN_CSTRING(outbuf);
}

/*
 * Binary Input / Output functions. The binary format is as follows:
 *
 * uint32	number of lexemes
 *
 * for each lexeme:
 *		lexeme text in client encoding, null-terminated
 *		uint16	number of positions
 *		for each position:
 *			uint16 WordEntryPos
 */

/*
 * tsvector2send - binary output function for tsvector2.
 *
 * Writes the vector in the format described above and returns it as a
 * bytea.
 */
PG_FUNCTION_INFO_V1(tsvector2send);
Datum
tsvector2send(PG_FUNCTION_ARGS)
{
	TSVector2	vec = PG_GETARG_TSVECTOR2(0);
	StringInfoData buf;
	int			i,
				j;
	uint32		pos;
	WordEntry2 *weptr = tsvector2_entries(vec);

	pq_begintypsend(&buf);

	pq_sendint32(&buf, vec->size);

	/* walk the entries; INCRPTR advances both weptr and pos */
	INITPOS(pos);
	for (i = 0; i < vec->size; i++)
	{
		char	   *lexeme = tsvector2_storage(vec) + pos;
		int			npos = ENTRY_NPOS(vec, weptr),
					lex_len = ENTRY_LEN(vec, weptr);

		/*
		 * the strings in the TSVector2 array are not null-terminated, so we
		 * have to send the null-terminator separately
		 */
		pq_sendtext(&buf, lexeme, lex_len);
		pq_sendbyte(&buf, '\0');
		pq_sendint16(&buf, npos);

		if (npos > 0)
		{
			WordEntryPos *wepptr = get_lexeme_positions(lexeme, lex_len);

			for (j = 0; j < npos; j++)
				pq_sendint16(&buf, wepptr[j]);
		}
		INCRPTR(vec, weptr, pos);
	}

	PG_FREE_IF_COPY(vec, 0);
	PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
}

/*
 * tsvector2recv - binary input function for tsvector2.
 *
 * Reads the format described above from the message buffer, validating
 * the entry count, lexeme lengths, position counts and position ordering
 * as it goes.  Lexemes that do not arrive in strictly ascending order
 * (per tsCompareString) cause the entries array to be sorted at the end.
 *
 * Raises an error via elog(ERROR) on any malformed input.
 */
PG_FUNCTION_INFO_V1(tsvector2recv);
Datum
tsvector2recv(PG_FUNCTION_ARGS)
{
	StringInfo	buf = (StringInfo) PG_GETARG_POINTER(0);
	TSVector2	vec;
	int			i,
				datalen;		/* number of bytes used in the variable size
								 * area after fixed size TSVector2 header and
								 * WordEntries */
	int32		nentries;
	Size		hdrlen;
	Size		len;			/* allocated size of vec */
	bool		needSort = false;
	char	   *prev_lexeme = NULL;
	int			prev_lex_len;	/* only read once prev_lexeme is non-NULL */

	nentries = pq_getmsgint(buf, sizeof(int32));
	if (nentries < 0 || nentries > (MaxAllocSize / sizeof(WordEntry2)))
		elog(ERROR, "invalid size of tsvector2");

	hdrlen = tsvector2_hdrlen() + sizeof(WordEntry2) * nentries;

	len = hdrlen * 2;			/* times two to make room for lexemes */
	vec = (TSVector2) palloc0(len);
	vec->size = nentries;

	datalen = 0;
	for (i = 0; i < nentries; i++)
	{
		char	   *lexeme,
				   *lexeme_out;
		uint16		npos;
		int			lex_len;

		lexeme = (char *) pq_getmsgstring(buf);
		npos = (uint16) pq_getmsgint(buf, sizeof(uint16));

		/* sanity checks */

		lex_len = strlen(lexeme);
		if (lex_len > MAXSTRLEN)
			elog(ERROR, "invalid tsvector2: lexeme too long");

		if (datalen > MAXSTRPOS)
			elog(ERROR, "invalid tsvector2: maximum total lexeme length exceeded");

		if (npos > MAXNUMPOS)
			elog(ERROR, "unexpected number of tsvector2 positions");

		/*
		 * Looks valid. Fill the WordEntry2 struct, and copy lexeme.
		 *
		 * But make sure the buffer is large enough first.
		 *
		 * NOTE(review): memory added by repalloc is not zeroed, unlike the
		 * initial palloc0; confirm tsvector2_addlexeme initializes any
		 * padding it introduces, and that this size estimate covers the
		 * alignment it applies.
		 */
		while (hdrlen + SHORTALIGN(datalen + lex_len) +
			   sizeof(WordEntry2) + npos * sizeof(WordEntryPos) >= len)
		{
			len *= 2;
			vec = (TSVector2) repalloc(vec, len);
		}

		/* an equal or smaller lexeme than its predecessor forces a sort */
		if (prev_lexeme && tsCompareString(lexeme, lex_len,
										   prev_lexeme, prev_lex_len, false) <= 0)
			needSort = true;

		/* positions are passed as NULL here and filled in just below */
		lexeme_out = tsvector2_addlexeme(vec, i, &datalen, lexeme,
										 lex_len, NULL, npos);
		if (npos > 0)
		{
			WordEntryPos *wepptr;
			int			j;

			wepptr = get_lexeme_positions(lexeme_out, lex_len);
			for (j = 0; j < npos; j++)
			{
				wepptr[j] = (WordEntryPos) pq_getmsgint(buf, sizeof(WordEntryPos));
				/* positions must be strictly increasing */
				if (j > 0 && WEP_GETPOS(wepptr[j]) <= WEP_GETPOS(wepptr[j - 1]))
					elog(ERROR, "position information is misordered");
			}
		}

		prev_lexeme = lexeme;
		prev_lex_len = lex_len;
	}

	SET_VARSIZE(vec, hdrlen + datalen);

	if (needSort)
		qsort_arg((void *) tsvector2_entries(vec), vec->size, sizeof(WordEntry2),
				  compareentry, (void *) vec);

	PG_RETURN_TSVECTOR2(vec);
}