/*
 * icu_transform.c
 *
 * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU
 * (see http://icu-project.org)
 *
 * By Daniel Vérité, 2018-2019. See LICENSE.md
 */

#include "postgres.h"
#include "fmgr.h"
#include "funcapi.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"

#include "unicode/uenum.h"
#include "unicode/utrans.h"

PG_FUNCTION_INFO_V1(icu_transforms_list);
PG_FUNCTION_INFO_V1(icu_transform);

/*
 * List the available pre-defined transforms/transliterations.
 *
 * Set-returning function: the first call opens an enumeration with
 * utrans_openIDs() and stores it in funcctx->user_fctx; each call then
 * returns the next identifier as text, and the enumeration is closed
 * when uenum_next() returns NULL.
 *
 * The enumeration is released with uenum_close(), not pfree(), so it is
 * closed explicitly before raising an error from uenum_next().
 */
Datum
icu_transforms_list(PG_FUNCTION_ARGS)
{
	FuncCallContext *funcctx;
	UErrorCode	status = U_ZERO_ERROR;
	UEnumeration *en;
	const char *elt;

	if (SRF_IS_FIRSTCALL())
	{
		funcctx = SRF_FIRSTCALL_INIT();
		en = utrans_openIDs(&status);
		if (U_FAILURE(status))
			elog(ERROR, "utrans_openIDs failed: %s", u_errorName(status));
		funcctx->user_fctx = (void *) en;
	}

	funcctx = SRF_PERCALL_SETUP();
	en = (UEnumeration *) funcctx->user_fctx;

	elt = uenum_next(en, NULL, &status);
	if (U_FAILURE(status))
	{
		/* Do not leave the enumeration open when bailing out. */
		uenum_close(en);
		elog(ERROR, "uenum_next failed: %s", u_errorName(status));
	}

	if (elt)
	{
		text	   *item = cstring_to_text(elt);

		SRF_RETURN_NEXT(funcctx, PointerGetDatum(item));
	}
	else
	{
		/* End of the list: release the enumeration and finish. */
		uenum_close(en);
		SRF_RETURN_DONE(funcctx);
	}
}

/*
 * Main function to apply a transformation based on UTransliterator.
 * Input:
 *  1st arg: string to transform
 *  2nd arg: name (system identifier) of the transliterator
 * Returns the transformed string as text.
 *
 * Raises an error if the transliterator cannot be opened or if
 * utrans_transUChars() fails for a reason other than a too-small buffer.
 * The transliterator is released with utrans_close(), not pfree(), so it
 * is closed explicitly before raising an error from the transform step.
 */
Datum
icu_transform(PG_FUNCTION_ARGS)
{
	text	   *arg1 = PG_GETARG_TEXT_PP(0);
	text	   *arg2 = PG_GETARG_TEXT_PP(1);
	int32_t		len1 = VARSIZE_ANY_EXHDR(arg1);
	const char *input_id = text_to_cstring(arg2);
	UErrorCode	status = U_ZERO_ERROR;
	UTransliterator *utrans;
	int32_t		ulen,
				limit,
				capacity,
				start,
				original_ulen;
	int32_t		result_len,
				in_ulen;
	UChar	   *utext;
	UChar	   *trans_id;
	char	   *result;
	UChar	   *original;

	in_ulen = icu_to_uchar(&trans_id, input_id, strlen(input_id));

	utrans = utrans_openU(trans_id,
						  in_ulen,
						  UTRANS_FORWARD,
						  NULL, /* rules. NULL for system transliterators */
						  -1,
						  NULL, /* pointer to parseError. Not used */
						  &status);
	if (U_FAILURE(status) || !utrans)
	{
		elog(ERROR, "utrans_open failed: %s", u_errorName(status));
	}

	ulen = icu_to_uchar(&utext, text_to_cstring(arg1), len1);

	/* utext is terminated by a zero UChar that we include in the copy. */
	original = (UChar *) palloc((ulen + 1) * sizeof(UChar));
	original_ulen = ulen;
	memcpy(original, utext, (ulen + 1) * sizeof(UChar));

	limit = ulen;
	capacity = ulen + 1;
	start = 0;

	/*
	 * utrans_transUChars() updates the string in-place, stopping if
	 * it would go over `capacity`.
	 * The following loop doubles the capacity and restarts from
	 * scratch with a clean copy of the source if the buffer was
	 * too small.
	 * Although it looks like we could use `start` and `limit`
	 * to reallocate and make the transliteration continue from
	 * where it stopped, in practice this does not appear to work.
	 * The documentation is quite unclear about this function.
	 */
	for (;;)
	{
		status = U_ZERO_ERROR;
		utrans_transUChars(utrans,
						   utext,
						   &ulen,
						   capacity,
						   start,	/* beginning index */
						   &limit,
						   &status);

		if (!U_FAILURE(status))
			break;				/* transliteration complete */

		if (status != U_BUFFER_OVERFLOW_ERROR)
		{
			/* Release the transliterator before raising the error. */
			utrans_close(utrans);
			elog(ERROR, "utrans_transUChars failed: %s", u_errorName(status));
		}

		/* Buffer too small: enlarge it and retry from the pristine input. */
		pfree(utext);
		capacity = capacity * 2;
		utext = (UChar *) palloc(capacity * sizeof(UChar));
		/* restore the original text in the enlarged buffer */
		ulen = original_ulen;
		limit = ulen;
		memcpy(utext, original, (ulen + 1) * sizeof(UChar));
	}

	utrans_close(utrans);

	result_len = icu_from_uchar(&result, utext, ulen);

	PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len));
}