/*
 * icu_normalize.c
 *
 * Part of icu_ext: a PostgreSQL extension to expose functionality from ICU
 * (see http://icu-project.org)
 *
 * By Daniel Vérité, 2018-2020. See LICENSE.md
 */

/* Postgres includes */
#include "postgres.h"
#include "mb/pg_wchar.h"
#include "utils/builtins.h"
#include "utils/pg_locale.h"

/* ICU includes */
#include "unicode/unorm.h"

#include "icu_ext.h"

/*
 * Declare the SQL-callable entry points implemented in this file
 * (both are defined further down with the PG_FUNCTION_ARGS signature).
 */
PG_FUNCTION_INFO_V1(icu_is_normalized);
PG_FUNCTION_INFO_V1(icu_normalize);

/*
 * The four Unicode normalization forms that can be requested by name.
 * Values are spelled out explicitly; they match the implicit numbering.
 */
typedef enum
{
	UNICODE_NFC = 0,
	UNICODE_NFD = 1,
	UNICODE_NFKC = 2,
	UNICODE_NFKD = 3
} norm_form_t;

/*
 * Translate a normalization form name into its norm_form_t value.
 *
 * The name is compared with pg_strcasecmp() against "NFC", "NFD", "NFKC"
 * and "NFKD".  Any other name raises an ERROR quoting the offending name.
 */
static norm_form_t
name_to_norm(const char *formstr)
{
	if (pg_strcasecmp(formstr, "NFC") == 0)
		return UNICODE_NFC;
	if (pg_strcasecmp(formstr, "NFD") == 0)
		return UNICODE_NFD;
	if (pg_strcasecmp(formstr, "NFKC") == 0)
		return UNICODE_NFKC;
	if (pg_strcasecmp(formstr, "NFKD") == 0)
		return UNICODE_NFKD;

	elog(ERROR, "invalid normalization form: %s", formstr);

	/*
	 * Not reached: elog(ERROR) does not return.  The explicit return keeps
	 * compilers that cannot see this from complaining about (or miscompiling)
	 * a non-void function that falls off its end.
	 */
	return UNICODE_NFC;
}

/*
 * Return the ICU normalizer instance corresponding to the given form.
 *
 * Raises an ERROR if ICU reports a failure, or if no instance was obtained
 * (which can only happen if "form" is not one of the norm_form_t values).
 * The result is therefore never NULL.
 */
static const UNormalizer2 *
norm_instance(norm_form_t form)
{
	UErrorCode	status = U_ZERO_ERROR;
	const UNormalizer2 *instance = NULL;

	/*
	 * Deliberately no "default" case: this lets the compiler warn when a new
	 * norm_form_t value is added without being handled here.  Out-of-range
	 * values are caught by the NULL check below.
	 */
	switch (form)
	{
	case UNICODE_NFC:
		instance = unorm2_getNFCInstance(&status);
		break;
	case UNICODE_NFD:
		instance = unorm2_getNFDInstance(&status);
		break;
	case UNICODE_NFKC:
		instance = unorm2_getNFKCInstance(&status);
		break;
	case UNICODE_NFKD:
		instance = unorm2_getNFKDInstance(&status);
		break;
	}

	if (U_FAILURE(status))
		elog(ERROR, "norm_instance failure: %s", u_errorName(status));

	/* Never hand a NULL normalizer back to callers: they pass it to ICU as is */
	if (instance == NULL)
		elog(ERROR, "norm_instance failure: unexpected normalization form %d",
			 (int) form);

	return instance;
}

/*
 * Return the string (1st arg) with the given Unicode normalization
 * (2nd arg).
 */
Datum
icu_normalize(PG_FUNCTION_ARGS)
{
	text *src_text = PG_GETARG_TEXT_PP(0);
	const char* arg_form = text_to_cstring(PG_GETARG_TEXT_P(1));
	norm_form_t form = name_to_norm(arg_form);
	const UNormalizer2 *instance = norm_instance(form);
	int32_t u_src_length, u_dest_length, effective_length, result_len;
	char *result;
	UChar *u_src, *u_dest;
	UErrorCode	status = U_ZERO_ERROR;

	if (GetDatabaseEncoding() != PG_UTF8)
		elog(ERROR, "non-Unicode database encoding");

	u_src_length = icu_to_uchar(&u_src,
								VARDATA_ANY(src_text),
								VARSIZE_ANY_EXHDR(src_text));

	/*
	 * The result may be expanded by the maximum factor given at:
	 * https://unicode.org/faq/normalization.html#12
	 * (given that the UChar buffer is in UTF-16)
	 */
	switch(form)
	{
	case UNICODE_NFC:
		u_dest_length = u_src_length * 3;
		break;
	case UNICODE_NFD:
		u_dest_length = u_src_length * 4;
		break;
	case UNICODE_NFKC:
	case UNICODE_NFKD:
	default:
		u_dest_length = u_src_length * 18;
		break;
	}

	u_dest = (UChar*) palloc(u_dest_length*sizeof(UChar));

	effective_length = unorm2_normalize(instance,
										u_src,
										u_src_length,
										u_dest,
										u_dest_length,
										&status);
	if (U_FAILURE(status))
		elog(ERROR, "unorm2_normalize failure: %s", u_errorName(status));

	result_len = icu_from_uchar(&result, u_dest, effective_length);
	PG_RETURN_TEXT_P(cstring_to_text_with_len(result, result_len));
}

/*
 * Check if a string (1st arg) is in the given Unicode normal form
 * (2nd arg).
 *
 * The 2nd argument is a form name as accepted by name_to_norm().
 * An ERROR is raised if the form name is invalid, if the database
 * encoding is not UTF-8, or if ICU reports a failure.
 */
Datum
icu_is_normalized(PG_FUNCTION_ARGS)
{
	text	   *src_text = PG_GETARG_TEXT_PP(0);
	const char *arg_form = text_to_cstring(PG_GETARG_TEXT_PP(1));
	norm_form_t form = name_to_norm(arg_form);
	const UNormalizer2 *instance = norm_instance(form);
	UErrorCode	status = U_ZERO_ERROR;
	UChar	   *u_src;
	int32_t		u_src_length;
	UBool		is_norm;

	if (GetDatabaseEncoding() != PG_UTF8)
		elog(ERROR, "non-Unicode database encoding");

	u_src_length = icu_to_uchar(&u_src,
								VARDATA_ANY(src_text),
								VARSIZE_ANY_EXHDR(src_text));

	is_norm = unorm2_isNormalized(instance, u_src, u_src_length, &status);

	if (U_FAILURE(status))
		elog(ERROR, "unorm2_isNormalized failure: %s", u_errorName(status));

	/*
	 * Test against zero instead of the TRUE macro: recent ICU releases no
	 * longer define TRUE/FALSE by default, so relying on it can break the
	 * build.
	 */
	PG_RETURN_BOOL(is_norm != 0);
}