#include "postgres.h" #include #include "fmgr.h" #include "utils/array.h" #include #include "catalog/pg_type.h" #ifndef NO_PG_MODULE_MAGIC PG_MODULE_MAGIC; #endif #ifndef min #define min(a,b)((a) < (b) ? (a):(b)) #endif Datum vcrf_top1_label(PG_FUNCTION_ARGS); /** * @file viterbi_top1.c * @brief find the most probable sequence assignment and the conditional probability using Viterbi algorithm [1] * @date February 2012 * @param marray Encode the edge feature, start feature and end feature * @param rarray Encode the single state feature, e.g, word feature, regex feature. * @param nlabel Total number of labels in the label space * @literature * [1] http://en.wikipedia.org/wiki/Viterbi_algorithm * @return the most probable sequence assignment and two integer numbers used to calculate the conditional probability. * the fist number is the numerator, the second number is the normalization factor. **/ PG_FUNCTION_INFO_V1(vcrf_top1_label); Datum vcrf_top1_label(PG_FUNCTION_ARGS) { ArrayType *result; int *prev_top1_array, *curr_top1_array, *prev_norm_array, *curr_norm_array, *path, *mArray, *rArray; int ndatabytes, nbytes, norm_factor, i; int start_pos, label, currlabel, prevlabel, doclen, nlabel; Oid element_type; mArray = (int*)ARR_DATA_PTR(PG_GETARG_ARRAYTYPE_P(0)); element_type = ARR_ELEMTYPE(PG_GETARG_ARRAYTYPE_P(0)); rArray = (int*)ARR_DATA_PTR(PG_GETARG_ARRAYTYPE_P(1)); nlabel = PG_GETARG_INT32(2); doclen = ARR_DIMS(PG_GETARG_ARRAYTYPE_P(1))[0]/nlabel; // initialize the four temporary arrays ndatabytes = sizeof(int) * nlabel; prev_top1_array = (int*) palloc0(ndatabytes); curr_top1_array = (int*) palloc0(ndatabytes); prev_norm_array = (int*) palloc0(ndatabytes); curr_norm_array = (int*) palloc0(ndatabytes); // define the path nbytes = doclen*nlabel*sizeof(int); path = (int*)palloc0(nbytes); // define the output, the first doclen elements are to store the best label sequence // the last two elements are used to calculate the probability. result = (ArrayType *) palloc(sizeof(int)*(doclen+1) + ARR_OVERHEAD_NONULLS(1)); SET_VARSIZE(result, sizeof(int)*(doclen+1) + ARR_OVERHEAD_NONULLS(1)); result->ndim = 1; result->dataoffset = 0; result->elemtype = element_type; ARR_DIMS(result)[0] = doclen+1; ARR_LBOUND(result)[0] = 1; for(start_pos=0;start_pos curr_top1_array[currlabel]){ curr_top1_array[currlabel] = top1_new_score; path[start_pos*nlabel+currlabel] = prevlabel; } // calculate the probability of the best label sequence int norm_new_score = prev_norm_array[prevlabel] + rArray[start_pos*nlabel+currlabel] + mArray[(prevlabel+1)*nlabel+currlabel]; // the last token in a sentence, the end feature should be fired if(start_pos == doclen-1){ norm_new_score += mArray[(nlabel+1)*nlabel+currlabel]; } // the following wants to do z=log(exp(x)+exp(y)), the faster implementation is // z=min(x,y) + log(exp(abs(x-y))+1) // 0.5 is for rounding if (curr_norm_array[currlabel] == 0){ curr_norm_array[currlabel] = norm_new_score; } else { curr_norm_array[currlabel] = min(curr_norm_array[currlabel], norm_new_score) + (int)(log(exp(abs(norm_new_score - curr_norm_array[currlabel])/1000.0) + 1)*1000.0 + 0.5); } } } } for(label=0; labelmaxscore){ maxscore = curr_top1_array[label]; top1label = label; } } // trace back to get the labels for the rest tokens in a sentence ((int*)ARR_DATA_PTR(result))[doclen-1] = top1label; for (pos=doclen-1; pos>=1; pos--){ top1label = path[pos*nlabel+top1label]; ((int*)ARR_DATA_PTR(result))[pos-1] = top1label; } // compute the sum_i of log(v1[i]/1000), return (e^sum)*1000 // used in the UDFs which needs marginalization e.g., normalization // the following wants to do z=log(exp(x)+exp(y)), the faster implementation is // z=min(x,y) + log(exp(abs(x-y))+1) norm_factor = 0; for(i = 0; i < nlabel; i++) { if(i == 0) { norm_factor = curr_norm_array[0]; } else { norm_factor = min(curr_norm_array[i], norm_factor) + (int)(log(exp(abs(norm_factor - curr_norm_array[i])/1000.0) + 1)*1000.0 + 0.5); } } // calculate the conditional probability. // to convert the probability into integer, firstly,let it multiply 1000000, then later make the product divided by 1000000 // to get the real conditional probability ((int*)ARR_DATA_PTR(result))[doclen] = (int)(exp((maxscore - norm_factor)/1000.0)*1000000); PG_RETURN_ARRAYTYPE_P(result); }