/* ----------------------------------------------------------------------- *//** * * @file viterbi.cpp * * @brief Viterbi Algorithm for inferencing implementation * * This function implements viterbi algorithm to generate best label sequence * given the state emission and state transition matrices. * *//* ----------------------------------------------------------------------- */ #include #include #include "viterbi.hpp" namespace madlib { namespace modules { namespace crf { using namespace dbal::eigen_integration; using madlib::dbconnector::postgres::madlib_construct_array; AnyType vcrf_top1_label::run(AnyType& args) { ArrayHandle mArray = args[0].getAs >(); ArrayHandle rArray = args[1].getAs >(); const int32_t numLabels = args[2].getAs(); if (numLabels == 0) throw std::invalid_argument("Number of labels cannot be zero"); int doc_len = static_cast(rArray.size() / numLabels); double* prev_top1_array = new double[numLabels]; double* curr_top1_array = new double[numLabels]; double* prev_norm_array = new double[numLabels]; double* curr_norm_array = new double[numLabels]; int* path = new int[doc_len*numLabels]; memset(prev_top1_array, 0, numLabels*sizeof(double)); memset(prev_norm_array, 0, numLabels*sizeof(double)); memset(path, 0, doc_len*numLabels*sizeof(int)); for(int start_pos = 0; start_pos < doc_len; start_pos++) { memset(curr_top1_array, 0, numLabels*sizeof(double)); memset(curr_norm_array, 0, numLabels*sizeof(double)); if (start_pos == 0) { for (int label = 0; label < numLabels; label++) { curr_norm_array[label] = rArray[label] + mArray[label]; curr_top1_array[label] = rArray[label] + mArray[label]; } } else { for (int curr_label = 0; curr_label < numLabels; curr_label++) { for (int prev_label = 0; prev_label < numLabels; prev_label++) { double top1_new_score = prev_top1_array[prev_label] + rArray[start_pos*numLabels + curr_label] + mArray[(prev_label+1)*numLabels + curr_label]; if (start_pos == doc_len - 1) top1_new_score += mArray[(numLabels+1)*numLabels + curr_label]; if (top1_new_score > curr_top1_array[curr_label]) { curr_top1_array[curr_label] = top1_new_score; path[start_pos*numLabels + curr_label] = prev_label; } /* calculate the probability of the best label sequence */ double norm_new_score = prev_norm_array[prev_label] + rArray[start_pos * numLabels + curr_label] + mArray[(prev_label+1)*numLabels + curr_label]; /* last token in a sentence, the end feature should be fired */ if (start_pos == doc_len - 1) norm_new_score += mArray[(numLabels+1)*numLabels + curr_label]; /* The following wants to do z = log(exp(x)+exp(y)), the faster implementation is * z=min(x,y) + log(exp(abs(x-y))+1) * 0.5 is for rounding */ if (curr_norm_array[curr_label] == 0) curr_norm_array[curr_label] = norm_new_score; else { double x = curr_norm_array[curr_label]; double y = norm_new_score; curr_norm_array[curr_label] = std::min(x,y) + static_cast(log(std::exp(std::abs(y-x)/1000.0) +1)*1000.0 + 0.5); } } } } for (int label = 0; label < numLabels; label++) { prev_top1_array[label] = curr_top1_array[label]; prev_norm_array[label] = curr_norm_array[label]; } } /* find the label of the last token in a sentence */ double max_score = 0.0; int top1_label = 0; for(int label = 0; label < numLabels; label++) { if(curr_top1_array[label] > max_score) { max_score = curr_top1_array[label]; top1_label = label; } } /* Define the result array with doc_len+1 elements, where the first doc_len * elements are used to store the best labels and the last element is used * to store the conditional probability of the sequence. */ // FIXME: construct_array functions circumvent the abstraction layer. These // should be replaced with appropriate Allocator:: calls. MutableArrayHandle result( madlib_construct_array( NULL, doc_len + 1, INT4OID, sizeof(int32_t), true, 'i')); /* trace back to get the labels for the rest tokens in a sentence */ result[doc_len - 1] = top1_label; for (int pos = doc_len - 1; pos >= 1; pos--) { top1_label = path[pos * numLabels + top1_label]; result[pos-1] = top1_label; } /* compute the sum_i of log(v1[i]/1000), return (e^sum)*1000 * used in the UDFs which needs marginalization e.g., normalization * the following wants to do z=log(exp(x)+exp(y)), the faster implementation is * z = min(x,y) + log(exp(abs(x-y))+1) */ double norm_factor = 0.0; for (int i = 0; i < numLabels; i++) { if (i==0) norm_factor = curr_norm_array[0]; else { double x = curr_norm_array[i]; double y = norm_factor; norm_factor = std::min(x,y) + static_cast(log(exp(std::abs(y-x)/1000.0) +1)*1000.0+0.5); } } /* calculate the conditional probability. * To convert the probability into integer, firstly,let it multiply 1000000, then later make the product divided by 1000000 * to get the real conditional probability */ result[doc_len] = static_cast(std::exp((max_score - norm_factor)/1000.0)*1000000); delete[] prev_top1_array; delete[] curr_top1_array; delete[] prev_norm_array; delete[] curr_norm_array; delete[] path; return result; } } } }