/* ------------------------------------------------------ * * @file feature_encoding.cpp * * @brief Feature encoding functions for Decision Tree models * * */ /* ----------------------------------------------------------------------- */ #include #include #include #include "feature_encoding.hpp" #include "ConSplits.hpp" namespace madlib { namespace modules { namespace recursive_partitioning { // Use Eigen using namespace dbal; using namespace dbal::eigen_integration; // ------------------------------------------------------------ AnyType print_con_splits::run(AnyType &args){ ConSplitsResult state = args[0].getAs(); AnyType tuple; tuple << state.con_splits; return tuple; } AnyType dst_compute_con_splits_transition::run(AnyType &args){ ConSplitsSample state = args[0].getAs(); if (!state.empty() && state.num_rows >= state.buff_size) { return args[0]; } // NULLs are handled by caller to ensure consistency between // feature encoding and tree training MappedColumnVector con_features = args[1].getAs(); if (state.empty()) { uint32_t n_per_seg = args[2].getAs(); uint16_t n_bins = args[3].getAs(); state.num_splits = static_cast(n_bins - 1); state.num_features = static_cast(con_features.size()); state.buff_size = n_per_seg; state.resize(); } state << con_features; return state.storage(); } // ------------------------------------------------------------ AnyType dst_compute_con_splits_final::run(AnyType &args){ ConSplitsSample state = args[0].getAs(); ConSplitsResult result = defaultAllocator().allocateByteString< dbal::FunctionContext, dbal::DoZero, dbal::ThrowBadAlloc>(0); result.num_features = state.num_features; result.num_splits = state.num_splits; result.resize(); if (state.num_rows <= state.num_splits) { std::stringstream error_msg; // In message below, add 1 to state.num_splits since the meaning of // "splits" for the caller is the number of quantiles, where as // "splits" in this function is the number of values dividing the data // into quantiles. error_msg << "Decision tree error: Number of splits (" << state.num_splits + 1 << ") is larger than the number of records (" << state.num_rows << ")"; throw std::runtime_error(error_msg.str()); } int bin_size = static_cast(state.num_rows / (state.num_splits + 1)); for (Index i = 0; i < state.num_features; i ++) { // sorting the values for each feature ColumnVector feature_i_sample = \ state.sample.row(i).segment(0, state.num_rows); std::sort(feature_i_sample.data(), feature_i_sample.data() + feature_i_sample.size()); // binning for (int j = 0; j < state.num_splits; j ++) { result.con_splits(i, j) = feature_i_sample(bin_size * (j + 1) - 1); } } return result.storage(); } // ------------------------------------------------------------ AnyType dst_compute_entropy_transition::run(AnyType &args){ int encoded_dep_var = args[1].getAs(); if (encoded_dep_var < 0) { throw std::runtime_error("unexpected negative encoded_dep_var"); } MutableNativeIntegerVector state; if (args[0].isNull()) { // allocate the state for the first row int num_dep_var = args[2].getAs(); if (num_dep_var <= 0) { throw std::runtime_error("unexpected non-positive num_dep_var"); } state.rebind(this->allocateArray(num_dep_var)); } else { // avoid state copying if initialized state.rebind(args[0].getAs >()); } if (encoded_dep_var >= state.size()) { std::stringstream err_msg; err_msg << "out-of-bound encoded_dep_var=" << encoded_dep_var << ", while smaller than " << state.size() << " expected"; throw std::runtime_error(err_msg.str()); } state(encoded_dep_var) ++; return state; } // ------------------------------------------------------------ AnyType dst_compute_entropy_merge::run(AnyType &args){ if (args[0].isNull()) { return args[1]; } if (args[1].isNull()) { return args[0]; } MutableNativeIntegerVector state0 = args[0].getAs(); NativeIntegerVector state1 = args[1].getAs(); state0 += state1; return state0; } // ------------------------------------------------------------ namespace { double p_log2_p(const double &p) { if (p < 0.) { throw std::runtime_error("unexpected negative probability"); } if (p == 0.) { return 0.; } return p * log2(p); } } // ------------------------------------------------------------ AnyType dst_compute_entropy_final::run(AnyType &args){ MappedIntegerVector state = args[0].getAs(); double sum_of_dep_counts = static_cast(state.sum()); ColumnVector probs = state.cast() / sum_of_dep_counts; // usage of unaryExpr with functor: // http://eigen.tuxfamily.org/dox/classEigen_1_1MatrixBase.html#a23fc4bf97168dee2516f85edcfd4cfe7 return -(probs.unaryExpr(std::ptr_fun(p_log2_p))).sum(); } // ------------------------------------------------------------ inline bool cmp_text(text* s1, text* s2) { if (VARSIZE_ANY(s1) != VARSIZE_ANY(s2)) return false; else { return strncmp(VARDATA_ANY(s1), VARDATA_ANY(s2), VARSIZE_ANY(s1) - VARHDRSZ) == 0; } } // ------------------------------------------------------------ AnyType map_catlevel_to_int::run(AnyType &args){ ArrayHandle cat_values = args[0].getAs >(); ArrayHandle cat_levels = args[1].getAs >(); ArrayHandle n_levels = args[2].getAs >(); bool null_as_category = args[3].getAs(); MutableArrayHandle cat_int = allocateArray(n_levels.size()); int pos = 0; for (size_t i = 0; i < n_levels.size(); i++) { // linear search to find a match // if cat_values contains any not present in cat_levels, then the mapped // integer is -1. If cat_values contains a known cat_level, then the // mapped integer is the index of that value in cat_levels. int match = -1; for (int j = 0; j < n_levels[i]; j++){ if (cmp_text(cat_values[i], cat_levels[pos + j])) { match = j; break; } } // If null_as_category is True, then match is set to the last index // instead of -1 since the last index is expected to represent NULL. if (match == -1 and null_as_category){ cat_int[i] = n_levels[i] - 1; } else { cat_int[i] = match; } pos += static_cast(n_levels[i]); } return cat_int; } // -------------------------------------------------------------- AnyType get_bin_value_by_index::run(AnyType &args) { ConSplitsResult con_splits_results = args[0].getAs(); int feature_index = args[1].getAs(); int bin_index = args[2].getAs(); // -1 is used to indicate a NaN value if (bin_index < 0) return std::numeric_limits::quiet_NaN(); // assuming we use <= to create bins by values in con_splits // see TreeAccumulator::operator<<(const tuple_type&) if (bin_index < con_splits_results.con_splits.cols()) { // covering ranges (-inf,v_0], ..., (v_{n-2}, v_{n-1}] return con_splits_results.con_splits(feature_index, bin_index); } else { // covering range (v_{n-1}, +inf) return con_splits_results.con_splits(feature_index, bin_index-1) + 1.; } } // -------------------------------------------------------------- AnyType get_bin_index_by_value::run(AnyType &args) { double bin_value = args[0].getAs(); // we return a -1 index is the value is NaN if (std::isnan(bin_value)) { return -1; } ConSplitsResult con_splits_results = args[1].getAs(); if (con_splits_results.con_splits.cols() <= 0) { return Null(); } int feature_index = args[2].getAs(); // assuming we use <= to create bins by values in con_splits // and each row in con_splits is sorted in ascending order // see TreeAccumulator::operator<<(const tuple_type&) // and dst_compute_con_splits_final::run(AnyType &) // each v_i covering ranges (-inf,v_0], ..., (v_{n-2}, v_{n-1}] int result = -1; int low = 0; int high = static_cast(con_splits_results.con_splits.cols()) - 1; if (bin_value > con_splits_results.con_splits(feature_index, high)) { // covering range (v_{n-1}, +inf) result = high + 1; } while (low < high) { int mid = (low + high) / 2; if (bin_value <= con_splits_results.con_splits(feature_index, mid)) { high = mid; } else { low = mid + 1; } } result = high; #ifndef NDEBUG // test code for binary search (mimic std::lower_bound()) if (low != high) { dbout << "bin_value: " << bin_value << "\ncon_splits_results.con_splits.row(feature_index): " << con_splits_results.con_splits.row(feature_index) << "\nlow: " << low << ", high: " << high << std::endl; throw std::runtime_error("low_bound() has bug!"); } for (int i = 0; i < con_splits_results.con_splits.cols(); i ++) { if (bin_value <= con_splits_results.con_splits(feature_index, i)) { if (i != result) { dbout << "bin_value: " << bin_value << "\ncon_splits_results.con_splits.row(feature_index): " << con_splits_results.con_splits.row(feature_index) << "\nresult: " << result << ", i: " << i << std::endl; throw std::runtime_error("low_bound() has bug!"); } return i; } } #endif return result; } // -------------------------------------------------------------- AnyType get_bin_indices_by_values::run(AnyType &args) { // Null-handling is done by declaring it as strict MappedColumnVector bin_values = args[0].getAs(); ConSplitsResult con_splits_results = args[1].getAs(); if (con_splits_results.con_splits.cols() <= 0) { return Null(); } MutableNativeIntegerVector bin_indices( this->allocateArray(bin_values.size())); // assuming we use <= to create bins by values in con_splits // and each row in con_splits is sorted in ascending order // see TreeAccumulator::operator<<(const tuple_type&) // and dst_compute_con_splits_final::run(AnyType &) // each v_i covering ranges (-inf,v_0], ..., (v_{n-2}, v_{n-1}] for (Index i = 0; i < bin_values.size(); i ++) { int low = 0; int high = static_cast(con_splits_results.con_splits.cols()) - 1; if (std::isnan(bin_values(i))) { // we return a -1 index is the value is NaN bin_indices(i) = -1; } else if (bin_values(i) > con_splits_results.con_splits(i, high)) { // covering range (v_{n-1}, +inf) bin_indices(i) = high + 1; } else { // binary search while (low < high) { int mid = (low + high) / 2; if (bin_values(i) <= con_splits_results.con_splits(i, mid)) { high = mid; } else { low = mid + 1; } } bin_indices(i) = high; } } return bin_indices; } } // namespace recursive_partitioning } // namespace modules } // namespace madlib