/* ------------------------------------------------------ * * @file random_forest.cpp * * @brief Random Forest functions * * */ /* ----------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include #include #include "DT_proto.hpp" #include "DT_impl.hpp" #include "ConSplits.hpp" #include /* fabs */ #include "random_forest.hpp" namespace madlib { // Use Eigen using namespace dbal::eigen_integration; using boost::random::discrete_distribution; using boost::random::variate_generator; namespace modules { namespace recursive_partitioning { typedef DecisionTree Tree; /* * Permute each categorical variable and predict */ AnyType rf_cat_imp_score::run(AnyType &args) { if (args[0].isNull() || args[7].isNull()) { return Null(); } Tree dt = args[0].getAs(); MutableNativeIntegerVector cat_features; NativeColumnVector con_features; try { if (args[1].isNull()){ // no cat features return Null(); } else { MutableNativeIntegerVector xx_cat = args[1].getAs(); cat_features.rebind(xx_cat.memoryHandle(), xx_cat.size()); } if (args[2].isNull()){ con_features.rebind(this->allocateArray(0)); } else { NativeColumnVector xx_con = args[2].getAs(); con_features.rebind(xx_con.memoryHandle(), xx_con.size()); } } catch (const ArrayWithNullException &e) { // not expect to reach here // if max_surr = 0, nulls are filtered // otherwise, mapped to -1 or NaN return Null(); } MappedIntegerVector cat_n_levels = args[3].getAs(); int n_permutations = args[4].getAs(); double y = args[5].getAs(); bool is_classification = args[6].getAs(); MappedMatrix distributions = args[7].getAs(); // returning MutableNativeColumnVector permuted_predictions( this->allocateArray(cat_n_levels.size())); // permute each and predict NativeRandomNumberGenerator generator; for (int p = 0; p < n_permutations; p ++) { for (Index i = 0; i < cat_n_levels.size(); i ++) { int orig_i = cat_features(i); discrete_distribution<> ddist(distributions.col(i).data(), distributions.col(i).data() + cat_n_levels(i) + 1); variate_generator > rvt(generator, ddist); cat_features(i) = rvt() - 1; // calling NativeIntegerVector for a const cast // see EigenIntegration_impl.hpp in ports for details double prediction = dt.predict_response( NativeIntegerVector(cat_features.memoryHandle()), con_features); double score = 0.; if (is_classification) { score = y - prediction < 1e-3 ? 1. : 0.; } else { score = - (y - prediction) * (y - prediction); } permuted_predictions(i) += score; cat_features(i) = orig_i; } } permuted_predictions /= n_permutations; return permuted_predictions; } // ------------------------------------------------------------ /* * Permute each continuous variable and predict */ AnyType rf_con_imp_score::run(AnyType &args) { if (args[0].isNull() || args[7].isNull()) { return Null(); } Tree dt = args[0].getAs(); NativeIntegerVector cat_features; MutableNativeColumnVector con_features; try { if (args[1].isNull()){ // no cat features cat_features.rebind(this->allocateArray(0)); } else { NativeIntegerVector xx_cat = args[1].getAs(); cat_features.rebind(xx_cat.memoryHandle(), xx_cat.size()); } if (args[2].isNull()){ //no con features return Null(); } else { MutableNativeColumnVector xx_con = args[2].getAs(); con_features.rebind(xx_con.memoryHandle(), xx_con.size()); } } catch (const ArrayWithNullException &e) { // not expect to reach here // if max_surr = 0, nulls are filtered // otherwise, mapped to -1 or NaN return Null(); } // con_splits size = num_con_features x num_bins // When num_con_features = 0, the input will be an empty string that is read // as a ByteString ConSplitsResult splits_results = args[3].getAs(); int n_permutations = args[4].getAs(); double y = args[5].getAs(); bool is_classification = args[6].getAs(); MappedMatrix distributions = args[7].getAs(); // returning MutableNativeColumnVector permuted_predictions( this->allocateArray(con_features.size())); // permute each and predict NativeRandomNumberGenerator generator; for (int p = 0; p < n_permutations; p ++) { for (Index i = 0; i < con_features.size(); i ++) { double orig_i = con_features(i); discrete_distribution<> ddist(distributions.col(i).data(), distributions.col(i).data() + distributions.rows()); variate_generator > rvt(generator, ddist); int outcome = rvt(); if (outcome == 0) { con_features(i) = std::numeric_limits::quiet_NaN(); } else if (outcome == static_cast(distributions.rows()) - 1) { // bin value that is larger than the last separator (last value in con_splits) con_features(i) = splits_results.con_splits(i, outcome-2) + 1.; } else { con_features(i) = splits_results.con_splits(i, outcome-1); } // calling NativeColumnVector for a const cast // see EigenIntegration_impl.hpp in ports for details double prediction = dt.predict_response( cat_features, NativeColumnVector(con_features.memoryHandle())); double score = 0.; if (is_classification) { score = y - prediction < 1e-3 ? 1. : 0.; } else { score = - (y - prediction) * (y - prediction); } permuted_predictions(i) += score; con_features(i) = orig_i; } } permuted_predictions /= n_permutations; return permuted_predictions; } // ------------------------------------------------------------ AnyType normalize_sum_array::run(AnyType &args){ const MappedColumnVector input_vector = args[0].getAs(); const double sum_target = args[1].getAs(); double sum_input_vector = input_vector.sum(); // Avoid divide by zero by dividing by a small number if sum is small double VAR_IMP_EPSILON = 1e-6; if (sum_input_vector < VAR_IMP_EPSILON) sum_input_vector = VAR_IMP_EPSILON; ColumnVector output_vector = input_vector * sum_target / sum_input_vector; return output_vector; } } // namespace recursive_partitioning } // namespace modules } // namespace madlib