/* ----------------------------------------------------------------------- *//**
 *
 * @file DT_proto.hpp
 *
 *//* ----------------------------------------------------------------------- */

#ifndef MADLIB_MODULES_RP_DT_PROTO_HPP
#define MADLIB_MODULES_RP_DT_PROTO_HPP

#include <dbconnector/dbconnector.hpp>

#include <cmath>    // std::isnan
#include <string>
#include <tuple>
#include <vector>
#include <limits>   // std::numeric_limits

namespace madlib {

namespace modules {

namespace recursive_partitioning {

// Use Eigen
using namespace dbal;
using namespace dbal::eigen_integration;

using std::vector;
using std::string;

// stats_per_split is the number of statistics accumulated for evaluating a
// split. This differs for classification and regression: the classification
// value is a function of the number of response values and is computed at
// runtime, whereas the regression value is a constant.
// For classification, accumulate the following:
//   number of weighted tuples for each possible response value, and the
//   number of unweighted tuples
// For regression, accumulate 4 values for evaluating a split:
//   weight, weight * response, weight * response^2, number of unweighted rows
const uint16_t REGRESS_N_STATS = 4u;

template <class Container>
class DecisionTree: public DynamicStruct<DecisionTree<Container>, Container> {
public:
    typedef DynamicStruct<DecisionTree, Container> Base;
    MADLIB_DYNAMIC_STRUCT_TYPEDEFS;

    enum { MSE, MISCLASS, ENTROPY, GINI };
    enum { IN_PROCESS_LEAF=-1, FINISHED_LEAF=-2, NODE_NON_EXISTING=-3 };
    enum { SURR_IS_MAJORITY=-1, SURR_NON_EXISTING=-2 };

    // functions
    DecisionTree(Init_type& inInitialization);
    DecisionTree();
    void bind(ByteStream_type& inStream);
    DecisionTree& rebind(const uint16_t in_tree_depth,
                         const uint16_t in_n_y_labels,
                         const uint16_t in_max_n_surr,
                         const bool in_is_regression);
    DecisionTree& incrementInPlace();
    DecisionTree& my_tree();

    Index search(MappedIntegerVector cat_features,
                 MappedColumnVector con_features) const;
    uint64_t getMajorityCount(Index node_index) const;
    bool getMajoritySplit(Index node_index) const;
    bool getSurrSplit(Index node_index,
                      MappedIntegerVector cat_features,
                      MappedColumnVector con_features) const;
    ColumnVector predict(MappedIntegerVector cat_features,
                         MappedColumnVector con_features) const;
    double predict_response(MappedIntegerVector cat_features,
                            MappedColumnVector con_features) const;
    double predict_response(Index feature_index) const;

    bool updatePrimarySplit(const Index node_index,
                            const int &max_feat,
                            const double &max_threshold,
                            const bool &max_is_cat,
                            const uint16_t &min_split,
                            const ColumnVector &true_stats,
                            const ColumnVector &false_stats);

    template <class Accumulator>
    bool expand(const Accumulator &state,
                const MappedMatrix &con_splits,
                const uint16_t &min_split,
                const uint16_t &min_bucket,
                const uint16_t &max_depth);

    template <class Accumulator>
    bool expand_by_sampling(const Accumulator &state,
                            const MappedMatrix &con_splits,
                            const uint16_t &min_split,
                            const uint16_t &min_bucket,
                            const uint16_t &max_depth,
                            const int &num_random_features);

    Index parentIndex(Index current) const {
        return current > 0 ? static_cast<Index>((current - 1) / 2) : 0;
    }
    Index trueChild(Index current) const { return 2 * current + 1; }
    Index falseChild(Index current) const { return 2 * current + 2; }
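
    // For illustration, the tree is stored as a complete binary tree in the
    // heap-style layout implied by the three helpers above:
    //     root = 0
    //     trueChild(0) = 1,  falseChild(0) = 2
    //     trueChild(1) = 3,  falseChild(1) = 4
    //     parentIndex(4) = (4 - 1) / 2 = 1
    // so a tree of depth d occupies 2^d - 1 node slots in the member vectors
    // declared below.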

    bool isInternalNode(const Index node_index) const {
        // Internal nodes have a valid index pointing to the feature
        // used in the split of the node.
        return feature_indices(node_index) >= 0;
    }

    double impurity(const ColumnVector &stats) const;
    double impurityGain(const ColumnVector &combined_stats,
                        const uint16_t &stats_per_split) const;

    bool shouldSplit(const uint64_t &total_count,
                     const uint64_t &true_count,
                     const uint64_t &false_count,
                     const uint16_t &min_split,
                     const uint16_t &min_bucket,
                     const uint16_t &max_depth) const;

    template <class Accumulator>
    void pickSurrogates(const Accumulator &state,
                        const MappedMatrix &con_splits);

    ColumnVector statPredict(const ColumnVector &stats) const;
    uint64_t statCount(const ColumnVector &stats) const;
    double statWeightedCount(const ColumnVector &stats) const;
    uint64_t nodeCount(const Index node_index) const;
    double nodeWeightedCount(const Index node_index) const;
    double computeMisclassification(const Index node_index) const;
    double computeRisk(const Index node_index) const;

    bool isChildPure(const ColumnVector &stats) const;
    bool isNull(const double feature_val, const bool is_categorical) const {
        return (is_categorical) ? (feature_val < 0) : std::isnan(feature_val);
    }

    uint16_t recomputeTreeDepth() const;

    string displayLeafNode(Index id,
                           ArrayHandle<text*> &dep_levels,
                           const std::string &id_prefix,
                           bool verbose);
    string displayInternalNode(Index id,
                               ArrayHandle<text*> &cat_features_str,
                               ArrayHandle<text*> &con_features_str,
                               ArrayHandle<text*> &cat_levels_text,
                               ArrayHandle<int> &cat_n_levels,
                               ArrayHandle<text*> &dep_levels,
                               const std::string &id_prefix,
                               bool verbose);
    string display(ArrayHandle<text*>&, ArrayHandle<text*>&,
                   ArrayHandle<text*>&, ArrayHandle<int>&,
                   ArrayHandle<text*>&, const std::string&, bool verbose);
    string getCatLabels(Index, Index, Index,
                        ArrayHandle<text*>&, ArrayHandle<int>&);
    string print_split(bool, bool, Index, double,
                       ArrayHandle<text*>&, ArrayHandle<text*>&,
                       ArrayHandle<text*>&, ArrayHandle<int>&);
    string print(Index, ArrayHandle<text*>&, ArrayHandle<text*>&,
                 ArrayHandle<text*>&, ArrayHandle<int>&,
                 ArrayHandle<text*>&, uint16_t);
    string surr_display(ArrayHandle<text*> &cat_features_str,
                        ArrayHandle<text*> &con_features_str,
                        ArrayHandle<text*> &cat_levels_text,
                        ArrayHandle<int> &cat_n_levels);

    int encodeIndex(const int &feature_index,
                    const int &is_categorical,
                    const int &n_cat_features) const;
    void computeVariableImportance(ColumnVector &cat_var_importance,
                                   ColumnVector &con_var_importance);

    // attributes
    // dimension information
    uint16_type tree_depth;     // 1 for a root-only tree
    uint16_type n_y_labels;
    uint16_type max_n_surr;
    bool_type is_regression;    // = 0 for classification, = 1 for regression
    uint16_type impurity_type;  // can be 'mse', 'gini', 'entropy', 'misclass'

    // All vectors below are of size n_nodes ( = 2^{tree_depth} - 1 )
    // Note: in order to make DynamicStruct::DryRun work,
    // non-scalars should be of size 0 when dimension is 0.
    // The complete tree is broken into multiple vectors: each vector being the
    // collection of a single variable for all nodes.

    // -1 means leaf, -2 means non-existing node
    IntegerVector_type feature_indices;
    // elements are of integer type for categorical features
    ColumnVector_type feature_thresholds;
    // used as a boolean array, 0 means continuous, otherwise categorical
    IntegerVector_type is_categorical;

    // Used to keep count of the number of non-null rows that are split to the
    // left and right children for each internal node.
    // For leaf nodes, the value is 0.
    // This is useful when using surrogates to compute the majority count/split
    // for an internal node. Size = n_nodes x 2
    ColumnVector_type nonnull_split_count;
    // FIXME: we use a ColumnVector (elements of 'double' type) to store
    // big int values, since we don't yet have a vector type with uint64_t
    // elements.
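
    // Illustrative reading of the parallel arrays above (a sketch, assuming
    // the same "<= threshold goes to the true child" convention that the
    // accumulator comments below use): a node i with
    //     feature_indices(i)    = 2
    //     is_categorical(i)     = 1
    //     feature_thresholds(i) = 3
    // splits on the third categorical feature, sending rows with
    // cat_features[2] <= 3 to trueChild(i) and the rest to falseChild(i).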

    // 'surr_indices' is of size n_nodes x max_n_surr
    // If a particular node has fewer than max_n_surr surrogates, then
    // the indices of the non-existing surrogates will be -1
    IntegerVector_type surr_indices;
    ColumnVector_type surr_thresholds;  // used as integer if classification
    // used as an enumerated array to record the status of a surrogate split
    //  0 = default value indicating an invalid surrogate split
    //  1 = categorical feature <= threshold
    // -1 = categorical feature > threshold
    //  2 = continuous feature <= threshold
    // -2 = continuous feature > threshold
    IntegerVector_type surr_status;
    // used for keeping count of the number of rows that are common between
    // the primary and surrogate splits
    IntegerVector_type surr_agreement;

    // 'predictions' is a matrix of size n_nodes x n_predictions,
    // where n_predictions = stats_per_split;
    // stats_per_split is set by the accumulator during training of the tree
    Matrix_type predictions;  // used as integers for classification
};

// ------------------------------------------------------------------------

// TreeAccumulator is used for collecting statistics while training the nodes
// of one level of the decision tree. The same accumulator is also used for
// computing the statistics for surrogate splits.
template <class Container, class DTree>
class TreeAccumulator
  : public DynamicStruct<TreeAccumulator<Container, DTree>, Container> {
public:
    typedef DynamicStruct<TreeAccumulator, Container> Base;
    MADLIB_DYNAMIC_STRUCT_TYPEDEFS;
    typedef DTree tree_type;

    typedef std::tuple<
        tree_type,
        MappedIntegerVector,  // categorical feature values
        MappedColumnVector,   // continuous feature values
        double,               // response variable
        double,               // weight
        MappedIntegerVector,  // levels for each categorical feature
        MappedMatrix          // split values for each continuous feature
    > tuple_type;

    typedef std::tuple<
        tree_type,
        MappedIntegerVector,  // categorical feature values
        MappedColumnVector,   // continuous feature values
        MappedIntegerVector,  // levels for each categorical feature
        MappedMatrix,         // split values for each continuous feature
        int                   // duplicated count for each tuple
                              // (used in random forest)
    > surr_tuple_type;

    // functions
    TreeAccumulator(Init_type& inInitialization);
    void bind(ByteStream_type& inStream);
    void rebind(uint16_t n_bins, uint16_t n_cat_feat, uint16_t n_con_feat,
                uint32_t n_total_levels, uint16_t tree_depth, uint16_t n_stats,
                bool weights_as_rows, uint32_t n_reachable_leaf_nodes);

    TreeAccumulator& operator<<(const tuple_type& inTuple);
    TreeAccumulator& operator<<(const surr_tuple_type& inTuple);

    template <class C, class DT>
    TreeAccumulator& operator<<(const TreeAccumulator<C, DT>& inOther);

    bool empty() const { return this->n_rows == 0; }

    // cat_features[feature_index] <= cat_value
    Index indexCatStats(Index feature_index, int cat_value,
                        bool is_split_true) const;
    // con_features[feature_index] <= bin_threshold,
    // bin_threshold = con_splits[feature_index, bin_index]
    Index indexConStats(Index feature_index, Index bin_index,
                        bool is_split_true) const;
    Index computeSubIndex(Index start_Index, Index relative_index,
                          bool is_split_true) const;

    void updateNodeStats(bool is_regression, Index row_index,
                         const double response, const double weight);
    // apply the tuple using indices
    void updateStats(bool is_regression, bool is_cat, Index row_index,
                     Index stats_index, const double response,
                     const double weight);
    // apply the tuple using indices
    void updateSurrStats(const bool is_cat, const bool surr_agrees,
                         Index row_index, Index stats_index,
                         const int dup_count);
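
    // Usage sketch (hypothetical caller code, not declared in this header):
    // each training row is streamed in as a tuple_type, and partial states
    // from parallel workers are merged with the templated operator<< above:
    //
    //     state << tuple_type(tree, cat_values, con_values,
    //                         response, weight, cat_n_levels, con_splits);
    //     state_left << state_right;   // merge two partial aggregates
    //
    // The accumulated statistics land in the cat_stats/con_stats matrices
    // described below.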

    // attributes
    uint64_type n_rows;  // number of rows mapped to this node
    // return the NULL state if the accumulation terminated due to an error
    bool_type terminated;

    // dimension information
    uint16_type n_bins;
    uint16_type n_cat_features;
    uint16_type n_con_features;
    // sum of the number of levels in each categorical variable
    uint32_type total_n_cat_levels;
    // n_leaf_nodes = 2^{dt.tree_depth - 1} for dt.tree_depth > 0
    uint32_type n_leaf_nodes;
    // Not all "leaf" nodes at a tree level are reachable. A leaf becomes
    // non-reachable when one of its ancestors is itself a leaf.
    // For a full tree, n_leaf_nodes = n_reachable_leaf_nodes
    uint32_type n_reachable_leaf_nodes;
    // For regression, stats_per_split = 4, i.e. (w, w*y, w*y^2, 1)
    // For classification, stats_per_split = (number of class labels + 1),
    //   i.e. (w_1, w_2, ..., w_c, 1)
    // For surrogate calculation, stats_per_split = 1
    uint16_type stats_per_split;
    // treat weights as duplicated rows (used for random forest)
    bool_type weights_as_rows;

    // training statistics
    // cumulative sum of the levels of the categorical variables,
    // with the first element as 0. This is helpful to compute the index into
    // cat_stats for each categorical variable.
    // size = n_cat_features
    // last element = (total_n_cat_levels - last element of cat_levels)
    IntegerVector_type cat_levels_cumsum;  // used as an integer array

    // con_stats and cat_stats are matrices that contain the statistics used
    // during training.
    // cat_stats is a matrix of size:
    //   (n_reachable_leaf_nodes) x (total_n_cat_levels * stats_per_split * 2)
    Matrix_type cat_stats;
    // con_stats is a matrix of size:
    //   (n_reachable_leaf_nodes) x (n_con_features * n_bins * stats_per_split * 2)
    Matrix_type con_stats;
    // node_stats keeps statistics of all the rows that land on a node and is
    // used to determine the prediction made by that node.
    // In the absence of any NULL values, these stats can be deduced from
    // cat_stats/con_stats. In the presence of NULL values, the stats could be
    // different.
    Matrix_type node_stats;

    // The above stats matrices are used as pseudo-sparse matrices since not
    // all leaf nodes are reachable (especially as the tree gets deeper).
    IntegerVector_type stats_lookup;
};
// ------------------------------------------------------------------------

} // namespace recursive_partitioning
} // namespace modules
} // namespace madlib

#endif // defined(MADLIB_MODULES_RP_DT_PROTO_HPP)