/* ----------------------------------------------------------------------- *//** * * @file MultiResponseGLM_impl.hpp * *//* ----------------------------------------------------------------------- */ #ifndef MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP #define MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP #include #include #include namespace madlib { namespace modules { namespace glm { // Use Eigen using namespace dbal::eigen_integration; template inline MultiResponseGLMAccumulator::MultiResponseGLMAccumulator( Init_type& inInitialization) : Base(inInitialization), optimizer(inInitialization) { this->initialize(); } /** * @brief Reset the accumulator before accumulating the first tuple */ template inline void MultiResponseGLMAccumulator::reset() { num_rows = 0; terminated = false; loglik = 0.; optimizer.reset(); } /** * @brief Bind all elements of the state to the data in the stream * * The bind() is special in that even after running operator>>() on an element, * there is no guarantee yet that the element can indeed be accessed. It is * cruicial to first check this. * * Provided that this methods correctly lists all member variables, all other * methods can, however, rely on that fact that all variables are correctly * initialized and accessible. */ template inline void MultiResponseGLMAccumulator::bind( ByteStream_type& inStream) { inStream >> num_features >> num_categories >> num_rows >> terminated >> loglik >> optimizer; // bind vcov and optimizer.hessian onto the same memory as we don't need them // at the same time vcov.rebind(optimizer.hessian.memoryHandle(), optimizer.hessian.rows(), optimizer.hessian.cols()); } /** * @brief Update the accumulation state by feeding a tuple */ template inline MultiResponseGLMAccumulator& MultiResponseGLMAccumulator::operator<<( const tuple_type& inTuple) { const MappedColumnVector& x = std::get<0>(inTuple); const int& y = static_cast(std::get<1>(inTuple)); // convert y to an indicator vector ColumnVector vecY(num_categories-1); vecY.fill(0); if (y != 0) { vecY(y - 1) = 1; } // The following checks were introduced with MADLIB-138. It still seems // useful to have clear error messages in case of infinite input values. if (!dbal::eigen_integration::isfinite(x)) { warning("Design matrix is not finite."); } else if (x.size() > std::numeric_limits::max()) { warning("Number of independent variables cannot be " "larger than 65535."); } else if (num_features != static_cast(x.size())) { warning("Inconsistent numbers of independent variables."); } else { // normal case uint16_t N = num_features; uint16_t C = static_cast(num_categories - 1); ColumnVector mu(C); ColumnVector ita(C); Matrix G_prime(C, C); Matrix V(C, C); // initialize ita and mu in the first iteration if (optimizer.beta.norm() == 0.) { Link::init(mu); Link::link_func(mu,ita); } else { for (int i = 0; i < ita.size(); i ++) { ita(i) = trans(x) * optimizer.beta.segment(i*N, N); } Link::mean_func(ita, mu); } // intermediate values Link::mean_derivative(ita, G_prime); Family::variance(mu, V); // log-likelihood loglik += Family::loglik(vecY, mu, 1.); // calcualte X_trans_W_X Matrix GtVinvG(trans(G_prime) * V.inverse() * G_prime); Matrix XXt(x * trans(x)); for (int i = 0; i < C; i ++) { for (int j = 0; j < C; j ++) { for (int x_i = 0; x_i < num_features; x_i ++) { for (int x_j = 0; x_j < num_features; x_j ++) { optimizer.hessian(i*num_features + x_i, j*num_features + x_j) += GtVinvG(i,j) * XXt(x_i,x_j); } } } } // calculate X_trans_W_Y ColumnVector YVinvG(trans(vecY-mu) * V.inverse() * G_prime); for (int i = 0; i < C; i ++) { optimizer.grad.segment(i*N, N) -= YVinvG(i) * x; } num_rows ++; return *this; } // all tests passed and we were in the else // error case terminated = true; return *this; } /** * @brief Merge with another accumulation state */ template template inline MultiResponseGLMAccumulator& MultiResponseGLMAccumulator::operator<<( const MultiResponseGLMAccumulator& inOther) { if (this->empty()) { *this = inOther; } else if (inOther.empty()) { } else if (num_features != inOther.num_features) { warning("Inconsistent numbers of independent variables."); terminated = true; } else { num_rows += inOther.num_rows; loglik += inOther.loglik; optimizer.grad += inOther.optimizer.grad; optimizer.hessian += inOther.optimizer.hessian; } return *this; } /** * @brief Copy from a previous state */ template template inline MultiResponseGLMAccumulator& MultiResponseGLMAccumulator::operator=( const MultiResponseGLMAccumulator& inOther) { this->copy(inOther); return *this; } /** * @brief Apply the accumulated intra-state values to inter-state member */ template inline void MultiResponseGLMAccumulator::apply() { if (!dbal::eigen_integration::isfinite(optimizer.hessian) || !dbal::eigen_integration::isfinite(optimizer.grad)) { warning("Hessian or gradient is not finite."); terminated = true; } else { optimizer.apply(); } } // ----------------------------------------------------------------------- template MultiResponseGLMResult::MultiResponseGLMResult(const MultiResponseGLMAccumulator& state) { compute(state); } /** * @brief Transform an accumulation state into a result */ template inline MultiResponseGLMResult& MultiResponseGLMResult::compute(const MultiResponseGLMAccumulator& state) { typedef Eigen::Map MMap; typedef MMap::Scalar Scalar; // Vector of coefficients: For efficiency reasons, we want to return this // by reference, so we need to bind to db memory Allocator& allocator = defaultAllocator(); Index N = static_cast(state.num_features); Index C = static_cast(state.num_categories - 1); coef.rebind(allocator.allocateArray(C, N), N, C); std_err.rebind(allocator.allocateArray(C, N), N, C); z_stats.rebind(allocator.allocateArray(C, N), N, C); p_values.rebind(allocator.allocateArray(C, N), N, C); // computation loglik = state.loglik; coef = MMap(const_cast(state.optimizer.beta.data()), N, C); Matrix tmp_sd(state.vcov.diagonal().cwiseSqrt()); tmp_sd.resize(N, C); std_err = tmp_sd; z_stats = coef.cwiseQuotient(std_err); for (Index i = 0; i < N*C; i ++) { p_values(i) = 2. * prob::cdf( prob::normal(), -std::abs(z_stats(i))); } num_rows_processed = state.num_rows; return *this; } } // namespace glm } // namespace modules } // namespace madlib #endif // defined(MADLIB_MODULES_GLM_MULTIRESPONSEGLM_IMPL_HPP)