/* ----------------------------------------------------------------------- *//**
 *
 * @file LinearRegression_impl.hpp
 *
 *//* ----------------------------------------------------------------------- */

#ifndef MADLIB_MODULES_REGRESS_LINEAR_REGRESSION_IMPL_HPP
#define MADLIB_MODULES_REGRESS_LINEAR_REGRESSION_IMPL_HPP

#include <cmath>
#include <limits>
#include <stdexcept>
#include <tuple>
#include <boost/math/distributions.hpp>

namespace madlib {

// Use Eigen
using namespace dbal::eigen_integration;

namespace modules {

namespace regress {

template <class Container>
inline
LinearRegressionAccumulator<Container>::LinearRegressionAccumulator(
    Init_type& inInitialization)
  : Base(inInitialization) {

    this->initialize();
}

/**
 * @brief Bind all elements of the state to the data in the stream
 *
 * The bind() is special in that even after running operator>>() on an element,
 * there is no guarantee yet that the element can indeed be accessed. It is
 * crucial to first check this.
 *
 * Provided that this method correctly lists all member variables, all other
 * methods can, however, rely on the fact that all variables are correctly
 * initialized and accessible.
 */
template <class Container>
inline
void
LinearRegressionAccumulator<Container>::bind(ByteStream_type& inStream) {
    inStream >> numRows >> widthOfX >> y_sum >> y_square_sum;
    uint16_t actualWidthOfX = widthOfX.isNull()
        ? static_cast<uint16_t>(0)
        : static_cast<uint16_t>(widthOfX);
    inStream
        >> X_transp_Y.rebind(actualWidthOfX)
        >> X_transp_X.rebind(actualWidthOfX, actualWidthOfX);
}

/**
 * @brief Update the accumulation state
 *
 * We update the number of rows \f$ n \f$, the partial
 * sums \f$ \sum_{i=1}^n y_i \f$ and \f$ \sum_{i=1}^n y_i^2 \f$, the matrix
 * \f$ X^T X \f$, and the vector \f$ X^T \boldsymbol y \f$.
 */
template <class Container>
inline
LinearRegressionAccumulator<Container>&
LinearRegressionAccumulator<Container>::operator<<(const tuple_type& inTuple) {
    const MappedColumnVector& x = std::get<0>(inTuple);
    const double& y = std::get<1>(inTuple);
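    // The per-row update below maintains exactly the sufficient statistics
    // named in the doc comment above:
    //
    //   n      <- n + 1
    //   S_y    <- S_y  + y          (y_sum)
    //   S_yy   <- S_yy + y^2        (y_square_sum)
    //   X^T y  <- X^T y + y * x     (X_transp_Y)
    //   X^T X  <- X^T X + x * x^T   (X_transp_X, only a triangular part stored)
    //
    // Nothing else about the data needs to be retained for compute().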
    // The following checks were introduced with MADLIB-138. It still seems
    // useful to have clear error messages in case of infinite input values.
    if (!std::isfinite(y))
        throw std::domain_error("Dependent variables are not finite.");
    else if (!dbal::eigen_integration::isfinite(x))
        throw std::domain_error("Design matrix is not finite.");
    else if (x.size() > std::numeric_limits<uint16_t>::max())
        throw std::domain_error("Number of independent variables cannot be "
            "larger than 65535.");

    // Initialize in first iteration
    if (numRows == 0) {
        widthOfX = static_cast<uint16_t>(x.size());
        this->resize();
    } else if (widthOfX != static_cast<uint16_t>(x.size())) {
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");
    }

    numRows++;
    y_sum += y;
    y_square_sum += y * y;
    X_transp_Y.noalias() += x * y;

    // X^T X is symmetric, so it is sufficient to only fill a triangular part
    // of the matrix
    triangularView<Lower>(X_transp_X) += x * trans(x);

    return *this;
}

/**
 * @brief Merge with another accumulation state
 */
template <class Container>
template <class OtherContainer>
inline
LinearRegressionAccumulator<Container>&
LinearRegressionAccumulator<Container>::operator<<(
    const LinearRegressionAccumulator<OtherContainer>& inOther) {

    // Initialize if necessary
    if (numRows == 0) {
        *this = inOther;
        return *this;
    } else if (inOther.numRows == 0)
        return *this;
    else if (widthOfX != inOther.widthOfX)
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");

    numRows += inOther.numRows;
    y_sum += inOther.y_sum;
    y_square_sum += inOther.y_square_sum;
    X_transp_Y.noalias() += inOther.X_transp_Y;
    triangularView<Lower>(X_transp_X) += inOther.X_transp_X;

    return *this;
}

template <class Container>
template <class OtherContainer>
inline
LinearRegressionAccumulator<Container>&
LinearRegressionAccumulator<Container>::operator=(
    const LinearRegressionAccumulator<OtherContainer>& inOther) {

    this->copy(inOther);
    return *this;
}

template <class Container>
LinearRegression::LinearRegression(
    const LinearRegressionAccumulator<Container>& inState) {

    compute(inState);
}

/**
 * @brief Transform a linear-regression accumulation state into a result
 *
 * The result of the accumulation phase is \f$ X^T X \f$ and
 * \f$ X^T \boldsymbol y \f$. We first compute the pseudo-inverse, then the
 * regression coefficients, the model statistics, etc.
 *
 * @sa For the mathematical description, see \ref grp_linreg.
 */
template <class Container>
inline
LinearRegression&
LinearRegression::compute(
    const LinearRegressionAccumulator<Container>& inState) {

    Allocator& allocator = defaultAllocator();

    // The following checks were introduced with MADLIB-138. It still seems
    // useful to have clear error messages in case of infinite input values.
    if (!dbal::eigen_integration::isfinite(inState.X_transp_X) ||
        !dbal::eigen_integration::isfinite(inState.X_transp_Y))
        throw std::domain_error("Design matrix is not finite.");

    SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
        inState.X_transp_X, EigenvaluesOnly, ComputePseudoInverse);

    // Precompute (X^T * X)^+
    Matrix inverse_of_X_transp_X = decomposition.pseudoInverse();
    conditionNo = decomposition.conditionNo();

    // Vector of coefficients: For efficiency reasons, we want to return this
    // by reference, so we need to bind to db memory
    coef.rebind(allocator.allocateArray<double>(inState.widthOfX));
    coef.noalias() = inverse_of_X_transp_X * inState.X_transp_Y;

    // explained sum of squares (regression sum of squares)
    double ess = dot(inState.X_transp_Y, coef)
        - (inState.y_sum * inState.y_sum / static_cast<double>(inState.numRows));

    // total sum of squares
    double tss = inState.y_square_sum
        - (inState.y_sum * inState.y_sum / static_cast<double>(inState.numRows));
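    // In centered form (and assuming the model contains an intercept term, so
    // that the residuals sum to zero) the two quantities above are
    //
    //   tss = sum_i (y_i - ybar)^2    = sum_i y_i^2    - (sum_i y_i)^2 / n
    //   ess = sum_i (yhat_i - ybar)^2 = coef^T (X^T y) - (sum_i y_i)^2 / n
    //
    // i.e. both are computed purely from the accumulated state.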
    // With infinite precision, the following checks are pointless. But due to
    // floating-point arithmetic, this need not hold at this point.
    // Without a formal proof convincing us of the contrary, we should
    // anticipate that numerical peculiarities might occur.
    if (tss < 0)
        tss = 0;
    if (ess < 0)
        ess = 0;
    // Since we know tss with greater accuracy than ess, we do the following
    // sanity adjustment to ess:
    if (ess > tss)
        ess = tss;

    // coefficient of determination
    // If tss == 0, then the regression perfectly fits the data, so the
    // coefficient of determination is 1.
    r2 = (tss == 0 ? 1 : ess / tss);

    // In the case of linear regression:
    // residual sum of squares (rss) = total sum of squares (tss) - explained
    // sum of squares (ess)
    // Proof: http://en.wikipedia.org/wiki/Sum_of_squares
    double rss = tss - ess;

    // Variance is also called the mean square error
    double variance
        = rss / static_cast<double>(inState.numRows - inState.widthOfX);

    // Vector of standard errors and t-statistics: For efficiency reasons, we
    // want to return these by reference, so we need to bind to db memory
    stdErr.rebind(allocator.allocateArray<double>(inState.widthOfX));
    tStats.rebind(allocator.allocateArray<double>(inState.widthOfX));
    for (int i = 0; i < inState.widthOfX; i++) {
        // In an abundance of caution, we see a tiny possibility that numerical
        // instabilities in the pinv operation can lead to negative values on
        // the main diagonal of even a SPD matrix
        if (inverse_of_X_transp_X(i,i) < 0) {
            stdErr(i) = 0;
        } else {
            stdErr(i) = std::sqrt( variance * inverse_of_X_transp_X(i,i) );
        }

        if (coef(i) == 0 && stdErr(i) == 0) {
            // In this special case, 0/0 should be interpreted as 0:
            // We know that 0 is the exact value for the coefficient, so
            // the t-value should be 0 (corresponding to a p-value of 1)
            tStats(i) = 0;
        } else {
            // If stdErr(i) == 0 then abs(tStats(i)) will be infinity, which
            // is what we need.
            tStats(i) = coef(i) / stdErr(i);
        }
    }

    // Variance-covariance matrix: For efficiency reasons, we want to return
    // this by reference, so we need to bind to db memory
    vcov.rebind(allocator.allocateArray<double>(inState.widthOfX, inState.widthOfX),
        inState.widthOfX, inState.widthOfX);
    vcov = variance * inverse_of_X_transp_X;

    // Vector of p-values: For efficiency reasons, we want to return this
    // by reference, so we need to bind to db memory
    pValues.rebind(allocator.allocateArray<double>(inState.widthOfX));
    if (inState.numRows > inState.widthOfX)
        for (int i = 0; i < inState.widthOfX; i++)
            pValues(i) = 2. * prob::cdf(
                boost::math::complement(
                    prob::students_t(
                        static_cast<double>(inState.numRows - inState.widthOfX)
                    ),
                    std::fabs(tStats(i))
                ));

    return *this;
}
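/*
 * Summary of the inference performed in LinearRegression::compute() above,
 * with n = numRows and p = widthOfX:
 *
 *   variance (mean square error)  sigma2 = rss / (n - p)
 *   standard error                se_i   = sqrt( sigma2 * [(X^T X)^+]_{ii} )
 *   t-statistic                   t_i    = coef_i / se_i
 *   two-sided p-value             p_i    = 2 * Pr( T_{n-p} > |t_i| )
 *
 * where T_{n-p} is a Student-t random variable with n - p degrees of freedom.
 */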
/*
 * Robust Linear Regression: Huber-White Sandwich estimator
 */
template <class Container>
inline
RobustLinearRegressionAccumulator<Container>::RobustLinearRegressionAccumulator(
    Init_type& inInitialization)
  : Base(inInitialization) {

    this->initialize();
}

/**
 * @brief Bind all elements of the state to the data in the stream
 *
 * The bind() is special in that even after running operator>>() on an element,
 * there is no guarantee yet that the element can indeed be accessed. It is
 * crucial to first check this.
 *
 * Provided that this method correctly lists all member variables, all other
 * methods can, however, rely on the fact that all variables are correctly
 * initialized and accessible.
 */
template <class Container>
inline
void
RobustLinearRegressionAccumulator<Container>::bind(ByteStream_type& inStream) {
    inStream >> numRows >> widthOfX;
    uint16_t actualWidthOfX = widthOfX.isNull()
        ? static_cast<uint16_t>(0)
        : static_cast<uint16_t>(widthOfX);
    inStream
        >> ols_coef.rebind(actualWidthOfX)
        >> X_transp_X.rebind(actualWidthOfX, actualWidthOfX)
        >> X_transp_r2_X.rebind(actualWidthOfX, actualWidthOfX);
}

/**
 * @brief Update the accumulation state
 *
 * We update the number of rows \f$ n \f$, the matrix \f$ X^T X \f$,
 * and the matrix \f$ X^T \mathrm{diag}(r_1^2, r_2^2, \ldots, r_n^2) X \f$.
 */
template <class Container>
inline
RobustLinearRegressionAccumulator<Container>&
RobustLinearRegressionAccumulator<Container>::operator<<(const tuple_type& inTuple) {
    // Inputs
    const MappedColumnVector& x = std::get<0>(inTuple);
    const double& y = std::get<1>(inTuple);
    const MappedColumnVector& coef = std::get<2>(inTuple);

    if (!std::isfinite(y))
        throw std::domain_error("Dependent variables are not finite.");
    else if (x.size() > std::numeric_limits<uint16_t>::max())
        throw std::domain_error("Number of independent variables cannot be "
            "larger than 65535.");

    // Initialize in first iteration
    if (numRows == 0) {
        widthOfX = static_cast<uint16_t>(x.size());
        this->resize();
        ols_coef = coef;
    }

    // dimension check
    if (widthOfX != static_cast<uint16_t>(x.size())) {
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");
    }

    numRows++;
    double r = y - trans(ols_coef)*x;

    // The following matrices are symmetric, so it is sufficient to
    // only fill a triangular part
    triangularView<Lower>(X_transp_X) += x * trans(x);
    triangularView<Lower>(X_transp_r2_X) += r * r * x * trans(x);
    return *this;
}

/**
 * @brief Merge with another accumulation state
 */
template <class Container>
template <class OtherContainer>
inline
RobustLinearRegressionAccumulator<Container>&
RobustLinearRegressionAccumulator<Container>::operator<<(
    const RobustLinearRegressionAccumulator<OtherContainer>& inOther) {

    numRows += inOther.numRows;
    triangularView<Lower>(X_transp_X) += inOther.X_transp_X;
    triangularView<Lower>(X_transp_r2_X) += inOther.X_transp_r2_X;
    return *this;
}

template <class Container>
template <class OtherContainer>
inline
RobustLinearRegressionAccumulator<Container>&
RobustLinearRegressionAccumulator<Container>::operator=(
    const RobustLinearRegressionAccumulator<OtherContainer>& inOther) {

    this->copy(inOther);
    return *this;
}

template <class Container>
RobustLinearRegression::RobustLinearRegression(
    const RobustLinearRegressionAccumulator<Container>& inState) {

    compute(inState);
}

/**
 * @brief Transform a robust linear-regression accumulation state into a result
 *
 * The result of the accumulation phase is \f$ X^T X \f$ and
 * \f$ X^T U X \f$. We first compute the pseudo-inverse, then the
 * robust model statistics.
 *
 * @sa For the mathematical description, see \ref grp_linreg.
 */
template <class Container>
inline
RobustLinearRegression&
RobustLinearRegression::compute(
    const RobustLinearRegressionAccumulator<Container>& inState) {

    Allocator& allocator = defaultAllocator();

    // The following checks were introduced with MADLIB-138. It still seems
    // useful to have clear error messages in case of infinite input values.
    if (!dbal::eigen_integration::isfinite(inState.X_transp_X) ||
        !dbal::eigen_integration::isfinite(inState.X_transp_r2_X))
        throw std::domain_error("Design matrix is not finite.");

    SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
        inState.X_transp_X, EigenvaluesOnly, ComputePseudoInverse);

    // Precompute (X^T * X)^+
    Matrix inverse_of_X_transp_X = decomposition.pseudoInverse();

    // Calculate the robust variance-covariance matrix as:
    //   (X^T X)^-1 X^T diag(r_1^2, r_2^2, ..., r_n^2) X (X^T X)^-1
    // where r_1, r_2, ..., r_n are the residuals.
    // Note: X_transp_r2_X holds X^T diag(r_1^2, r_2^2, ..., r_n^2) X, but only
    // its lower triangular part was filled during accumulation, so it is
    // symmetrized first.
    Matrix robust_var_cov
        = inState.X_transp_r2_X.template triangularView<Eigen::StrictlyLower>();
    robust_var_cov = robust_var_cov + trans(inState.X_transp_r2_X);
    robust_var_cov
        = inverse_of_X_transp_X * robust_var_cov * inverse_of_X_transp_X;
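    // This is the Huber-White sandwich form
    //
    //   V_robust = B M B,   B ("bread") = (X^T X)^+,
    //                       M ("meat")  = X^T diag(r_1^2, ..., r_n^2) X,
    //
    // whose diagonal provides the heteroskedasticity-robust standard errors
    // used below.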
    // Vector of standard errors and t-statistics: For efficiency reasons, we
    // want to return these by reference, so we need to bind to db memory
    coef.rebind(allocator.allocateArray<double>(inState.widthOfX));
    stdErr.rebind(allocator.allocateArray<double>(inState.widthOfX));
    tStats.rebind(allocator.allocateArray<double>(inState.widthOfX));
    for (int i = 0; i < inState.widthOfX; i++) {
        // In an abundance of caution, we see a tiny possibility that numerical
        // instabilities in the pinv operation can lead to negative values on
        // the main diagonal of even a SPD matrix
        coef(i) = inState.ols_coef(i);
        if (inverse_of_X_transp_X(i,i) < 0) {
            stdErr(i) = 0;
        } else {
            stdErr(i) = std::sqrt(robust_var_cov(i,i));
        }

        if (inState.ols_coef(i) == 0 && stdErr(i) == 0) {
            // In this special case, 0/0 should be interpreted as 0:
            // We know that 0 is the exact value for the coefficient, so
            // the t-value should be 0 (corresponding to a p-value of 1)
            tStats(i) = 0;
        } else {
            // If stdErr(i) == 0 then abs(tStats(i)) will be infinity, which
            // is what we need.
            tStats(i) = inState.ols_coef(i) / stdErr(i);
        }
    }

    // Vector of p-values: For efficiency reasons, we want to return this
    // by reference, so we need to bind to db memory
    pValues.rebind(allocator.allocateArray<double>(inState.widthOfX));
    if (inState.numRows > inState.widthOfX) {
        for (int i = 0; i < inState.widthOfX; i++) {
            pValues(i) = 2. * prob::cdf(
                boost::math::complement(
                    prob::students_t(
                        static_cast<double>(inState.numRows - inState.widthOfX)
                    ),
                    std::fabs(tStats(i))
                ));
        }
    }
    return *this;
}

/*
 * Regression for the tests for heteroskedasticity
 */
template <class Container>
inline
HeteroLinearRegressionAccumulator<Container>::HeteroLinearRegressionAccumulator(
    Init_type& inInitialization)
  : Base(inInitialization) {

    this->initialize();
}

/**
 * @brief Bind all elements of the state to the data in the stream
 *
 * The bind() is special in that even after running operator>>() on an element,
 * there is no guarantee yet that the element can indeed be accessed. It is
 * crucial to first check this.
 *
 * Provided that this method correctly lists all member variables, all other
 * methods can, however, rely on the fact that all variables are correctly
 * initialized and accessible.
 */
template <class Container>
inline
void
HeteroLinearRegressionAccumulator<Container>::bind(ByteStream_type& inStream) {
    inStream >> numRows >> widthOfX >> a_sum >> a_square_sum;
    uint16_t actualWidthOfX = widthOfX.isNull()
        ? static_cast<uint16_t>(0)
        : static_cast<uint16_t>(widthOfX);
    inStream
        >> X_transp_A.rebind(actualWidthOfX)
        >> X_transp_X.rebind(actualWidthOfX, actualWidthOfX);
}
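/*
 * The update step below runs the auxiliary regression used by
 * Breusch-Pagan-type heteroskedasticity tests: the squared residuals
 * a_i = (y_i - x_i^T coef)^2 of a previous OLS fit are regressed on the same
 * design matrix X, so only sum(a), sum(a^2), X^T a, and X^T X need to be
 * accumulated. The n * R^2 statistic of this auxiliary fit is formed in
 * HeteroLinearRegression::compute().
 */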
/**
 * @brief Update the accumulation state
 *
 * We update the number of rows \f$ n \f$, the partial
 * sums \f$ \sum_{i=1}^n a_i \f$ and \f$ \sum_{i=1}^n a_i^2 \f$ of the squared
 * residuals \f$ a_i \f$, the matrix \f$ X^T X \f$, and the vector
 * \f$ X^T \boldsymbol a \f$.
 */
template <class Container>
inline
HeteroLinearRegressionAccumulator<Container>&
HeteroLinearRegressionAccumulator<Container>::operator<<(
    const hetero_tuple_type& inTuple) {

    const MappedColumnVector& x = std::get<0>(inTuple);
    const double& y = std::get<1>(inTuple);
    const MappedColumnVector& coef = std::get<2>(inTuple);

    if (!std::isfinite(y))
        throw std::domain_error("Dependent variables are not finite.");
    else if (!dbal::eigen_integration::isfinite(x))
        throw std::domain_error("Design matrix is not finite.");
    else if (x.size() > std::numeric_limits<uint16_t>::max())
        throw std::domain_error("Number of independent variables cannot be "
            "larger than 65535.");

    // Initialize in first iteration
    if (numRows == 0) {
        widthOfX = static_cast<uint16_t>(x.size());
        this->resize();
    }

    // dimension check
    if (widthOfX != static_cast<uint16_t>(x.size())) {
        throw std::runtime_error("Inconsistent numbers of independent "
            "variables.");
    }

    double a = y - trans(coef)*x;
    a = a*a;

    numRows++;
    a_sum += a;
    a_square_sum += a*a;
    X_transp_A.noalias() += x * a;

    // X^T X is symmetric, so it is sufficient to only fill a triangular part
    // of the matrix
    triangularView<Lower>(X_transp_X) += x * trans(x);

    return *this;
}

/**
 * @brief Merge with another accumulation state
 */
template <class Container>
template <class OtherContainer>
inline
HeteroLinearRegressionAccumulator<Container>&
HeteroLinearRegressionAccumulator<Container>::operator<<(
    const HeteroLinearRegressionAccumulator<OtherContainer>& inOther) {

    numRows += inOther.numRows;
    a_sum += inOther.a_sum;
    a_square_sum += inOther.a_square_sum;
    X_transp_A.noalias() += inOther.X_transp_A;
    triangularView<Lower>(X_transp_X) += inOther.X_transp_X;

    return *this;
}

template <class Container>
template <class OtherContainer>
inline
HeteroLinearRegressionAccumulator<Container>&
HeteroLinearRegressionAccumulator<Container>::operator=(
    const HeteroLinearRegressionAccumulator<OtherContainer>& inOther) {

    this->copy(inOther);
    return *this;
}

template <class Container>
HeteroLinearRegression::HeteroLinearRegression(
    const HeteroLinearRegressionAccumulator<Container>& inState) {

    compute(inState);
}

/**
 * @brief Transform a heteroskedasticity-test accumulation state into a result
 *
 * The result of the accumulation phase is \f$ X^T X \f$ and
 * \f$ X^T \boldsymbol a \f$. We first compute the pseudo-inverse, then the
 * coefficients of the auxiliary regression, the test statistic, and its
 * p-value.
 *
 * @sa For the mathematical description, see \ref grp_linreg.
 */
template <class Container>
inline
HeteroLinearRegression&
HeteroLinearRegression::compute(
    const HeteroLinearRegressionAccumulator<Container>& inState) {

    // The following checks were introduced with MADLIB-138. It still seems
    // useful to have clear error messages in case of infinite input values.
    if (!dbal::eigen_integration::isfinite(inState.X_transp_X) ||
        !dbal::eigen_integration::isfinite(inState.X_transp_A))
        throw std::domain_error("Design matrix is not finite.");

    SymmetricPositiveDefiniteEigenDecomposition<Matrix> decomposition(
        inState.X_transp_X, EigenvaluesOnly, ComputePseudoInverse);

    // Precompute (X^T * X)^+
    Matrix inverse_of_X_transp_X = decomposition.pseudoInverse();

    ColumnVector coef;
    coef = inverse_of_X_transp_X * inState.X_transp_A;

    // explained sum of squares (regression sum of squares)
    double ess = dot(inState.X_transp_A, coef)
        - (inState.a_sum * inState.a_sum / static_cast<double>(inState.numRows));

    // total sum of squares
    double tss = inState.a_square_sum
        - (inState.a_sum * inState.a_sum / static_cast<double>(inState.numRows));
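    // From the auxiliary regression of the squared residuals on X, the test
    // statistic formed below is
    //
    //   R^2  = ess / tss       (coefficient of determination)
    //   stat = n * R^2
    //
    // which, under the null hypothesis of homoskedasticity, is asymptotically
    // chi-squared distributed with p - 1 degrees of freedom (p = widthOfX).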
    // With infinite precision, the following checks are pointless. But due to
    // floating-point arithmetic, this need not hold at this point.
    // Without a formal proof convincing us of the contrary, we should
    // anticipate that numerical peculiarities might occur.
    if (tss < 0)
        tss = 0;
    if (ess < 0)
        ess = 0;
    // Since we know tss with greater accuracy than ess, we do the following
    // sanity adjustment to ess:
    if (ess > tss)
        ess = tss;

    // Test statistic: numRows * coefficient of determination
    test_statistic
        = static_cast<double>(inState.numRows) * (tss == 0 ? 1 : ess / tss);

    pValue = prob::cdf(complement(prob::chi_squared(
        static_cast<double>(inState.widthOfX - 1)), test_statistic));

    return *this;
}

} // namespace regress

} // namespace modules

} // namespace madlib

#endif // defined(MADLIB_MODULES_REGRESS_LINEAR_REGRESSION_IMPL_HPP)