/* ----------------------------------------------------------------------- *//** * * @file t_test.cpp * * @brief t-Test functions * *//* ----------------------------------------------------------------------- */ #include #include #include #include #include "t_test.hpp" namespace madlib { namespace modules { namespace stats { namespace { AnyType tStatsToResult(double inT, double inDegreeOfFreedom); } /** * @brief Transition state for t-Test functions * * Note: We assume that the DOUBLE PRECISION array is initialized by the * database with length 6, and all elemenets are 0. */ template class TTestTransitionState { public: TTestTransitionState(const AnyType &inArray) : mStorage(inArray.getAs()), numX(&mStorage[0]), x_sum(&mStorage[1]), correctedX_square_sum(&mStorage[2]), numY(&mStorage[3]), y_sum(&mStorage[4]), correctedY_square_sum(&mStorage[5]) { } /** * @brief Convert to backend representation * * We define this function so that we can use TransitionState in the argument * list and as a return type. */ inline operator AnyType() const { return mStorage; } private: Handle mStorage; public: typename HandleTraits::ReferenceToUInt64 numX; typename HandleTraits::ReferenceToDouble x_sum; typename HandleTraits::ReferenceToDouble correctedX_square_sum; typename HandleTraits::ReferenceToUInt64 numY; typename HandleTraits::ReferenceToDouble y_sum; typename HandleTraits::ReferenceToDouble correctedY_square_sum; }; /** * @brief Update the corrected sum of squares * * For numerical stability, we should not compute the sample variance in the * naive way. The literature has many examples where this gives bad results * even with moderately sized inputs. * * See: * * B. P. Welford (1962). "Note on a method for calculating corrected sums of * squares and products". Technometrics 4(3):419–420. * * Chan, Tony F.; Golub, Gene H.; LeVeque, Randall J. (1979), "Updating * Formulae and a Pairwise Algorithm for Computing Sample Variances.", Technical * Report STAN-CS-79-773, Department of Computer Science, Stanford University. * ftp://reports.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf */ inline void updateCorrectedSumOfSquares(double &ioLeftWeight, double &ioLeftSum, double &ioLeftCorrectedSumSquares, double inRightWeight, double inRightSum, double inRightCorrectedSumSquares) { if (inRightWeight <= 0) return; // FIXME: Use compensated sums for numerical stability // See Ogita et al., "Accurate Sum and Dot Product", SIAM Journal on // Scientific Computing (SISC), 26(6):1955-1988, 2005. if (ioLeftWeight <= 0) ioLeftCorrectedSumSquares = inRightCorrectedSumSquares; else { double diff = inRightWeight / ioLeftWeight * ioLeftSum - inRightSum; ioLeftCorrectedSumSquares += inRightCorrectedSumSquares + ioLeftWeight / (inRightWeight * (ioLeftWeight + inRightWeight)) * diff * diff; } ioLeftSum += inRightSum; ioLeftWeight += inRightWeight; } /** * @brief Perform the one-sample t-test transition step */ AnyType t_test_one_transition::run(AnyType &args) { TTestTransitionState > state = args[0]; double x = args[1].getAs(); updateCorrectedSumOfSquares( state.numX.ref(), state.x_sum.ref(), state.correctedX_square_sum.ref(), 1, x, 0); return state; } /** * @brief Perform the two-sample t-test transition step */ AnyType t_test_two_transition::run(AnyType &args) { TTestTransitionState > state = args[0]; bool firstSample = args[1].getAs(); double value = args[2].getAs(); if (firstSample) updateCorrectedSumOfSquares( state.numX.ref(), state.x_sum.ref(), state.correctedX_square_sum.ref(), 1, value, 0); else updateCorrectedSumOfSquares( state.numY.ref(), state.y_sum.ref(), state.correctedY_square_sum.ref(), 1, value, 0); return state; } /** * @brief Perform the perliminary aggregation function: Merge transition states */ AnyType t_test_merge_states::run(AnyType &args) { TTestTransitionState > stateLeft = args[0]; TTestTransitionState > stateRight = args[1]; // Merge states together and return updateCorrectedSumOfSquares( stateLeft.numX.ref(), stateLeft.x_sum.ref(), stateLeft.correctedX_square_sum.ref(), stateRight.numX.ref(), stateRight.x_sum, stateRight.correctedX_square_sum); updateCorrectedSumOfSquares( stateLeft.numY.ref(), stateLeft.y_sum.ref(), stateLeft.correctedY_square_sum.ref(), stateRight.numY.ref(), stateRight.y_sum, stateRight.correctedY_square_sum); return stateLeft; } /** * @brief Perform the one-sided t-Test final step */ AnyType t_test_one_final::run(AnyType &args) { TTestTransitionState > state = args[0]; // If we haven't seen enough data, just return Null. This is the standard // behavior of aggregate function on empty data sets (compare, e.g., // how PostgreSQL handles stddev_samp on quasi-empty inputs) if (state.numX <= 1) return Null(); const double& numX = state.numX.ref(); double degreeOfFreedom = numX - 1; double sampleVariance = state.correctedX_square_sum / degreeOfFreedom; double t = std::sqrt(numX / sampleVariance) * (state.x_sum / numX); return tStatsToResult(t, degreeOfFreedom); } /** * @brief Perform the pooled (i.e., assuming equal variances) two-sample t-Test * final step */ AnyType t_test_two_pooled_final::run(AnyType &args) { TTestTransitionState > state = args[0]; // If we haven't seen enough data, just return Null. This is the standard // behavior of aggregate function on empty data sets (compare, e.g., // how PostgreSQL handles stddev_samp on quasi-empty inputs) if (state.numX == 0 || state.numY == 0 || state.numX + state.numY <= 2) return Null(); // Formulas taken from: // http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm const double& numX = state.numX.ref(); const double& numY = state.numY.ref(); double dfEqualVar = numX + numY - 2; double diffInMeans = state.x_sum / numX - state.y_sum / numY; double sampleVariancePooled = (state.correctedX_square_sum + state.correctedY_square_sum) / dfEqualVar; double tDenomEqualVar = std::sqrt(sampleVariancePooled * (1. / numX + 1. / numY)); double tEqualVar = diffInMeans / tDenomEqualVar; return tStatsToResult(tEqualVar, dfEqualVar); } /** * @brief Perform the unpooled (i.e., assuming unequal variances) two-sample * t-Test final step */ AnyType t_test_two_unpooled_final::run(AnyType &args) { TTestTransitionState > state = args[0]; // If we haven't seen enough data, just return Null. This is the standard // behavior of aggregate function on empty data sets (compare, e.g., // how PostgreSQL handles stddev_samp on quasi-empty inputs) if (state.numX <= 1 || state.numY <= 1) return Null(); // Formulas taken from: // http://www.itl.nist.gov/div898/handbook/eda/section3/eda353.htm const double& numX = state.numX.ref(); const double& numY = state.numY.ref(); double sampleVarianceX = state.correctedX_square_sum / (numX - 1); double sampleVarianceY = state.correctedY_square_sum / (numY - 1); double sampleVarianceX_over_numX = sampleVarianceX / numX; double sampleVarianceY_over_numY = sampleVarianceY / numY; double dfUnequalVar = std::pow(sampleVarianceX_over_numX + sampleVarianceY_over_numY, 2) / ( std::pow(sampleVarianceX_over_numX, 2) / (numX - 1) + std::pow(sampleVarianceY_over_numY, 2) / (numY - 1) ); double diffInMeans = state.x_sum / numX - state.y_sum / numY; double tDenomUnequalVar = std::sqrt(sampleVarianceX / numX + sampleVarianceY / numY); double tUnequalVar = diffInMeans / tDenomUnequalVar; return tStatsToResult(tUnequalVar, dfUnequalVar); } /** * @brief Perform the F-test final step */ AnyType f_test_final::run(AnyType &args) { using boost::math::complement; TTestTransitionState > state = args[0]; // If we haven't seen enough data, just return Null. This is the standard // behavior of aggregate function on empty data sets (compare, e.g., // how PostgreSQL handles stddev_samp on quasi-empty inputs) if (state.numX <= 1 || state.numY <= 1) return Null(); // Formulas taken from: // http://www.itl.nist.gov/div898/handbook/eda/section3/eda359.htm double dfX = static_cast(state.numX - 1); double dfY = static_cast(state.numY - 1); double sampleVarianceX = state.correctedX_square_sum / dfX; double sampleVarianceY = state.correctedY_square_sum / dfY; double statistic = sampleVarianceX / sampleVarianceY; AnyType tuple; double pvalue_one_sided = prob::cdf(complement(prob::fisher_f(dfX, dfY), statistic)); double pvalue_two_sided = 2. * std::min(pvalue_one_sided, 1. - pvalue_one_sided); tuple << statistic << dfX << dfY << pvalue_one_sided << pvalue_two_sided; return tuple; } namespace { inline AnyType tStatsToResult(double inT, double inDegreeOfFreedom) { // Return t statistic, degrees of freedom, one-tailed p-value (Null // hypothesis \mu <= \mu_0), and two-tailed p-value (\mu = \mu_0). // Recall definition of p-value: The probability of observating a // value at least as extreme as the one observed, assuming that the null // hypothesis is true. using boost::math::complement; AnyType tuple; tuple << inT << inDegreeOfFreedom << prob::cdf(complement(prob::students_t(inDegreeOfFreedom), inT)) << 2. * prob::cdf(boost::math::complement( prob::students_t(inDegreeOfFreedom), std::fabs(inT))); return tuple; } } } // namespace stats } // namespace modules } // namespace madlib