/* ----------------------------------------------------------------------- *//** * * @file multilogistic.cpp * * @brief Multinomial Logistic-Regression functions * * We implement the iteratively-reweighted-least-squares method. * *//* ----------------------------------------------------------------------- */ #include #include #include #include #include #include "multilogistic.hpp" #include #include namespace madlib { // Use Eigen using namespace dbal::eigen_integration; namespace modules { // Import names from other MADlib modules using dbal::NoSolutionFoundException; namespace regress { /** * @brief Logistic function */ inline double sigma(double x) { return 1. / (1. + std::exp(-x)); } /** * @brief Inter- and intra-iteration state for iteratively-reweighted-least- * squares method for logistic regression * * TransitionState encapsualtes the transition state during the * logistic-regression aggregate function. To the database, the state is * exposed as a single DOUBLE PRECISION array, to the C++ code it is a proper * object containing scalars, a vector, and a matrix. * * Note: We assume that the DOUBLE PRECISION array is initialized by the * database with length at least 4, and all elemenets are 0. */ template class MLogRegrIRLSTransitionState { // By §14.5.3/9: "Friend declarations shall not declare partial // specializations." We do access protected members in operator+=(). template friend class MLogRegrIRLSTransitionState; public: MLogRegrIRLSTransitionState(const AnyType &inArray) : mStorage(inArray.getAs()) { rebind(static_cast(mStorage[0]),static_cast(mStorage[1])); } /** * @brief Convert to backend representation * * We define this function so that we can use State in the * argument list and as a return type. */ inline operator AnyType() const { return mStorage; } /** * @brief Initialize the iteratively-reweighted-least-squares state. * * This function is only called for the first iteration, for the first row. */ inline void initialize( const Allocator &inAllocator, uint16_t inWidthOfX, uint16_t inNumCategories, uint16_t inRefCategory) { size_t state_size = arraySize(inWidthOfX, inNumCategories); // GPDB limits the single array size to be 1GB, which means that the size // of a double array cannot be large than 134217727 because // (134217727 * 8) / (1024 * 1024) = 1023. And solve // state_size = x^2 + 2^x + 6 <= 134217727 will give x <= 11584. if(state_size > 134217727) throw std::domain_error( "The product of number of independent variables and number of " "categories cannot be larger than 11584."); mStorage = inAllocator.allocateArray(state_size); rebind(inWidthOfX, inNumCategories); widthOfX = inWidthOfX; numCategories = inNumCategories; ref_category = inRefCategory; } /** * @brief We need to support assigning the previous state */ template MLogRegrIRLSTransitionState &operator=( const MLogRegrIRLSTransitionState &inOtherState) { for (size_t i = 0; i < mStorage.size(); i++) mStorage[i] = inOtherState.mStorage[i]; return *this; } /** * @brief Merge with another State object by copying the intra-iteration * fields */ template MLogRegrIRLSTransitionState &operator+=( const MLogRegrIRLSTransitionState &inOtherState) { if (mStorage.size() != inOtherState.mStorage.size() || widthOfX != inOtherState.widthOfX) throw std::logic_error("Internal error: Incompatible transition " "states"); numRows += inOtherState.numRows; gradient += inOtherState.gradient; X_transp_AX += inOtherState.X_transp_AX; logLikelihood += inOtherState.logLikelihood; return *this; } /** * @brief Reset the inter-iteration fields. */ inline void reset() { numRows = 0; gradient.fill(0); X_transp_AX.fill(0); logLikelihood = 0; } private: static inline uint32_t arraySize(const uint16_t inWidthOfX, const uint16_t inNumCategories) { return 6 + inWidthOfX * inWidthOfX * inNumCategories * inNumCategories + 2 * inWidthOfX * inNumCategories; } void rebind(uint16_t inWidthOfX = 0, uint16_t inNumCategories = 0) { widthOfX.rebind(&mStorage[0]); numCategories.rebind(&mStorage[1]); conditionNo.rebind(&mStorage[2]); coef.rebind(&mStorage[3], inWidthOfX*inNumCategories); numRows.rebind(&mStorage[3 + inWidthOfX*inNumCategories]); gradient.rebind(&mStorage[4 + inWidthOfX*inNumCategories],inWidthOfX*inNumCategories); X_transp_AX.rebind(&mStorage[4 + 2 * inWidthOfX*inNumCategories], inNumCategories*inWidthOfX, inWidthOfX*inNumCategories); logLikelihood.rebind(&mStorage[4 + inNumCategories*inNumCategories*inWidthOfX*inWidthOfX + 2 * inWidthOfX*inNumCategories]); ref_category.rebind(&mStorage[5 + inNumCategories*inNumCategories*inWidthOfX*inWidthOfX + 2 * inWidthOfX*inNumCategories]); } Handle mStorage; public: typename HandleTraits::ReferenceToUInt16 widthOfX; typename HandleTraits::ReferenceToUInt16 numCategories; typename HandleTraits::ReferenceToDouble conditionNo; typename HandleTraits::ColumnVectorTransparentHandleMap coef; typename HandleTraits::ReferenceToUInt64 numRows; typename HandleTraits::ColumnVectorTransparentHandleMap gradient; typename HandleTraits::MatrixTransparentHandleMap X_transp_AX; typename HandleTraits::ReferenceToDouble logLikelihood; typename HandleTraits::ReferenceToUInt16 ref_category; }; /** * @brief Inter- and intra-iteration state for robust variance calculations * * TransitionState encapsualtes the transition state during the * logistic-regression robust variance calculation. To the database, the state is * exposed as a single DOUBLE PRECISION array, to the C++ code it is a proper * object containing scalars, a vector, and a matrix. * * Note: We assume that the DOUBLE PRECISION array is initialized by the * database with length at least 4, and all elemenets are 0. */ template class MLogRegrRobustTransitionState { // By §14.5.3/9: "Friend declarations shall not declare partial // specializations." We do access protected members in operator+=(). template friend class MLogRegrRobustTransitionState; public: MLogRegrRobustTransitionState(const AnyType &inArray) : mStorage(inArray.getAs()) { rebind(static_cast(mStorage[0]),static_cast(mStorage[1])); } /** * @brief Convert to backend representation * * We define this function so that we can use State in the * argument list and as a return type. */ inline operator AnyType() const { return mStorage; } /** * @brief Initialize the iteratively-reweighted-least-squares state. * * This function is only called for the first iteration, for the first row. */ inline void initialize(const Allocator &inAllocator, uint16_t inWidthOfX, uint16_t inNumCategories, uint16_t inRefCategory) { mStorage = inAllocator.allocateArray(arraySize(inWidthOfX, inNumCategories)); rebind(inWidthOfX, inNumCategories); widthOfX = inWidthOfX; numCategories = inNumCategories; ref_category = inRefCategory; } /** * @brief We need to support assigning the previous state */ template MLogRegrRobustTransitionState &operator=( const MLogRegrRobustTransitionState &inOtherState) { for (size_t i = 0; i < mStorage.size(); i++) mStorage[i] = inOtherState.mStorage[i]; return *this; } /** * @brief Merge with another State object by copying the intra-iteration * fields */ template MLogRegrRobustTransitionState &operator+=( const MLogRegrRobustTransitionState &inOtherState) { if (mStorage.size() != inOtherState.mStorage.size() || widthOfX != inOtherState.widthOfX) throw std::logic_error("Internal error: Incompatible transition " "states"); numRows += inOtherState.numRows; X_transp_AX += inOtherState.X_transp_AX; meat += inOtherState.meat; return *this; } /** * @brief Reset the inter-iteration fields. */ inline void reset() { numRows = 0; meat.fill(0); X_transp_AX.fill(0); } private: static inline uint32_t arraySize(const uint16_t inWidthOfX, const uint16_t inNumCategories) { return 4 + 2*inWidthOfX * inWidthOfX * inNumCategories * inNumCategories + inWidthOfX * inNumCategories; } /** * @brief Rebind to a new storage array * * @param inWidthOfX The number of independent variables. * @param inNumCategories The number of categories of the dependant var * Array layout (iteration refers to one aggregate-function call): * Inter-iteration components (updated in final function): * - 0: widthOfX (number of independant variables) * - 1: numCategories (number of categories) * - 2: ref_category * - 3: coef (vector of coefficients) * * Intra-iteration components (updated in transition step): * - 3 + widthOfX*numCategories: numRows (number of rows already processed in this iteration) * - 4 + widthOfX * inNumCategories: X_transp_AX (X^T A X). * - 4 + widthOfX^2*numCategories^2: meat (The meat matrix) */ void rebind(uint16_t inWidthOfX = 0, uint16_t inNumCategories = 0) { widthOfX.rebind(&mStorage[0]); numCategories.rebind(&mStorage[1]); ref_category.rebind(&mStorage[2]); coef.rebind(&mStorage[3], inWidthOfX * inNumCategories); numRows.rebind(&mStorage[3 + inWidthOfX * inNumCategories]); X_transp_AX.rebind(&mStorage[4 + inWidthOfX * inNumCategories], inNumCategories * inWidthOfX, inWidthOfX * inNumCategories); meat.rebind(&mStorage[4 + inNumCategories * inNumCategories * inWidthOfX * inWidthOfX + inWidthOfX * inNumCategories], inWidthOfX * inNumCategories, inWidthOfX * inNumCategories); } Handle mStorage; public: typename HandleTraits::ReferenceToUInt16 widthOfX; typename HandleTraits::ReferenceToUInt16 numCategories; typename HandleTraits::ReferenceToUInt16 ref_category; typename HandleTraits::ColumnVectorTransparentHandleMap coef; typename HandleTraits::ReferenceToUInt64 numRows; typename HandleTraits::MatrixTransparentHandleMap X_transp_AX; typename HandleTraits::MatrixTransparentHandleMap meat; }; /** * @brief IRLS Transition * @param args * * Arguments (Matched with PSQL wrapped) * - 0: Current State * - 1: y value (Integer) * - 2: numCategories (Integer) * - 3: ref_category (Integer) * - 4: X value (Column Vector) * - 5: Previous State */ AnyType __mlogregr_irls_step_transition::run(AnyType &args) { MLogRegrIRLSTransitionState > state = args[0]; if (args[1].isNull() || args[2].isNull() || args[3].isNull() || args[4].isNull()) { return args[0]; } // Get x as a vector of double MappedColumnVector x; try{ // an exception is raised in the backend if args[2] contains nulls MappedColumnVector xx = args[4].getAs(); // x is a const reference, we can only rebind to change its pointer x.rebind(xx.memoryHandle(), xx.size()); } catch (const ArrayWithNullException &e) { return args[0]; } // Get the category & numCategories as integer int32_t category = args[1].getAs(); // Number of categories after pivoting (we pivot around the first category) int32_t numCategories = (args[2].getAs() - 1); int32_t ref_category = args[3].getAs(); // The following check was added with MADLIB-138. if (!x.is_finite()) throw std::domain_error("Design matrix is not finite."); if (state.numRows == 0) { if (x.size() > std::numeric_limits::max()) throw std::domain_error("Number of independent variables cannot be " "larger than 65535."); if (numCategories < 1) throw std::domain_error("Number of cateogires must be at least 2"); // Init the state (requires x.size() and category.size()) state.initialize(*this, static_cast(x.size()) , static_cast(numCategories), static_cast(ref_category)); if (!args[5].isNull()) { MLogRegrIRLSTransitionState > previousState = args[5]; state = previousState; state.reset(); } } /* * This check should be done for each iteration. Only checking the first * run is not enough. */ if (category > numCategories || category < 0) throw std::domain_error("Invalid category. Categories must be integer " "values between 0 and (number of categories - 1)."); if (ref_category > numCategories || ref_category < 0) throw std::domain_error("Invalid reference category. Reference category must be integer " "value between 0 and (number of categories - 1)."); // Now do the transition step state.numRows++; /* Get y: Convert to 1/0 boolean vector Example: Category 4 : 0 0 0 1 0 0 Storing it in this forms helps us get a nice closed form expression */ //To pivot around the specified reference category ColumnVector y(numCategories); y.fill(0); if (category > ref_category) { y(category - 1) = 1; } else if (category < ref_category) { y(category) = 1; } /* Compute the parameter vector (the 'pi' vector in the documentation) for the data point being processed. Casting the coefficients into a matrix makes the calculation simple. */ Matrix coef = state.coef; coef.resize(numCategories, state.widthOfX); //Store the intermediate calculations because we'll reuse them in the LLH ColumnVector t1 = x; //t1 is vector of size state.widthOfX t1 = coef*x; /* Note: The above 2 lines could have been written as: ColumnVector t1 = -coef*x; but this creates warnings. These warnings are somehow related to the factor that x is of an immutable type. The following alternative could resolve the warnings (although not necessary the best alternative): */ ColumnVector t2 = t1.array().exp(); double t3 = 1 + t2.sum(); ColumnVector pi = t2/t3; //The gradient matrix has numCategories rows and widthOfX columns Matrix grad = -y*x.transpose() + pi*x.transpose(); //We cast the gradient into a vector to make the Newton step calculations much easier. grad.resize(numCategories*state.widthOfX,1); /* a is a matrix of size JxJ where J is the number of categories a_j1j2 = -pi(j1)*(1-pi(j2))if j1 == j2 a_j1j2 = pi(j1)*pi(j2) if j1 != j2 */ Matrix a(numCategories,numCategories); // Compute the 'a' matrix. Matrix piDiag = pi.asDiagonal(); a = pi * pi.transpose() - piDiag; state.gradient.noalias() += grad; //Start the Hessian calculations Matrix X_transp_AX(numCategories * state.widthOfX, numCategories * state.widthOfX); /* Again: The following 3 lines could have been written as Matrix XXTrans = x * x.transpose(); but it creates warnings related to the type of x. Here is an easy fix */ Matrix cv_x = x; Matrix XXTrans = trans(cv_x); XXTrans = cv_x * XXTrans; //Eigen doesn't supported outer-products for matrices, so we have to do our own. //This operation is also known as a tensor-product. for (int i1 = 0; i1 < state.widthOfX; i1++){ for (int i2 = 0; i2 (state.X_transp_AX) += X_transp_AX; state.logLikelihood += y.transpose()*t1 - log(t3); return state; } /** * @brief Perform the perliminary aggregation function: Merge transition states This merge step is the same as that of logistic regression. */ AnyType __mlogregr_irls_step_merge_states::run(AnyType &args) { MLogRegrIRLSTransitionState > stateLeft = args[0]; MLogRegrIRLSTransitionState > stateRight = args[1]; // We first handle the trivial case where this function is called with one // of the states being the initial state if (stateLeft.numRows == 0) return stateRight; else if (stateRight.numRows == 0) return stateLeft; // Merge states together and return stateLeft += stateRight; return stateLeft; } /** * @brief Perform the logistic-regression final step */ AnyType __mlogregr_irls_step_final::run(AnyType &args) { // We request a mutable object. // Depending on the backend, this might perform a deep copy. MLogRegrIRLSTransitionState > state = args[0]; // Aggregates that haven't seen any data just return Null. if (state.numRows == 0) return Null(); // See MADLIB-138. At least on certain platforms and with certain versions, // LAPACK will run into an infinite loop if pinv() is called for non-finite // matrices. We extend the check also to the dependent variables. if (!state.X_transp_AX.is_finite() || !state.gradient.is_finite()) throw NoSolutionFoundException("Over- or underflow in intermediate " "calculation. Input data is likely of poor numerical condition."); SymmetricPositiveDefiniteEigenDecomposition decomposition( -1 * state.X_transp_AX, EigenvaluesOnly, ComputePseudoInverse); // Precompute (X^T * A * X)^-1 Matrix hessianInv = -1 * decomposition.pseudoInverse(); state.coef.noalias() += hessianInv * state.gradient; if(!state.coef.is_finite()) throw NoSolutionFoundException("Over- or underflow in Newton step, " "while updating coefficients. Input data is likely of poor " "numerical condition."); // We use the intra-iteration field gradient for storing the diagonal // of X^T A X, so that we don't have to recompute it in the result function. // Likewise, we store the condition number. // FIXME: This feels a bit like a hack. state.conditionNo = decomposition.conditionNo(); state.X_transp_AX = -1 * hessianInv; return state; } /** * @brief Compute the diagnostic statistics * * This function wraps the common parts of mlogregr state into the result. * (the result data type is defined in multilogistic.sql_in) */ AnyType mLogstateToResult( const Allocator &inAllocator, MLogRegrIRLSTransitionState > state) { int ref_category = state.ref_category; const HandleMap > &inCoef = state.coef; double logLikelihood = state.logLikelihood; uint64_t num_processed = state.numRows; // Per the hack at the end of the final function we place the inverse // of the X_tranp_AX into the state.X_transp_AX const Matrix & X_transp_AX_inverse = state.X_transp_AX; const ColumnVector &diagonal_of_hessian = X_transp_AX_inverse.diagonal(); MutableNativeColumnVector stdErr( inAllocator.allocateArray(inCoef.size())); MutableNativeColumnVector waldZStats( inAllocator.allocateArray(inCoef.size())); MutableNativeColumnVector waldPValues( inAllocator.allocateArray(inCoef.size())); MutableNativeColumnVector oddsRatios( inAllocator.allocateArray(inCoef.size())); for (Index i = 0; i < inCoef.size(); ++i) { stdErr(i) = std::sqrt(diagonal_of_hessian(i)); waldZStats(i) = inCoef(i) / stdErr(i); waldPValues(i) = 2. * prob::cdf( prob::normal(), -std::abs(waldZStats(i))); oddsRatios(i) = std::exp( inCoef(i) ); } int num_iterations = 0; // Return all coefficients, standard errors, etc. in a tuple AnyType tuple; tuple << ref_category << inCoef << logLikelihood << stdErr << waldZStats << waldPValues << oddsRatios << static_cast(state.conditionNo) << num_iterations << num_processed; return tuple; } /** * @brief Return the difference in log-likelihood between two states */ AnyType __internal_mlogregr_irls_step_distance::run(AnyType &args) { MLogRegrIRLSTransitionState > stateLeft = args[0]; MLogRegrIRLSTransitionState > stateRight = args[1]; return std::abs(stateLeft.logLikelihood - stateRight.logLikelihood); } /** * @brief Return the coefficients and diagnostic statistics of the state */ AnyType __internal_mlogregr_irls_result::run(AnyType &args) { MLogRegrIRLSTransitionState > state = args[0]; return mLogstateToResult(*this, state); // state.ref_category, state.coef, // state.gradient, state.logLikelihood, state.X_transp_AX(0,0)); } /** * @brief Return the coefficients and diagnostic statistics of the state */ AnyType __internal_mlogregr_summary_results::run(AnyType &args) { MLogRegrIRLSTransitionState > state = args[0]; const Matrix & X_transp_AX_inverse = state.X_transp_AX; // X_transp_AX is actually it's inverse - this is a hack added at the end of // the final function AnyType tuple; Matrix coef = state.coef; coef.resize(state.numCategories, state.widthOfX); coef.transposeInPlace(); tuple << coef << X_transp_AX_inverse; return tuple; } // ----------------- End of Multinomial Logistic Regression -------------------- // --------------------------------------------------------------------------- // Robust Variance Multi-Logistic // --------------------------------------------------------------------------- AnyType mlogregr_robust_step_transition::run(AnyType &args) { using std::endl; MLogRegrRobustTransitionState > state = args[0]; if (args[1].isNull() || args[2].isNull() || args[3].isNull() || args[4].isNull() || args[5].isNull()) { return args[0]; } // Get x as a vector of double MappedColumnVector x; try{ // an exception is raised in the backend if args[2] contains nulls MappedColumnVector xx = args[4].getAs(); // x is a const reference, we can only rebind to change its pointer x.rebind(xx.memoryHandle(), xx.size()); } catch (const ArrayWithNullException &e) { return args[0]; } // Get the category & numCategories as integer int16_t category = static_cast(args[1].getAs()); // Number of categories after pivoting (We pivot around the first category) int16_t numCategories = static_cast(args[2].getAs() - 1); int32_t ref_category = args[3].getAs(); MappedMatrix coefMat = args[5].getAs(); // The following check was added with MADLIB-138. if (!x.is_finite()) throw std::domain_error("Design matrix is not finite."); if (state.numRows == 0) { if (x.size() > std::numeric_limits::max()) throw std::domain_error("Number of independent variables cannot be " "larger than 65535."); if (numCategories < 1) throw std::domain_error("Number of cateogires must be at least 2"); if (category > numCategories) throw std::domain_error("You have entered a category > numCategories" "Categories must be of values {0,1... numCategories-1}"); // Init the state (requires x.size() and category.size()) state.initialize(*this, static_cast(x.size()) , static_cast(numCategories), static_cast(ref_category)); Matrix mat = coefMat; mat.transposeInPlace(); mat.resize(coefMat.size(), 1); state.coef = mat; } // Now do the transition step state.numRows++; /* Get y: Convert to 1/0 boolean vector Example: Category 4 : 0 0 0 1 0 0 Storing it in this forms helps us get a nice closed form expression */ //To pivot around the specified reference category ColumnVector y(numCategories); y.fill(0); if (category > ref_category) { y(category - 1) = 1; } else if (category < ref_category) { y(category) = 1; } /* Compute the parameter vector (the 'pi' vector in the documentation) for the data point being processed. Casting the coefficients into a matrix makes the calculation simple. */ Matrix coef = state.coef; coef.resize(numCategories, state.widthOfX); //Store the intermediate calculations because we'll reuse them in the LLH ColumnVector t1 = x; //t1 is vector of size state.widthOfX t1 = coef*x; /* Note: The above 2 lines could have been written as: ColumnVector t1 = -coef*x; but this creates warnings. These warnings are somehow related to the factor that x is an immutable type. */ ColumnVector t2 = t1.array().exp(); double t3 = 1 + t2.sum(); ColumnVector pi = t2/t3; //The gradient matrix has numCategories rows and widthOfX columns Matrix grad = -y * x.transpose() + pi * x.transpose(); //We cast the gradient into a vector to make the math easier. grad.resize(numCategories * state.widthOfX, 1); Matrix GradGradTranspose; GradGradTranspose = grad * grad.transpose(); state.meat += GradGradTranspose; /* a is a matrix of size JxJ where J is the number of categories a_j1j2 = -pi(j1)*(1-pi(j2))if j1 == j2 a_j1j2 = pi(j1)*pi(j2) if j1 != j2 */ // Compute the 'a' matrix. Matrix a(numCategories,numCategories); Matrix piDiag = pi.asDiagonal(); a = pi * pi.transpose() - piDiag; //Start the Hessian calculations Matrix X_transp_AX(numCategories * state.widthOfX, numCategories * state.widthOfX); /* Again: The following 3 lines could have been written as Matrix XXTrans = x * x.transpose(); but it creates warnings related to the type of x. Here is an easy fix */ Matrix cv_x = x; Matrix XXTrans = trans(cv_x); XXTrans = cv_x * XXTrans; //Eigen doesn't supported outer-products for matrices, so we have to do our own. //This operation is also known as a tensor-product. for (int i1 = 0; i1 < state.widthOfX; i1++){ for (int i2 = 0; i2 (state.X_transp_AX) += X_transp_AX; return state; } /** * @brief Perform the perliminary aggregation function: Merge transition states This merge step is the same as that of logistic regression. */ AnyType mlogregr_robust_step_merge_states::run(AnyType &args) { MLogRegrRobustTransitionState > stateLeft = args[0]; MLogRegrRobustTransitionState > stateRight = args[1]; // We first handle the trivial case where this function is called with one // of the states being the initial state if (stateLeft.numRows == 0) return stateRight; else if (stateRight.numRows == 0) return stateLeft; // Merge states together and return stateLeft += stateRight; return stateLeft; } AnyType MLrobuststateToResult( const Allocator &inAllocator, int ref_category, const HandleMap >& inCoef, const ColumnVector &diagonal_of_varianceMat) { MutableNativeColumnVector variance( inAllocator.allocateArray(inCoef.size())); MutableNativeColumnVector stdErr( inAllocator.allocateArray(inCoef.size())); MutableNativeColumnVector waldZStats( inAllocator.allocateArray(inCoef.size())); MutableNativeColumnVector waldPValues( inAllocator.allocateArray(inCoef.size())); for (Index i = 0; i < inCoef.size(); ++i) { variance(i) = diagonal_of_varianceMat(i); stdErr(i) = std::sqrt(diagonal_of_varianceMat(i)); waldZStats(i) = inCoef(i) / stdErr(i); waldPValues(i) = 2. * prob::cdf( prob::normal(), -std::abs(waldZStats(i))); } // Return all coefficients, standard errors, etc. in a tuple AnyType tuple; tuple << ref_category << inCoef << stdErr << waldZStats << waldPValues; return tuple; } /** * @brief Perform the logistic-regression final step */ AnyType mlogregr_robust_step_final::run(AnyType &args) { // We request a mutable object. Depending on the backend, this might perform // a deep copy. MLogRegrRobustTransitionState > state = args[0]; // Aggregates that haven't seen any data just return Null. if (state.numRows == 0) return Null(); // See MADLIB-138. At least on certain platforms and with certain versions, // LAPACK will run into an infinite loop if pinv() is called for non-finite // matrices. We extend the check also to the dependent variables. if (!state.X_transp_AX.is_finite()) throw NoSolutionFoundException("Over- or underflow in intermediate " "calculation. Input data is likely of poor numerical condition."); SymmetricPositiveDefiniteEigenDecomposition decomposition( -1 * state.X_transp_AX, EigenvaluesOnly, ComputePseudoInverse); // Precompute (X^T * A * X)^-1 Matrix bread = decomposition.pseudoInverse(); Matrix varianceMat; varianceMat = bread * state.meat * bread; if(!state.coef.is_finite()) throw NoSolutionFoundException("Over- or underflow in Newton step, " "while updating coefficients. Input data is likely of poor " "numerical condition."); return MLrobuststateToResult(*this, state.ref_category, state.coef, varianceMat.diagonal()); } // ------------------------ End of Robust Variance ----------------------------- AnyType __sub_array::run(AnyType &args) { if (args[0].isNull() || args[1].isNull()) return Null(); ArrayHandle value = args[0].getAs >(); ArrayHandle index = args[1].getAs >(); for (size_t i = 0; i < index.size(); i++) { if (index[i] < 1 || index[i] > static_cast(value.size())) throw std::domain_error("Invalid indices - out of bound"); } MutableArrayHandle res = allocateArray(index.size()); for (size_t i = 0; i < index.size(); i++) res[i] = value[index[i] - 1]; return res; } // ---------------------------------------------------------------------------- typedef struct __sr_ctx{ const double * inarray; int32_t maxcall; int32_t num_feature; int32_t num_category; int32_t ref_category; int32_t curcall; } sr_ctx; void * __mlogregr_format::SRF_init(AnyType &args) { sr_ctx *ctx = new sr_ctx; ctx->curcall = 0; MutableArrayHandle inarray = NULL; try{ inarray = args[0].getAs >(); } catch (const ArrayWithNullException &e) { ctx->maxcall = 0; return ctx; } int32_t num_feature = args[1].getAs(); int32_t num_category = args[2].getAs(); int32_t ref_category = args[3].getAs(); ctx->inarray = inarray.ptr(); ctx->maxcall = num_category - 1; ctx->num_category = num_category - 1; ctx->num_feature = num_feature; ctx->ref_category = ref_category; if (num_feature * (num_category - 1) != static_cast(inarray.size())) { throw std::runtime_error("num_feature * (num_category - 1) != " "inarray.size()"); } if (ref_category >= num_category){ throw std::runtime_error("ref_category >= num_category"); } return ctx; } AnyType __mlogregr_format::SRF_next(void * user_fctx, bool * is_last_call) { sr_ctx * ctx = (sr_ctx *) user_fctx; if (ctx->curcall >= ctx->maxcall) { *is_last_call = true; return Null(); } MutableArrayHandle outarray = allocateArray(ctx->num_feature); for(int i = 0; i < ctx->num_feature; i++) { outarray[i] = ctx->inarray[i * ctx->num_category + ctx->curcall]; } AnyType tuple; tuple << (ctx->curcall < ctx->ref_category ? ctx->curcall : ctx->curcall + 1) << outarray; ctx->curcall++; return tuple; } /* ------------------------------------------------------------ */ AnyType mlogregr_predict_prob::run(AnyType &args) { // dimension: N x (L - 1), where L is the number of categories MappedMatrix coef = args[0].getAs(); int ref_category = args[1].getAs(); MappedColumnVector x; try { MappedColumnVector xx = args[2].getAs(); x.rebind(xx.memoryHandle(), xx.size()); } catch (const ArrayWithNullException &e) { return Null(); } int L = static_cast(coef.cols()) + 1; // number of categories ColumnVector res(L); int cat_index = 0; for (int i = 0; i < L; i++) { if (i == ref_category) { res(i) = 1; } else { res(i) = exp(coef.col(cat_index).dot(x)); cat_index++; } } res /= res.sum(); return res; } /* ------------------------------------------------------------ */ AnyType mlogregr_predict_response::run(AnyType &args) { // dimension: N x (L - 1), where L is the number of categories MappedMatrix coef = args[0].getAs(); int ref_category = args[1].getAs(); MappedColumnVector x; try { MappedColumnVector xx = args[2].getAs(); x.rebind(xx.memoryHandle(), xx.size()); } catch (const ArrayWithNullException &e) { return Null(); } ColumnVector linear_predictors(x.transpose() * coef); Index max_cat = 0; linear_predictors.maxCoeff(&max_cat); double max_lp = linear_predictors(max_cat); if (exp(max_lp) < 1){ // no category has high enough probability as reference category return ref_category; } else if (max_cat < ref_category){ return static_cast(max_cat); } else{ // since ref_category is not present in the coef matrix, index of // categories after ref_category is 1 less than the actual index return static_cast(max_cat + 1u); } } } // namespace regress } // namespace modules } // namespace madlib