/* ----------------------------------------------------------------------- *//** * * @file svd.cpp * * @brief Functions for Singular Value Decomposition * * @date Jul 10, 2013 *//* ----------------------------------------------------------------------- */ #include #include #include #include #include #include #include "svd.hpp" #include namespace madlib { using namespace dbal::eigen_integration;// Use Eigen namespace modules { namespace linalg { using madlib::dbconnector::postgres::madlib_construct_array; // To get a rank-k approximation of the original matrix if we perform k + s Lanczos // bidiagonalization steps followed by the SVD of a small matrix B(k+s) then the // algorithm constructs the best rank-k subspace in an extended subspace // Span[U(k+s)]. Hence we obtain a better rank-k approximation than the one // obtained after k steps steps of the standard Lanczos bidiagonalization algorithm. // There is a memory limit to the number of extended steps and we restrict that to // fixed number of steps for now. // Magic number computed using the 1GB memory limit. // MAX_LANCZOS_STEPS^2 < 10^9 bytes / (8 bytes * 3 matrices) const size_t MAX_LANCZOS_STEPS = 5000; // For floating point equality comparisons, // it is safer to define a small range of values // that are "zero", rather than use the exact value of 0. const double ZERO_THRESHOLD = 1e-8; /* Project the vector v into the vector u (in-place) */ static void __project(MappedColumnVector& u, MutableNativeColumnVector& v){ double uu = u.dot(u); double uv = u.dot(v); double coef; if (uu <= ZERO_THRESHOLD) { // if u is the zero vector, we have a division by zero problem coef = 0; } else coef = uv / uu; v = coef * u; } /** * @brief This function returns a random normalized unit vector of specified size * @param args[0] The dimension * @return The unit-norm vector **/ AnyType svd_unit_vector::run(AnyType & args) { int32_t dim = args[0].getAs(); if(dim < 1){ throw std::invalid_argument( "invalid argument - Positive integer expected for dimension"); } MutableNativeColumnVector vectorEigen; Allocator& allocator = defaultAllocator(); vectorEigen.rebind(allocator.allocateArray(dim)); vectorEigen.setRandom(); vectorEigen = vectorEigen.normalized(); // AnyType tuple; // tuple << vectorEigen; // return tuple; return vectorEigen; } /** * @brief This function is the transition function of the aggregator computing the Lanczos vectors * @param args[0] State variable (i.e. A * q_j OR A_trans * p_(j-1)) * @param args[1] Matrix row id * @param args[2] Matrix row array * @param args[3] Previous P/Q vector * @param args[4] Row/Column dimension **/ AnyType svd_lanczos_sfunc::run(AnyType & args){ int32_t row_id = args[1].getAs(); MappedColumnVector row_array = args[2].getAs(); MappedColumnVector vec = args[3].getAs(); int32_t dim = args[4].getAs(); if(dim < 1){ throw std::invalid_argument( "invalid argument - Positive integer expected for dimension"); } if(row_id <= 0 || row_id > dim){ throw std::invalid_argument( "invalid argument: row_id is out of range [1, dim]"); } if(row_array.size() != vec.size()){ throw std::invalid_argument( "dimensions mismatch: row_array.size() != vec.size(). " "Data contains different sized arrays"); } // FIXME: construct_array functions circumvent the abstraction layer. These // should be replaced with appropriate Allocator:: calls. MutableArrayHandle state(NULL); if(args[0].isNull()){ state = MutableArrayHandle( madlib_construct_array( NULL, dim, FLOAT8OID, sizeof(double), true, 'd')); for (int i = 0; i < dim; i++) state[i] = 0; }else{ state = args[0].getAs >(); } state[row_id - 1] = row_array.dot(vec); return state; } /** * @brief This function is the merge function of the aggregator computing the Lanczos vectors * @param args[0] State variable 1 * @param args[1] State variable 2 **/ AnyType svd_lanczos_prefunc::run(AnyType & args){ MutableArrayHandle state1 = args[0].getAs >(); ArrayHandle state2 = args[1].getAs >(); if(state1.size() != state2.size()){ throw std::runtime_error("dimension mismatch: state1.size() != state2.size()"); } for(size_t i = 0; i < state1.size(); i++) state1[i] += state2[i]; return state1; } /** * @breif This function completes the computation of Lanczoc P vector * @param args[0] Partial P vector from the aggregator * @param args[1] Previous P vector * @param args[2] Previous beta **/ AnyType svd_lanczos_pvec::run(AnyType & args){ MutableNativeColumnVector partial_pvec = args[0].getAs(); // When args[1] is NULL, it's special case for computing p_1 if (!args[1].isNull()){ MappedColumnVector prev_pvec = args[1].getAs< MappedColumnVector >(); double beta = args[2].getAs(); if(partial_pvec.size() != prev_pvec.size()){ throw std::invalid_argument( "dimension mismatch: partial_pvec.size() != prev_pvec.size()"); } partial_pvec = partial_pvec - beta * prev_pvec; } double norm = partial_pvec.norm(); partial_pvec.normalize(); AnyType tuple; tuple << norm << partial_pvec; return tuple; } /** * @breif This function completes the computation of Lanczoc Q vector * @param args[0] Partial Q vector from the aggregator * @param args[1] Previous Q vector * @param args[2] Current alpha **/ AnyType svd_lanczos_qvec::run(AnyType & args){ MutableNativeColumnVector partial_qvec = args[0].getAs(); MappedColumnVector prev_qvec = args[1].getAs< MappedColumnVector >(); double alpha = args[2].getAs(); if(partial_qvec.size() != prev_qvec.size()){ throw std::invalid_argument( "dimension mismatch: partial_qvec.size() != prev_qvec.size()"); } partial_qvec = partial_qvec - alpha * prev_qvec; // Different with svd_lanczos_pvec, the Q vector will be furhter orthogonalized // and then be normalized in a separate function return partial_qvec; } /** * @brief This function is the transition function of the aggregaror doing the Gram-Schmidt orthogonalization * @param args[0] State variable: sum of projected vectors | vector v * @param args[1] Unorthogonalized vector (v) * @param args[2] Orthogonalized vector (u) **/ AnyType svd_gram_schmidt_orthogonalize_sfunc::run(AnyType & args){ MutableNativeColumnVector v = args[1].getAs(); MappedColumnVector u = args[2].getAs(); if(u.size() != v.size()){ throw std::invalid_argument( "dimensions mismatch: u.size() != v.size()"); } // FIXME: construct_array functions circumvent the abstraction layer. These // should be replaced with appropriate Allocator:: calls. MutableArrayHandle state(NULL); if(args[0].isNull()){ state = MutableArrayHandle( madlib_construct_array(NULL, static_cast(u.size()) * 2, FLOAT8OID, sizeof(double), true, 'd')); // Save v into the state variable memcpy(state.ptr() + u.size(), v.data(), v.size() * sizeof(double)); }else{ state = args[0].getAs >(); } // In-place projection __project(u, v); for(int i = 0; i < u.size(); i++){ state[i] += v[i]; } return state; } /** * @brief This function is the merge function of the aggregator doing the Gram-Schmidt orthogonalization * @param args[0] State variable 1 * @param args[1] State variable 2 **/ AnyType svd_gram_schmidt_orthogonalize_prefunc::run(AnyType & args){ MutableArrayHandle state1 = args[0].getAs >(); ArrayHandle state2 = args[1].getAs >(); if(state1.size() != state2.size()){ throw std::runtime_error("dimension mismatch: state1.size() != state2.size()"); } // Note that the second half of the state variable stores the vector v for(size_t i = 0; i < state1.size() / 2; i++) state1[i] += state2[i]; return state1; } /** * @brief This function is the final function of the aggregator doing the Gram-Schmidt orthogonalization * @param args[0] State variable **/ AnyType svd_gram_schmidt_orthogonalize_ffunc::run(AnyType & args){ ArrayHandle state = args[0].getAs >(); MutableNativeColumnVector u; Allocator& allocator = defaultAllocator(); u.rebind(allocator.allocateArray(state.size() / 2)); for(int i = 0; i < u.size(); i++){ u[i] = state[u.size() + i] - state[i]; } double norm = u.norm(); u.normalize(); AnyType tuple; tuple << norm << u; return tuple; } // -- SVD Decomposition for Bidiagonal Matrix -------------------------------- /** * @brief This function is the transition function of the aggregator computing the SVD * of a sparse bidiagonal matrix * @param args[0] State variable (in-memory dense matrix) * @param args[1] Dimension of the matrix (i.e. k) * @param args[2] Row ID (i.e. row_id) * @param args[3] Column ID (i.e. col_id) * @param args[4] Value **/ AnyType svd_decompose_bidiagonal_sfunc::run(AnyType & args){ if(args[1].isNull() || args[2].isNull() || args[3].isNull() || args[4].isNull()) return args[0]; int32_t k = args[1].getAs(); int32_t row_id = args[2].getAs(); int32_t col_id = args[3].getAs(); double value = args[4].getAs(); if(k < 0){ throw std::invalid_argument( "SVD error: k should be a positive integer"); } if(k > (int)MAX_LANCZOS_STEPS){ throw std::invalid_argument( "SVD error: k is too large, try with a value in the range of [1, 6000]"); } if(row_id <= 0 || row_id > k){ throw std::invalid_argument( "SVD error: row_id should be in the range of [1, k]"); } if(col_id <= 0 || col_id > k){ throw std::invalid_argument( "invalid parameter: col_id should be in the range of [1, k]"); } // FIXME: construct_array functions circumvent the abstraction layer. These // should be replaced with appropriate Allocator:: calls. MutableArrayHandle state(NULL); if(args[0].isNull()){ state = MutableArrayHandle( madlib_construct_array(NULL, k * k, FLOAT8OID, sizeof(double), true, 'd')); } else { state = args[0].getAs >(); } state[(row_id - 1) * k + col_id - 1] = value; return state; } /** * @brief This function is the merge function of the aggregator computing the SVD * of a sparse bidiagonal matrix * @param args[0] State variable 1 * @param args[1] State varaible 2 **/ AnyType svd_decompose_bidiagonal_prefunc::run(AnyType & args){ MutableArrayHandle state1 = args[0].getAs >(); ArrayHandle state2 = args[1].getAs >(); if(state1.size() != state2.size()){ throw std::runtime_error("dimension mismatch: state1.size() != state2.size()"); } for(size_t i = 0; i < state1.size(); i++){ state1[i] += state2[i]; } return state1; } /** * @brief Take the final matrix and run it by Eigen JacobiSVD to get the left and right decompositions along with the eigen values **/ AnyType svd_decompose_bidiagonal_ffunc::run(AnyType & args){ MappedColumnVector state = args[0].getAs(); size_t k = static_cast(sqrt(static_cast(state.size()))); // Note that Eigen Matrix deserializes the vector in the column order // Thus transpose() is needed after resize() Matrix b = state; b.resize(k, k); b.transposeInPlace(); Eigen::JacobiSVD svd(b, Eigen::ComputeThinU | Eigen::ComputeThinV); // Note that AnyType serializes the Matrix Object in the column order // Thus transpose() is needed before output Matrix u = svd.matrixU().transpose(); Matrix v = svd.matrixV().transpose(); Matrix s = svd.singularValues(); AnyType tuple; tuple << u << v << s; return tuple; } AnyType svd_decompose_bidiag::run(AnyType & args){ // triple indicate the values of a bidiagonal matrix ArrayHandle row_id = args[0].getAs >(); ArrayHandle col_id = args[1].getAs >(); MappedColumnVector value = args[2].getAs(); // since row_id, col_id start indexing from 1, the max element indicates the // dimension of of the bidiagonal matrix int32_t row_dim = *std::max_element(row_id.ptr(), row_id.ptr() + row_id.size()); int32_t col_dim = *std::max_element(col_id.ptr(), col_id.ptr() + col_id.size()); Matrix b = Matrix::Zero(row_dim, col_dim); for(size_t i = 0; i < row_id.size(); i++){ // we use -1 since row_id and col_id start from 1 b(row_id[i] - 1, col_id[i] - 1) = value[i]; } Eigen::JacobiSVD svd(b, Eigen::ComputeThinU | Eigen::ComputeThinV); // Note that AnyType serializes the Matrix Object in the column order // Thus transpose() is needed before output Matrix u = svd.matrixU().transpose(); Matrix v = svd.matrixV().transpose(); Matrix s = svd.singularValues(); AnyType tuple; tuple << u << v << s; return tuple; } /** * @brief This function is the transition function of the aggregator computing the Lanczos vectors * @param args[0] State variable (i.e. A * q_j OR A_trans * p_(j-1)) * @param args[1] Matrix block row id * @param args[2] Matrix block col id * @param args[3] Matrix block * @param args[4] Previous P/Q vector * @param args[5] Row/Column dimension **/ AnyType svd_block_lanczos_sfunc::run(AnyType & args){ int32_t row_id = args[1].getAs(); int32_t col_id = args[2].getAs(); MappedMatrix block = args[3].getAs(); MappedColumnVector vec = args[4].getAs(); int32_t dim = args[5].getAs(); if(row_id <= 0){ throw std::invalid_argument( "SVD error: row_id should be in the range of [1, dim]"); } if(col_id <= 0){ throw std::invalid_argument( "invalid parameter: col_id should be in the range of [1, dim]"); } // FIXME: construct_array functions circumvent the abstraction layer. These // should be replaced with appropriate Allocator:: calls. MutableArrayHandle state(NULL); if(args[0].isNull()){ state = MutableArrayHandle( madlib_construct_array( NULL, dim, FLOAT8OID, sizeof(double), true, 'd')); }else{ state = args[0].getAs >(); } // Note that block is constructed in the column-major order size_t row_size = block.cols(); size_t col_size = block.rows(); Matrix v = block.transpose() * vec.segment((col_id - 1) * col_size, col_size); for(int32_t i = 0; i < v.rows(); i++) state[(row_id - 1) * row_size + i] += v.col(0)[i]; return state; } /** * @brief This function is the transition function of the aggregator computing the Lanczos vectors * @param args[0] State variable (i.e. A * q_j OR A_trans * p_(j-1)) * @param args[1] Row ID * @param args[2] Column ID * @param args[3] Value * @param args[4] Previous P/Q vector * @param args[5] Row/Column dimension **/ AnyType svd_sparse_lanczos_sfunc::run(AnyType & args){ int32_t row_id = args[1].getAs(); int32_t col_id = args[2].getAs(); double value = args[3].getAs(); MappedColumnVector vec = args[4].getAs(); int32_t dim = args[5].getAs(); // FIXME: construct_array functions circumvent the abstraction layer. These // should be replaced with appropriate Allocator:: calls. MutableArrayHandle state(NULL); if(args[0].isNull()){ state = MutableArrayHandle( madlib_construct_array( NULL, dim, FLOAT8OID, sizeof(double), true, 'd')); }else{ state = args[0].getAs >(); } state[row_id - 1] += value * vec[col_id - 1]; return state; } /* * @brief In-memory multiplication of a vector with a matrix * @param vec a 1 x r vector * @param mat a r x n matrix * @param k a positive number < n * @note first cut mat to r x k, then return vec * mat */ AnyType svd_vec_mult_matrix::run(AnyType & args){ MappedColumnVector vec = args[0].getAs(); MappedMatrix mat = args[1].getAs(); int32_t k = args[2].getAs(); // Any integer is ok if(k <= 0 || k > mat.rows()){ k = static_cast(mat.rows()); } // Note mat is constructed in the column-first order // which means that mat is actually transposed if(vec.size() != mat.cols()){ throw std::invalid_argument( "dimensions mismatch: vec.size() != matrix.rows()"); }; // trans(vec) * trans(mat) = mat * vec Matrix r = mat.topRows(k) * vec; ColumnVector v = r.col(0); return v; } typedef struct __sr_ctx{ ColumnVector vec; Matrix mat; int32_t max_call; int32_t cur_call; int32_t row_id; int32_t k; } sr_ctx; /** * @param arg[0] Column vector * @param arg[1] Matrix (l x l) * @param args[2] Column ID # @param arg[3] k (Sub-matrix: l * k) **/ void * svd_vec_trans_mult_matrix::SRF_init(AnyType &args){ sr_ctx * ctx = new sr_ctx; ctx->vec = args[0].getAs(); ctx->mat = args[1].getAs().transpose(); ctx->row_id = args[2].getAs(); ctx->k = args[3].getAs(); if(ctx->row_id <= 0 || ctx->row_id > ctx->mat.rows()){ elog(ERROR, "invalid parameter - row_id should be in the range of [1, mat.rows()]"); } if(ctx->k > ctx->mat.cols()){ elog(ERROR, "invalid parameter - k should be in the range of [0, mat.cols()]"); } ctx->max_call = static_cast(ctx->vec.size()); ctx->cur_call = 0; return ctx; } AnyType svd_vec_trans_mult_matrix::SRF_next(void *user_fctx, bool *is_last_call){ sr_ctx * ctx = (sr_ctx *) user_fctx; if (ctx->max_call == 0) { *is_last_call = true; return Null(); } ColumnVector res = ctx->vec[ctx->cur_call] * ctx->mat.row(ctx->row_id - 1).segment(0, ctx->k); AnyType tuple; tuple << ctx->cur_call << res; ctx->cur_call++; ctx->max_call--; *is_last_call = false; return tuple; } } //namespace linalg } // namespace modules } //namespace madlib