/* ----------------------------------------------------------------------- *//** * * @file metric.cpp * * @brief Metric operations * *//* ----------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include #include #include "metric.hpp" using std::string; namespace madlib { using namespace dbconnector::postgres; namespace modules { namespace linalg { namespace { template struct ReverseLexicographicComparator { /** * @brief Return true if the first argument is less than the second */ bool operator()(const TupleType& inTuple1, const TupleType& inTuple2) { // This could be a real reverse lexicographic comparator in C++11, but // lacking variadic template arguments, we simply pretend here that // all tuples contain only 2 elements. return std::get<1>(inTuple1) < std::get<1>(inTuple2) || (std::get<1>(inTuple1) == std::get<1>(inTuple2) && std::get<0>(inTuple1) < std::get<0>(inTuple2)); } }; } // anonymous namespace /** * @brief Compute the k columns of a matrix that are closest to a vector * * @tparam DistanceFunction Type of the distance function. This can be a * function pointer, or a class with method operator() (like * \c FunctionHandle) * @tparam NumClosestColumns The number of closest columns to compute. We assume * that \c outIndicesAndDistances is of size \c NumClosestColumns. * * @param inMatrix Matrix \f$ M \f$ * @param inVector Vector \f$ \vec x \f$ * @param[out] outClosestColumns A list of \c NumClosestColumns pairs * \f$ (i, d) \f$ sorted in ascending order of \f$ d \f$, where $i$ is a * 0-based column index in \f$ M \f$ and * \f$ d \f$ is the distance (using \c inMetric) between \f$ M_i \f$ and * \f$ x \f$. */ template void closestColumnsAndDistances( const MappedMatrix& inMatrix, const MappedColumnVector& inVector, DistanceFunction& inMetric, RandomAccessIterator ioFirst, RandomAccessIterator ioLast) { ReverseLexicographicComparator< typename std::iterator_traits::value_type> comparator; std::fill(ioFirst, ioLast, std::make_tuple(0, std::numeric_limits::infinity())); for (Index i = 0; i < inMatrix.cols(); ++i) { double currentDist; currentDist = AnyType_cast( inMetric(MappedColumnVector(inMatrix.col(i)), inVector) ); // outIndicesAndDistances is a heap, so the first element is maximal if (currentDist < std::get<1>(*ioFirst)) { // Unfortunately, the STL does not have a decrease-key function, // so we are wasting a bit of performance here std::pop_heap(ioFirst, ioLast, comparator); *(ioLast - 1) = std::make_tuple(i, currentDist); std::push_heap(ioFirst, ioLast, comparator); } } std::sort_heap(ioFirst, ioLast, comparator); } template void closestColumnsAndDistancesUDF( const MappedMatrix& inMatrix, const MappedColumnVector& inVector, RandomAccessIterator ioFirst, RandomAccessIterator ioLast, Oid oid) { ReverseLexicographicComparator< typename std::iterator_traits::value_type> comparator; std::fill(ioFirst, ioLast, std::make_tuple(0, std::numeric_limits::infinity())); for (Index i = 0; i < inMatrix.cols(); ++i) { double currentDist; currentDist = static_cast(DatumGetFloat8(OidFunctionCall2( oid, PointerGetDatum(VectorToNativeArray(inMatrix.col(i))), PointerGetDatum(VectorToNativeArray(inVector)) ))); // outIndicesAndDistances is a heap, so the first element is maximal if (currentDist < std::get<1>(*ioFirst)) { // Unfortunately, the STL does not have a decrease-key function, // so we are wasting a bit of performance here std::pop_heap(ioFirst, ioLast, comparator); *(ioLast - 1) = std::make_tuple(i, currentDist); std::push_heap(ioFirst, ioLast, comparator); } } std::sort_heap(ioFirst, ioLast, comparator); } double distPNorm(const MappedColumnVector& inX, const MappedColumnVector& inY, double p) { if (inX.size() != inY.size()) { throw std::runtime_error("Found input arrays of " "different lengths unexpectedly."); } if (p <= 0 || std::isnan(p)) { throw std::runtime_error("Expect input p to be positive."); } if (!std::isfinite(p)) { return (inX - inY).lpNorm(); } else { double res = 0.0; for (int i = 0; i < inX.size(); i++) { res += std::pow(std::abs(inX(i) - inY(i)), p); } return std::pow(res, 1./p); } } double distNorm1(const MappedColumnVector& inX, const MappedColumnVector& inY) { if (inX.size() != inY.size()) { throw std::runtime_error("Found input arrays of " "different lengths unexpectedly."); } return (inX - inY).lpNorm<1>(); } double distNorm2(const MappedColumnVector& inX, const MappedColumnVector& inY) { if (inX.size() != inY.size()) { throw std::runtime_error("Found input arrays of " "different lengths unexpectedly."); } return (inX - inY).norm(); } double cosineSimilarity(const MappedColumnVector& inX, const MappedColumnVector& inY) { if (inX.size() != inY.size()) { throw std::runtime_error("Found input arrays of " "different lengths unexpectedly."); } double xnorm = inX.norm(), ynorm = inY.norm(); if (xnorm < std::numeric_limits::denorm_min() || ynorm < std::numeric_limits::denorm_min()) { return -1; } return inX.dot(inY) / (xnorm * ynorm); } double squaredDistNorm2(const MappedColumnVector& inX, const MappedColumnVector& inY) { if (inX.size() != inY.size()) { throw std::runtime_error("Found input arrays of " "different lengths unexpectedly."); } return (inX - inY).squaredNorm(); } double distAngle(const MappedColumnVector& inX, const MappedColumnVector& inY) { if (inX.size() != inY.size()) { throw std::runtime_error("Found input arrays of " "different lengths unexpectedly."); } // Deal with the undefined case where one of the norm is zero // Angle is not defined. Just return \pi. double xnorm = inX.norm(), ynorm = inY.norm(); if (xnorm < std::numeric_limits::denorm_min() || ynorm < std::numeric_limits::denorm_min()) return std::acos(-1); double cosine = dot(inX, inY) / (xnorm * ynorm); if (cosine > 1) cosine = 1; else if (cosine < -1) cosine = -1; return std::acos(cosine); } double distTanimoto(const MappedColumnVector& inX, const MappedColumnVector& inY) { if (inX.size() != inY.size()) { throw std::runtime_error("Found input arrays of " "different lengths unexpectedly."); } // Note that this is not a metric in general! double dotProduct = dot(inX, inY); double tanimoto = inX.squaredNorm() + inY.squaredNorm(); return (tanimoto - 2 * dotProduct) / (tanimoto - dotProduct); } double distJaccard(const ArrayHandle& inX, const ArrayHandle& inY) { if (inX.size() == 0 && inY.size() == 0) return 0.0; // both empty are treated as zero distance if (inX.size() == 0 || inY.size() == 0) return 1.0; // one of the sets being empty is treated as max distance std::set x_set; for (size_t i = 0; i < inX.size(); i++){ x_set.insert(std::string(VARDATA_ANY(inX[i]), VARSIZE_ANY(inX[i]) - VARHDRSZ)); } size_t n_intersection = 0; size_t n_union = x_set.size(); std::set y_set; for (size_t i = 0; i < inY.size(); i++){ string y_elem = std::string(VARDATA_ANY(inY[i]), VARSIZE_ANY(inY[i]) - VARHDRSZ); if (y_set.count(y_elem) == 0){ // for sets, count returns 1 if an element with that value // exists in the container, and zero otherwise. if (x_set.count(y_elem) == 1){ // element present in both sets (already counted for union) n_intersection++; } else{ // element only in inY, not yet counted for union n_union++; } } y_set.insert(y_elem); } return 1.0 - static_cast(n_intersection) / static_cast(n_union); } /** * @brief Compute the k columns of a matrix that are closest to a vector * * For performance, we cheat here: For the following four distance functions, we * take a special shortcut. * FIXME: FunctionHandle should be tuned so that this shortcut no longer * impacts performance by more than, say, ~10%. */ std::string dist_fn_name(string s) { std::istringstream ss(s); std::string token, fname; if (std::getline(ss, token, '.')) fname = token; // suppose there is no schema name if (std::getline(ss, token, '.')) fname = token; // previous part is schema name return fname; } template inline void closestColumnsAndDistancesShortcut( const MappedMatrix& inMatrix, const MappedColumnVector& inVector, FunctionHandle &inDist, std::string fname, RandomAccessIterator ioFirst, RandomAccessIterator ioLast) { // Sorted in the order of expected use if (fname.compare("squared_dist_norm2") == 0) closestColumnsAndDistances(inMatrix, inVector, squaredDistNorm2, ioFirst, ioLast); else if (fname.compare("dist_norm2") == 0) closestColumnsAndDistances(inMatrix, inVector, distNorm2, ioFirst, ioLast); else if (fname.compare("dist_norm1") == 0) closestColumnsAndDistances(inMatrix, inVector, distNorm1, ioFirst, ioLast); else if (fname.compare("dist_angle") == 0) closestColumnsAndDistances(inMatrix, inVector, distAngle, ioFirst, ioLast); else if (fname.compare("dist_tanimoto") == 0) { closestColumnsAndDistances(inMatrix, inVector, distTanimoto, ioFirst, ioLast); } else { closestColumnsAndDistancesUDF(inMatrix, inVector, ioFirst, ioLast, inDist.funcID()); } } /** * @brief Compute the minimum distance between a vector and any column of a * matrix * * This function calls a user-supplied function, for which it does not do * garbage collection. It is therefore meant to be called only constantly many * times before control is returned to the backend. */ AnyType closest_column::run(AnyType& args) { //if (true) throw std::runtime_error("Begin cc run\n"); try{ MappedMatrix M = args[0].getAs(); MappedColumnVector x = args[1].getAs(); FunctionHandle dist = args[2].getAs() .unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall); string dist_fname = args[3].getAs(); std::string fname = dist_fn_name(dist_fname); std::tuple result; closestColumnsAndDistancesShortcut(M, x, dist, fname, &result, &result + 1); AnyType tuple; return tuple << static_cast(std::get<0>(result)) << std::get<1>(result); }catch (const ArrayWithNullException &e) { return Null(); } } /** * @brief Compute the minimum distance between a vector and any column of a * matrix * * This function calls a user-supplied function, for which it does not do * garbage collection. It is therefore meant to be called only constantly many * times before control is returned to the backend. */ AnyType closest_columns::run(AnyType& args) { /* If the input has a null value, we want to return nothing for that * particular data point (because we cannot calculate the distance) * instead of failing. */ try{ MappedMatrix M = args[0].getAs(); MappedColumnVector x = args[1].getAs(); uint32_t num = args[2].getAs(); FunctionHandle dist = args[3].getAs() .unsetFunctionCallOptions(FunctionHandle::GarbageCollectionAfterCall); string dist_fname = args[4].getAs(); std::string fname = dist_fn_name(dist_fname); std::vector > result(num); closestColumnsAndDistancesShortcut(M, x, dist, fname, result.begin(), result.end()); MutableArrayHandle indices = allocateArray(num); MutableArrayHandle distances = allocateArray(num); for (uint32_t i = 0; i < num; ++i) std::tie(indices[i], distances[i]) = result[i]; AnyType tuple; return tuple << indices << distances; }catch (const ArrayWithNullException &e) { return Null(); } } AnyType norm1::run(AnyType& args) { return static_cast(args[0].getAs().lpNorm<1>()); } AnyType norm2::run(AnyType& args) { return static_cast(args[0].getAs().norm()); } AnyType dist_inf_norm::run(AnyType& args) { return distPNorm( args[0].getAs(), args[1].getAs(), std::numeric_limits::infinity() ); } AnyType dist_pnorm::run(AnyType& args) { return distPNorm( args[0].getAs(), args[1].getAs(), args[2].getAs() ); } AnyType dist_norm1::run(AnyType& args) { return distNorm1( args[0].getAs(), args[1].getAs() ); } AnyType dist_norm2::run(AnyType& args) { // FIXME: it would be nice to declare this as a template function (so it // works for dense and sparse vectors), and the C++ AL takes care of the // rest... return distNorm2( args[0].getAs(), args[1].getAs() ); } AnyType cosine_similarity::run(AnyType& args) { return cosineSimilarity( args[0].getAs(), args[1].getAs() ); } AnyType squared_dist_norm2::run(AnyType& args) { return squaredDistNorm2( args[0].getAs(), args[1].getAs() ); } AnyType dist_angle::run(AnyType& args) { return distAngle( args[0].getAs(), args[1].getAs() ); } AnyType dist_tanimoto::run(AnyType& args) { return distTanimoto( args[0].getAs(), args[1].getAs() ); } AnyType dist_jaccard::run(AnyType& args) { return distJaccard( args[0].getAs >(), args[1].getAs >() ); } } // namespace linalg } // namespace modules } // namespace regress