//---------------------------------------------------------------------------
//	Greenplum Database
//	Copyright (C) 2018 VMware, Inc. or its affiliates.
//
//	@filename:
//		CHistogram.h
//
//	@doc:
//		Histogram implementation
//---------------------------------------------------------------------------
#ifndef GPNAUCRATES_CHistogram_H
#define GPNAUCRATES_CHistogram_H

#include "gpos/base.h"
#include "gpos/common/DbgPrintMixin.h"

#include "gpopt/base/CKHeap.h"
#include "naucrates/statistics/CBucket.h"
#include "naucrates/statistics/CStatsPred.h"

namespace gpopt
{
class CColRef;
class CStatisticsConfig;
class CColumnFactory;
}  // namespace gpopt

namespace gpnaucrates
{
// type definitions
// array of doubles
using CDoubleArray = CDynamicPtrArray<CDouble, CleanupDelete>;

//---------------------------------------------------------------------------
//	@class:
//		CHistogram
//
//	@doc:
//
//---------------------------------------------------------------------------
class CHistogram : public gpos::DbgPrintMixin<CHistogram>
{
	// hash map from column id to a histogram
	using UlongToHistogramMap =
		CHashMap<ULONG, CHistogram, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
				 CleanupDelete<ULONG>, CleanupDelete<CHistogram>>;

	// iterator
	using UlongToHistogramMapIter =
		CHashMapIter<ULONG, CHistogram, gpos::HashValue<ULONG>,
					 gpos::Equals<ULONG>, CleanupDelete<ULONG>,
					 CleanupDelete<CHistogram>>;

	// hash map from column ULONG to CDouble
	using UlongToDoubleMap =
		CHashMap<ULONG, CDouble, gpos::HashValue<ULONG>, gpos::Equals<ULONG>,
				 CleanupDelete<ULONG>, CleanupDelete<CDouble>>;

	// iterator
	using UlongToDoubleMapIter =
		CHashMapIter<ULONG, CDouble, gpos::HashValue<ULONG>,
					 gpos::Equals<ULONG>, CleanupDelete<ULONG>,
					 CleanupDelete<CDouble>>;

private:
	// shared memory pool
	CMemoryPool *m_mp;

	// all the buckets in the histogram. This is shared among histograms,
	// so must not be modified unless we first make a new copy. We do not copy
	// histograms unless required, as it is an expensive operation in memory and time.
	CBucketArray *m_histogram_buckets;

	// well-defined histogram. if false, then bounds are unknown
	BOOL m_is_well_defined;

	// null fraction
	CDouble m_null_freq;

	// ndistinct of tuples not covered in the buckets
	CDouble m_distinct_remaining;

	// frequency of tuples not covered in the buckets
	CDouble m_freq_remaining;

	// has histogram skew been measures
	BOOL m_skew_was_measured;

	// skew estimate
	CDouble m_skew;

	// was the NDVs in histogram scaled
	BOOL m_NDVs_were_scaled;

	// is column statistics missing in the database
	BOOL m_is_col_stats_missing;

	// return an array buckets after applying equality filter on the histogram buckets
	CBucketArray *MakeBucketsWithEqualityFilter(CPoint *point) const;

	// return an array buckets after applying non equality filter on the histogram buckets
	CBucketArray *MakeBucketsWithInequalityFilter(CPoint *point) const;

	// less than or less than equal filter
	CHistogram *MakeHistogramLessThanOrLessThanEqualFilter(
		CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point) const;

	// greater than or greater than equal filter
	CHistogram *MakeHistogramGreaterThanOrGreaterThanEqualFilter(
		CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point) const;

	// equal filter
	CHistogram *MakeHistogramEqualFilter(CPoint *point) const;

	// not equal filter
	CHistogram *MakeHistogramInequalityFilter(CPoint *point) const;

	// IDF filter
	CHistogram *MakeHistogramIDFFilter(CPoint *point) const;

	// INDF filter
	CHistogram *MakeHistogramINDFFilter(CPoint *point) const;

	// equality join
	CHistogram *MakeJoinHistogramEqualityFilter(
		const CHistogram *histogram) const;

	// generate histogram based on NDV
	CHistogram *MakeNDVBasedJoinHistogramEqualityFilter(
		const CHistogram *histogram) const;

	// construct a new histogram for an INDF join predicate
	CHistogram *MakeJoinHistogramINDFFilter(const CHistogram *histogram) const;

	// accessor for n-th bucket
	CBucket *operator[](ULONG) const;

	// Populate sample ratio within each bucket
	void GetSampleRate(DOUBLE left, DOUBLE right, DOUBLE *sample_rate,
					   ULONG index);

	// compute skew estimate
	void ComputeSkew();

	// helper to add buckets from one histogram to another
	static void AddBuckets(CMemoryPool *mp, const CBucketArray *src_buckets,
						   CBucketArray *dest_buckets, CDouble rows_old,
						   CDouble rows_new, ULONG begin, ULONG end);

	static void AddBuckets(CMemoryPool *mp, const CBucketArray *src_buckets,
						   CBucketArray *dest_buckets, CDouble rows,
						   CDoubleArray *dest_bucket_freqs, ULONG begin,
						   ULONG end);

	// helper to combine histogram buckets to reduce total buckets
	static CBucketArray *CombineBuckets(CMemoryPool *mp, CBucketArray *buckets,
										ULONG desired_num_buckets);

	// check if we can compute NDVRemain for JOIN histogram for the given input histograms
	static BOOL CanComputeJoinNDVRemain(const CHistogram *histogram1,
										const CHistogram *histogram2);

	// compute the effects of the NDV and frequency of the tuples not captured
	// by the histogram
	static void ComputeJoinNDVRemainInfo(
		const CHistogram *histogram1, const CHistogram *histogram2,
		const CBucketArray *join_buckets,  // join buckets
		CDouble
			hist1_buckets_freq,	 // frequency of the buckets in input1 that contributed to the join
		CDouble
			hist2_buckets_freq,	 // frequency of the buckets in input2 that contributed to the join
		CDouble *result_distinct_remain, CDouble *result_freq_remain);


	// check if the cardinality estimation should be done only via NDVs
	static BOOL NeedsNDVBasedCardEstimationForEq(const CHistogram *histogram);

	BOOL IsHistogramForTextRelatedTypes() const;

	// add residual union all buckets after the merge
	ULONG AddResidualUnionAllBucket(CBucketArray *histogram_buckets,
									CBucket *bucket, CDouble rows_old,
									CDouble rows_new, BOOL bucket_is_residual,
									ULONG index) const;

	// add residual union buckets after the merge
	ULONG AddResidualUnionBucket(CBucketArray *histogram_buckets,
								 CBucket *bucket, CDouble rows,
								 BOOL bucket_is_residual, ULONG index,
								 CDoubleArray *dest_bucket_freqs) const;

	// used to keep track of adjacent stats buckets and how similar
	// they are in terms of distribution
	struct SAdjBucketBoundary
	{
		// boundary_index 0 refers to boundary between b[0] and b[1]
		ULONG m_boundary_index;
		// similarity factor between two adjacent buckets calculated as (freq0/ndv0 - freq1/ndv1) + (freq0/width0 - freq1/width1)
		CDouble m_similarity_factor;

		SAdjBucketBoundary(ULONG index, CDouble similarity_factor)
			: m_boundary_index(index), m_similarity_factor(similarity_factor)
		{
		}

		// used for sorting in the binary heap
		CDouble
		DCost() const
		{
			return m_similarity_factor;
		}
		CDouble
		GetCostForHeap() const
		{
			return DCost();
		}
	};
	using SAdjBucketBoundaryArray =
		CDynamicPtrArray<SAdjBucketBoundary, CleanupDelete<SAdjBucketBoundary>>;

public:
	CHistogram &operator=(const CHistogram &) = delete;

	CHistogram(const CHistogram &) = delete;

	// ctors
	explicit CHistogram(CMemoryPool *mp, CBucketArray *histogram_buckets,
						BOOL is_well_defined = true);

	explicit CHistogram(CMemoryPool *mp, BOOL is_well_defined = true);

	CHistogram(CMemoryPool *mp, CBucketArray *histogram_buckets,
			   BOOL is_well_defined, CDouble null_freq,
			   CDouble distinct_remaining, CDouble freq_remaining,
			   BOOL is_col_stats_missing = false);

	// set null frequency
	void SetNullFrequency(CDouble null_freq);

	// set information about the scaling of NDVs
	void
	SetNDVScaled()
	{
		m_NDVs_were_scaled = true;
	}

	// have the NDVs been scaled
	BOOL
	WereNDVsScaled() const
	{
		return m_NDVs_were_scaled;
	}

	// filter by comparing with point
	CHistogram *MakeHistogramFilter(CStatsPred::EStatsCmpType stats_cmp_type,
									CPoint *point) const;

	// filter by comparing with point and normalize
	CHistogram *MakeHistogramFilterNormalize(
		CStatsPred::EStatsCmpType stats_cmp_type, CPoint *point,
		CDouble *scale_factor) const;

	// join with another histogram
	CHistogram *MakeJoinHistogram(CStatsPred::EStatsCmpType stats_cmp_type,
								  const CHistogram *histogram) const;

	// LASJ with another histogram
	CHistogram *MakeLASJHistogram(CStatsPred::EStatsCmpType stats_cmp_type,
								  const CHistogram *histogram) const;

	// join with another histogram and normalize it.
	// If the join is not an equality join the function returns an empty histogram
	CHistogram *MakeJoinHistogramNormalize(
		CStatsPred::EStatsCmpType stats_cmp_type, CDouble rows,
		const CHistogram *other_histogram, CDouble rows_other,
		CDouble *scale_factor) const;

	// scale factor of inequality (!=) join
	CDouble GetInequalityJoinScaleFactor(CDouble rows,
										 const CHistogram *other_histogram,
										 CDouble rows_other) const;

	// left anti semi join with another histogram and normalize
	CHistogram *MakeLASJHistogramNormalize(
		CStatsPred::EStatsCmpType stats_cmp_type, CDouble rows,
		const CHistogram *other_histogram, CDouble *scale_factor,
		BOOL
			DoIgnoreLASJHistComputation	 // except for the case of LOJ cardinality estimation this flag is always
		// "true" since LASJ stats computation is very aggressive
	) const;

	// group by and normalize
	CHistogram *MakeGroupByHistogramNormalize(CDouble rows,
											  CDouble *scale_factor) const;

	// union all and normalize
	CHistogram *MakeUnionAllHistogramNormalize(
		CDouble rows, const CHistogram *other_histogram,
		CDouble rows_other) const;

	// union and normalize
	CHistogram *MakeUnionHistogramNormalize(CDouble rows,
											const CHistogram *other_histogram,
											CDouble rows_other,
											CDouble *num_output_rows) const;

	// cleanup residual buckets
	static void CleanupResidualBucket(CBucket *bucket, BOOL bucket_is_residual);

	// get the next bucket for union / union all
	static CBucket *GetNextBucket(const CHistogram *histogram,
								  CBucket *new_bucket,
								  BOOL *target_bucket_is_residual,
								  ULONG *current_bucket_index);

	// number of buckets
	ULONG
	GetNumBuckets() const
	{
		GPOS_ASSERT(m_histogram_buckets != nullptr);
		return m_histogram_buckets->Size();
	}

	// buckets accessor
	const CBucketArray *
	GetBuckets() const
	{
		return m_histogram_buckets;
	}

	// well defined
	BOOL
	IsWellDefined() const
	{
		return m_is_well_defined;
	}

	BOOL ContainsOnlySingletonBuckets() const;

	// is the column statistics missing in the database
	BOOL
	IsColStatsMissing() const
	{
		return m_is_col_stats_missing;
	}

	// print function
	IOstream &OsPrint(IOstream &os) const;

	// total frequency from buckets and null fraction
	CDouble GetFrequency() const;

	// total number of distinct values
	CDouble GetNumDistinct() const;

	// is histogram well formed
	BOOL IsValid() const;

	// return copy of histogram
	CHistogram *CopyHistogram() const;

	// destructor
	virtual ~CHistogram()
	{
		m_histogram_buckets->Release();
	}

	// normalize histogram and return scaling factor
	CDouble NormalizeHistogram();

	// is histogram normalized
	BOOL IsNormalized() const;

	// translate the histogram into a derived column stats
	CDXLStatsDerivedColumn *TranslateToDXLDerivedColumnStats(
		CMDAccessor *md_accessor, ULONG colid, CDouble width) const;

	// randomly pick a bucket index
	ULONG GetRandomBucketIndex(ULONG *seed) const;

	// estimate of data skew
	CDouble
	GetSkew()
	{
		if (!m_skew_was_measured)
		{
			ComputeSkew();
		}

		return m_skew;
	}

	// accessor of null fraction
	CDouble
	GetNullFreq() const
	{
		return m_null_freq;
	}

	// accessor of remaining number of tuples
	CDouble
	GetDistinctRemain() const
	{
		return m_distinct_remaining;
	}

	// accessor of remaining frequency
	CDouble
	GetFreqRemain() const
	{
		return m_freq_remaining;
	}

	// check if histogram is empty
	BOOL IsEmpty() const;

	// cap the total number of distinct values (NDVs) in buckets to the number of rows
	void CapNDVs(CDouble rows);

	// is comparison type supported for filters for text columns
	static BOOL IsOpSupportedForTextFilter(
		CStatsPred::EStatsCmpType stats_cmp_type);

	// is comparison type supported for filters
	static BOOL IsOpSupportedForFilter(
		CStatsPred::EStatsCmpType stats_cmp_type);

	// is the join predicate's comparison type supported
	static BOOL JoinPredCmpTypeIsSupported(
		CStatsPred::EStatsCmpType stats_cmp_type);

	// create the default histogram for a given column reference
	static CHistogram *MakeDefaultHistogram(CMemoryPool *mp, CColRef *col_ref,
											BOOL is_empty);

	// create the default non empty histogram for a boolean column
	static CHistogram *MakeDefaultBoolHistogram(CMemoryPool *mp);

	// helper method to append histograms from one map to the other
	static void AddHistograms(CMemoryPool *mp,
							  UlongToHistogramMap *src_histograms,
							  UlongToHistogramMap *dest_histograms);

	// add dummy histogram buckets and column width for the array of columns
	static void AddDummyHistogramAndWidthInfo(
		CMemoryPool *mp, CColumnFactory *col_factory,
		UlongToHistogramMap *output_histograms,
		UlongToDoubleMap *output_col_widths, const ULongPtrArray *columns,
		BOOL is_empty);

	// add dummy histogram buckets for the columns in the input histogram
	static void AddEmptyHistogram(CMemoryPool *mp,
								  UlongToHistogramMap *output_histograms,
								  UlongToHistogramMap *input_histograms);

	// create a deep copy of m_histogram_buckets
	static CBucketArray *DeepCopyHistogramBuckets(CMemoryPool *mp,
												  const CBucketArray *buckets);

	// default histogram selectivity
	static const CDouble DefaultSelectivity;

	// minimum number of distinct values in a column
	static const CDouble MinDistinct;

	// default scale factor when there is no filtering of input
	static const CDouble NeutralScaleFactor;

	// default Null frequency
	static const CDouble DefaultNullFreq;

	// default NDV remain
	static const CDouble DefaultNDVRemain;

	// default frequency of NDV remain
	static const CDouble DefaultNDVFreqRemain;
};	// class CHistogram

}  // namespace gpnaucrates

#endif	// !GPNAUCRATES_CHistogram_H

// EOF