/*-------------------------------------------------------------------------
 *
 * plan_error.c
 *    Pass through a query plan and calculate estimation error.
 *
 * Copyright (c) 2024-2025 Andrei Lepikhov
 *
 * This software may be modified and distributed under the terms
 * of the MIT licence. See the LICENSE file for details.
 *
 *-------------------------------------------------------------------------
 */

#include "postgres.h"

#include <math.h>

#include "nodes/execnodes.h"
#include "optimizer/optimizer.h"

#include "plan_error.h"

static bool
prediction_walker(PlanState *pstate, void *context)
{
    double      plan_rows,
                real_rows = 0;
    PlanEstimatorContext *ctx = (PlanEstimatorContext *) context;
    double      nloops;
    int         tmp_counter;
    double      relative_time;
    double      node_error;
    double      node_time;      /* Time in msecs */

    /* At first, increment the counter */
    ctx->counter++;
    tmp_counter = ctx->counter;

    planstate_tree_walker(pstate, prediction_walker, context);

    if (pstate->subPlan != NIL)
    {
        /*
         * Analyze SubPlans to find the worst cost factor.
         *
         * SubPlans are correlated subqueries that execute multiple times
         * (once per outer row). We calculate a dimensionless factor that
         * indicates optimization potential:
         *
         * sp_factor = (nloops / log(nloops + 1)) * (subplan_time / query_time)
         *
         * The logarithmic dampening reflects that optimization value doesn't
         * grow linearly with loops - converting 10K loops to 1 isn't 10x more
         * valuable than converting 1K loops to 1.
         */
        foreach_node(SubPlanState, sps, pstate->subPlan)
        {
            Instrumentation *instr = sps->planstate->instrument;
            double      nloops;
            double      subplan_time;
            double      loop_factor;
            double      cost_factor;

            Assert(instr != NULL && sps->planstate->worker_instrument == NULL);

            InstrEndLoop(instr);

            if (instr->nloops <= 0. || ctx->totaltime <= 0.)
                continue;

            nloops = instr->nloops;
#if PG_VERSION_NUM >= 190000
            subplan_time = INSTR_TIME_GET_MILLISEC(instr->total);
#else
            subplan_time = instr->total * 1000.;
#endif
            relative_time = subplan_time / ctx->totaltime;

            /*
             * Calculate the logarithmically dampened loop factor. This
             * creates super-linear but sub-quadratic growth.
             */
            loop_factor = nloops / log(nloops + 1.);

            /*
             * The final factor is dimensionless: loop_factor * time_ratio.
             * It can be compared across different queries to identify the
             * most promising optimization candidates.
             */
            cost_factor = loop_factor * relative_time;

            /*
             * Track the maximum (worst) SubPlan cost factor. Higher values
             * indicate subplans that consume significant time and execute
             * many loops - prime candidates for JOIN conversion.
             */
            if (cost_factor > ctx->f_worst_splan)
                ctx->f_worst_splan = cost_factor;
        }
    }
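
    /*
     * For illustration of the SubPlan cost factor above (numbers assumed): a
     * subplan with nloops = 1000 that runs for 200ms of a 1000ms query gives
     * loop_factor = 1000 / ln(1001) ~= 144.8 and relative_time = 0.2, hence
     * cost_factor ~= 29. Ten times more loops (nloops = 10000) yields
     * loop_factor ~= 1085.7 - only about 7.5x larger, which is exactly the
     * sub-linear dampening described above.
     */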

    if (!pstate->instrument)
        return false;

    /*
     * Finish the node before analysis. Only after that may we touch any
     * instrument fields.
     */
    InstrEndLoop(pstate->instrument);

    nloops = pstate->instrument->nloops;

#if PG_VERSION_NUM >= 190000
    node_time = INSTR_TIME_IS_ZERO(pstate->instrument->total) ? 0.0 :
        INSTR_TIME_GET_MILLISEC(pstate->instrument->total);
#else
    node_time = pstate->instrument->total * 1000.;
#endif

    if (nloops <= 0.0 || node_time <= 0.0)

        /*
         * Skip the 'never executed' case, the "0-tuple situation" and the
         * case when timing instrumentation was switched off manually.
         */
        return false;

    /*
     * Calculate the number of rows predicted by the optimiser and the number
     * that really passed through the node. This simplistic code becomes a
     * bit tricky in the case of parallel workers.
     *
     * Clarification: we take into account tuples that the node has filtered.
     * Although the EXPLAINed nrows shows the number of tuples 'produced', we
     * follow this logic because any tuple that came to the node took some
     * effort and resources to process. So, according to the idea of
     * detecting potentially non-optimal decisions, filtered tuples should be
     * added into the error estimation too: we have evidence of frequent
     * IndexScan non-optimality because fetched but filtered tuples hurt
     * query performance.
     */
    if (pstate->worker_instrument)
    {
        double      wnloops = 0.;
        double      wntuples = 0.;
        double      divisor = pstate->worker_instrument->num_workers;
        double      leader_contribution;
        int         i;

        Assert(pstate->worker_instrument->instrument != NULL);

        /* XXX: Copy-pasted from get_parallel_divisor() */
        if (parallel_leader_participation)
        {
            leader_contribution = 1.0 - (0.3 * divisor);
            if (leader_contribution > 0)
                divisor += leader_contribution;
        }

        plan_rows = pstate->plan->plan_rows * divisor;
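
        /*
         * For illustration (numbers assumed): with num_workers = 2 and an
         * actively participating leader, leader_contribution = 1.0 - 0.3 * 2
         * = 0.4 and divisor = 2.4, so a per-process estimate of 1000 rows
         * scales up to 2400 rows for the node as a whole.
         */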

        /*
         * Clarification: this is a slightly tricky part of the code. The
         * pstate's instrument contains aggregated data, while
         * worker_instrument contains per-worker data. Calculating how many
         * tuples were actually processed per loop is not trivial here.
         * Imagine a skewed case where each worker executed a different
         * number of loops and processed a different number of tuples.
         * Let's say:
         * Worker 1: nloops = 1, ntuples = 2
         * Worker 2: nloops = 10, ntuples = 50
         * Using the averaged value we get 52/11 = 4.7. But actually worker 1
         * processed 2 tuples per loop and worker 2 processed 5. So, together
         * they processed 2 + 5 = 7 tuples per loop.
         */
        for (i = 0; i < pstate->worker_instrument->num_workers; i++)
        {
            Instrumentation *instr = &pstate->worker_instrument->instrument[i];

            if (instr->nloops <= 0.0)
            {
                /*
                 * A worker could start but not process any tuples just
                 * because of laziness. Skip such a worker.
                 */

                /*
                 * In development, check that we live in the space of correct
                 * assumptions.
                 */
                Assert(instr->ntuples <= 0.);
                continue;
            }

            /*
             * In leaf nodes we should take filtered tuples into account.
             *
             * NOTE: We don't consider filtered tuples in non-leaf nodes.
             * The planner's prediction for filtered tuples comes from the
             * nrows values of incoming and outgoing tuples. The extension is
             * mature enough to include separate leaf and join filtering
             * factors in the final report. So, we may detect fetching
             * inefficiencies in leaf nodes using that factor and let our
             * error reflect prediction mismatches only. Do not remove this
             * code entirely for a while - it stays for the info and a
             * possibly quick switch in the future.
             */
#if 0
            if (tmp_counter == ctx->counter)
                wntuples += instr->nfiltered1 + instr->nfiltered2 +
                    instr->ntuples2;
#endif

            /* NOTE: nloops == 0 cases were filtered out above */
            wntuples += instr->ntuples;
            wnloops += instr->nloops;
            real_rows += instr->ntuples / instr->nloops;
        }

        Assert(nloops >= wnloops);

        /* Calculate the portion of work done by the main process */
        if (nloops - wnloops > 0.0)
        {
            double      ntuples = pstate->instrument->ntuples;

            /* In leaf nodes we should take filtered tuples into account */
#if 0
            /* Mostly for the info and a possibly quick switch in the future */
            if (tmp_counter == ctx->counter)
                ntuples += (pstate->instrument->nfiltered1 +
                            pstate->instrument->nfiltered2 +
                            pstate->instrument->ntuples2);
#endif
            Assert(ntuples >= wntuples);
            real_rows += (ntuples - wntuples) / (nloops - wnloops);
        }
    }
    else
    {
        plan_rows = pstate->plan->plan_rows;
        real_rows = pstate->instrument->ntuples / nloops;

        /* In leaf nodes we should take filtered tuples into account */
#if 0
        /* Mostly for the info and a possibly quick switch in the future */
        if (tmp_counter == ctx->counter)
            real_rows += (pstate->instrument->nfiltered1 +
                          pstate->instrument->nfiltered2 +
                          pstate->instrument->ntuples2) / nloops;
#endif
    }

    plan_rows = clamp_row_est(plan_rows);

    /*
     * For parameterised subplans it is typical that real_rows is less than
     * 1. If all rows were filtered, assume there was only one tuple across
     * all the loops. That shouldn't be a huge overestimation unless a single
     * iteration of this subtree costs a lot.
     */
    if (real_rows <= 0.0)
        real_rows = 1. / pstate->instrument->nloops;

    /* Don't be afraid of overflow here: plan_rows is forced to be >= 1 */
    node_error = fabs(log(real_rows / plan_rows));
    ctx->avg_error += node_error;
    ctx->rms_error += node_error * node_error;
    relative_time = node_time / pstate->instrument->nloops / ctx->totaltime;
    ctx->twa_error += node_error * relative_time;

    /* Don't forget about the very rare potential case of zero cost */
    if (ctx->totalcost > 0.)
    {
        double      relative_cost;

        relative_cost = pstate->plan->total_cost / ctx->totalcost;
        ctx->wca_error += node_error * relative_cost;
    }
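
    /*
     * For illustration of node_error above (numbers assumed): a node
     * predicted to produce 100 rows that actually produced 10000 gets
     * node_error = |ln(10000/100)| = ln(100) ~= 4.6. The inverse
     * misestimation (10000 predicted, 100 produced) yields the same value,
     * so over- and underestimation are penalized symmetrically.
     */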

    /*
     * Track maximum filtered rows for JOIN nodes.
     *
     * JOIN nodes filter out rows that don't match the join conditions, and
     * tracking the maximum across all JOINs helps identify queries with
     * inefficient join strategies or missing indexes. Divide by nloops to
     * get a per-loop average, as with the other metrics.
     *
     * The factor is weighted by relative_time to prioritize nodes that
     * consume significant query execution time. A high ratio of
     * filtered-to-produced rows matters more when the node is expensive.
     * Normalizing by real_rows gives us the relative overhead: how many rows
     * we filter per row we produce. Thus, jf_factor represents the
     * time-weighted filtering overhead, helping identify JOINs where
     * excessive filtering significantly impacts overall query performance.
     */
    if (IsA(pstate->plan, NestLoop) || IsA(pstate->plan, HashJoin) ||
        IsA(pstate->plan, MergeJoin))
    {
        double      jf_factor = ((pstate->instrument->nfiltered1 +
                                  pstate->instrument->nfiltered2) / nloops);

        ctx->njoins++;

        if (jf_factor > 0.)
            jf_factor *= relative_time / real_rows;
        if (jf_factor > ctx->f_join_filter)
            ctx->f_join_filter = jf_factor;
    }

    /*
     * Track maximum nfiltered1 for leaf nodes.
     *
     * Leaf nodes are scan nodes that directly access data sources. High
     * nfiltered1 values indicate that many rows were fetched but filtered
     * out, suggesting potential for better indexes or more selective
     * predicates. Divide by nloops to get a per-loop average, as with the
     * other metrics.
     *
     * Similarly to jf_factor, we weight by relative_time to emphasize leaf
     * nodes where filtering overhead consumes substantial query time.
     * Fetching 1000 rows but using only 10 is problematic, but only
     * actionable if this node takes significant time. Normalizing by
     * real_rows gives the filtering ratio (filtered/produced), and
     * multiplying by relative_time yields a time-weighted filtering cost.
     * This helps surface scans that would benefit most from better indexing
     * or predicate pushdown.
     */
    if (tmp_counter == ctx->counter)
    {
        double      lf_factor = (pstate->instrument->nfiltered1 / nloops);

        if (lf_factor > 0.)
            lf_factor *= relative_time / real_rows;
        if (lf_factor > ctx->f_scan_filter)
            ctx->f_scan_filter = lf_factor;
    }

    ctx->nnodes++;
    return false;
}

/*
 * Assess planning quality.
 *
 * Compare the execution state against the plan. Passing through each node,
 * compute different types of relative error and save them in the context.
 * Return the estimated error, which has proved helpful in many cases.
 */
double
plan_error(QueryDesc *queryDesc, PlanEstimatorContext *ctx)
{
    PlanState  *pstate = queryDesc->planstate;

    ctx->avg_error = 0.;
    ctx->rms_error = 0.;
    ctx->twa_error = 0.;
    ctx->wca_error = 0.;
#if PG_VERSION_NUM >= 190000
    ctx->totaltime = INSTR_TIME_IS_ZERO(queryDesc->totaltime->total) ? 0.0 :
        INSTR_TIME_GET_MILLISEC(queryDesc->totaltime->total);
#else
    ctx->totaltime = queryDesc->totaltime->total * 1000.;
#endif
    ctx->totalcost = queryDesc->plannedstmt->planTree->total_cost;
    ctx->nnodes = 0;
    ctx->counter = 0;
    ctx->njoins = 0;

    /*
     * Collect buffer usage statistics from this execution (summarise
     * permanent and temp table block types). For the sake of optimisation
     * preciseness we don't distinguish blocks found in memory from blocks
     * fetched from disk - the optimiser doesn't predict that.
     */
    ctx->blks_accessed = queryDesc->totaltime->bufusage.shared_blks_hit +
        queryDesc->totaltime->bufusage.shared_blks_read +
        queryDesc->totaltime->bufusage.temp_blks_read +
        queryDesc->totaltime->bufusage.temp_blks_written;

    /*
     * Collect local block statistics separately to help identify work_mem
     * issues. Local blocks indicate temporary tables/sorts spilling to disk,
     * suggesting insufficient work_mem rather than optimisation/statistics
     * errors.
     */
    ctx->local_blks = queryDesc->totaltime->bufusage.local_blks_read +
        queryDesc->totaltime->bufusage.local_blks_written +
        queryDesc->totaltime->bufusage.local_blks_dirtied;

    /* Initialize JOIN filtering statistics */
    ctx->f_join_filter = 0.;
    /* Initialize leaf node filtering statistics */
    ctx->f_scan_filter = 0.;
    /* No subplans have been evaluated yet */
    ctx->f_worst_splan = 0.;

    Assert(ctx->totaltime > 0.);

    (void) prediction_walker(pstate, (void *) ctx);

    /* Finally, average over the number of nodes */
    if (ctx->nnodes > 0)
    {
        ctx->avg_error /= ctx->nnodes;
        ctx->rms_error = sqrt(ctx->rms_error / ctx->nnodes);
        ctx->twa_error /= ctx->nnodes;
        ctx->wca_error /= ctx->nnodes;
    }
    else
        /* No nodes considered - no estimation can be made */
        ctx->avg_error = ctx->rms_error = ctx->twa_error = ctx->wca_error = -1.;

    return ctx->avg_error;
}
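
/*
 * A minimal usage sketch, kept under #if 0 and never compiled: one way a
 * caller might feed plan_error() from an ExecutorEnd hook. This wiring is an
 * assumption for illustration only - prev_ExecutorEnd and
 * report_executor_end are hypothetical names, not part of this module, and
 * the hook would additionally need "executor/executor.h".
 */
#if 0
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;

static void
report_executor_end(QueryDesc *queryDesc)
{
    /* Instrumentation must have been requested at ExecutorStart time */
    if (queryDesc->totaltime != NULL)
    {
        PlanEstimatorContext ctx;

        /* Close the total-time instrumentation before reading it */
        InstrEndLoop(queryDesc->totaltime);
        (void) plan_error(queryDesc, &ctx);
        elog(DEBUG1, "prediction errors: avg %.4lf, rms %.4lf, twa %.4lf",
             ctx.avg_error, ctx.rms_error, ctx.twa_error);
    }

    if (prev_ExecutorEnd)
        prev_ExecutorEnd(queryDesc);
    else
        standard_ExecutorEnd(queryDesc);
}
#endif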