-- ============================================================================ -- ClickHouse schema for pg_stat_ch events -- ============================================================================ -- -- CANONICAL SCHEMA REFERENCE -- -- This file is the single source of truth for the pg_stat_ch ClickHouse schema. -- It serves a dual role: -- 1. Docker init script (applied automatically by docker-compose) -- 2. Schema documentation (column comments, MV explanations) -- -- For production deployments: clickhouse-client < docker/init/00-schema.sql -- For documentation: see docs/clickhouse.md -- -- ============================================================================ -- -- This schema is designed for the pg_stat_ch PostgreSQL extension which exports -- raw query execution telemetry to ClickHouse. All aggregation (p50/p95/p99, -- top queries, error rates) happens in ClickHouse via materialized views. -- -- ============================================================================ CREATE DATABASE IF NOT EXISTS pg_stat_ch; DROP TABLE IF EXISTS pg_stat_ch.events_raw; -- ============================================================================ -- events_raw: Raw query execution events -- ============================================================================ -- -- This is the primary table storing one row per query execution. Events are -- exported in batches by the pg_stat_ch background worker. -- -- Partitioned by date for efficient time-range queries and data retention. -- Ordered by ts_start for efficient global time-range scans. -- -- ============================================================================ CREATE TABLE pg_stat_ch.events_raw ( -- ======================================================================== -- Core identity and timing -- ======================================================================== ts_start DateTime64(6, 'UTC') COMMENT 'Query execution start timestamp (UTC). Used for time-range filtering and partitioning.', duration_us UInt64 COMMENT 'Total query execution time in microseconds. HIGH: slow query, investigate EXPLAIN. LOW: fast query. Compare with p95/p99 from query_stats_5m to identify outliers.', db LowCardinality(String) COMMENT 'PostgreSQL database name. Use for multi-tenant filtering and per-database load analysis.', username LowCardinality(String) COMMENT 'PostgreSQL user/role name. Useful for auditing and per-user resource tracking.', pid Int32 COMMENT 'PostgreSQL backend process ID. Correlate with pg_stat_activity for session debugging.', query_id Int64 COMMENT '64-bit hash identifying normalized queries. Queries differing only in constants share the same query_id. Use for aggregating statistics across similar queries.', cmd_type LowCardinality(String) COMMENT 'Command type: SELECT, INSERT, UPDATE, DELETE, MERGE, UTILITY, or UNKNOWN. Use for workload characterization (read-heavy vs write-heavy).', rows UInt64 COMMENT 'Rows returned (SELECT) or affected (INSERT/UPDATE/DELETE). HIGH: large result sets or bulk operations. LOW: point queries. Watch for unexpected HIGH values indicating missing WHERE clauses.', query String COMMENT 'Full SQL query text (may be truncated). Used for debugging and query analysis.', -- ======================================================================== -- Shared buffer metrics (main buffer cache) -- ======================================================================== -- These metrics show how queries interact with PostgreSQL shared_buffers. -- Cache hit ratio = shared_blks_hit / (shared_blks_hit + shared_blks_read) -- Target: >99% for OLTP workloads, >95% for mixed workloads. -- ======================================================================== shared_blks_hit Int64 COMMENT 'Shared buffer hits (blocks found in cache). HIGH: good cache efficiency, hot data. LOW on frequently-accessed data: increase shared_buffers or data not fitting in cache.', shared_blks_read Int64 COMMENT 'Shared blocks read from disk. HIGH: cache misses, cold data, or working set exceeds shared_buffers. LOW: good cache hit ratio. High reads + low hits = tune shared_buffers or add indexes.', shared_blks_dirtied Int64 COMMENT 'Shared blocks dirtied (modified) by this query. HIGH: write-heavy query, checkpoint pressure. LOW: read-only or small writes. Monitor with checkpoint_warning for tuning.', shared_blks_written Int64 COMMENT 'Shared blocks written directly to disk by backend (not bgwriter/checkpointer). HIGH: backend doing syncs, bgwriter falling behind. Ideally LOW; writes should go through bgwriter. Increase bgwriter_lru_maxpages if high.', -- ======================================================================== -- Local buffer metrics (temp tables within session) -- ======================================================================== -- Local buffers are used for temporary tables (CREATE TEMP TABLE). -- These are session-private and don't benefit from shared_buffers. -- ======================================================================== local_blks_hit Int64 COMMENT 'Local buffer hits (temp table blocks in session cache). HIGH: repeated temp table access. LOW: temp tables not reused or not used.', local_blks_read Int64 COMMENT 'Local blocks read from disk. HIGH: large temp tables exceeding temp_buffers. Consider increasing temp_buffers or restructuring query.', local_blks_dirtied Int64 COMMENT 'Local blocks dirtied. HIGH: heavy temp table writes. Monitor for queries creating large temp tables.', local_blks_written Int64 COMMENT 'Local blocks written to disk. HIGH: temp tables spilling to disk due to temp_buffers exhaustion. Increase temp_buffers.', -- ======================================================================== -- Temp file metrics (sorts, hashes exceeding work_mem) -- ======================================================================== -- Temp files are used when work_mem is insufficient for sorts, hashes, -- or materialization. Unlike local buffers, these are always on disk. -- Non-zero values indicate work_mem pressure. -- ======================================================================== temp_blks_read Int64 COMMENT 'Temp file blocks read. HIGH: external sorts/hashes re-reading spilled data. Indicates work_mem too small for query. Consider increasing work_mem (session or global).', temp_blks_written Int64 COMMENT 'Temp file blocks written. HIGH: sorts/hashes/CTEs spilling to disk. First place to look when query is slow. Increase work_mem or optimize query to reduce intermediate data.', -- ======================================================================== -- I/O timing metrics (requires track_io_timing=on) -- ======================================================================== -- These show actual time spent on I/O, not just block counts. -- Useful for identifying storage bottlenecks vs CPU bottlenecks. -- All values in microseconds. Zero if track_io_timing=off. -- ======================================================================== shared_blk_read_time_us Int64 COMMENT 'Time reading shared blocks (μs). HIGH: storage slow, many cache misses, or high I/O latency. Compare with shared_blks_read to get avg latency per block. >10ms/block may indicate storage issues.', shared_blk_write_time_us Int64 COMMENT 'Time writing shared blocks (μs). HIGH: storage write latency issues, synchronous commits. Usually low unless backend forced to write (bgwriter behind).', local_blk_read_time_us Int64 COMMENT 'Time reading local blocks (μs). HIGH: temp tables on slow storage. Consider faster temp tablespace or reducing temp table usage.', local_blk_write_time_us Int64 COMMENT 'Time writing local blocks (μs). HIGH: temp table writes slow. Check temp tablespace storage performance.', temp_blk_read_time_us Int64 COMMENT 'Time reading temp files (μs). HIGH: work_mem spills causing I/O bottleneck. Increase work_mem or use faster temp storage.', temp_blk_write_time_us Int64 COMMENT 'Time writing temp files (μs). HIGH: sorting/hashing spilling to slow storage. Often dominates query time when work_mem is undersized.', -- ======================================================================== -- WAL (Write-Ahead Log) metrics -- ======================================================================== -- WAL is the transaction log ensuring durability. These metrics show -- write activity and are critical for replication bandwidth planning. -- ======================================================================== wal_records Int64 COMMENT 'WAL records generated. HIGH: write-intensive query, bulk operations. LOW: read-only or few writes. Useful for estimating replication bandwidth: wal_bytes/duration_us gives write rate.', wal_fpi Int64 COMMENT 'WAL full-page images (FPI) generated. HIGH: many pages modified for first time since checkpoint, or full_page_writes=on with frequent page modifications. Increases WAL volume significantly. Consider longer checkpoint_timeout if very high.', wal_bytes UInt64 COMMENT 'Total WAL bytes generated. HIGH: large transactions, COPY, bulk UPDATE/DELETE. Directly impacts replication lag and wal_keep_size. Monitor for capacity planning: sum(wal_bytes) over time = WAL generation rate.', -- ======================================================================== -- CPU time metrics -- ======================================================================== -- User vs system time breakdown. Requires getrusage() support (most systems). -- Note: May include time from parallel workers on some systems. -- ======================================================================== cpu_user_time_us Int64 COMMENT 'CPU time in user mode (μs). HIGH: CPU-intensive query (parsing, computation). If HIGH but duration_us also HIGH, query is CPU-bound. If LOW but duration HIGH, query is I/O-bound.', cpu_sys_time_us Int64 COMMENT 'CPU time in kernel mode (μs). HIGH: heavy I/O syscalls, context switches. Usually low relative to user time. Very high may indicate lock contention or I/O issues.', -- ======================================================================== -- JIT compilation metrics (PostgreSQL 15+) -- ======================================================================== -- JIT (Just-In-Time) compilation can speed up complex queries but has -- compilation overhead. Generally beneficial for long-running analytical -- queries; overhead may hurt short OLTP queries. -- Zero values if JIT not triggered (query below jit_above_cost threshold). -- ======================================================================== jit_functions Int32 COMMENT 'Functions JIT-compiled. HIGH: complex expressions, many tuple operations. >0 means JIT was triggered. Compare jit_*_time_us vs duration_us to see if JIT overhead is worth it.', jit_generation_time_us Int32 COMMENT 'Time generating JIT code (μs). Initial compilation overhead. HIGH relative to duration_us: JIT overhead exceeds benefit for this query. Consider raising jit_above_cost.', jit_deform_time_us Int32 COMMENT 'Time JIT-compiling tuple deform functions (μs). Tuple deforming unpacks rows from storage format. HIGH: many columns, complex types. PG17+ reports this separately.', jit_inlining_time_us Int32 COMMENT 'Time inlining functions (μs). Inlining can dramatically speed up execution but has compile cost. HIGH: many function calls being inlined. If too high, consider jit_inline_above_cost.', jit_optimization_time_us Int32 COMMENT 'Time optimizing JIT code (μs). LLVM optimization passes. HIGH: complex code being heavily optimized. If disproportionate to execution, consider jit_optimize_above_cost.', jit_emission_time_us Int32 COMMENT 'Time emitting final JIT code (μs). Final machine code generation. Usually smaller than optimization time. Very high may indicate LLVM issues.', -- ======================================================================== -- Parallel worker metrics (PostgreSQL 17+/18+) -- ======================================================================== -- Parallel query uses multiple workers to speed up large sequential scans, -- aggregates, and joins. Workers share work and combine results. -- ======================================================================== parallel_workers_planned Int16 COMMENT 'Parallel workers planned by optimizer. Based on table size, parallel_tuple_cost, max_parallel_workers_per_gather. >0 means query could benefit from parallelism.', parallel_workers_launched Int16 COMMENT 'Workers actually launched. If < planned: max_parallel_workers limit hit, or workers unavailable. Consistently lower than planned = increase max_parallel_workers or max_worker_processes.', -- ======================================================================== -- Error information -- ======================================================================== -- Captured via emit_log_hook when a query produces an error or warning. -- Useful for error tracking, debugging, and monitoring error rates. -- ======================================================================== err_sqlstate FixedString(5) COMMENT 'SQL standard 5-character error code. Examples: 42P01=undefined_table, 23505=unique_violation, 42601=syntax_error, 57014=query_canceled. See PostgreSQL error codes appendix.', err_elevel UInt8 COMMENT 'Error severity level. 0=none (success), 19=WARNING, 21=ERROR, 22=FATAL, 23=PANIC. Filter err_elevel>=21 for actual errors. WARNING (19) indicates potential issues.', err_message String COMMENT 'Error message text. Contains the human-readable error description. May include DETAIL and HINT in some cases.', -- ======================================================================== -- Client context -- ======================================================================== -- Identifies the source of queries for debugging and load attribution. -- ======================================================================== app LowCardinality(String) COMMENT 'Client application_name. Set via connection string or SET application_name. Use for identifying load sources: "pgAdmin", "myapp-api", "pg_dump", etc.', client_addr String COMMENT 'Client IP address. Useful for geographic analysis, debugging connection issues, and identifying load sources by host.' ) ENGINE = MergeTree PARTITION BY toDate(ts_start) ORDER BY ts_start; -- ============================================================================ -- Materialized View 1: events_recent_1h -- ============================================================================ -- -- PURPOSE: Fast access to recent events for real-time debugging and monitoring. -- -- USE CASES: -- - "Show me queries from the last 5 minutes" -- - "What's happening right now on database X?" -- - Real-time dashboards with sub-second refresh -- -- WHY A SEPARATE MV? -- - TTL keeps data bounded (auto-deletes after 1 hour) -- - Smaller data volume = faster queries -- - events_raw can have longer retention (days/weeks) -- -- EXAMPLE QUERY: -- SELECT ts_start, db, duration_us/1000 AS ms, substring(query, 1, 100) -- FROM pg_stat_ch.events_recent_1h -- WHERE ts_start > now() - INTERVAL 5 MINUTE -- ORDER BY ts_start DESC -- LIMIT 50; -- -- ============================================================================ DROP TABLE IF EXISTS pg_stat_ch.events_recent_1h; CREATE MATERIALIZED VIEW pg_stat_ch.events_recent_1h ENGINE = MergeTree PARTITION BY toDate(ts_start) ORDER BY ts_start TTL toDateTime(ts_start) + INTERVAL 1 HOUR DELETE AS SELECT * FROM pg_stat_ch.events_raw; -- ============================================================================ -- Materialized View 2: query_stats_5m -- ============================================================================ -- -- PURPOSE: Pre-aggregated query statistics in 5-minute buckets for dashboards. -- -- USE CASES: -- - Dashboard charts: QPS, latency trends, error rates over time -- - Identifying slow queries by p95/p99 latency -- - Capacity planning: queries per second by database -- -- AGGREGATE STATES: -- This MV stores intermediate "state" columns that must be finalized with -- -Merge functions when querying. This allows correct aggregation across -- multiple time buckets. -- -- HOW TO QUERY: -- - countMerge(calls_state) -> total call count -- - sumMerge(duration_sum_state) -> total duration -- - minMerge(duration_min_state) -> minimum duration -- - maxMerge(duration_max_state) -> maximum duration -- - quantilesTDigestMerge(0.95, 0.99)(duration_q_state) -> [p95, p99] -- -- EXAMPLE: Top 10 slowest query_ids by p99 latency in the last hour -- -- SELECT -- query_id, -- cmd_type, -- countMerge(calls_state) AS calls, -- round(sumMerge(duration_sum_state) / countMerge(calls_state) / 1000, 2) AS avg_ms, -- round(quantilesTDigestMerge(0.95, 0.99)(duration_q_state)[1] / 1000, 2) AS p95_ms, -- round(quantilesTDigestMerge(0.95, 0.99)(duration_q_state)[2] / 1000, 2) AS p99_ms -- FROM pg_stat_ch.query_stats_5m -- WHERE bucket >= now() - INTERVAL 1 HOUR -- GROUP BY query_id, cmd_type -- ORDER BY p99_ms DESC -- LIMIT 10; -- -- ============================================================================ DROP TABLE IF EXISTS pg_stat_ch.query_stats_5m; CREATE MATERIALIZED VIEW pg_stat_ch.query_stats_5m ( bucket DateTime COMMENT '5-minute time bucket start', db LowCardinality(String) COMMENT 'Database name', query_id Int64 COMMENT 'Normalized query identifier', cmd_type LowCardinality(String) COMMENT 'Command type (SELECT, INSERT, etc.)', calls_state AggregateFunction(count) COMMENT 'Call count state. Finalize with countMerge().', duration_sum_state AggregateFunction(sum, UInt64) COMMENT 'Total duration state. Finalize with sumMerge().', duration_min_state AggregateFunction(min, UInt64) COMMENT 'Min duration state. Finalize with minMerge().', duration_max_state AggregateFunction(max, UInt64) COMMENT 'Max duration state. Finalize with maxMerge().', duration_q_state AggregateFunction(quantilesTDigest(0.95, 0.99), UInt64) COMMENT 'Latency percentiles state. Finalize with quantilesTDigestMerge(0.95, 0.99)().', rows_sum_state AggregateFunction(sum, UInt64) COMMENT 'Total rows state. Finalize with sumMerge().', shared_hit_sum_state AggregateFunction(sum, Int64) COMMENT 'Buffer hits state. Finalize with sumMerge().', shared_read_sum_state AggregateFunction(sum, Int64) COMMENT 'Buffer reads state. Finalize with sumMerge(). Cache hit ratio = hits / (hits + reads).' ) ENGINE = AggregatingMergeTree PARTITION BY toYYYYMMDD(bucket) ORDER BY (bucket, db, query_id, cmd_type) AS SELECT toStartOfInterval(toDateTime(ts_start), INTERVAL 5 MINUTE) AS bucket, db, query_id, cmd_type, countState() AS calls_state, sumState(duration_us) AS duration_sum_state, minState(duration_us) AS duration_min_state, maxState(duration_us) AS duration_max_state, quantilesTDigestState(0.95, 0.99)(duration_us) AS duration_q_state, sumState(rows) AS rows_sum_state, sumState(shared_blks_hit) AS shared_hit_sum_state, sumState(shared_blks_read) AS shared_read_sum_state FROM pg_stat_ch.events_raw GROUP BY bucket, db, query_id, cmd_type; -- ============================================================================ -- Materialized View 3: db_app_user_1m -- ============================================================================ -- -- PURPOSE: Load overview grouped by database, application, and user. -- -- USE CASES: -- - "Which application is generating the most load?" -- - "Which user is responsible for the slow queries?" -- - Capacity planning and chargeback by tenant/application -- - Identifying misbehaving applications or users -- -- EXAMPLE: Load by application in the last 30 minutes -- -- SELECT -- app, -- countMerge(calls_state) AS total_queries, -- round(sumMerge(duration_sum_state) / 1000000, 2) AS total_seconds, -- round(quantilesTDigestMerge(0.95, 0.99)(duration_q_state)[2] / 1000, 2) AS p99_ms, -- sumMerge(errors_sum_state) AS errors -- FROM pg_stat_ch.db_app_user_1m -- WHERE bucket >= now() - INTERVAL 30 MINUTE -- GROUP BY app -- ORDER BY total_seconds DESC; -- -- EXAMPLE: Error rate by database and user -- -- SELECT -- db, -- username, -- countMerge(calls_state) AS queries, -- sumMerge(errors_sum_state) AS errors, -- round(100 * sumMerge(errors_sum_state) / countMerge(calls_state), 2) AS error_pct -- FROM pg_stat_ch.db_app_user_1m -- WHERE bucket >= now() - INTERVAL 1 HOUR -- GROUP BY db, username -- HAVING errors > 0 -- ORDER BY error_pct DESC; -- -- ============================================================================ DROP TABLE IF EXISTS pg_stat_ch.db_app_user_1m; CREATE MATERIALIZED VIEW pg_stat_ch.db_app_user_1m ( bucket DateTime COMMENT '1-minute time bucket start', db LowCardinality(String) COMMENT 'Database name', app LowCardinality(String) COMMENT 'Application name', username LowCardinality(String) COMMENT 'PostgreSQL username', cmd_type LowCardinality(String) COMMENT 'Command type', calls_state AggregateFunction(count) COMMENT 'Query count state. Finalize with countMerge().', duration_sum_state AggregateFunction(sum, UInt64) COMMENT 'Total duration state (μs). Finalize with sumMerge().', duration_q_state AggregateFunction(quantilesTDigest(0.95, 0.99), UInt64) COMMENT 'Latency percentiles state. Finalize with quantilesTDigestMerge(0.95, 0.99)().', errors_sum_state AggregateFunction(sum, UInt64) COMMENT 'Error count state. Finalize with sumMerge().' ) ENGINE = AggregatingMergeTree PARTITION BY toYYYYMMDD(bucket) ORDER BY (bucket, db, app, username, cmd_type) AS SELECT toStartOfMinute(toDateTime(ts_start)) AS bucket, db, app, username, cmd_type, countState() AS calls_state, sumState(duration_us) AS duration_sum_state, quantilesTDigestState(0.95, 0.99)(duration_us) AS duration_q_state, sumState(toUInt64(err_elevel > 0)) AS errors_sum_state FROM pg_stat_ch.events_raw GROUP BY bucket, db, app, username, cmd_type; -- ============================================================================ -- Materialized View 4: errors_recent -- ============================================================================ -- -- PURPOSE: Recent errors for debugging and incident investigation. -- -- USE CASES: -- - "What errors happened in the last hour?" -- - Incident investigation: correlate errors with specific queries -- - Error rate monitoring and alerting -- - Identifying recurring error patterns by SQLSTATE -- -- RETENTION: -- 7-day TTL keeps error history for investigation while bounding storage. -- Adjust TTL based on your incident response SLA. -- -- EXAMPLE: Recent errors with query context -- -- SELECT -- ts_start, -- db, -- username, -- app, -- err_sqlstate, -- err_message, -- substring(query, 1, 200) AS query_preview -- FROM pg_stat_ch.errors_recent -- WHERE ts_start > now() - INTERVAL 1 HOUR -- ORDER BY ts_start DESC -- LIMIT 100; -- -- EXAMPLE: Error breakdown by SQLSTATE -- -- SELECT -- err_sqlstate, -- count() AS occurrences, -- uniq(query_id) AS unique_queries, -- any(err_message) AS sample_message -- FROM pg_stat_ch.errors_recent -- WHERE ts_start > now() - INTERVAL 24 HOUR -- GROUP BY err_sqlstate -- ORDER BY occurrences DESC; -- -- COMMON SQLSTATE CODES: -- 23505 - unique_violation (duplicate key) -- 42P01 - undefined_table -- 42703 - undefined_column -- 23503 - foreign_key_violation -- 42601 - syntax_error -- 57014 - query_canceled (statement_timeout) -- 40001 - serialization_failure (retry transaction) -- 53100 - disk_full -- 53200 - out_of_memory -- -- ============================================================================ DROP TABLE IF EXISTS pg_stat_ch.errors_recent; CREATE MATERIALIZED VIEW pg_stat_ch.errors_recent ENGINE = MergeTree PARTITION BY toDate(ts_start) ORDER BY ts_start TTL toDateTime(ts_start) + INTERVAL 7 DAY DELETE AS SELECT ts_start, db, username, app, client_addr, pid, query_id, err_sqlstate, err_elevel, err_message, query FROM pg_stat_ch.events_raw WHERE err_elevel > 0;