-- Benchmark setup: build data once, outside iteration loop -- bench.sql measures regex throughput against this table per iteration CREATE EXTENSION IF NOT EXISTS re2; ALTER EXTENSION re2 UPDATE; -- index support needs >= 0.4 -- pg_trgm supplies gin_trgm_ops, the postgres side of the GIN index comparison CREATE EXTENSION IF NOT EXISTS pg_trgm; DROP TABLE IF EXISTS bench_data; CREATE TABLE bench_data AS SELECT id, -- emails (~40 chars) 'user' || (id % 10000)::text || '@example' || (id % 100)::text || '.com' AS email, -- log lines (~200 chars) repeat(chr(65 + (id % 26)), 20) || ' error_code=' || (id % 999)::text || ' path=/api/v' || (id % 5)::text || '/users/' || id::text || ' ip=192.168.' || (id % 256)::text || '.' || (id % 256)::text || ' ' || repeat(chr(97 + (id % 26)), 100) AS logline, -- long text (~2000 chars, per-row distinct via id-derived number; 400 words/row) repeat('the quick brown fox jumps over the lazy dog ' || lpad(id::text, 6, '0') || ' ', 40) AS longtext FROM generate_series(1, 10000) AS id; ANALYZE bench_data; SELECT count(*) FROM bench_data WHERE email IS NOT NULL; SELECT count(*) FROM bench_data WHERE logline IS NOT NULL; SELECT count(*) FROM bench_data WHERE longtext IS NOT NULL; -- Index-scan comparison table (bench.sql section 7). Larger than bench_data so -- index descent, not per-row overhead, dominates. Distinct ids keep prefixes -- and literals selective. DROP TABLE IF EXISTS bench_index_data; CREATE TABLE bench_index_data AS SELECT id, 'user' || id::text || '@example' || (id % 100)::text || '.com' AS email, repeat(chr(65 + (id % 26)), 20) || ' error_code=' || (id % 999)::text || ' path=/api/v' || (id % 5)::text || '/users/' || id::text || ' ip=192.168.' || (id % 256)::text || '.' || (id % 256)::text AS logline FROM generate_series(1, 100000) AS id; -- b-tree prefix range needs a bytewise-comparing opclass (text_pattern_ops) CREATE INDEX bench_index_email_btree ON bench_index_data (email text_pattern_ops); -- competing GIN trigram indexes on the same column: re2 FilteredRE2 vs pg_trgm CREATE INDEX bench_index_logline_re2 ON bench_index_data USING gin (logline gin_re2_ops); CREATE INDEX bench_index_logline_trgm ON bench_index_data USING gin (logline gin_trgm_ops); ANALYZE bench_index_data; SELECT count(*) FROM bench_index_data;