-- Index support (extension 0.4).
-- Assert index paths return identical results to sequential rechecks. Plan
-- shape is asserted only under forced settings (seqscan/bitmapscan off, after
-- VACUUM so index-only scan wins deterministically); the unforced b-tree
-- choice is left to the planner (varies by PG version).
-- ==== Planner support: b-tree range from a fixed prefix ====
CREATE TABLE re2idx_btree (s text);
INSERT INTO re2idx_btree SELECT 'user_' || g FROM generate_series(1, 5000) g;
INSERT INTO re2idx_btree SELECT 'acct_' || g FROM generate_series(1, 5000) g;
CREATE INDEX ON re2idx_btree (s text_pattern_ops);
ANALYZE re2idx_btree;
-- anchored literal prefix: index range vs seqscan agree
SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^user_12');
 count 
-------
   111
(1 row)

SET enable_indexscan = off; SET enable_bitmapscan = off;
SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^user_12');
 count 
-------
   111
(1 row)

RESET enable_indexscan; RESET enable_bitmapscan;
-- anchored with char-class tail still yields a fixed prefix
SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^acct_9[0-9]');
 count 
-------
   110
(1 row)

-- unanchored / top-level alternation are not prefixable: still correct
SELECT count(*) FROM re2idx_btree WHERE re2match(s, 'ser_12');
 count 
-------
   111
(1 row)

SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^user_1|^acct_1');
 count 
-------
  2222
(1 row)

-- EXPLAIN shows which patterns get a range (Index Cond) and which only
-- filter. Alternation inside a group and a POSIX class keep a fixed prefix.
-- '^h[éø]llo': é and ø share their first UTF-8 byte, but the derived prefix
-- stops at the whole character 'h', never mid-character, so the constant
-- stays printable in any client encoding. '^[éü]x' shares no complete
-- character, so no range. A non-capturing group keeps anchoring: er/ed share
-- 'use' hence a range, while user/acct share nothing hence none.
VACUUM ANALYZE re2idx_btree;
SET enable_seqscan = off; SET enable_bitmapscan = off;
EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^user_1(2|3)');
                                QUERY PLAN                                
--------------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_btree_s_idx on re2idx_btree
         Index Cond: ((s ~>=~ 'user_1'::text) AND (s ~<~ 'user_2'::text))
         Filter: re2match(s, '^user_1(2|3)'::text)
(4 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^acct_9[[:digit:]]');
                                QUERY PLAN                                
--------------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_btree_s_idx on re2idx_btree
         Index Cond: ((s ~>=~ 'acct_9'::text) AND (s ~<~ 'acct_:'::text))
         Filter: re2match(s, '^acct_9[[:digit:]]'::text)
(4 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^h[éø]llo');
                           QUERY PLAN                           
----------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_btree_s_idx on re2idx_btree
         Index Cond: ((s ~>=~ 'h'::text) AND (s ~<~ 'i'::text))
         Filter: re2match(s, '^h[éø]llo'::text)
(4 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^[éü]x');
                           QUERY PLAN                           
----------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_btree_s_idx on re2idx_btree
         Filter: re2match(s, '^[éü]x'::text)
(3 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^us(?:er|ed)_1');
                             QUERY PLAN                             
--------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_btree_s_idx on re2idx_btree
         Index Cond: ((s ~>=~ 'use'::text) AND (s ~<~ 'usf'::text))
         Filter: re2match(s, '^us(?:er|ed)_1'::text)
(4 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^(?:user|acct)_1');
                           QUERY PLAN                           
----------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_btree_s_idx on re2idx_btree
         Filter: re2match(s, '^(?:user|acct)_1'::text)
(3 rows)

-- operator form reaches the same support function through its function
EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_btree WHERE s @~ '^user_1(2|3)';
                                QUERY PLAN                                
--------------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_btree_s_idx on re2idx_btree
         Index Cond: ((s ~>=~ 'user_1'::text) AND (s ~<~ 'user_2'::text))
         Filter: (s @~ '^user_1(2|3)'::text)
(4 rows)

SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^user_1(2|3)');
 count 
-------
   222
(1 row)

SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^us(?:er|ed)_1');
 count 
-------
  1111
(1 row)

SELECT count(*) FROM re2idx_btree WHERE re2match(s, '^(?:user|acct)_1');
 count 
-------
  2222
(1 row)

RESET enable_seqscan; RESET enable_bitmapscan;
DROP TABLE re2idx_btree;
-- ==== start_anchored soundness: constructs a byte scan can misread ====
-- Patterns a byte scan can misread into a wrong b-tree range that silently
-- drops rows: quantified caret, \Q..\E quoting, POSIX class member, leading
-- ] class member, and a [^] class whose leading ] member takes two skips,
-- folding them into one hides the top-level | behind a phantom class ('xab]'
-- matches yet a bogus [a,b) range skips it). EXPLAIN must show a filter with
-- no Index Cond; forced index counts must match seqscan (2, 5, 5, 2, 5).
CREATE TABLE re2idx_anchor (s text);
INSERT INTO re2idx_anchor VALUES ('abc'),('xabc'),('aa('),('xab'),('a5'),('x(y'),('axz'),('xab]');
CREATE INDEX ON re2idx_anchor (s text_pattern_ops);
VACUUM ANALYZE re2idx_anchor;
SET enable_seqscan = off; SET enable_bitmapscan = off;
EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^?abc');
                            QUERY PLAN                            
------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_anchor_s_idx on re2idx_anchor
         Filter: re2match(s, '^?abc'::text)
(3 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^aa\Q(\E|ab');
                            QUERY PLAN                            
------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_anchor_s_idx on re2idx_anchor
         Filter: re2match(s, '^aa\Q(\E|ab'::text)
(3 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^a[[:digit:](]|ab');
                            QUERY PLAN                            
------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_anchor_s_idx on re2idx_anchor
         Filter: re2match(s, '^a[[:digit:](]|ab'::text)
(3 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^x[](]y|xz');
                            QUERY PLAN                            
------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_anchor_s_idx on re2idx_anchor
         Filter: re2match(s, '^x[](]y|xz'::text)
(3 rows)

EXPLAIN (COSTS OFF) SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^a[^](]|ab]');
                            QUERY PLAN                            
------------------------------------------------------------------
 Aggregate
   ->  Index Only Scan using re2idx_anchor_s_idx on re2idx_anchor
         Filter: re2match(s, '^a[^](]|ab]'::text)
(3 rows)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^?abc');
 count 
-------
     2
(1 row)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^aa\Q(\E|ab');
 count 
-------
     5
(1 row)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^a[[:digit:](]|ab');
 count 
-------
     5
(1 row)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^x[](]y|xz');
 count 
-------
     2
(1 row)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^a[^](]|ab]');
 count 
-------
     5
(1 row)

SET enable_seqscan = on; SET enable_indexscan = off;
SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^?abc');
 count 
-------
     2
(1 row)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^aa\Q(\E|ab');
 count 
-------
     5
(1 row)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^a[[:digit:](]|ab');
 count 
-------
     5
(1 row)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^x[](]y|xz');
 count 
-------
     2
(1 row)

SELECT count(*) FROM re2idx_anchor WHERE re2match(s, '^a[^](]|ab]');
 count 
-------
     5
(1 row)

RESET enable_seqscan; RESET enable_indexscan; RESET enable_bitmapscan;
DROP TABLE re2idx_anchor;
-- ==== Selectivity: bare re2match() estimates from column stats ====
-- Support function answers SupportRequestSelectivity through the @~ operator:
-- a constant pattern is run over the column's most-common-value statistics,
-- replacing the flat 0.3333 boolean-function default (3333 rows here). Two
-- distinct values on a fully-sampled table make stored fractions exact, so
-- estimates are stable.
CREATE FUNCTION re2idx_est(q text) RETURNS bigint LANGUAGE plpgsql AS $$
DECLARE j json;
BEGIN
	EXECUTE 'EXPLAIN (FORMAT JSON) ' || q INTO j;
	RETURN (j->0->'Plan'->>'Plan Rows')::bigint;
END $$;
CREATE TABLE re2idx_sel (s text);
INSERT INTO re2idx_sel SELECT CASE WHEN g % 100 = 0 THEN 'needle' ELSE 'haystack' END
FROM generate_series(1, 10000) g;
ANALYZE re2idx_sel;
-- statistics-driven: 1% of rows match
SELECT re2idx_est($$SELECT * FROM re2idx_sel WHERE re2match(s, 'needle')$$);
 re2idx_est 
------------
        100
(1 row)

-- matches every sampled value: full row count
SELECT re2idx_est($$SELECT * FROM re2idx_sel WHERE re2match(s, '.')$$);
 re2idx_est 
------------
      10000
(1 row)

-- operator form agrees with function form
SELECT re2idx_est($$SELECT * FROM re2idx_sel WHERE s @~ 'needle'$$);
 re2idx_est 
------------
        100
(1 row)

-- join clause: flat matching default (0.010) over the cross product
SELECT re2idx_est($$SELECT * FROM re2idx_sel a, re2idx_sel b WHERE re2match(a.s, b.s)$$);
 re2idx_est 
------------
    1000000
(1 row)

DROP TABLE re2idx_sel;
DROP FUNCTION re2idx_est;
-- ==== GIN trigram prefilter driven by RE2 FilteredRE2 ====
CREATE TABLE re2idx_gin (msg text);
INSERT INTO re2idx_gin SELECT 'error code ' || g FROM generate_series(1, 3000) g;
INSERT INTO re2idx_gin SELECT 'info message ' || g FROM generate_series(1, 3000) g;
-- non-ASCII (no digits, so the [0-9]+ counts above are undisturbed): RE2 folds
-- atoms with Unicode case folding while the index folds ASCII only.
INSERT INTO re2idx_gin SELECT 'diagnostic ÄBC' FROM generate_series(1, 100);
INSERT INTO re2idx_gin SELECT 'diagnostic äbc' FROM generate_series(1, 100);
CREATE INDEX ON re2idx_gin USING gin (msg gin_re2_ops);
ANALYZE re2idx_gin;
-- required literal atom
SELECT count(*) FROM re2idx_gin WHERE msg @~ 'error code';
 count 
-------
  3000
(1 row)

-- alternation: atoms on both branches
SELECT count(*) FROM re2idx_gin WHERE msg @~ 'error|message';
 count 
-------
  6000
(1 row)

-- lossy prefilter: recheck removes false positives (anchored tail)
SELECT count(*) FROM re2idx_gin WHERE msg @~ 'error code 999$';
 count 
-------
     1
(1 row)

-- pattern without a >=3-byte atom: full scan, still correct
SELECT count(*) FROM re2idx_gin WHERE msg @~ '[0-9]+';
 count 
-------
  6000
(1 row)

-- index path (@~) must equal the sequential recheck (re2match) for each.
-- 'error code 1'/'message 25' pair a common atom with a rare tail, exercising
-- the tri-state consistent's fast-scan skip; a wrong skip would drop matches.
-- 'ÄBC'/'äbc' force the non-ASCII scan-all fallback: RE2's Unicode-folded atom
-- would miss the ASCII-folded index, so those must scan-all and recheck. The
-- '\xC4BC' escapes are ASCII pattern text that RE2 still compiles to the same
-- non-ASCII atom bytes, so the fallback must inspect the compiled atom, not just
-- the raw pattern.
SET enable_seqscan = off;
-- forced index: confirm @~ rides the GIN index, not a penalised seqscan, for
-- both a selective atom and the non-ASCII scan-all fallback.
EXPLAIN (COSTS OFF) SELECT * FROM re2idx_gin WHERE msg @~ 'error code';
                   QUERY PLAN                    
-------------------------------------------------
 Bitmap Heap Scan on re2idx_gin
   Recheck Cond: (msg @~ 'error code'::text)
   ->  Bitmap Index Scan on re2idx_gin_msg_idx
         Index Cond: (msg @~ 'error code'::text)
(4 rows)

EXPLAIN (COSTS OFF) SELECT * FROM re2idx_gin WHERE msg @~ 'ÄBC';
                  QUERY PLAN                   
-----------------------------------------------
 Bitmap Heap Scan on re2idx_gin
   Recheck Cond: (msg @~ 'ÄBC'::text)
   ->  Bitmap Index Scan on re2idx_gin_msg_idx
         Index Cond: (msg @~ 'ÄBC'::text)
(4 rows)

EXPLAIN (COSTS OFF) SELECT * FROM re2idx_gin WHERE msg @~ '\xC4BC';
                  QUERY PLAN                   
-----------------------------------------------
 Bitmap Heap Scan on re2idx_gin
   Recheck Cond: (msg @~ '\xC4BC'::text)
   ->  Bitmap Index Scan on re2idx_gin_msg_idx
         Index Cond: (msg @~ '\xC4BC'::text)
(4 rows)

SELECT bool_and(idx = seq) AS gin_matches_re2match
FROM (
	SELECT
		(SELECT count(*) FROM re2idx_gin WHERE msg @~ p)		 AS idx,
		(SELECT count(*) FROM re2idx_gin WHERE re2match(msg, p)) AS seq
	FROM unnest(ARRAY['error code', 'error|message', 'error code 999$', '[0-9]+',
					  'error code 1', 'message 25', 'code 2500$',
					  'ÄBC', 'äbc', '\xC4BC', '(?i)\xC4BC',
					  -- alternation-heavy / repeat-trigram atoms: exercise
					  -- extract_query's cross-atom and within-atom dedup
					  'error code 1|info message 2|error code 3',
					  'errorerror|message']) p
) q;
 gin_matches_re2match 
----------------------
 t
(1 row)

RESET enable_seqscan;
DROP TABLE re2idx_gin;
-- ==== huge GIN-indexed value: dedup must stay under MaxAllocSize ====
-- Entries were sized 8 bytes per trigram before dedup, overflowing palloc's
-- 1GB cap past ~134MB of text a plain text column accepts.
CREATE TABLE re2idx_big (msg text);
CREATE INDEX ON re2idx_big USING gin (msg gin_re2_ops);
INSERT INTO re2idx_big VALUES ('start.' || repeat('ab', 75000000));
SET enable_seqscan = off;
SELECT count(*) FROM re2idx_big WHERE msg @~ 'start';
 count 
-------
     1
(1 row)

RESET enable_seqscan;
DROP TABLE re2idx_big;