// Copyright (c) 2023-2025 ParadeDB, Inc.
//
// This file is part of ParadeDB - Postgres for Search and Analytics
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.

mod fixtures;

use fixtures::*;
use futures::executor::block_on;
use lockfree_object_pool::MutexObjectPool;
use proptest::prelude::*;
use proptest_derive::Arbitrary;
use rstest::*;
use sqlx::PgConnection;
use std::fmt::Debug;

use crate::fixtures::querygen::opexprgen::{ArrayQuantifier, Operator, ScalarArrayOperator};
use crate::fixtures::querygen::{compare, PgGucs};

#[derive(Debug, Clone, Arbitrary)]
pub enum TokenizerType {
    Default,
    Keyword,
}

impl TokenizerType {
    fn to_index_config(&self) -> &'static str {
        match self {
            TokenizerType::Default => r#""tokenizer": {"type": "default"}"#,
            TokenizerType::Keyword => r#""tokenizer": {"type": "keyword"}"#,
        }
    }
}

#[derive(Debug, Clone, Arbitrary)]
pub enum ColumnType {
    Text,
    Integer,
    Boolean,
    Timestamp,
    Uuid,
}

impl ColumnType {
    fn column_name(&self) -> &'static str {
        match self {
            ColumnType::Text => "text_col",
            ColumnType::Integer => "int_col",
            ColumnType::Boolean => "bool_col",
            ColumnType::Timestamp => "ts_col",
            ColumnType::Uuid => "uuid_col",
        }
    }
}

#[derive(Debug, Clone, Arbitrary)]
pub enum ArrayOperation {
    OperatorQuantifier {
        operator: Operator,
        quantifier: ArrayQuantifier,
    },
    ScalarArray {
        operator: ScalarArrayOperator,
    },
}

#[derive(Debug, Clone, Arbitrary)]
pub struct ScalarArrayExpr {
    column_type: ColumnType,
    operation: ArrayOperation,
    tokenizer: TokenizerType,
    include_null: bool,
}

impl ScalarArrayExpr {
    fn sample_values(&self) -> impl Strategy<Value = String> {
        let values = match self.column_type {
            ColumnType::Text => vec![
                "'apple'".to_string(),
                "'banana'".to_string(),
                "'cherry'".to_string(),
                "'date'".to_string(),
                "'elderberry'".to_string(),
            ],
            ColumnType::Integer => vec![
                "1".to_string(),
                "2".to_string(),
                "3".to_string(),
                "42".to_string(),
                "100".to_string(),
            ],
            ColumnType::Boolean => {
                vec!["true".to_string(), "false".to_string()]
            }
            ColumnType::Timestamp => vec![
                "'2023-01-01 00:00:00'::timestamp".to_string(),
                "'2023-06-15 12:30:00'::timestamp".to_string(),
                "'2024-01-01 00:00:00'::timestamp".to_string(),
                "'2024-06-01 09:15:00'::timestamp".to_string(),
            ],
            ColumnType::Uuid => vec![
                "'550e8400-e29b-41d4-a716-446655440000'::uuid".to_string(),
                "'6ba7b810-9dad-11d1-80b4-00c04fd430c8'::uuid".to_string(),
                "'6ba7b811-9dad-11d1-80b4-00c04fd430c8'::uuid".to_string(),
                "'12345678-1234-5678-9abc-123456789abc'::uuid".to_string(),
            ],
        };
        proptest::sample::select(values)
    }

    fn null_value(&self) -> String {
        match self.column_type {
            ColumnType::Text => "NULL::text".to_string(),
            ColumnType::Integer => "NULL::integer".to_string(),
            ColumnType::Boolean => "NULL::boolean".to_string(),
            ColumnType::Timestamp => "NULL::timestamp".to_string(),
            ColumnType::Uuid => "NULL::uuid".to_string(),
        }
    }
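    /// Renders this expression as a SQL predicate against its column: either
    /// `column <operator> <quantifier>(ARRAY[...])` for the operator/quantifier form,
    /// or `column <operator> (v1, v2, ...)` for the scalar-array form, appending a
    /// typed NULL to the value list when `include_null` is set.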
    fn to_sql(&self, values: &[String]) -> String {
        let column = self.column_type.column_name();

        // Add NULL to values if include_null is true
        let mut final_values = values.to_vec();
        if self.include_null {
            final_values.push(self.null_value());
        }

        match &self.operation {
            ArrayOperation::OperatorQuantifier {
                operator,
                quantifier,
            } => {
                let op = operator.to_sql();
                let quant = quantifier.to_sql();
                let array_literal = format!("ARRAY[{}]", final_values.join(", "));
                format!("{column} {op} {quant}({array_literal})")
            }
            ArrayOperation::ScalarArray { operator } => {
                let op = operator.to_sql();
                format!("{} {} ({})", column, op, final_values.join(", "))
            }
        }
    }
}

fn scalar_array_setup(conn: &mut PgConnection, tokenizer: TokenizerType) -> String {
    "CREATE EXTENSION IF NOT EXISTS pg_search;".execute(conn);
    "SET log_error_verbosity TO VERBOSE;".execute(conn);
    "SET log_min_duration_statement TO 1000;".execute(conn);

    let setup_sql = format!(
        r#"
DROP TABLE IF EXISTS scalar_array_test;
CREATE TABLE scalar_array_test (
    id SERIAL8 NOT NULL PRIMARY KEY,
    text_col TEXT,
    int_col INTEGER,
    bool_col BOOLEAN,
    ts_col TIMESTAMP,
    uuid_col UUID
);

-- Insert test data
INSERT INTO scalar_array_test (text_col, int_col, bool_col, ts_col, uuid_col) VALUES
('apple', 1, true, '2023-01-01 00:00:00', '550e8400-e29b-41d4-a716-446655440000'),
('Apple', 2, false, '2023-06-15 12:30:00', '6ba7b810-9dad-11d1-80b4-00c04fd430c8'),
('Apple Tree', 3, true, '2024-01-01 00:00:00', '6ba7b811-9dad-11d1-80b4-00c04fd430c8'),
('banana', 42, false, '2023-12-25 18:00:00', '12345678-1234-5678-9abc-123456789abc'),
('banana bunch', 100, true, '2024-06-01 09:15:00', '550e8400-e29b-41d4-a716-446655440001'),
('Ripe Banana', 1, false, '2023-03-15 14:20:00', '6ba7b810-9dad-11d1-80b4-00c04fd430c9'),
('banana', 2, true, '2023-09-30 20:45:00', '6ba7b811-9dad-11d1-80b4-00c04fd430c9'),
('banana', 3, false, '2024-02-14 11:30:00', '12345678-1234-5678-9abc-123456789abd'),
-- Rows with NULL values
(NULL, 4, true, '2024-03-01 10:00:00', '550e8400-e29b-41d4-a716-446655440002'),
('cherry', NULL, false, '2024-04-01 11:00:00', '6ba7b810-9dad-11d1-80b4-00c04fd430ca'),
('date', 42, NULL, '2024-05-01 12:00:00', '6ba7b811-9dad-11d1-80b4-00c04fd430ca'),
('elderberry', 2, true, NULL, '12345678-1234-5678-9abc-123456789abe'),
('cherry', 1, false, '2024-07-01 14:00:00', NULL);

-- Create BM25 index with configurable tokenizer
CREATE INDEX idx_scalar_array_test ON scalar_array_test
USING bm25 (id, text_col, int_col, bool_col, ts_col, uuid_col)
WITH (
    key_field = 'id',
    text_fields = '{{ "text_col": {{ {} }}, "uuid_col": {{ {} }} }}'
);

-- help our cost estimates
ANALYZE scalar_array_test;
"#,
        tokenizer.to_index_config(),
        tokenizer.to_index_config()
    );

    setup_sql.clone().execute(conn);
    setup_sql
}

#[rstest]
#[tokio::test]
async fn scalar_array_pushdown_correctness(database: Db) {
    let pool = MutexObjectPool::<PgConnection>::new(
        move || block_on(async { database.connection().await }),
        |_| {},
    );

    proptest!(|(
        (expr, selected_values) in any::<ScalarArrayExpr>()
            .prop_flat_map(|expr| {
                let values_strategy = proptest::collection::vec(expr.sample_values(), 1..4);
                (Just(expr), values_strategy)
            }),
        gucs in any::<PgGucs>(),
    )| {
        let setup_sql = scalar_array_setup(&mut pool.pull(), expr.tokenizer.clone());
        eprintln!("Setup SQL:\n{setup_sql}");

        let array_condition = expr.to_sql(&selected_values);

        // Test SELECT queries with actual results
        let pg_query = format!(
            "SELECT id, text_col FROM scalar_array_test WHERE {array_condition} ORDER BY id"
        );
        let bm25_query = format!(
            "SELECT id, text_col FROM scalar_array_test WHERE {array_condition} ORDER BY id"
        );

        compare(
            pg_query,
            bm25_query,
            gucs,
            &mut pool.pull(),
            |query, conn| {
                query.fetch::<(i64, Option<String>)>(conn)
            },
        )?;
    });
}