#include "postgres.h" #include #include "nodes/pg_list.h" #include "utils/datum.h" #include "utils/memutils.h" #include "pg_diffix/aggregation/led.h" #include "pg_diffix/utils.h" typedef Bucket *BucketRef; /* Treat a bucket pointer as opaque in this file. */ #define MAX_SIBLINGS 3 typedef struct BucketSiblings { BucketRef values[MAX_SIBLINGS]; /* (up to) First 3 buckets in subset */ } BucketSiblings; typedef struct SiblingsTrackerData { BucketDescriptor *bucket_desc; /* Reference to bucket descriptor */ int skipped_column; /* Skipped column for subsets */ } SiblingsTrackerData; typedef struct SiblingsTrackerEntry { BucketRef key; /* First bucket in subset */ BucketSiblings *siblings; /* Pointer to siblings in subset */ uint32 hash; /* Memorized hash */ char status; /* Required for hash table */ } SiblingsTrackerEntry; static inline bool subset_equals(SiblingsTrackerData *data, BucketRef a, BucketRef b) { BucketDescriptor *bucket_desc = data->bucket_desc; int skipped_column = data->skipped_column; int num_labels = bucket_desc->num_labels; for (int i = 0; i < num_labels; i++) { if (i == skipped_column) continue; if (a->is_null[i] != b->is_null[i]) return false; /* NULL vs non-NULL. */ if (a->is_null[i]) continue; /* Both NULL. */ BucketAttribute *att = &bucket_desc->attrs[i]; if (!datumIsEqual(a->values[i], b->values[i], att->typ_byval, att->typ_len)) return false; } return true; } static inline uint32 subset_hash(SiblingsTrackerData *data, BucketRef bucket) { BucketDescriptor *bucket_desc = data->bucket_desc; int skipped_column = data->skipped_column; int num_labels = bucket_desc->num_labels; uint32 hash = 0; for (int i = 0; i < num_labels; i++) { if (i == skipped_column) continue; uint32 label_hash = bucket->is_null[i] ? 0 : (uint32)hash_datum(bucket->values[i], bucket_desc->attrs[i].typ_byval, bucket_desc->attrs[i].typ_len); hash ^= label_hash; } return hash; } /* * Declarations for HashTable */ #define SH_PREFIX SiblingsTracker #define SH_ELEMENT_TYPE SiblingsTrackerEntry #define SH_KEY key #define SH_KEY_TYPE BucketRef #define DATA(tb) ((SiblingsTrackerData *)tb->private_data) #define SH_EQUAL(tb, a, b) subset_equals(DATA(tb), a, b) #define SH_HASH_KEY(tb, key) subset_hash(DATA(tb), key) #define SH_STORE_HASH #define SH_GET_HASH(tb, entry) entry->hash #define SH_SCOPE static inline #define SH_DECLARE #define SH_DEFINE #include "lib/simplehash.h" static inline void add_sibling(BucketSiblings *siblings, BucketRef bucket) { for (int i = 0; i < MAX_SIBLINGS; i++) { if (siblings->values[i] == NULL) { siblings->values[i] = bucket; return; } } /* Array is full, do nothing. */ } static inline int siblings_length(BucketSiblings *siblings) { int len = 0; for (int i = 0; i < MAX_SIBLINGS; i++) { if (siblings->values[i] == NULL) break; else len++; } return len; } void led_hook(List *buckets, BucketDescriptor *bucket_desc) { int num_buckets = list_length(buckets); int num_labels = bucket_desc->num_labels; /* * LED requires at least 2 columns - an isolating column and an unknown column. * With exactly 2 columns attacks are not useful because * they would have to isolate victims against the whole dataset. */ if (num_labels <= 2) return; MemoryContext led_context = AllocSetContextCreate(bucket_desc->bucket_context, "led_hook context", ALLOCSET_DEFAULT_SIZES); MemoryContext temp_context = AllocSetContextCreate(led_context, "led_hook temporary context", ALLOCSET_DEFAULT_SIZES); MemoryContext old_context = MemoryContextSwitchTo(temp_context); /* * For each column, we build a cache where we group buckets by their labels EXCLUDING that column. * This means that every cache will associate siblings where the respective column is different. */ SiblingsTracker_hash **trackers_per_column = palloc(num_labels * sizeof(SiblingsTracker_hash *)); SiblingsTrackerData *data = palloc(num_labels * sizeof(SiblingsTrackerData)); for (int i = 0; i < num_labels; i++) { data[i].bucket_desc = bucket_desc; data[i].skipped_column = i; trackers_per_column[i] = SiblingsTracker_create(temp_context, num_buckets, &data[i]); } /* * Per bucket, per column pointers to siblings. For example, * if we have 3 columns (c), then for each bucket (b) we have: * [ b0c0, b0c1, b0c2, b1c0, b1c1, b1c2, b2c0 ... ] */ BucketSiblings **bucket_siblings = MemoryContextAllocZero( led_context, num_buckets * num_labels * sizeof(BucketSiblings *)); /* Fill hash table & associate siblings. Skip star bucket (bucket #0). */ for (int bucket_idx = 1; bucket_idx < num_buckets; bucket_idx++) { BucketRef bucket = (BucketRef)list_nth(buckets, bucket_idx); for (int column_idx = 0; column_idx < num_labels; column_idx++) { bool found; SiblingsTrackerEntry *entry = SiblingsTracker_insert(trackers_per_column[column_idx], bucket, &found); if (!found) entry->siblings = MemoryContextAllocZero(led_context, sizeof(BucketSiblings)); add_sibling(entry->siblings, bucket); bucket_siblings[bucket_idx * num_labels + column_idx] = entry->siblings; } } /* Free hash table. */ MemoryContextReset(temp_context); /* Temp storage to stage buckets for merging. */ BucketRef *current_merge_targets = MemoryContextAlloc(led_context, num_labels * sizeof(BucketRef)); /* Tracks all target buckets. We recompute their low count state after the loop. */ List *all_merge_targets = NIL; int buckets_merged = 0; int total_merges = 0; /* LED bucket loop */ for (int bucket_idx = 1; bucket_idx < num_buckets; bucket_idx++) { BucketRef bucket = (BucketRef)list_nth(buckets, bucket_idx); if (!bucket->low_count) continue; bool has_unknown_column = false; int isolating_columns = 0; for (int column_idx = 0; column_idx < num_labels; column_idx++) { BucketSiblings *siblings = bucket_siblings[bucket_idx * num_labels + column_idx]; int num_siblings = siblings_length(siblings); if (num_siblings == 1) { /* * No siblings for this column (num_siblings=1 means itself). * A column without siblings is an unknown column. */ Assert(siblings->values[0] == bucket); has_unknown_column = true; } else if (num_siblings == 2) { /* * Single sibling (self+other). Find it and check if it's high count. * A column with a single high-count sibling is an isolating column. */ BucketRef other_bucket = (siblings->values[0] == bucket) ? (siblings->values[1]) : (siblings->values[0]); if (!other_bucket->low_count) current_merge_targets[isolating_columns++] = other_bucket; } else { /* num_siblings=3 means there are multiple siblings and has no special meaning. */ } } /* We need both for merge conditions. */ if (!has_unknown_column || isolating_columns == 0) continue; for (int i = 0; i < isolating_columns; i++) { merge_bucket(current_merge_targets[i], bucket, bucket_desc); MemoryContextSwitchTo(led_context); all_merge_targets = list_append_unique(all_merge_targets, current_merge_targets[i]); MemoryContextSwitchTo(temp_context); } buckets_merged++; total_merges += isolating_columns; bucket->merged = true; /* Free any garbage from merging. */ MemoryContextReset(temp_context); } /* Recompute low count for merge targets. */ ListCell *cell; foreach (cell, all_merge_targets) { BucketRef bucket = (BucketRef)lfirst(cell); bucket->low_count = eval_low_count(bucket, bucket_desc); } DEBUG_LOG("[LED] Buckets merged: %i; Total merges: %i", buckets_merged, total_merges); MemoryContextSwitchTo(old_context); MemoryContextDelete(led_context); /* Also removes temp_context. */ }