/*---------------------------------------------------------------------
 *
 * sparql.c
 *   SPARQL-related functions for RDF data manipulation.
 *
 * Implements SPARQL 1.1 string functions, accessor functions, and
 * type checking.
 *
 * Copyright (C) 2022-2025 University of Münster, Germany
 *
 *---------------------------------------------------------------------
 */

#include "postgres.h"
#include "rdf_fdw.h"
#include "rdf_utils.h"
#include "rdfnode.h"
#include "sparql.h"
#include "lib/stringinfo.h"
#include "utils/builtins.h"
#include "utils/timestamp.h"
#include <ctype.h>				/* isalpha, isalnum */
#include <string.h>				/* strlen, strchr, strstr */

/*
 * lex_scan_quoted
 * ---------------
 *
 * Walks over the quoted part of an RDF literal using the escaping rules
 * of lex():
 *   - a doubled quote ("") stands for one literal quote character;
 *   - a backslash and the character following it are copied verbatim, so
 *     an escaped quote (\") never terminates the literal, while an escaped
 *     backslash (\\) followed by a quote does.
 *
 * input: points at the opening double quote of the literal
 * out:   buffer that receives the lexical form, or NULL to only locate
 *        the end of the quoted part
 *
 * returns: pointer to the character right after the closing quote, or
 *          NULL if the literal is not terminated
 */
static const char *
lex_scan_quoted(const char *input, StringInfo out)
{
	const char *p = input + 1;	/* skip opening quote */

	while (*p != '\0')
	{
		if (*p == '"')
		{
			if (p[1] != '"')
				return p + 1;	/* unescaped quote: end of literal */

			/* doubled quote: emit one quote and skip both */
			if (out)
				appendStringInfoChar(out, '"');
			p += 2;
			continue;
		}

		/* backslash escape: keep the backslash and the escaped character */
		if (*p == '\\' && p[1] != '\0')
		{
			if (out)
				appendStringInfoChar(out, *p);
			p++;
		}

		if (out)
			appendStringInfoChar(out, *p);
		p++;
	}

	return NULL;
}

/*
 * lex_is_lang_tag
 * ---------------
 *
 * Decides whether the text following an '@' in an unquoted value looks
 * like a language tag: up to eight leading letters, optionally followed
 * by '-' and a run of alphanumerics/hyphens, and not followed by '.' or
 * '@' (which would suggest something like an e-mail address).
 *
 * tag: points at the first character after the '@'
 */
static bool
lex_is_lang_tag(const char *tag)
{
	const char *p = tag;
	int			letters = 0;

	if (!isalpha((unsigned char) *p))
		return false;

	while (*p != '\0' && isalpha((unsigned char) *p) && letters < 8)
	{
		letters++;
		p++;
	}

	if (*p == '.' || *p == '@')
		return false;

	if (*p == '-')
	{
		p++;
		while (*p != '\0' && (isalnum((unsigned char) *p) || *p == '-'))
			p++;
	}

	return *p != '.' && *p != '@';
}

/*
 * lex
 * ---
 *
 * Extracts the lexical value of a given RDF literal.
 *
 *   - Quoted literal: the text between the quotes (see lex_scan_quoted);
 *     an unterminated literal is returned unchanged.
 *   - Value starting with '<': returned unchanged.
 *   - Anything else: returned up to a trailing language tag ('@...') or
 *     datatype marker ('^^'), whichever comes first.
 *
 * input: RDF literal (NULL is treated like an empty string)
 *
 * returns: lexical value of the RDF literal, always in a freshly
 *          allocated, writable buffer
 */
char *
lex(char *input)
{
	StringInfoData output;
	const char *at;
	const char *dt;
	const char *cut = NULL;

	elog(DEBUG1, "%s called: input='%s'", __func__, input ? input : "(null)");

	initStringInfo(&output);

	/* an empty StringInfo is already a valid, writable empty string */
	if (input == NULL || input[0] == '\0')
		return output.data;

	/* Handle quoted literal */
	if (input[0] == '"')
	{
		if (lex_scan_quoted(input, &output) == NULL)
		{
			/* No closing quote found: malformed, return whole string */
			resetStringInfo(&output);
			appendStringInfoString(&output, input);
		}
		return output.data;
	}

	/* Handle IRI */
	if (input[0] == '<')
	{
		appendStringInfoString(&output, input);
		return output.data;
	}

	/* Unquoted: trim at @ or ^^ only if they indicate language tag or datatype */
	at = strchr(input, '@');
	dt = strstr(input, "^^");

	if (at && (!dt || at < dt))
	{
		if (lex_is_lang_tag(at + 1))
			cut = at;
	}
	else if (dt)
		cut = dt;

	if (cut)
		appendBinaryStringInfo(&output, input, cut - input);
	else
		appendStringInfoString(&output, input);

	return output.data;
}

/*
 * lang
 * ----
 *
 * Extracts the language tag from an RDF literal, if present. Returns an
 * empty string if no language tag is found or if the input is
 * invalid/empty (including a quoted literal without a closing quote).
 *
 * input: Null-terminated C string representing an RDF literal (e.g.,
 *        "abc"@en, "123"^^xsd:int)
 *
 * returns: Null-terminated C string representing the language tag (e.g.,
 *          "en") or empty string
 */
char *
lang(char *input)
{
	const char *ptr;

	elog(DEBUG1, "%s called: input='%s'", __func__, input ? input : "(null)");

	if (input == NULL || input[0] == '\0')
		return pstrdup("");

	if (input[0] == '"')
	{
		/*
		 * Locate the end of the quoted part on the original text. Using the
		 * length of the lexical form instead would be wrong whenever the
		 * literal contains "" escapes or lacks its closing quote.
		 */
		ptr = lex_scan_quoted(input, NULL);
		if (ptr == NULL)
		{
			elog(DEBUG1, "%s exit: returning empty string", __func__);
			return pstrdup("");
		}
	}
	else
	{
		/* unquoted case: the lexical form is a prefix of the input */
		char	   *lexical_form = lex(input);

		ptr = input + strlen(lexical_form);
		pfree(lexical_form);
	}

	/* check for language tag */
	if (*ptr == '@')
	{
		StringInfoData buf;
		const char *tag_start = ptr + 1;
		const char *tag_end = tag_start;

		while (*tag_end != '\0' &&
			   (isalnum((unsigned char) *tag_end) || *tag_end == '-' || *tag_end == '_'))
			tag_end++;

		initStringInfo(&buf);
		appendBinaryStringInfo(&buf, tag_start, tag_end - tag_start);

		elog(DEBUG1, "%s exit: returning => '%s'", __func__, buf.data);
		return buf.data;
	}

	elog(DEBUG1, "%s exit: returning empty string", __func__);
	return pstrdup("");
}

/*
 * strlang
 * -------
 *
 * Constructs an RDF literal by combining a lexical value with a specified
 * language tag. The result is formatted as a language-tagged RDF literal.
 * Raises an error if the lexical form of the language tag is empty.
 *
 * literal:  Null-terminated C string representing an RDF literal or lexical
 *           value (e.g., "abc")
 * language: Null-terminated C string representing the language tag (e.g.,
 *           "en")
 *
 * returns: Null-terminated C string formatted as a language-tagged RDF
 *          literal (e.g., "abc"@en)
 */
char *
strlang(char *literal, char *language)
{
	StringInfoData buf;
	char	   *lex_language;
	char	   *lex_literal;

	elog(DEBUG1, "%s called: literal='%s', language='%s'", __func__,
		 literal ? literal : "(null)", language ? language : "(null)");

	lex_language = lex(language);
	lex_literal = lex(literal);

	if (lex_language[0] == '\0')
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("language tag cannot be empty")));

	initStringInfo(&buf);

	if (lex_literal[0] == '\0')
		appendStringInfo(&buf, "\"\"@%s", lex_language);
	else
		appendStringInfo(&buf, "%s@%s", str(literal), lex_language);

	elog(DEBUG1, "%s exit: returning => '%s'", __func__, buf.data);
	return buf.data;
}

/*
 * strstarts
 * ---------
 *
 * Implements the core logic for the SPARQL STRSTARTS function, returning true
 * if the lexical form of the first argument (string) starts with the lexical
 * form of the second argument (substring), or false if arguments are
 * incompatible or the condition fails. An empty substring is considered a
 * prefix of any string, per SPARQL behavior.
*
 * str:    Null-terminated C string representing an RDF literal or value
 *         (e.g., "foobar")
 * substr: Null-terminated C string representing an RDF literal or value
 *         (e.g., "foo")
 *
 * returns: C boolean (true if string starts with substring, false otherwise
 *          or if incompatible)
 */
bool
strstarts(char *str, char *substr)
{
	char	   *haystack = lex(str);
	char	   *needle = lex(substr);
	size_t		haystack_len = strlen(haystack);
	size_t		needle_len = strlen(needle);
	bool		is_prefix;

	elog(DEBUG1, "%s called: str='%s', substr='%s'", __func__, str, substr);

	/* language tags / datatypes of both arguments must be compatible */
	if (!LiteralsCompatible(str, substr))
	{
		elog(DEBUG1, "%s exit: returning 'false' (incompatible literals)", __func__);
		return false;
	}

	if (needle_len == 0)
	{
		elog(DEBUG1, "%s exit: returning 'true' (empty substring is a prefix of any string)", __func__);
		return true;
	}

	if (haystack_len < needle_len)
	{
		elog(DEBUG1, "%s exit: returning 'false' (substring longer than string cannot be a prefix)", __func__);
		return false;
	}

	/* both buffers hold at least needle_len bytes at this point */
	is_prefix = (memcmp(haystack, needle, needle_len) == 0);

	elog(DEBUG1, "%s exit: returning '%s'", __func__, is_prefix ? "true" : "false");
	return is_prefix;
}

/*
 * strends
 * -------
 *
 * Core logic of the SPARQL STRENDS function: returns true if the lexical
 * form of the first argument (string) ends with the lexical form of the
 * second argument (substring), and false if the arguments are
 * incompatible or the condition fails. An empty substring is considered a
 * suffix of any string, per SPARQL behavior.
*
 * str:    Null-terminated C string representing an RDF literal or value
 *         (e.g., "foobar")
 * substr: Null-terminated C string representing an RDF literal or value
 *         (e.g., "bar")
 *
 * returns: C boolean (true if string ends with substring, false otherwise
 *          or if incompatible)
 */
bool
strends(char *str, char *substr)
{
	char	   *haystack = lex(str);
	char	   *needle = lex(substr);
	size_t		haystack_len = strlen(haystack);
	size_t		needle_len = strlen(needle);
	bool		is_suffix;

	elog(DEBUG1, "%s called: str='%s', substr='%s'", __func__, str, substr);

	/* language tags / datatypes of both arguments must be compatible */
	if (!LiteralsCompatible(str, substr))
	{
		elog(DEBUG1, "%s exit: returning 'false' (incompatible literals)", __func__);
		return false;
	}

	if (needle_len == 0)
	{
		elog(DEBUG1, "%s exit: returning 'true' (an empty substring is a suffix of any string)", __func__);
		return true;
	}

	if (haystack_len < needle_len)
	{
		elog(DEBUG1, "%s exit: returning 'false' (substring longer than string cannot be a suffix)", __func__);
		return false;
	}

	/* compare the last needle_len bytes of the string with the substring */
	is_suffix = (memcmp(haystack + (haystack_len - needle_len), needle, needle_len) == 0);

	elog(DEBUG1, "%s exit: returning '%s'", __func__, is_suffix ? "true" : "false");
	return is_suffix;
}

/*
 * strdt
 * -----
 *
 * Constructs an RDF literal by combining a lexical value with a specified
 * datatype IRI. A datatype that is not already an IRI is passed through
 * ExpandDatatypePrefix (e.g., to expand "xsd:" to the full URI) and then
 * wrapped with iri().
*
 * literal:  Null-terminated C string representing an RDF literal or lexical
 *           value (e.g., "123")
 * datatype: Null-terminated C string representing the datatype IRI
 *           (e.g., "xsd:int", "foo:bar")
 *
 * returns: Null-terminated C string formatted as a datatype-tagged RDF
 *          literal, i.e. str(literal) followed by "^^" and the datatype
 *          (e.g., "123"^^<http://www.w3.org/2001/XMLSchema#int>)
 *
 * Raises an error if the datatype is empty or contains whitespace.
 */
char *
strdt(char *literal, char *datatype)
{
	StringInfoData typed;
	char	   *dt_lexical = lex(datatype);

	elog(DEBUG1, "%s called: literal='%s', datatype='%s'", __func__, literal, datatype);

	if (dt_lexical[0] == '\0')
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("datatype IRI cannot be empty")));

	if (ContainsWhitespaces(datatype))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("datatype IRI cannot contain whitespaces")));

	initStringInfo(&typed);

	if (!isIRI(datatype))
	{
		char	   *expanded;

		/* prefixed or bare datatype: expand it, then wrap it as an IRI */
		elog(DEBUG2, "%s: data type not an IRI", __func__);
		expanded = ExpandDatatypePrefix(dt_lexical);
		appendStringInfo(&typed, "%s^^%s", str(literal), iri(expanded));
	}
	else
	{
		/* already <...>: use the datatype exactly as given */
		appendStringInfo(&typed, "%s^^%s", str(literal), datatype);
	}

	elog(DEBUG1, "%s exit: returning => '%s'", __func__, typed.data);
	return typed.data;
}

/*
 * str
 * ---
 *
 * Extracts the lexical value of an RDF literal or the string form of an IRI
 * and returns it as a new RDF literal. If the input is empty or null,
 * returns an empty RDF literal.
* * input: Null-terminated C string representing an RDF literal or IRI * (e.g., "abc"@en, "") * * returns: Null-terminated C string formatted as an RDF literal * (e.g., "abc", "http://example.org") */ char *str(char *input) { StringInfoData buf; char *result; elog(DEBUG1, "%s called: input='%s'", __func__, input); if (!input || input[0] == '\0') { elog(DEBUG1, "%s exit: returning empty literal", __func__); return "\"\""; } if (isIRI(input)) { size_t len = strlen(input); initStringInfo(&buf); appendStringInfo(&buf, "\"%.*s\"", (int)(len - 2), input + 1); /* skip '<' and trim '>' */ elog(DEBUG1, "%s exit: returning IRI '%s'", __func__, buf.data); return buf.data; } result = cstring_to_rdfliteral(lex(input)); elog(DEBUG1, "%s exit: returning literal '%s'", __func__, result); return result; } /* * iri * --- * Converts a string to an IRI by wrapping it in angle brackets (< >), * mimicking SPARQL's IRI() function. Strips quotes and any language tags or * datatypes if present *only* for quoted literals. Raw strings and * pre-wrapped IRIs are preserved. */ char *iri(char *input) { StringInfoData buf; char *lexical; elog(DEBUG1, "%s called: input='%s'", __func__, input ? input : "(null)"); if (!input || *input == '\0') return "<>"; if (isIRI(input)) return pstrdup(input); initStringInfo(&buf); lexical = lex(input); appendStringInfo(&buf, "<%s>", lexical); elog(DEBUG1, "%s exit: returning wrapped IRI '%s'", __func__, buf.data); return pstrdup(buf.data); } /* * bnode * ----- * * Implements SPARQL’s BNODE function. Without arguments (input = NULL), * generates a unique blank node (e.g., "_:b123"). With a string argument, * returns a blank node based on the lexical form of the input (e.g., * BNODE("xyz") → "_:xyz"). Invalid inputs (e.g., IRIs, blank nodes, empty * literals) return NULL. * * input: Null-terminated C string (literal or bare string) for BNODE(str), * or NULL for BNODE(). 
*
 * returns: Null-terminated C string representing a blank node (e.g.,
 *          "_:xyz"), or NULL for invalid inputs.
 */
char *
bnode(char *input)
{
	static uint64 counter = 0;	/* Ensure uniqueness for BNODE() */
	StringInfoData buf;
	StringInfoData input_buf;
	char	   *lexical;
	char	   *p;

	/* NULL is the documented BNODE() call, so log it safely */
	elog(DEBUG1, "%s called: input='%s'", __func__, input ? input : "(null)");

	if (input == NULL)
	{
		/* BNODE(): Generate unique blank node using timestamp and counter */
		TimestampTz ts = GetCurrentTimestamp();
		uint64		unique_id = counter++ ^ (uint64) ts;

		initStringInfo(&buf);
		appendStringInfo(&buf, "_:b%llu", (unsigned long long) unique_id);

		elog(DEBUG1, "%s exit: returning '%s'", __func__, buf.data);
		return buf.data;
	}

	/* Reject IRIs and blank nodes explicitly */
	if (isIRI(input) || isBlank(input))
	{
		elog(DEBUG1, "%s exit: returning NULL (input either an IRI or a blank node)", __func__);
		return NULL;
	}

	/* Normalize input: quote bare strings */
	initStringInfo(&input_buf);
	if (*input != '"' && !strstr(input, "^^") && !strstr(input, "@"))
	{
		appendStringInfoChar(&input_buf, '"');
		appendStringInfoString(&input_buf, input);
		appendStringInfoChar(&input_buf, '"');
	}
	else
		appendStringInfoString(&input_buf, input);

	/* Validate input is a literal */
	if (!isLiteral(input_buf.data))
	{
		pfree(input_buf.data);
		elog(DEBUG1, "%s exit: returning NULL (input is not a literal)", __func__);
		return NULL;
	}

	/* Extract lexical form; lex() returns its own buffer */
	lexical = lex(input_buf.data);
	pfree(input_buf.data);

	if (lexical == NULL || lexical[0] == '\0')
	{
		elog(DEBUG1, "%s exit: returning NULL (lexical value either NULL or an empty string)", __func__);
		return NULL;
	}

	/* Create blank node ID: keep alphanumerics, replace everything else by '_' */
	initStringInfo(&buf);
	appendStringInfoString(&buf, "_:");
	for (p = lexical; *p != '\0'; p++)
		appendStringInfoChar(&buf, isalnum((unsigned char) *p) ? *p : '_');

	elog(DEBUG1, "%s exit: returning '%s'", __func__, buf.data);
	return buf.data;
}

/*
 * concat(text, text) returns text
 *
 * Implements the SPARQL CONCAT function.
*
 * Concatenates two RDF literals while preserving compatible language tags
 * or datatype annotations (specifically xsd:string). If both inputs share
 * the same language tag, the result will carry that tag. If both inputs are
 * typed as xsd:string, the result is typed as xsd:string.
 *
 * Mixing a simple literal with a language-tagged or xsd:string-typed value
 * results in a plain literal without type or language. Conflicting language
 * tags or unsupported datatypes raise an error.
 *
 * NULL inputs yield NULL. Empty strings are allowed and result in valid RDF
 * literals.
 */
char *
concat(char *left, char *right)
{
	char	   *lex1;
	char	   *lex2;
	char	   *dt1;
	char	   *dt2;
	char	   *lang1;
	char	   *lang2;
	char	   *result;
	bool		has_dt1;
	bool		has_dt2;
	bool		has_lang1;
	bool		has_lang2;
	StringInfoData buf;

	elog(DEBUG1, "%s called: left='%s', right='%s'", __func__,
		 left ? left : "(null)", right ? right : "(null)");

	/* honour the documented contract instead of dereferencing NULL below */
	if (left == NULL || right == NULL)
	{
		elog(DEBUG1, "%s exit: returning NULL (NULL argument)", __func__);
		return NULL;
	}

	lex1 = lex(left);
	lex2 = lex(right);
	dt1 = datatype(left);
	dt2 = datatype(right);
	lang1 = lang(left);
	lang2 = lang(right);

	has_dt1 = (dt1[0] != '\0');
	has_dt2 = (dt2[0] != '\0');
	has_lang1 = (lang1[0] != '\0');
	has_lang2 = (lang2[0] != '\0');

	initStringInfo(&buf);
	appendStringInfoString(&buf, lex1);
	appendStringInfoString(&buf, lex2);

	/* Check for conflicting language tags */
	if (has_lang1 && has_lang2 && strcmp(lang1, lang2) != 0)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("CONCAT arguments have conflicting language tags: '%s' and '%s'", lang1, lang2)));

	/* any datatype other than RDF_SIMPLE_LITERAL_DATATYPE is rejected */
	if ((has_dt1 && strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE) != 0) ||
		(has_dt2 && strcmp(dt2, RDF_SIMPLE_LITERAL_DATATYPE) != 0))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("CONCAT arguments must be simple literals or '%s'", RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED)));

	/* only one argument has a datatype */
	if (has_dt1 != has_dt2)
	{
		result = cstring_to_rdfliteral(buf.data);
		elog(DEBUG1, "%s exit: returning '%s' (only one argument has a datatype)", __func__, result);
		return result;
	}

	/* only one argument has a language tag */
	if (has_lang1 != has_lang2)
	{
		result = cstring_to_rdfliteral(buf.data);
		elog(DEBUG1, "%s exit: returning '%s' (only one argument has a language tag)", __func__, result);
		return result;
	}

	/* both arguments carry the same language tag: re-wrap result */
	if (has_lang1 || has_lang2)
	{
		result = strlang(buf.data, has_lang1 ? lang1 : lang2);
		elog(DEBUG1, "%s exit: returning '%s' (re-wrapping result appropriately)", __func__, result);
		return result;
	}

	/*
	 * Both arguments are typed. The check above already raised an error for
	 * anything but RDF_SIMPLE_LITERAL_DATATYPE, so no further comparison is
	 * needed here.
	 */
	if (has_dt1 || has_dt2)
	{
		result = strdt(buf.data, RDF_SIMPLE_LITERAL_DATATYPE);
		elog(DEBUG1, "%s exit: returning '%s' (either left or right argument has a simple literal data type - %s)", __func__, result, RDF_SIMPLE_LITERAL_DATATYPE);
		return result;
	}

	result = cstring_to_rdfliteral(buf.data);

	elog(DEBUG1, "%s exit: returning '%s'", __func__, result);
	return result;
}

/*
 * isIRI
 * -----
 * Checks if a string is an RDF IRI. A valid IRI must:
 *   - Be at least three characters long
 *   - Start with '<' and end with '>'
 *   - Not contain spaces, tabs, line breaks or double quotes
 * No colon is required, so relative IRIs (e.g., <foo>) are accepted as
 * well as absolute ones.
 *
 * returns: true if the input is an IRI, false otherwise (including NULL)
 */
bool
isIRI(char *input)
{
	size_t		len;

	if (input == NULL)
		return false;

	len = strlen(input);

	/* Must be enclosed in <...> with at least one character inside */
	if (len < 3 || input[0] != '<' || input[len - 1] != '>')
		return false;

	/*
	 * The delimiters themselves are not in the reject set, so any hit is an
	 * illegal character inside the IRI.
	 */
	return strpbrk(input, "\" \n\r\t") == NULL;
}

/*
 * isBlank
 * -------
 *
 * Mimics SPARQL's isBlank function. Checks if the input is a blank node.
 * Returns true if the term starts with "_:", false otherwise.
* * term: Null-terminated C string, an RDF term (e.g., "_:b1", "", "\"hello\"") * * returns: Boolean (true if blank node, false otherwise) */ bool isBlank(char *term) { bool result; elog(DEBUG1, "%s called: term='%s'", __func__, term); /* Handle NULL or empty input */ if (!term || strlen(term) == 0) { elog(DEBUG1, "%s exit: returning 'false' (invalid input)", __func__); return false; } /* Check if term starts with "_:" and has at least 3 characters */ result = (strncmp(term, "_:", 2) == 0) && strlen(term) > 2; elog(DEBUG1, "%s exit: returning '%s'", __func__, result ? "true" : "false"); return result; } /* * isLiteral * --------- * * Checks if an RDF term is a literal per SPARQL 1.1 spec. Returns true for simple * literals (e.g., "\"hello\""), language-tagged literals (e.g., "\"hello\"@en"), * or typed literals (e.g., "\"12\"^^xsd:integer"). Returns false for IRIs * (e.g., ""), blank nodes (e.g., "_:bnode"), bare numbers * (e.g., "123"), empty strings, or invalid inputs. * * term: Null-terminated C string representing an RDF term * * returns: Boolean (1 for literal, 0 otherwise) */ bool isLiteral(char *term) { const char *ptr; int len; elog(DEBUG1, "%s called: term='%s'", __func__, term); if (!term || *term == '\0') { elog(DEBUG1, "%s exit: returning 'false' (term either NULL or has no '\\0')", __func__); return false; } /* Exclude IRIs and blank nodes first */ if (isIRI(term) || isBlank(term)) { elog(DEBUG1, "%s exit: returning 'false' (either an IRI or a blank node)", __func__); return false; } /* Normalize input */ ptr = cstring_to_rdfliteral(term); len = strlen(ptr); /* Check for valid quoted literal */ if (*ptr == '"') { if (len >= 2) { const char *tag = strstr(ptr, "^^"); const char *lang_tag = strstr(ptr, "@"); /* Typed literal: has ^^ followed by datatype */ if (tag && tag > ptr + 1 && *(tag - 1) == '"' && (!lang_tag || lang_tag > tag)) { const char *dt_start = tag + 2; if (*dt_start != '\0' && (*dt_start != '<' || *(dt_start + 1) != '>')) { elog(DEBUG1, 
"%s exit: returning 'true' (valid datatype)", __func__); return true; } /* Valid datatype */ } /* Language-tagged literal: has @ with language tag */ else if (lang_tag && lang_tag > ptr + 1 && *(lang_tag - 1) == '"' && *(lang_tag + 1) != '\0') { elog(DEBUG1, "%s exit: returning 'true' (literal has a language tag)", __func__); return true; } /* Simple literal: quoted string, no ^^ or @ */ else if (ptr[len - 1] == '"') { elog(DEBUG1, "%s exit: returning 'true' (simple literal - no ^^ or @)", __func__); return true; } } else if (len == 1) { /* Empty quoted literal "" */ elog(DEBUG1, "%s exit: returning 'true' (empty quoted literal)", __func__); return true; } } /* Invalid or non-literal */ elog(DEBUG1, "%s exit: returning 'false' (invalid or non-literal)", __func__); return false; } /* * langmatches * ----------- * * Mimics SPARQL's LANGMATCHES function. Compares a language tag against a pattern, * supporting basic matching and wildcards (*). Case-insensitive per RFC 4647. * Returns true if the language tag matches the pattern, false otherwise. 
*
 * lang_tag: Null-terminated C string, typically a language tag (e.g., "en" from lang())
 * pattern:  Null-terminated C string, language range (e.g., "en", "en-*", "*")
 *
 * Both arguments are reduced to their lexical form with lex() before they
 * are compared. NULL for either argument yields false.
 *
 * returns: Boolean (true if lang_tag matches pattern, false otherwise)
 */
bool langmatches(char *lang_tag, char *pattern)
{
	char *tag;
	char *pat;
	bool result;

	/* NOTE(review): the arguments are logged before the NULL check below */
	elog(DEBUG1, "%s called: lang_tag='%s', pattern='%s'", __func__, lang_tag, pattern);

	/* Handle NULL inputs */
	if (!lang_tag || !pattern)
	{
		elog(DEBUG1, "%s exit: returning 'false' (invalid input)", __func__);
		return false;
	}

	pattern = lex(pattern);
	tag = lex(lang_tag); /* e.g., "en" from lang('"foo"@en') */

	/*
	 * Handle pattern: bare string or quoted literal. If the already
	 * extracted pattern still starts with a quote and contains a second
	 * one, it is passed through lex() once more.
	 */
	if (pattern[0] == '"' && strrchr(pattern, '"') > pattern)
		pat = lex(pattern); /* e.g., "\"en\"" -> "en" */
	else
		pat = pattern; /* e.g., "en" as-is */

	/*
	 * Empty tag only matches "*" pattern (case-insensitive).
	 * NOTE(review): SPARQL 1.1 describes "*" as matching any non-empty
	 * tag; confirm that accepting an empty tag here is intended.
	 */
	if (strlen(tag) == 0)
	{
		result = (strcasecmp(pat, "*") == 0);
		elog(DEBUG1, "%s exit: returning '%s' (empty tag, pattern='%s')", __func__, result ? "true" : "false", pat);
		return result;
	}

	/* Exact match (case-insensitive) */
	if (strcasecmp(tag, pat) == 0)
	{
		result = true;
	}
	/* Wildcard match: "*" matches any non-empty tag */
	else if (strcasecmp(pat, "*") == 0)
	{
		result = true;
	}
	/* SPARQL rule: prefix match with hyphen, e.g. "en" matches "en-US" */
	else if (strncasecmp(tag, pat, strlen(pat)) == 0 && tag[strlen(pat)] == '-')
	{
		result = true;
	}
	/* Prefix match with wildcard (e.g., "en-*" matches "en" or "en-us") */
	else if (strchr(pat, '*'))
	{
		char *prefix_end = strchr(pat, '*');	/* first '*' in the pattern */
		size_t prefix_len = prefix_end - pat;	/* pattern length up to the '*' */
		size_t tag_len = strlen(tag);

		/*
		 * prefix_len - 1 is the pattern prefix without its last character
		 * (the '-' in "en-*"); the tag must start with that prefix.
		 */
		if (prefix_len > 0 && tag_len >= (prefix_len - 1) && strncasecmp(tag, pat, prefix_len - 1) == 0)
		{
			/*
			 * Accept either the bare prefix ("en") or the prefix followed
			 * by '-' and more text, the latter only when '*' is the last
			 * character of the pattern.
			 */
			if (tag_len == prefix_len - 1 || (tag_len > prefix_len && tag[prefix_len - 1] == '-' && prefix_end[1] == '\0'))
			{
				result = true;
			}
			else
			{
				result = false;
			}
		}
		else
		{
			result = false;
		}
	}
	else
	{
		result = false;
	}

	elog(DEBUG1, "%s exit: returning '%s' (tag='%s', pat='%s')", __func__, result ? "true" : "false", tag, pat);
	return result;
}

/*
 * datatype
 * --------
 *
 * Extracts the datatype URI of an RDF literal.
 * Returns "" for simple literals and language-tagged literals.
 * For typed literals with an xsd: prefix (bare or inside <...>), the prefix is
 * replaced by RDF_XSD_BASE_URI; other datatypes are taken as written. The
 * extracted datatype is passed through iri() before it is returned.
 * Returns "" for NULL, empty, malformed or unrecognized inputs.
 *
 * input: Null-terminated C string representing an RDF literal (e.g., "123"^^xsd:int, "abc"@en, "xyz")
 *
 * returns: Null-terminated C string with the result of iri() applied to the
 *          datatype URI (e.g., for "http://www.w3.org/2001/XMLSchema#int"), or ""
 */
char *datatype(char *input)
{
	StringInfoData buf;
	const char *ptr;
	int len;

	elog(DEBUG1, "%s called: input='%s'", __func__, input ? input : "(null)");

	if (input == NULL || *input == '\0')
	{
		elog(DEBUG1, "%s exit: returning empty string for NULL or empty input", __func__);
		return "";
	}

	/* work on the normalized form of the input */
	ptr = cstring_to_rdfliteral(input);
	len = strlen(ptr);

	initStringInfo(&buf);

	if (*ptr == '"')
	{
		const char *tag = strstr(ptr, "^^");		/* first datatype marker, if any */
		const char *lang_tag = strstr(ptr, "@");	/* first '@', if any */

		/* check for datatype first: ^^ must directly follow a quote and precede any '@' */
		if (tag && tag > ptr + 1 && *(tag - 1) == '"' && (!lang_tag || lang_tag > tag)) /* datatype takes precedence */
		{
			const char *dt_start = tag + 2; /* skip ^^ */
			const char *dt_end = dt_start;

			/* find the end of the datatype */
			if (*dt_start == '<')
			{
				while (*dt_end && *dt_end != '>')
					dt_end++;
				if (*dt_end != '>') /* ensure proper closing */
				{
					elog(DEBUG1, "%s exit: returning empty string (malformed datatype IRI, missing '>')", __func__);
					return "";
				}
				dt_end++; /* include > */
			}
			else
			{
				/* bare/prefixed datatype ends at a space, '>' or '@' */
				while (*dt_end && *dt_end != ' ' && *dt_end != '>' && *dt_end != '@')
					dt_end++;
			}

			if (dt_start < dt_end)
			{
				char *res = "";

				/* handle xsd: prefix */
				if (strncmp(dt_start, "xsd:", 4) == 0 && dt_end - dt_start > 4)
				{
					appendStringInfoString(&buf, RDF_XSD_BASE_URI);
					appendBinaryStringInfo(&buf, dt_start + 4, dt_end - (dt_start + 4));
				}
				/* <xsd:...>: drop '<', the prefix and '>' */
				else if (*dt_start == '<' && *(dt_end - 1) == '>' && strncmp(dt_start + 1, "xsd:", 4) == 0 && dt_end - dt_start > 6)
				{
					appendStringInfoString(&buf, RDF_XSD_BASE_URI);
					appendBinaryStringInfo(&buf, dt_start + 5, dt_end - (dt_start + 6));
				}
				/* <...>: keep what is between the angle brackets */
				else if (*dt_start == '<' && *(dt_end - 1) == '>')
				{
					appendBinaryStringInfo(&buf, dt_start + 1, dt_end - dt_start - 2);
				}
				else
				{
					appendBinaryStringInfo(&buf, dt_start, dt_end - dt_start);
				}

				/* ensure no trailing junk */
				if (*dt_end != '\0')
				{
					elog(DEBUG1, "%s exit: returning empty string (trailing chars after datatype)", __func__);
					return res;
				}

				res = iri(buf.data);
				elog(DEBUG1, "%s exit: returning '%s'", __func__, res);
				return res;
			}
		}

		/* simple or language-tagged literal */
		if ((lang_tag && lang_tag > ptr + 1 && *(lang_tag - 1) == '"') || (len >= 1 && (ptr[len - 1] == '"' || len == 1)))
		{
			elog(DEBUG1, "%s exit: returning empty string (simple/language-tagged literal)", __func__);
			return "";
		}
	}

	/* Not a valid literal */
	elog(DEBUG1, "%s exit: returning empty string (not a valid literal)", __func__);
	return "";
}

/*
 * encode_for_uri
 * --------------
 *
 * Encodes a string for use in a URI by percent-encoding every byte except
 * those defined as unreserved in RFC 3986 (alphanumeric, hyphen, period,
 * underscore, and tilde). The input is first reduced to its lexical form
 * with lex(), so quotes, language tags and datatypes of a literal are not
 * part of the encoded text.
 *
 * str_in: Null-terminated C string to encode (e.g., "hello world", "\"example\"@en")
 *
 * returns: Null-terminated C string with URI-encoded result, formatted as an RDF literal
 */
char *encode_for_uri(char *str_in)
{
	const char *unreserved = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.~";
	size_t in_len;
	char *res;
	StringInfoData buf;

	initStringInfo(&buf);

	elog(DEBUG1, "%s called: str='%s'", __func__, str_in);

	str_in = lex(str_in);
	in_len = strlen(str_in);

	elog(DEBUG2, "%s: encoding string: '%s', length: %zu", __func__, str_in, in_len);

	for (size_t i = 0; i < in_len; i++)
	{
		/* treat each byte as unsigned so %02X never sees a negative value */
		unsigned char c = (unsigned char)str_in[i];
		if (strchr(unreserved, c))
			appendStringInfoChar(&buf, c);
		else
			appendStringInfo(&buf, "%%%02X", c);
	}

	res = cstring_to_rdfliteral(buf.data);

	elog(DEBUG1, "%s exit: returning => '%s'", __func__, res);
	return res;
}

/*
 * generate_uuid_v4
 * ----------------
 * Generates a UUID string with the RFC 4122 version 4 layout. Returns a
 * lowercase string in the format xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx,
 * where y is 8, 9, a, or b. Takes no arguments and needs nothing beyond
 * the PostgreSQL headers this file already includes.
* * Returns: Null-terminated C string (e.g., "123e4567-e89b-12d3-a456-426614174000") */ char *generate_uuid_v4(void) { StringInfoData buf; static uint64 counter = 0; uint64 seed; uint8_t bytes[16]; char *result; int i; elog(DEBUG1, "%s called", __func__); initStringInfo(&buf); /* Use timestamp and counter for pseudo-randomness */ seed = (uint64)GetCurrentTimestamp() ^ counter++; /* Generate 16 bytes of pseudo-random data */ for (i = 0; i < 16; i++) { seed = (seed * 1103515245 + 12345) & 0x7fffffff; /* Linear congruential generator */ bytes[i] = (uint8_t)(seed >> 16); } /* Set version (4) and variant (y = 8, 9, A, B) */ bytes[6] = (bytes[6] & 0x0F) | 0x40; /* Version 4 */ bytes[8] = (bytes[8] & 0x3F) | 0x80; /* Variant: 10xx */ /* Format as xxxxxxxx-xxxx-4xxx-yxxx-xxxxxxxxxxxx */ appendStringInfo(&buf, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", bytes[0], bytes[1], bytes[2], bytes[3], bytes[4], bytes[5], bytes[6], bytes[7], bytes[8], bytes[9], bytes[10], bytes[11], bytes[12], bytes[13], bytes[14], bytes[15]); result = pstrdup(buf.data); pfree(buf.data); elog(DEBUG1, "%s exit: returning '%s'", __func__, result); return result; } /* * substr_sparql * ------------- * Implements SPARQL's SUBSTR(str, start, length) function. * Converts RDF literal or bare string into substring while preserving language/datatype tag. * * str : Input RDF literal or bare string. * start : 1-based index (inclusive). * length : Optional substring length (0 or negative is invalid). * * Returns a new RDF literal string with the appropriate tag preserved. 
*/
char *
substr_sparql(char *str, int start, int length)
{
	char	   *lexical;
	char	   *str_datatype;
	char	   *str_language;
	char	   *result;
	StringInfoData buf;
	int			str_len;

	/* str may be NULL here; the check follows right below */
	elog(DEBUG1, "%s called: str='%s', start=%d, length=%d", __func__,
		 str ? str : "(null)", start, length);

	if (str == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
				 errmsg("SUBSTR cannot be NULL")));

	if (start < 1)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("SUBSTR start position must be >= 1")));

	if (isIRI(str) || isBlank(str))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("SUBSTR not allowed on IRI or blank node: %s", str)));

	lexical = lex(str);
	str_datatype = datatype(str);
	str_language = lang(str);
	str_len = (int) strlen(lexical);

	elog(DEBUG1, "%s: lexical='%s', datatype='%s', language='%s', length=%d", __func__, lexical, str_datatype, str_language, str_len);

	initStringInfo(&buf);

	/*
	 * Copy the requested window. A start position beyond the end simply
	 * leaves the buffer empty; the lexical form itself is never modified,
	 * because lex() may hand back storage that must not be written to.
	 *
	 * NOTE(review): positions are counted in bytes of the lexical form, so
	 * multi-byte characters can be split - confirm whether character-based
	 * positions are required.
	 */
	if (start <= str_len)
	{
		int			available = str_len - (start - 1);
		int			take = (length >= 0 && length < available) ? length : available;

		appendBinaryStringInfo(&buf, lexical + (start - 1), take);
	}

	/* re-attach the language tag or datatype of the input, if any */
	if (str_language[0] != '\0')
		result = strlang(buf.data, str_language);
	else if (str_datatype[0] != '\0')
		result = strdt(buf.data, str_datatype);
	else
		result = cstring_to_rdfliteral(buf.data);

	pfree(buf.data);

	elog(DEBUG1, "%s exit: returning '%s'", __func__, result);
	return result;
}

/*
 * lcase
 * -----
 *
 * Implements SPARQL’s LCASE function. Converts the lexical form of a string literal
 * (simple, xsd:string, or language-tagged) to lowercase (ASCII A-Z to a-z, non-ASCII
 * preserved). Preserves the original datatype or language tag. Errors on IRIs, blank
 * nodes, non-string literals, or invalid inputs. Bare strings are treated as simple literals.
*
 * str: Null-terminated C string (RDF literal or bare string, e.g., "BAR", "\"BAR\"@en")
 *
 * returns: Null-terminated C string (lowercase RDF literal)
 */
char *
lcase(char *str)
{
	StringInfoData lowered;
	char	   *lexical;
	char	   *dt_uri;
	char	   *lang_tag;
	char	   *out;
	const char *cursor;

	elog(DEBUG1, "%s called: str='%s'", __func__, str);

	if (str == NULL)
		ereport(ERROR,
				(errcode(ERRCODE_INTERNAL_ERROR),
				 errmsg("LCASE cannot be NULL")));

	if (str[0] == '\0')
	{
		elog(DEBUG1, "%s exit: returning empty literal (str is an empty string)", __func__);
		return cstring_to_rdfliteral("");
	}

	/* IRIs and blank nodes have no string value to fold */
	if (isIRI(str))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("LCASE does not allow IRIs: %s", str)));

	if (isBlank(str))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("LCASE does not allow blank nodes: %s", str)));

	lexical = lex(str);
	if (!lexical)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("LCASE failed to extract lexical value: %s", str)));

	/* a datatype is only acceptable if it denotes a string literal */
	dt_uri = datatype(str);
	if (dt_uri[0] != '\0' && !IsRDFStringLiteral(dt_uri))
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("LCASE does not allow non-string literals: %s", dt_uri)));

	lang_tag = lang(str);

	initStringInfo(&lowered);

	elog(DEBUG1, " %s: lexical='%s', datatype='%s', language='%s'", __func__, lexical, dt_uri, lang_tag);

	/* fold ASCII upper-case letters only; every other byte is copied as is */
	for (cursor = lexical; *cursor != '\0'; cursor++)
	{
		char		ch = *cursor;

		if (ch >= 'A' && ch <= 'Z')
			ch = ch + ('a' - 'A');

		appendStringInfoChar(&lowered, ch);
	}

	/* re-attach language tag or datatype, language tag taking precedence */
	if (lang_tag[0] != '\0')
		out = strlang(lowered.data, lang_tag);
	else if (dt_uri[0] != '\0')
		out = strdt(lowered.data, dt_uri);
	else
		out = cstring_to_rdfliteral(lowered.data);

	pfree(lowered.data);

	elog(DEBUG1, "%s exit: returning '%s'", __func__, out);
	return out;
}

/*
 * ucase
 * -----
 *
 * Implements SPARQL’s UCASE function.
Converts the lexical form of a string literal * (simple, xsd:string, or language-tagged) to uppercase (ASCII a-z to A-Z, non-ASCII * preserved). Preserves the original datatype or language tag. Errors on IRIs, blank * nodes, non-string literals, or invalid inputs. Bare strings are treated as simple literals. * * str: Null-terminated C string (RDF literal or bare string, e.g., "bar", "\"bar\"@en") * * returns: Null-terminated C string (uppercase RDF literal) */ char *ucase(char *str) { char *lexical; char *str_datatype; char *str_language; char *result; StringInfoData buf; int i; int len; elog(DEBUG1, "%s called: str='%s'", __func__, str); if (!str) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("LCASE cannot be NULL"))); if (strlen(str) == 0) { elog(DEBUG1, "%s exit: returning empty literal (str is an empty string)", __func__); return cstring_to_rdfliteral(""); } /* Check for IRIs or blank nodes */ if (isIRI(str)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("UCASE does not allow IRIs: %s", str))); if (isBlank(str)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("UCASE does not allow blank nodes: %s", str))); lexical = lex(str); if (lexical == NULL) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("UCASE failed to extract lexical value: %s", str))); str_datatype = datatype(str); if (strlen(str_datatype) != 0 && !IsRDFStringLiteral(str_datatype)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("UCASE does not allow non-string literals: %s", str_datatype))); str_language = lang(str); initStringInfo(&buf); elog(DEBUG2, " %s: lexical='%s', datatype='%s', language='%s'", __func__, lexical, str_datatype, str_language); /* Convert to uppercase (ASCII only) */ len = strlen(lexical); for (i = 0; i < len; i++) { char c = lexical[i]; if (c >= 'a' && c <= 'z') c = c - ('a' - 'A'); appendStringInfoChar(&buf, c); } if (strlen(str_language) != 0) result = strlang(buf.data, str_language); else if 
(strlen(str_datatype) != 0) result = strdt(buf.data, str_datatype); else result = cstring_to_rdfliteral(buf.data); pfree(buf.data); elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } /* * isNumeric * --------- * * Checks if an RDF term is numeric per SPARQL spec. Returns true if the term is a * bare number (e.g., "12") or a literal with a numeric datatype (e.g., xsd:integer, * xsd:nonNegativeInteger) and valid numeric lexical form. Returns false otherwise. * * term: Null-terminated C string representing an RDF term (e.g., "12", "12"^^xsd:integer) * * returns: Boolean indicating if the term is numeric */ bool isNumeric(char *term) { char *lexical; char *datatype_uri; bool is_bare_number = false; char *endptr; elog(DEBUG1, "%s called: term='%s'", __func__, term); if (!term || strlen(term) == 0) { elog(DEBUG1, "%s exit: returning 'false' (term either NULL or an empty string)", __func__); return false; } /* Check if term is a bare number (e.g., "12") */ if (term[0] != '"' && !strstr(term, "^^") && !strstr(term, "@")) { lexical = term; is_bare_number = true; } else { /* Extract lexical value using datatype’s helper */ lexical = lex(term); /* From datatype/strdt codebase */ } /* Validate lexical form as numeric (integers, decimals, or scientific notation) */ if (!lexical || strlen(lexical) == 0) { elog(DEBUG1, "%s exit: returning 'false' (lexical value either NULL or an empty string)", __func__); return false; } strtod(lexical, &endptr); if (*endptr != '\0') /* not a valid number, e.g., "abc" */ { elog(DEBUG1, "%s exit: returning 'false' (not a valid number)", __func__); return false; } /* Bare numbers are numeric */ if (is_bare_number) { elog(DEBUG1, "%s exit: returning 'true' (bare numbers are numeric)", __func__); return true; } /* Get datatype using datatype function */ datatype_uri = datatype(term); if (strlen(datatype_uri) == 0) { elog(DEBUG1, "%s exit: returning 'false' (no datatype or invalid literal)", __func__); return false; } /* No 
datatype or invalid literal (e.g., "12") */ /* Check for numeric datatypes */ if (strcmp(datatype_uri, RDF_XSD_INTEGER) == 0 || strcmp(datatype_uri, RDF_XSD_NONNEGATIVEINTEGER) == 0 || strcmp(datatype_uri, RDF_XSD_POSITIVEINTEGER) == 0 || strcmp(datatype_uri, RDF_XSD_NEGATIVEINTEGER) == 0 || strcmp(datatype_uri, RDF_XSD_NONPOSITIVEINTEGER) == 0 || strcmp(datatype_uri, RDF_XSD_LONG) == 0 || strcmp(datatype_uri, RDF_XSD_INT) == 0 || strcmp(datatype_uri, RDF_XSD_BYTE) == 0 || strcmp(datatype_uri, RDF_XSD_SHORT) == 0 || strcmp(datatype_uri, RDF_XSD_UNSIGNEDLONG) == 0 || strcmp(datatype_uri, RDF_XSD_UNSIGNEDINT) == 0 || strcmp(datatype_uri, RDF_XSD_UNSIGNEDSHORT) == 0 || strcmp(datatype_uri, RDF_XSD_UNSIGNEDBYTE) == 0 || strcmp(datatype_uri, RDF_XSD_DOUBLE) == 0 || strcmp(datatype_uri, RDF_XSD_FLOAT) == 0 || strcmp(datatype_uri, RDF_XSD_DECIMAL) == 0) { /* Special case for xsd:byte: SPARQL requires values to be integers between -128 and 127. * For example, isNumeric("1200"^^xsd:byte) returns false because 1200 exceeds 127. * We parse the lexical value to ensure it’s a valid integer and check its range. */ if (strcmp(datatype_uri, RDF_XSD_BYTE) == 0) { /* Ensure the entire string is a valid integer and within xsd:byte range */ if (*endptr != '\0') /* Not a pure integer, e.g., "12.34" */ { elog(DEBUG1, "%s exit: returning 'false' (not a pure integer)", __func__); return false; } elog(DEBUG1, "%s exit: returning 'true' (valid xsd:byte)", __func__); return true; /* Valid xsd:byte, e.g., "100" */ } /* Other numeric datatypes (e.g., xsd:integer, xsd:double) have no strict range * limits in SPARQL’s isNumeric, and we’ve already validated the lexical form. * Accept them as numeric. */ elog(DEBUG1, "%s exit: returning 'true'", __func__); return true; } elog(DEBUG1, "%s exit: returning 'false'", __func__); return false; } /* * contains * -------- * * Implements SPARQL’s CONTAINS(str, substr) function. 
Returns true if the lexical * form of str contains the lexical form of substr as a contiguous subsequence; * false otherwise. Matching is case-sensitive per SPARQL. * * str_in : Null-terminated C string representing an RDF term or bare string * substr_in : Null-terminated C string representing an RDF term or bare string * * returns: Boolean (true if substr occurs within str’s lexical form; false on * mismatch, incompatible language tags, or invalid/empty input) */ bool contains(char *str_in, char *substr_in) { char *str_lex; char *substr_lex; char *lang_str; bool result; elog(DEBUG1, "%s called: str='%s', substr='%s'", __func__, str_in, substr_in); /* handle NULL or empty inputs */ if (!str_in || !substr_in || strlen(str_in) == 0 || strlen(substr_in) == 0) { elog(DEBUG1, "%s exit: returning 'false' (invalid input)", __func__); return false; } lang_str = lang(str_in); if (strlen(lang_str) != 0) { char *lang_substr = lang(substr_in); if (strlen(lang_substr) != 0 && pg_strcasecmp(lang_str, lang_substr) != 0) { elog(DEBUG1, "%s exit: returning NULL (string and substring have different languag tags)", __func__); return NULL; } } /* extract lexical values (strips quotes, tags, etc.) */ str_lex = lex(str_in); substr_lex = lex(substr_in); /* check if substr is in str using strstr */ result = (strstr(str_lex, substr_lex) != NULL); elog(DEBUG1, "%s exit: returning > %s (str_lexical='%s', substr_lexical='%s')", __func__, result ? "true" : "false", str_lex, substr_lex); return result; } /* * strbefore * ----------------- * * Implements the SPARQL STRBEFORE function, returning the substring of the first * argument before the first occurrence of the second argument (delimiter). The * result preserves the language tag or datatype of the first argument as present * in the input syntax. Simple literals remain simple in the output. 
* * str: the input string (e.g., "abc"@en, "abc"^^xsd:string) * delimiter: the delimiter string (e.g., "b", "b"@en) * * returns: cstring representing the RDF literal before the delimiter */ char *strbefore(char *str, char *delimiter) { char *str_lexical; char *delimiter_lexical; char *lang1; char *dt1 = ""; char *pos; char *result; elog(DEBUG1, "%s called: str='%s', delimiter='%s", __func__, str, delimiter); str_lexical = lex(str); delimiter_lexical = lex(delimiter); lang1 = lang(str); /* extract datatypes if no language tags */ if (strlen(lang1) == 0) dt1 = datatype(str); if (!LiteralsCompatible(str, delimiter)) { elog(DEBUG1, "%s exit: returning NULL (literals no compatible)", __func__); return NULL; } if ((pos = strstr(str_lexical, delimiter_lexical)) != NULL) { size_t before_len = pos - str_lexical; StringInfoData buf; initStringInfo(&buf); if (strlen(lang1) > 0) { appendBinaryStringInfo(&buf, str_lexical, before_len); result = strlang(buf.data, lang1); elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } else if (strlen(dt1) > 0 && /* only for explicit ^^ */ (strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED) == 0 || strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE) == 0)) { appendBinaryStringInfo(&buf, str_lexical, before_len); result = cstring_to_rdfliteral(buf.data); if (strstr(result, "^^") == NULL) { result = strdt(buf.data, dt1); } elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } else { /* simple literal or implicit xsd:string */ appendBinaryStringInfo(&buf, str_lexical, before_len); result = cstring_to_rdfliteral(buf.data); elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } } /* delimiter not found */ if (strlen(dt1) > 0 && (strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED) == 0 || strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE) == 0)) { result = cstring_to_rdfliteral(""); if (strstr(result, "^^") == NULL) { StringInfoData typed_buf; initStringInfo(&typed_buf); 
appendStringInfo(&typed_buf, "%s", strdt("", RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED)); result = typed_buf.data; } elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } result = cstring_to_rdfliteral(""); elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } /* * strafter * ---------------- * * Implements the SPARQL STRAFTER function, returning the substring of the first * argument after the first occurrence of the second argument (delimiter). The * result preserves the language tag or datatype of the first argument as present * in the input syntax, always wrapped in double quotes as a valid RDF literal. * Returns an empty simple literal if the delimiter is not found. * * str: the input string (e.g., "abc"@en, "abc"^^xsd:string) * delimiter: the delimiter string (e.g., "b", "b"@en) * * returns: a cstring representing the RDF literal after the delimiter */ char *strafter(char *str, char *delimiter) { char *lexstr; char *lexdelimiter; char *lang1; char *dt1 = ""; char *pos; bool has_explicit_datatype = false; char *result; elog(DEBUG1, "%s called: str='%s', delimiter='%s'", __func__, str, delimiter); lexstr = lex(str); lexdelimiter = lex(delimiter); lang1 = lang(str); /* extract datatype if no language tag */ if (strlen(lang1) == 0) dt1 = datatype(str); /* check if arg1 has an explicit datatype in the input syntax */ if (strlen(lang1) == 0 && strstr(str, "^^") != NULL) has_explicit_datatype = true; if ((pos = strstr(lexstr, lexdelimiter)) != NULL) { size_t delimiter_len = strlen(lexdelimiter); char *after_start = pos + delimiter_len; size_t after_len = strlen(lexstr) - (after_start - lexstr); StringInfoData buf; initStringInfo(&buf); if (strlen(lang1) > 0) { appendBinaryStringInfo(&buf, after_start, after_len); result = strlang(buf.data, lang1); pfree(buf.data); elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } else if (has_explicit_datatype && strlen(dt1) > 0 && (strcmp(dt1, 
RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED) == 0 || strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE) == 0)) { appendBinaryStringInfo(&buf, after_start, after_len); result = cstring_to_rdfliteral(buf.data); if (strstr(result, "^^") == NULL) { result = strdt(buf.data, dt1); } pfree(buf.data); elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } else { /* simple literal or implicit xsd:string */ appendBinaryStringInfo(&buf, after_start, after_len); result = cstring_to_rdfliteral(buf.data); pfree(buf.data); elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } } /* delimiter not found */ if (has_explicit_datatype && strlen(dt1) > 0 && (strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED) == 0 || strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE) == 0)) { result = cstring_to_rdfliteral(""); if (strstr(result, "^^") == NULL) { StringInfoData typed_buf; initStringInfo(&typed_buf); appendStringInfo(&typed_buf, "%s", strdt("", RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED)); result = typed_buf.data; } elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } result = cstring_to_rdfliteral(""); elog(DEBUG1, "%s exit: returning => '%s'", __func__, result); return result; } /* * count_utf8_chars * ---------------- * * Counts Unicode characters (code points) in a UTF-8 string. * Returns the number of characters, not bytes. 
*/ static int count_utf8_chars(const char *str) { int char_count = 0; elog(DEBUG1, "%s called: str='%s'", __func__, str); while (*str) { /* Skip continuation bytes (0x80-0xBF) */ if ((*str & 0xC0) != 0x80) char_count++; str++; } elog(DEBUG1, "%s exit: returning '%d'", __func__, char_count); return char_count; } int strlen_rdf(char *str) { char *lexical; char *dt; int result; elog(DEBUG1, "%s called: str='%s'", __func__, str); if (!str) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("STRLEN cannot be NULL"))); if (strlen(str) == 0) return 0; /* Check for IRIs or blank nodes */ if (isIRI(str)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("STRLEN does not allow IRIs: %s", str))); if (isBlank(str)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("STRLEN does not allow blank nodes: %s", str))); dt = datatype(str); /* Validate string literal */ if (!IsRDFStringLiteral(dt)) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("STRLEN does not allow non-string literals: %s", dt))); lexical = lex(str); result = count_utf8_chars(lexical); elog(DEBUG1, "%s exit: returning '%d'", __func__, result); return result; } /* * get_xsd_numeric_type * -------------------- * Determines the XSD numeric type from an rdfnode's datatype URI. 
*
 * Returns the type in the promotion hierarchy: integer < decimal < float < double
 */
XsdNumericType get_xsd_numeric_type(const char *dtype)
{
	/* every integer-derived XSD type collapses to XSD_TYPE_INTEGER */
	const char *integer_family[] = {
		RDF_XSD_INTEGER,
		RDF_XSD_INT,
		RDF_XSD_LONG,
		RDF_XSD_SHORT,
		RDF_XSD_BYTE,
		RDF_XSD_POSITIVEINTEGER,
		RDF_XSD_NEGATIVEINTEGER,
		RDF_XSD_NONNEGATIVEINTEGER,
		RDF_XSD_NONPOSITIVEINTEGER,
		RDF_XSD_UNSIGNEDLONG,
		RDF_XSD_UNSIGNEDINT,
		RDF_XSD_UNSIGNEDSHORT,
		RDF_XSD_UNSIGNEDBYTE};
	size_t n_integer = sizeof(integer_family) / sizeof(integer_family[0]);
	size_t k;

	for (k = 0; k < n_integer; k++)
	{
		if (strcmp(dtype, integer_family[k]) == 0)
			return XSD_TYPE_INTEGER;
	}

	if (strcmp(dtype, RDF_XSD_FLOAT) == 0)
		return XSD_TYPE_FLOAT;

	if (strcmp(dtype, RDF_XSD_DOUBLE) == 0)
		return XSD_TYPE_DOUBLE;

	/* xsd:decimal, and also the fallback for any unrecognized datatype */
	return XSD_TYPE_DECIMAL;
}

/*
 * get_xsd_datatype_uri
 * --------------------
 * Returns the XSD datatype URI for a given numeric type level.
 * Unrecognized levels map to xsd:decimal.
 */
const char *get_xsd_datatype_uri(XsdNumericType type)
{
	const char *uri = RDF_XSD_DECIMAL;

	if (type == XSD_TYPE_INTEGER)
		uri = RDF_XSD_INTEGER;
	else if (type == XSD_TYPE_FLOAT)
		uri = RDF_XSD_FLOAT;
	else if (type == XSD_TYPE_DOUBLE)
		uri = RDF_XSD_DOUBLE;

	return uri;
}

/*
 * sum_rdfnode_sfunc
 * -----------------
 * Aggregate transition function for SUM(rdfnode).
 * Converts rdfnode to numeric and accumulates the sum.
 *
 * Strict numeric-only policy:
 * - If any non-numeric value is present, returns NULL (unbound).
 * - Type promotion: integer < decimal < float < double.
 * - Example: SUM({1, 2, 3}) = 6; SUM({1, 2, "string"}) = NULL.
 *
 * State is stored as RdfnodeAggState to track both sum and result type.
* * Note: Aggregate context validation is handled by the wrapper in * rdf_fdw.c */ Datum sum_rdfnode_sfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; MemoryContext aggcontext; MemoryContext oldcontext; rdfnode *node; rdfnode_info parsed; Datum rdf_numeric; XsdNumericType inputType; /* Get the aggregate memory context */ AggCheckCallContext(fcinfo, &aggcontext); /* Get current state (NULL on first call) */ if (PG_ARGISNULL(0)) aggstate = NULL; else aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); /* Skip NULL input values */ if (PG_ARGISNULL(1)) { if (aggstate == NULL) PG_RETURN_NULL(); PG_RETURN_POINTER(aggstate); } /* Get the rdfnode and parse it */ node = (rdfnode *)PG_GETARG_TEXT_PP(1); parsed = parse_rdfnode(node); /* * Mark that we received input (even if non-numeric). * This distinguishes SUM({}) from SUM({"string"}) per SPARQL 1.1. */ if (aggstate == NULL) { /* Initialize state to track that we saw input */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate = (RdfnodeAggState *)palloc0(sizeof(RdfnodeAggState)); aggstate->has_input = true; aggstate->has_non_numeric = false; MemoryContextSwitchTo(oldcontext); } else { aggstate->has_input = true; } /* * Per SPARQL 1.1 spec Section 18.5.1.3: SUM returns an error if any * values are not numeric. Errors are excluded from the aggregate, so * if any non-numeric values are present, the entire SUM aggregate * returns unbound (NULL). 
Examples: * - SUM({1, 2, 3}) = 6 (all numeric) * - SUM({1, 2, "string"}) = NULL (mixed types cause error) * - SUM({"string"}) = NULL (all non-numeric) */ if (!parsed.isNumeric) { /* Non-numeric value - mark as error and skip it */ aggstate->has_non_numeric = true; PG_RETURN_POINTER(aggstate); } /* Determine the XSD type of this input */ inputType = get_xsd_numeric_type(parsed.dtype); /* Convert rdfnode lexical value to numeric */ rdf_numeric = DirectFunctionCall3(numeric_in, CStringGetDatum(parsed.lex), ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1)); /* Initialize or update numeric accumulator */ if (aggstate->numeric_value == NULL) { /* First numeric value */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate->numeric_value = DatumGetNumeric( DirectFunctionCall1(numeric_uplus, rdf_numeric)); aggstate->maxType = inputType; MemoryContextSwitchTo(oldcontext); } else { /* Add to accumulator - need to be in aggcontext for result */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate->numeric_value = DatumGetNumeric( DirectFunctionCall2(numeric_add, NumericGetDatum(aggstate->numeric_value), rdf_numeric)); /* * Track the highest type seen (type promotion: * integer < decimal < float < double) */ if (inputType > aggstate->maxType) aggstate->maxType = inputType; MemoryContextSwitchTo(oldcontext); } PG_RETURN_POINTER(aggstate); } /* * sum_rdfnode_finalfunc * --------------------- * Final function for SUM(rdfnode). * Converts the accumulated numeric sum back to rdfnode with proper type promotion. 
* * Note: NULL state handling is done by the wrapper in rdf_fdw.c */ Datum sum_rdfnode_finalfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; char *sum_str; char *result; const char *datatype_uri; /* Get the state (already validated as non-NULL by wrapper) */ aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); /* If state is NULL (no rows), return NULL per SPARQL (unbound) */ if (aggstate == NULL) PG_RETURN_NULL(); /* If no numeric values were summed, return NULL (unbound per SPARQL) */ if (aggstate->numeric_value == NULL || aggstate->has_non_numeric) PG_RETURN_NULL(); /* Convert numeric to string */ sum_str = DatumGetCString(DirectFunctionCall1(numeric_out, NumericGetDatum(aggstate->numeric_value))); /* Get the appropriate XSD datatype based on type promotion */ datatype_uri = get_xsd_datatype_uri(aggstate->maxType); /* Format as typed literal rdfnode using strdt() */ result = strdt(sum_str, (char *)datatype_uri); pfree(sum_str); PG_RETURN_TEXT_P(cstring_to_text(result)); } /* * avg_rdfnode_sfunc * ----------------- * Aggregate transition function for AVG(rdfnode). * Accumulates sum and count for computing average. * * Note: Aggregate context validation and NULL input handling done by wrapper in rdf_fdw.c */ Datum avg_rdfnode_sfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; MemoryContext aggcontext; MemoryContext oldcontext; rdfnode *node; rdfnode_info parsed; Datum rdf_numeric; XsdNumericType inputType; /* Get the aggregate memory context */ AggCheckCallContext(fcinfo, &aggcontext); /* Get current state (NULL on first call) */ if (PG_ARGISNULL(0)) aggstate = NULL; else aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); /* Skip NULL input values */ if (PG_ARGISNULL(1)) { if (aggstate == NULL) PG_RETURN_NULL(); PG_RETURN_POINTER(aggstate); } /* Get the rdfnode and parse it */ node = (rdfnode *)PG_GETARG_TEXT_PP(1); parsed = parse_rdfnode(node); /* Mark that we received input (even if non-numeric). 
* This distinguishes AVG({}) from AVG({"string"}) per SPARQL 1.1 spec. */ if (aggstate == NULL) { /* Initialize state to track that we saw input */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate = (RdfnodeAggState *)palloc0(sizeof(RdfnodeAggState)); aggstate->has_input = true; aggstate->has_non_numeric = false; MemoryContextSwitchTo(oldcontext); } else { aggstate->has_input = true; } /* * Per SPARQL 1.1 spec Section 18.5.1.4: AVG returns an error if any values are not numeric. * Errors are excluded from the aggregate, so if any non-numeric values are present, * the entire AVG aggregate returns unbound (NULL). Examples: * - AVG({10, 20, 30}) = 20 (all numeric) * - AVG({10, 20, "string"}) = NULL (mixed types cause error) * - AVG({"string"}) = NULL (all non-numeric) */ if (!parsed.isNumeric) { /* Non-numeric value - mark as error and skip it */ aggstate->has_non_numeric = true; PG_RETURN_POINTER(aggstate); } /* Determine the XSD type of this input */ inputType = get_xsd_numeric_type(parsed.dtype); /* Convert rdfnode lexical value to numeric */ rdf_numeric = DirectFunctionCall3(numeric_in, CStringGetDatum(parsed.lex), ObjectIdGetDatum(InvalidOid), Int32GetDatum(-1)); /* Initialize or update numeric accumulator */ if (aggstate->numeric_value == NULL) { /* First numeric value */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate->numeric_value = DatumGetNumeric(DirectFunctionCall1(numeric_uplus, rdf_numeric)); aggstate->count = 1; aggstate->maxType = inputType; MemoryContextSwitchTo(oldcontext); } else { /* Add to accumulator */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate->numeric_value = DatumGetNumeric(DirectFunctionCall2(numeric_add, NumericGetDatum(aggstate->numeric_value), rdf_numeric)); aggstate->count++; /* Track the highest type seen (type promotion: integer < decimal < float < double) */ if (inputType > aggstate->maxType) aggstate->maxType = inputType; MemoryContextSwitchTo(oldcontext); } PG_RETURN_POINTER(aggstate); } /* * 
avg_rdfnode_finalfunc
 * ---------------------
 * Final function for AVG(rdfnode).
 * Computes average by dividing sum by count, with proper type promotion.
 *
 * Note: NULL state handling is done by the wrapper in rdf_fdw.c
 */
Datum avg_rdfnode_finalfunc(PG_FUNCTION_ARGS)
{
	RdfnodeAggState *state = (RdfnodeAggState *)PG_GETARG_POINTER(0);
	Numeric divisor;
	Numeric mean;
	Numeric mean_truncated;
	bool mean_is_whole;
	XsdNumericType result_type;
	char *mean_text;
	char *literal;

	/* No state (no rows): unbound per SPARQL */
	if (state == NULL)
		PG_RETURN_NULL();

	/* Nothing numeric was accumulated, or a non-numeric value was seen: unbound */
	if (state->numeric_value == NULL || state->has_non_numeric)
		PG_RETURN_NULL();

	/* mean = sum / count, computed in numeric arithmetic */
	divisor = DatumGetNumeric(DirectFunctionCall1(int8_numeric,
												  Int64GetDatum(state->count)));
	mean = DatumGetNumeric(DirectFunctionCall2(numeric_div,
											   NumericGetDatum(state->numeric_value),
											   NumericGetDatum(divisor)));

	/* The mean is a whole number if truncating to scale 0 leaves it unchanged */
	mean_truncated = DatumGetNumeric(DirectFunctionCall2(numeric_trunc,
														 NumericGetDatum(mean),
														 Int32GetDatum(0)));
	mean_is_whole = DatumGetBool(DirectFunctionCall2(numeric_eq,
													 NumericGetDatum(mean),
													 NumericGetDatum(mean_truncated)));

	/*
	 * Result type: xsd:double or xsd:float are kept if they were seen;
	 * integer-only and decimal inputs both yield xsd:decimal, so AVG over
	 * integers produces e.g. 42.0 typed as decimal.
	 */
	result_type = state->maxType;
	if (result_type != XSD_TYPE_DOUBLE && result_type != XSD_TYPE_FLOAT)
		result_type = XSD_TYPE_DECIMAL;

	if (result_type == XSD_TYPE_DECIMAL && mean_is_whole)
	{
		/* whole decimal values get an explicit ".0" suffix */
		char *whole_text = DatumGetCString(DirectFunctionCall1(numeric_out,
															   NumericGetDatum(mean_truncated)));
		StringInfoData with_fraction;

		initStringInfo(&with_fraction);
		appendStringInfo(&with_fraction, "%s.0", whole_text);
		mean_text = with_fraction.data;
		pfree(whole_text);
	}
	else
	{
		/* fractional decimals and float/double: plain numeric text */
		mean_text = DatumGetCString(DirectFunctionCall1(numeric_out,
														NumericGetDatum(mean)));
	}

	/* Build the typed literal with strdt() and hand it back as text */
	literal = strdt(mean_text, (char *)get_xsd_datatype_uri(result_type));
	pfree(mean_text);

	PG_RETURN_TEXT_P(cstring_to_text(literal));
}

/*
 * get_rdfnode_category_rank
 * -------------------------
 * Returns a category rank for an rdfnode to support
 * mixed-type aggregate ordering. Lower rank = lower
 * priority for MAX, higher priority for MIN.
 *
 * Category order (low → high):
 * 0: string-like (plain literal, xsd:string, language-tagged)
 * 1: numeric (xsd:integer, xsd:decimal, xsd:float, etc.)
 * 2: dateTime
 * 3: date
 * 4: time
 * 5: duration
 * 6: other
 */
static int get_rdfnode_category_rank(rdfnode_info parsed)
{
	int rank = 6; /* "other" unless one of the tests below matches */

	/* tests are ordered by rank; the first one that matches wins */
	if (strlen(parsed.lang) > 0 || parsed.isPlainLiteral || parsed.isString)
		rank = 0;
	else if (parsed.isNumeric)
		rank = 1;
	else if (parsed.isDateTime)
		rank = 2;
	else if (parsed.isDate)
		rank = 3;
	else if (parsed.isTime)
		rank = 4;
	else if (parsed.isDuration)
		rank = 5;

	return rank;
}

/*
 * min_rdfnode_sfunc
 * -----------------
 * Aggregate transition function for MIN(rdfnode).
 * Compares rdfnode values and keeps track of the minimum.
 *
 * Mixed-type policy (Fuseki-compatible):
 * - Assigns each term to a category (string-like < numeric < temporal).
 * - MIN selects the lowest category present; ties resolved by comparator.
* - Example: MIN({"zebra"^^xsd:string, 42, "mango"^^xsd:string}) → * "mango"^^xsd:string (string category wins; lexical minimum among * strings). * * Note: Aggregate context validation and NULL input handling done by * wrapper in rdf_fdw.c */ Datum min_rdfnode_sfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; MemoryContext aggcontext; MemoryContext oldcontext; text *input_node; rdfnode_info input_parsed; rdfnode_info current_parsed; /* Get the aggregate memory context */ AggCheckCallContext(fcinfo, &aggcontext); /* Get current state (NULL on first call) */ if (PG_ARGISNULL(0)) aggstate = NULL; else aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); /* Skip NULL input values */ if (PG_ARGISNULL(1)) { if (aggstate == NULL) PG_RETURN_NULL(); PG_RETURN_POINTER(aggstate); } /* Get and parse the input rdfnode */ input_node = PG_GETARG_TEXT_PP(1); input_parsed = parse_rdfnode((rdfnode *)input_node); if (aggstate == NULL) { /* First row: allocate state and store the rdfnode */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate = (RdfnodeAggState *)palloc(sizeof(RdfnodeAggState)); aggstate->rdfnode_value = (text *)PG_DETOAST_DATUM_COPY(PointerGetDatum(input_node)); MemoryContextSwitchTo(oldcontext); PG_RETURN_POINTER(aggstate); } /* Parse current value for category-based comparison */ current_parsed = parse_rdfnode((rdfnode *)aggstate->rdfnode_value); /* * Choose the smallest category present, then the minimum within * that category. 
*/ { int rank_in = get_rdfnode_category_rank(input_parsed); int rank_cur = get_rdfnode_category_rank(current_parsed); if (rank_in < rank_cur) { /* Input has lower category → new minimum */ oldcontext = MemoryContextSwitchTo(aggcontext); pfree(aggstate->rdfnode_value); aggstate->rdfnode_value = (text *)PG_DETOAST_DATUM_COPY( PointerGetDatum(input_node)); MemoryContextSwitchTo(oldcontext); } else if (rank_in == rank_cur) { /* Same category → use comparator */ int cmp = rdfnode_cmp_for_aggregate( (rdfnode *)input_node, (rdfnode *)aggstate->rdfnode_value); if (cmp < 0) { oldcontext = MemoryContextSwitchTo(aggcontext); pfree(aggstate->rdfnode_value); aggstate->rdfnode_value = (text *)PG_DETOAST_DATUM_COPY( PointerGetDatum(input_node)); MemoryContextSwitchTo(oldcontext); } } /* rank_in > rank_cur: keep current (higher category) */ } PG_RETURN_POINTER(aggstate); } /* * min_rdfnode_finalfunc * --------------------- * Final function for MIN(rdfnode). * Returns the minimum rdfnode value stored as text. * * Note: NULL state handling is done by the wrapper in rdf_fdw.c */ Datum min_rdfnode_finalfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; /* Get the state (already validated as non-NULL by wrapper) */ aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); if (aggstate == NULL || aggstate->rdfnode_value == NULL) PG_RETURN_NULL(); /* Return the stored minimum rdfnode */ PG_RETURN_TEXT_P(aggstate->rdfnode_value); } /* * max_rdfnode_sfunc * ----------------- * Aggregate transition function for MAX(rdfnode). * Compares rdfnode values and keeps track of the maximum. * * Mixed-type policy: * - Assigns each term to a category (string-like < numeric < temporal). * - MAX selects the highest category present; ties resolved by comparator. * - Example: MAX({42, "2023-01-01"^^xsd:date}) → "2023-01-01"^^xsd:date * (date category wins over numeric). 
* * Note: Aggregate context validation and NULL input handling done by * wrapper in rdf_fdw.c */ Datum max_rdfnode_sfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; MemoryContext aggcontext; MemoryContext oldcontext; text *input_node; rdfnode_info input_parsed; rdfnode_info current_parsed; /* Get the aggregate memory context */ AggCheckCallContext(fcinfo, &aggcontext); /* Get current state (NULL on first call) */ if (PG_ARGISNULL(0)) aggstate = NULL; else aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); /* Skip NULL input values */ if (PG_ARGISNULL(1)) { if (aggstate == NULL) PG_RETURN_NULL(); PG_RETURN_POINTER(aggstate); } /* Get and parse the input rdfnode */ input_node = PG_GETARG_TEXT_PP(1); input_parsed = parse_rdfnode((rdfnode *)input_node); if (aggstate == NULL) { /* First row: allocate state and store the rdfnode */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate = (RdfnodeAggState *)palloc(sizeof(RdfnodeAggState)); aggstate->rdfnode_value = (text *)PG_DETOAST_DATUM_COPY(PointerGetDatum(input_node)); MemoryContextSwitchTo(oldcontext); PG_RETURN_POINTER(aggstate); } current_parsed = parse_rdfnode((rdfnode *)aggstate->rdfnode_value); /* * Choose the largest category present, then the maximum within * that category. 
*/ { int rank_in = get_rdfnode_category_rank(input_parsed); int rank_cur = get_rdfnode_category_rank(current_parsed); if (rank_in > rank_cur) { /* Input has higher category → new maximum */ oldcontext = MemoryContextSwitchTo(aggcontext); pfree(aggstate->rdfnode_value); aggstate->rdfnode_value = (text *)PG_DETOAST_DATUM_COPY( PointerGetDatum(input_node)); MemoryContextSwitchTo(oldcontext); } else if (rank_in == rank_cur) { /* Same category → use comparator */ int cmp = rdfnode_cmp_for_aggregate( (rdfnode *)input_node, (rdfnode *)aggstate->rdfnode_value); if (cmp > 0) { oldcontext = MemoryContextSwitchTo(aggcontext); pfree(aggstate->rdfnode_value); aggstate->rdfnode_value = (text *)PG_DETOAST_DATUM_COPY( PointerGetDatum(input_node)); MemoryContextSwitchTo(oldcontext); } } /* rank_in < rank_cur: keep current (higher category) */ } PG_RETURN_POINTER(aggstate); } /* * max_rdfnode_finalfunc * --------------------- * Final function for MAX(rdfnode). * Returns the maximum rdfnode value stored as text, or NULL if no values were aggregated. * * Note: NULL state handling is done by the wrapper in rdf_fdw.c */ Datum max_rdfnode_finalfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; /* Get the state (already validated as non-NULL by wrapper) */ aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); if (aggstate == NULL || aggstate->rdfnode_value == NULL) PG_RETURN_NULL(); /* Return the stored maximum rdfnode */ PG_RETURN_TEXT_P(aggstate->rdfnode_value); } /* * sample_rdfnode_sfunc * -------------------- * Aggregate transition function for SAMPLE(rdfnode). * Returns an arbitrary value from the aggregate group. * * Per SPARQL 1.1 Section 18.5.1.8, SAMPLE returns an "arbitrary value" * from the multiset passed to it. The spec explicitly states the result * is non-deterministic. * * This implementation follows the common industry practice of returning * the first non-NULL value encountered. 
While deterministic, this is * acceptable as the spec allows implementation-defined behavior for * "arbitrary". */ Datum sample_rdfnode_sfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; MemoryContext aggcontext; MemoryContext oldcontext; text *input_node; /* Get the aggregate memory context */ AggCheckCallContext(fcinfo, &aggcontext); /* Get current state (NULL on first call) */ if (PG_ARGISNULL(0)) aggstate = NULL; else aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); /* Skip NULL input values */ if (PG_ARGISNULL(1)) { if (aggstate == NULL) PG_RETURN_NULL(); PG_RETURN_POINTER(aggstate); } /* If we already have a value, keep it (first value wins) */ if (aggstate != NULL) PG_RETURN_POINTER(aggstate); /* Get the input rdfnode */ input_node = PG_GETARG_TEXT_PP(1); /* First non-NULL value: allocate state and store it */ oldcontext = MemoryContextSwitchTo(aggcontext); aggstate = (RdfnodeAggState *)palloc(sizeof(RdfnodeAggState)); aggstate->rdfnode_value = (text *)PG_DETOAST_DATUM_COPY(PointerGetDatum(input_node)); MemoryContextSwitchTo(oldcontext); PG_RETURN_POINTER(aggstate); } /* * sample_rdfnode_finalfunc * ------------------------ * Final function for SAMPLE(rdfnode). * Returns the arbitrary value stored (first non-NULL * value encountered). * * Note: NULL state handling is done by the wrapper in * rdf_fdw.c */ Datum sample_rdfnode_finalfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; /* Get the state (already validated as non-NULL by wrapper) */ aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); if (aggstate == NULL || aggstate->rdfnode_value == NULL) PG_RETURN_NULL(); /* Return the stored sample value */ PG_RETURN_TEXT_P(aggstate->rdfnode_value); } /* * group_concat_sfunc * ------------------ * Transition function for GROUP_CONCAT(rdfnode [, separator]). * * Accumulates string representations of RDF terms, separated by a * delimiter. Per SPARQL 1.1 Section 18.5.1.7, the default separator * is a single space character. 
* * RDF term serialization follows SPARQL rules: * - Typed literals: extract lexical value only (strip ^^datatype) * - Language-tagged: extract lexical value only (strip @lang) * - IRIs: use URI string (strip angle brackets) * - Plain literals: use as-is * * NULL/unbound values are skipped during aggregation. */ Datum group_concat_sfunc(PG_FUNCTION_ARGS) { MemoryContext aggcontext; MemoryContext oldcontext; RdfnodeAggState *aggstate; text *input_node; rdfnode_info parsed; char *str_value; if (!AggCheckCallContext(fcinfo, &aggcontext)) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("aggregate function called in non-aggregate context"))); /* Get the current state */ aggstate = PG_ARGISNULL(0) ? NULL : (RdfnodeAggState *)PG_GETARG_POINTER(0); /* Skip NULL input values */ if (PG_ARGISNULL(1)) { if (aggstate == NULL) PG_RETURN_NULL(); PG_RETURN_POINTER(aggstate); } /* Get the input rdfnode */ input_node = PG_GETARG_TEXT_PP(1); parsed = parse_rdfnode((rdfnode *)input_node); /* Extract lexical value based on RDF term type */ if (parsed.isIRI) { /* For IRIs, remove angle brackets: → http://example.org */ size_t len = strlen(parsed.raw); if (len > 2 && parsed.raw[0] == '<' && parsed.raw[len - 1] == '>') { str_value = palloc(len - 1); memcpy(str_value, parsed.raw + 1, len - 2); str_value[len - 2] = '\0'; } else { str_value = pstrdup(parsed.raw); } } else { /* For literals, use the lexical value (already extracted by parse_rdfnode) */ str_value = parsed.lex; } /* Initialize state on first value */ if (aggstate == NULL) { oldcontext = MemoryContextSwitchTo(aggcontext); aggstate = (RdfnodeAggState *)palloc(sizeof(RdfnodeAggState)); aggstate->result_str = makeStringInfo(); /* Get separator (arg 2), default to space if not provided */ if (PG_NARGS() > 2 && !PG_ARGISNULL(2)) { /* Copy the separator into aggregate memory context */ aggstate->separator = PG_GETARG_TEXT_P_COPY(2); } else aggstate->separator = cstring_to_text(" "); /* SPARQL 1.1 default */ aggstate->has_input 
= false; MemoryContextSwitchTo(oldcontext); } /* Add separator if not the first value */ oldcontext = MemoryContextSwitchTo(aggcontext); if (aggstate->has_input) { appendStringInfoString(aggstate->result_str, text_to_cstring(aggstate->separator)); } /* Append the string value */ appendStringInfoString(aggstate->result_str, str_value); aggstate->has_input = true; MemoryContextSwitchTo(oldcontext); PG_RETURN_POINTER(aggstate); } /* * group_concat_finalfunc * ---------------------- * Final function for GROUP_CONCAT(rdfnode [, separator]). * * Returns the concatenated string as a simple literal (plain literal * without datatype or language tag), matching SPARQL 1.1 semantics. * Returns empty string for empty result sets (per SPARQL 1.1). * * Note: NULL state handling is done by the wrapper in rdf_fdw.c */ Datum group_concat_finalfunc(PG_FUNCTION_ARGS) { RdfnodeAggState *aggstate; char *literal; text *result; /* Get the state (already validated as non-NULL by wrapper) */ aggstate = (RdfnodeAggState *)PG_GETARG_POINTER(0); if (aggstate == NULL || aggstate->result_str == NULL) { /* No input values: return empty simple literal */ result = cstring_to_text(""); PG_RETURN_TEXT_P(result); } /* Convert to simple literal (plain literal without datatype) */ literal = cstring_to_rdfliteral(aggstate->result_str->data); /* Return as rdfnode (text type) */ result = cstring_to_text(literal); PG_RETURN_TEXT_P(result); }