/*--------------------------------------------------------------------- * * rdf_utils.c * Utility functions for RDF data manipulation and validation. * * Copyright (C) 2022-2026 Jim Jones * *--------------------------------------------------------------------- */ #include "postgres.h" #include "rdf_fdw.h" #include "rdf_utils.h" #include "rdfnode.h" #include "sparql.h" #include "lib/stringinfo.h" #include "utils/builtins.h" #if PG_VERSION_NUM >= 100000 #include "utils/varlena.h" #endif #include "access/htup_details.h" #include "catalog/pg_type.h" #include "mb/pg_wchar.h" #include "nodes/makefuncs.h" #include #include #include /* Type mapping table for PostgreSQL types to XSD datatypes */ static const TypeXSDMap type_map[] = { {INT2OID, "integer"}, {INT4OID, "integer"}, {INT8OID, "integer"}, {NUMERICOID, "decimal"}, {FLOAT8OID, "double"}, {FLOAT4OID, "float"}, {BOOLOID, "boolean"}, {TIMESTAMPOID, "dateTime"}, {DATEOID, "date"}, {TIMEOID, "time"}, {TEXTOID, "string"}, {NAMEOID, "string"}, {TIMESTAMPTZOID, "dateTime"}, {InvalidOid, NULL} }; /* * ContainsWhitespaces * --------------- * Checks if a string contains whitespaces * * str: string to be evaluated * * returns true if the string contains whitespaces or false otherwise */ bool ContainsWhitespaces(char *str) { elog(DEBUG3, "%s called: str='%s'", __func__, str); for (int i = 0; str[i] != '\0'; i++) if (isspace((unsigned char)str[i])) { elog(DEBUG3, "%s exit: returning 'true'", __func__); return true; } elog(DEBUG3, "%s exit: returning 'false'", __func__); return false; } /* * is_valid_language_tag * ---------------------- * Validates language tags according to the pattern: [a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})* * Examples: "en", "en-US", "de-DE" */ bool is_valid_language_tag(const char *lan) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*$"; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for language tag"))); reti = regexec(®ex, lan, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } /* * isPlainLiteral * -------------- * Checks if a literal is a plain literal (no language tag or datatype). */ bool isPlainLiteral(char *literal) { if (strlen(lang(literal)) != 0 || strlen(datatype(literal)) != 0) return false; return true; } /* * LiteralsCompatible * ------------------ * * Determines if two RDF literals are compatible according to SPARQL rules. * Compatibility is based on language tags and datatypes: literals are compatible * if they are both simple literals or xsd:string, or if they have identical * language tags, or if one has a language tag and the other is a simple literal * or xsd:string. Incompatible cases (e.g., one with a datatype and the other * with a language tag) return false. * * literal1: Null-terminated C string representing an RDF literal (e.g., "abc"@en, "123"^^xsd:integer) * literal2: Null-terminated C string representing an RDF literal (e.g., "def", "456"^^xsd:string) * * returns: C boolean (true if literals are compatible, false otherwise) */ bool LiteralsCompatible(char *literal1, char *literal2) { char *lang1; char *lang2; char *dt1; char *dt2; elog(DEBUG3, "%s called: literal1='%s', literal2='%s'", __func__, literal1, literal2); lang1 = lang(literal1); lang2 = lang(literal2); dt1 = datatype(literal1); dt2 = datatype(literal2); if (!literal1 || !literal2) { elog(DEBUG3, "%s exit: returning 'false' (one of the arguments is NULL)", __func__); return false; } /*TODO: check if RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED is needed, as the prefix is expaded elsewhere */ /* both simple literals or xsd:string */ if (strlen(lang1) == 0 && strlen(lang2) == 0 && (strlen(dt1) == 0 || strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED) == 0 || strcmp(dt1, RDF_SIMPLE_LITERAL_DATATYPE) == 0) && (strlen(dt2) == 0 || strcmp(dt2, RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED) == 0 || strcmp(dt2, RDF_SIMPLE_LITERAL_DATATYPE) == 0)) { elog(DEBUG3, "%s exit: returning 'true' (both simple literals or xsd:string)", __func__); return true; } /* both plain literals with identical language tags */ if (strlen(lang1) > 0 && strlen(lang2) > 0 && strcmp(lang1, lang2) == 0) { elog(DEBUG3, "%s exit: returning 'true' (both plain literals with identical language tags)", __func__); return true; } /* arg1 has language tag, arg2 is simple or xsd:string */ if (strlen(lang1) > 0 && strlen(lang2) == 0 && (strlen(dt2) == 0 || strcmp(dt2, RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED) == 0 || strcmp(dt2, RDF_SIMPLE_LITERAL_DATATYPE) == 0)) { elog(DEBUG3, "%s exit: returning 'true' (arg1 has language tag, arg2 is simple or xsd:string)", __func__); return true; } /* incompatible otherwise (e.g., arg1 xsd:string, arg2 language-tagged) */ elog(DEBUG3, "%s exit: returning 'false' (incompatible)", __func__); return false; } /* * cstring_to_rdfliteral * --------------------- * * Converts a raw string input into a valid RDF literal by adding quotes and escaping * internal quotes as needed. If the input is already a complete RDF literal (i.e., * quoted with a language tag or datatype), it is returned unchanged. * * input: the raw string or partial literal to convert (e.g., "abc", "abc"@en, "ab\"c") * * returns: a string representing the RDF literal (e.g., "\"abc\"", "\"ab\\\"c\"") * or the input as-is if already a complete literal. */ char *cstring_to_rdfliteral(char *input) { StringInfoData buf; const char *start; const char *end; int len; elog(DEBUG3, "%s called: input='%s'", __func__, input); /* return the string as-is if the input is an IRI */ if (isIRI(input)) return input; if (!input || strlen(input) == 0) { elog(DEBUG3, "%s exit: returning empty literal '\"\"'", __func__); return "\"\""; /* empty input becomes empty literal */ } start = input; len = strlen(start); initStringInfo(&buf); /* check if it's already a complete RDF literal */ if (*start == '"') { end = start + len - 1; /* last character */ if (end > start) { const char *tag = strstr(start, "@"); if (!tag) tag = strstr(start, "^^"); if (tag && tag > start + 1 && *(tag - 1) == '"') { elog(DEBUG3, "%s exit: returning => '%s'", __func__, input); /* complete literal with lang or type, return as-is */ return input; } } } /* not a complete literal, treat as raw content */ end = start + len; /* add opening quote */ appendStringInfoChar(&buf, '"'); /* process the content, escaping all quotes */ while (start < end) { if (*start == '"') { /* escape unless already escaped */ if (start == input || *(start - 1) != '\\') { appendStringInfoChar(&buf, '\\'); } appendStringInfoChar(&buf, '"'); } else { appendStringInfoChar(&buf, *start); } start++; } /* add closing quote */ appendStringInfoChar(&buf, '"'); elog(DEBUG3, "%s exit: returning => '%s'", __func__, buf.data); return buf.data; } /* * ExpandDatatypePrefix * -------------------- * * Expands a datatype prefix (e.g., "xsd:") to its full URI form if recognized. * Strips angle brackets (< >) from input before processing. Supports "xsd:" mapped * to "http://www.w3.org/2001/XMLSchema#". Returns the input as-is (without < >) * for other prefixed or bare datatypes, assuming prefix resolution elsewhere. * * str: Null-terminated C string representing a datatype (e.g., "xsd:string", "") * * returns: Null-terminated C string, expanded for "xsd:" or stripped/as-is otherwise */ char *ExpandDatatypePrefix(char *str) { StringInfoData buf; const char *xsd_prefix = "xsd:"; char *stripped_str = str; size_t len; elog(DEBUG3, "%s called: str='%s'", __func__, str); if (!str || strlen(str) == 0) return ""; /* Empty input returns empty string */ len = strlen(str); /* Strip < > if present */ if (str[0] == '<' && str[len - 1] == '>') { stripped_str = palloc(len - 1); /* allocate space for stripped string (len - 2 + null terminator) */ strncpy(stripped_str, str + 1, len - 2); stripped_str[len - 2] = '\0'; /* NULL-terminate */ } /* Check for 'xsd:' prefix and expand it */ if (strncmp(stripped_str, xsd_prefix, strlen(xsd_prefix)) == 0 && strlen(stripped_str) > strlen(xsd_prefix)) { const char *suffix = stripped_str + strlen(xsd_prefix); /* get part after "xsd:" */ initStringInfo(&buf); appendStringInfoChar(&buf, '<'); /* open bracket */ appendStringInfoString(&buf, RDF_XSD_BASE_URI); /* add XSD URI */ appendStringInfoString(&buf, suffix); /* add suffix */ appendStringInfoChar(&buf, '>'); /* close bracket */ if (stripped_str != str) pfree(stripped_str); elog(DEBUG3, "%s exit: returning '%s'", __func__, buf.data); return buf.data; } /* return stripped string (or original if no stripping) without < > */ if (stripped_str != str) { initStringInfo(&buf); appendStringInfoString(&buf, stripped_str); pfree(stripped_str); elog(DEBUG3, "%s exit: returning '%s'", __func__, buf.data); return buf.data; } elog(DEBUG3, "%s exit: returning '%s'", __func__, str); return str; } /* * MapSPARQLDatatype * ----------------- * Maps PostgreSQL type OIDs to their corresponding XSD datatype strings. */ char *MapSPARQLDatatype(Oid pgtype) { elog(DEBUG3, "%s called: input='%u'", __func__, pgtype); for (int i = 0; type_map[i].type_oid != InvalidOid; i++) { if (pgtype == type_map[i].type_oid) { elog(DEBUG3, "%s exit: returning => '%s'", __func__, (char *)type_map[i].xsd_datatype); return (char *)type_map[i].xsd_datatype; } } elog(DEBUG3, "%s exit: returning NULL (unsupported type)", __func__); return NULL; } #if PG_VERSION_NUM < 130000 void pg_unicode_to_server(pg_wchar c, unsigned char *utf8) { unsigned char utf8buf[8]; /* Large enough for UTF-8 encoding */ int len; unsigned char *converted; /* Convert Unicode code point to UTF-8 */ if (unicode_to_utf8(c, utf8buf) == NULL) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid Unicode code point: 0x%04x", c))); len = pg_utf_mblen(utf8buf); /* Get the length of the encoded UTF-8 character */ if (GetDatabaseEncoding() == PG_UTF8) { memcpy(utf8, utf8buf, len); } else { converted = pg_do_encoding_conversion(utf8buf, len, PG_UTF8, GetDatabaseEncoding()); if (converted == NULL) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("Unicode character 0x%04x cannot be converted to server encoding \"%s\"", c, GetDatabaseEncodingName()))); memcpy(utf8, converted, strlen((const char *)converted)); } utf8[len] = '\0'; /* Null-terminate (safe if utf8 has size ≥ 5) */ } #endif char *unescape_unicode(const char *input) { StringInfoData buf; initStringInfo(&buf); elog(DEBUG2, "%s: Input='%s'", __func__, input); for (const char *p = input; *p;) { if (p[0] == '\\' && p[1] == 'u') { /* \uXXXX (exactly 4 hex digits) */ if (p[2] && p[3] && p[4] && p[5] && isxdigit(p[2]) && isxdigit(p[3]) && isxdigit(p[4]) && isxdigit(p[5]) && (!p[6] || !isxdigit(p[6]))) { uint16_t codeunit; char hex[5]; unsigned char utf8[5]; int len; memcpy(hex, p + 2, 4); hex[4] = '\0'; sscanf(hex, "%hx", &codeunit); elog(DEBUG2, "%s: Parsed \\u%s to codeunit U+%04X", __func__, hex, codeunit); /* Check for high surrogate */ if (codeunit >= 0xD800 && codeunit <= 0xDBFF && p[6] == '\\' && p[7] == 'u' && p[8] && p[9] && p[10] && p[11] && isxdigit(p[8]) && isxdigit(p[9]) && isxdigit(p[10]) && isxdigit(p[11]) && (!p[12] || !isxdigit(p[12]))) { uint16_t low; char lowhex[5]; uint32_t full; memcpy(lowhex, p + 8, 4); lowhex[4] = '\0'; sscanf(lowhex, "%hx", &low); if (low >= 0xDC00 && low <= 0xDFFF) { full = 0x10000 + (((codeunit - 0xD800) << 10) | (low - 0xDC00)); elog(DEBUG2, "%s: Surrogate pair U+%04X U+%04X -> U+%X", __func__, codeunit, low, full); memset(utf8, 0, sizeof(utf8)); pg_unicode_to_server(full, (unsigned char *)utf8); len = pg_utf_mblen((const unsigned char *)utf8); appendBinaryStringInfo(&buf, (const char *)utf8, len); p += 12; continue; } } if (codeunit >= 0xD800 && codeunit <= 0xDFFF) { elog(DEBUG2, "%s: Lone surrogate U+%04X -> U+FFFD", __func__, codeunit); pg_unicode_to_server(0xFFFD, (unsigned char *)utf8); len = pg_utf_mblen(utf8); appendBinaryStringInfo(&buf, (const char *)utf8, len); p += 6; continue; } memset(utf8, 0, sizeof(utf8)); pg_unicode_to_server(codeunit, (unsigned char *)utf8); len = pg_utf_mblen(utf8); appendBinaryStringInfo(&buf, (const char *)utf8, len); p += 6; continue; } else { elog(DEBUG2, "%s: Invalid \\u sequence at '%s' -> literal", __func__, p); appendStringInfoString(&buf, "\\u"); p += 2; for (int i = 0; i < 4 && p[0] && isxdigit(p[0]); i++) appendStringInfoChar(&buf, *p++); continue; } } else if (p[0] == '\\' && p[1] == 'U') { /* \UXXXXXXXX (exactly 8 hex digits) */ if (p[2] && p[3] && p[4] && p[5] && p[6] && p[7] && p[8] && p[9] && isxdigit(p[2]) && isxdigit(p[3]) && isxdigit(p[4]) && isxdigit(p[5]) && isxdigit(p[6]) && isxdigit(p[7]) && isxdigit(p[8]) && isxdigit(p[9]) && (!p[10] || !isxdigit(p[10]))) { char hex[9]; uint32_t codepoint; unsigned char utf8[5]; int len; memcpy(hex, p + 2, 8); hex[8] = '\0'; sscanf(hex, "%x", &codepoint); elog(DEBUG2, "%s: Parsed \\U%s to codepoint U+%X", __func__, hex, codepoint); if (codepoint > 0x10FFFF || (codepoint >= 0xD800 && codepoint <= 0xDFFF)) { elog(DEBUG2, "%s: Invalid codepoint U+%X -> U+FFFD", __func__, codepoint); codepoint = 0xFFFD; } memset(utf8, 0, sizeof(utf8)); pg_unicode_to_server(codepoint, utf8); len = pg_utf_mblen(utf8); appendBinaryStringInfo(&buf, (const char *)utf8, len); p += 10; continue; } else { elog(DEBUG2, "%s: Invalid \\U sequence at '%s' -> literal", __func__, p); appendStringInfoString(&buf, "\\U"); p += 2; for (int i = 0; i < 8 && p[0] && isxdigit(p[0]); i++) appendStringInfoChar(&buf, *p++); continue; } } else { /* Preserve all other characters, including \t, \n, \", etc. */ appendStringInfoChar(&buf, *p++); } } elog(DEBUG2, "%s: Output='%s'", __func__, buf.data); return buf.data; } /* * IsFunctionPushable * --------------- * Check if a PostgreSQL function can be pushed down. * * funcname: name of the PostgreSQL function * * returns true if the function can be pushed down or false otherwise */ bool IsFunctionPushable(char *funcname) { bool result; elog(DEBUG3, "%s called: funcname='%s'", __func__, funcname); result = strcmp(funcname, "abs") == 0 || strcmp(funcname, "ceil") == 0 || strcmp(funcname, "floor") == 0 || strcmp(funcname, "round") == 0 || strcmp(funcname, "upper") == 0 || strcmp(funcname, "lower") == 0 || strcmp(funcname, "length") == 0 || strcmp(funcname, "md5") == 0 || strcmp(funcname, "starts_with") == 0 || strcmp(funcname, "strstarts") == 0 || strcmp(funcname, "strends") == 0 || strcmp(funcname, "strbefore") == 0 || strcmp(funcname, "strafter") == 0 || strcmp(funcname, "strlang") == 0 || strcmp(funcname, "langmatches") == 0 || strcmp(funcname, "strdt") == 0 || strcmp(funcname, "str") == 0 || strcmp(funcname, "iri") == 0 || strcmp(funcname, "isiri") == 0 || strcmp(funcname, "lang") == 0 || strcmp(funcname, "datatype") == 0 || strcmp(funcname, "contains") == 0 || strcmp(funcname, "extract") == 0 || strcmp(funcname, "encode_for_uri") == 0 || strcmp(funcname, "isblank") == 0 || strcmp(funcname, "isnumeric") == 0 || strcmp(funcname, "isliteral") == 0 || strcmp(funcname, "bnode") == 0 || strcmp(funcname, "lcase") == 0 || strcmp(funcname, "ucase") == 0 || strcmp(funcname, "strlen") == 0 || strcmp(funcname, "substr") == 0 || strcmp(funcname, "concat") == 0 || strcmp(funcname, "replace") == 0 || strcmp(funcname, "regex") == 0 || strcmp(funcname, "year") == 0 || strcmp(funcname, "month") == 0 || strcmp(funcname, "day") == 0 || strcmp(funcname, "hours") == 0 || strcmp(funcname, "minutes") == 0 || strcmp(funcname, "seconds") == 0 || strcmp(funcname, "timezone") == 0 || strcmp(funcname, "tz") == 0 || strcmp(funcname, "bound") == 0 || strcmp(funcname, "sameterm") == 0 || strcmp(funcname, "coalesce") == 0 || strcmp(funcname, "substring") == 0 || strcmp(funcname, "rdfnode_to_time") == 0 || strcmp(funcname, "rdfnode_to_timetz") == 0 || strcmp(funcname, "rdfnode_to_timestamp") == 0 || strcmp(funcname, "rdfnode_to_timestamptz") == 0 || strcmp(funcname, "rdfnode_to_boolean") == 0 || strcmp(funcname, "boolean_to_rdfnode") == 0; elog(DEBUG3, "%s exit: returning '%s'", __func__, !result ? "false" : "true"); return result; } /* * IsRDFStringLiteral * ------------------ * * Checks if an RDF term is a string literal (simple, xsd:string, or language-tagged). * Follows SPARQL 1.1 requirements for string literal inputs (e.g., LCASE, UCASE). * Returns 1 for valid string literals, 0 otherwise. Logs unexpected datatypes for * debugging, as derived string types (e.g., xsd:token) may appear in some datasets. * * str_datatype: Null-terminated C string from datatype() (e.g., "", "http://www.w3.org/2001/XMLSchema#string") * str_language: Null-terminated C string from lang() (e.g., "", "en") */ bool IsRDFStringLiteral(char *str) { elog(DEBUG3, "%s called: str='%s'", __func__, str); if (str == NULL) { elog(DEBUG3, "%s exit: returning 'false' (NULL argument)", __func__); return false; } if (strcmp(str, "") == 0 || strcmp(str, RDF_SIMPLE_LITERAL_DATATYPE) == 0 || strcmp(str, RDF_LANGUAGE_LITERAL_DATATYPE) == 0) { elog(DEBUG3, "%s exit: returning 'true'", __func__); return true; } elog(DEBUG3, "%s exit: returning 'false' (unsupported datatype '%s')", __func__, str); return false; } /* * CreateRegexString * --------------- * Escapes regex wildcards into normal characters by adding \\ to them * * str: string to be converted * * returns str with the regex wildcards escaped. */ char *CreateRegexString(char *str) { StringInfoData res; initStringInfo(&res); elog(DEBUG3, "%s called: str='%s'", __func__, str); if (!str) return NULL; for (int i = 0; str[i] != '\0'; i++) { char c = str[i]; if (i == 0 && c != '%' && c != '_' && c != '^') appendStringInfo(&res, "^"); if (strchr("/:=#@^()[]{}+-*$.?|", c) != NULL) appendStringInfo(&res, "\\\\%c", c); else if (c == '%') appendStringInfo(&res, ".*"); else if (c == '_') appendStringInfo(&res, "."); else if (c == '"') appendStringInfo(&res, "\\\""); else appendStringInfo(&res, "%c", c); if (i == strlen(str) - 1 && c != '%' && c != '_') appendStringInfo(&res, "$"); elog(DEBUG2, "%s loop => %c res => %s", __func__, str[i], NameStr(res)); } elog(DEBUG3, "%s exit: returning '%s'", __func__, NameStr(res)); return NameStr(res); } /* * FormatSQLExtractField * --------------- * The fields "years", "months" and "days" (plural) and "hour", "minute", * "second" (singular) are note supported in SPARQL, but PostgreSQL can * handle both. So here we convert the parameters to a form that correspond * to a SPARQL function. * * field: EXTRACT or DATE_PART field parameter * * returns formated field parameter (uppercase) */ char *FormatSQLExtractField(char *field) { char *res; elog(DEBUG3, "%s called: field='%s'", __func__, field); if (strcasecmp(field, "year") == 0 || strcasecmp(field, "years") == 0) res = "YEAR"; else if (strcasecmp(field, "month") == 0 || strcasecmp(field, "months") == 0) res = "MONTH"; else if (strcasecmp(field, "day") == 0 || strcasecmp(field, "days") == 0) res = "DAY"; else if (strcasecmp(field, "hour") == 0 || strcasecmp(field, "hours") == 0) res = "HOURS"; else if (strcasecmp(field, "minute") == 0 || strcasecmp(field, "minutes") == 0) res = "MINUTES"; else if (strcasecmp(field, "second") == 0 || strcasecmp(field, "seconds") == 0) res = "SECONDS"; else { elog(DEBUG3, "%s exit: returning NULL (field unknown)", __func__); return NULL; } elog(DEBUG3, "%s exit: returning '%s'", __func__, res); return res; } /* * ConstToCString * ----------------- * Extracts a string from a Const * * returns a palloc'ed copy. */ char *ConstToCString(Const *constant) { if (constant->constisnull) return NULL; else return text_to_cstring(DatumGetTextP(constant->constvalue)); } /* * CStringToConst * ----------------- * Extracts a Const from a char* * * returns Const from given string. */ Const *CStringToConst(const char *str) { if (str == NULL) return makeNullConst(TEXTOID, -1, InvalidOid); else return makeConst(TEXTOID, -1, InvalidOid, -1, PointerGetDatum(cstring_to_text(str)), false, false); } char *rdfnode_to_cstring(rdfnode *node) { /* Get a pointer to the actual data and its length */ char *data = VARDATA_ANY(node); int len = VARSIZE_ANY_EXHDR(node); /* Allocate a null-terminated C string */ char *result = palloc(len + 1); memcpy(result, data, len); result[len] = '\0'; return result; } /* * IsStringDataType * --------------- * Determines if a PostgreSQL data type is string or numeric type * so that we can know when to wrap the value with single quotes * or leave it as-is. * * type: PostgreSQL data type * * returns true if the data type needs to be wrapped with quotes * or false otherwise. */ bool IsStringDataType(Oid type) { bool result; if (type == RDFNODEOID) elog(DEBUG3, "%s called: type='(RDFNODEOID)'", __func__); else elog(DEBUG3, "%s called: type='%u'", __func__, type); result = type == TEXTOID || type == VARCHAROID || type == CHAROID || type == NAMEOID || type == DATEOID || type == TIMESTAMPOID || type == TIMESTAMPTZOID || type == NAMEOID || type == RDFNODEOID; elog(DEBUG3, "%s exit: returning '%s'", __func__, !result ? "false" : "true"); return result; } bool is_valid_xsd_double(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$"; if (pg_strcasecmp(lexical, "NaN") == 0 || pg_strcasecmp(lexical, "INF") == 0 || pg_strcasecmp(lexical, "-INF") == 0) return true; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:double"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } bool is_valid_xsd_int(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^-?[0-9]+$"; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:int"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } bool is_valid_xsd_dateTime(const char *lexical) { const char *p = lexical; int i; if (!lexical) return false; /* Parse year (4 digits) */ for (i = 0; i < 4; i++) { if (!isdigit((unsigned char)*p)) return false; p++; } /* Expect '-' */ if (*p != '-') return false; p++; /* Parse month (2 digits) */ for (i = 0; i < 2; i++) { if (!isdigit((unsigned char)*p)) return false; p++; } /* Expect '-' */ if (*p != '-') return false; p++; /* Parse day (2 digits) */ for (i = 0; i < 2; i++) { if (!isdigit((unsigned char)*p)) return false; p++; } /* Expect 'T' */ if (*p != 'T') return false; p++; /* Parse hour (2 digits) */ for (i = 0; i < 2; i++) { if (!isdigit((unsigned char)*p)) return false; p++; } /* Expect ':' */ if (*p != ':') return false; p++; /* Parse minute (2 digits) */ for (i = 0; i < 2; i++) { if (!isdigit((unsigned char)*p)) return false; p++; } /* Expect ':' */ if (*p != ':') return false; p++; /* Parse second (2 digits) */ for (i = 0; i < 2; i++) { if (!isdigit((unsigned char)*p)) return false; p++; } /* Optional: fractional seconds */ if (*p == '.') { p++; /* Must have at least one digit after decimal point */ if (!isdigit((unsigned char)*p)) return false; /* Continue reading all fractional digits */ while (isdigit((unsigned char)*p)) p++; } /* Optional: timezone */ if (*p == 'Z') { p++; } else if (*p == '+' || *p == '-') { p++; /* Timezone hour (2 digits) */ for (i = 0; i < 2; i++) { if (!isdigit((unsigned char)*p)) return false; p++; } /* Expect ':' */ if (*p != ':') return false; p++; /* Timezone minute (2 digits) */ for (i = 0; i < 2; i++) { if (!isdigit((unsigned char)*p)) return false; p++; } } /* Must be at end of string */ if (*p != '\0') return false; return true; } bool is_valid_xsd_time(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](\\.[0-9]+)?([+-][0-9]{2}:[0-9]{2}|Z)?$"; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:time"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } bool is_valid_xsd_date(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])([+-][0-9]{2}:[0-9]{2}|Z)?$"; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:date"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } /* * IsSPARQLVariableValid * --------------- * A query variable is marked by the use of either "?" or "$"; the "?" or * "$" is not part of the variable name. Valid characters for the name * are [a-z], [A-Z], [0-9] * * str: string to be evaluated * * returns true if the variable is valid or false otherwise */ bool IsSPARQLVariableValid(const char *str) { elog(DEBUG3, "%s called: str='%s'", __func__, str); if (str[0] != '?' && str[0] != '$') { elog(DEBUG3, "%s exit: returning 'false' (str does not start with '?' or '$')", __func__); return false; } for (int i = 1; str[i] != '\0'; i++) if (!isalnum(str[i]) && str[i] != '_') { elog(DEBUG3, "%s exit: returning 'false' (invalid variable name)", __func__); return false; } elog(DEBUG3, "%s exit: returning 'true'", __func__); return true; } /* * IsSPARQLParsable * ------------------ * Checks if a SPARQL query can be parsed and modified to accommodate possible * pusdhown instructions. If it returns false it does not mean that the query * is invalid. It just means that it contains unsupported clauses and it cannot * be modifed. * * state: SPARQL, SERVER and FOREIGN TABLE info * * returns 'true' if the SPARQL query is safe to be parsed or 'false' otherwise */ bool IsSPARQLParsable(struct RDFfdwState *state) { int keyword_count = 0; bool result; elog(DEBUG3, "%s called", __func__); /* * SPARQL Queries containing SUB SELECTS are not supported. So, if any number * other than 1 is returned from LocateKeyword, this query cannot be parsed. */ LocateKeyword(state->raw_sparql, "{\n\t> ", RDF_SPARQL_KEYWORD_SELECT, " *?\n\t", &keyword_count, 0); elog(DEBUG2, "%s: SPARQL contains '%d' SELECT clauses.", __func__, keyword_count); result = LocateKeyword(state->raw_sparql, " \n\t}", RDF_SPARQL_KEYWORD_GROUPBY, " \n\t?", NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(state->raw_sparql, " \n\t}", RDF_SPARQL_KEYWORD_ORDERBY, " \n\t?DA", NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(state->raw_sparql, " \n\t}", RDF_SPARQL_KEYWORD_LIMIT, " \n\t", NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(state->raw_sparql, " \n\t}", RDF_SPARQL_KEYWORD_MINUS, " \n\t{", NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(state->raw_sparql, " \n\t}", RDF_SPARQL_KEYWORD_UNION, " \n\t{", NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(state->raw_sparql, " \n\t", RDF_SPARQL_KEYWORD_HAVING, " \n\t(", NULL, 0) == RDF_KEYWORD_NOT_FOUND && keyword_count == 1; elog(DEBUG3, "%s exit: returning '%s'", __func__, !result ? "false" : "true"); return result; } /* * IsExpressionPushable * ------------ * Checks if an expression attached to a column can be pushed down, in case it * is used in a condition in the SQL WHERE clause. * * state: SPARQL, SERVER and FOREIGN TABLE info * * returns 'true' if the expression can be pushed down or 'false' otherwise */ bool IsExpressionPushable(char *expression) { char *open = " \n("; char *close = " \n("; bool result; elog(DEBUG3, "%s called: expression='%s'", __func__, expression); result = LocateKeyword(expression, open, RDF_SPARQL_AGGREGATE_FUNCTION_COUNT, close, NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(expression, open, RDF_SPARQL_AGGREGATE_FUNCTION_SUM, close, NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(expression, open, RDF_SPARQL_AGGREGATE_FUNCTION_AVG, close, NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(expression, open, RDF_SPARQL_AGGREGATE_FUNCTION_MIN, close, NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(expression, open, RDF_SPARQL_AGGREGATE_FUNCTION_MAX, close, NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(expression, open, RDF_SPARQL_AGGREGATE_FUNCTION_SAMPLE, close, NULL, 0) == RDF_KEYWORD_NOT_FOUND && LocateKeyword(expression, open, RDF_SPARQL_AGGREGATE_FUNCTION_GROUPCONCAT, close, NULL, 0) == RDF_KEYWORD_NOT_FOUND; elog(DEBUG3, "%s exit: returning '%s'", __func__, !result ? "false" : "true"); return result; } /* * LocateKeyword * ----------- * This function locates the first occurrence of given 'keyword' within 'str'. The keywords * must be wrapped with one of the characters given in 'start_chars' and end_chars'. If * the parameter '*count' is used, this function will be called recursively to count how * many times the searched 'keyword' can be found in 'str' * * str : string where 'keyword' will be searched * start_chars : all possible chars that can preceed the searched 'keyword' * keyword : the searched keyword (case insensitive) * end_chars : all possible chars that can be found after the 'keyword' * count : how many times 'keyword' was found in 'str' (nullable) * start_position : position in 'str' where the function has to start looking for * 'keyword'. Set it to '0' if the whole 'str' must be considered. * * returns : position where 'keyword' was found, or RDF_KEYWORD_NOT_FOUND otherwise. */ int LocateKeyword(char *str, char *start_chars, char *keyword, char *end_chars, int *count, int start_position) { int keyword_position = RDF_KEYWORD_NOT_FOUND; StringInfoData idt; initStringInfo(&idt); if (count) { for (size_t i = 0; i < *count; i++) { appendStringInfo(&idt, " "); } if (*count > 0) appendStringInfo(&idt, "├─ "); } elog(DEBUG2, "%s%s called: searching '%s' in start_position %d", NameStr(idt), __func__, keyword, start_position); if (start_position < 0) elog(ERROR, "%s%s: start_position cannot be negative.", NameStr(idt), __func__); /* * Some SPARQL keywords can be placed in the very beginning of a query, so they not always * have a preceeding character. So here we first check if the searched keyword exists * in the beginning of the string. */ if (((strcasecmp(keyword, RDF_SPARQL_KEYWORD_SELECT) == 0 && strncasecmp(str, RDF_SPARQL_KEYWORD_SELECT, strlen(RDF_SPARQL_KEYWORD_SELECT)) == 0) || (strcasecmp(keyword, RDF_SPARQL_KEYWORD_PREFIX) == 0 && strncasecmp(str, RDF_SPARQL_KEYWORD_PREFIX, strlen(RDF_SPARQL_KEYWORD_PREFIX)) == 0) || (strcasecmp(keyword, RDF_SPARQL_KEYWORD_DESCRIBE) == 0 && strncasecmp(str, RDF_SPARQL_KEYWORD_DESCRIBE, strlen(RDF_SPARQL_KEYWORD_DESCRIBE)) == 0)) && start_position == 0) { elog(DEBUG2, "%s%s: nothing before SELECT. Setting keyword_position to 0.", NameStr(idt), __func__); keyword_position = 0; } else { for (int i = 0; i < strlen(start_chars); i++) { for (int j = 0; j < strlen(end_chars); j++) { char *el; StringInfoData eval_token; initStringInfo(&eval_token); appendStringInfo(&eval_token, "%c%s%c", start_chars[i], keyword, end_chars[j]); el = strcasestr(str + start_position, eval_token.data); if (el != NULL) { int nquotes = 0; for (int k = 0; k <= (el - str); k++) { if (str[k] == '\"') nquotes++; } /* * If the keyword is located after an opening double-quote it is a literal and should * not be considered as a keyword. */ if (nquotes % 2 != 1) keyword_position = el - str; if (keyword_position != RDF_KEYWORD_NOT_FOUND) break; } } } } if ((count) && keyword_position != RDF_KEYWORD_NOT_FOUND) { (*count)++; elog(DEBUG2, "%s%s (%d): keyword '%s' found in position %d. Recalling %s ... ", NameStr(idt), __func__, *count, keyword, keyword_position, __func__); LocateKeyword(str, start_chars, keyword, end_chars, count, keyword_position + 1); elog(DEBUG2, "%s%s: '%s' search returning postition %d for start position %d", NameStr(idt), __func__, keyword, keyword_position, start_position); } elog(DEBUG2, "%s exit: returning '%d' (keyword_position)", __func__, keyword_position); return keyword_position; } /* * CheckURL * -------- * CheckS if an URL is valid. * * url: URL to be validated. * * returns REQUEST_SUCCESS or REQUEST_FAIL */ int CheckURL(char *url) { CURLUcode code; CURLU *handler = curl_url(); elog(DEBUG3, "%s called: '%s'", __func__, url); code = curl_url_set(handler, CURLUPART_URL, url, 0); curl_url_cleanup(handler); elog(DEBUG2, " %s handler return code: %u", __func__, code); if (code != 0) { elog(DEBUG2, "%s: invalid URL (%u) > '%s'", __func__, code, url); return code; } elog(DEBUG3, "%s exit: returning '%d' (REQUEST_SUCCESS)", __func__, REQUEST_SUCCESS); return REQUEST_SUCCESS; } /* * ValidateSPARQLUpdatePattern * ---------------------------- * * Validates the sparql_update_pattern to ensure it is suitable * for INSERT operations: * 1. Contains at least one valid triple pattern (subject, predicate, * and object) * 2. All SPARQL variables have corresponding table columns with * matching variable options * * This prevents empty or invalid patterns from generating malformed * SPARQL UPDATE statements. * * Throws an ERROR if: * - The pattern is empty or contains no valid triple patterns * - A variable in the pattern has no matching column */ void ValidateSPARQLUpdatePattern(RDFfdwState *state) { const char *pos; const char *pattern = state->sparql_update_pattern; bool has_triple = false; /* Check for at least one valid triple pattern (must have at least 3 components) */ { const char *p = pattern; int component_count = 0; bool in_uri = false; bool in_literal = false; bool in_var = false; while (*p) { /* Skip whitespace between components */ if (isspace((unsigned char)*p)) { if (in_var) { component_count++; in_var = false; } p++; continue; } /* Handle URIs <...> */ if (*p == '<') { in_uri = true; p++; continue; } if (in_uri && *p == '>') { in_uri = false; component_count++; p++; continue; } if (in_uri) { p++; continue; } /* Handle literals "..." */ if (*p == '"' && !in_literal) { in_literal = true; p++; continue; } if (*p == '"' && in_literal && (p == pattern || *(p - 1) != '\\')) { in_literal = false; component_count++; /* Skip language tags or datatypes */ p++; if (*p == '@' || (*p == '^' && *(p + 1) == '^')) { while (*p && !isspace((unsigned char)*p) && *p != '.') p++; } continue; } if (in_literal) { p++; continue; } /* Handle variables ?var or $var */ if ((*p == '?' || *p == '$') && !in_var) { in_var = true; p++; continue; } if (in_var) { if (!isalnum((unsigned char)*p) && *p != '_') { component_count++; in_var = false; /* Don't increment p, re-process this character */ continue; } p++; continue; } /* Handle triple terminator */ if (*p == '.') { if (in_var) { component_count++; in_var = false; } if (component_count >= 3) { has_triple = true; break; } /* Reset for next potential triple */ component_count = 0; p++; continue; } /* Other characters (bare words, prefixed names like ex:Thing) */ if (isalnum((unsigned char)*p) || *p == ':' || *p == '_') { /* Scan to end of token */ while (*p && (isalnum((unsigned char)*p) || *p == ':' || *p == '_' || *p == '-')) p++; component_count++; continue; } /* Unknown character, skip */ p++; } /* Check if we ended with a variable */ if (in_var) component_count++; /* Final check: did we accumulate at least 3 components? */ if (component_count >= 3) has_triple = true; } if (!has_triple) { ereport(ERROR, (errcode(ERRCODE_FDW_UNABLE_TO_CREATE_EXECUTION), errmsg("'%s' contains no valid triple patterns", RDF_TABLE_OPTION_SPARQL_UPDATE_PATTERN), errhint("A triple pattern requires at least three components (subject, predicate, object), e.g., '?s ?p ?o .'"))); } /* Check that all variables in template have corresponding columns */ pos = pattern; while ((pos = strchr(pos, '?')) != NULL) { StringInfoData var_name; bool found = false; int j = 0; /* Extract variable name (alphanumeric after ?) */ initStringInfo(&var_name); pos++; /* Skip the ? */ while (isalnum((unsigned char)pos[j]) || pos[j] == '_') { appendStringInfoChar(&var_name, pos[j]); j++; } if (var_name.len > 0) { /* Check if a column maps to this variable */ for (int k = 0; k < state->numcols; k++) { if (state->rdfTable->cols[k]->sparqlvar) { /* Build the full variable name with ? prefix for comparison */ StringInfoData full_var; initStringInfo(&full_var); appendStringInfoString(&full_var, "?"); appendStringInfoString(&full_var, var_name.data); if (strcmp(state->rdfTable->cols[k]->sparqlvar, full_var.data) == 0) { found = true; pfree(full_var.data); break; } pfree(full_var.data); } } /* Report error immediately if variable not found */ if (!found) { ereport(ERROR, (errcode(ERRCODE_FDW_UNABLE_TO_CREATE_EXECUTION), errmsg("SPARQL variable '?%s' in '%s' is not mapped to any table column", var_name.data, RDF_TABLE_OPTION_SPARQL_UPDATE_PATTERN))); } } pfree(var_name.data); pos += j; } } /* * str_replace * ----------- * Replace all occurrences of 'search' with 'replace' in 'source'. * Returns a newly allocated string. * * source : the original string * search : the substring to search for * replace : the replacement string * * returns a new string with replacements made */ char *str_replace(const char *source, const char *search, const char *replace) { StringInfoData result; const char *pos = source; const char *found; size_t search_len = strlen(search); size_t replace_len = strlen(replace); initStringInfo(&result); while ((found = strstr(pos, search)) != NULL) { /* Append everything before the match */ appendBinaryStringInfo(&result, pos, found - pos); /* Append the replacement */ appendBinaryStringInfo(&result, replace, replace_len); /* Move past the match */ pos = found + search_len; } /* Append any remaining text */ appendStringInfoString(&result, pos); return result.data; } /* * EscapeSPARQLLiteral * ------------------- * * Escapes special characters in an RDF literal for use in SPARQL UPDATE operations. * Handles newlines, carriage returns, and tabs that are stored as actual bytes in * PostgreSQL but must be represented as escape sequences in SPARQL. * * This function is specifically for INSERT/DELETE operations where the rdfnode * output may contain actual newline bytes (0x0A) that need to be converted to * the SPARQL escape sequence "\n". * * input: RDF literal string (e.g., "Line1\nLine2"@en where \n is byte 0x0A) * * returns: SPARQL-safe literal (e.g., "Line1\\nLine2"@en where \\n is two chars) */ char *EscapeSPARQLLiteral(const char *input) { StringInfoData result; const char *ptr; const char *closing_quote = NULL; const char *lang_or_type = NULL; if (!input || strlen(input) == 0) return (char *)input; /* Check if this is an IRI - no escaping needed */ if (input[0] == '<') return (char *)input; /* Check if this is a quoted literal */ if (input[0] != '"') return (char *)input; initStringInfo(&result); /* Find the closing quote (not preceded by backslash) */ ptr = input + 1; /* skip opening quote */ while (*ptr != '\0') { if (*ptr == '"' && (ptr == input + 1 || *(ptr - 1) != '\\')) { closing_quote = ptr; /* Check what follows the closing quote */ if (ptr[1] == '@' || (ptr[1] == '^' && ptr[2] == '^') || ptr[1] == '\0') { lang_or_type = ptr + 1; /* Point to @lang or ^^type or end of string */ break; } } ptr++; } if (!closing_quote) { /* Malformed literal - return as-is */ return (char *)input; } /* Escape the content between quotes */ appendStringInfoChar(&result, '"'); /* opening quote */ ptr = input + 1; /* reset to start of content */ while (ptr < closing_quote) { switch (*ptr) { case '\n': appendStringInfoString(&result, "\\n"); break; case '\r': appendStringInfoString(&result, "\\r"); break; case '\t': appendStringInfoString(&result, "\\t"); break; default: appendStringInfoChar(&result, *ptr); break; } ptr++; } /* Add closing quote */ appendStringInfoChar(&result, '"'); /* Add language tag or datatype if present */ if (lang_or_type && *lang_or_type != '\0') { appendStringInfoString(&result, lang_or_type); } return result.data; }