/*---------------------------------------------------------------------
 *
 * rdf_utils.c
 *   Utility functions for RDF data manipulation and validation.
 *
 * Copyright (C) 2022-2025 University of Münster, Germany
 *
 *---------------------------------------------------------------------
 */

#include "postgres.h"

#include "rdf_fdw.h"
#include "rdf_utils.h"
#include "rdfnode.h"
#include "sparql.h"

#include "lib/stringinfo.h"
#include "utils/builtins.h"
#if PG_VERSION_NUM >= 100000
#include "utils/varlena.h"
#endif
#include "access/htup_details.h"
#include "catalog/pg_type.h"
#include "mb/pg_wchar.h"
#include "nodes/makefuncs.h"

/*
 * NOTE(review): the names of the three system headers were lost (the file
 * showed three bare "#include" directives). They are restored here from the
 * identifiers this file uses: isspace/isxdigit/isalnum, regcomp/regexec and
 * the str* functions. Confirm against the upstream file.
 */
#include <ctype.h>
#include <regex.h>
#include <string.h>

/* Type mapping table for PostgreSQL types to XSD datatypes */
static const TypeXSDMap type_map[] = {
	{INT2OID, "integer"},
	{INT4OID, "integer"},
	{INT8OID, "integer"},
	{NUMERICOID, "decimal"},
	{FLOAT8OID, "double"},
	{FLOAT4OID, "float"},
	{BOOLOID, "boolean"},
	{TIMESTAMPOID, "dateTime"},
	{DATEOID, "date"},
	{TIMEOID, "time"},
	{TEXTOID, "string"},
	{NAMEOID, "string"},
	{TIMESTAMPTZOID, "dateTime"},
	{InvalidOid, NULL}			/* sentinel: end of table */
};

/*
 * ContainsWhitespaces
 * ---------------
 * Checks if a string contains whitespaces
 *
 * str: string to be evaluated (a NULL pointer is treated as "no whitespace")
 *
 * returns true if the string contains whitespaces or false otherwise
 */
bool ContainsWhitespaces(char *str)
{
	elog(DEBUG1, "%s called: str='%s'", __func__, str ? str : "(null)");

	if (str != NULL)
	{
		for (const char *p = str; *p != '\0'; p++)
		{
			/* <ctype.h> functions require a value representable as unsigned char */
			if (isspace((unsigned char) *p))
			{
				elog(DEBUG1, "%s exit: returning 'true'", __func__);
				return true;
			}
		}
	}

	elog(DEBUG1, "%s exit: returning 'false'", __func__);
	return false;
}

/*
 * is_valid_language_tag
 * ----------------------
 * Validates language tags according to the pattern: [a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*
 * Examples: "en", "en-US", "de-DE"
 *
 * lan: language tag to be evaluated (a NULL pointer is never valid)
 *
 * returns true if 'lan' matches the pattern or false otherwise. Raises an
 * ERROR if the pattern itself cannot be compiled.
 */
bool is_valid_language_tag(const char *lan)
{
	static const char pattern[] = "^[a-zA-Z]{1,8}(-[a-zA-Z0-9]{1,8})*$";
	regex_t		regex;
	bool		is_valid;

	if (lan == NULL)
		return false;

	/* only a yes/no answer is needed, so sub-match bookkeeping is disabled */
	if (regcomp(&regex, pattern, REG_EXTENDED | REG_NOSUB) != 0)
		ereport(ERROR, (errmsg("could not compile regex for language tag")));

	is_valid = (regexec(&regex, lan, 0, NULL, 0) == 0);
	regfree(&regex);

	return is_valid;
}

/*
 * isPlainLiteral
 * --------------
 * Checks if a literal is a plain literal (no language tag or datatype).
 *
 * literal: Null-terminated C string representing an RDF literal
 *
 * returns true if both lang() and datatype() yield an empty string.
 */
bool isPlainLiteral(char *literal)
{
	return strlen(lang(literal)) == 0 && strlen(datatype(literal)) == 0;
}

/*
 * Returns true if 'dt' (as returned by datatype()) denotes a simple literal,
 * i.e. it is empty or equal to one of the xsd:string spellings.
 */
static bool rdf_datatype_is_plain_string(const char *dt)
{
	return dt[0] == '\0' ||
		strcmp(dt, RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED) == 0 ||
		strcmp(dt, RDF_SIMPLE_LITERAL_DATATYPE) == 0;
}

/*
 * LiteralsCompatible
 * ------------------
 *
 * Determines if two RDF literals are compatible according to SPARQL rules.
 * Compatibility is based on language tags and datatypes. The literals are
 * compatible if
 *   - both are simple literals or xsd:string, or
 *   - both carry identical language tags, or
 *   - literal1 carries a language tag and literal2 is a simple literal or
 *     xsd:string.
 * The last rule is not symmetric: a simple literal1 combined with a
 * language-tagged literal2 is reported as incompatible, and so is every other
 * combination (e.g., one with a datatype and the other with a language tag).
 *
 * literal1: Null-terminated C string representing an RDF literal (e.g., "abc"@en, "123"^^xsd:integer)
 * literal2: Null-terminated C string representing an RDF literal (e.g., "def", "456"^^xsd:string)
 *
 * returns: C boolean (true if literals are compatible, false otherwise;
 *          false as well if either argument is NULL)
 */
bool LiteralsCompatible(char *literal1, char *literal2)
{
	char	   *lang1;
	char	   *lang2;
	char	   *dt1;
	char	   *dt2;
	bool		tagged1;
	bool		tagged2;

	elog(DEBUG1, "%s called: literal1='%s', literal2='%s'", __func__,
		 literal1 ? literal1 : "(null)", literal2 ? literal2 : "(null)");

	/* must run before lang()/datatype() are handed the pointers */
	if (!literal1 || !literal2)
	{
		elog(DEBUG1, "%s exit: returning 'false' (one of the arguments is NULL)", __func__);
		return false;
	}

	lang1 = lang(literal1);
	lang2 = lang(literal2);
	dt1 = datatype(literal1);
	dt2 = datatype(literal2);

	tagged1 = lang1[0] != '\0';
	tagged2 = lang2[0] != '\0';

	/* TODO: check if RDF_SIMPLE_LITERAL_DATATYPE_PREFIXED is needed, as the prefix is expanded elsewhere */

	/* both simple literals or xsd:string */
	if (!tagged1 && !tagged2 &&
		rdf_datatype_is_plain_string(dt1) && rdf_datatype_is_plain_string(dt2))
	{
		elog(DEBUG1, "%s exit: returning 'true' (both simple literals or xsd:string)", __func__);
		return true;
	}

	/* both plain literals with identical language tags */
	if (tagged1 && tagged2 && strcmp(lang1, lang2) == 0)
	{
		elog(DEBUG1, "%s exit: returning 'true' (both plain literals with identical language tags)", __func__);
		return true;
	}

	/* arg1 has language tag, arg2 is simple or xsd:string */
	if (tagged1 && !tagged2 && rdf_datatype_is_plain_string(dt2))
	{
		elog(DEBUG1, "%s exit: returning 'true' (arg1 has language tag, arg2 is simple or xsd:string)", __func__);
		return true;
	}

	/* incompatible otherwise (e.g., arg1 xsd:string, arg2 language-tagged) */
	elog(DEBUG1, "%s exit: returning 'false' (incompatible)", __func__);
	return false;
}

/*
 * cstring_to_rdfliteral
 * ---------------------
 *
 * Converts a raw string input into a valid RDF literal by adding quotes and escaping
 * internal quotes as needed. If the input is already a complete RDF literal (i.e.,
 * quoted with a language tag or datatype), it is returned unchanged.
 *
 * input: the raw string or partial literal to convert (e.g., "abc", "abc"@en, "ab\"c")
 *
 * returns: a string representing the RDF literal (e.g., "\"abc\"", "\"ab\\\"c\"")
 *          or the input as-is if already a complete literal.
*/
char *cstring_to_rdfliteral(char *input)
{
	StringInfoData buf;
	const char *end;
	size_t		len;

	elog(DEBUG1, "%s called: input='%s'", __func__, input ? input : "(null)");

	/*
	 * Empty or missing input becomes the empty literal. This is tested before
	 * anything else touches 'input', and a palloc'd copy is returned so the
	 * caller never receives a pointer to read-only storage.
	 */
	if (input == NULL || input[0] == '\0')
	{
		elog(DEBUG1, "%s exit: returning empty literal '\"\"'", __func__);
		return pstrdup("\"\"");
	}

	/* return the string as-is if the input is an IRI */
	if (isIRI(input))
		return input;

	len = strlen(input);

	/*
	 * Check if it's already a complete RDF literal: an opening quote plus a
	 * later quote that is directly followed by '@' (language tag) or "^^"
	 * (datatype). Searching for the quote-plus-marker pair instead of the
	 * first '@' keeps literals whose content contains '@' or "^^" (e.g.
	 * "user@example.org"^^xsd:string) from being quoted a second time.
	 */
	if (input[0] == '"' && len > 1 &&
		(strstr(input + 1, "\"@") != NULL || strstr(input + 1, "\"^^") != NULL))
	{
		elog(DEBUG1, "%s exit: returning => '%s'", __func__, input);
		return input;
	}

	/* not a complete literal, treat everything as raw content */
	initStringInfo(&buf);
	end = input + len;

	/* add opening quote */
	appendStringInfoChar(&buf, '"');

	for (const char *cursor = input; cursor < end; cursor++)
	{
		/* escape a double quote unless a backslash directly precedes it */
		if (*cursor == '"' && (cursor == input || cursor[-1] != '\\'))
			appendStringInfoChar(&buf, '\\');

		appendStringInfoChar(&buf, *cursor);
	}

	/* add closing quote */
	appendStringInfoChar(&buf, '"');

	elog(DEBUG1, "%s exit: returning => '%s'", __func__, buf.data);
	return buf.data;
}

/*
 * ExpandDatatypePrefix
 * --------------------
 *
 * Expands a datatype prefix (e.g., "xsd:") to its full URI form if recognized.
 * Strips angle brackets (< >) from input before processing. Supports "xsd:" mapped
 * to "http://www.w3.org/2001/XMLSchema#". Returns the input as-is (without < >)
 * for other prefixed or bare datatypes, assuming prefix resolution elsewhere.
* * str: Null-terminated C string representing a datatype (e.g., "xsd:string", "") * * returns: Null-terminated C string, expanded for "xsd:" or stripped/as-is otherwise */ char *ExpandDatatypePrefix(char *str) { StringInfoData buf; const char *xsd_prefix = "xsd:"; char *stripped_str = str; size_t len; elog(DEBUG1, "%s called: str='%s'", __func__, str); if (!str || strlen(str) == 0) return ""; /* Empty input returns empty string */ len = strlen(str); /* Strip < > if present */ if (str[0] == '<' && str[len - 1] == '>') { stripped_str = palloc(len - 1); /* allocate space for stripped string (len - 2 + null terminator) */ strncpy(stripped_str, str + 1, len - 2); stripped_str[len - 2] = '\0'; /* NULL-terminate */ } /* Check for 'xsd:' prefix and expand it */ if (strncmp(stripped_str, xsd_prefix, strlen(xsd_prefix)) == 0 && strlen(stripped_str) > strlen(xsd_prefix)) { const char *suffix = stripped_str + strlen(xsd_prefix); /* get part after "xsd:" */ initStringInfo(&buf); appendStringInfoChar(&buf, '<'); /* open bracket */ appendStringInfoString(&buf, RDF_XSD_BASE_URI); /* add XSD URI */ appendStringInfoString(&buf, suffix); /* add suffix */ appendStringInfoChar(&buf, '>'); /* close bracket */ if (stripped_str != str) pfree(stripped_str); elog(DEBUG1, "%s exit: returning '%s'", __func__, buf.data); return buf.data; } /* return stripped string (or original if no stripping) without < > */ if (stripped_str != str) { initStringInfo(&buf); appendStringInfoString(&buf, stripped_str); pfree(stripped_str); elog(DEBUG1, "%s exit: returning '%s'", __func__, buf.data); return buf.data; } elog(DEBUG1, "%s exit: returning '%s'", __func__, str); return str; } /* * MapSPARQLDatatype * ----------------- * Maps PostgreSQL type OIDs to their corresponding XSD datatype strings. 
*/ char *MapSPARQLDatatype(Oid pgtype) { elog(DEBUG1, "%s called: input='%u'", __func__, pgtype); for (int i = 0; type_map[i].type_oid != InvalidOid; i++) { if (pgtype == type_map[i].type_oid) { elog(DEBUG1, "%s exit: returning => '%s'", __func__, (char *)type_map[i].xsd_datatype); return (char *)type_map[i].xsd_datatype; } } elog(DEBUG1, "%s exit: returning NULL (unsupported type)", __func__); return NULL; } #if PG_VERSION_NUM < 130000 void pg_unicode_to_server(pg_wchar c, unsigned char *utf8) { unsigned char utf8buf[8]; /* Large enough for UTF-8 encoding */ int len; unsigned char *converted; /* Convert Unicode code point to UTF-8 */ if (unicode_to_utf8(c, utf8buf) == NULL) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid Unicode code point: 0x%04x", c))); len = pg_utf_mblen(utf8buf); /* Get the length of the encoded UTF-8 character */ if (GetDatabaseEncoding() == PG_UTF8) { memcpy(utf8, utf8buf, len); } else { converted = pg_do_encoding_conversion(utf8buf, len, PG_UTF8, GetDatabaseEncoding()); if (converted == NULL) ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("Unicode character 0x%04x cannot be converted to server encoding \"%s\"", c, GetDatabaseEncodingName()))); memcpy(utf8, converted, strlen((const char *)converted)); } utf8[len] = '\0'; /* Null-terminate (safe if utf8 has size ≥ 5) */ } #endif char *unescape_unicode(const char *input) { StringInfoData buf; initStringInfo(&buf); elog(DEBUG2, "%s: Input='%s'", __func__, input); for (const char *p = input; *p;) { if (p[0] == '\\' && p[1] == 'u') { /* \uXXXX (exactly 4 hex digits) */ if (p[2] && p[3] && p[4] && p[5] && isxdigit(p[2]) && isxdigit(p[3]) && isxdigit(p[4]) && isxdigit(p[5]) && (!p[6] || !isxdigit(p[6]))) { uint16_t codeunit; char hex[5]; unsigned char utf8[5]; int len; memcpy(hex, p + 2, 4); hex[4] = '\0'; sscanf(hex, "%hx", &codeunit); elog(DEBUG2, "%s: Parsed \\u%s to codeunit U+%04X", __func__, hex, codeunit); /* Check for high surrogate */ if 
(codeunit >= 0xD800 && codeunit <= 0xDBFF && p[6] == '\\' && p[7] == 'u' && p[8] && p[9] && p[10] && p[11] && isxdigit(p[8]) && isxdigit(p[9]) && isxdigit(p[10]) && isxdigit(p[11]) && (!p[12] || !isxdigit(p[12]))) { uint16_t low; char lowhex[5]; uint32_t full; memcpy(lowhex, p + 8, 4); lowhex[4] = '\0'; sscanf(lowhex, "%hx", &low); if (low >= 0xDC00 && low <= 0xDFFF) { full = 0x10000 + (((codeunit - 0xD800) << 10) | (low - 0xDC00)); elog(DEBUG2, "%s: Surrogate pair U+%04X U+%04X -> U+%X", __func__, codeunit, low, full); memset(utf8, 0, sizeof(utf8)); pg_unicode_to_server(full, (unsigned char *)utf8); len = pg_utf_mblen((const unsigned char *)utf8); appendBinaryStringInfo(&buf, (const char *)utf8, len); p += 12; continue; } } if (codeunit >= 0xD800 && codeunit <= 0xDFFF) { elog(DEBUG2, "%s: Lone surrogate U+%04X -> U+FFFD", __func__, codeunit); pg_unicode_to_server(0xFFFD, (unsigned char *)utf8); len = pg_utf_mblen(utf8); appendBinaryStringInfo(&buf, (const char *)utf8, len); p += 6; continue; } memset(utf8, 0, sizeof(utf8)); pg_unicode_to_server(codeunit, (unsigned char *)utf8); len = pg_utf_mblen(utf8); appendBinaryStringInfo(&buf, (const char *)utf8, len); p += 6; continue; } else { elog(DEBUG2, "%s: Invalid \\u sequence at '%s' -> literal", __func__, p); appendStringInfoString(&buf, "\\u"); p += 2; for (int i = 0; i < 4 && p[0] && isxdigit(p[0]); i++) appendStringInfoChar(&buf, *p++); continue; } } else if (p[0] == '\\' && p[1] == 'U') { /* \UXXXXXXXX (exactly 8 hex digits) */ if (p[2] && p[3] && p[4] && p[5] && p[6] && p[7] && p[8] && p[9] && isxdigit(p[2]) && isxdigit(p[3]) && isxdigit(p[4]) && isxdigit(p[5]) && isxdigit(p[6]) && isxdigit(p[7]) && isxdigit(p[8]) && isxdigit(p[9]) && (!p[10] || !isxdigit(p[10]))) { char hex[9]; uint32_t codepoint; unsigned char utf8[5]; int len; memcpy(hex, p + 2, 8); hex[8] = '\0'; sscanf(hex, "%x", &codepoint); elog(DEBUG2, "%s: Parsed \\U%s to codepoint U+%X", __func__, hex, codepoint); if (codepoint > 0x10FFFF || (codepoint 
>= 0xD800 && codepoint <= 0xDFFF)) { elog(DEBUG2, "%s: Invalid codepoint U+%X -> U+FFFD", __func__, codepoint); codepoint = 0xFFFD; } memset(utf8, 0, sizeof(utf8)); pg_unicode_to_server(codepoint, utf8); len = pg_utf_mblen(utf8); appendBinaryStringInfo(&buf, (const char *)utf8, len); p += 10; continue; } else { elog(DEBUG2, "%s: Invalid \\U sequence at '%s' -> literal", __func__, p); appendStringInfoString(&buf, "\\U"); p += 2; for (int i = 0; i < 8 && p[0] && isxdigit(p[0]); i++) appendStringInfoChar(&buf, *p++); continue; } } else { /* Preserve all other characters, including \t, \n, \", etc. */ appendStringInfoChar(&buf, *p++); } } elog(DEBUG2, "%s: Output='%s'", __func__, buf.data); return buf.data; } /* * IsFunctionPushable * --------------- * Check if a PostgreSQL function can be pushed down. * * funcname: name of the PostgreSQL function * * returns true if the function can be pushed down or false otherwise */ bool IsFunctionPushable(char *funcname) { bool result; elog(DEBUG1, "%s called: funcname='%s'", __func__, funcname); result = strcmp(funcname, "abs") == 0 || strcmp(funcname, "ceil") == 0 || strcmp(funcname, "floor") == 0 || strcmp(funcname, "round") == 0 || strcmp(funcname, "upper") == 0 || strcmp(funcname, "lower") == 0 || strcmp(funcname, "length") == 0 || strcmp(funcname, "md5") == 0 || strcmp(funcname, "starts_with") == 0 || strcmp(funcname, "strstarts") == 0 || strcmp(funcname, "strends") == 0 || strcmp(funcname, "strbefore") == 0 || strcmp(funcname, "strafter") == 0 || strcmp(funcname, "strlang") == 0 || strcmp(funcname, "langmatches") == 0 || strcmp(funcname, "strdt") == 0 || strcmp(funcname, "str") == 0 || strcmp(funcname, "iri") == 0 || strcmp(funcname, "isiri") == 0 || strcmp(funcname, "lang") == 0 || strcmp(funcname, "datatype") == 0 || strcmp(funcname, "contains") == 0 || strcmp(funcname, "extract") == 0 || strcmp(funcname, "encode_for_uri") == 0 || strcmp(funcname, "isblank") == 0 || strcmp(funcname, "isnumeric") == 0 || strcmp(funcname, 
"isliteral") == 0 || strcmp(funcname, "bnode") == 0 || strcmp(funcname, "lcase") == 0 || strcmp(funcname, "ucase") == 0 || strcmp(funcname, "strlen") == 0 || strcmp(funcname, "substr") == 0 || strcmp(funcname, "concat") == 0 || strcmp(funcname, "replace") == 0 || strcmp(funcname, "regex") == 0 || strcmp(funcname, "year") == 0 || strcmp(funcname, "month") == 0 || strcmp(funcname, "day") == 0 || strcmp(funcname, "hours") == 0 || strcmp(funcname, "minutes") == 0 || strcmp(funcname, "seconds") == 0 || strcmp(funcname, "timezone") == 0 || strcmp(funcname, "tz") == 0 || strcmp(funcname, "bound") == 0 || strcmp(funcname, "sameterm") == 0 || strcmp(funcname, "coalesce") == 0 || strcmp(funcname, "substring") == 0 || strcmp(funcname, "rdfnode_to_time") == 0 || strcmp(funcname, "rdfnode_to_timetz") == 0 || strcmp(funcname, "rdfnode_to_timestamp") == 0 || strcmp(funcname, "rdfnode_to_timestamptz") == 0 || strcmp(funcname, "rdfnode_to_boolean") == 0 || strcmp(funcname, "boolean_to_rdfnode") == 0; elog(DEBUG1, "%s exit: returning '%s'", __func__, !result ? "false" : "true"); return result; } /* * IsRDFStringLiteral * ------------------ * * Checks if an RDF term is a string literal (simple, xsd:string, or language-tagged). * Follows SPARQL 1.1 requirements for string literal inputs (e.g., LCASE, UCASE). * Returns 1 for valid string literals, 0 otherwise. Logs unexpected datatypes for * debugging, as derived string types (e.g., xsd:token) may appear in some datasets. 
* * str_datatype: Null-terminated C string from datatype() (e.g., "", "http://www.w3.org/2001/XMLSchema#string") * str_language: Null-terminated C string from lang() (e.g., "", "en") */ bool IsRDFStringLiteral(char *str) { elog(DEBUG1, "%s called: str='%s'", __func__, str); if (str == NULL) { elog(DEBUG1, "%s exit: returning 'false' (NULL argument)", __func__); return false; } if (strcmp(str, "") == 0 || strcmp(str, RDF_SIMPLE_LITERAL_DATATYPE) == 0 || strcmp(str, RDF_LANGUAGE_LITERAL_DATATYPE) == 0) { elog(DEBUG1, "%s exit: returning 'true'", __func__); return true; } elog(DEBUG1, "%s exit: returning 'false' (unsupported datatype '%s')", __func__, str); return false; } /* * CreateRegexString * --------------- * Escapes regex wildcards into normal characters by adding \\ to them * * str: string to be converted * * returns str with the regex wildcards escaped. */ char *CreateRegexString(char *str) { StringInfoData res; initStringInfo(&res); elog(DEBUG1, "%s called: str='%s'", __func__, str); if (!str) return NULL; for (int i = 0; str[i] != '\0'; i++) { char c = str[i]; if (i == 0 && c != '%' && c != '_' && c != '^') appendStringInfo(&res, "^"); if (strchr("/:=#@^()[]{}+-*$.?|", c) != NULL) appendStringInfo(&res, "\\\\%c", c); else if (c == '%') appendStringInfo(&res, ".*"); else if (c == '_') appendStringInfo(&res, "."); else if (c == '"') appendStringInfo(&res, "\\\""); else appendStringInfo(&res, "%c", c); if (i == strlen(str) - 1 && c != '%' && c != '_') appendStringInfo(&res, "$"); elog(DEBUG2, "%s loop => %c res => %s", __func__, str[i], NameStr(res)); } elog(DEBUG1, "%s exit: returning '%s'", __func__, NameStr(res)); return NameStr(res); } /* * FormatSQLExtractField * --------------- * The fields "years", "months" and "days" (plural) and "hour", "minute", * "second" (singular) are note supported in SPARQL, but PostgreSQL can * handle both. So here we convert the parameters to a form that correspond * to a SPARQL function. 
* * field: EXTRACT or DATE_PART field parameter * * returns formated field parameter (uppercase) */ char *FormatSQLExtractField(char *field) { char *res; elog(DEBUG1, "%s called: field='%s'", __func__, field); if (strcasecmp(field, "year") == 0 || strcasecmp(field, "years") == 0) res = "YEAR"; else if (strcasecmp(field, "month") == 0 || strcasecmp(field, "months") == 0) res = "MONTH"; else if (strcasecmp(field, "day") == 0 || strcasecmp(field, "days") == 0) res = "DAY"; else if (strcasecmp(field, "hour") == 0 || strcasecmp(field, "hours") == 0) res = "HOURS"; else if (strcasecmp(field, "minute") == 0 || strcasecmp(field, "minutes") == 0) res = "MINUTES"; else if (strcasecmp(field, "second") == 0 || strcasecmp(field, "seconds") == 0) res = "SECONDS"; else { elog(DEBUG1, "%s exit: returning NULL (field unknown)", __func__); return NULL; } elog(DEBUG1, "%s exit: returning '%s'", __func__, res); return res; } /* * ConstToCString * ----------------- * Extracts a string from a Const * * returns a palloc'ed copy. */ char *ConstToCString(Const *constant) { if (constant->constisnull) return NULL; else return text_to_cstring(DatumGetTextP(constant->constvalue)); } /* * CStringToConst * ----------------- * Extracts a Const from a char* * * returns Const from given string. */ Const *CStringToConst(const char *str) { if (str == NULL) return makeNullConst(TEXTOID, -1, InvalidOid); else return makeConst(TEXTOID, -1, InvalidOid, -1, PointerGetDatum(cstring_to_text(str)), false, false); } char *rdfnode_to_cstring(rdfnode *node) { /* Get a pointer to the actual data and its length */ char *data = VARDATA_ANY(node); int len = VARSIZE_ANY_EXHDR(node); /* Allocate a null-terminated C string */ char *result = palloc(len + 1); memcpy(result, data, len); result[len] = '\0'; return result; } /* * IsStringDataType * --------------- * Determines if a PostgreSQL data type is string or numeric type * so that we can know when to wrap the value with single quotes * or leave it as-is. 
* * type: PostgreSQL data type * * returns true if the data type needs to be wrapped with quotes * or false otherwise. */ bool IsStringDataType(Oid type) { bool result; elog(DEBUG1, "%s called: type='%u'", __func__, type); result = type == TEXTOID || type == VARCHAROID || type == CHAROID || type == NAMEOID || type == DATEOID || type == TIMESTAMPOID || type == TIMESTAMPTZOID || type == NAMEOID || type == RDFNODEOID; elog(DEBUG1, "%s exit: returning '%s'", __func__, !result ? "false" : "true"); return result; } bool is_valid_xsd_double(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?$"; if (pg_strcasecmp(lexical, "NaN") == 0 || pg_strcasecmp(lexical, "INF") == 0 || pg_strcasecmp(lexical, "-INF") == 0) return true; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:double"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } bool is_valid_xsd_int(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^-?[0-9]+$"; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:int"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } bool is_valid_xsd_dateTime(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]+)?([+-][0-9]{2}:[0-9]{2}|Z)?$"; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:dateTime"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } bool is_valid_xsd_time(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = 
"^([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](\\.[0-9]+)?([+-][0-9]{2}:[0-9]{2}|Z)?$"; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:time"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } bool is_valid_xsd_date(const char *lexical) { regex_t regex; int reti; bool is_valid = false; const char *pattern = "^[0-9]{4}-(0[1-9]|1[0-2])-(0[1-9]|[12][0-9]|3[01])([+-][0-9]{2}:[0-9]{2}|Z)?$"; reti = regcomp(®ex, pattern, REG_EXTENDED); if (reti) ereport(ERROR, (errmsg("could not compile regex for xsd:date"))); reti = regexec(®ex, lexical, 0, NULL, 0); if (reti == 0) is_valid = true; regfree(®ex); return is_valid; } /* * IsSPARQLVariableValid * --------------- * A query variable is marked by the use of either "?" or "$"; the "?" or * "$" is not part of the variable name. Valid characters for the name * are [a-z], [A-Z], [0-9] * * str: string to be evaluated * * returns true if the variable is valid or false otherwise */ bool IsSPARQLVariableValid(const char *str) { elog(DEBUG1, "%s called: str='%s'", __func__, str); if (str[0] != '?' && str[0] != '$') { elog(DEBUG1, "%s exit: returning 'false' (str does not start with '?' or '$')", __func__); return false; } for (int i = 1; str[i] != '\0'; i++) if (!isalnum(str[i]) && str[i] != '_') { elog(DEBUG1, "%s exit: returning 'false' (invalid variable name)", __func__); return false; } elog(DEBUG1, "%s exit: returning 'true'", __func__); return true; } /* * IsSPARQLParsable * ------------------ * Checks if a SPARQL query can be parsed and modified to accommodate possible * pusdhown instructions. If it returns false it does not mean that the query * is invalid. It just means that it contains unsupported clauses and it cannot * be modifed. 
*
 * state: SPARQL, SERVER and FOREIGN TABLE info
 *
 * returns 'true' if the SPARQL query is safe to be parsed or 'false' otherwise
 */
bool IsSPARQLParsable(struct RDFfdwState *state)
{
	/*
	 * Clauses that make the query unparsable, with the characters allowed
	 * directly before and after each keyword.
	 */
	struct
	{
		char	   *before;
		char	   *keyword;
		char	   *after;
	}			unsupported[] = {
		{" \n\t}", RDF_SPARQL_KEYWORD_GROUPBY, " \n\t?"},
		{" \n\t}", RDF_SPARQL_KEYWORD_ORDERBY, " \n\t?DA"},
		{" \n\t}", RDF_SPARQL_KEYWORD_LIMIT, " \n\t"},
		{" \n\t}", RDF_SPARQL_KEYWORD_MINUS, " \n\t{"},
		{" \n\t}", RDF_SPARQL_KEYWORD_UNION, " \n\t{"},
		{" \n\t", RDF_SPARQL_KEYWORD_HAVING, " \n\t("}
	};
	int			select_total = 0;
	bool		parsable = true;

	elog(DEBUG1, "%s called", __func__);

	/*
	 * SPARQL Queries containing SUB SELECTS are not supported. So, if any number
	 * other than 1 is returned from LocateKeyword, this query cannot be parsed.
	 */
	LocateKeyword(state->raw_sparql, "{\n\t> ", RDF_SPARQL_KEYWORD_SELECT, " *?\n\t", &select_total, 0);

	elog(DEBUG2, "%s: SPARQL contains '%d' SELECT clauses.", __func__, select_total);

	/* stop at the first unsupported clause; later ones are not searched for */
	for (size_t idx = 0; idx < sizeof(unsupported) / sizeof(unsupported[0]); idx++)
	{
		int			found_at = LocateKeyword(state->raw_sparql,
											 unsupported[idx].before,
											 unsupported[idx].keyword,
											 unsupported[idx].after,
											 NULL, 0);

		if (found_at != RDF_KEYWORD_NOT_FOUND)
		{
			parsable = false;
			break;
		}
	}

	if (parsable && select_total != 1)
		parsable = false;

	elog(DEBUG1, "%s exit: returning '%s'", __func__, !parsable ? "false" : "true");
	return parsable;
}

/*
 * IsExpressionPushable
 * ------------
 * Checks if an expression attached to a column can be pushed down, in case it
 * is used in a condition in the SQL WHERE clause.
*
 * expression: the expression to be evaluated. It is searched for the SPARQL
 *             aggregate function names, each delimited on both sides by a
 *             space, a newline or an opening parenthesis.
 *
 * returns 'true' if the expression can be pushed down or 'false' otherwise
 */
bool IsExpressionPushable(char *expression)
{
	char	   *delimiters = " \n(";
	char	   *aggregates[] = {
		RDF_SPARQL_AGGREGATE_FUNCTION_COUNT,
		RDF_SPARQL_AGGREGATE_FUNCTION_SUM,
		RDF_SPARQL_AGGREGATE_FUNCTION_AVG,
		RDF_SPARQL_AGGREGATE_FUNCTION_MIN,
		RDF_SPARQL_AGGREGATE_FUNCTION_MAX,
		RDF_SPARQL_AGGREGATE_FUNCTION_SAMPLE,
		RDF_SPARQL_AGGREGATE_FUNCTION_GROUPCONCAT
	};
	bool		pushable = true;

	elog(DEBUG1, "%s called: expression='%s'", __func__, expression);

	/* the first aggregate found settles the answer; the rest are not searched */
	for (size_t idx = 0; idx < sizeof(aggregates) / sizeof(aggregates[0]); idx++)
	{
		if (LocateKeyword(expression, delimiters, aggregates[idx], delimiters, NULL, 0) != RDF_KEYWORD_NOT_FOUND)
		{
			pushable = false;
			break;
		}
	}

	elog(DEBUG1, "%s exit: returning '%s'", __func__, !pushable ? "false" : "true");
	return pushable;
}

/*
 * LocateKeyword
 * -----------
 * This function locates the first occurrence of given 'keyword' within 'str'. The keywords
 * must be wrapped with one of the characters given in 'start_chars' and end_chars'. If
 * the parameter '*count' is used, this function will be called recursively to count how
 * many times the searched 'keyword' can be found in 'str'
 *
 * str            : string where 'keyword' will be searched
 * start_chars    : all possible chars that can preceed the searched 'keyword'
 * keyword        : the searched keyword (case insensitive)
 * end_chars      : all possible chars that can be found after the 'keyword'
 * count          : how many times 'keyword' was found in 'str' (nullable)
 * start_position : position in 'str' where the function has to start looking for
 *                  'keyword'.
Set it to '0' if the whole 'str' must be considered. * * returns : position where 'keyword' was found, or RDF_KEYWORD_NOT_FOUND otherwise. */ int LocateKeyword(char *str, char *start_chars, char *keyword, char *end_chars, int *count, int start_position) { int keyword_position = RDF_KEYWORD_NOT_FOUND; StringInfoData idt; initStringInfo(&idt); if (count) { for (size_t i = 0; i < *count; i++) { appendStringInfo(&idt, " "); } if (*count > 0) appendStringInfo(&idt, "├─ "); } elog(DEBUG1, "%s%s called: searching '%s' in start_position %d", NameStr(idt), __func__, keyword, start_position); if (start_position < 0) elog(ERROR, "%s%s: start_position cannot be negative.", NameStr(idt), __func__); /* * Some SPARQL keywords can be placed in the very beginning of a query, so they not always * have a preceeding character. So here we first check if the searched keyword exists * in the beginning of the string. */ if (((strcasecmp(keyword, RDF_SPARQL_KEYWORD_SELECT) == 0 && strncasecmp(str, RDF_SPARQL_KEYWORD_SELECT, strlen(RDF_SPARQL_KEYWORD_SELECT)) == 0) || (strcasecmp(keyword, RDF_SPARQL_KEYWORD_PREFIX) == 0 && strncasecmp(str, RDF_SPARQL_KEYWORD_PREFIX, strlen(RDF_SPARQL_KEYWORD_PREFIX)) == 0) || (strcasecmp(keyword, RDF_SPARQL_KEYWORD_DESCRIBE) == 0 && strncasecmp(str, RDF_SPARQL_KEYWORD_DESCRIBE, strlen(RDF_SPARQL_KEYWORD_DESCRIBE)) == 0)) && start_position == 0) { elog(DEBUG2, "%s%s: nothing before SELECT. 
Setting keyword_position to 0.", NameStr(idt), __func__); keyword_position = 0; } else { for (int i = 0; i < strlen(start_chars); i++) { for (int j = 0; j < strlen(end_chars); j++) { char *el; StringInfoData eval_token; initStringInfo(&eval_token); appendStringInfo(&eval_token, "%c%s%c", start_chars[i], keyword, end_chars[j]); el = strcasestr(str + start_position, eval_token.data); if (el != NULL) { int nquotes = 0; for (int k = 0; k <= (el - str); k++) { if (str[k] == '\"') nquotes++; } /* * If the keyword is located after an opening double-quote it is a literal and should * not be considered as a keyword. */ if (nquotes % 2 != 1) keyword_position = el - str; if (keyword_position != RDF_KEYWORD_NOT_FOUND) break; } } } } if ((count) && keyword_position != RDF_KEYWORD_NOT_FOUND) { (*count)++; elog(DEBUG2, "%s%s (%d): keyword '%s' found in position %d. Recalling %s ... ", NameStr(idt), __func__, *count, keyword, keyword_position, __func__); LocateKeyword(str, start_chars, keyword, end_chars, count, keyword_position + 1); elog(DEBUG2, "%s%s: '%s' search returning postition %d for start position %d", NameStr(idt), __func__, keyword, keyword_position, start_position); } elog(DEBUG1, "%s exit: returning '%d' (keyword_position)", __func__, keyword_position); return keyword_position; }