diff --git a/packages/linkml/pyproject.toml b/packages/linkml/pyproject.toml index abce3b4510..5f2689848d 100644 --- a/packages/linkml/pyproject.toml +++ b/packages/linkml/pyproject.toml @@ -50,7 +50,10 @@ dependencies = [ # Specifier syntax: https://peps.python.org/pep-0631/ "openpyxl", "parse", "prefixcommons >= 0.1.7", - "prefixmaps >= 0.2.2", + # TODO(prefixmaps-0.2.8): Replace git pin with "prefixmaps >= 0.2.8" once released, + # then remove [tool.hatch.metadata] allow-direct-references and regenerate uv.lock. + # Tracked in: https://github.com/linkml/prefixmaps/issues/82 + "prefixmaps @ git+https://github.com/linkml/prefixmaps@75435150a1b31760b9780af2b64a265943a9b263", "pydantic >= 2.0.0, < 3.0.0", "pyjsg >= 0.12.3", "pyshex >= 0.9.0", @@ -202,6 +205,10 @@ vcs = "git" style = "pep440" fallback-version = "0.0.0" +[tool.hatch.metadata] +# TODO(prefixmaps-0.2.8): Remove this section once the git pin is replaced with >= 0.2.8 +allow-direct-references = true + [tool.hatch.version] source = "uv-dynamic-versioning" diff --git a/packages/linkml/src/linkml/generators/jsonldcontextgen.py b/packages/linkml/src/linkml/generators/jsonldcontextgen.py index c30afc72a5..0c81a0edc4 100644 --- a/packages/linkml/src/linkml/generators/jsonldcontextgen.py +++ b/packages/linkml/src/linkml/generators/jsonldcontextgen.py @@ -15,7 +15,7 @@ from linkml._version import __version__ from linkml.utils.deprecation import deprecated_fields -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, shared_arguments, well_known_prefix_map from linkml_runtime.linkml_model.meta import ClassDefinition, EnumDefinition, SlotDefinition from linkml_runtime.linkml_model.types import SHEX from linkml_runtime.utils.formatutils import camelcase, underscore @@ -90,6 +90,9 @@ class ContextGenerator(Generator): frame_root: str | None = None def __post_init__(self) -> None: + # Must be set before super().__post_init__() because the parent triggers + # the 
visitor pattern (visit_schema), which accesses _prefix_remap. + self._prefix_remap: dict[str, str] = {} super().__post_init__() if self.namespaces is None: raise TypeError("Schema text must be supplied to context generator. Preparsed schema will not work") @@ -127,8 +130,14 @@ def _collect_external_elements(sv: SchemaView) -> tuple[set[str], set[str]]: external_slots.update(schema_def.slots.keys()) return external_classes, external_slots + def add_prefix(self, ncname: str) -> None: + """Add a prefix, applying well-known prefix normalisation when enabled.""" + super().add_prefix(self._prefix_remap.get(ncname, ncname)) + def visit_schema(self, base: str | Namespace | None = None, output: str | None = None, **_): - # Add any explicitly declared prefixes + # Add any explicitly declared prefixes. + # Direct .add() is safe here: the normalisation block below explicitly + # rewrites emit_prefixes entries for any renamed prefixes (Cases 1-3). for prefix in self.schema.prefixes.values(): self.emit_prefixes.add(prefix.prefix_prefix) @@ -136,6 +145,68 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = for pfx in self.schema.emit_prefixes: self.add_prefix(pfx) + # Normalise well-known prefix names when --normalize-prefixes is set. + # If the schema declares a non-standard alias for a namespace that has + # a well-known standard name (e.g. ``sdo`` for + # ``https://schema.org/``), replace the alias with the standard name + # so that generated JSON-LD contexts use the conventional prefix. + # + # Three cases are handled: + # 1. Standard prefix is not yet bound → just rebind from old to new. + # 2. Standard prefix is bound to a *different* URI: + # a. User-declared (in schema.prefixes) → collision, skip with warning. + # b. Runtime default (e.g. linkml-runtime's ``schema: http://…``) + # → remove stale binding, then rebind. + # 3. Standard prefix is already bound to the *same* URI (duplicate) + # → just drop the non-standard alias. 
+ # + # A remap dict is stored for ``_build_element_id`` because + # ``prefix_suffix()`` splits CURIEs on ``:`` without looking up the + # namespace dict. + self._prefix_remap.clear() + if self.normalize_prefixes: + wk = well_known_prefix_map() + for old_pfx in list(self.namespaces): + url = str(self.namespaces[old_pfx]) + std_pfx = wk.get(url) + if not std_pfx or std_pfx == old_pfx: + continue + if std_pfx in self.namespaces: + if str(self.namespaces[std_pfx]) != url: + # Case 2: std_pfx is bound to a different URI. + # If the user explicitly declared std_pfx in the schema, + # it is intentional — skip to avoid data loss. + if std_pfx in self.schema.prefixes: + self.logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is " + "already declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + str(self.namespaces[std_pfx]), + url, + ) + continue + # Not user-declared (e.g. linkml-runtime default) — safe to remove + self.emit_prefixes.discard(std_pfx) + del self.namespaces[std_pfx] + else: + # Case 3: standard prefix already bound to same URI + # — just drop the non-standard alias + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + continue + # Case 1 (or Case 2 after stale removal): bind standard name + self.namespaces[std_pfx] = self.namespaces[old_pfx] + del self.namespaces[old_pfx] + if old_pfx in self.emit_prefixes: + self.emit_prefixes.discard(old_pfx) + self.emit_prefixes.add(std_pfx) + self._prefix_remap[old_pfx] = std_pfx + # Add the default prefix if self.schema.default_prefix: dflt = self.namespaces.prefix_for(self.schema.default_prefix) @@ -143,6 +214,8 @@ def visit_schema(self, base: str | Namespace | None = None, output: str | None = self.default_ns = dflt if self.default_ns: default_uri = self.namespaces[self.default_ns] + # Direct .add() is safe: default_ns is already 
resolved from + # the (possibly normalised) namespace bindings above. self.emit_prefixes.add(self.default_ns) else: default_uri = self.schema.default_prefix @@ -236,7 +309,61 @@ def end_schema( with open(frame_path, "w", encoding="UTF-8") as f: json.dump(frame, f, indent=2, ensure_ascii=False) - return str(as_json(context)) + "\n" + if self.deterministic: + return self._deterministic_context_json(json.loads(str(as_json(context))), indent=3) + return str(as_json(context)) + + @staticmethod + def _deterministic_context_json(data: dict, indent: int = 3) -> str: + """Serialize a JSON-LD context with deterministic key ordering. + + Preserves the conventional JSON-LD context structure: + 1. ``comments`` block first (metadata) + 2. ``@context`` block second, with: + a. ``@``-prefixed directives (``@vocab``, ``@base``) first + b. Prefix declarations (string values) second + c. Class/property term entries (object values) last + 3. Each group sorted alphabetically within itself + + Unlike :func:`deterministic_json`, this understands JSON-LD + conventions so that the output remains human-readable while + still being byte-identical across invocations. + """ + from linkml.utils.generator import deterministic_json + + ordered = {} + + # 1. "comments" first (if present) + if "comments" in data: + ordered["comments"] = data["comments"] + + # 2. "@context" with structured internal ordering + if "@context" in data: + ctx = data["@context"] + ordered_ctx = {} + + # 2a. @-prefixed directives (@vocab, @base, etc.) + for k in sorted(k for k in ctx if k.startswith("@")): + ordered_ctx[k] = ctx[k] + + # 2b. Prefix declarations (string values — short namespace URIs) + for k in sorted(k for k in ctx if not k.startswith("@") and isinstance(ctx[k], str)): + ordered_ctx[k] = ctx[k] + + # 2c. 
Term definitions (object values) — deep-sorted for determinism + term_entries = {k: v for k, v in ctx.items() if not k.startswith("@") and not isinstance(v, str)} + sorted_terms = json.loads(deterministic_json(term_entries)) + for k in sorted(sorted_terms): + ordered_ctx[k] = sorted_terms[k] + + ordered["@context"] = ordered_ctx + + # 3. Any remaining top-level keys + for k in sorted(data): + if k not in ordered: + ordered[k] = data[k] + + return json.dumps(ordered, indent=indent, ensure_ascii=False) def visit_class(self, cls: ClassDefinition) -> bool: if self.exclude_imports and cls.name not in self._local_classes: @@ -486,6 +613,11 @@ def _build_element_id(self, definition: Any, uri: str) -> None: @return: None """ uri_prefix, uri_suffix = self.namespaces.prefix_suffix(uri) + # Apply well-known prefix normalisation (e.g. sdo → schema). + # prefix_suffix() splits CURIEs on ':' without checking the + # namespace dict, so it may return a stale alias. + if uri_prefix and uri_prefix in self._prefix_remap: + uri_prefix = self._prefix_remap[uri_prefix] is_default_namespace = uri_prefix == self.context_body["@vocab"] or uri_prefix == self.namespaces.prefix_for( self.context_body["@vocab"] ) diff --git a/packages/linkml/src/linkml/generators/jsonldgen.py b/packages/linkml/src/linkml/generators/jsonldgen.py index 75d2068e16..c94c74d9dd 100644 --- a/packages/linkml/src/linkml/generators/jsonldgen.py +++ b/packages/linkml/src/linkml/generators/jsonldgen.py @@ -1,5 +1,6 @@ """Generate JSONld from a LinkML schema.""" +import json import os from collections.abc import Sequence from copy import deepcopy @@ -179,6 +180,8 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: # TODO: The _visit function above alters the schema in situ # force some context_kwargs context_kwargs["metadata"] = False + # Forward prefix normalisation into the inline @context. 
+ context_kwargs.setdefault("normalize_prefixes", self.normalize_prefixes) add_prefixes = ContextGenerator(self.original_schema, **context_kwargs).serialize() add_prefixes_json = loads(add_prefixes) metamodel_ctx = self.metamodel_context or METAMODEL_CONTEXT_URI @@ -203,6 +206,10 @@ def end_schema(self, context: str | Sequence[str] | None = None, context_kwargs: self.schema["@context"].append({"@base": base_prefix}) # json_obj["@id"] = self.schema.id out = str(as_json(self.schema, indent=" ")) + "\n" + if self.deterministic: + from linkml.utils.generator import deterministic_json + + out = deterministic_json(json.loads(out), indent=2) + "\n" self.schema = self.original_schema return out diff --git a/packages/linkml/src/linkml/generators/owlgen.py b/packages/linkml/src/linkml/generators/owlgen.py index 88cd3fa10f..c065b3f3b3 100644 --- a/packages/linkml/src/linkml/generators/owlgen.py +++ b/packages/linkml/src/linkml/generators/owlgen.py @@ -7,7 +7,7 @@ from copy import copy from dataclasses import dataclass, field from enum import Enum, unique -from typing import Any, TypeAlias, TypeVar +from typing import Any, ClassVar, TypeAlias, TypeVar import click import rdflib @@ -21,7 +21,9 @@ from linkml._version import __version__ from linkml.generators.common.subproperty import is_xsd_anyuri_range from linkml.utils.deprecation import deprecation_warning -from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments +from linkml.utils.language_tags import LanguageTagResolver +from linkml.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime import SchemaView from linkml_runtime.linkml_model.meta import ( AnonymousClassExpression, @@ -42,7 +44,7 @@ ) from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.introspection import package_schemaview -from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph +from 
linkml_runtime.utils.yamlutils import YAMLRoot logger = logging.getLogger(__name__) @@ -55,6 +57,21 @@ SWRLB = rdflib.Namespace("http://www.w3.org/2003/11/swrlb#") +def _expression_sort_key(expr: YAMLRoot) -> str: + """Return a stable sort key for LinkML anonymous expressions. + + Used by ``--deterministic`` to order ``any_of``, ``all_of``, + ``none_of``, and ``exactly_one_of`` members reproducibly. + + This relies on ``YAMLRoot.__repr__()`` which formats objects using + their **field values** (not memory addresses). All anonymous + expression dataclasses in ``linkml_runtime.linkml_model.meta`` + use ``@dataclass(repr=False)`` and inherit this field-based repr, + so the output is deterministic across runs. + """ + return repr(expr) + + @unique class MetadataProfile(Enum): """ @@ -209,7 +226,11 @@ class OwlSchemaGenerator(Generator): one direct ``is_a`` child, the generator adds ``AbstractClass rdfs:subClassOf (Child1 or Child2 or …)``, expressing the open-world covering constraint that every instance of the abstract class must also be an instance of one of its - direct subclasses.""" + direct subclasses. + + .. note:: An info message is emitted when an abstract class has no children (no axiom generated). + A warning is emitted when there is only one child (covering axiom degenerates to equivalence + Parent ≡ Child). Use this flag to suppress covering axioms entirely if equivalence is undesired.""" @staticmethod def _present(values: Iterable[_T | None]) -> list[_T]: @@ -235,6 +256,49 @@ def _present(values: Iterable[_T | None]) -> list[_T]: - have no ``rdfs:range`` restriction (any IRI is valid) """ + default_language: str | None = None + """Default BCP 47 language tag for human-readable string literals. + + When set, ``rdfs:label``, ``rdfs:comment``, ``skos:definition``, + ``dcterms:title``, and other annotation literals are emitted with the + specified language tag (e.g. ``"Person"@en``). 
An element-level + ``in_language`` value overrides this default for that element. + + Technical literals (URIs, numeric constraints, XSD facets) are never + language-tagged. Conforms to :rfc:`5646` (BCP 47). + """ + + # Metaslot ranges that represent human-readable text (eligible for language tags). + # Everything else (uri, uriorcurie, datetime, boolean, integer, classes, enums, …) + # is technical and must never be language-tagged. + _LANGUAGE_TAGGABLE_RANGES: ClassVar[frozenset[str]] = frozenset({"string", "ncname"}) + + def __post_init__(self) -> None: + # Resolver must be assigned before ``super().__post_init__()`` so that + # any hook the parent invokes during initialisation can safely call + # ``_resolve_language``. The resolver also validates the default tag + # once here; per-element tags are validated lazily, with at most one + # warning per distinct malformed tag. + self._language_resolver = LanguageTagResolver(self.default_language) + super().__post_init__() + + def _resolve_language(self, element: "Definition | PermissibleValue | None" = None) -> str | None: + """Return the BCP 47 language tag for *element*, or ``None``. + + Delegates to :class:`linkml.utils.language_tags.LanguageTagResolver`. + Resolution order is element-level ``in_language`` first, then the + generator-level default. + """ + return self._language_resolver.resolve(element) + + def _literal(self, value: str, element: "Definition | PermissibleValue | None" = None) -> Literal: + """Create a language-tagged ``Literal`` for a human-readable string. + + If no language tag is resolved, falls back to a plain literal. + """ + lang = self._resolve_language(element) + return Literal(value, lang=lang) if lang else Literal(value) + def as_graph(self) -> Graph: """ Generate an rdflib Graph from the LinkML schema. 
@@ -265,6 +329,10 @@ def as_graph(self) -> Graph: self.graph.bind(prefix, self.metamodel.namespaces[prefix]) for pfx in schema.prefixes.values(): self.graph.namespace_manager.bind(pfx.prefix_prefix, URIRef(pfx.prefix_reference)) + if self.normalize_prefixes: + normalize_graph_prefixes( + graph, {str(v.prefix_prefix): str(v.prefix_reference) for v in schema.prefixes.values()} + ) graph.add((base, RDF.type, OWL.Ontology)) # Add main schema elements @@ -300,6 +368,10 @@ def serialize(self, **kwargs: Any) -> str: """ self.as_graph() fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + from linkml.utils.generator import deterministic_turtle + + return deterministic_turtle(self.graph) return canonicalize_rdf_graph(self.graph, output_format=fmt) def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: @@ -307,6 +379,8 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: Add annotation properties. Set the profile attribute to the appropriate OWL profile. + Human-readable string literals are language-tagged when + ``default_language`` is set or the element has ``in_language``. 
:param e: schema element :param uri: URI representation of schema element @@ -316,6 +390,7 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: msv = self.metamodel_schemaview this_sv = self.schemaview sn_mappings = msv.slot_name_mappings() + lang = self._resolve_language(e) # iterate through all the assigned metamodel slots for metaslot_name, metaslot_value in vars(e).items(): @@ -340,6 +415,8 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: obj = URIRef(v) elif metaslot_range == "uriorcurie": obj = URIRef(this_sv.expand_curie(v)) + elif metaslot_range in self._LANGUAGE_TAGGABLE_RANGES and lang: + obj = Literal(v, lang=lang) else: obj = Literal(v) elif metaslot_range in msv.all_subsets(): @@ -351,6 +428,15 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: # else: # logger.debug(f"Skipping {uri} {metaslot_uri} => {v}") else: + # Catch-all for ranges that are not types, subsets, or + # classes -- in practice these are enum-ranged metaslots + # such as ``pv_formula`` (range ``pv_formula_options``) on + # a PermissibleValue or ``obligation_level`` (range + # ``obligation_level_enum``) on a SlotDefinition. Their + # values are permissible-value identifiers, i.e. constraint + # data, not labels: tagging them would shift the datatype + # from ``xsd:string`` to ``rdf:langString`` and break + # downstream string equality / SHACL ``sh:in`` matching. 
obj = Literal(v) self.graph.add((uri, metaslot_uri, obj)) @@ -368,7 +454,11 @@ def add_metadata(self, e: Definition | PermissibleValue, uri: URIRef) -> None: if k_uri == k: k_uri = None if k_uri: - self.graph.add((uri, URIRef(k_uri), Literal(v.value))) + if isinstance(v.value, str): + obj = self._literal(v.value, e) + else: + obj = Literal(v.value) + self.graph.add((uri, URIRef(k_uri), obj)) def add_class(self, cls: ClassDefinition) -> None: """ @@ -505,6 +595,26 @@ def condition_to_bnode(expr: AnonymousClassExpression) -> OWL_EXPRESSION | None: # must be an instance of at least one of its direct subclasses. if cls.abstract and not self.skip_abstract_class_as_unionof_subclasses: children = sorted(sv.class_children(cls.name, imports=self.mergeimports, mixins=False, is_a=True)) + if not children: + logger.info( + "Abstract class '%s' has no children. No covering axiom will be generated.", + cls.name, + ) + elif len(children) == 1: + # Warn: with one child C, the covering axiom degenerates to + # Parent ⊑ C which, combined with C ⊑ Parent (from is_a), + # creates Parent ≡ C (equivalence). This is semantically + # correct per OWL 2 but may be surprising for extensible + # ontologies where more children are added later. + logger.warning( + "Abstract class '%s' has only 1 direct child ('%s'). " + "The covering axiom makes them equivalent (%s ≡ %s). 
" + "Use --skip-abstract-class-as-unionof-subclasses to suppress.", + cls.name, + children[0], + cls.name, + children[0], + ) if children: child_uris = [self._class_uri(child) for child in children] union_node = self._union_of(child_uris) @@ -568,13 +678,17 @@ def transform_class_expression( own_slots = self.get_own_slots(cls) owl_exprs: list[OWL_EXPRESSION] = [] if cls.any_of: - any_of_expr = self._union_of([self.transform_class_expression(x) for x in cls.any_of]) + members = list(cls.any_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + any_of_expr = self._union_of([self.transform_class_expression(x) for x in members]) if any_of_expr: owl_exprs.append(any_of_expr) if cls.exactly_one_of: - sub_exprs: list[OWL_EXPRESSION] = self._present( - self.transform_class_expression(x) for x in cls.exactly_one_of - ) + members = list(cls.exactly_one_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + sub_exprs: list[OWL_EXPRESSION] = self._present(self.transform_class_expression(x) for x in members) if isinstance(cls, ClassDefinition): cls_uri = self._class_uri(cls.name) listnode = BNode() @@ -582,11 +696,11 @@ def transform_class_expression( graph.add((cls_uri, OWL.disjointUnionOf, listnode)) else: sub_sub_exprs: list[OWL_EXPRESSION] = [] - for i, x in enumerate(cls.exactly_one_of): + for i, x in enumerate(members): operand_expr = self.transform_class_expression(x) if not operand_expr: continue - rest = cls.exactly_one_of[0:i] + cls.exactly_one_of[i + 1 :] + rest = members[0:i] + members[i + 1 :] neg_expr = self._complement_of_union_of([self.transform_class_expression(nx) for nx in rest]) pos_expr = self._intersection_of([operand_expr, neg_expr]) if pos_expr: @@ -596,11 +710,17 @@ def transform_class_expression( owl_exprs.append(union_expr) # owl_exprs.extend(sub_exprs) if cls.all_of: - all_of_expr = self._intersection_of([self.transform_class_expression(x) for x in cls.all_of]) + members = list(cls.all_of) 
+ if self.deterministic: + members = sorted(members, key=_expression_sort_key) + all_of_expr = self._intersection_of([self.transform_class_expression(x) for x in members]) if all_of_expr: owl_exprs.append(all_of_expr) if cls.none_of: - none_of_expr = self._complement_of_union_of([self.transform_class_expression(x) for x in cls.none_of]) + members = list(cls.none_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) + none_of_expr = self._complement_of_union_of([self.transform_class_expression(x) for x in members]) if none_of_expr: owl_exprs.append(none_of_expr) for slot in own_slots: @@ -773,19 +893,29 @@ def _get_slot_nodes( ) return rdflib_nodes or None - if any_of_rdflib_nodes := _get_slot_nodes(slot.any_of): + def _maybe_sort_slots( + slot_definitions: Sequence[SlotDefinition | AnonymousSlotExpression] | None, + ) -> Sequence[SlotDefinition | AnonymousSlotExpression] | None: + if slot_definitions and self.deterministic: + return sorted(slot_definitions, key=_expression_sort_key) + return slot_definitions + + if any_of_rdflib_nodes := _get_slot_nodes(_maybe_sort_slots(slot.any_of)): owl_exprs.append(self._union_of(any_of_rdflib_nodes)) - if all_of_rdflib_nodes := _get_slot_nodes(slot.all_of): + if all_of_rdflib_nodes := _get_slot_nodes(_maybe_sort_slots(slot.all_of)): owl_exprs.append(self._intersection_of(all_of_rdflib_nodes)) - if none_of_rdflib_nodes := _get_slot_nodes(slot.none_of): + if none_of_rdflib_nodes := _get_slot_nodes(_maybe_sort_slots(slot.none_of)): owl_exprs.append(self._complement_of_union_of(none_of_rdflib_nodes)) if slot.exactly_one_of: + members = list(slot.exactly_one_of) + if self.deterministic: + members = sorted(members, key=_expression_sort_key) disj_exprs: list[OWL_EXPRESSION] = [] - for i, operand in enumerate(slot.exactly_one_of): + for i, operand in enumerate(members): operand_expr = self.transform_class_slot_expression(cls, operand, main_slot, owl_types) if not operand_expr: continue - rest = 
slot.exactly_one_of[0:i] + slot.exactly_one_of[i + 1 :] + rest = members[0:i] + members[i + 1 :] neg_expr = self._complement_of_union_of( [self.transform_class_slot_expression(cls, x, main_slot, owl_types) for x in rest], owl_types=owl_types, @@ -1059,7 +1189,10 @@ def add_enum(self, e: EnumDefinition) -> None: owl_types: list[URIRef | None] = [] enum_owl_type = self._get_metatype(e, self.default_permissible_value_type) - for pv in e.permissible_values.values(): + pvs = e.permissible_values.values() + if self.deterministic: + pvs = sorted(pvs, key=lambda x: x.text) + for pv in pvs: pv_owl_type = self._get_metatype(pv, enum_owl_type) owl_types.append(pv_owl_type) if pv_owl_type == RDFS.Literal: @@ -1079,7 +1212,7 @@ def add_enum(self, e: EnumDefinition) -> None: if not isinstance(pv_node, Literal): self.add_metadata(pv, pv_node) g.add((pv_node, RDF.type, pv_owl_type)) - g.add((pv_node, RDFS.label, Literal(pv.text))) + g.add((pv_node, RDFS.label, self._literal(pv.text, pv))) # TODO: make this configurable # self._add_element_properties(pv_uri, pv) if self.metaclasses: @@ -1654,7 +1787,9 @@ def slot_owl_type(self, slot: SlotDefinition) -> URIRef: show_default=True, help=( "If true, suppress rdfs:subClassOf owl:unionOf(subclasses) covering axioms for abstract classes. " - "By default such axioms are emitted for every abstract class that has direct is_a children." + "By default such axioms are emitted for every abstract class that has direct is_a children. " + "Note: an info message is logged for abstract classes with zero children (no axiom); " + "a warning is emitted for one child (equivalence)." ), ) @click.option( @@ -1668,6 +1803,17 @@ def slot_owl_type(self, slot: SlotDefinition) -> URIRef: "the JSON-LD context generator (--xsd-anyuri-as-iri → @type: @id)." ), ) +@click.option( + "--default-language", + default=None, + show_default=True, + help=( + "Default BCP 47 language tag for human-readable string literals " + "(e.g. en, de, zh-Hans). 
When set, rdfs:label, rdfs:comment, " + "skos:definition and other text annotations are emitted with the " + "specified language tag. Element-level in_language overrides this." + ), +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile: str, metadata_profile: str, **kwargs: Any) -> None: """Generate an OWL representation of a LinkML model diff --git a/packages/linkml/src/linkml/generators/rdfgen.py b/packages/linkml/src/linkml/generators/rdfgen.py index a3fcf6a848..95d832f2b3 100644 --- a/packages/linkml/src/linkml/generators/rdfgen.py +++ b/packages/linkml/src/linkml/generators/rdfgen.py @@ -19,8 +19,8 @@ from linkml._version import __version__ from linkml.generators.jsonldgen import JSONLDGenerator from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime.linkml_model import SchemaDefinition -from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph @dataclass diff --git a/packages/linkml/src/linkml/generators/shaclgen.py b/packages/linkml/src/linkml/generators/shaclgen.py index 874e47b3a6..bcf88037c2 100644 --- a/packages/linkml/src/linkml/generators/shaclgen.py +++ b/packages/linkml/src/linkml/generators/shaclgen.py @@ -13,10 +13,11 @@ from linkml.generators.common.subproperty import get_subproperty_values, is_uri_range from linkml.generators.shacl.shacl_data_type import ShaclDataType from linkml.generators.shacl.shacl_ifabsent_processor import ShaclIfAbsentProcessor -from linkml.utils.generator import Generator, shared_arguments -from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName +from linkml.utils.generator import Generator, normalize_graph_prefixes, shared_arguments +from linkml.utils.language_tags import LanguageTagResolver +from linkml.utils.rdf_canonicalize import canonicalize_rdf_graph +from linkml_runtime.linkml_model.meta import ClassDefinition, ElementName, PresenceEnum from 
linkml_runtime.utils.formatutils import underscore -from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime.utils.yamlutils import TypedNode, extended_float, extended_int, extended_str logger = logging.getLogger(__name__) @@ -75,6 +76,50 @@ class ShaclGenerator(Generator): """ expand_subproperty_of: bool = True """If True, expand subproperty_of to sh:in constraints with slot descendants""" + + default_language: str | None = None + """Default BCP 47 language tag for human-readable string literals. + + When set, ``sh:name``, ``sh:description``, ``rdfs:label``, and + ``rdfs:comment`` literals are emitted with the specified language tag. + Conforms to :rfc:`5646` (BCP 47). + """ + + message_template: str | None = None + """Template for ``sh:message`` on property shapes. + + When set, each property shape receives an ``sh:message`` literal built from + this template. The following placeholders are expanded: + + * ``{name}`` — the slot name (underscore-separated LinkML name) + * ``{title}`` — the slot title (human-readable), falls back to *name* + * ``{description}`` — the slot description, falls back to empty string + * ``{comments}`` — the slot comments joined with ``; ``, falls back to empty string + * ``{class}`` — the enclosing class name + * ``{path}`` — the property IRI (compact or full) + + Example: ``"Validation of {name} failed!"`` → + ``sh:message "Validation of has_speed failed!"`` + + If ``default_language`` is also set the literal is language-tagged. + """ + + emit_rules: bool = True + """Emit ``sh:sparql`` constraints from LinkML ``rules:`` blocks. + + When ``True`` (default), recognised rule patterns are translated into + SHACL-SPARQL constraints (``sh:SPARQLConstraint``) on the corresponding + ``sh:NodeShape``. Currently two patterns are recognised: + + * *Boolean guard* — a precondition with ``value_presence: PRESENT`` on a + value slot and a postcondition with ``equals_string: "true"`` on a + boolean flag slot. 
+ * *Exclusive value* — a precondition with ``equals_string`` on a slot and + a postcondition with ``maximum_cardinality`` on the *same* slot. + + See `W3C SHACL §5 <https://www.w3.org/TR/shacl/#sparql-constraints>`_ + and `linkml/linkml#2464 <https://github.com/linkml/linkml/issues/2464>`_. + """ generatorname = os.path.basename(__file__) generatorversion = "0.0.1" valid_formats = ["ttl"] @@ -82,8 +127,24 @@ class ShaclGenerator(Generator): visit_all_class_slots = False uses_schemaloader = False + def _resolve_language(self, element=None) -> str | None: + """Return the BCP 47 language tag for *element*, or ``None``. + + Delegates to :class:`linkml.utils.language_tags.LanguageTagResolver`. + Resolution order is element-level ``in_language`` first, then the + generator-level default. + """ + return self._language_resolver.resolve(element) + def __post_init__(self) -> None: + # Resolver must be assigned before ``super().__post_init__()`` so that + # any hook the parent invokes during initialisation can safely call + # ``_resolve_language``. The resolver also validates the default tag + # once here; per-element tags are validated lazily, with at most one + # warning per distinct malformed tag. 
+ self._language_resolver = LanguageTagResolver(self.default_language) super().__post_init__() + self.message_template = (self.message_template or "").strip() or None self.generate_header() def generate_header(self) -> str: @@ -95,6 +156,10 @@ def generate_header(self) -> str: def serialize(self, **args) -> str: g = self.as_graph() fmt = "turtle" if self.format in ["owl", "ttl"] else self.format + if self.deterministic and fmt == "turtle": + from linkml.utils.generator import deterministic_turtle + + return deterministic_turtle(g) return canonicalize_rdf_graph(g, output_format=fmt) def as_graph(self) -> Graph: @@ -106,6 +171,10 @@ def as_graph(self) -> Graph: for pfx in self.schema.prefixes.values(): g.bind(str(pfx.prefix_prefix), pfx.prefix_reference) + if self.normalize_prefixes: + normalize_graph_prefixes( + g, {str(v.prefix_prefix): str(v.prefix_reference) for v in self.schema.prefixes.values()} + ) for c in sv.all_classes(imports=not self.exclude_imports).values(): @@ -133,13 +202,13 @@ def shape_pv(p, v): if c.title is not None: # Use rdfs:label for NodeShape titles per SHACL spec. # sh:name has rdfs:domain of sh:PropertyShape. See issue #3059. - shape_pv(RDFS.label, Literal(c.title)) + shape_pv(RDFS.label, Literal(c.title, lang=self._resolve_language(c))) if c.description is not None: # Use rdfs:comment for NodeShape descriptions per SHACL spec. # sh:description has rdfs:domain of sh:PropertyShape, so using it # on NodeShapes causes RDFS-aware validators to incorrectly infer # the NodeShape is also a PropertyShape. See issue #3059. 
- shape_pv(RDFS.comment, Literal(c.description)) + shape_pv(RDFS.comment, Literal(c.description, lang=self._resolve_language(c))) shape_pv(SH.ignoredProperties, self._build_ignored_properties(g, c)) @@ -164,15 +233,38 @@ def prop_pv_literal(p, v): if v is not None: g.add((pnode, p, Literal(v))) + def prop_pv_text(p, v): + if v is not None: + g.add((pnode, p, Literal(v, lang=self._resolve_language(s)))) + prop_pv(SH.path, slot_uri) prop_pv_literal(SH.order, order) order += 1 - prop_pv_literal(SH.name, s.title) - prop_pv_literal(SH.description, s.description) + prop_pv_text(SH.name, s.title) + prop_pv_text(SH.description, s.description) + + # sh:message from template + if self.message_template is not None: + try: + msg_text = self.message_template.format( + name=s.name, + title=s.title or s.name, + description=s.description or "", + comments="; ".join(s.comments) if s.comments else "", + **{"class": c.name}, + path=str(slot_uri), + ).strip() + except (KeyError, IndexError, ValueError) as exc: + raise ValueError( + f"Invalid placeholder {exc} in --message-template. 
" + f"Allowed: {{name}}, {{title}}, {{description}}, {{comments}}, {{class}}, {{path}}" + ) from None + if msg_text: + prop_pv_text(SH.message, msg_text) # minCount - if s.minimum_cardinality: + if s.minimum_cardinality is not None: prop_pv_literal(SH.minCount, s.minimum_cardinality) - elif s.exact_cardinality: + elif s.exact_cardinality is not None: prop_pv_literal(SH.minCount, s.exact_cardinality) # Identifiers map to the node's IRI rather than a property triple, # so there's no arc to constrain with sh:minCount 1 — emitting it @@ -180,9 +272,9 @@ def prop_pv_literal(p, v): elif s.required and not s.identifier: prop_pv_literal(SH.minCount, 1) # maxCount - if s.maximum_cardinality: + if s.maximum_cardinality is not None: prop_pv_literal(SH.maxCount, s.maximum_cardinality) - elif s.exact_cardinality: + elif s.exact_cardinality is not None: prop_pv_literal(SH.maxCount, s.exact_cardinality) elif not s.multivalued: prop_pv_literal(SH.maxCount, 1) @@ -238,6 +330,11 @@ def st_node_pv(p, v): add_simple_data_type(st_node_pv, r) range_list.append(st_node) + # Propagate pattern constraint to the branch node. + # A branch may combine range + pattern (e.g. range: string + # with pattern: "^...") or specify pattern alone (no range). + if any.pattern: + g.add((range_list[-1], SH.pattern, Literal(any.pattern))) Collection(g, or_node, range_list) else: prop_pv_literal(SH.hasValue, s.equals_number) @@ -284,10 +381,239 @@ def st_node_pv(p, v): if default_value: prop_pv(SH.defaultValue, default_value) + if self.emit_rules: + self._add_rules(g, class_uri_with_suffix, c) + return g LINKML_ANY_URI = "https://w3id.org/linkml/Any" + # ------------------------------------------------------------------- + # Rules → sh:sparql + # ------------------------------------------------------------------- + + def _add_rules(self, g: Graph, shape_uri: URIRef, cls: ClassDefinition) -> None: + """Emit ``sh:sparql`` constraints from LinkML ``rules:`` blocks. 
+ + Each recognised rule is converted into an ``sh:SPARQLConstraint`` + attached to *shape_uri*. Unrecognised patterns are logged at + ``DEBUG`` level and silently skipped. + + Currently recognised patterns: + + * **Boolean guard** — a *precondition* with + ``value_presence: PRESENT`` on a value slot and a *postcondition* + with ``equals_string: "true"`` on a boolean flag slot. + + * **Exclusive value** — a *precondition* with ``equals_string`` on + a slot and a *postcondition* with ``maximum_cardinality`` on the + *same* slot. Enforces that when a specific value is present in a + multivalued slot, the total number of values must not exceed the + given cardinality (typically 1 for mutual exclusion). + + See `W3C SHACL §5 `_. + """ + if not cls.rules: + return + + sv = self.schemaview + for rule in cls.rules: + if getattr(rule, "deactivated", False): + continue + + if getattr(rule, "bidirectional", False): + logger.warning( + "Rule in class %r has bidirectional=true; " + "SHACL-SPARQL generation does not support bidirectional rules. " + "Skipping this rule entirely.", + cls.name, + ) + continue + + if getattr(rule, "open_world", False): + logger.warning( + "Rule in class %r has open_world=true; " + "SHACL operates under closed-world assumption. " + "The constraint is emitted but may not match open-world semantics.", + cls.name, + ) + + if getattr(rule, "elseconditions", None): + logger.warning( + "Rule in class %r has elseconditions; " + "only the forward (if/then) branch is emitted as sh:sparql. 
" + "The else branch cannot be represented in SHACL-SPARQL.", + cls.name, + ) + + sparql_query = self._rule_to_sparql(sv, cls, rule) + if sparql_query is None: + logger.debug( + "Skipping unsupported rule pattern in class %r: %s", + cls.name, + getattr(rule, "description", "(no description)"), + ) + continue + + constraint = BNode() + g.add((shape_uri, SH.sparql, constraint)) + g.add((constraint, RDF.type, SH.SPARQLConstraint)) + + message = getattr(rule, "description", None) + if message: + g.add((constraint, SH.message, Literal(message, lang=self._resolve_language()))) + + g.add((constraint, SH.select, Literal(sparql_query))) + + def _rule_to_sparql(self, sv, cls: ClassDefinition, rule) -> str | None: + """Convert a ``ClassRule`` to a SPARQL SELECT query string. + + Returns ``None`` when the rule does not match any supported pattern. + """ + pre = getattr(rule, "preconditions", None) + post = getattr(rule, "postconditions", None) + if not pre or not post: + return None + + pre_slots = getattr(pre, "slot_conditions", None) or {} + post_slots = getattr(post, "slot_conditions", None) or {} + + # Pattern: boolean guard + # preconditions: exactly one slot with value_presence PRESENT + # postconditions: exactly one slot with equals_string "true" + if len(pre_slots) == 1 and len(post_slots) == 1: + pre_slot_name = next(iter(pre_slots)) + post_slot_name = next(iter(post_slots)) + + pre_cond = pre_slots[pre_slot_name] + post_cond = post_slots[post_slot_name] + + # Note: PresenceEnum.PRESENT is a PermissibleValue, but parsed schemas + # return PresenceEnum instances — wrapping ensures type-compatible comparison. 
+ is_value_present = getattr(pre_cond, "value_presence", None) == PresenceEnum(PresenceEnum.PRESENT) + is_flag_true = getattr(post_cond, "equals_string", None) == "true" + + if is_value_present and is_flag_true: + return self._build_boolean_guard_sparql(sv, cls, post_slot_name, pre_slot_name) + + # Pattern: exclusive value + # preconditions: slot X has equals_string (a specific enum value) + # postconditions: same slot X has maximum_cardinality N + # Semantics: "If value V is present in slot X, then X has at most N values." + pre_equals = getattr(pre_cond, "equals_string", None) + post_max_card = getattr(post_cond, "maximum_cardinality", None) + + if pre_equals is not None and post_max_card is not None and pre_slot_name == post_slot_name: + return self._build_exclusive_value_sparql(sv, cls, pre_slot_name, pre_equals, int(post_max_card)) + + return None + + def _build_boolean_guard_sparql(self, sv, cls: ClassDefinition, flag_slot_name: str, value_slot_name: str) -> str: + """Build a SPARQL SELECT query for the boolean-guard pattern. + + The query detects violations where the value property is present + but the boolean flag is absent or not ``true``. + + Conforms to `SHACL §5.3.1 + `_: + ``$this`` is pre-bound to each focus node. + """ + flag_uri = self._slot_uri(sv, flag_slot_name, cls) + value_uri = self._slot_uri(sv, value_slot_name, cls) + + return ( + f"SELECT $this WHERE {{\n" + f" OPTIONAL {{ $this <{flag_uri}> ?flag . }}\n" + f" OPTIONAL {{ $this <{value_uri}> ?value . }}\n" + f" FILTER (\n" + f' ( !BOUND(?flag) || str(?flag) != "true" ) &&\n' + f" BOUND(?value)\n" + f" )\n" + f"}}" + ) + + def _build_exclusive_value_sparql( + self, + sv, + cls: ClassDefinition, + slot_name: str, + value_name: str, + max_card: int, + ) -> str | None: + """Build a SPARQL SELECT query for the exclusive-value pattern. + + Detects violations where a specific value is present in a multivalued + slot but the total number of values exceeds *max_card*. 
+ + For the common case ``max_card == 1``, the query checks whether the + exclusive value coexists with any other value (simple existence test). + For ``max_card > 1``, a subquery counts all values and checks against + the limit. + + The exclusive value is resolved to its full IRI via the slot's enum + ``meaning`` field. If the slot is not an enum or the value has no + ``meaning``, the value is compared as a plain literal. + + Conforms to `SHACL §5.3.1 + `_: + ``$this`` is pre-bound to each focus node. + """ + slot_uri = self._slot_uri(sv, slot_name, cls) + value_ref = self._resolve_enum_value_ref(sv, slot_name, value_name) + + if max_card == 1: + return ( + f"SELECT $this WHERE {{\n" + f" $this <{slot_uri}> {value_ref} .\n" + f" $this <{slot_uri}> ?other .\n" + f" FILTER (?other != {value_ref})\n" + f"}}" + ) + + return ( + f"SELECT $this WHERE {{\n" + f" $this <{slot_uri}> {value_ref} .\n" + f" {{\n" + f" SELECT $this (COUNT(?val) AS ?count)\n" + f" WHERE {{ $this <{slot_uri}> ?val . }}\n" + f" GROUP BY $this\n" + f" HAVING (?count > {max_card})\n" + f" }}\n" + f"}}" + ) + + def _resolve_enum_value_ref(self, sv, slot_name: str, value_name: str) -> str: + """Resolve an enum value name to a SPARQL term (IRI or literal). + + Looks up the slot's range as an enum, finds the permissible value + matching *value_name*, and returns its ``meaning`` as a full IRI + wrapped in angle brackets. Falls back to a quoted literal if the + slot is not an enum or the value lacks a ``meaning``. + """ + slot = sv.get_slot(slot_name) + if slot: + range_name = slot.range + if range_name and range_name in sv.all_enums(): + enum = sv.get_enum(range_name) + pv = enum.permissible_values.get(value_name) + if pv and pv.meaning: + iri = sv.expand_curie(pv.meaning) + return f"<{iri}>" + return f'"{value_name}"' + + def _slot_uri(self, sv, slot_name: str, cls: ClassDefinition) -> str: + """Resolve a slot name to a full IRI string for use in SPARQL queries. 
+ + Mirrors the resolution logic used for ``sh:path`` in the main slot loop: + prefer ``sv.get_uri()`` for slots registered in the schema map, fall + back to ``default_prefix:underscored_name``. + """ + slot = sv.get_slot(slot_name) + if slot and slot_name in sv.element_by_schema_map(): + return sv.get_uri(slot, expand=True) + pfx = sv.schema.default_prefix + return sv.expand_curie(f"{pfx}:{underscore(slot_name)}") + def _add_class(self, func: Callable, r: ElementName) -> None: """Add an sh:class constraint for range class *r*. @@ -313,13 +639,13 @@ def _add_enum(self, g: Graph, func: Callable, r: ElementName) -> None: sv = self.schemaview enum = sv.get_enum(r) pv_node = BNode() + pv_items = list(enum.permissible_values.items()) + if self.deterministic: + pv_items = sorted(pv_items, key=lambda x: x[0]) Collection( g, pv_node, - [ - URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) - for pv_name, pv in enum.permissible_values.items() - ], + [URIRef(sv.expand_curie(pv.meaning)) if pv.meaning else Literal(pv_name) for pv_name, pv in pv_items], ) func(SH["in"], pv_node) @@ -430,9 +756,14 @@ def _add_annotations(self, func: Callable, item) -> None: else: N_predicate = Literal(a["tag"], datatype=XSD.string) # If the value is a string and ':' is in the value, treat it as a CURIE, - # otherwise treat as Literal with derived XSD datatype + # otherwise treat as Literal with derived XSD datatype. + # String annotations are language-tagged when default_language is set; + # non-string types (bool, int, float) keep their XSD datatype. 
+ lang = self._resolve_language(item) if type(a["value"]) is extended_str and ":" in a["value"]: N_object = URIRef(sv.expand_curie(a["value"])) + elif isinstance(a["value"], str) and lang: + N_object = Literal(a["value"], lang=lang) else: N_object = Literal(a["value"], datatype=self._getXSDtype(a["value"])) @@ -473,7 +804,10 @@ def collect_child_properties(class_name: str, output: set) -> None: list_node = BNode() ignored_properties.add(RDF.type) - Collection(g, list_node, list(ignored_properties)) + props = list(ignored_properties) + if self.deterministic: + props = sorted(props, key=str) + Collection(g, list_node, props) return list_node @@ -527,6 +861,40 @@ def add_simple_data_type(func: Callable, r: ElementName) -> None: help="If --expand-subproperty-of (default), slots with subproperty_of will generate sh:in constraints " "containing all slot descendants. Use --no-expand-subproperty-of to disable this behavior.", ) +@click.option( + "--default-language", + default=None, + show_default=True, + help=( + "Default BCP 47 language tag for human-readable string literals " + "(e.g. en, de, zh-Hans). When set, sh:name, sh:description, " + "rdfs:label and rdfs:comment are emitted with the specified " + "language tag." + ), +) +@click.option( + "--message-template", + default=None, + show_default=True, + help=( + "Template string for sh:message on each property shape. " + "Placeholders: {name} (slot name), {title} (slot title or name), " + "{description} (slot description), {comments} (slot comments joined with '; '), " + "{class} (class name), {path} (property IRI). " + 'Example: "{name} ({class}): {description} [{comments}]"' + ), +) +@click.option( + "--emit-rules/--no-emit-rules", + default=True, + show_default=True, + help=( + "Emit sh:sparql constraints from LinkML rules: blocks. " + "When enabled (default), recognised rule patterns (e.g. boolean-guard) " + "are translated into SHACL-SPARQL constraints on the corresponding " + "sh:NodeShape. 
Use --no-emit-rules to suppress rule generation." + ), +) @click.version_option(__version__, "-V", "--version") def cli(yamlfile, **args): """Generate SHACL turtle from a LinkML model""" diff --git a/packages/linkml/src/linkml/generators/shexgen.py b/packages/linkml/src/linkml/generators/shexgen.py index 40a93ffbc9..704dd1ae61 100644 --- a/packages/linkml/src/linkml/generators/shexgen.py +++ b/packages/linkml/src/linkml/generators/shexgen.py @@ -15,6 +15,7 @@ from linkml._version import __version__ from linkml.generators.common.subproperty import get_subproperty_values from linkml.utils.generator import Generator, shared_arguments +from linkml.utils.rdf_canonicalize import canonicalize_rdf_graph from linkml_runtime.linkml_model.meta import ( ClassDefinition, ElementName, @@ -26,7 +27,6 @@ from linkml_runtime.linkml_model.types import SHEX from linkml_runtime.utils.formatutils import camelcase, sfx from linkml_runtime.utils.metamodelcore import URIorCURIE -from linkml_runtime.utils.rdf_canonicalize import canonicalize_rdf_graph @dataclass diff --git a/packages/linkml/src/linkml/utils/generator.py b/packages/linkml/src/linkml/utils/generator.py index 88fc485851..99121b50e4 100644 --- a/packages/linkml/src/linkml/utils/generator.py +++ b/packages/linkml/src/linkml/utils/generator.py @@ -20,11 +20,12 @@ import os import re import sys +import types from collections.abc import Callable, Mapping from dataclasses import dataclass, field from functools import lru_cache from pathlib import Path -from typing import ClassVar, TextIO, Union, cast +from typing import TYPE_CHECKING, ClassVar, TextIO, Union, cast import click from click import Argument, Command, Option @@ -37,6 +38,10 @@ from linkml.utils.schemaloader import SchemaLoader from linkml.utils.typereferences import References from linkml_runtime import SchemaView + +if TYPE_CHECKING: + from rdflib import Graph as RdfGraph + from linkml_runtime.linkml_model.meta import ( ClassDefinition, ClassDefinitionName, @@ -58,6 
+63,9 @@ from linkml_runtime.utils.formatutils import camelcase, underscore from linkml_runtime.utils.namespaces import Namespaces +if TYPE_CHECKING: + from rdflib import Graph + logger = logging.getLogger(__name__) @@ -78,6 +86,428 @@ def _resolved_metamodel(mergeimports): return metamodel +def well_known_prefix_map() -> dict[str, str]: + """Return a mapping from namespace URI to standard prefix name. + + Primary source: the ``linked_data`` context from `prefixmaps + `_ — the canonical curated + registry maintained by the LinkML team. This context provides + correct, community-consensus prefix names (e.g. ``sh`` not ``shacl``, + ``schema`` not ``sdo``). + + Secondary source: the ``merged`` context from prefixmaps, which + combines prefix.cc, bioregistry, and other sources for broad coverage. + + A small ``_PREFIX_OVERRIDES`` map corrects the few cases where the + merged context disagrees with rdflib/W3C canonical names. + + Both ``http`` and ``https`` variants of schema.org and wgs84 are + included because the linkml-runtime historically binds the HTTP form + while rdflib (and the W3C) prefer HTTPS. + + .. note:: + Requires ``prefixmaps >= 0.2.7``. For entries added in + linkml/prefixmaps#81 (W3C/OGC standard prefixes), pin to + ``prefixmaps @ git+https://github.com/linkml/prefixmaps@75435150`` + until v0.2.8 is released. + """ + return dict(_cached_well_known_prefix_map()) + + +@lru_cache(maxsize=1) +def _cached_well_known_prefix_map() -> dict[str, str]: + """Internal cached builder for well_known_prefix_map().""" + from prefixmaps import load_context + + # Layer 1: merged context (broad coverage, first-seen-wins for duplicates). + merged = load_context("merged") + ns_to_prefix: dict[str, str] = {} + for rec in merged.prefix_expansions: + if rec.namespace not in ns_to_prefix: + ns_to_prefix[rec.namespace] = rec.prefix + + # Layer 2: linked_data context (curated, correct names) overrides merged. 
+ ld = load_context("linked_data") + for rec in ld.prefix_expansions: + ns_to_prefix[rec.namespace] = rec.prefix + + # Layer 3: overrides for the few cases where merged/linked_data disagrees + # with the rdflib/W3C canonical forms used by the RDF community. + for ns, pfx in _PREFIX_OVERRIDES.items(): + ns_to_prefix[ns] = pfx + + # Ensure both HTTP/HTTPS schema.org variants resolve to 'schema'. + ns_to_prefix.setdefault("https://schema.org/", "schema") + ns_to_prefix["http://schema.org/"] = "schema" + + # Ensure both HTTP/HTTPS wgs84 variants resolve to 'wgs'. + ns_to_prefix.setdefault("https://www.w3.org/2003/01/geo/wgs84_pos#", "wgs") + + return ns_to_prefix + + +# Overrides: corrections where prefixmaps merged context uses non-standard names +# that differ from rdflib 7.x / W3C canonical forms. +_PREFIX_OVERRIDES: types.MappingProxyType[str, str] = types.MappingProxyType( + { + # merged gives 'geosparql', rdflib/W3C uses 'geo' + "http://www.opengis.net/ont/geosparql#": "geo", + # merged gives 'sc', rdflib/W3C uses 'schema' + "https://schema.org/": "schema", + # merged gives 'WGS84', rdflib uses 'wgs' + "https://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", + "http://www.w3.org/2003/01/geo/wgs84_pos#": "wgs", + } +) + + +def normalize_graph_prefixes(graph: "Graph", schema_prefixes: dict[str, str]) -> None: + """Normalise non-standard prefix aliases in an rdflib Graph. + + For each prefix bound in *schema_prefixes* (mapping prefix name → + namespace URI), check whether ``well_known_prefix_map()`` knows a + standard name for that URI. If the standard name differs from the + schema-declared name, rebind the namespace to the standard name. + + This is the **shared implementation** used by OWL, SHACL, and (via a + different code-path) JSON-LD context generators so that all serialisation + formats agree on prefix names when ``--normalize-prefixes`` is active. + + :param graph: rdflib Graph whose namespace bindings should be adjusted. 
+ :param schema_prefixes: mapping of prefix name → namespace URI string, + typically from ``schema.prefixes``. + """ + from rdflib import Namespace + + wk = well_known_prefix_map() + + # Phase 1: normalise schema-declared prefixes. + for old_pfx, ns_uri in schema_prefixes.items(): + ns_str = str(ns_uri) + std_pfx = wk.get(ns_str) + if not std_pfx or std_pfx == old_pfx: + continue + # Collision: the user explicitly declared std_pfx for a different + # namespace — do not clobber their binding. + if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str: + logger.warning( + "Prefix collision: cannot rename '%s' to '%s' because '%s' is already " + "declared for <%s>; skipping normalisation for <%s>", + old_pfx, + std_pfx, + std_pfx, + schema_prefixes[std_pfx], + ns_str, + ) + continue + # Rebind: remove old prefix, add standard prefix. + # ``replace=True`` forces the new prefix even if the prefix name + # is already bound to a different namespace. + graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) + + # Phase 2: normalise runtime-injected bindings (e.g. metamodel defaults). + # The linkml-runtime / rdflib may inject well-known namespaces under + # non-standard prefix names. After Phase 1 rebinds schema-declared + # prefixes, orphaned runtime bindings can appear as ``schema1``, ``dc0``, + # etc. Scan the graph's current bindings and fix any that map to a + # well-known namespace under a non-standard name, provided the standard + # name isn't already claimed by the user for a different namespace. + # + # Guard: if Phase 1 already bound std_pfx to a different URI (e.g. + # ``schema`` → ``https://schema.org/``), do not clobber it with the + # HTTP variant (``http://schema.org/``). Build a snapshot of the + # current bindings after Phase 1 to detect this. 
+ current_bindings = {str(p): str(n) for p, n in graph.namespaces()} + for pfx, ns in list(graph.namespaces()): + pfx_str, ns_str = str(pfx), str(ns) + std_pfx = wk.get(ns_str) + if not std_pfx or std_pfx == pfx_str: + continue + # Same collision check as Phase 1: respect user-declared prefixes. + if std_pfx in schema_prefixes and schema_prefixes[std_pfx] != ns_str: + continue + # Guard: if std_pfx is already bound to a different (correct) URI + # by Phase 1, do not overwrite it. This prevents the HTTP variant + # of schema.org from clobbering the HTTPS binding. + if std_pfx in current_bindings and current_bindings[std_pfx] != ns_str: + continue + graph.bind(std_pfx, Namespace(ns_str), override=True, replace=True) + + +def _wl_signatures( + quads: list, + iterations: int = 4, +) -> dict[str, str]: + """Compute Weisfeiler-Lehman structural signatures for blank nodes. + + Uses 1-dimensional WL colour refinement [1]_ to assign each blank + node a deterministic signature derived from its multi-hop + neighbourhood structure. The signature depends only on predicate + IRIs, literal values, and named-node IRIs — **not** on blank-node + identifiers — so it remains stable when unrelated triples are added + or removed. + + Parameters + ---------- + quads : list + Canonical quads from pyoxigraph (after RDFC-1.0). + iterations : int + Number of WL refinement rounds (default 4). + + Returns + ------- + dict[str, str] + Mapping from canonical blank-node ID (e.g. ``c14n42``) to a + truncated SHA-256 hash suitable for use as a stable blank-node + label. + + References + ---------- + .. [1] Weisfeiler, B. & Leman, A. (1968). "The reduction of a graph + to canonical form and the algebra which appears therein." + """ + import hashlib + + import pyoxigraph # guaranteed available — caller (deterministic_turtle) checks + + # Collect all blank node IDs and build adjacency index. 
+ bnode_ids: set[str] = set() + # outgoing[b] = list of (predicate_str, object_str_or_bnode_id, is_bnode) + outgoing: dict[str, list[tuple[str, str, bool]]] = {} + # incoming[b] = list of (subject_str_or_bnode_id, predicate_str, is_bnode) + incoming: dict[str, list[tuple[str, str, bool]]] = {} + + for q in quads: + s, p, o = q.subject, q.predicate, q.object + s_is_bn = isinstance(s, pyoxigraph.BlankNode) + o_is_bn = isinstance(o, pyoxigraph.BlankNode) + p_str = str(p) + + if s_is_bn: + bnode_ids.add(s.value) + outgoing.setdefault(s.value, []).append((p_str, o.value if o_is_bn else str(o), o_is_bn)) + if o_is_bn: + bnode_ids.add(o.value) + incoming.setdefault(o.value, []).append((s.value if s_is_bn else str(s), p_str, s_is_bn)) + + # Initialise signatures: named-node edges only (no bnode IDs). + sig: dict[str, str] = {} + for bid in bnode_ids: + parts = [] + for p_str, o_str, o_is_bn in outgoing.get(bid, []): + if not o_is_bn: + parts.append(f"+{p_str}={o_str}") + for s_str, p_str, s_is_bn in incoming.get(bid, []): + if not s_is_bn: + parts.append(f"-{s_str}={p_str}") + sig[bid] = "|".join(sorted(parts)) + + # Iterative refinement: incorporate neighbour signatures. + for _ in range(iterations): + new_sig: dict[str, str] = {} + for bid in bnode_ids: + parts = [sig[bid]] + for p_str, o_str, o_is_bn in outgoing.get(bid, []): + if o_is_bn: + parts.append(f"+{p_str}={sig.get(o_str, '')}") + for s_str, p_str, s_is_bn in incoming.get(bid, []): + if s_is_bn: + parts.append(f"-{sig.get(s_str, '')}={p_str}") + new_sig[bid] = "|".join(sorted(parts)) + sig = new_sig + + # Convert signatures to truncated SHA-256 hashes. + # Use 12 hex chars (48 bits) — birthday-bound collision probability + # is ~n²/2^49: ~0.002% at 100k nodes. Collisions are handled by + # appending a counter (see below), so correctness is preserved. 
+ hash_map: dict[str, str] = {} + seen_hashes: dict[str, int] = {} + for bid in sorted(bnode_ids): + digest = hashlib.sha256(sig[bid].encode("utf-8")).hexdigest()[:12] + # Handle collisions by appending a counter. + count = seen_hashes.get(digest, 0) + seen_hashes[digest] = count + 1 + label = f"b{digest}" if count == 0 else f"b{digest}_{count}" + hash_map[bid] = label + + return hash_map + + +def deterministic_turtle(graph: "RdfGraph") -> str: + """Serialize an RDF graph to Turtle with deterministic output ordering. + + Uses a three-phase hybrid pipeline for **correctness**, **diff + stability**, and **readability**: + + 1. **RDFC-1.0** [1]_ (via ``pyoxigraph``) canonicalizes the graph, + ensuring isomorphic inputs produce identical triple sets. + 2. **Weisfeiler-Lehman structural hashing** replaces the sequential + ``_:c14nN`` identifiers with content-based hashes derived from + each blank node's multi-hop neighbourhood. These hashes depend + only on predicate IRIs, literal values, and named-node IRIs — + not on blank-node numbering — so adding or removing a triple + only affects the identifiers of directly involved blank nodes. + 3. **Hybrid rdflib re-serialization** parses the canonicalized, + WL-hashed triples back into an rdflib ``Graph`` and serializes + with rdflib's native Turtle writer. This recovers idiomatic + Turtle features that pyoxigraph cannot emit: + + - **Inline blank nodes** (``[ … ]``) for singly-referenced + blank nodes (Turtle §2.7 [2]_), instead of verbose named + ``_:bHASH`` syntax. + - **Collection syntax** (``( … )``) for ``rdf:List`` chains + (Turtle §2.8 [2]_). + - **Prefix filtering**: only prefixes actually used in the + graph's IRIs are declared, following the practice of Apache + Jena, Eclipse RDF4J, and Raptor. + + All triples from the source graph are preserved — the hybrid step + only changes syntactic form, never semantic content. + + Parameters + ---------- + graph : rdflib.Graph + An rdflib Graph to serialize. 
+ + Returns + ------- + str + Deterministic Turtle string with ``@prefix`` declarations. + + References + ---------- + .. [1] W3C (2024). "RDF Dataset Canonicalization (RDFC-1.0)." + W3C Recommendation. https://www.w3.org/TR/rdf-canon/ + .. [2] W3C (2014). "RDF 1.1 Turtle — Terse RDF Triple Language." + W3C Recommendation. https://www.w3.org/TR/turtle/ + """ + try: + import pyoxigraph + except ImportError as exc: + raise ImportError( + "pyoxigraph >= 0.4.0 is required for --deterministic output. " + "Install it with: pip install 'pyoxigraph>=0.4.0'" + ) from exc + + from rdflib import BNode, Graph, Literal, URIRef + + # ── Phase 1: RDFC-1.0 canonicalization ────────────────────────── + nt_data = graph.serialize(format="nt") + + dataset = pyoxigraph.Dataset(pyoxigraph.parse(nt_data, format=pyoxigraph.RdfFormat.N_TRIPLES)) + dataset.canonicalize(pyoxigraph.CanonicalizationAlgorithm.RDFC_1_0) + + canonical_quads = list(dataset) + + # ── Phase 2: WL structural hashing for diff-stable blank node IDs + wl_map = _wl_signatures(canonical_quads) + + def _remap(term): + if isinstance(term, pyoxigraph.BlankNode) and term.value in wl_map: + return pyoxigraph.BlankNode(wl_map[term.value]) + return term + + remapped = [pyoxigraph.Triple(_remap(q.subject), q.predicate, _remap(q.object)) for q in canonical_quads] + + # ── Phase 3: Hybrid rdflib re-serialization ───────────────────── + # Convert pyoxigraph terms to rdflib terms and populate a clean + # Graph that only carries explicitly-bound prefixes. + def _to_rdflib(term): + """Convert a pyoxigraph term to the equivalent rdflib term.""" + if isinstance(term, pyoxigraph.NamedNode): + return URIRef(term.value) + if isinstance(term, pyoxigraph.BlankNode): + return BNode(term.value) + if isinstance(term, pyoxigraph.Literal): + if term.language: + return Literal(term.value, lang=term.language) + if term.datatype: + dt_iri = term.datatype.value + # In RDF 1.1, simple literals are syntactic sugar for + # xsd:string (Turtle §2.5.1). 
Preserve the shorter form + # to match the original owlgen output and avoid spurious + # diffs on every string literal. + if dt_iri == "http://www.w3.org/2001/XMLSchema#string": + return Literal(term.value) + return Literal(term.value, datatype=URIRef(dt_iri)) + return Literal(term.value) + raise TypeError(f"Unexpected pyoxigraph term type: {type(term).__name__}: {term}") + + result_graph = Graph(bind_namespaces="none") + for triple in remapped: + result_graph.add( + ( + _to_rdflib(triple.subject), + _to_rdflib(triple.predicate), + _to_rdflib(triple.object), + ) + ) + + # Bind only prefixes whose namespace IRI is actually referenced + # by at least one subject, predicate, or object in the graph. + # This filters out rdflib's ~27 built-in default bindings + # (brick, csvw, doap, …) that leak through Graph() even when + # the schema never declared them. + used_iris: set[str] = set() + for s, p, o in result_graph: + for term in (s, p, o): + if isinstance(term, URIRef): + used_iris.add(str(term)) + + for pfx, ns in sorted(graph.namespaces()): + pfx_s, ns_s = str(pfx), str(ns) + if pfx_s and any(iri.startswith(ns_s) for iri in used_iris): + result_graph.bind(pfx_s, ns_s) + + # rdflib's Turtle serializer always emits a trailing double newline; + # normalize to a single newline for consistent file endings. + return result_graph.serialize(format="turtle").rstrip("\n") + "\n" + + +def deterministic_json(obj: object, indent: int = 3, preserve_list_order_keys: frozenset[str] | None = None) -> str: + """Serialize a JSON-compatible object with deterministic ordering. + + Recursively sorts all dict keys *and* list elements to produce + stable output across Python versions and process invocations. + + List elements are sorted by their canonical JSON representation + (``json.dumps(item, sort_keys=True)``), which handles lists of + dicts, strings, and mixed types. + + :param obj: A JSON-serializable object (typically parsed from ``as_json``). 
+ :param indent: Number of spaces for indentation. + :param preserve_list_order_keys: Dict keys whose list values must NOT be + sorted (e.g. ``@context``, ``@list`` in JSON-LD where array order is + semantic). Defaults to ``_JSONLD_ORDERED_KEYS``. + :returns: Deterministic JSON string. + """ + import json + + skip = preserve_list_order_keys if preserve_list_order_keys is not None else _JSONLD_ORDERED_KEYS + + def _deep_sort(value: object, parent_key: str = "") -> object: + if isinstance(value, dict): + return {k: _deep_sort(v, parent_key=k) for k, v in sorted(value.items())} + if isinstance(value, list): + sorted_items = [_deep_sort(item) for item in value] + if parent_key in skip: + return sorted_items + try: + return sorted(sorted_items, key=lambda x: json.dumps(x, sort_keys=True, ensure_ascii=False)) + except TypeError: + return sorted_items + return value + + return json.dumps(_deep_sort(obj), indent=indent, ensure_ascii=False) + + +# JSON-LD keys whose array values carry ordering semantics and must not +# be sorted. @context arrays define an override cascade (JSON-LD 1.1 +# §4.1); @list containers are explicitly ordered; @graph and @set are +# included defensively. +_JSONLD_ORDERED_KEYS: frozenset[str] = frozenset({"@context", "@list", "@graph", "@set", "imports"}) + + @dataclass class Generator(metaclass=abc.ABCMeta): """ @@ -139,6 +569,9 @@ class Generator(metaclass=abc.ABCMeta): mergeimports: bool | None = True """True means merge non-linkml sources into importing package. 
False means separate packages""" + deterministic: bool = False + """True means produce stable, reproducible output with sorted keys and canonical blank-node ordering""" + source_file_date: str | None = None """Modification date of input source file""" @@ -180,6 +613,12 @@ class Generator(metaclass=abc.ABCMeta): stacktrace: bool = False """True means print stack trace, false just error message""" + normalize_prefixes: bool = False + """True means normalise non-standard prefix aliases to well-known names + from the ``prefixmaps`` package (linked_data + merged contexts, with + overrides for rdflib/W3C canonical forms). E.g. ``sdo`` → ``schema`` + for ``https://schema.org/``.""" + include: str | Path | SchemaDefinition | None = None """If set, include extra schema outside of the imports mechanism""" @@ -986,6 +1425,26 @@ def decorator(f: Command) -> Command: callback=stacktrace_callback, ) ) + f.params.append( + Option( + ("--deterministic/--no-deterministic",), + default=False, + show_default=True, + help="Generate stable, reproducible output with sorted keys and canonical blank-node ordering. " + "Supported by OWL, SHACL, JSON-LD, and JSON-LD Context generators. " + "Useful when generated artifacts are stored in version control.", + ) + ) + f.params.append( + Option( + ("--normalize-prefixes/--no-normalize-prefixes",), + default=False, + show_default=True, + help="Normalise non-standard prefix aliases to rdflib's curated default names " + "(e.g. sdo → schema for https://schema.org/). " + "Supported by OWL, SHACL, and JSON-LD Context generators.", + ) + ) return f diff --git a/packages/linkml/src/linkml/utils/language_tags.py b/packages/linkml/src/linkml/utils/language_tags.py new file mode 100644 index 0000000000..442a6eadc3 --- /dev/null +++ b/packages/linkml/src/linkml/utils/language_tags.py @@ -0,0 +1,116 @@ +"""BCP 47 language tag validation shared across generators. 
+ +Centralises the syntactic validator and resolution policy used by +:mod:`linkml.generators.owlgen` and :mod:`linkml.generators.shaclgen` +for the ``--default-language`` feature. + +The validator implements *well-formedness* in the sense of +RFC 5646 §2.2.9 (Classes of Conformance): conformance to the ABNF +grammar in §2.1. It does **not** check IANA registry validity -- +that would require external data and is out of scope for a code +generator. RDF 1.1 §3.3 also requires only well-formedness for +``rdf:langString`` literals. + +References +---------- +- RFC 5646 -- Tags for Identifying Languages (BCP 47): https://www.rfc-editor.org/rfc/rfc5646 +- RFC 5646 §2.1 (Syntax / ABNF): https://www.rfc-editor.org/rfc/rfc5646#section-2.1 +- RFC 5646 §2.2.9 (Classes of Conformance): https://www.rfc-editor.org/rfc/rfc5646#section-2.2.9 +- RDF 1.1 Concepts §3.3 (Literals): https://www.w3.org/TR/rdf11-concepts/#section-Graph-Literal +""" + +from __future__ import annotations + +import logging +import re +from typing import Any + +logger = logging.getLogger(__name__) + +# RFC 5646 §2.1 ABNF -- full grammar (langtag | privateuse | grandfathered). +# Each top-level alternative maps 1:1 to an ABNF production: +# langtag = language ["-" script] ["-" region] *("-" variant) +# *("-" extension) ["-" privateuse] +# privateuse = "x" 1*("-" (1*8alphanum)) +# grandfathered = irregular | regular (closed list from §2.2.8) +BCP47_RE: re.Pattern[str] = re.compile( + r"^(?:" + # langtag + r"(?:(?:[A-Za-z]{2,3}(?:-[A-Za-z]{3}){0,3})|[A-Za-z]{4}|[A-Za-z]{5,8})" + r"(?:-[A-Za-z]{4})?" + r"(?:-(?:[A-Za-z]{2}|\d{3}))?" + r"(?:-(?:[A-Za-z\d]{5,8}|\d[A-Za-z\d]{3}))*" + r"(?:-[0-9A-WY-Za-wy-z](?:-[A-Za-z\d]{2,8})+)*" + r"(?:-x(?:-[A-Za-z\d]{1,8})+)?" 
+ # privateuse + r"|x(?:-[A-Za-z\d]{1,8})+" + # grandfathered (irregular) + r"|en-GB-oed|i-ami|i-bnn|i-default|i-enochian|i-hak|i-klingon" + r"|i-lux|i-mingo|i-navajo|i-pwn|i-tao|i-tay|i-tsu" + r"|sgn-BE-FR|sgn-BE-NL|sgn-CH-DE" + # grandfathered (regular) + r"|art-lojban|cel-gaulish|no-bok|no-nyn|zh-guoyu" + r"|zh-hakka|zh-min|zh-min-nan|zh-xiang" + r")$", + re.ASCII, +) + + +def is_well_formed_bcp47(tag: str) -> bool: + """Return ``True`` if *tag* is well-formed per RFC 5646 §2.2.9. + + Well-formedness is conformance to the ABNF grammar in RFC 5646 §2.1; + it does not imply IANA registry validity (RFC 5646 §2.2.9). + """ + return bool(BCP47_RE.match(tag)) + + +class LanguageTagResolver: + """Resolve and validate BCP 47 language tags for code generators. + + The resolver implements the two-level policy used by both ``gen-owl`` + and ``gen-shacl``: + + 1. ``element.in_language`` (per-element override) takes precedence + 2. fall back to the generator-level default + + Validation happens at most once per distinct malformed tag: + + - the generator-level default is validated **once** at construction; + - per-element ``in_language`` values are validated the first time + each distinct tag is observed and remembered in :attr:`_warned`. + + This avoids the original implementation's "hundreds of warnings per + run" failure mode while still surfacing every distinct problem tag. + """ + + __slots__ = ("default", "_warned") + + def __init__(self, default: str | None) -> None: + self.default: str | None = (default or "").strip() or None + if self.default is not None and not is_well_formed_bcp47(self.default): + logger.warning( + "default language tag %r is not a well-formed BCP 47 tag (RFC 5646 §2.2.9)", + self.default, + ) + self._warned: set[str] = set() + + def resolve(self, element: Any = None) -> str | None: + """Return the resolved BCP 47 tag for *element*, or ``None``. + + Resolution order is per-element first, generator default second. 
+ Empty or whitespace-only ``in_language`` values are ignored + (the default is consulted instead). + """ + if element is not None: + element_lang = getattr(element, "in_language", None) + if element_lang and element_lang.strip(): + tag = element_lang.strip() + if not is_well_formed_bcp47(tag) and tag not in self._warned: + logger.warning( + "in_language value %r is not a well-formed BCP 47 tag (RFC 5646 §2.2.9)", + tag, + ) + self._warned.add(tag) + return tag + return self.default diff --git a/packages/linkml/src/linkml/utils/rdf_canonicalize.py b/packages/linkml/src/linkml/utils/rdf_canonicalize.py new file mode 100644 index 0000000000..4b6f093b29 --- /dev/null +++ b/packages/linkml/src/linkml/utils/rdf_canonicalize.py @@ -0,0 +1,226 @@ +"""Deterministic RDF serialization via pyoxigraph RDFC-1.0 canonicalization. + +This module provides a function to canonicalize an rdflib Graph using +pyoxigraph's RDFC-1.0 implementation, producing deterministic output +with stable blank node labels and sorted triples. + +**Known limitations:** + +1. **xsd:string normalization**: pyoxigraph follows RDF 1.1, where plain + string literals and ``"text"^^xsd:string`` are identical. The output + will never contain explicit ``^^xsd:string`` annotations. Code that + re-parses the output with rdflib will see ``Literal("x")`` (datatype + ``None``) rather than ``Literal("x", datatype=XSD.string)``. + +2. **Non-standard RDF**: Graphs with literal predicates (e.g. SHACL + annotation mode) are rejected by pyoxigraph. This function falls + back to rdflib's serializer for such graphs. + +3. **Numeric short forms**: pyoxigraph uses Turtle short forms for + ``xsd:integer`` (``42``), ``xsd:boolean`` (``true``), and + ``xsd:decimal`` (``1.23``). rdflib parses these back with the + correct datatype, so this is lossless. + +4. **Base IRI / prefix collision**: When a graph has ``@base`` and a + prefix whose namespace equals the base IRI (e.g. 
rdflib's auto-bound + ``base:`` prefix), pyoxigraph emits CURIEs like ``base:label`` that + rdflib rejects. We skip such prefixes during serialization. + +5. **Trailing escaped dot in PN_LOCAL**: pyoxigraph emits CURIEs like + ``prefix:local\\.`` for IRIs whose local part ends with ``.``. This + is valid Turtle (PN_LOCAL_ESC), but rdflib's notation3 parser rejects + it because it conflicts with the statement-terminator dot. We + post-process the output to expand such CURIEs to full ``<IRI>`` form. +""" + +import io +import logging +import re + +import pyoxigraph as ox +import rdflib + +logger = logging.getLogger(__name__) + +# Mapping from rdflib/LinkML format strings to pyoxigraph RdfFormat objects. +_FORMAT_MAP: dict[str, ox.RdfFormat] = { + "turtle": ox.RdfFormat.TURTLE, + "ttl": ox.RdfFormat.TURTLE, + "nt": ox.RdfFormat.N_TRIPLES, + "ntriples": ox.RdfFormat.N_TRIPLES, + "n-triples": ox.RdfFormat.N_TRIPLES, + "nt11": ox.RdfFormat.N_TRIPLES, + "nquads": ox.RdfFormat.N_QUADS, + "n-quads": ox.RdfFormat.N_QUADS, + "xml": ox.RdfFormat.RDF_XML, + "rdf/xml": ox.RdfFormat.RDF_XML, + "trig": ox.RdfFormat.TRIG, + "n3": ox.RdfFormat.N3, +} + +# Formats that support prefix declarations. +_PREFIX_FORMATS = frozenset({ox.RdfFormat.TURTLE, ox.RdfFormat.TRIG, ox.RdfFormat.N3, ox.RdfFormat.RDF_XML}) + + +# Characters that may appear escaped in a Turtle PN_LOCAL via PN_LOCAL_ESC. +_PN_LOCAL_ESC_UNESCAPE = re.compile(r"\\([_~.\-!$&'()*+,;=/?#@%])") + + +def _expand_trailing_dot_curies(turtle_text: str, prefixes: dict[str, str]) -> str: + """Replace CURIEs whose local part ends in ``\\.`` with full ``<IRI>`` form. + + rdflib's notation3 parser rejects PN_LOCAL ending in an escaped dot + even though Turtle permits it (PN_LOCAL_ESC). pyoxigraph emits this + form for IRIs ending in ``.`` (e.g. ``biolink:StrandEnum#.``). We + rewrite each such CURIE to its expanded ``<IRI>`` form so the output + round-trips through rdflib. 
+ """ + if not prefixes: + return turtle_text + + # Match: a prefix name, ':', a local part (no whitespace or token + # delimiters), ending in ``\.``, followed by whitespace. Use a + # negative lookbehind to avoid matching inside ``<...>`` or word + # characters that would make this a substring of something else. + pattern = re.compile( + r"(?\"'\[\]]*?\\\.)" + r"(?=\s)" + ) + + def replace(match: re.Match[str]) -> str: + prefix = match.group(1) + local_escaped = match.group(2) + namespace = prefixes.get(prefix) + if namespace is None: + return match.group(0) + local = _PN_LOCAL_ESC_UNESCAPE.sub(r"\1", local_escaped) + return f"<{namespace}{local}>" + + return pattern.sub(replace, turtle_text) + + +def _is_safe_prefix_iri(iri: str) -> bool: + """Check whether a namespace IRI is safe for prefix serialization. + + pyoxigraph rejects IRIs with invalid code-points (e.g. double ``#``), + and rdflib's Turtle parser cannot round-trip CURIEs whose namespace + contains query parameters or fragments in unexpected positions. This + function returns ``False`` for such IRIs so they can be skipped during + prefix collection. + """ + # A namespace IRI should end with '/' or '#'. If '#' appears + # *before* the final character, the IRI contains an embedded + # fragment which produces unusable CURIEs. + if "#" in iri[:-1]: + return False + # Query parameters in namespace IRIs produce CURIEs that rdflib + # cannot parse back. + if "?" in iri: + return False + return True + + +def canonicalize_rdf_graph( + graph: rdflib.Graph, + output_format: str = "turtle", +) -> str: + """Serialize an rdflib Graph deterministically using RDFC-1.0 canonicalization. + + The graph is transferred to pyoxigraph via N-Triples, canonicalized + with RDFC-1.0, sorted, and serialized back to the requested format. + Prefix bindings from the rdflib Graph are preserved in the output + for formats that support them (Turtle, TriG, N3, RDF/XML). 
+ + Falls back to plain rdflib serialization for unsupported formats or + graphs containing non-standard RDF (e.g. literal predicates). + + :param graph: The rdflib Graph to serialize. + :param output_format: Target serialization format (e.g. ``"turtle"``, ``"nt"``). + :return: Deterministic string serialization of the graph. + """ + ox_format = _FORMAT_MAP.get(output_format.lower()) + if ox_format is None: + logger.warning( + "pyoxigraph does not support format %r; falling back to rdflib serializer", + output_format, + ) + # rdflib's Turtle serializer emits a trailing double newline; + # normalize to single newline for consistent file endings. + data = graph.serialize(format=output_format) + return data.rstrip("\n") + "\n" if data.endswith("\n") else data + + # 1. Transfer rdflib graph to pyoxigraph via N-Triples. + nt_data = graph.serialize(format="nt") + nt_bytes = nt_data.encode("utf-8") if isinstance(nt_data, str) else nt_data + + # 2. Parse into pyoxigraph and build a Dataset for canonicalization. + # Fall back to rdflib if the graph contains non-standard RDF + # (e.g. literal predicates from annotations) that pyoxigraph rejects. + try: + triples = list(ox.parse(io.BytesIO(nt_bytes), format=ox.RdfFormat.N_TRIPLES)) + except SyntaxError: + logger.warning( + "Graph contains non-standard RDF that pyoxigraph cannot parse; falling back to rdflib serializer" + ) + return graph.serialize(format=output_format) + + dataset = ox.Dataset() + for triple in triples: + dataset.add(ox.Quad(triple.subject, triple.predicate, triple.object, ox.DefaultGraph())) + + # 3. Canonicalize blank node labels with RDFC-1.0. + dataset.canonicalize(ox.CanonicalizationAlgorithm.RDFC_1_0) + + # 4. Sort triples for deterministic ordering. + quads = list(dataset) + sorted_triples = sorted( + (ox.Triple(q.subject, q.predicate, q.object) for q in quads), + key=lambda t: (str(t.subject), str(t.predicate), str(t.object)), + ) + + # 5. Collect prefixes for formats that support them. 
+ base_iri = str(graph.base) if graph.base else None + prefixes: dict[str, str] | None = None + if ox_format in _PREFIX_FORMATS: + prefixes = {} + for prefix, namespace in graph.namespace_manager.namespaces(): + if not prefix: # skip empty prefix (base) + continue + ns_str = str(namespace) + # Skip prefixes whose namespace matches the base IRI to avoid + # pyoxigraph emitting CURIEs like `base:label` that conflict + # with the @base directive. + if base_iri and ns_str == base_iri: + continue + # Skip namespace IRIs that pyoxigraph rejects or that produce + # CURIEs rdflib cannot round-trip. Valid namespace IRIs for + # prefix use should end with '/' or '#' and contain no query + # parameters or fragment-like characters in the middle. + if not _is_safe_prefix_iri(ns_str): + continue + prefixes[str(prefix)] = ns_str + used_prefixes = prefixes + try: + result_bytes = ox.serialize( + sorted_triples, + format=ox_format, + prefixes=prefixes, + base_iri=base_iri, + ) + except ValueError: + # pyoxigraph rejects prefixes with invalid IRIs (e.g. containing + # fragment-like characters such as double '#'). Retry without + # the offending prefixes by falling back to no prefixes, which + # still produces valid (if verbose) Turtle. 
+ logger.warning("pyoxigraph rejected one or more prefix IRIs; serializing without prefix declarations") + result_bytes = ox.serialize( + sorted_triples, + format=ox_format, + ) + used_prefixes = None + result = result_bytes.decode("utf-8") + if ox_format in _PREFIX_FORMATS and used_prefixes: + result = _expand_trailing_dot_curies(result, used_prefixes) + return result diff --git a/tests/linkml/test_generators/input/shaclgen/any_of_pattern.yaml b/tests/linkml/test_generators/input/shaclgen/any_of_pattern.yaml new file mode 100644 index 0000000000..5b247bb2a1 --- /dev/null +++ b/tests/linkml/test_generators/input/shaclgen/any_of_pattern.yaml @@ -0,0 +1,59 @@ +id: https://w3id.org/linkml/examples/any_of_pattern +name: test_any_of_pattern +description: >- + Test schema for pattern constraints inside any_of branches. + Exercises three cases: (1) pattern-only branch (no range), + (2) range + pattern on the same branch, (3) mixed branches + where some have pattern and some do not. +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://w3id.org/linkml/examples/any_of_pattern/ +imports: + - linkml:types +default_range: string +default_prefix: ex + +enums: + LicenseEnum: + permissible_values: + MIT: + Apache-2.0: + GPL-3.0-only: + +classes: + PatternOnlyBranch: + description: >- + A class where one any_of branch specifies only a pattern + (no range). The generated SHACL sh:or should contain a + node with sh:pattern but no sh:datatype or sh:class. + attributes: + license: + any_of: + - range: LicenseEnum + - range: uri + - pattern: "^LicenseRef-[a-zA-Z0-9\\-\\.]+$" + + RangeWithPattern: + description: >- + A class where an any_of branch combines range + pattern. + The generated SHACL sh:or node should have both sh:datatype + and sh:pattern. 
+ attributes: + identifier: + any_of: + - range: string + pattern: "^[A-Z]{2}-[0-9]{4}$" + - range: integer + + MixedBranches: + description: >- + A class with three any_of branches: one with range only, + one with pattern only, one with range + pattern. Ensures + pattern is emitted only on branches that declare it. + attributes: + code: + any_of: + - range: integer + - pattern: "^CUSTOM-.*$" + - range: string + pattern: "^STD-[0-9]+$" diff --git a/tests/linkml/test_generators/input/shaclgen/cardinality.yaml b/tests/linkml/test_generators/input/shaclgen/cardinality.yaml index 6bacffa680..86f88c4f60 100644 --- a/tests/linkml/test_generators/input/shaclgen/cardinality.yaml +++ b/tests/linkml/test_generators/input/shaclgen/cardinality.yaml @@ -17,6 +17,23 @@ classes: slots: - list_exact_size + ParentClass: + slots: + - inherited_slot + - restricted_slot + + ChildWithZeroMaxCard: + is_a: ParentClass + slot_usage: + restricted_slot: + maximum_cardinality: 0 + + ChildWithZeroExactCard: + is_a: ParentClass + slot_usage: + restricted_slot: + exact_cardinality: 0 + slots: list_min_max_size: range: integer @@ -28,3 +45,11 @@ slots: range: integer multivalued: true exact_cardinality: 3 + + inherited_slot: + range: string + multivalued: true + + restricted_slot: + range: string + multivalued: true diff --git a/tests/linkml/test_generators/test_deterministic_benchmark.py b/tests/linkml/test_generators/test_deterministic_benchmark.py new file mode 100644 index 0000000000..b7488a8dda --- /dev/null +++ b/tests/linkml/test_generators/test_deterministic_benchmark.py @@ -0,0 +1,356 @@ +"""Benchmark: deterministic Turtle serializer on real-world ontologies. + +Evaluates the ``--deterministic`` flag against schema.org (~16 000 triples, +~800 classes, ~1 400 properties) and the kitchen_sink LinkML schema to +demonstrate four properties: + +1. **Semantic equivalence** — ``rdflib.compare.isomorphic()`` confirms that + deterministic and non-deterministic outputs encode the same RDF graph. 
+2. **Byte-level stability** — SHA-256 identity across repeated runs proves + that deterministic output is truly reproducible. +3. **Diff quality** — controlled mutations show that small schema changes + produce small, focused diffs (high signal-to-noise ratio). +4. **Performance** — generation time stays within acceptable bounds even + on large real-world graphs. + +Schema.org tests exercise ``deterministic_turtle()`` directly on a +pre-existing OWL ontology. Kitchen_sink tests exercise the full +``OwlSchemaGenerator`` / ``ShaclGenerator`` pipeline with LinkML schemas. + +References +---------- +- W3C RDFC-1.0: https://www.w3.org/TR/rdf-canon/ +- W3C Turtle 1.1: https://www.w3.org/TR/turtle/ +- schema.org: https://schema.org/docs/developers.html +""" + +import difflib +import hashlib +import time +from pathlib import Path + +import pytest +import yaml +from rdflib import Graph +from rdflib.compare import isomorphic + +from linkml.generators.owlgen import OwlSchemaGenerator +from linkml.generators.shaclgen import ShaclGenerator +from linkml.utils.generator import deterministic_turtle + +_has_pyoxigraph = False +try: + import pyoxigraph + + _has_pyoxigraph = hasattr(pyoxigraph, "Dataset") +except ImportError: + pass + +pytestmark = pytest.mark.skipif( + not _has_pyoxigraph, + reason="pyoxigraph >= 0.4.0 required for deterministic benchmarks", +) + +KITCHEN_SINK = str(Path(__file__).parent / "input" / "kitchen_sink.yaml") +SCHEMA_ORG_URL = "https://schema.org/version/latest/schemaorg-current-https.ttl" + + +def _sha256(text: str) -> str: + return hashlib.sha256(text.encode()).hexdigest() + + +def _diff_line_count(a: str, b: str) -> int: + """Count lines present in *b* but not in *a* (unified-diff additions).""" + al = a.strip().splitlines() + bl = b.strip().splitlines() + return sum( + 1 for line in difflib.unified_diff(al, bl, lineterm="") if line.startswith("+") and not line.startswith("+++") + ) + + +# ── Schema.org: direct serializer benchmark 
──────────────────────── + + +@pytest.fixture(scope="module") +def schema_org_graph(): + """Download and parse schema.org as an rdflib Graph. + + Cached for the module so the network fetch only happens once. + Skips all dependent tests if the download fails. + """ + try: + import urllib.request + + with urllib.request.urlopen(SCHEMA_ORG_URL, timeout=60) as resp: + data = resp.read().decode("utf-8") + except Exception as exc: + pytest.skip(f"Could not fetch schema.org: {exc}") + + g = Graph() + g.parse(data=data, format="turtle") + return g + + +@pytest.mark.network +class TestSchemaOrgDeterministicSerializer: + """Benchmark ``deterministic_turtle()`` on schema.org OWL ontology.""" + + def test_semantic_equivalence(self, schema_org_graph): + """Deterministic serialization must be isomorphic to the original graph.""" + det_ttl = deterministic_turtle(schema_org_graph) + + g_det = Graph() + g_det.parse(data=det_ttl, format="turtle") + + assert len(g_det) == len(schema_org_graph), ( + f"Triple count mismatch: original={len(schema_org_graph)}, deterministic={len(g_det)}" + ) + assert isomorphic(g_det, schema_org_graph), ( + "Deterministic output is NOT isomorphic to original schema.org graph" + ) + + def test_byte_stability(self, schema_org_graph): + """Two deterministic runs must produce byte-identical output.""" + run1 = deterministic_turtle(schema_org_graph) + run2 = deterministic_turtle(schema_org_graph) + assert _sha256(run1) == _sha256(run2), "Deterministic serializer produced different output across runs" + + def test_prefix_filtering(self, schema_org_graph): + """Only prefixes actually used in the graph should be declared.""" + det_ttl = deterministic_turtle(schema_org_graph) + + # Extract declared prefixes + declared = {} + for line in det_ttl.splitlines(): + if line.startswith("@prefix"): + parts = line.split() + pfx = parts[1].rstrip(":") + ns = parts[2].strip("<>") + declared[pfx] = ns + + # Collect all IRIs in the graph + from rdflib import URIRef + + 
used_iris = set() + for s, p, o in schema_org_graph: + for term in (s, p, o): + if isinstance(term, URIRef): + used_iris.add(str(term)) + + # Every declared prefix must have at least one IRI using it + for pfx, ns in declared.items(): + assert any(iri.startswith(ns) for iri in used_iris), f"Prefix '{pfx}:' <{ns}> declared but no IRI uses it" + + def test_performance(self, schema_org_graph): + """Serialization must complete within 60 seconds for ~16K triples.""" + start = time.time() + det_ttl = deterministic_turtle(schema_org_graph) + elapsed = time.time() - start + triple_count = len(schema_org_graph) + throughput = triple_count / elapsed if elapsed > 0 else float("inf") + + # Log for benchmark visibility (shows with pytest -v) + print(f"\n schema.org: {triple_count} triples in {elapsed:.1f}s ({throughput:.0f} triples/s)") + + assert elapsed < 60.0, f"Serialization took {elapsed:.1f}s (limit: 60s) for {triple_count} triples" + assert len(det_ttl) > 1000, "Output suspiciously short" + + +# ── Kitchen_sink: full pipeline benchmark ─────────────────────────── + + +def _mutate_kitchen_sink(description_suffix: str = "", add_slot: bool = False) -> str: + """Create a mutated copy of kitchen_sink.yaml **in the same directory** and return its path. + + The copy must live alongside the original so that LinkML relative imports + (``linkml:types``, ``core``, etc.) resolve correctly. + + Uses a unique filename (via ``os.getpid()``) to avoid race conditions + when tests run in parallel under pytest-xdist. + + Parameters + ---------- + description_suffix + Text appended to the first class description found. + add_slot + If True, adds a synthetic ``benchmark_notes`` slot to the first class. 
+ """ + import os + + ks_path = Path(KITCHEN_SINK) + schema = yaml.safe_load(ks_path.read_text()) + + if description_suffix or add_slot: + # Find the first class with a description + for cls_name, cls_def in schema.get("classes", {}).items(): + if isinstance(cls_def, dict) and cls_def.get("description"): + if description_suffix: + cls_def["description"] += description_suffix + if add_slot: + slots = cls_def.get("slots", []) + slots.append("benchmark_notes") + cls_def["slots"] = slots + break + + # Define the synthetic slot if adding one + if add_slot: + slots_dict = schema.setdefault("slots", {}) + slots_dict["benchmark_notes"] = { + "description": "Synthetic benchmark slot for diff quality testing.", + "range": "string", + } + + # Write in the same directory so relative imports resolve. + # Use PID to avoid race conditions with pytest-xdist workers. + out_path = ks_path.parent / f"_benchmark_mutated_{os.getpid()}_kitchen_sink.yaml" + out_path.write_text( + yaml.dump(schema, default_flow_style=False, allow_unicode=True), + encoding="utf-8", + ) + return str(out_path) + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +class TestKitchenSinkDiffQuality: + """Measure diff quality on the kitchen_sink schema with controlled mutations.""" + + def test_mutation_description_change(self, generator_cls): + """A single description change must produce a small, focused diff. + + Deterministic mode should change only the affected line(s) and their + immediate context (e.g. SHACL may repeat descriptions in sh:description). + Non-deterministic mode produces a much larger diff due to blank-node + and property-ordering instability. 
+ """ + base = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + mutated_path = _mutate_kitchen_sink(description_suffix=" (benchmark edit)") + try: + mutated = generator_cls(mutated_path, deterministic=True).serialize() + finally: + Path(mutated_path).unlink(missing_ok=True) + + det_diff = _diff_line_count(base, mutated) + + # Non-deterministic baseline for comparison + non_base = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + non_mutated_path = _mutate_kitchen_sink(description_suffix=" (benchmark edit)") + try: + non_mutated = generator_cls(non_mutated_path, deterministic=False).serialize() + finally: + Path(non_mutated_path).unlink(missing_ok=True) + + non_diff = _diff_line_count(non_base, non_mutated) + + # The deterministic diff must be small (description + any SHACL mirrors) + assert det_diff <= 20, ( + f"Deterministic diff too large for a 1-description change: {det_diff} lines (expected ≤20)" + ) + # Signal-to-noise: deterministic must be at least 5× smaller + if non_diff > 0: + ratio = non_diff / max(det_diff, 1) + assert ratio >= 5, ( + f"Insufficient noise reduction: det={det_diff}, non-det={non_diff}, ratio={ratio:.1f}× (expected ≥5×)" + ) + + print( + f"\n {generator_cls.__name__} description mutation: " + f"det={det_diff} lines, non-det={non_diff} lines, " + f"noise reduction={non_diff / max(det_diff, 1):.0f}×" + ) + + def test_mutation_add_slot(self, generator_cls): + """Adding a new slot must produce a proportionally small diff. + + A new slot adds ~10-20 triples (label, range, domain, restrictions). + The diff should be roughly proportional to the new content, not a + full-file rewrite. 
+ """ + base = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + mutated_path = _mutate_kitchen_sink(add_slot=True) + try: + mutated = generator_cls(mutated_path, deterministic=True).serialize() + finally: + Path(mutated_path).unlink(missing_ok=True) + + det_diff = _diff_line_count(base, mutated) + + # Non-deterministic baseline for comparison + non_base = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + non_mutated_path = _mutate_kitchen_sink(add_slot=True) + try: + non_mutated = generator_cls(non_mutated_path, deterministic=False).serialize() + finally: + Path(non_mutated_path).unlink(missing_ok=True) + + non_diff = _diff_line_count(non_base, non_mutated) + + g_base = Graph() + g_base.parse(data=base, format="turtle") + g_mut = Graph() + g_mut.parse(data=mutated, format="turtle") + new_triples = len(g_mut) - len(g_base) + + # Diff should be proportional to new triples (allow 5× margin) + assert det_diff <= max(new_triples * 5, 40), ( + f"Deterministic diff ({det_diff} lines) disproportionate to new triples ({new_triples})" + ) + # Signal-to-noise: deterministic must be at least 5× smaller + if non_diff > 0: + ratio = non_diff / max(det_diff, 1) + assert ratio >= 5, ( + f"Insufficient noise reduction: det={det_diff}, non-det={non_diff}, ratio={ratio:.1f}× (expected ≥5×)" + ) + + print( + f"\n {generator_cls.__name__} add-slot mutation: " + f"det_diff={det_diff} lines, non-det={non_diff} lines, " + f"new_triples={new_triples}, noise reduction={non_diff / max(det_diff, 1):.0f}×" + ) + + print(f"\n {generator_cls.__name__} add-slot mutation: det_diff={det_diff} lines, new_triples={new_triples}") + + +@pytest.mark.parametrize( + "generator_cls", + [OwlSchemaGenerator, ShaclGenerator], + ids=["owl", "shacl"], +) +class TestKitchenSinkEquivalence: + """Verify semantic equivalence between deterministic and non-deterministic modes.""" + + def test_triple_count_matches(self, generator_cls): + """Both modes must produce the same number of 
triples.""" + det = generator_cls(KITCHEN_SINK, deterministic=True).serialize() + nondet = generator_cls(KITCHEN_SINK, deterministic=False).serialize() + + g_det = Graph() + g_det.parse(data=det, format="turtle") + g_nondet = Graph() + g_nondet.parse(data=nondet, format="turtle") + + assert len(g_det) == len(g_nondet), ( + f"Triple count mismatch: deterministic={len(g_det)}, non-deterministic={len(g_nondet)}" + ) + + def test_byte_stability_across_runs(self, generator_cls): + """Three deterministic runs must produce identical output.""" + runs = [generator_cls(KITCHEN_SINK, deterministic=True).serialize() for _ in range(3)] + hashes = [_sha256(r) for r in runs] + assert hashes[0] == hashes[1] == hashes[2], f"Deterministic output varies across runs: {hashes}" + + def test_non_deterministic_instability(self, generator_cls): + """Non-deterministic output should vary across runs (documents the problem). + + This test is advisory — it passes regardless but logs the instability. + """ + runs = [generator_cls(KITCHEN_SINK, deterministic=False).serialize() for _ in range(3)] + hashes = [_sha256(r) for r in runs] + identical = hashes[0] == hashes[1] == hashes[2] + print( + f"\n {generator_cls.__name__} non-det stable: {identical} " + f"(expected: False for Turtle due to bnode/ordering instability)" + ) diff --git a/tests/linkml/test_generators/test_deterministic_output.py b/tests/linkml/test_generators/test_deterministic_output.py new file mode 100644 index 0000000000..6721c2ac93 --- /dev/null +++ b/tests/linkml/test_generators/test_deterministic_output.py @@ -0,0 +1,481 @@ +"""Tests for deterministic generator output. + +When ``deterministic=True``, generators must produce byte-identical output +across multiple invocations. This ensures version-controlled artifacts don't +show spurious diffs from blank-node relabeling or dict-ordering instability. 
+ +Generators must also produce **isomorphic** output — the deterministic +serialization must encode the same RDF graph as non-deterministic mode. +""" + +import json +import time +from pathlib import Path + +import pytest +from rdflib import Graph +from rdflib.compare import isomorphic + +from linkml.generators.jsonldcontextgen import ContextGenerator +from linkml.generators.jsonldgen import JSONLDGenerator +from linkml.generators.owlgen import OwlSchemaGenerator +from linkml.generators.shaclgen import ShaclGenerator + +# Deterministic Turtle requires pyoxigraph >= 0.4.0 (for Dataset/canonicalize). +# When an older version is present (e.g. pulled in by morph-kgc), skip these tests. +_has_pyoxigraph = False +try: + import pyoxigraph + + _has_pyoxigraph = hasattr(pyoxigraph, "Dataset") +except ImportError: + pass + +pytestmark = pytest.mark.skipif(not _has_pyoxigraph, reason="pyoxigraph >= 0.4.0 required for deterministic tests") + +SCHEMA = str(Path(__file__).parent / "input" / "personinfo.yaml") + + +@pytest.mark.parametrize( + "generator_cls,kwargs", + [ + (OwlSchemaGenerator, {}), + (ShaclGenerator, {}), + (ContextGenerator, {}), + (JSONLDGenerator, {}), + ], + ids=["owl", "shacl", "context", "jsonld"], +) +def test_deterministic_output_is_identical_across_runs(generator_cls, kwargs): + """Generate output twice with deterministic=True and verify identity.""" + out1 = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize() + out2 = generator_cls(SCHEMA, deterministic=True, **kwargs).serialize() + # JSONLDGenerator embeds a generation_date timestamp — normalize it + if generator_cls is JSONLDGenerator: + import re + + ts_re = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}") + out1 = ts_re.sub("TIMESTAMP", out1) + out2 = ts_re.sub("TIMESTAMP", out2) + assert out1 == out2, f"{generator_cls.__name__} produced different output across runs" + assert len(out1) > 100, "Output suspiciously short — generator may have failed silently" + + 
@pytest.mark.parametrize(
    "generator_cls",
    [ContextGenerator, JSONLDGenerator],
    ids=["context", "jsonld"],
)
def test_deterministic_json_has_sorted_keys(generator_cls):
    """Deterministic JSON output must have sorted dict keys at every level.

    The one exception is the ``@context`` object of the ContextGenerator,
    whose keys are checked per group (``@``-keys, string-valued prefix
    entries, remaining term entries) rather than globally.
    """
    document = json.loads(generator_cls(SCHEMA, deterministic=True).serialize())
    grouped_context = generator_cls is ContextGenerator

    def _check_sorted_keys(obj, path="root"):
        # Lists: only recurse into the elements.
        if isinstance(obj, list):
            for index, element in enumerate(obj):
                _check_sorted_keys(element, f"{path}[{index}]")
            return
        # Scalars carry no keys to check.
        if not isinstance(obj, dict):
            return

        keys = list(obj)
        if grouped_context and path == "root.@context":
            # Split the keys into the three groups, preserving document order.
            at_keys, prefix_keys, term_keys = [], [], []
            for key in keys:
                if key.startswith("@"):
                    at_keys.append(key)
                elif isinstance(obj[key], str):
                    prefix_keys.append(key)
                else:
                    term_keys.append(key)
            assert at_keys == sorted(at_keys), f"@-keys not sorted: {at_keys}"
            assert prefix_keys == sorted(prefix_keys), f"Prefix keys not sorted: {prefix_keys}"
            assert term_keys == sorted(term_keys), f"Term keys not sorted: {term_keys}"
        else:
            assert keys == sorted(keys), f"Keys not sorted at {path}: {keys}"

        for key, value in obj.items():
            _check_sorted_keys(value, f"{path}.{key}")

    _check_sorted_keys(document)
def test_deterministic_turtle_performance():
    """Deterministic OWL generation must complete within 10 seconds for personinfo.

    The Weisfeiler-Lehman approach is O(n log n), so this should easily pass.
    The previous canon=True approach was exponential and failed this test
    for graphs above ~250 triples.
    """
    # perf_counter() is monotonic; time.time() follows the wall clock, which
    # can be adjusted mid-run and distort (or even negate) the measured span.
    start = time.perf_counter()
    out = OwlSchemaGenerator(SCHEMA, deterministic=True).serialize()
    elapsed = time.perf_counter() - start
    assert elapsed < 10.0, f"Deterministic generation took {elapsed:.1f}s (limit: 10s)"
    assert len(out) > 100, "Output suspiciously short"
def test_deterministic_large_schema():
    """End-to-end idempotency on the kitchen_sink schema.

    For the OWL and then the SHACL generator, two deterministic
    serializations must be equal and longer than 500 characters.
    """
    for label, generator_cls in (("OWL", OwlSchemaGenerator), ("SHACL", ShaclGenerator)):
        first = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
        second = generator_cls(KITCHEN_SINK, deterministic=True).serialize()
        assert first == second, f"{label} output differs across runs for kitchen_sink"
        assert len(first) > 500, "kitchen_sink output suspiciously short"
def test_non_deterministic_is_default():
    """A generator built without the flag must report ``deterministic`` as False."""
    assert OwlSchemaGenerator(SCHEMA).deterministic is False
def test_deterministic_turtle_no_bnodes():
    """Graphs with no blank nodes should still produce sorted, deterministic output.

    Builds a two-triple graph (B added before A), serializes it twice with
    ``deterministic_turtle`` and checks that both outputs are equal and that
    subject A is emitted before subject B.
    """
    from rdflib import Graph, Literal, Namespace
    from rdflib.namespace import RDFS

    from linkml.utils.generator import deterministic_turtle

    EX = Namespace("http://example.org/")
    g = Graph()
    # Added in reverse order so the position check below exercises sorting.
    g.add((EX.B, RDFS.label, Literal("B")))
    g.add((EX.A, RDFS.label, Literal("A")))

    out1 = deterministic_turtle(g)
    out2 = deterministic_turtle(g)
    assert out1 == out2

    # A should appear before B (sorted)
    a_pos = out1.find("example.org/A")
    b_pos = out1.find("example.org/B")
    # str.find() returns -1 on a miss, which would let ``a_pos < b_pos`` pass
    # vacuously if subject A were absent; require both subjects to be present.
    assert a_pos != -1, "Subject A missing from deterministic output"
    assert b_pos != -1, "Subject B missing from deterministic output"
    assert a_pos < b_pos, "Triples should be sorted: A before B"
@pytest.mark.parametrize(
    "generator_cls",
    [OwlSchemaGenerator, ShaclGenerator],
    ids=["owl", "shacl"],
)
def test_non_deterministic_output_unchanged(generator_cls):
    """Default (non-deterministic) output must still parse as Turtle.

    Guards the default code path: the serialization must be longer than 100
    characters and yield a graph with more than 50 triples.
    """
    turtle_text = generator_cls(SCHEMA, deterministic=False).serialize()
    assert len(turtle_text) > 100, "Output suspiciously short"

    parsed = Graph()
    parsed.parse(data=turtle_text, format="turtle")
    assert len(parsed) > 50, f"Graph has too few triples ({len(parsed)})"
# NOTE(review): the former ``@pytest.mark.skipif(False, ...)`` decorator was a
# no-op — a false skipif condition never overrides a module-level
# ``pytestmark`` skip, so this test is still subject to whatever module-level
# marks apply. To truly exempt it, the module-level skip would need to be
# applied per test (or this test moved to its own module).
def test_expression_sort_key_is_stable():
    """``_expression_sort_key`` must produce stable, content-based keys.

    LinkML anonymous expressions inherit ``YAMLRoot.__repr__()``, which
    formats objects using **field values** (not memory addresses).
    The ``_expression_sort_key`` helper relies on this for deterministic
    ordering of ``any_of`` / ``all_of`` / ``none_of`` members.

    This test verifies that:
    1. Two distinct objects with identical fields produce the same key.
    2. Objects with different fields produce different keys.
    3. Sorting is stable across repeated calls.
    """
    from linkml.generators.owlgen import _expression_sort_key
    from linkml_runtime.linkml_model.meta import AnonymousClassExpression, AnonymousSlotExpression

    # Two distinct objects with identical content → same key
    a1 = AnonymousClassExpression(is_a="Parent")
    a2 = AnonymousClassExpression(is_a="Parent")
    assert a1 is not a2
    assert _expression_sort_key(a1) == _expression_sort_key(a2)

    # Different content → different keys
    b = AnonymousClassExpression(is_a="Child")
    assert _expression_sort_key(a1) != _expression_sort_key(b)

    # Sorting stability: same order every time. The input is deliberately NOT
    # already in key order, so sorted() has to move ``b`` to the front.
    items = [a1, b, a2]
    for _ in range(5):
        result = sorted(items, key=_expression_sort_key)
        # "Child" < "Parent" alphabetically, so b comes first
        assert _expression_sort_key(result[0]) == _expression_sort_key(b)
        assert _expression_sort_key(result[1]) == _expression_sort_key(result[2])  # a1, a2 together

    # Slot expressions work too
    s1 = AnonymousSlotExpression(range="string")
    s2 = AnonymousSlotExpression(range="integer")
    assert _expression_sort_key(s1) != _expression_sort_key(s2)
    order1 = sorted([s2, s1], key=_expression_sort_key)
    order2 = sorted([s1, s2], key=_expression_sort_key)
    assert [_expression_sort_key(x) for x in order1] == [_expression_sort_key(x) for x in order2]
+ + rdflib binds ``dc`` to ``http://purl.org/dc/elements/1.1/`` by default. + A schema that declares ``dce`` for the same URI should have it normalised + to ``dc`` when the flag is enabled. + + See: rdflib default namespace bindings. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_normalize +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ +imports: + - linkml:types +classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""", + encoding="utf-8", + ) + + # Flag OFF (default): non-standard alias preserved + ctx_off = json.loads(ContextGenerator(str(schema), normalize_prefixes=False).serialize())["@context"] + assert "dce" in ctx_off, "With flag off, original prefix 'dce' must be preserved" + + # Flag ON: rdflib default name used + ctx_on = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + assert "dc" in ctx_on, "With flag on, 'dce' should be normalised to 'dc'" + assert "dce" not in ctx_on, "With flag on, original alias 'dce' should be removed" + assert ctx_on["dc"] == "http://purl.org/dc/elements/1.1/" + + +def test_normalize_prefixes_default_is_off(tmp_path): + """The --normalize-prefixes flag defaults to False — no prefix renaming. + + Ensures backward compatibility: existing schemas produce identical output. 
+ """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_default +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Thing: + class_uri: sdo:Thing + attributes: + name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema)).serialize())["@context"] + # Without the flag, the schema's own prefix name must be preserved + assert "sdo" in ctx, "Default behavior must preserve schema-declared prefix 'sdo'" + + +def test_normalize_prefixes_curie_remapping(tmp_path): + """CURIEs in element @id values use the normalised prefix name. + + When ``sdo`` is normalised to ``schema``, slot URIs like ``sdo:name`` + must appear as ``schema:name`` in the generated context. + """ + schema = tmp_path / "schema.yaml" + schema.write_text( + """\ +id: https://example.org/test +name: test_curie +default_prefix: ex +prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ +imports: + - linkml:types +classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""", + encoding="utf-8", + ) + + ctx = json.loads(ContextGenerator(str(schema), normalize_prefixes=True).serialize())["@context"] + # The prefix declaration must use the standard name + assert "schema" in ctx, "Normalised prefix 'schema' must appear" + # Element @id must use the normalised prefix + person = ctx.get("Person", {}) + assert person.get("@id", "").startswith("schema:"), ( + f"Person @id should use normalised prefix 'schema:', got {person}" + ) diff --git a/tests/linkml/test_generators/test_normalize_prefixes.py b/tests/linkml/test_generators/test_normalize_prefixes.py new file mode 100644 index 0000000000..0a832a5791 --- /dev/null +++ b/tests/linkml/test_generators/test_normalize_prefixes.py @@ -0,0 +1,545 @@ 
+"""Tests for the --normalize-prefixes flag across all generators. + +Verifies that non-standard prefix aliases (e.g. ``sdo`` for ``https://schema.org/``) +are normalised to well-known names (e.g. ``schema``) consistently in OWL, SHACL, +and JSON-LD context output. + +References: +- prefix.cc — community consensus RDF prefix registry +- rdflib 7.x curated default namespace bindings +- W3C Turtle §2.4 — prefix declarations are syntactic sugar +""" + +import json +import logging +import re +import textwrap + +import pytest + +# ── Shared test schema ────────────────────────────────────────────── + +SCHEMA_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: https://schema.org/ + imports: + - linkml:types + classes: + Person: + class_uri: sdo:Person + attributes: + full_name: + range: string + slot_uri: sdo:name +""") + +SCHEMA_DCE = textwrap.dedent("""\ + id: https://example.org/test + name: test_normalize_dce + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + dce: http://purl.org/dc/elements/1.1/ + imports: + - linkml:types + classes: + Record: + class_uri: ex:Record + attributes: + title: + range: string + slot_uri: dce:title +""") + +# HTTP variant — linkml-runtime historically binds schema: http://schema.org/ +# while rdflib (and the W3C) prefer https://schema.org/. The normalize flag +# must handle both. +SCHEMA_HTTP_SDO = textwrap.dedent("""\ + id: https://example.org/test + name: test_http_schema + default_prefix: ex + prefixes: + ex: https://example.org/ + linkml: https://w3id.org/linkml/ + sdo: http://schema.org/ + imports: + - linkml:types + classes: + Place: + class_uri: sdo:Place + attributes: + geo: + range: string + slot_uri: sdo:geo +""") + +# Collision scenario: user declares 'foaf' for a custom namespace AND 'myfoaf' +# for http://xmlns.com/foaf/0.1/. 
def _turtle_prefixes(ttl: str) -> dict[str, str]:
    """Extract ``@prefix`` declarations from Turtle output.

    Args:
        ttl: Turtle document text.

    Returns:
        Mapping of prefix label to namespace IRI. The default (empty) prefix,
        declared as ``@prefix : <...>``, is stored under the key ``""``. When
        a label is declared more than once, the last declaration wins.
    """
    # Turtle prefix labels may contain "-" and "." besides word characters and
    # may be empty; matching only word characters would silently drop such
    # declarations from the result.
    pattern = r"@prefix\s+([\w.-]*):\s+<([^>]+)>"
    return {m.group(1): m.group(2) for m in re.finditer(pattern, ttl)}
def test_owl_dce_normalised_to_dc(tmp_path):
    """``dce`` must be rebound to ``dc`` for http://purl.org/dc/elements/1.1/.

    The check is made on the namespace bindings of the graph returned by
    ``as_graph()`` rather than on serialized ``@prefix`` lines.
    """
    from linkml.generators.owlgen import OwlSchemaGenerator

    generator = OwlSchemaGenerator(_write_schema(tmp_path, SCHEMA_DCE), normalize_prefixes=True)
    bound = {}
    for prefix, namespace in generator.as_graph().namespaces():
        bound[str(prefix)] = str(namespace)
    assert "dc" in bound, f"Expected 'dc' in graph bindings, got: {sorted(bound)}"
    assert bound["dc"] == "http://purl.org/dc/elements/1.1/"
def test_owl_no_schema1_from_runtime_http_binding(tmp_path):
    """OWL output for the ``sdo`` schema must not declare ``schema<N>`` prefixes.

    Any declared prefix whose name starts with ``schema`` followed by a digit
    is treated as an auto-generated leftover and fails the test.
    """
    from linkml.generators.owlgen import OwlSchemaGenerator

    generator = OwlSchemaGenerator(_write_schema(tmp_path, SCHEMA_SDO), normalize_prefixes=True)
    declared = _turtle_prefixes(generator.serialize())

    suffixed = []
    for name in declared:
        if re.match(r"schema\d+", name):
            suffixed.append(name)
    assert not suffixed, (
        f"Auto-generated suffixed prefix(es) {suffixed} found — runtime http://schema.org/ binding was not cleaned up"
    )
def test_shacl_dce_normalised_to_dc(tmp_path):
    """SHACL output must declare ``dc`` (not ``dce``) for the DC elements namespace."""
    from linkml.generators.shaclgen import ShaclGenerator

    generator = ShaclGenerator(_write_schema(tmp_path, SCHEMA_DCE), normalize_prefixes=True)
    declared = _turtle_prefixes(generator.serialize())
    assert "dc" in declared, f"Expected 'dc' prefix in SHACL output, got: {sorted(declared)}"
    assert declared["dc"] == "http://purl.org/dc/elements/1.1/"
    assert "dce" not in declared, "Non-standard 'dce' prefix should be removed"
def test_shacl_http_schema_org_normalised(tmp_path):
    """A schema binding ``sdo`` to http://schema.org/ must serialize with ``schema``."""
    from linkml.generators.shaclgen import ShaclGenerator

    generator = ShaclGenerator(_write_schema(tmp_path, SCHEMA_HTTP_SDO), normalize_prefixes=True)
    declared = _turtle_prefixes(generator.serialize())
    assert "schema" in declared, f"Expected 'schema' prefix for http://schema.org/, got: {sorted(declared)}"
    assert "sdo" not in declared
def test_well_known_prefix_map_returns_dict():
    """``well_known_prefix_map()`` must return a dict with at least 29 entries."""
    from linkml.utils.generator import well_known_prefix_map

    prefix_map = well_known_prefix_map()
    assert isinstance(prefix_map, dict)
    entry_count = len(prefix_map)
    assert entry_count >= 29, f"Expected ≥29 entries, got {entry_count}"
wk1 = well_known_prefix_map() + wk1["http://never-in-any-real-prefix-map.test/"] = "test" + wk2 = well_known_prefix_map() + assert "http://never-in-any-real-prefix-map.test/" not in wk2 + + +def test_well_known_prefix_map_fully_resolved_from_prefixmaps(): + """All rdflib defaults must be resolved from prefixmaps (no residual map). + + This is the proof that pinning prefixmaps to the commit containing + linkml/prefixmaps#81 resolves all well-known prefixes without any + hardcoded fallback. If this test fails after a prefixmaps update, + add the missing prefix to the upstream linked_data.curated.yaml. + """ + from rdflib import Graph as RdfGraph + + from linkml.utils.generator import well_known_prefix_map + + wk = well_known_prefix_map() + rdflib_map = {str(ns): str(pfx) for pfx, ns in RdfGraph().namespaces() if str(pfx)} + missing = {ns: pfx for ns, pfx in rdflib_map.items() if ns not in wk} + assert not missing, f"Prefix map missing rdflib defaults (add to prefixmaps upstream): {missing}" + + +# ── Cross-Generator Consistency Tests ──────────────────────────────── + + +def test_all_generators_normalise_sdo_to_schema(tmp_path): + """OWL, SHACL, and JSON-LD context must all use 'schema' for schema.org.""" + from linkml.generators.jsonldcontextgen import ContextGenerator + from linkml.generators.owlgen import OwlSchemaGenerator + from linkml.generators.shaclgen import ShaclGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + + owl_ttl = OwlSchemaGenerator(schema_path, normalize_prefixes=True).serialize() + shacl_ttl = ShaclGenerator(schema_path, normalize_prefixes=True).serialize() + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + + owl_pfx = _turtle_prefixes(owl_ttl) + shacl_pfx = _turtle_prefixes(shacl_ttl) + + assert "schema" in owl_pfx, "OWL must use 'schema'" + assert "schema" in shacl_pfx, "SHACL must use 'schema'" + assert "schema" in ctx, "JSON-LD context must use 'schema'" + + assert "sdo" not in 
owl_pfx, "OWL must not have 'sdo'" + assert "sdo" not in shacl_pfx, "SHACL must not have 'sdo'" + assert "sdo" not in ctx, "JSON-LD context must not have 'sdo'" + + +# ── Prefix Collision Tests ──────────────────────────────────────────── + + +@pytest.mark.parametrize( + "generator_cls,generator_module", + [ + ("OwlSchemaGenerator", "linkml.generators.owlgen"), + ("ShaclGenerator", "linkml.generators.shaclgen"), + ], + ids=["owl", "shacl"], +) +def test_graph_generator_collision_skips_rename(tmp_path, caplog, generator_cls, generator_module): + """Graph generators: myfoaf must NOT be renamed to 'foaf' when user claims that name.""" + import importlib + + mod = importlib.import_module(generator_module) + cls = getattr(mod, generator_cls) + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + gen = cls(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "myfoaf" in bound, "Non-standard 'myfoaf' must remain when collision prevents renaming" + assert bound["myfoaf"] == "http://xmlns.com/foaf/0.1/" + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + +def test_context_collision_preserves_user_prefix(tmp_path, caplog): + """JSON-LD: user's 'foaf: https://something-else.org/' must survive.""" + from linkml.generators.jsonldcontextgen import ContextGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_COLLISION) + with caplog.at_level(logging.WARNING): + ctx = json.loads(ContextGenerator(schema_path, normalize_prefixes=True).serialize())["@context"] + # User's 'foaf' binding preserved + foaf_val = ctx.get("foaf") + if isinstance(foaf_val, dict): + foaf_val = foaf_val.get("@id", "") + assert foaf_val == "https://something-else.org/", f"User's 'foaf' binding must be preserved, got: {foaf_val}" + # myfoaf must remain (not renamed to foaf) + assert "myfoaf" in ctx, "Non-standard 'myfoaf' must remain 
when collision prevents renaming" + # Warning emitted + assert "collision" in caplog.text.lower(), f"Expected collision warning, got: {caplog.text}" + + +# ── JSONLDGenerator Flag Forwarding Tests ───────────────────────────── + + +def test_jsonld_generator_forwards_normalize_prefixes(tmp_path): + """JSONLDGenerator must pass normalize_prefixes to embedded ContextGenerator. + + Without forwarding, the inline @context in JSON-LD output would keep + non-standard prefix aliases even when --normalize-prefixes is set. + """ + from linkml.generators.jsonldgen import JSONLDGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + out = JSONLDGenerator(schema_path, normalize_prefixes=True).serialize() + parsed = json.loads(out) + # The @context may be a list; find the dict entry + ctx = parsed.get("@context", {}) + if isinstance(ctx, list): + for item in ctx: + if isinstance(item, dict): + ctx = item + break + assert "sdo" not in ctx, "normalize_prefixes not forwarded: 'sdo' still in embedded @context" + + +# ── Phase 2 HTTP/HTTPS Overwrite Bug Tests ──────────────────────────── + + +def test_phase2_does_not_overwrite_https_with_http(tmp_path): + """When Phase 1 binds schema → https://schema.org/, Phase 2 must not + overwrite it with http://schema.org/ from the runtime metamodel. + + Reproduction: linkml:types imports bring schema: http://schema.org/ + (HTTP) while the user schema has sdo: https://schema.org/ (HTTPS). + Phase 1 normalises sdo → schema (HTTPS). Phase 2 must not then + rebind schema → http://schema.org/ when it encounters the runtime + HTTP binding. 
+ """ + from linkml.generators.owlgen import OwlSchemaGenerator + + schema_path = _write_schema(tmp_path, SCHEMA_SDO) + gen = OwlSchemaGenerator(schema_path, normalize_prefixes=True) + graph = gen.as_graph() + bound = {str(p): str(n) for p, n in graph.namespaces()} + assert "schema" in bound, f"Expected 'schema' in bindings, got: {sorted(bound)}" + # MUST be HTTPS (from the user's schema), not HTTP (from runtime) + assert bound["schema"] == "https://schema.org/", ( + f"Phase 2 overwrote HTTPS with HTTP: schema bound to {bound['schema']}" + ) + + +def test_normalize_graph_prefixes_phase2_guard(): + """Direct unit test for the Phase 2 guard in normalize_graph_prefixes. + + Simulates the exact scenario: Phase 1 binds schema → https://schema.org/, + then Phase 2 encounters schema1 → http://schema.org/ and must NOT rebind. + """ + from rdflib import Graph, Namespace, URIRef + + from linkml.utils.generator import normalize_graph_prefixes + + g = Graph(bind_namespaces="none") + # Simulate Phase 1 result + g.bind("schema", Namespace("https://schema.org/")) + # Simulate runtime-injected HTTP variant (would appear as schema1) + g.bind("schema1", Namespace("http://schema.org/")) + # Add a triple so the graph isn't empty + g.add((URIRef("https://example.org/s"), URIRef("https://schema.org/name"), URIRef("https://example.org/o"))) + + normalize_graph_prefixes(g, {"sdo": "https://schema.org/"}) + + bound = {str(p): str(n) for p, n in g.namespaces()} + assert bound.get("schema") == "https://schema.org/", f"Phase 2 guard failed: schema bound to {bound.get('schema')}" + + +def test_empty_schema_no_crash(tmp_path): + """A schema with no custom prefixes must not crash normalize_graph_prefixes.""" + from linkml.generators.owlgen import OwlSchemaGenerator + + (tmp_path / "empty.yaml").write_text( + textwrap.dedent("""\ + id: https://example.org/empty + name: empty + default_prefix: ex + prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/ + imports: + - linkml:types 
+ """), + encoding="utf-8", + ) + # Should not raise + gen = OwlSchemaGenerator(str(tmp_path / "empty.yaml"), normalize_prefixes=True) + ttl = gen.serialize() + assert len(ttl) > 0 diff --git a/tests/linkml/test_generators/test_owlgen.py b/tests/linkml/test_generators/test_owlgen.py index ead3359ee2..af5b904af2 100644 --- a/tests/linkml/test_generators/test_owlgen.py +++ b/tests/linkml/test_generators/test_owlgen.py @@ -1,3 +1,4 @@ +import logging from enum import Enum import pytest @@ -526,6 +527,175 @@ def test_abstract_class_without_subclasses_gets_no_union_of_axiom(): assert _union_members(g, EX.Orphan) is None +def test_abstract_class_with_no_children_emits_info(caplog): + """An abstract class with no children emits an info message about missing coverage. + + When an abstract class has zero subclasses, no covering axiom can be + generated. An info message alerts users that the class hierarchy is + incomplete — this is not a warning because abstract leaf classes are + a normal pattern in base schemas designed for downstream extension. + + See: mgskjaeveland's review on linkml/linkml#3309. + See: matentzn's review on linkml/linkml#3309. 
+ """ + sb = SchemaBuilder() + sb.add_class("Orphan", abstract=True) + sb.add_defaults() + + with caplog.at_level(logging.INFO, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # No covering axiom emitted + assert _union_members(g, EX.Orphan) is None + + # An info message must be logged (not a warning) + assert any("has no children" in msg for msg in caplog.messages), ( + "Expected an info message about abstract class with no children" + ) + assert any("No covering axiom" in msg for msg in caplog.messages), ( + "Info message should mention that no covering axiom will be generated" + ) + + +def test_no_children_info_suppressed_by_skip_flag(caplog): + """When --skip-abstract-class-as-unionof-subclasses is set, no info for zero children.""" + sb = SchemaBuilder() + sb.add_class("Orphan", abstract=True) + sb.add_defaults() + + with caplog.at_level(logging.INFO, logger="linkml.generators.owlgen"): + _owl_graph(sb, skip_abstract_class_as_unionof_subclasses=True) + + assert not any("has no children" in msg for msg in caplog.messages) + + +def test_abstract_class_with_single_child_emits_warning(caplog): + """An abstract class with one child still gets a covering axiom but emits a warning. + + Per OWL 2 semantics, the covering axiom with a single child creates an + equivalence (Parent ≡ Child). This is logically correct but may surprise + users who plan to extend the ontology later. The generator should warn + and recommend ``--skip-abstract-class-as-unionof-subclasses``. + + See: W3C OWL 2 Primer §4.2 — bidirectional rdfs:subClassOf = equivalence. + See: mgskjaeveland's review on linkml/linkml#3309. + """ + sb = SchemaBuilder() + sb.add_class("GrandParent") + sb.add_class("Parent", is_a="GrandParent", abstract=True) + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # Covering axiom IS still emitted (single child → equivalence is OWL-correct). 
+ # With one child, _union_of returns the child URI directly (no owl:unionOf wrapper), + # so the covering axiom materialises as Parent rdfs:subClassOf Child. + # Combined with Child rdfs:subClassOf Parent (from is_a), this is the equivalence. + assert (EX.Parent, RDFS.subClassOf, EX.Child) in g, ( + "Covering axiom should produce Parent rdfs:subClassOf Child for single-child case" + ) + assert (EX.Child, RDFS.subClassOf, EX.Parent) in g + assert (EX.Parent, RDFS.subClassOf, EX.GrandParent) in g + + # But a warning must be logged + assert any("only 1 direct child" in msg for msg in caplog.messages), ( + "Expected a warning about single-child covering axiom creating equivalence" + ) + assert any("--skip-abstract-class-as-unionof-subclasses" in msg for msg in caplog.messages), ( + "Warning should recommend the skip flag" + ) + + +def test_single_child_warning_suppressed_by_skip_flag(caplog): + """When --skip-abstract-class-as-unionof-subclasses is set, no warning is emitted. + + The skip flag suppresses covering axioms entirely, so the single-child + equivalence case never arises. + """ + sb = SchemaBuilder() + sb.add_class("Parent", abstract=True) + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb, skip_abstract_class_as_unionof_subclasses=True) + + # No covering axiom emitted + assert (EX.Parent, RDFS.subClassOf, EX.Child) not in g + # No warning either + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_multiple_children_no_warning(caplog): + """An abstract class with 2+ children must NOT emit a warning. + + The covering axiom is a proper union (not a degenerate equivalence), + so no warning is needed. 
+ """ + sb = SchemaBuilder() + sb.add_class("Animal", abstract=True) + sb.add_class("Dog", is_a="Animal") + sb.add_class("Cat", is_a="Animal") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # Covering axiom emitted (proper union) + members = _union_members(g, EX.Animal) + assert members == {EX.Dog, EX.Cat} + + # No warning about children count + assert not any("has no children" in msg for msg in caplog.messages) + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_non_abstract_class_no_warning(caplog): + """A non-abstract class must NOT emit covering axiom warnings. + + Covering axioms only apply to abstract classes. Concrete classes + should be silently skipped regardless of child count. + """ + sb = SchemaBuilder() + sb.add_class("Parent") # not abstract + sb.add_class("Child", is_a="Parent") + sb.add_defaults() + + with caplog.at_level(logging.WARNING, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + # No covering axiom for non-abstract class + assert _union_members(g, EX.Parent) is None + assert (EX.Parent, RDFS.subClassOf, EX.Child) not in g + + # No warning either + assert not any("has no children" in msg for msg in caplog.messages) + assert not any("only 1 direct child" in msg for msg in caplog.messages) + + +def test_abstract_class_with_only_mixin_children_emits_info(caplog): + """An abstract class whose only children are via mixins (not is_a) gets the no-children info. + + The covering axiom only considers direct is_a children (not mixins). + If an abstract class has mixin children but no is_a children, it should + log an info message about having no children for covering axiom purposes. 
+ """ + sb = SchemaBuilder() + sb.add_class("Base", abstract=True) + sb.add_class("MixinChild", mixins=["Base"]) + sb.add_defaults() + + with caplog.at_level(logging.INFO, logger="linkml.generators.owlgen"): + g = _owl_graph(sb) + + assert _union_members(g, EX.Base) is None + assert any("has no children" in msg for msg in caplog.messages), ( + "Abstract class with only mixin children should log info about no is_a children" + ) + + @pytest.mark.parametrize("skip", [False, True]) def test_union_of_axiom_only_covers_direct_children(skip: bool): """Union-of axiom lists only direct is_a children, not grandchildren. @@ -824,3 +994,405 @@ def test_children_are_mutually_disjoint( members_node = list(g.objects(disjoint_nodes[0], OWL.members))[0] members = set(Collection(g, members_node)) assert members == {EX[name] for name in child_names} + + +# --------------------------------------------------------------------------- +# --default-language tests +# --------------------------------------------------------------------------- + + +def _build_lang_test_schema(): + """Build a small schema with classes, slots, and an enum for language-tag testing.""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "vehicle_name", + range="string", + description="The vehicle name.", + title="Name", + ) + ) + sb.add_slot( + SlotDefinition( + "color", + range="ColorEnum", + description="Paint color.", + ) + ) + sb.add_class( + "Vehicle", + slots=["vehicle_name", "color"], + description="A road vehicle.", + title="Vehicle", + ) + sb.add_enum( + "ColorEnum", + permissible_values=[ + PermissibleValue(text="Red", description="A warm color."), + PermissibleValue(text="Blue", description="A cool color."), + ], + ) + sb.add_defaults() + return sb.schema + + +def test_default_language_tags_owl_labels(): + """With --default-language en, rdfs:label and skos:definition get @en.""" + schema = _build_lang_test_schema() + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, 
+ type_objects=False, + default_language="en", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Class label + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle", lang="en") in labels + + # Class description + defs = list(g.objects(EX.Vehicle, SKOS.definition)) + assert Literal("A road vehicle.", lang="en") in defs + + # Enum PV label — PVs are emitted as <{enum_uri}#{pv_text}> + pv_red = URIRef(str(EX.ColorEnum) + "#Red") + pv_labels = list(g.objects(pv_red, RDFS.label)) + assert Literal("Red", lang="en") in pv_labels + + # No plain (untagged) literals should be present for these predicates + for lit in labels + defs + pv_labels: + assert lit.language == "en", f"Expected @en, got lang={lit.language!r} on {lit!r}" + + +def test_no_default_language_produces_plain_literals(): + """Without --default-language, literals have no language tag (backward-compat).""" + schema = _build_lang_test_schema() + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no language tag, got {lit.language!r}" + + +def test_default_language_does_not_tag_uri_range_metaslots(): + """Metaslots with non-string ranges must never produce language-tagged literals. + + This is the *negative* counterpart to the positive tagging tests. It asserts + that language tags appear only on human-readable predicates (per RDF 1.1 + Concepts §3.3 and the ``_LANGUAGE_TAGGABLE_RANGES`` allowlist), and never on: + + - IRI-valued predicates (``owl:imports``, ``rdf:type``, ``rdfs:isDefinedBy``) + - The ``status`` metaslot (range ``uriorcurie``) + - Enum-ranged metaslots that land in ``add_metadata``'s catch-all branch + (e.g. ``pv_formula`` on permissible values). 
+ + A regression where any of these become ``rdf:langString`` would silently + break SHACL ``sh:in`` / OWL ``owl:oneOf`` matching downstream. + """ + schema = _build_lang_test_schema() + # uriorcurie-ranged metaslots that should remain IRIs: + schema.id_prefixes = ["http://example.org/"] + schema.status = "release" + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="de", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Positive sanity check: string-ranged labels still get the @de tag. + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle", lang="de") in labels + + # Strong negative: language tags appear ONLY on known human-readable + # predicates. Whitelist everything we expect to be tag-bearing; anything + # else carrying a language tag is a regression. + from rdflib.namespace import DCTERMS + + # All known human-readable annotation predicates that owlgen may emit + # from string-ranged metaslots. Adding a new string metaslot to the + # linkml metamodel requires extending this allowlist (or proving the + # value is constraint data, not a label). + LANG_TAG_ALLOWED_PREDICATES = { + RDFS.label, + RDFS.comment, + SKOS.definition, + SKOS.prefLabel, + SKOS.altLabel, + SKOS.editorialNote, # linkml ``notes`` metaslot + SKOS.note, + SKOS.example, + DCTERMS.title, + DCTERMS.description, + } + for s, p, o in g: + if isinstance(o, Literal) and o.language is not None: + assert p in LANG_TAG_ALLOWED_PREDICATES, ( + f"Predicate {p!r} produced a language-tagged literal {o!r}; " + "only label/description-style predicates may carry @lang." + ) + + # And specifically: every emitted ``status`` (uriorcurie range) reaches the + # graph as a URIRef, not a Literal of any kind. 
+ BIBO_STATUS = URIRef("http://purl.org/ontology/bibo/status") + for obj in g.objects(None, BIBO_STATUS): + assert isinstance(obj, URIRef), f"uriorcurie-ranged ``status`` metaslot must emit URIRef, got {obj!r}" + + +def test_default_language_in_language_override(): + """Element-level in_language overrides the generator default_language.""" + schema = _build_lang_test_schema() + schema.classes["Vehicle"].in_language = "de" + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Vehicle class should use element-level "de", not default "en" + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle", lang="de") in labels + assert Literal("Vehicle", lang="en") not in labels + + # ColorEnum should still use the default "en" (no override) + enum_labels = list(g.objects(EX.ColorEnum, RDFS.label)) + assert Literal("ColorEnum", lang="en") in enum_labels + + +def test_default_language_annotations_tagged(): + """OWL annotations with string values are language-tagged.""" + from linkml_runtime.linkml_model.meta import Annotation, Prefix + + sb = SchemaBuilder() + sb.add_class("Widget", description="A widget.") + sb.add_defaults() + sb.schema.prefixes["skos"] = Prefix( + prefix_prefix="skos", + prefix_reference="http://www.w3.org/2004/02/skos/core#", + ) + sb.schema.classes["Widget"].annotations["skos:altLabel"] = Annotation(tag="skos:altLabel", value="Gadget") + + owl = OwlSchemaGenerator( + sb.schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + alt_labels = list(g.objects(EX.Widget, SKOS.altLabel)) + assert Literal("Gadget", lang="en") in alt_labels + + +def test_default_language_empty_string_treated_as_none(): + """An empty string default_language is normalised to None (no tags).""" + schema = 
_build_lang_test_schema() + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_default_language_whitespace_only_treated_as_none(): + """A whitespace-only default_language is normalised to None (no tags).""" + schema = _build_lang_test_schema() + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language=" ", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_default_language_bcp47_warning(caplog): + """A malformed BCP 47 tag logs a warning but still produces output.""" + import logging + + schema = _build_lang_test_schema() + # "toolongtag" passes rdflib's lax regex but fails strict BCP 47 (max 8 chars for subtag). 
+ with caplog.at_level(logging.WARNING): + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="toolongtag", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Tag is still applied (warning, not error) + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert any(lit.language == "toolongtag" for lit in labels) + # Warning was emitted + assert any("not a well-formed BCP 47 tag" in rec.message for rec in caplog.records) + + +def test_default_language_bcp47_valid_no_warning(caplog): + """A well-formed BCP 47 tag does not log any warning.""" + import logging + + schema = _build_lang_test_schema() + with caplog.at_level(logging.WARNING): + OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + assert not any("BCP 47" in rec.message for rec in caplog.records) + + +def test_default_language_in_language_override_bcp47_warning(caplog): + """A malformed in_language value logs a warning.""" + import logging + + schema = _build_lang_test_schema() + # "toolongtag" passes rdflib but fails strict BCP 47. + schema.classes["Vehicle"].in_language = "toolongtag" + with caplog.at_level(logging.WARNING): + owl = OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ).serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + # Vehicle uses the (malformed) in_language, not the default + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert any(lit.language == "toolongtag" for lit in labels) + assert any("in_language" in rec.message and "toolongtag" in rec.message for rec in caplog.records) + + +def test_default_language_does_not_tag_enum_ranged_metaslot_in_catchall_branch(monkeypatch): + """Direct regression test for PR #3449 review comment #2. 
+ + The ``else`` branch in :meth:`OwlSchemaGenerator.add_metadata` is reached + when a metaslot's range is neither a type, subset, nor class -- in + practice, an enum-ranged metaslot. The fix removes the unconditional + ``Literal(v, lang=lang)`` emission from that branch. + + No metaslot in the *current* LinkML metamodel reaches this branch with + a non-``linkml:`` slot URI (``pv_formula``, ``obligation_level``, + ``alias_predicate`` are all either filtered by the ``linkml:`` guard + or nested inside class-ranged containers). To exercise the branch + directly, this test temporarily promotes ``pv_formula``'s slot URI to + a non-``linkml:`` value via ``monkeypatch``, then verifies the emitted + permissible-value identifier remains a plain ``xsd:string`` literal -- + never ``rdf:langString`` -- even with ``--default-language en`` set. + + Tagging this value would shift the datatype and silently break + downstream SHACL ``sh:in`` / OWL ``owl:oneOf`` matching (RDF 1.1 + Concepts §3.3). + """ + sb = SchemaBuilder() + sb.add_enum("ColorEnum", permissible_values=[PermissibleValue(text="Red")]) + sb.schema.enums["ColorEnum"].pv_formula = "CODE" + sb.add_defaults() + + gen = OwlSchemaGenerator( + sb.schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + default_language="en", + ) + # Promote pv_formula's slot URI so it passes the ``linkml:`` guard in + # add_metadata and actually reaches the catch-all else branch. + pv_formula_slot = gen.metamodel_schemaview.get_slot("pv_formula") + monkeypatch.setattr(pv_formula_slot, "slot_uri", "https://example.org/pv_formula") + + owl = gen.serialize() + g = Graph() + g.parse(data=owl, format="turtle") + + pv_formula_objects = list(g.objects(None, URIRef("https://example.org/pv_formula"))) + assert pv_formula_objects, ( + "Test setup failure: pv_formula triple was not emitted -- the monkey-patch may have stopped working." 
+ ) + for obj in pv_formula_objects: + assert isinstance(obj, Literal), f"expected Literal, got {obj!r}" + assert obj.language is None, f"catch-all else branch language-tagged an enum-ranged metaslot value: {obj!r}" + assert str(obj) == "CODE" + + +def test_default_language_bcp47_warning_is_deduplicated(caplog): + """Each distinct malformed tag warns at most once across the whole run. + + Regression test for the original implementation, which re-validated on + every call to ``_resolve_language`` and emitted one warning per element + -- potentially hundreds per run. The shared :class:`LanguageTagResolver` + caches the default check (one warning at construction) and remembers + already-warned per-element ``in_language`` tags. + """ + import logging + + schema = _build_lang_test_schema() + # Stamp the same malformed in_language on multiple elements. + schema.classes["Vehicle"].in_language = "toolongtag" + schema.enums["ColorEnum"].in_language = "toolongtag" + schema.slots["vehicle_name"].in_language = "toolongtag" + schema.slots["color"].in_language = "toolongtag" + + with caplog.at_level(logging.WARNING, logger="linkml.utils.language_tags"): + OwlSchemaGenerator( + schema, + mergeimports=False, + metaclasses=False, + type_objects=False, + # Also stamp a malformed default to exercise the default branch. 
+ default_language="anothertoolongone", + ).serialize() + + in_language_warnings = [ + rec for rec in caplog.records if "in_language" in rec.message and "toolongtag" in rec.message + ] + default_warnings = [ + rec for rec in caplog.records if "default language" in rec.message and "anothertoolongone" in rec.message + ] + assert len(in_language_warnings) == 1, ( + f"expected exactly 1 in_language warning for 'toolongtag', got {len(in_language_warnings)}" + ) + assert len(default_warnings) == 1, f"expected exactly 1 default-language warning, got {len(default_warnings)}" diff --git a/tests/linkml/test_generators/test_shaclgen.py b/tests/linkml/test_generators/test_shaclgen.py index 84bac6b4ec..8b3ce5e89b 100644 --- a/tests/linkml/test_generators/test_shaclgen.py +++ b/tests/linkml/test_generators/test_shaclgen.py @@ -7,6 +7,8 @@ from linkml.generators.shacl.shacl_data_type import ShaclDataType from linkml.generators.shaclgen import ShaclGenerator +from linkml_runtime.linkml_model import SlotDefinition +from linkml_runtime.utils.schema_builder import SchemaBuilder EXPECTED = [ ( @@ -577,6 +579,81 @@ def test_multivalued_slot_exact_cardinality(input_path): ) in g +def test_zero_maximum_cardinality_emits_maxcount(input_path): + """Test that maximum_cardinality: 0 correctly emits sh:maxCount 0. + + Regression test for the bug where Python truthiness check + `if s.maximum_cardinality:` would skip the value 0 (falsy), + failing to emit sh:maxCount 0 in the generated SHACL shape. + The fix uses `if s.maximum_cardinality is not None:` instead. + + This is the primary mechanism for suppressing inherited slots on + subclasses via slot_usage (e.g., OWL maxCardinality 0 pattern). 
+ """ + shacl = ShaclGenerator(input_path("shaclgen/cardinality.yaml"), mergeimports=True).serialize() + + g = rdflib.Graph() + g.parse(data=shacl) + + # Find the ChildWithZeroMaxCard shape + child_uri = URIRef("https://w3id.org/linkml/examples/cardinality/ChildWithZeroMaxCard") + restricted_slot_uri = URIRef("https://w3id.org/linkml/examples/cardinality/restricted_slot") + + # Get all property shapes for the child class + prop_nodes = list(g.objects(child_uri, SH.property)) + assert prop_nodes, "ChildWithZeroMaxCard should have property shapes" + + # Find the property shape for restricted_slot + restricted_prop_node = None + for pn in prop_nodes: + if (pn, SH.path, restricted_slot_uri) in g: + restricted_prop_node = pn + break + assert restricted_prop_node is not None, "Should have a property shape for restricted_slot" + + # The critical assertion: sh:maxCount 0 must be emitted + max_count_values = list(g.objects(restricted_prop_node, SH.maxCount)) + assert len(max_count_values) == 1, f"Expected exactly one sh:maxCount, got {max_count_values}" + assert max_count_values[0] == rdflib.term.Literal( + 0, datatype=rdflib.term.URIRef("http://www.w3.org/2001/XMLSchema#integer") + ), f"sh:maxCount should be 0, got {max_count_values[0]}" + + +def test_zero_exact_cardinality_emits_both_counts(input_path): + """Test that exact_cardinality: 0 emits both sh:minCount 0 and sh:maxCount 0. + + Same truthiness bug as maximum_cardinality: `if s.exact_cardinality:` + skips value 0 (falsy). The fix uses `is not None` instead. 
+ """ + shacl = ShaclGenerator(input_path("shaclgen/cardinality.yaml"), mergeimports=True).serialize() + + g = rdflib.Graph() + g.parse(data=shacl) + + child_uri = URIRef("https://w3id.org/linkml/examples/cardinality/ChildWithZeroExactCard") + restricted_slot_uri = URIRef("https://w3id.org/linkml/examples/cardinality/restricted_slot") + + prop_nodes = list(g.objects(child_uri, SH.property)) + assert prop_nodes, "ChildWithZeroExactCard should have property shapes" + + restricted_prop_node = None + for pn in prop_nodes: + if (pn, SH.path, restricted_slot_uri) in g: + restricted_prop_node = pn + break + assert restricted_prop_node is not None, "Should have a property shape for restricted_slot" + + XSD_INT = rdflib.term.URIRef("http://www.w3.org/2001/XMLSchema#integer") + + min_count_values = list(g.objects(restricted_prop_node, SH.minCount)) + assert len(min_count_values) == 1, f"Expected exactly one sh:minCount, got {min_count_values}" + assert min_count_values[0] == rdflib.term.Literal(0, datatype=XSD_INT) + + max_count_values = list(g.objects(restricted_prop_node, SH.maxCount)) + assert len(max_count_values) == 1, f"Expected exactly one sh:maxCount, got {max_count_values}" + assert max_count_values[0] == rdflib.term.Literal(0, datatype=XSD_INT) + + def test_exclude_imports(input_path): shacl = ShaclGenerator( input_path("shaclgen/exclude_imports.yaml"), mergeimports=True, exclude_imports=True @@ -1169,3 +1246,1426 @@ def test_nodeidentifier_range_produces_blank_node_or_iri(): uri_ref = props["https://example.org/uriRef"] uri_kinds = list(g.objects(uri_ref, SH.nodeKind)) assert SH.IRI in uri_kinds, f"Expected sh:IRI for uri, got {uri_kinds}" + + +def test_any_of_with_pattern(input_path): + """Test that pattern constraints inside any_of branches emit sh:pattern. + + Exercises three cases: + 1. PatternOnlyBranch: any_of with a pattern-only branch (no range) + 2. RangeWithPattern: any_of with range + pattern on the same branch + 3. 
MixedBranches: combination of range-only, pattern-only, and range+pattern + """ + shacl = ShaclGenerator(input_path("shaclgen/any_of_pattern.yaml"), mergeimports=True).serialize() + g = rdflib.Graph() + g.parse(data=shacl) + + def get_or_branch_nodes(class_uri: str, slot_local: str) -> list[rdflib.BNode]: + """Return the list of BNodes inside sh:or for a given class property.""" + class_ref = URIRef(class_uri) + for prop_node in g.objects(class_ref, SH.property): + paths = list(g.objects(prop_node, SH.path)) + if any(slot_local in str(p) for p in paths): + for or_head in g.objects(prop_node, SH["or"]): + return list(Collection(g, or_head)) + return [] + + prefix = "https://w3id.org/linkml/examples/any_of_pattern/" + + # Case 1: PatternOnlyBranch — license slot has 3 branches: + # [enum sh:in], [sh:nodeKind sh:IRI], [sh:pattern "^LicenseRef-..."] + branches = get_or_branch_nodes(f"{prefix}PatternOnlyBranch", "license") + assert len(branches) == 3, f"Expected 3 branches, got {len(branches)}" + # Find the branch with sh:pattern + pattern_branches = [b for b in branches if list(g.objects(b, SH.pattern))] + assert len(pattern_branches) == 1, f"Expected 1 pattern branch, got {len(pattern_branches)}" + pattern_val = str(list(g.objects(pattern_branches[0], SH.pattern))[0]) + assert pattern_val == "^LicenseRef-[a-zA-Z0-9\\-\\.]+$" + # The pattern-only branch should NOT have sh:datatype or sh:class + assert list(g.objects(pattern_branches[0], SH.datatype)) == [] + assert list(g.objects(pattern_branches[0], SH["class"])) == [] + + # Case 2: RangeWithPattern — identifier slot has 2 branches: + # [sh:datatype xsd:string + sh:pattern "^[A-Z]{2}-[0-9]{4}$"], [sh:datatype xsd:integer] + branches = get_or_branch_nodes(f"{prefix}RangeWithPattern", "identifier") + assert len(branches) == 2, f"Expected 2 branches, got {len(branches)}" + # Find branch with both datatype and pattern + combo_branches = [b for b in branches if list(g.objects(b, SH.datatype)) and list(g.objects(b, 
SH.pattern))] + assert len(combo_branches) == 1, f"Expected 1 combo branch, got {len(combo_branches)}" + assert str(list(g.objects(combo_branches[0], SH.pattern))[0]) == "^[A-Z]{2}-[0-9]{4}$" + # The other branch (integer) should NOT have sh:pattern + int_branches = [b for b in branches if b not in combo_branches] + assert list(g.objects(int_branches[0], SH.pattern)) == [] + + # Case 3: MixedBranches — code slot has 3 branches: + # [sh:datatype xsd:integer], [sh:pattern "^CUSTOM-.*$"], [sh:datatype xsd:string + sh:pattern "^STD-[0-9]+$"] + branches = get_or_branch_nodes(f"{prefix}MixedBranches", "code") + assert len(branches) == 3, f"Expected 3 branches, got {len(branches)}" + # Exactly 2 branches should have sh:pattern + pattern_branches = [b for b in branches if list(g.objects(b, SH.pattern))] + assert len(pattern_branches) == 2, f"Expected 2 pattern branches, got {len(pattern_branches)}" + # Collect the patterns + patterns = sorted(str(list(g.objects(b, SH.pattern))[0]) for b in pattern_branches) + assert patterns == ["^CUSTOM-.*$", "^STD-[0-9]+$"] + # The integer-only branch should have no pattern + no_pattern = [b for b in branches if not list(g.objects(b, SH.pattern))] + assert len(no_pattern) == 1 + assert list(g.objects(no_pattern[0], SH.datatype)) == [URIRef("http://www.w3.org/2001/XMLSchema#integer")] + + +# --------------------------------------------------------------------------- +# --default-language tests +# --------------------------------------------------------------------------- + +EX = rdflib.Namespace("http://example.org/test-schema/") + + +def _build_shacl_lang_schema(): + """Build a schema with title/description for language-tag testing.""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "vehicle_name", + range="string", + description="The vehicle name.", + title="Name", + ) + ) + sb.add_class( + "Vehicle", + slots=["vehicle_name"], + description="A road vehicle.", + title="Vehicle", + ) + sb.add_defaults() + return sb.schema + + 
+def _build_message_test_schema(): + """Build a schema for sh:message testing (includes a second slot without title).""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "vehicle_name", + range="string", + description="The vehicle name.", + title="Name", + required=True, + ) + ) + sb.add_slot( + SlotDefinition( + "speed", + range="integer", + description="Speed in km/h.", + ) + ) + sb.add_class( + "Vehicle", + slots=["vehicle_name", "speed"], + description="A road vehicle.", + ) + sb.add_defaults() + return sb.schema + + +# --------------------------------------------------------------------------- +# Helper functions +# --------------------------------------------------------------------------- + + +def _parse_shacl(schema, **kwargs): + shacl = ShaclGenerator(schema, mergeimports=False, **kwargs).serialize() + g = rdflib.Graph() + g.parse(data=shacl) + return g + + +def _get_prop_objects(g, shape_uri, prop_path_uri, predicate): + """Get predicate values for the property shape with the given sh:path.""" + for prop_node in g.objects(shape_uri, SH.property): + paths = list(g.objects(prop_node, SH.path)) + if paths and paths[0] == prop_path_uri: + return list(g.objects(prop_node, predicate)) + return [] + + +def test_shacl_default_language_node_shape(): + """NodeShape rdfs:label and rdfs:comment get @en with --default-language.""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema, default_language="en") + + vehicle_shape = EX.Vehicle + + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle", lang="en") in labels + + comments = list(g.objects(vehicle_shape, RDFS.comment)) + assert Literal("A road vehicle.", lang="en") in comments + + +def test_shacl_default_language_property_shape(): + """PropertyShape sh:name and sh:description get @en with --default-language.""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema, default_language="en") + + vehicle_shape = EX.Vehicle + slot_uri = EX.vehicle_name + + sh_names 
= _get_prop_objects(g, vehicle_shape, slot_uri, SH["name"]) + assert Literal("Name", lang="en") in sh_names + + sh_descs = _get_prop_objects(g, vehicle_shape, slot_uri, SH.description) + assert Literal("The vehicle name.", lang="en") in sh_descs + + +def test_shacl_no_default_language_plain_literals(): + """Without --default-language, literals have no language tag (backward-compat).""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema) + + vehicle_shape = EX.Vehicle + + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + slot_uri = EX.vehicle_name + sh_names = _get_prop_objects(g, vehicle_shape, slot_uri, SH["name"]) + assert Literal("Name") in sh_names + for lit in sh_names: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_shacl_default_language_numeric_literals_untagged(): + """Numeric literals (sh:order, sh:minCount, etc.) 
must never get language tags.""" + schema = _build_shacl_lang_schema() + schema.slots["vehicle_name"].required = True + g = _parse_shacl(schema, default_language="fr") + + vehicle_shape = EX.Vehicle + slot_uri = EX.vehicle_name + + orders = _get_prop_objects(g, vehicle_shape, slot_uri, SH.order) + for lit in orders: + assert lit.language is None, f"sh:order must not be language-tagged: {lit!r}" + + min_counts = _get_prop_objects(g, vehicle_shape, slot_uri, SH.minCount) + for lit in min_counts: + assert lit.language is None, f"sh:minCount must not be language-tagged: {lit!r}" + + +def test_shacl_default_language_annotations_tagged(): + """SHACL string annotations are language-tagged with --default-language.""" + from linkml_runtime.linkml_model.meta import Annotation, Prefix + + schema = _build_shacl_lang_schema() + schema.prefixes["skos"] = Prefix( + prefix_prefix="skos", + prefix_reference="http://www.w3.org/2004/02/skos/core#", + ) + schema.classes["Vehicle"].annotations["skos:altLabel"] = Annotation(tag="skos:altLabel", value="Car") + g = _parse_shacl(schema, default_language="en", include_annotations=True) + + vehicle_shape = EX.Vehicle + SKOS = rdflib.Namespace("http://www.w3.org/2004/02/skos/core#") + alt_labels = list(g.objects(vehicle_shape, SKOS.altLabel)) + assert Literal("Car", lang="en") in alt_labels + + +def test_shacl_default_language_empty_string_treated_as_none(): + """An empty string default_language is normalised to None (no tags).""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema, default_language="") + + vehicle_shape = EX.Vehicle + + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_shacl_default_language_whitespace_only_treated_as_none(): + """A whitespace-only default_language is normalised to None (no tags).""" + schema = _build_shacl_lang_schema() + g = _parse_shacl(schema, 
default_language=" ") + + vehicle_shape = EX.Vehicle + + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle") in labels + for lit in labels: + assert lit.language is None, f"Expected no lang tag, got {lit.language!r}" + + +def test_shacl_default_language_in_language_override(): + """Element-level in_language overrides the generator default_language in SHACL.""" + schema = _build_shacl_lang_schema() + schema.classes["Vehicle"].in_language = "de" + g = _parse_shacl(schema, default_language="en") + + vehicle_shape = EX.Vehicle + + # Vehicle class should use element-level "de", not default "en" + labels = list(g.objects(vehicle_shape, RDFS.label)) + assert Literal("Vehicle", lang="de") in labels + assert Literal("Vehicle", lang="en") not in labels + + comments = list(g.objects(vehicle_shape, RDFS.comment)) + assert Literal("A road vehicle.", lang="de") in comments + assert Literal("A road vehicle.", lang="en") not in comments + + +def test_shacl_default_language_bcp47_warning(caplog): + """A malformed BCP 47 tag logs a warning but still produces output.""" + import logging + + schema = _build_shacl_lang_schema() + # "toolongtag" passes rdflib's lax regex but fails strict BCP 47. 
+ with caplog.at_level(logging.WARNING): + shacl = ShaclGenerator(schema, mergeimports=False, default_language="toolongtag").serialize() + g = rdflib.Graph() + g.parse(data=shacl) + + # Tag is still applied (warning, not error) + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert any(lit.language == "toolongtag" for lit in labels) + # Warning was emitted + assert any("not a well-formed BCP 47 tag" in rec.message for rec in caplog.records) + + +def test_shacl_default_language_bcp47_valid_no_warning(caplog): + """A well-formed BCP 47 tag does not log any warning.""" + import logging + + schema = _build_shacl_lang_schema() + with caplog.at_level(logging.WARNING): + ShaclGenerator(schema, mergeimports=False, default_language="en").serialize() + assert not any("BCP 47" in rec.message for rec in caplog.records) + + +def test_shacl_default_language_in_language_bcp47_warning(caplog): + """A malformed in_language value logs a warning in SHACL generator.""" + import logging + + schema = _build_shacl_lang_schema() + # "toolongtag" passes rdflib but fails strict BCP 47. + schema.classes["Vehicle"].in_language = "toolongtag" + with caplog.at_level(logging.WARNING): + shacl = ShaclGenerator(schema, mergeimports=False, default_language="en").serialize() + g = rdflib.Graph() + g.parse(data=shacl) + + # Vehicle uses the (malformed) in_language, not the default + labels = list(g.objects(EX.Vehicle, RDFS.label)) + assert any(lit.language == "toolongtag" for lit in labels) + assert any("in_language" in rec.message and "toolongtag" in rec.message for rec in caplog.records) + + +def test_shacl_default_language_bcp47_warning_is_deduplicated(caplog): + """Each distinct malformed tag warns at most once across the whole SHACL run. + + Mirrors the owlgen regression test (see PR #3449 review comment): the + original implementation emitted one warning per element. 
The shared + :class:`linkml.utils.language_tags.LanguageTagResolver` collapses these + to one warning per distinct malformed tag. + """ + import logging + + schema = _build_shacl_lang_schema() + schema.classes["Vehicle"].in_language = "toolongtag" + schema.slots["vehicle_name"].in_language = "toolongtag" + + with caplog.at_level(logging.WARNING, logger="linkml.utils.language_tags"): + ShaclGenerator( + schema, + mergeimports=False, + default_language="anothertoolongone", + ).serialize() + + in_language_warnings = [ + rec for rec in caplog.records if "in_language" in rec.message and "toolongtag" in rec.message + ] + default_warnings = [ + rec for rec in caplog.records if "default language" in rec.message and "anothertoolongone" in rec.message + ] + assert len(in_language_warnings) == 1, ( + f"expected exactly 1 in_language warning for 'toolongtag', got {len(in_language_warnings)}" + ) + assert len(default_warnings) == 1, f"expected exactly 1 default-language warning, got {len(default_warnings)}" + + +# --------------------------------------------------------------------------- +# --message-template tests +# --------------------------------------------------------------------------- + + +def test_message_template_basic(): + """--message-template emits sh:message on every property shape.""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="Validation of {name} failed!") + + vehicle_shape = EX.Vehicle + + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("Validation of vehicle_name failed!") in msgs + + msgs = _get_prop_objects(g, vehicle_shape, EX.speed, SH.message) + assert Literal("Validation of speed failed!") in msgs + + +def test_message_template_title_placeholder(): + """{title} expands to slot title, falling back to slot name.""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="{title} is invalid") + + vehicle_shape = EX.Vehicle + + # vehicle_name has 
title="Name" + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("Name is invalid") in msgs + + # speed has no title → falls back to slot name + msgs = _get_prop_objects(g, vehicle_shape, EX.speed, SH.message) + assert Literal("speed is invalid") in msgs + + +def test_message_template_class_placeholder(): + """{class} expands to the enclosing class name.""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="{class}.{name} constraint violated") + + vehicle_shape = EX.Vehicle + + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("Vehicle.vehicle_name constraint violated") in msgs + + +def test_message_template_description_placeholder(): + """{description} expands to the slot description, empty string when absent.""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="{name} ({class}): {description}") + + vehicle_shape = EX.Vehicle + + # vehicle_name has description="The vehicle name." + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("vehicle_name (Vehicle): The vehicle name.") in msgs + + # speed has description="Speed in km/h." 
+ msgs = _get_prop_objects(g, vehicle_shape, EX.speed, SH.message) + assert Literal("speed (Vehicle): Speed in km/h.") in msgs + + +def test_message_template_description_fallback_empty(): + """{description} falls back to empty string when slot has no description.""" + sb = SchemaBuilder() + sb.add_slot(SlotDefinition("bare_slot", range="string")) + sb.add_class("Thing", slots=["bare_slot"]) + sb.add_defaults() + g = _parse_shacl(sb.schema, message_template="{name}: {description}") + + msgs = _get_prop_objects(g, EX.Thing, EX.bare_slot, SH.message) + assert Literal("bare_slot:") in msgs + + +def test_message_template_comments_placeholder(): + """{comments} expands to slot comments joined with '; '.""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "wind_speed", + range="float", + description="Wind speed in metres per second.", + comments=["ISO 34503:2023, Section 10.2.3"], + ) + ) + sb.add_class("Weather", slots=["wind_speed"]) + sb.add_defaults() + g = _parse_shacl(sb.schema, message_template="{name} ({class}): {description} [{comments}]") + + msgs = _get_prop_objects(g, EX.Weather, EX.wind_speed, SH.message) + assert Literal("wind_speed (Weather): Wind speed in metres per second. 
[ISO 34503:2023, Section 10.2.3]") in msgs + + +def test_message_template_comments_multiple(): + """{comments} joins multiple comments with '; '.""" + sb = SchemaBuilder() + sb.add_slot( + SlotDefinition( + "temperature", + range="float", + comments=["ISO 34503:2023, Section 10.2", "Unit: Celsius"], + ) + ) + sb.add_class("Weather", slots=["temperature"]) + sb.add_defaults() + g = _parse_shacl(sb.schema, message_template="{comments}") + + msgs = _get_prop_objects(g, EX.Weather, EX.temperature, SH.message) + assert Literal("ISO 34503:2023, Section 10.2; Unit: Celsius") in msgs + + +def test_message_template_comments_fallback_empty(): + """{comments} falls back to empty string when slot has no comments.""" + sb = SchemaBuilder() + sb.add_slot(SlotDefinition("bare_slot", range="string")) + sb.add_class("Thing", slots=["bare_slot"]) + sb.add_defaults() + g = _parse_shacl(sb.schema, message_template="{name}: {comments}") + + msgs = _get_prop_objects(g, EX.Thing, EX.bare_slot, SH.message) + assert Literal("bare_slot:") in msgs + + +def test_no_message_template_no_sh_message(): + """Without --message-template, no sh:message is emitted (backward-compat).""" + schema = _build_message_test_schema() + g = _parse_shacl(schema) + + vehicle_shape = EX.Vehicle + + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert msgs == [] + + msgs = _get_prop_objects(g, vehicle_shape, EX.speed, SH.message) + assert msgs == [] + + +def test_message_template_invalid_placeholder_raises(): + """An invalid placeholder in --message-template raises ValueError.""" + import pytest + + schema = _build_message_test_schema() + with pytest.raises(ValueError, match="Invalid placeholder"): + _parse_shacl(schema, message_template="Error: {invalid}") + + +def test_message_template_positional_placeholder_raises(): + """Positional placeholders like {0} raise ValueError.""" + import pytest + + schema = _build_message_test_schema() + with pytest.raises(ValueError, match="Invalid 
placeholder"): + _parse_shacl(schema, message_template="Error: {0}") + + +def test_message_template_format_spec_raises(): + """Format specs like {name:d} raise ValueError.""" + import pytest + + schema = _build_message_test_schema() + with pytest.raises(ValueError, match="Invalid placeholder"): + _parse_shacl(schema, message_template="Error: {name:d}") + + +def test_message_template_empty_string_treated_as_none(): + """An empty message_template is normalised to None (no sh:message).""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template="") + + vehicle_shape = EX.Vehicle + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert msgs == [] + + +def test_message_template_whitespace_only_treated_as_none(): + """A whitespace-only message_template is normalised to None (no sh:message).""" + schema = _build_message_test_schema() + g = _parse_shacl(schema, message_template=" ") + + vehicle_shape = EX.Vehicle + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert msgs == [] + + +def test_message_template_with_default_language(): + """sh:message is language-tagged when both --message-template and --default-language are set.""" + schema = _build_message_test_schema() + g = _parse_shacl( + schema, + message_template="Validation of {name} failed!", + default_language="en", + ) + + vehicle_shape = EX.Vehicle + msgs = _get_prop_objects(g, vehicle_shape, EX.vehicle_name, SH.message) + assert Literal("Validation of vehicle_name failed!", lang="en") in msgs + + # Verify the message is NOT a plain literal + assert Literal("Validation of vehicle_name failed!") not in msgs + + +# --------------------------------------------------------------------------- +# --emit-rules / sh:sparql tests +# --------------------------------------------------------------------------- + +_RULES_SCHEMA_YAML = """ +id: https://example.org/boolean-guards +name: boolean_guard_rules +prefixes: + linkml: 
https://w3id.org/linkml/ + ex: https://example.org/boolean-guards/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + WeatherWind: + range: boolean + slot_uri: ex:WeatherWind + weatherWindValue: + description: Wind speed value. + range: decimal + slot_uri: ex:weatherWindValue + WeatherRain: + range: boolean + slot_uri: ex:WeatherRain + weatherRainValue: + description: Rain intensity value. + range: decimal + slot_uri: ex:weatherRainValue + Temperature: + range: decimal + slot_uri: ex:Temperature +classes: + Environment: + class_uri: ex:Environment + slots: + - WeatherWind + - weatherWindValue + - WeatherRain + - weatherRainValue + - Temperature + rules: + - description: If weatherWindValue is provided, WeatherWind must be true. + preconditions: + slot_conditions: + weatherWindValue: + value_presence: PRESENT + postconditions: + slot_conditions: + WeatherWind: + equals_string: "true" + - description: If weatherRainValue is provided, WeatherRain must be true. + preconditions: + slot_conditions: + weatherRainValue: + value_presence: PRESENT + postconditions: + slot_conditions: + WeatherRain: + equals_string: "true" +""" + +EX_RULES = rdflib.Namespace("https://example.org/boolean-guards/") + + +def test_rule_boolean_guard_generates_sparql(): + """Boolean-guard rules produce sh:sparql constraints on the NodeShape.""" + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 2, f"Expected 2 sh:sparql constraints, got {len(sparql_nodes)}" + + for node in sparql_nodes: + assert (node, RDF.type, SH.SPARQLConstraint) in g + selects = list(g.objects(node, SH.select)) + assert len(selects) == 1, "Each constraint must have exactly one sh:select" + query = str(selects[0]) + assert "$this" in query, "SPARQL must use $this pre-bound variable" + assert "OPTIONAL" in query, "SPARQL must use OPTIONAL for flag/value" + assert "FILTER" in query, "SPARQL must have 
a FILTER clause" + assert "BOUND" in query, "SPARQL must use BOUND()" + + +def test_rule_with_description_generates_message(): + """Rule description is emitted as sh:message on the SPARQLConstraint.""" + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + + messages = set() + for node in sparql_nodes: + for msg in g.objects(node, SH.message): + messages.add(str(msg)) + + assert "If weatherWindValue is provided, WeatherWind must be true." in messages + assert "If weatherRainValue is provided, WeatherRain must be true." in messages + + +def test_rule_sparql_contains_correct_uris(): + """SPARQL queries reference the correct slot URIs.""" + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + + queries = [str(list(g.objects(n, SH.select))[0]) for n in sparql_nodes] + all_sparql = "\n".join(queries) + + assert str(EX_RULES.WeatherWind) in all_sparql + assert str(EX_RULES.weatherWindValue) in all_sparql + assert str(EX_RULES.WeatherRain) in all_sparql + assert str(EX_RULES.weatherRainValue) in all_sparql + + +_DEACTIVATED_RULE_SCHEMA_YAML = """ +id: https://example.org/deactivated-test +name: deactivated_rule_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/deactivated-test/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + Flag: + range: boolean + slot_uri: ex:Flag + flagValue: + range: decimal + slot_uri: ex:flagValue +classes: + TestClass: + class_uri: ex:TestClass + slots: + - Flag + - flagValue + rules: + - description: This rule is deactivated. 
+ deactivated: true + preconditions: + slot_conditions: + flagValue: + value_presence: PRESENT + postconditions: + slot_conditions: + Flag: + equals_string: "true" +""" + + +def test_rule_deactivated_skipped(): + """Deactivated rules do not produce sh:sparql constraints.""" + g = _parse_shacl(_DEACTIVATED_RULE_SCHEMA_YAML) + + shape = URIRef("https://example.org/deactivated-test/TestClass") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0, f"Deactivated rule should not emit sh:sparql, got {len(sparql_nodes)}" + + +_UNSUPPORTED_RULE_SCHEMA_YAML = """ +id: https://example.org/unsupported-test +name: unsupported_rule_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/unsupported-test/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + slotA: + range: string + slot_uri: ex:slotA + slotB: + range: string + slot_uri: ex:slotB +classes: + TestClass: + class_uri: ex:TestClass + slots: + - slotA + - slotB + rules: + - description: Rule with no postconditions. 
+ preconditions: + slot_conditions: + slotA: + value_presence: PRESENT +""" + + +def test_rule_unsupported_pattern_skipped(): + """Unrecognised rule patterns are silently skipped (no sh:sparql emitted).""" + g = _parse_shacl(_UNSUPPORTED_RULE_SCHEMA_YAML) + + shape = URIRef("https://example.org/unsupported-test/TestClass") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0 + + +def test_rule_no_emit_rules_flag(): + """--no-emit-rules suppresses sh:sparql constraint generation.""" + g = _parse_shacl(_RULES_SCHEMA_YAML, emit_rules=False) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0, f"emit_rules=False should suppress rules, got {len(sparql_nodes)}" + + +_NO_RULES_SCHEMA_YAML = """ +id: https://example.org/no-rules +name: no_rules_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/no-rules/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + name: + range: string + slot_uri: ex:name +classes: + SimpleClass: + class_uri: ex:SimpleClass + slots: + - name +""" + + +def test_rule_no_rules_no_sparql(): + """Classes without rules: blocks produce no sh:sparql constraints.""" + g = _parse_shacl(_NO_RULES_SCHEMA_YAML) + + shape = URIRef("https://example.org/no-rules/SimpleClass") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 0 + + +def test_rule_multiple_rules_per_class(): + """Multiple boolean-guard rules on one class produce multiple sh:sparql constraints.""" + g = _parse_shacl(_RULES_SCHEMA_YAML) + + shape = EX_RULES.Environment + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 2 + + # Each constraint should reference different slot pairs + queries = [str(list(g.objects(n, SH.select))[0]) for n in sparql_nodes] + wind_query = [q for q in queries if "weatherWindValue" in q] + rain_query = [q for q in queries if "weatherRainValue" in q] + assert 
len(wind_query) == 1, "Expected exactly one wind query" + assert len(rain_query) == 1, "Expected exactly one rain query" + + +# --------------------------------------------------------------------------- +# Tests for URI resolution without explicit slot_uri +# --------------------------------------------------------------------------- + +_NO_SLOT_URI_SCHEMA_YAML = """ +id: https://example.org/no-slot-uri +name: no_slot_uri_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/no-slot-uri/ +imports: + - linkml:types +default_prefix: ex +default_range: string +slots: + is_active: + range: boolean + measured_value: + range: decimal +classes: + Reading: + class_uri: ex:Reading + slots: + - is_active + - measured_value + rules: + - description: If measured_value is provided, is_active must be true. + preconditions: + slot_conditions: + measured_value: + value_presence: PRESENT + postconditions: + slot_conditions: + is_active: + equals_string: "true" +""" + + +def test_rule_no_explicit_slot_uri(): + """Slots without explicit slot_uri resolve via default_prefix + underscore(name).""" + g = _parse_shacl(_NO_SLOT_URI_SCHEMA_YAML) + + shape = URIRef("https://example.org/no-slot-uri/Reading") + sparql_nodes = list(g.objects(shape, SH.sparql)) + assert len(sparql_nodes) == 1 + + query = str(list(g.objects(sparql_nodes[0], SH.select))[0]) + # URIs should be default_prefix:underscore(name) + assert "https://example.org/no-slot-uri/is_active" in query + assert "https://example.org/no-slot-uri/measured_value" in query + + +# --------------------------------------------------------------------------- +# Tests for elseconditions rejection +# --------------------------------------------------------------------------- + +_ELSE_COND_SCHEMA_YAML = """ +id: https://example.org/else-test +name: else_cond_test +prefixes: + linkml: https://w3id.org/linkml/ + ex: https://example.org/else-test/ +imports: + - linkml:types +default_prefix: ex +default_range: string 
+slots:
+  Flag:
+    range: boolean
+    slot_uri: ex:Flag
+  flagValue:
+    range: decimal
+    slot_uri: ex:flagValue
+  fallbackValue:
+    range: string
+    slot_uri: ex:fallbackValue
+classes:
+  TestClass:
+    class_uri: ex:TestClass
+    slots:
+      - Flag
+      - flagValue
+      - fallbackValue
+    rules:
+      - description: Rule with elseconditions should be skipped.
+        preconditions:
+          slot_conditions:
+            flagValue:
+              value_presence: PRESENT
+        postconditions:
+          slot_conditions:
+            Flag:
+              equals_string: "true"
+        elseconditions:
+          slot_conditions:
+            fallbackValue:
+              value_presence: PRESENT
+"""
+
+
+def test_rule_with_elseconditions_emitted():
+    """Rules with elseconditions emit the forward (if/then) branch and warn."""
+
+    g = _parse_shacl(_ELSE_COND_SCHEMA_YAML)
+
+    shape = URIRef("https://example.org/else-test/TestClass")
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    assert len(sparql_nodes) >= 1, "Rule with elseconditions should emit sh:sparql for the forward branch"
+
+
+def test_rule_with_elseconditions_warns(caplog):
+    """Rules with elseconditions emit a warning about the dropped else branch."""
+    import logging
+
+    # Only the log output matters here; the generated graph is discarded.
+    with caplog.at_level(logging.WARNING):
+        _parse_shacl(_ELSE_COND_SCHEMA_YAML)
+
+    assert any("elseconditions" in rec.message for rec in caplog.records), (
+        "Expected a warning about elseconditions being dropped"
+    )
+
+
+_BIDIRECTIONAL_RULE_SCHEMA_YAML = """
+id: https://example.org/bidir-test
+name: bidir_rule_test
+prefixes:
+  linkml: https://w3id.org/linkml/
+  ex: https://example.org/bidir-test/
+imports:
+  - linkml:types
+default_prefix: ex
+default_range: string
+slots:
+  Flag:
+    range: boolean
+    slot_uri: ex:Flag
+  flagValue:
+    range: decimal
+    slot_uri: ex:flagValue
+classes:
+  TestClass:
+    class_uri: ex:TestClass
+    slots:
+      - Flag
+      - flagValue
+    rules:
+      - description: Bidirectional rule should be skipped.
+        bidirectional: true
+        preconditions:
+          slot_conditions:
+            flagValue:
+              value_presence: PRESENT
+        postconditions:
+          slot_conditions:
+            Flag:
+              equals_string: "true"
+"""
+
+
+def test_rule_bidirectional_skipped(caplog):
+    """Rules with bidirectional=true are skipped entirely with a warning."""
+    import logging
+
+    with caplog.at_level(logging.WARNING):
+        g = _parse_shacl(_BIDIRECTIONAL_RULE_SCHEMA_YAML)
+
+    shape = URIRef("https://example.org/bidir-test/TestClass")
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    assert len(sparql_nodes) == 0, "Bidirectional rules should NOT emit sh:sparql"
+    assert any("bidirectional" in rec.message for rec in caplog.records), (
+        "Expected a warning about bidirectional rules being skipped"
+    )
+
+
+# ---------------------------------------------------------------------------
+# End-to-end pyshacl validation test
+# ---------------------------------------------------------------------------
+
+
+def test_rule_boolean_guard_pyshacl_end_to_end():
+    """End-to-end: pyshacl flags a violation and passes a conforming instance."""
+    import pyshacl
+
+    shacl_ttl = ShaclGenerator(_RULES_SCHEMA_YAML, mergeimports=False, emit_rules=True).serialize()
+
+    # NOTE(review): the ``ex:`` prefix is taken from EX_RULES, the namespace the
+    # sibling tests use for the Environment shape. This assumes the schema's
+    # ``ex`` prefix expands to the same IRI -- confirm against _RULES_SCHEMA_YAML.
+
+    # Build a conforming RDF instance: weatherWindValue present AND WeatherWind = true
+    conforming_data = f"""
+    @prefix ex: <{EX_RULES}> .
+    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+    ex:env1 a ex:Environment ;
+        ex:WeatherWind "true"^^xsd:boolean ;
+        ex:weatherWindValue "12.5"^^xsd:decimal .
+    """
+
+    # Build a violating RDF instance: weatherWindValue present but WeatherWind missing
+    violating_data = f"""
+    @prefix ex: <{EX_RULES}> .
+    @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+
+    ex:env2 a ex:Environment ;
+        ex:weatherWindValue "8.0"^^xsd:decimal .
+    """
+
+    # Conforming instance should pass
+    conforms, _, _ = pyshacl.validate(
+        data_graph=conforming_data,
+        shacl_graph=shacl_ttl,
+        data_graph_format="turtle",
+        shacl_graph_format="turtle",
+        advanced=True,
+    )
+    assert conforms, "Conforming instance should pass SHACL validation"
+
+    # Violating instance should fail; the textual report is kept for the failure message.
+    conforms, _, results_text = pyshacl.validate(
+        data_graph=violating_data,
+        shacl_graph=shacl_ttl,
+        data_graph_format="turtle",
+        shacl_graph_format="turtle",
+        advanced=True,
+    )
+    assert not conforms, f"Violating instance should fail SHACL validation:\n{results_text}"
+
+
+# ---------------------------------------------------------------------------
+# SPARQL syntax validation
+# ---------------------------------------------------------------------------
+
+
+def test_rule_sparql_syntax_valid():
+    """Generated SPARQL queries must be syntactically valid."""
+    from rdflib.plugins.sparql import prepareQuery
+
+    g = _parse_shacl(_RULES_SCHEMA_YAML)
+
+    shape = EX_RULES.Environment
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    assert len(sparql_nodes) >= 1, "Expected at least one sh:sparql constraint to syntax-check"
+
+    for node in sparql_nodes:
+        query_text = str(list(g.objects(node, SH.select))[0])
+        # prepareQuery validates SPARQL syntax; $this is a valid variable name
+        prepareQuery(query_text)
+
+
+# ===========================================================================
+# Exclusive-value pattern tests (SHACL §5 SPARQL constraints)
+# ===========================================================================
+#
+# The "exclusive value" pattern translates a LinkML rule where:
+#   - preconditions: slot X has equals_string (a specific enum value name)
+#   - postconditions: same slot X has maximum_cardinality N
+#
+# Semantics: "If value V is present in multivalued slot X, then X has at most
+# N values total." For N=1 this means V must be the sole value (mutual
+# exclusion with other enum members).
+#
+# Generated SHACL: sh:SPARQLConstraint per W3C SHACL §5.3.1, using $this
+# pre-bound to each focus node.
+#
+# References:
+#   - W3C SHACL §5 <https://www.w3.org/TR/shacl/#sparql-constraints>
+#   - W3C SHACL §5.3.1 <https://www.w3.org/TR/shacl/#sparql-constraints-prebound>
+#   - ISO 34503:2023, 9.3.6 (motivating use case: EdgeNone exclusivity)
+# ===========================================================================
+
+_EXCLUSIVE_VALUE_SCHEMA_YAML = """
+id: https://example.org/exclusive-value
+name: exclusive_value_rules
+prefixes:
+  linkml: https://w3id.org/linkml/
+  ex: https://example.org/exclusive-value/
+imports:
+  - linkml:types
+default_prefix: ex
+default_range: string
+
+enums:
+  EdgeTypeEnum:
+    permissible_values:
+      EdgeNone:
+        meaning: ex:EdgeNone
+      EdgeBarriers:
+        meaning: ex:EdgeBarriers
+      EdgeMarkers:
+        meaning: ex:EdgeMarkers
+
+  PriorityEnum:
+    permissible_values:
+      High:
+        description: High priority (no meaning IRI).
+      Medium:
+        description: Medium priority (no meaning IRI).
+      Low:
+        description: Low priority (no meaning IRI).
+
+slots:
+  edgeType:
+    range: EdgeTypeEnum
+    multivalued: true
+    slot_uri: ex:edgeType
+  priority:
+    range: PriorityEnum
+    multivalued: true
+    slot_uri: ex:priority
+  otherSlot:
+    range: string
+    slot_uri: ex:otherSlot
+
+classes:
+  Road:
+    class_uri: ex:Road
+    slots:
+      - edgeType
+      - otherSlot
+    rules:
+      - description: >-
+          EdgeNone is mutually exclusive with other edge types.
+        preconditions:
+          slot_conditions:
+            edgeType:
+              equals_string: "EdgeNone"
+        postconditions:
+          slot_conditions:
+            edgeType:
+              maximum_cardinality: 1
+
+  Intersection:
+    class_uri: ex:Intersection
+    slots:
+      - edgeType
+    rules:
+      - description: >-
+          EdgeNone allows at most 2 total edge values.
+        preconditions:
+          slot_conditions:
+            edgeType:
+              equals_string: "EdgeNone"
+        postconditions:
+          slot_conditions:
+            edgeType:
+              maximum_cardinality: 2
+
+  Task:
+    class_uri: ex:Task
+    slots:
+      - priority
+    rules:
+      - description: >-
+          High priority is exclusive (literal fallback test).
+        preconditions:
+          slot_conditions:
+            priority:
+              equals_string: "High"
+        postconditions:
+          slot_conditions:
+            priority:
+              maximum_cardinality: 1
+
+  MismatchedSlots:
+    class_uri: ex:MismatchedSlots
+    slots:
+      - edgeType
+      - otherSlot
+    rules:
+      - description: >-
+          Different slots in pre/post — not an exclusive-value pattern.
+        preconditions:
+          slot_conditions:
+            edgeType:
+              equals_string: "EdgeNone"
+        postconditions:
+          slot_conditions:
+            otherSlot:
+              maximum_cardinality: 1
+"""
+
+EX_EXCL = rdflib.Namespace("https://example.org/exclusive-value/")
+
+
+def test_exclusive_value_generates_sparql():
+    """Exclusive-value rules produce sh:sparql constraints on the NodeShape."""
+    g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML)
+
+    shape = EX_EXCL.Road
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    assert len(sparql_nodes) == 1, f"Expected 1 sh:sparql constraint, got {len(sparql_nodes)}"
+
+    node = sparql_nodes[0]
+    assert (node, RDF.type, SH.SPARQLConstraint) in g
+    selects = list(g.objects(node, SH.select))
+    assert len(selects) == 1, "Constraint must have exactly one sh:select"
+
+
+def test_exclusive_value_sparql_uses_enum_iri():
+    """SPARQL references the enum value's meaning IRI, not a string literal.
+
+    Per the enum definition, EdgeNone has meaning: ex:EdgeNone which expands
+    to <https://example.org/exclusive-value/EdgeNone>. The generated SPARQL
+    must use this full IRI in angle brackets.
+    """
+    g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML)
+
+    shape = EX_EXCL.Road
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    query = str(list(g.objects(sparql_nodes[0], SH.select))[0])
+
+    edge_none_iri = str(EX_EXCL.EdgeNone)
+    assert f"<{edge_none_iri}>" in query, f"SPARQL must reference EdgeNone as full IRI <{edge_none_iri}>, got:\n{query}"
+
+
+def test_exclusive_value_max_card_1_sparql_structure():
+    """For maximum_cardinality: 1, SPARQL uses FILTER(?other != <value>).
+
+    The query pattern for N=1 is:
+        SELECT $this WHERE {
+            $this <slot_uri> <value> .
+            $this <slot_uri> ?other .
+            FILTER (?other != <value>)
+        }
+
+    This is more efficient than the COUNT-based approach for the common
+    singleton exclusion case.
+    """
+    g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML)
+
+    shape = EX_EXCL.Road
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    query = str(list(g.objects(sparql_nodes[0], SH.select))[0])
+
+    assert "$this" in query, "SPARQL must use $this pre-bound variable (SHACL §5.3.1)"
+    assert "FILTER" in query, "N=1 pattern must use FILTER for exclusion check"
+    assert "?other" in query, "N=1 pattern must bind ?other for comparison"
+    # Must NOT use COUNT for the N=1 case (simpler pattern)
+    assert "COUNT" not in query, "N=1 pattern should use FILTER, not COUNT"
+    # The slot URI must appear (property path)
+    assert str(EX_EXCL.edgeType) in query, "SPARQL must reference the slot URI"
+
+
+def test_exclusive_value_max_card_gt1_sparql_structure():
+    """For maximum_cardinality > 1, SPARQL uses COUNT-based subquery.
+
+    The query pattern for N>1 is:
+        SELECT $this WHERE {
+            $this <slot_uri> <value> .
+            {
+                SELECT $this (COUNT(?val) AS ?count)
+                WHERE { $this <slot_uri> ?val . }
+                GROUP BY $this
+                HAVING (?count > N)
+            }
+        }
+    """
+    g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML)
+
+    shape = EX_EXCL.Intersection
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    assert len(sparql_nodes) == 1, f"Expected 1 sh:sparql constraint, got {len(sparql_nodes)}"
+
+    query = str(list(g.objects(sparql_nodes[0], SH.select))[0])
+
+    assert "$this" in query, "SPARQL must use $this pre-bound variable"
+    assert "COUNT" in query, "N>1 pattern must use COUNT"
+    assert "GROUP BY" in query, "N>1 pattern must GROUP BY $this"
+    assert "HAVING" in query, "N>1 pattern must use HAVING for count check"
+    assert "> 2" in query, "HAVING must check count > maximum_cardinality (2)"
+
+
+def test_exclusive_value_no_meaning_falls_back_to_literal():
+    """When enum values lack a meaning IRI, the value is compared as a literal.
+
+    PriorityEnum values have no meaning field, so 'High' is used as a
+    quoted string in the SPARQL rather than an IRI in angle brackets.
+    """
+    g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML)
+
+    shape = EX_EXCL.Task
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    assert len(sparql_nodes) == 1, f"Expected 1 sh:sparql constraint, got {len(sparql_nodes)}"
+
+    query = str(list(g.objects(sparql_nodes[0], SH.select))[0])
+
+    # Should use quoted literal, not angle-bracket IRI
+    assert '"High"' in query, f"No-meaning enum should use literal '\"High\"', got:\n{query}"
+    # "High>" matches both a bare <High> and a namespaced <.../High> IRI. Checking
+    # for the empty string would always fail, since "" is a substring of every string.
+    assert "High>" not in query, "Should not emit <High> as IRI when meaning is absent"
+
+
+def test_exclusive_value_different_slots_not_recognised():
+    """Rules where pre/post reference different slots are NOT exclusive-value.
+
+    The pattern requires the SAME slot in both preconditions and
+    postconditions. When they differ, the rule is unrecognised and
+    silently skipped (no sh:sparql emitted).
+    """
+    g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML)
+
+    shape = EX_EXCL.MismatchedSlots
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    assert len(sparql_nodes) == 0, (
+        f"Mismatched slots should not trigger exclusive-value pattern, got {len(sparql_nodes)}"
+    )
+
+
+def test_exclusive_value_message_from_description():
+    """Rule description is emitted as sh:message on the SPARQLConstraint."""
+    g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML)
+
+    shape = EX_EXCL.Road
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    messages = [str(m) for node in sparql_nodes for m in g.objects(node, SH.message)]
+
+    assert any("EdgeNone is mutually exclusive" in m for m in messages), (
+        f"Expected message about EdgeNone exclusivity, got: {messages}"
+    )
+
+
+def test_exclusive_value_sparql_syntax_valid():
+    """Generated SPARQL for exclusive-value rules must be syntactically valid.
+
+    Uses rdflib's prepareQuery() which validates SPARQL syntax.
+    $this is a valid SPARQL variable name per the grammar.
+    """
+    from rdflib.plugins.sparql import prepareQuery
+
+    g = _parse_shacl(_EXCLUSIVE_VALUE_SCHEMA_YAML)
+
+    for shape in (EX_EXCL.Road, EX_EXCL.Intersection, EX_EXCL.Task):
+        sparql_nodes = list(g.objects(shape, SH.sparql))
+        for node in sparql_nodes:
+            query_text = str(list(g.objects(node, SH.select))[0])
+            # prepareQuery validates SPARQL syntax
+            prepareQuery(query_text)
+
+
+def test_exclusive_value_coexists_with_boolean_guard():
+    """Exclusive-value and boolean-guard rules can coexist on the same class.
+
+    When a class has both pattern types, both produce sh:sparql constraints.
+    """
+    schema = """
+id: https://example.org/mixed-rules
+name: mixed_rules
+prefixes:
+  linkml: https://w3id.org/linkml/
+  ex: https://example.org/mixed-rules/
+imports:
+  - linkml:types
+default_prefix: ex
+default_range: string
+
+enums:
+  StatusEnum:
+    permissible_values:
+      None:
+        meaning: ex:None
+      Active:
+        meaning: ex:Active
+
+slots:
+  status:
+    range: StatusEnum
+    multivalued: true
+    slot_uri: ex:status
+  Flag:
+    range: boolean
+    slot_uri: ex:Flag
+  flagValue:
+    range: decimal
+    slot_uri: ex:flagValue
+
+classes:
+  Widget:
+    class_uri: ex:Widget
+    slots:
+      - status
+      - Flag
+      - flagValue
+    rules:
+      - description: None is exclusive.
+        preconditions:
+          slot_conditions:
+            status:
+              equals_string: "None"
+        postconditions:
+          slot_conditions:
+            status:
+              maximum_cardinality: 1
+      - description: If flagValue present, Flag must be true.
+        preconditions:
+          slot_conditions:
+            flagValue:
+              value_presence: PRESENT
+        postconditions:
+          slot_conditions:
+            Flag:
+              equals_string: "true"
+"""
+    g = _parse_shacl(schema)
+
+    shape = URIRef("https://example.org/mixed-rules/Widget")
+    sparql_nodes = list(g.objects(shape, SH.sparql))
+    assert len(sparql_nodes) == 2, (
+        f"Expected 2 sh:sparql constraints (1 exclusive + 1 boolean guard), got {len(sparql_nodes)}"
+    )
+
+    queries = [str(list(g.objects(n, SH.select))[0]) for n in sparql_nodes]
+    # One should have FILTER(?other != ...) pattern, the other BOUND pattern
+    has_exclusive = any("?other" in q for q in queries)
+    has_boolean = any("BOUND" in q for q in queries)
+    assert has_exclusive, "Expected one exclusive-value SPARQL constraint"
+    assert has_boolean, "Expected one boolean-guard SPARQL constraint"
diff --git a/uv.lock b/uv.lock
index c23e3dffd7..aeec80f737 100644
--- a/uv.lock
+++ b/uv.lock
@@ -2342,7 +2342,7 @@ requires-dist = [
     { name = "openpyxl" },
     { name = "parse" },
     { name = "prefixcommons", specifier = ">=0.1.7" },
-    { name = "prefixmaps", specifier = ">=0.2.2" },
+    { name = "prefixmaps", git = "https://github.com/linkml/prefixmaps?rev=75435150a1b31760b9780af2b64a265943a9b263" },
     { name = "pydantic", specifier = ">=2.0.0,<3.0.0" },
     { name = "pyjsg", specifier = ">=0.12.3" },
     { name = "pyshex", specifier = ">=0.9.0" },
@@ -3548,16 +3548,12 @@ wheels = [
 
 [[package]]
 name = "prefixmaps"
-version = "0.2.6"
-source = { registry = "https://pypi.org/simple" }
+version = "0.2.7.post2.dev0+7543515"
+source = { git = "https://github.com/linkml/prefixmaps?rev=75435150a1b31760b9780af2b64a265943a9b263#75435150a1b31760b9780af2b64a265943a9b263" }
 dependencies = [
     { name = "curies" },
     { name = "pyyaml" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/4d/cf/f588bcdfd2c841839b9d59ce219a46695da56aa2805faff937bbafb9ee2b/prefixmaps-0.2.6.tar.gz", hash = "sha256:7421e1244eea610217fa1ba96c9aebd64e8162a930dc0626207cd8bf62ecf4b9", size = 709899, upload-time = "2024-10-17T16:30:57.738Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/89/b2/2b2153173f2819e3d7d1949918612981bc6bd895b75ffa392d63d115f327/prefixmaps-0.2.6-py3-none-any.whl", hash = "sha256:f6cef28a7320fc6337cf411be212948ce570333a0ce958940ef684c7fb192a62", size = 754732, upload-time = "2024-10-17T16:30:55.731Z" },
-]
 
 [[package]]
 name = "prettytable"