From 01012f0e2b354df2a3c4b026ab2a12f8a7ff3e92 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 09:39:30 -0500 Subject: [PATCH 01/10] Agent initial impl Signed-off-by: Mike Knepper --- .../latest/pages/concepts/seed-datasets.mdx | 2 +- .../src/data_designer/config/seed_source.py | 37 ++++++++++++-- .../tests/config/test_seed_source.py | 21 ++++---- .../src/data_designer/engine/compiler.py | 7 ++- .../engine/resources/seed_reader.py | 48 ++++++++++++++++++- .../engine/resources/test_seed_reader.py | 15 ++++-- .../tests/engine/test_compiler.py | 16 ++++++- 7 files changed, 121 insertions(+), 25 deletions(-) diff --git a/fern/versions/latest/pages/concepts/seed-datasets.mdx b/fern/versions/latest/pages/concepts/seed-datasets.mdx index a9498caeb..5e948b228 100644 --- a/fern/versions/latest/pages/concepts/seed-datasets.mdx +++ b/fern/versions/latest/pages/concepts/seed-datasets.mdx @@ -138,7 +138,7 @@ Directory-backed seed datasets expose these columns: Filesystem matching -`file_pattern` matches file names only, not relative paths. `recursive=True` is the default, so nested subdirectories are searched unless you turn it off. +`file_pattern` matches file names only, not relative paths. `recursive=True` is the default, so nested subdirectories are searched unless you turn it off. Relative local `path` values are resolved by the active filesystem provider when the seed is validated or read, not when the config object is constructed. ### 📄 FileContentsSeedSource diff --git a/packages/data-designer-config/src/data_designer/config/seed_source.py b/packages/data-designer-config/src/data_designer/config/seed_source.py index 57a7eb9fc..ca1f6b2ba 100644 --- a/packages/data-designer-config/src/data_designer/config/seed_source.py +++ b/packages/data-designer-config/src/data_designer/config/seed_source.py @@ -100,9 +100,9 @@ class FileSystemSeedSource(SeedSource, ABC): ``FileSystemSeedReader`` implementation. Attributes: - path: Directory containing seed artifacts. Relative paths are resolved - from the current working directory when the config is loaded, not - from the config file location. + path: Directory containing seed artifacts. Relative local paths are + resolved by the active filesystem provider when the seed is + validated or read, not when the config object is constructed. file_pattern: Case-sensitive filename pattern used to match files under the provided directory. Patterns match basenames only, not relative paths. Defaults to ``'*'``. @@ -115,8 +115,8 @@ class FileSystemSeedSource(SeedSource, ABC): path: str = Field( ..., description=( - "Directory containing seed artifacts. Relative paths are resolved from the current working " - "directory when the config is loaded, not from the config file location." + "Directory containing seed artifacts. Relative local paths are resolved by the active filesystem " + "provider when the seed is validated or read, not when the config object is constructed." ), ) file_pattern: str = Field( @@ -155,6 +155,13 @@ def validate_file_pattern(cls, value: str | None) -> str | None: class DirectorySeedSource(FileSystemSeedSource): seed_type: Literal["directory"] = "directory" + def model_post_init(self, __context: Any) -> None: + self._runtime_path = self.path + + @property + def runtime_path(self) -> str: + return self.path + class FileContentsSeedSource(FileSystemSeedSource): seed_type: Literal["file_contents"] = "file_contents" @@ -172,6 +179,13 @@ def validate_encoding(cls, value: str) -> str: raise ValueError(f"🛑 Unknown encoding: {value!r}. Use a valid Python codec name.") from error return value + def model_post_init(self, __context: Any) -> None: + self._runtime_path = self.path + + @property + def runtime_path(self) -> str: + return self.path + def _resolve_filesystem_runtime_path(path: str) -> str: return str(Path(path).expanduser().resolve()) @@ -203,6 +217,15 @@ def get_pi_coding_agent_default_path() -> str: def _validate_filesystem_seed_source_path(value: str | None) -> str | None: + if value is None: + return None + if not value.strip(): + raise InvalidFilePathError("🛑 FileSystemSeedSource.path must be a non-empty string.") + return value + + +def _validate_local_filesystem_seed_source_path(value: str | None) -> str | None: + value = _validate_filesystem_seed_source_path(value) if value is None: return None path = Path(value).expanduser().resolve() @@ -273,6 +296,10 @@ class AgentRolloutSeedSource(FileSystemSeedSource): ), ) + @field_validator("path", mode="after") + def validate_path(cls, value: str | None) -> str | None: + return _validate_local_filesystem_seed_source_path(value) + @model_validator(mode="after") def validate_runtime_path_source(self) -> Self: default_path, _ = get_agent_rollout_format_defaults(self.format) diff --git a/packages/data-designer-config/tests/config/test_seed_source.py b/packages/data-designer-config/tests/config/test_seed_source.py index a2c46747d..d886c9ca3 100644 --- a/packages/data-designer-config/tests/config/test_seed_source.py +++ b/packages/data-designer-config/tests/config/test_seed_source.py @@ -95,12 +95,14 @@ def test_dataframe_seed_source_serialization() -> None: assert serialized == {"seed_type": "df"} -def test_directory_seed_source_requires_directory(tmp_path: Path) -> None: +def test_directory_seed_source_defers_directory_existence_validation(tmp_path: Path) -> None: file_path = tmp_path / "file.txt" file_path.write_text("alpha", encoding="utf-8") - with pytest.raises(InvalidFilePathError, match="is not a directory"): - DirectorySeedSource(path=str(file_path)) + source = DirectorySeedSource(path=str(file_path)) + + assert source.path == str(file_path) + assert source.runtime_path == str(file_path) def test_directory_seed_source_preserves_relative_path_input(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None: @@ -146,7 +148,7 @@ def test_file_contents_seed_source_preserves_relative_path_input( pytest.param(FileContentsSeedSource, {"file_pattern": "*.txt"}, id="file-contents"), ], ) -def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes( +def test_filesystem_seed_sources_preserve_raw_runtime_path_across_cwd_changes( source_type: type[DirectorySeedSource] | type[FileContentsSeedSource], source_kwargs: dict[str, str], tmp_path: Path, @@ -160,12 +162,11 @@ def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes( monkeypatch.chdir(initial_root) source = source_type(path="seed-dir", **source_kwargs) - expected_runtime_path = str(initial_seed_dir.resolve()) monkeypatch.chdir(later_root) assert source.path == "seed-dir" - assert source.runtime_path == expected_runtime_path + assert source.runtime_path == "seed-dir" assert source.model_dump(mode="json")["path"] == "seed-dir" @@ -176,10 +177,10 @@ def test_seed_source_path_descriptions_document_cwd_resolution() -> None: assert "current working directory" in local_path_description assert "config file location" in local_path_description - assert "current working directory" in directory_path_description - assert "config file location" in directory_path_description - assert "current working directory" in file_contents_path_description - assert "config file location" in file_contents_path_description + assert "active filesystem provider" in directory_path_description + assert "config object is constructed" in directory_path_description + assert "active filesystem provider" in file_contents_path_description + assert "config object is constructed" in file_contents_path_description def test_file_contents_seed_source_parses_from_dict(tmp_path: Path) -> None: diff --git a/packages/data-designer-engine/src/data_designer/engine/compiler.py b/packages/data-designer-engine/src/data_designer/engine/compiler.py index fa3917269..bef430f92 100644 --- a/packages/data-designer-engine/src/data_designer/engine/compiler.py +++ b/packages/data-designer-engine/src/data_designer/engine/compiler.py @@ -10,7 +10,7 @@ from data_designer.config.errors import InvalidConfigError from data_designer.config.sampler_params import UUIDSamplerParams from data_designer.engine.resources.resource_provider import ResourceProvider -from data_designer.engine.resources.seed_reader import SeedReader +from data_designer.engine.resources.seed_reader import SeedReader, SeedReaderConfigError from data_designer.engine.validation import ViolationLevel, rich_print_violations, validate_data_designer_config logger = logging.getLogger(__name__) @@ -31,7 +31,10 @@ def _resolve_and_add_seed_columns(config: DataDesignerConfig, seed_reader: SeedR if not seed_reader: return - seed_col_names = seed_reader.get_column_names() + try: + seed_col_names = seed_reader.get_column_names() + except SeedReaderConfigError as error: + raise InvalidConfigError(str(error)) from error existing_columns = {column.name for column in config.columns} colliding_columns = {name for name in seed_col_names if name in existing_columns} if colliding_columns: diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py index 8f2574cf2..dedf15122 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py @@ -9,7 +9,7 @@ from copy import copy from dataclasses import dataclass from fnmatch import fnmatchcase -from pathlib import Path, PurePosixPath +from pathlib import Path, PurePath, PurePosixPath from typing import TYPE_CHECKING, Any, ClassVar, Generic, Protocol, TypeVar, get_args, get_origin from fsspec.implementations.dirfs import DirFileSystem @@ -50,12 +50,37 @@ class SeedReaderError(DataDesignerError): ... +class SeedReaderConfigError(SeedReaderError): ... + + @dataclass(frozen=True) class SeedReaderFileSystemContext: """Filesystem and root path available to filesystem seed-reader plugins.""" fs: AbstractFileSystem - root_path: Path + root_path: PurePath + + +class FileSystemProvider(Protocol): + """Resolves a runtime path into a rooted fsspec filesystem.""" + + def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext: ... + + def ensure_root_exists(self, *, runtime_path: str) -> None: ... + + +class LocalFileSystemProvider: + """Default filesystem provider backed by the local disk.""" + + def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext: + resolved_root_path = Path(runtime_path).expanduser().resolve() + rooted_fs = DirFileSystem(path=str(resolved_root_path), fs=LocalFileSystem()) + return SeedReaderFileSystemContext(fs=rooted_fs, root_path=resolved_root_path) + + def ensure_root_exists(self, *, runtime_path: str) -> None: + resolved_root_path = Path(runtime_path).expanduser().resolve() + if not resolved_root_path.is_dir(): + raise SeedReaderConfigError(f"🛑 Seed source directory '{resolved_root_path}' does not exist.") class SeedReaderBatch(Protocol): @@ -388,12 +413,23 @@ class FileSystemSeedReader(SeedReader[FileSystemSourceT], ABC): output_columns: ClassVar[list[str] | None] = None + def __init__(self, fs_provider: FileSystemProvider | None = None) -> None: + self._fs_provider = fs_provider or LocalFileSystemProvider() + def _reset_attachment_state(self) -> None: super()._reset_attachment_state() self._filesystem_context = None self._output_df = None self._row_manifest_df = None + def create_filesystem_context(self, root_path: Path | str) -> SeedReaderFileSystemContext: + """Create a rooted filesystem context for directory-backed seed readers. + + This hook is preserved for existing plugin readers. New host integrations + should prefer passing a ``FileSystemProvider`` to the reader constructor. + """ + return self._get_fs_provider().create_context(runtime_path=str(root_path)) + def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection: return self.create_dataframe_duckdb_connection( table_name=self.get_dataset_uri(), @@ -495,10 +531,18 @@ def _get_filesystem_context(self) -> SeedReaderFileSystemContext: self._ensure_attached() context = getattr(self, "_filesystem_context", None) if context is None: + self._get_fs_provider().ensure_root_exists(runtime_path=self.source.runtime_path) context = self.create_filesystem_context(self.source.runtime_path) self._filesystem_context = context return context + def _get_fs_provider(self) -> FileSystemProvider: + provider = getattr(self, "_fs_provider", None) + if provider is None: + provider = LocalFileSystemProvider() + self._fs_provider = provider + return provider + def _get_manifest_dataset_uri(self) -> str: return self._build_internal_table_name("manifest") diff --git a/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py b/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py index 979e66a12..717a29787 100644 --- a/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py +++ b/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py @@ -738,7 +738,7 @@ def test_local_file_seed_reader_uses_load_time_runtime_path_when_cwd_changes( assert list(df["value"]) == [1] -def test_directory_seed_reader_uses_load_time_runtime_path_when_cwd_changes( +def test_directory_seed_reader_uses_read_time_runtime_path_when_cwd_changes( tmp_path: Path, monkeypatch: pytest.MonkeyPatch, ) -> None: @@ -760,8 +760,17 @@ def test_directory_seed_reader_uses_load_time_runtime_path_when_cwd_changes( df = reader.create_duckdb_connection().execute(f"SELECT * FROM '{reader.get_dataset_uri()}'").df() assert source.path == "seed-dir" - assert list(df["relative_path"]) == ["alpha.txt"] - assert list(df["source_path"]) == [str((initial_seed_dir / "alpha.txt").resolve())] + assert list(df["relative_path"]) == ["beta.txt"] + assert list(df["source_path"]) == [str((later_seed_dir / "beta.txt").resolve())] + + +def test_directory_seed_reader_reports_missing_root_before_matching_files(tmp_path: Path) -> None: + missing_dir = tmp_path / "missing" + reader = DirectorySeedReader() + reader.attach(DirectorySeedSource(path=str(missing_dir), file_pattern="*.txt"), PlaintextResolver()) + + with pytest.raises(SeedReaderError, match="Seed source directory .* does not exist"): + reader.get_column_names() def test_filesystem_seed_reader_on_attach_requires_no_super_and_resets_state(tmp_path: Path) -> None: diff --git a/packages/data-designer-engine/tests/engine/test_compiler.py b/packages/data-designer-engine/tests/engine/test_compiler.py index fb3f9dbc0..81d7c0194 100644 --- a/packages/data-designer-engine/tests/engine/test_compiler.py +++ b/packages/data-designer-engine/tests/engine/test_compiler.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -from unittest.mock import patch +from unittest.mock import Mock, patch import pytest @@ -12,7 +12,7 @@ from data_designer.config.seed_source import HuggingFaceSeedSource from data_designer.engine.compiler import compile_data_designer_config from data_designer.engine.resources.resource_provider import ResourceProvider -from data_designer.engine.resources.seed_reader import SeedReader +from data_designer.engine.resources.seed_reader import SeedReader, SeedReaderConfigError from data_designer.engine.validation import Violation, ViolationLevel, ViolationType @@ -55,6 +55,18 @@ def test_errors_on_seed_column_collisions(resource_provider: ResourceProvider): assert "city" in str(excinfo) +def test_seed_reader_config_errors_are_invalid_config_errors(resource_provider: ResourceProvider): + builder = DataDesignerConfigBuilder() + builder.with_seed_dataset(HuggingFaceSeedSource(path="hf://datasets/test/data.csv")) + resource_provider.seed_reader = Mock(spec=SeedReader) + resource_provider.seed_reader.get_column_names.side_effect = SeedReaderConfigError("missing seed root") + + with pytest.raises(InvalidConfigError, match="missing seed root") as excinfo: + compile_data_designer_config(builder.build(), resource_provider) + + assert isinstance(excinfo.value.__cause__, SeedReaderConfigError) + + def test_validation_errors(resource_provider: ResourceProvider): builder = DataDesignerConfigBuilder() builder.add_column( From 487a2a0e669756c3705990931477be84fdb27a4e Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Mon, 22 Jun 2026 10:00:58 -0500 Subject: [PATCH 02/10] SeedSource cleanup Signed-off-by: Mike Knepper --- .../src/data_designer/config/seed_source.py | 59 ++++--------------- .../tests/config/test_seed_source.py | 53 +++++++++++++++++ 2 files changed, 66 insertions(+), 46 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/seed_source.py b/packages/data-designer-config/src/data_designer/config/seed_source.py index ca1f6b2ba..e5040182d 100644 --- a/packages/data-designer-config/src/data_designer/config/seed_source.py +++ b/packages/data-designer-config/src/data_designer/config/seed_source.py @@ -110,8 +110,6 @@ class FileSystemSeedSource(SeedSource, ABC): directory for matching files. Defaults to ``True``. """ - _runtime_path: str | None = PrivateAttr(default=None) - path: str = Field( ..., description=( @@ -137,32 +135,22 @@ def validate_path(cls, value: str | None) -> str | None: # and inherited validators fire for all subclasses. return _validate_filesystem_seed_source_path(value) - def model_post_init(self, __context: Any) -> None: - # None guard is exercised by AgentRolloutSeedSource (path: str | None) via inheritance. - self._runtime_path = None if self.path is None else _resolve_filesystem_runtime_path(self.path) - - @property - def runtime_path(self) -> str: - if self._runtime_path is None: - self._runtime_path = _resolve_filesystem_runtime_path(self.path) - return self._runtime_path - @field_validator("file_pattern", mode="after") def validate_file_pattern(cls, value: str | None) -> str | None: return _validate_filesystem_seed_source_file_pattern(value) - -class DirectorySeedSource(FileSystemSeedSource): - seed_type: Literal["directory"] = "directory" - - def model_post_init(self, __context: Any) -> None: - self._runtime_path = self.path - @property def runtime_path(self) -> str: + # Path resolution and existence checks are the filesystem provider's job at read + # time, not the config object's. Keeping the raw value here preserves relative + # paths and avoids assuming a local filesystem. return self.path +class DirectorySeedSource(FileSystemSeedSource): + seed_type: Literal["directory"] = "directory" + + class FileContentsSeedSource(FileSystemSeedSource): seed_type: Literal["file_contents"] = "file_contents" @@ -179,13 +167,6 @@ def validate_encoding(cls, value: str) -> str: raise ValueError(f"🛑 Unknown encoding: {value!r}. Use a valid Python codec name.") from error return value - def model_post_init(self, __context: Any) -> None: - self._runtime_path = self.path - - @property - def runtime_path(self) -> str: - return self.path - def _resolve_filesystem_runtime_path(path: str) -> str: return str(Path(path).expanduser().resolve()) @@ -224,16 +205,6 @@ def _validate_filesystem_seed_source_path(value: str | None) -> str | None: return value -def _validate_local_filesystem_seed_source_path(value: str | None) -> str | None: - value = _validate_filesystem_seed_source_path(value) - if value is None: - return None - path = Path(value).expanduser().resolve() - if not path.is_dir(): - raise InvalidFilePathError(f"🛑 Path {path} is not a directory.") - return value - - def _validate_filesystem_seed_source_file_pattern(value: str | None) -> str | None: if value is None: return None @@ -296,10 +267,6 @@ class AgentRolloutSeedSource(FileSystemSeedSource): ), ) - @field_validator("path", mode="after") - def validate_path(cls, value: str | None) -> str | None: - return _validate_local_filesystem_seed_source_path(value) - @model_validator(mode="after") def validate_runtime_path_source(self) -> Self: default_path, _ = get_agent_rollout_format_defaults(self.format) @@ -309,14 +276,14 @@ def validate_runtime_path_source(self) -> Self: @property def runtime_path(self) -> str: - if self._runtime_path is not None: - return self._runtime_path + # Path resolution and existence checks happen in the filesystem provider at read + # time. When no explicit path is given, fall back to the format's default root. + if self.path is not None: + return self.path default_path, _ = get_agent_rollout_format_defaults(self.format) - resolved_path = self.path if self.path is not None else default_path - if resolved_path is None: + if default_path is None: raise ValueError(f"🛑 AgentRolloutSeedSource.path is required for format {self.format.value!r}.") - self._runtime_path = _resolve_filesystem_runtime_path(resolved_path) - return self._runtime_path + return default_path @property def resolved_file_pattern(self) -> str: diff --git a/packages/data-designer-config/tests/config/test_seed_source.py b/packages/data-designer-config/tests/config/test_seed_source.py index d886c9ca3..24a54eb54 100644 --- a/packages/data-designer-config/tests/config/test_seed_source.py +++ b/packages/data-designer-config/tests/config/test_seed_source.py @@ -4,6 +4,7 @@ from __future__ import annotations from pathlib import Path +from typing import Literal import pytest @@ -15,6 +16,7 @@ AgentRolloutSeedSource, DirectorySeedSource, FileContentsSeedSource, + FileSystemSeedSource, LocalFileSeedSource, ) from data_designer.config.seed_source_dataframe import DataFrameSeedSource @@ -224,6 +226,17 @@ def test_filesystem_seed_sources_reject_path_like_file_patterns( source_type(path=str(tmp_path), file_pattern=file_pattern) +def test_filesystem_seed_source_subclass_inherits_runtime_path(tmp_path: Path) -> None: + # Plugin authors subclass FileSystemSeedSource directly; readers rely on + # `source.runtime_path`, so the base must provide it without an override. + class PluginSeedSource(FileSystemSeedSource): + seed_type: Literal["plugin-seed-source"] = "plugin-seed-source" + + source = PluginSeedSource(path=str(tmp_path)) + + assert source.runtime_path == str(tmp_path) + + @pytest.mark.parametrize( ("rollout_format", "file_pattern", "error_message"), [ @@ -268,6 +281,46 @@ def test_agent_rollout_seed_source_requires_explicit_atif_path() -> None: AgentRolloutSeedSource(format=AgentRolloutFormat.ATIF) +def test_agent_rollout_seed_source_defers_directory_existence_validation(tmp_path: Path) -> None: + missing_dir = tmp_path / "does-not-exist" + + source = AgentRolloutSeedSource(path=str(missing_dir), format=AgentRolloutFormat.ATIF) + + assert source.path == str(missing_dir) + assert source.runtime_path == str(missing_dir) + + +def test_agent_rollout_seed_source_preserves_raw_runtime_path_across_cwd_changes( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + initial_root = tmp_path / "initial" + later_root = tmp_path / "later" + (initial_root / "seed-dir").mkdir(parents=True) + later_root.mkdir() + + monkeypatch.chdir(initial_root) + source = AgentRolloutSeedSource(path="seed-dir", format=AgentRolloutFormat.ATIF) + + monkeypatch.chdir(later_root) + + assert source.path == "seed-dir" + assert source.runtime_path == "seed-dir" + assert source.model_dump(mode="json")["path"] == "seed-dir" + + +def test_agent_rollout_seed_source_runtime_path_falls_back_to_format_default( + monkeypatch: pytest.MonkeyPatch, + tmp_path: Path, +) -> None: + monkeypatch.setenv("HOME", str(tmp_path)) + + source = AgentRolloutSeedSource(format=AgentRolloutFormat.CLAUDE_CODE) + + assert source.path is None + assert source.runtime_path == str(tmp_path / ".claude" / "projects") + + def test_agent_rollout_seed_source_uses_default_atif_file_pattern(tmp_path: Path) -> None: trace_dir = tmp_path / "atif" trace_dir.mkdir() From 119b11c01f28e42470fcae867a73ab2ba2942044 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Tue, 23 Jun 2026 14:27:58 -0500 Subject: [PATCH 03/10] Add Path/PurePath guard Signed-off-by: Mike Knepper --- .../src/data_designer/engine/resources/seed_reader.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py index dedf15122..93869a8e7 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py @@ -697,8 +697,17 @@ def _get_parse_context(self, context: SeedReaderFileSystemContext) -> AgentRollo if self._parse_context is not self._PARSE_CONTEXT_UNSET: return self._parse_context + # Agent rollout handlers operate on the local filesystem directly (root_path.glob, + # root_path / relative_path), so they require a concrete Path rather than the + # PurePath the context type permits for remote providers. + root_path = context.root_path + if not isinstance(root_path, Path): + raise SeedReaderConfigError( + f"🛑 Agent rollout seed readers require a local filesystem, but got non-local root path " + f"{root_path!r} ({type(root_path).__name__})." + ) handler = self.get_format_handler() - self._parse_context = handler.build_parse_context(root_path=context.root_path, recursive=self.source.recursive) + self._parse_context = handler.build_parse_context(root_path=root_path, recursive=self.source.recursive) return self._parse_context From fad02f042d11c7bcd04441c6814f3a222626c2d6 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Tue, 23 Jun 2026 14:34:08 -0500 Subject: [PATCH 04/10] Fix stale docstring Signed-off-by: Mike Knepper --- .../src/data_designer/config/seed_source.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/seed_source.py b/packages/data-designer-config/src/data_designer/config/seed_source.py index e5040182d..0e06c82ce 100644 --- a/packages/data-designer-config/src/data_designer/config/seed_source.py +++ b/packages/data-designer-config/src/data_designer/config/seed_source.py @@ -253,8 +253,8 @@ class AgentRolloutSeedSource(FileSystemSeedSource): "Claude Code defaults to ~/.claude/projects, Codex defaults to ~/.codex/sessions, " "Hermes Agent defaults to ~/.hermes/sessions, " "and Pi Coding Agent defaults to ~/.pi/agent/sessions. " - "Relative paths are resolved from the current working directory when the config is loaded, " - "not from the config file location." + "Relative local paths are resolved by the active filesystem provider when the seed is " + "validated or read, not when the config object is constructed." ), ) From d0d67470b076c7e91cc6de1a2daa5757c1982bfe Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Fri, 26 Jun 2026 10:45:35 -0500 Subject: [PATCH 05/10] Preserve custom filesystem context overrides --- .../latest/pages/plugins/filesystem_seed_reader.mdx | 9 ++++++++- .../src/data_designer/engine/resources/seed_reader.py | 6 ++++-- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/fern/versions/latest/pages/plugins/filesystem_seed_reader.mdx b/fern/versions/latest/pages/plugins/filesystem_seed_reader.mdx index b54885b88..4e021f551 100644 --- a/fern/versions/latest/pages/plugins/filesystem_seed_reader.mdx +++ b/fern/versions/latest/pages/plugins/filesystem_seed_reader.mdx @@ -165,4 +165,11 @@ If you need more control, `FileSystemSeedReader` also lets you override: - `on_attach(...)` for per-attachment setup - `create_filesystem_context(...)` for custom rooted filesystem behavior -Most filesystem plugins do not need either hook. +For new non-local backends, prefer passing a `FileSystemProvider` to the reader +constructor. The default `create_filesystem_context(...)` implementation calls +the provider's existence preflight and then asks it to create the rooted +filesystem context. Overriding `create_filesystem_context(...)` remains supported +for existing plugins, but that override takes ownership of any backend-specific +existence checks. + +Most filesystem plugins do not need these hooks. diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py index 93869a8e7..343c62853 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py @@ -428,7 +428,10 @@ def create_filesystem_context(self, root_path: Path | str) -> SeedReaderFileSyst This hook is preserved for existing plugin readers. New host integrations should prefer passing a ``FileSystemProvider`` to the reader constructor. """ - return self._get_fs_provider().create_context(runtime_path=str(root_path)) + runtime_path = str(root_path) + provider = self._get_fs_provider() + provider.ensure_root_exists(runtime_path=runtime_path) + return provider.create_context(runtime_path=runtime_path) def create_duckdb_connection(self) -> duckdb.DuckDBPyConnection: return self.create_dataframe_duckdb_connection( @@ -531,7 +534,6 @@ def _get_filesystem_context(self) -> SeedReaderFileSystemContext: self._ensure_attached() context = getattr(self, "_filesystem_context", None) if context is None: - self._get_fs_provider().ensure_root_exists(runtime_path=self.source.runtime_path) context = self.create_filesystem_context(self.source.runtime_path) self._filesystem_context = context return context From 7db319687738200f1d78a697aaddb8efda07d013 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Fri, 26 Jun 2026 10:46:29 -0500 Subject: [PATCH 06/10] Cover filesystem provider injection --- .../engine/resources/test_seed_reader.py | 82 +++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py b/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py index 717a29787..07fb3099b 100644 --- a/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py +++ b/packages/data-designer-engine/tests/engine/resources/test_seed_reader.py @@ -11,6 +11,7 @@ from unittest.mock import patch import pytest +from fsspec.implementations.memory import MemoryFileSystem import data_designer.lazy_heavy_imports as lazy from data_designer.config.seed import IndexRange @@ -29,6 +30,8 @@ FileContentsSeedReader, FileSystemSeedReader, LocalFileSeedReader, + LocalFileSystemProvider, + SeedReaderConfigError, SeedReaderError, SeedReaderFileSystemContext, SeedReaderRegistry, @@ -51,6 +54,20 @@ def hydrate_row( return super().hydrate_row(manifest_row=manifest_row, context=context) +class StubFileSystemProvider: + def __init__(self, context: SeedReaderFileSystemContext) -> None: + self.context = context + self.ensure_root_exists_calls: list[str] = [] + self.create_context_calls: list[str] = [] + + def ensure_root_exists(self, *, runtime_path: str) -> None: + self.ensure_root_exists_calls.append(runtime_path) + + def create_context(self, *, runtime_path: str) -> SeedReaderFileSystemContext: + self.create_context_calls.append(runtime_path) + return self.context + + class PluginStyleDirectorySeedReader(FileSystemSeedReader[DirectorySeedSource]): def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.DataFrame | list[dict[str, str]]: matched_paths = self.get_matching_relative_paths( @@ -186,6 +203,16 @@ def build_manifest(self, *, context: SeedReaderFileSystemContext) -> lazy.pd.Dat return [{"relative_path": relative_path} for relative_path in matched_paths] +class ContextCapturingDirectorySeedReader(FileSystemSeedReader[DirectorySeedSource]): + def __init__(self, fs_provider: StubFileSystemProvider) -> None: + super().__init__(fs_provider=fs_provider) + self.manifest_context: SeedReaderFileSystemContext | None = None + + def build_manifest(self, *, context: SeedReaderFileSystemContext) -> list[dict[str, str]]: + self.manifest_context = context + return [{"label": "captured"}] + + class TrackingAgentRolloutSeedReader(AgentRolloutSeedReader): def __init__(self) -> None: super().__init__() @@ -477,6 +504,61 @@ def test_plugin_style_filesystem_seed_reader_can_fan_out_rows(tmp_path: Path) -> assert list(df["line"]) == ["alpha-0", "alpha-1", "beta-0"] +def test_filesystem_seed_reader_uses_injected_filesystem_provider() -> None: + context = SeedReaderFileSystemContext( + fs=MemoryFileSystem(), + root_path=Path("remote-root"), + ) + context.fs.pipe("alpha.txt", b"alpha") + provider = StubFileSystemProvider(context) + reader = DirectorySeedReader(fs_provider=provider) + reader.attach( + DirectorySeedSource(path="remote-root", file_pattern="*.txt"), + PlaintextResolver(), + ) + + df = reader.create_duckdb_connection().execute(f"SELECT * FROM '{reader.get_dataset_uri()}'").df() + + assert provider.ensure_root_exists_calls == ["remote-root"] + assert provider.create_context_calls == ["remote-root"] + assert list(df["relative_path"]) == ["alpha.txt"] + assert list(df["source_path"]) == ["remote-root/alpha.txt"] + + +def test_filesystem_seed_reader_passes_provider_context_to_manifest_builder() -> None: + context = SeedReaderFileSystemContext( + fs=MemoryFileSystem(), + root_path=Path("captured-root"), + ) + provider = StubFileSystemProvider(context) + reader = ContextCapturingDirectorySeedReader(fs_provider=provider) + reader.attach( + DirectorySeedSource(path="captured-root"), + PlaintextResolver(), + ) + + assert reader.get_column_names() == ["label"] + assert reader.manifest_context is context + + +def test_local_filesystem_provider_creates_context_for_existing_directory(tmp_path: Path) -> None: + (tmp_path / "alpha.txt").write_text("alpha", encoding="utf-8") + + provider = LocalFileSystemProvider() + provider.ensure_root_exists(runtime_path=str(tmp_path)) + context = provider.create_context(runtime_path=str(tmp_path)) + + assert context.root_path == tmp_path.resolve() + assert context.fs.find("", withdirs=False) == ["alpha.txt"] + + +def test_local_filesystem_provider_rejects_missing_directory(tmp_path: Path) -> None: + missing_dir = tmp_path / "missing" + + with pytest.raises(SeedReaderConfigError, match="Seed source directory .* does not exist"): + LocalFileSystemProvider().ensure_root_exists(runtime_path=str(missing_dir)) + + def test_directory_seed_reader_matches_files_recursively(tmp_path: Path) -> None: (tmp_path / "alpha.txt").write_text("alpha", encoding="utf-8") (tmp_path / "nested").mkdir() From e08f4bfb57aecb9bac53baff845df2dde711ec56 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Fri, 26 Jun 2026 10:46:50 -0500 Subject: [PATCH 07/10] Remove base filesystem context helper --- .../src/data_designer/engine/resources/seed_reader.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py index 343c62853..70a5bbcc9 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py @@ -267,12 +267,6 @@ def create_batch_reader( query_result = conn.query(read_query) return DuckDBSeedReaderBatchReader(conn=conn, query_result=query_result, batch_size=batch_size) - def create_filesystem_context(self, root_path: Path | str) -> SeedReaderFileSystemContext: - """Create a rooted filesystem context for directory-backed seed readers.""" - resolved_root_path = Path(root_path).expanduser().resolve() - rooted_fs = DirFileSystem(path=str(resolved_root_path), fs=LocalFileSystem()) - return SeedReaderFileSystemContext(fs=rooted_fs, root_path=resolved_root_path) - def get_matching_relative_paths( self, *, From 8210c599607e24a2690a45cb01096c7e832fa24a Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Fri, 26 Jun 2026 10:47:08 -0500 Subject: [PATCH 08/10] Clarify filesystem provider initialization --- .../src/data_designer/engine/resources/seed_reader.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py index 70a5bbcc9..8d2304499 100644 --- a/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py +++ b/packages/data-designer-engine/src/data_designer/engine/resources/seed_reader.py @@ -412,6 +412,11 @@ def __init__(self, fs_provider: FileSystemProvider | None = None) -> None: def _reset_attachment_state(self) -> None: super()._reset_attachment_state() + # Plugin readers have historically been allowed to define __init__ without + # calling super().__init__(). Attach-time initialization keeps that contract + # while preserving any provider explicitly injected through the base init. + if not hasattr(self, "_fs_provider"): + self._fs_provider = LocalFileSystemProvider() self._filesystem_context = None self._output_df = None self._row_manifest_df = None @@ -533,11 +538,7 @@ def _get_filesystem_context(self) -> SeedReaderFileSystemContext: return context def _get_fs_provider(self) -> FileSystemProvider: - provider = getattr(self, "_fs_provider", None) - if provider is None: - provider = LocalFileSystemProvider() - self._fs_provider = provider - return provider + return self._fs_provider def _get_manifest_dataset_uri(self) -> str: return self._build_internal_table_name("manifest") From f528434b0ed64466e43c7c6490fb2bf25720ac3e Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Fri, 26 Jun 2026 10:47:27 -0500 Subject: [PATCH 09/10] Inline filesystem path validation --- .../src/data_designer/config/seed_source.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/seed_source.py b/packages/data-designer-config/src/data_designer/config/seed_source.py index 0e06c82ce..cf87180ec 100644 --- a/packages/data-designer-config/src/data_designer/config/seed_source.py +++ b/packages/data-designer-config/src/data_designer/config/seed_source.py @@ -133,7 +133,11 @@ class FileSystemSeedSource(SeedSource, ABC): def validate_path(cls, value: str | None) -> str | None: # Signature is str | None because AgentRolloutSeedSource overrides path to str | None # and inherited validators fire for all subclasses. - return _validate_filesystem_seed_source_path(value) + if value is None: + return None + if not value.strip(): + raise InvalidFilePathError("🛑 FileSystemSeedSource.path must be a non-empty string.") + return value @field_validator("file_pattern", mode="after") def validate_file_pattern(cls, value: str | None) -> str | None: @@ -197,14 +201,6 @@ def get_pi_coding_agent_default_path() -> str: return str(Path("~/.pi/agent/sessions").expanduser()) -def _validate_filesystem_seed_source_path(value: str | None) -> str | None: - if value is None: - return None - if not value.strip(): - raise InvalidFilePathError("🛑 FileSystemSeedSource.path must be a non-empty string.") - return value - - def _validate_filesystem_seed_source_file_pattern(value: str | None) -> str | None: if value is None: return None From 590abf0ef63e421be49a84470d196e7af38cb3f9 Mon Sep 17 00:00:00 2001 From: Mike Knepper Date: Fri, 26 Jun 2026 10:47:43 -0500 Subject: [PATCH 10/10] Simplify agent rollout default path --- .../src/data_designer/config/seed_source.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/data-designer-config/src/data_designer/config/seed_source.py b/packages/data-designer-config/src/data_designer/config/seed_source.py index cf87180ec..2a6c44072 100644 --- a/packages/data-designer-config/src/data_designer/config/seed_source.py +++ b/packages/data-designer-config/src/data_designer/config/seed_source.py @@ -277,8 +277,6 @@ def runtime_path(self) -> str: if self.path is not None: return self.path default_path, _ = get_agent_rollout_format_defaults(self.format) - if default_path is None: - raise ValueError(f"🛑 AgentRolloutSeedSource.path is required for format {self.format.value!r}.") return default_path @property