Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion fern/versions/latest/pages/concepts/seed-datasets.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ Directory-backed seed datasets expose these columns:

<Note>
Filesystem matching
`file_pattern` matches file names only, not relative paths. `recursive=True` is the default, so nested subdirectories are searched unless you turn it off.
`file_pattern` matches file names only, not relative paths. `recursive=True` is the default, so nested subdirectories are searched unless you turn it off. Relative local `path` values are resolved by the active filesystem provider when the seed is validated or read, not when the config object is constructed.
</Note>

### 📄 FileContentsSeedSource
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,4 +165,11 @@ If you need more control, `FileSystemSeedReader` also lets you override:
- `on_attach(...)` for per-attachment setup
- `create_filesystem_context(...)` for custom rooted filesystem behavior

Most filesystem plugins do not need either hook.
For new non-local backends, prefer passing a `FileSystemProvider` to the reader
constructor. The default `create_filesystem_context(...)` implementation calls
the provider's existence preflight and then asks it to create the rooted
filesystem context. Overriding `create_filesystem_context(...)` remains supported
for existing plugins, but that override takes ownership of any backend-specific
existence checks.

Most filesystem plugins do not need these hooks.
Original file line number Diff line number Diff line change
Expand Up @@ -100,23 +100,21 @@ class FileSystemSeedSource(SeedSource, ABC):
``FileSystemSeedReader`` implementation.

Attributes:
path: Directory containing seed artifacts. Relative paths are resolved
from the current working directory when the config is loaded, not
from the config file location.
path: Directory containing seed artifacts. Relative local paths are
resolved by the active filesystem provider when the seed is
validated or read, not when the config object is constructed.
file_pattern: Case-sensitive filename pattern used to match files under
the provided directory. Patterns match basenames only, not relative
paths. Defaults to ``'*'``.
recursive: Whether to search nested subdirectories under the provided
directory for matching files. Defaults to ``True``.
"""

_runtime_path: str | None = PrivateAttr(default=None)

path: str = Field(
...,
description=(
"Directory containing seed artifacts. Relative paths are resolved from the current working "
"directory when the config is loaded, not from the config file location."
"Directory containing seed artifacts. Relative local paths are resolved by the active filesystem "
"provider when the seed is validated or read, not when the config object is constructed."
),
)
file_pattern: str = Field(
Expand All @@ -135,22 +133,23 @@ class FileSystemSeedSource(SeedSource, ABC):
def validate_path(cls, value: str | None) -> str | None:
# Signature is str | None because AgentRolloutSeedSource overrides path to str | None
# and inherited validators fire for all subclasses.
return _validate_filesystem_seed_source_path(value)

def model_post_init(self, __context: Any) -> None:
# None guard is exercised by AgentRolloutSeedSource (path: str | None) via inheritance.
self._runtime_path = None if self.path is None else _resolve_filesystem_runtime_path(self.path)

@property
def runtime_path(self) -> str:
if self._runtime_path is None:
self._runtime_path = _resolve_filesystem_runtime_path(self.path)
return self._runtime_path
if value is None:
return None
if not value.strip():
raise InvalidFilePathError("πŸ›‘ FileSystemSeedSource.path must be a non-empty string.")
return value

@field_validator("file_pattern", mode="after")
def validate_file_pattern(cls, value: str | None) -> str | None:
return _validate_filesystem_seed_source_file_pattern(value)

@property
def runtime_path(self) -> str:
# Path resolution and existence checks are the filesystem provider's job at read
# time, not the config object's. Keeping the raw value here preserves relative
# paths and avoids assuming a local filesystem.
return self.path


class DirectorySeedSource(FileSystemSeedSource):
seed_type: Literal["directory"] = "directory"
Expand Down Expand Up @@ -202,15 +201,6 @@ def get_pi_coding_agent_default_path() -> str:
return str(Path("~/.pi/agent/sessions").expanduser())


def _validate_filesystem_seed_source_path(value: str | None) -> str | None:
if value is None:
return None
path = Path(value).expanduser().resolve()
if not path.is_dir():
raise InvalidFilePathError(f"πŸ›‘ Path {path} is not a directory.")
return value


def _validate_filesystem_seed_source_file_pattern(value: str | None) -> str | None:
if value is None:
return None
Expand Down Expand Up @@ -259,8 +249,8 @@ class AgentRolloutSeedSource(FileSystemSeedSource):
"Claude Code defaults to ~/.claude/projects, Codex defaults to ~/.codex/sessions, "
"Hermes Agent defaults to ~/.hermes/sessions, "
"and Pi Coding Agent defaults to ~/.pi/agent/sessions. "
"Relative paths are resolved from the current working directory when the config is loaded, "
"not from the config file location."
"Relative local paths are resolved by the active filesystem provider when the seed is "
"validated or read, not when the config object is constructed."
),
)

Expand All @@ -282,14 +272,12 @@ def validate_runtime_path_source(self) -> Self:

@property
def runtime_path(self) -> str:
if self._runtime_path is not None:
return self._runtime_path
# Path resolution and existence checks happen in the filesystem provider at read
# time. When no explicit path is given, fall back to the format's default root.
if self.path is not None:
return self.path
default_path, _ = get_agent_rollout_format_defaults(self.format)
resolved_path = self.path if self.path is not None else default_path
if resolved_path is None:
raise ValueError(f"πŸ›‘ AgentRolloutSeedSource.path is required for format {self.format.value!r}.")
self._runtime_path = _resolve_filesystem_runtime_path(resolved_path)
return self._runtime_path
return default_path

@property
def resolved_file_pattern(self) -> str:
Expand Down
74 changes: 64 additions & 10 deletions packages/data-designer-config/tests/config/test_seed_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from __future__ import annotations

from pathlib import Path
from typing import Literal

import pytest

Expand All @@ -15,6 +16,7 @@
AgentRolloutSeedSource,
DirectorySeedSource,
FileContentsSeedSource,
FileSystemSeedSource,
LocalFileSeedSource,
)
from data_designer.config.seed_source_dataframe import DataFrameSeedSource
Expand Down Expand Up @@ -95,12 +97,14 @@ def test_dataframe_seed_source_serialization() -> None:
assert serialized == {"seed_type": "df"}


def test_directory_seed_source_requires_directory(tmp_path: Path) -> None:
def test_directory_seed_source_defers_directory_existence_validation(tmp_path: Path) -> None:
file_path = tmp_path / "file.txt"
file_path.write_text("alpha", encoding="utf-8")

with pytest.raises(InvalidFilePathError, match="is not a directory"):
DirectorySeedSource(path=str(file_path))
source = DirectorySeedSource(path=str(file_path))

assert source.path == str(file_path)
assert source.runtime_path == str(file_path)


def test_directory_seed_source_preserves_relative_path_input(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
Expand Down Expand Up @@ -146,7 +150,7 @@ def test_file_contents_seed_source_preserves_relative_path_input(
pytest.param(FileContentsSeedSource, {"file_pattern": "*.txt"}, id="file-contents"),
],
)
def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes(
def test_filesystem_seed_sources_preserve_raw_runtime_path_across_cwd_changes(
source_type: type[DirectorySeedSource] | type[FileContentsSeedSource],
source_kwargs: dict[str, str],
tmp_path: Path,
Expand All @@ -160,12 +164,11 @@ def test_filesystem_seed_sources_cache_runtime_path_across_cwd_changes(

monkeypatch.chdir(initial_root)
source = source_type(path="seed-dir", **source_kwargs)
expected_runtime_path = str(initial_seed_dir.resolve())

monkeypatch.chdir(later_root)

assert source.path == "seed-dir"
assert source.runtime_path == expected_runtime_path
assert source.runtime_path == "seed-dir"
assert source.model_dump(mode="json")["path"] == "seed-dir"


Expand All @@ -176,10 +179,10 @@ def test_seed_source_path_descriptions_document_cwd_resolution() -> None:

assert "current working directory" in local_path_description
assert "config file location" in local_path_description
assert "current working directory" in directory_path_description
assert "config file location" in directory_path_description
assert "current working directory" in file_contents_path_description
assert "config file location" in file_contents_path_description
assert "active filesystem provider" in directory_path_description
assert "config object is constructed" in directory_path_description
assert "active filesystem provider" in file_contents_path_description
assert "config object is constructed" in file_contents_path_description


def test_file_contents_seed_source_parses_from_dict(tmp_path: Path) -> None:
Expand Down Expand Up @@ -223,6 +226,17 @@ def test_filesystem_seed_sources_reject_path_like_file_patterns(
source_type(path=str(tmp_path), file_pattern=file_pattern)


def test_filesystem_seed_source_subclass_inherits_runtime_path(tmp_path: Path) -> None:
    """A minimal ``FileSystemSeedSource`` subclass gets ``runtime_path`` from the base.

    Plugin authors subclass the base directly and readers consume
    ``source.runtime_path``, so the property must work with no override.
    """

    class PluginSeedSource(FileSystemSeedSource):
        seed_type: Literal["plugin-seed-source"] = "plugin-seed-source"

    seed_dir = str(tmp_path)
    plugin_source = PluginSeedSource(path=seed_dir)

    assert plugin_source.runtime_path == seed_dir


@pytest.mark.parametrize(
("rollout_format", "file_pattern", "error_message"),
[
Expand Down Expand Up @@ -267,6 +281,46 @@ def test_agent_rollout_seed_source_requires_explicit_atif_path() -> None:
AgentRolloutSeedSource(format=AgentRolloutFormat.ATIF)


def test_agent_rollout_seed_source_defers_directory_existence_validation(tmp_path: Path) -> None:
    """Constructing the source with a nonexistent directory succeeds and keeps the raw path."""
    absent_path = str(tmp_path / "does-not-exist")

    rollout_source = AgentRolloutSeedSource(path=absent_path, format=AgentRolloutFormat.ATIF)

    assert rollout_source.path == absent_path
    assert rollout_source.runtime_path == absent_path


def test_agent_rollout_seed_source_preserves_raw_runtime_path_across_cwd_changes(
    tmp_path: Path,
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """A relative ``path`` is reported verbatim even after the working directory changes."""
    relative_dir = "seed-dir"
    first_cwd = tmp_path / "initial"
    second_cwd = tmp_path / "later"
    (first_cwd / relative_dir).mkdir(parents=True)
    second_cwd.mkdir()

    # Build the source where the relative directory exists, then move to a
    # working directory where it does not.
    monkeypatch.chdir(first_cwd)
    rollout_source = AgentRolloutSeedSource(path=relative_dir, format=AgentRolloutFormat.ATIF)
    monkeypatch.chdir(second_cwd)

    assert rollout_source.path == relative_dir
    assert rollout_source.runtime_path == relative_dir
    assert rollout_source.model_dump(mode="json")["path"] == relative_dir


def test_agent_rollout_seed_source_runtime_path_falls_back_to_format_default(
    monkeypatch: pytest.MonkeyPatch,
    tmp_path: Path,
) -> None:
    """Without an explicit ``path``, ``runtime_path`` is the format's default root.

    The home directory is redirected to ``tmp_path`` so the expected default is
    deterministic and the test never depends on the real user's home.
    """
    fake_home = str(tmp_path)
    # POSIX home expansion reads HOME; on Windows it reads USERPROFILE instead.
    # Setting both keeps the expected default root the same on every platform.
    # NOTE(review): assumes the Claude Code default is built via ``expanduser`` - confirm.
    monkeypatch.setenv("HOME", fake_home)
    monkeypatch.setenv("USERPROFILE", fake_home)

    source = AgentRolloutSeedSource(format=AgentRolloutFormat.CLAUDE_CODE)

    assert source.path is None
    assert source.runtime_path == str(tmp_path / ".claude" / "projects")


def test_agent_rollout_seed_source_uses_default_atif_file_pattern(tmp_path: Path) -> None:
trace_dir = tmp_path / "atif"
trace_dir.mkdir()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from data_designer.config.errors import InvalidConfigError
from data_designer.config.sampler_params import UUIDSamplerParams
from data_designer.engine.resources.resource_provider import ResourceProvider
from data_designer.engine.resources.seed_reader import SeedReader
from data_designer.engine.resources.seed_reader import SeedReader, SeedReaderConfigError
from data_designer.engine.validation import ViolationLevel, rich_print_violations, validate_data_designer_config

logger = logging.getLogger(__name__)
Expand All @@ -31,7 +31,10 @@ def _resolve_and_add_seed_columns(config: DataDesignerConfig, seed_reader: SeedR
if not seed_reader:
return

seed_col_names = seed_reader.get_column_names()
try:
seed_col_names = seed_reader.get_column_names()
except SeedReaderConfigError as error:
raise InvalidConfigError(str(error)) from error
existing_columns = {column.name for column in config.columns}
colliding_columns = {name for name in seed_col_names if name in existing_columns}
if colliding_columns:
Expand Down
Loading
Loading