From ef93af5d5c5ac3541abb4485a55d3e00deb38689 Mon Sep 17 00:00:00 2001 From: dvcdsys Date: Fri, 5 Jun 2026 17:31:53 +0100 Subject: [PATCH 1/2] feat(chunker): add Solidity language support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Solidity (`.sol`) files were detected as an unknown language and indexed as plain text via a sliding window — no symbols, no contract/function navigation, no `cix def` / `cix refs`. Repos cloned by the server-side `repoindexer` skipped `.sol` files entirely because their language resolved to "". Fix on three sides: - `server/internal/langdetect`: map `.sol` → "solidity" - `server/internal/chunker`: register `grammars.SolidityLanguage` with node kinds for contract / library (class), interface / struct / enum / event (type), and function / modifier / constructor / fallback-receive (function). Tree-sitter grammar was already shipped by gotreesitter. - `cli/internal/discovery`: mirror `.sol` → "solidity" in the CLI's own extension map so locally-discovered files reach the server with the right language tag instead of "". Tests: - new `TestChunkFile_Solidity` verifies contract / function symbols are extracted with correct kind and parent linkage - `TestRegistry_NodeNamesMatchAST` fixture extended with a Solidity snippet so node names stay in sync with the grammar - langdetect test gets a `Token.sol → solidity` case Verified on a sample contract: 10 symbols extracted (contract, library, interface, struct, enum, event, function, modifier, method) where before everything came out as a single `block` chunk with no symbol name. 
Co-Authored-By: Claude Opus 4.7 --- cli/internal/discovery/language.go | 1 + server/internal/chunker/chunker.go | 9 ++++ server/internal/chunker/chunker_test.go | 51 +++++++++++++++++++ server/internal/langdetect/langdetect.go | 43 ++++++++-------- server/internal/langdetect/langdetect_test.go | 3 +- 5 files changed, 85 insertions(+), 22 deletions(-) diff --git a/cli/internal/discovery/language.go b/cli/internal/discovery/language.go index ac6e91f..951e4be 100644 --- a/cli/internal/discovery/language.go +++ b/cli/internal/discovery/language.go @@ -27,6 +27,7 @@ var ExtensionMap = map[string]string{ ".swift": "swift", ".kt": "kotlin", ".scala": "scala", + ".sol": "solidity", ".r": "r", ".lua": "lua", ".sh": "bash", diff --git a/server/internal/chunker/chunker.go b/server/internal/chunker/chunker.go index b5e34a2..f8768b0 100644 --- a/server/internal/chunker/chunker.go +++ b/server/internal/chunker/chunker.go @@ -432,6 +432,15 @@ func defaultRegistry() map[string]languageEntry { }, identifiers: idID("type_identifier"), }, + "solidity": { + factory: grammars.SolidityLanguage, + nodes: map[string][]string{ + "function": {"function_definition", "modifier_definition", "constructor_definition", "fallback_receive_definition"}, + "class": {"contract_declaration", "library_declaration"}, + "type": {"interface_declaration", "struct_declaration", "enum_declaration", "event_definition"}, + }, + identifiers: idID(), + }, } } diff --git a/server/internal/chunker/chunker_test.go b/server/internal/chunker/chunker_test.go index 15310c5..6287ee5 100644 --- a/server/internal/chunker/chunker_test.go +++ b/server/internal/chunker/chunker_test.go @@ -475,6 +475,56 @@ end } } +func TestChunkFile_Solidity(t *testing.T) { + src := `// SPDX-License-Identifier: MIT +pragma solidity ^0.8.0; + +interface IERC20 { + function transfer(address to, uint256 amount) external returns (bool); +} + +contract Token is IERC20 { + enum State { Active, Paused } + + function transfer(address to, uint256 
amount) external returns (bool) { + return true; + } +} +` + chunks, _, err := ChunkFile("Token.sol", src, "solidity", 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if len(chunks) == 0 { + t.Fatal("expected chunks from Solidity source") + } + counts := chunkTypeCounts(chunks) + if counts["class"] == 0 { + t.Errorf("expected a class chunk for the contract, got: %v", counts) + } + + var names []string + for _, c := range chunks { + if c.SymbolName != nil { + names = append(names, *c.SymbolName) + } + } + if len(names) == 0 { + t.Fatalf("expected named symbols from Solidity source, got none (chunk types: %v)", counts) + } + want := map[string]bool{"Token": false, "transfer": false} + for _, n := range names { + if _, ok := want[n]; ok { + want[n] = true + } + } + for sym, found := range want { + if !found { + t.Errorf("expected symbol %q among extracted symbols %v", sym, names) + } + } +} + // --- Configure() filtering --- func TestConfigure_FilterToSubset(t *testing.T) { @@ -605,6 +655,7 @@ func TestRegistry_NodeNamesMatchAST(t *testing.T) { "fortran": "subroutine s\nend subroutine\n", "haskell": "module M where\n\nf :: Int -> Int\nf x = x\n", "ocaml": "let f x = x\n", + "solidity": "contract C {\n function f() public {}\n}\n", } for lang, src := range fixtures { diff --git a/server/internal/langdetect/langdetect.go b/server/internal/langdetect/langdetect.go index 4568fdf..eef4755 100644 --- a/server/internal/langdetect/langdetect.go +++ b/server/internal/langdetect/langdetect.go @@ -12,29 +12,30 @@ import ( // Ported 1:1 from EXTENSION_MAP in api/app/core/language.py. 
var extensionMap = map[string]string{ // Systems / compiled - ".py": "python", - ".go": "go", - ".rs": "rust", - ".java": "java", - ".c": "c", - ".h": "c", - ".cpp": "cpp", - ".cc": "cpp", - ".cxx": "cpp", - ".hpp": "cpp", - ".cs": "c_sharp", + ".py": "python", + ".go": "go", + ".rs": "rust", + ".java": "java", + ".c": "c", + ".h": "c", + ".cpp": "cpp", + ".cc": "cpp", + ".cxx": "cpp", + ".hpp": "cpp", + ".cs": "c_sharp", ".swift": "swift", - ".kt": "kotlin", - ".kts": "kotlin", + ".kt": "kotlin", + ".kts": "kotlin", ".scala": "scala", - ".zig": "zig", - ".jl": "julia", - ".f90": "fortran", - ".f95": "fortran", - ".f03": "fortran", - ".f": "fortran", - ".m": "objc", - ".mm": "objc", + ".zig": "zig", + ".jl": "julia", + ".f90": "fortran", + ".f95": "fortran", + ".f03": "fortran", + ".f": "fortran", + ".m": "objc", + ".mm": "objc", + ".sol": "solidity", // Web / scripting ".ts": "typescript", ".tsx": "tsx", diff --git a/server/internal/langdetect/langdetect_test.go b/server/internal/langdetect/langdetect_test.go index 27e94f7..defe8fb 100644 --- a/server/internal/langdetect/langdetect_test.go +++ b/server/internal/langdetect/langdetect_test.go @@ -33,8 +33,9 @@ func TestDetect(t *testing.T) { {"README.md", "markdown"}, {"unknown.xyz", ""}, {"/some/path/to/main.go", "go"}, - {"script.R", "r"}, // uppercase .R + {"script.R", "r"}, // uppercase .R {"script.sh", "bash"}, + {"Token.sol", "solidity"}, {"build.gradle.kts", "kotlin"}, {"app.kts", "kotlin"}, } From 1eddfa3cdd0d4c1b1e9f922f20dde624640324c6 Mon Sep 17 00:00:00 2001 From: dvcdsys Date: Fri, 5 Jun 2026 17:39:46 +0100 Subject: [PATCH 2/2] review: update LANGUAGES.md, strict node-kind regression for Solidity PR review feedback: 1. doc/LANGUAGES.md was missed in the original PR. Bump default set header from (30) -> (31) and add the `solidity` row to the table. 2. 
TestRegistry_NodeNamesMatchAST's `matched := false` logic passes if any *one* of a language's configured kinds appears in the AST -- so a future grammar rename of `modifier_definition` (or any of the other 8 Solidity kinds beyond contract/function) would slip through. Reviewer flagged this as a low-priority shared limitation; rather than reshape the generic test for every language, add a focused `TestRegistry_SolidityAllNodeKindsPresent` that parses a contract exercising all 10 advertised node kinds and fails if any is missing. Also widen the Solidity fixture in the generic test for parity. Co-Authored-By: Claude Opus 4.7 --- doc/LANGUAGES.md | 3 +- server/internal/chunker/chunker_test.go | 85 ++++++++++++++++++++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/doc/LANGUAGES.md b/doc/LANGUAGES.md index 70ef5c4..69164af 100644 --- a/doc/LANGUAGES.md +++ b/doc/LANGUAGES.md @@ -2,7 +2,7 @@ cix uses tree-sitter (via `github.com/odvcencio/gotreesitter`) to extract semantic chunks (functions, classes, methods, types) from source code. Files in unsupported languages still get indexed via a sliding-window fallback — they're searchable, just without per-symbol granularity. -## Default language set (30) +## Default language set (31) | ID | gotreesitter factory | Function | Class | Method | Type | |---|---|:-:|:-:|:-:|:-:| @@ -36,6 +36,7 @@ cix uses tree-sitter (via `github.com/odvcencio/gotreesitter`) to extract semant | `fortran` | `FortranLanguage` | ✓ | ✓ | | | | `haskell` | `HaskellLanguage` | ✓ | | | ✓ | | `ocaml` | `OcamlLanguage` | ✓ | ✓ | | ✓ | +| `solidity` | `SolidityLanguage` | ✓ | ✓ | | ✓ | The exact AST node types per language live in `server/internal/chunker/chunker.go` (`defaultRegistry`). File-extension mapping lives in `server/internal/langdetect/langdetect.go`. 
diff --git a/server/internal/chunker/chunker_test.go b/server/internal/chunker/chunker_test.go index 6287ee5..78463f8 100644 --- a/server/internal/chunker/chunker_test.go +++ b/server/internal/chunker/chunker_test.go @@ -525,6 +525,74 @@ contract Token is IERC20 { } } +// TestRegistry_SolidityAllNodeKindsPresent is a stricter sibling to +// TestRegistry_NodeNamesMatchAST: the generic test passes if *any* one of a +// language's configured node types appears in the AST, which would let a +// grammar rename of e.g. `modifier_definition` slip through unnoticed. +// +// Solidity's registry entry advertises 10 node kinds. This test parses a +// contract that exercises every one of them and fails loudly if any single +// kind is missing from the AST — protecting downstream `cix def` / `cix refs` +// behavior for modifiers, events, constructors, and receive/fallback +// functions, which are easy to overlook in the broader smoke test. +func TestRegistry_SolidityAllNodeKindsPresent(t *testing.T) { + defer Configure(nil) + Configure(nil) + + src := `// SPDX-License-Identifier: MIT +pragma solidity ^0.8.0; + +interface I { function f() external; } +library L { function g() internal pure returns (uint) { return 1; } } +contract C { + struct S { uint x; } + enum E { A, B } + event Ev(uint x); + modifier m() { _; } + constructor() {} + receive() external payable {} + function f() public {} +} +` + + registryMu.RLock() + fn, ok := languageRegistry["solidity"] + nodes := languageNodes["solidity"] + registryMu.RUnlock() + if !ok { + t.Fatal("solidity not in registry") + } + if nodes == nil { + t.Fatal("solidity has no node map") + } + + grammar := fn() + if grammar == nil { + t.Fatal("nil solidity grammar") + } + + parser := sitter.NewParser(grammar) + tree, err := parser.Parse([]byte(src)) + if err != nil { + t.Fatalf("parse error: %v", err) + } + root := tree.RootNode() + if root == nil { + t.Fatal("nil root") + } + + seen := map[string]struct{}{} + collectNodeTypes(root, 
grammar, seen) + + for _, types := range nodes { + for _, ty := range types { + if _, found := seen[ty]; !found { + t.Errorf("configured Solidity node type %q not present in AST — grammar rename?", ty) + } + } + } +} + // --- Configure() filtering --- func TestConfigure_FilterToSubset(t *testing.T) { @@ -655,7 +723,22 @@ func TestRegistry_NodeNamesMatchAST(t *testing.T) { "fortran": "subroutine s\nend subroutine\n", "haskell": "module M where\n\nf :: Int -> Int\nf x = x\n", "ocaml": "let f x = x\n", - "solidity": "contract C {\n function f() public {}\n}\n", + // Covers every configured kind: contract_declaration, library_declaration, + // interface_declaration, struct_declaration, enum_declaration, event_definition, + // function_definition, modifier_definition, constructor_definition, + // fallback_receive_definition. The per-kind rename guard is TestRegistry_SolidityAllNodeKindsPresent; this test passes on any single match. + "solidity": "" + + "interface I { function f() external; }\n" + + "library L { function g() internal pure returns (uint) { return 1; } }\n" + + "contract C {\n" + + " struct S { uint x; }\n" + + " enum E { A, B }\n" + + " event Ev(uint x);\n" + + " modifier m() { _; }\n" + + " constructor() {}\n" + + " receive() external payable {}\n" + + " function f() public {}\n" + + "}\n", } for lang, src := range fixtures {