# File extensions treated as binary; matching files are skipped without
# inspecting their content.
_BINARY_EXTENSIONS = frozenset({
    ".pdf", ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".ico",
    ".woff", ".woff2", ".ttf", ".otf", ".eot",
    ".zip", ".tar", ".gz", ".bz2", ".xz", ".7z", ".rar",
    ".exe", ".dll", ".so", ".dylib", ".bin", ".o", ".a",
    ".pyc", ".pyo", ".class", ".wasm",
    ".mp3", ".mp4", ".wav", ".avi", ".mov", ".webm",
    ".sqlite", ".db",
})

# Number of leading characters of the content that are searched for a NUL.
_NULL_BYTE_SAMPLE_SIZE = 512


def _is_binary_file(path: str, content: str) -> bool:
    """Return True when *path* should be treated as a binary file.

    Used by ``run_static_patterns`` to skip a file before any pattern module
    analyzes it.

    Args:
        path: File path; only the text from its last ``"."`` onward is used.
        content: Decoded file content.

    Returns:
        True if the lowercased extension is listed in ``_BINARY_EXTENSIONS``
        or a NUL character occurs within the first ``_NULL_BYTE_SAMPLE_SIZE``
        characters of *content*; False otherwise.
    """
    _, dot, tail = path.rpartition(".")
    # ``dot`` is empty when the path contains no "." at all.
    if dot and f".{tail}".lower() in _BINARY_EXTENSIONS:
        return True
    return "\x00" in content[:_NULL_BYTE_SAMPLE_SIZE]


# A setup verb followed, on the same line, by a ``.env`` file name.
# The negative lookahead stops ``.env`` from matching inside longer
# identifiers such as ``os.environ`` or ``.envelope`` while still accepting
# ``.env``, ``.env.example``, ``.env.local`` and ``.env_file``.
_PE3_ENV_REFERENCE_CONTEXT = re.compile(
    r"(?:create|copy|rename|add|set up|configure|make)\s+.*\.env(?![a-z0-9])",
    re.IGNORECASE,
)

# Lowercase phrases that mark a context as talking *about* an env file.
_PE3_DOC_PHRASES = (
    ".env.example",
    "cp .env",
    "copy .env",
    "mv .env",
    "rename .env",
    ".env file",
    "environment file",
    "dotenv",
)


def _is_env_file_reference_in_docs(
    finding: AnalyzerFinding,
    file_type: str,
    file_path: str = "",
) -> bool:
    """Return True if a PE3 finding is a documentation reference to .env files, not actual access.

    Used by ``run_static_patterns`` to drop such findings before they are
    processed further.

    SKILL.md is exempt: it is the agent's primary instruction file, so `.env`
    references there may be genuine credential-access instructions.

    Args:
        finding: The finding to classify; its ``rule_id`` and ``context``
            attributes are read.
        file_type: Inferred file type; only ``"markdown"`` and ``"text"``
            are eligible for filtering.
        file_path: Path of the scanned file, used for the SKILL.md exemption.

    Returns:
        True when the finding should be filtered out as a documentation
        reference; False otherwise.
    """
    if finding.rule_id != "PE3" or file_type not in ("markdown", "text"):
        return False
    # NOTE(review): this is a suffix match, so e.g. "my-skill.md" is exempt
    # as well -- confirm whether an exact file-name match is intended.
    if file_path.replace("\\", "/").lower().endswith("skill.md"):
        return False
    context = finding.context
    if not context:
        return False
    if _PE3_ENV_REFERENCE_CONTEXT.search(context):
        return True
    lowered = context.lower()
    return any(phrase in lowered for phrase in _PE3_DOC_PHRASES)


def _is_eval_dataset(path: str) -> bool:
    """Return True for authored eval datasets that contain test-case prose."""
    return path.replace("\\", "/") in _EVAL_DATASET_FILES
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for binary file skipping and PE3 .env documentation reference filtering.""" + +from __future__ import annotations + +from unittest.mock import MagicMock + +from skillspector.models import AnalyzerFinding, Location, Severity +from skillspector.nodes.analyzers.static_runner import ( + _is_binary_file, + _is_env_file_reference_in_docs, + run_static_patterns, +) + + +def _make_pe3_finding(context: str) -> AnalyzerFinding: + return AnalyzerFinding( + rule_id="PE3", + message="Credential Access", + severity=Severity.HIGH, + location=Location(file="docs/setup.md", start_line=10), + confidence=0.6, + tags=["privilege_escalation"], + context=context, + matched_text=".env", + ) + + +class TestBinaryFileDetection: + """Binary files are correctly identified and skipped.""" + + def test_pdf_extension_detected(self) -> None: + assert _is_binary_file("report.pdf", "some content") is True + + def test_png_extension_detected(self) -> None: + assert _is_binary_file("image.png", "fake data") is True + + def test_zip_extension_detected(self) -> None: + assert _is_binary_file("archive.zip", "PK\x03\x04") is True + + def test_exe_extension_detected(self) -> None: + assert _is_binary_file("tool.exe", "MZ") is True + + def test_markdown_not_binary(self) -> None: + assert _is_binary_file("README.md", "# Hello\n") is False + + def test_python_not_binary(self) -> None: + assert _is_binary_file("tool.py", "import os\n") is False + + def test_null_byte_in_content_detected(self) -> None: + content = "start\x00binary\x00data" + assert _is_binary_file("unknownfile", content) 
is True + + def test_no_null_byte_not_binary(self) -> None: + assert _is_binary_file("unknownfile", "normal text content") is False + + def test_case_insensitive_extension(self) -> None: + assert _is_binary_file("photo.JPEG", "data") is True + assert _is_binary_file("archive.ZIP", "PK") is True + + def test_svg_not_treated_as_binary(self) -> None: + """SVG is text/XML and can carry