Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@

# MP2: Context Window Stuffing — filling context to displace content
MP2_PATTERNS = [
(r"((\S)(?!\2).{1,19}?)\1{20,}", 0.8),
(r"(.{2,20}?)\1{20,}", 0.8),
(
r"(?:repeat|duplicate|echo)\s+(?:this|the\s+following)\s+(?:\d{3,}|many|hundreds?|thousands?)\s+times?",
0.85,
Expand Down Expand Up @@ -182,6 +182,10 @@ def ctx(start: int) -> str:
)
for pattern, confidence in MP2_PATTERNS:
for match in re.finditer(pattern, content, re.IGNORECASE | re.MULTILINE):
captured = match.group(1) if match.lastindex else match.group(0)
non_ws_chars = set(captured) - {" ", "\t", "\n", "\r"}
if len(non_ws_chars) <= 1 and not any(c in captured for c in (" ", "\t")):
continue
line_num = get_line_number(content, match.start())
findings.append(
AnalyzerFinding(
Expand Down
106 changes: 106 additions & 0 deletions tests/nodes/analyzers/test_mp2_regex_backtracking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for MP2 regex pattern — catastrophic backtracking prevention."""

from __future__ import annotations

import time

import pytest

from skillspector.nodes.analyzers import static_patterns_memory_poisoning as mp_module


class TestMP2DetectsStuffing:
"""MP2 correctly detects repeated content (context window stuffing)."""

def test_repeated_phrase_detected(self) -> None:
"""A multi-char phrase repeated 25+ times triggers MP2."""
content = "ABCD" * 30
findings = mp_module.analyze(content, "test.md", "markdown")
mp2 = [f for f in findings if f.rule_id == "MP2"]
assert len(mp2) >= 1

def test_repeated_short_phrase_detected(self) -> None:
"""A short multi-char phrase repeated 25+ times triggers MP2."""
content = "DEADBEEF_PAYLOAD" * 25
findings = mp_module.analyze(content, "payload.md", "markdown")
mp2 = [f for f in findings if f.rule_id == "MP2"]
assert len(mp2) >= 1

def test_short_repetition_not_detected(self) -> None:
"""Under 20 repetitions should not trigger the repetition pattern."""
content = "hello world. " * 5
findings = mp_module.analyze(content, "normal.md", "markdown")
mp2_repetition = [
f for f in findings
if f.rule_id == "MP2" and "Context Window Stuffing" in f.message
]
assert len(mp2_repetition) == 0

def test_separator_line_not_detected(self) -> None:
"""Single-char separators like '=' * 80 should be suppressed."""
content = "=" * 80
findings = mp_module.analyze(content, "readme.md", "markdown")
mp2 = [f for f in findings if f.rule_id == "MP2"]
assert len(mp2) == 0

def test_whitespace_bearing_stuffing_detected(self) -> None:
"""Repeated tokens containing whitespace (e.g. 'x ' * 30) must not be suppressed."""
content = "x " * 30
findings = mp_module.analyze(content, "payload.md", "markdown")
mp2 = [f for f in findings if f.rule_id == "MP2"]
assert len(mp2) >= 1, "Whitespace-bearing stuffing should be detected, not suppressed"


class TestMP2NoBacktracking:
"""MP2 regex completes in bounded time on adversarial inputs."""

@pytest.mark.timeout(5)
def test_non_matching_random_input_completes_fast(self) -> None:
"""Non-repeating input of moderate size should complete within 5 seconds.

The old regex with nested lazy quantifier and backreference would hang
on non-matching inputs due to catastrophic backtracking.
"""
content = "".join(chr(65 + (i % 26)) for i in range(2000))
start = time.monotonic()
mp_module.analyze(content, "adversarial.txt", "text")
elapsed = time.monotonic() - start
assert elapsed < 5.0, f"MP2 regex took {elapsed:.1f}s — possible backtracking"

@pytest.mark.timeout(5)
def test_near_miss_pattern_completes_fast(self) -> None:
"""Input with almost-repeating but not-quite structure completes quickly.

This is the classic ReDoS vector: content that almost matches but
requires the regex engine to explore many backtracking paths.
"""
content = ("abcdefghij" * 19) + "abcdefghiX" + ("abcdefghij" * 5)
start = time.monotonic()
mp_module.analyze(content, "nearmiss.txt", "text")
elapsed = time.monotonic() - start
assert elapsed < 5.0, f"MP2 regex took {elapsed:.1f}s — possible backtracking"

@pytest.mark.timeout(5)
def test_large_non_repeating_content(self) -> None:
"""5KB of non-repeating text should not cause regex to hang."""
lines = [f"Line {i}: This is unique content number {i * 7 + 3}." for i in range(100)]
content = "\n".join(lines)
start = time.monotonic()
mp_module.analyze(content, "large.md", "markdown")
elapsed = time.monotonic() - start
assert elapsed < 5.0, f"MP2 regex took {elapsed:.1f}s on 5KB input"