From fc39723bbc70a78418bb127e11da9a7b37d48c23 Mon Sep 17 00:00:00 2001 From: Raman Marozau Date: Thu, 9 Apr 2026 01:50:15 +0200 Subject: [PATCH 1/2] fix(fetch): add fallback extraction for readability-stripped content - Add fallback mechanism when readability extracts minimal content (e.g., SSR hydration divs) - Implement minimum content length threshold (1% of original HTML) to detect over-aggressive stripping - Fall back to extraction without readability, then raw HTML markdownify if threshold not met - Update extract_content_from_html to gracefully handle empty content instead of returning error - Add comprehensive test suite for fallback scenarios including SSR patterns and hidden content - Update test_empty_content_returns_error to test graceful handling instead of error message --- src/fetch/src/mcp_server_fetch/server.py | 42 +++++++++-- src/fetch/tests/test_server.py | 95 ++++++++++++++++++++++-- 2 files changed, 126 insertions(+), 11 deletions(-) diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..dc32afa5f9 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -27,6 +27,10 @@ def extract_content_from_html(html: str) -> str: """Extract and convert HTML content to Markdown format. + Uses readability for content extraction with fallback mechanisms for cases + where readability strips too much content (e.g. sites using hidden divs + for SSR hydration). + Args: html: Raw HTML content to process @@ -36,12 +40,38 @@ def extract_content_from_html(html: str) -> str: ret = readabilipy.simple_json.simple_json_from_html_string( html, use_readability=True ) - if not ret["content"]: - return "Page failed to be simplified from HTML" - content = markdownify.markdownify( - ret["content"], - heading_style=markdownify.ATX, - ) + content_html = ret.get("content", "") + if content_html: + content = markdownify.markdownify( + content_html, + heading_style=markdownify.ATX, + ) + else: + content = "" + + # If readability extracted very little text compared to the original HTML, + # it likely stripped meaningful content (e.g. hidden SSR hydration divs). + # Fall back to extraction without readability, then raw markdownify. + min_length = max(1, len(html) // 100) + content_text = content.strip() + if len(content_text) < min_length: + ret = readabilipy.simple_json.simple_json_from_html_string( + html, use_readability=False + ) + if ret["content"]: + content = markdownify.markdownify( + ret["content"], + heading_style=markdownify.ATX, + ) + if len(content.strip()) >= min_length: + return content + + # Last resort: convert the raw HTML directly + content = markdownify.markdownify( + html, + heading_style=markdownify.ATX, + ) + return content diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py index 96c1cb38c7..9f23e82703 100644 --- a/src/fetch/tests/test_server.py +++ b/src/fetch/tests/test_server.py @@ -81,11 +81,11 @@ def test_html_with_links(self): result = extract_content_from_html(html) assert "Example" in result - def test_empty_content_returns_error(self): - """Test that empty/invalid HTML returns error message.""" - html = "" - result = extract_content_from_html(html) - assert "" in result + def test_empty_content(self): + """Test that empty HTML is handled gracefully via fallback.""" + result = extract_content_from_html("") + # Empty input produces empty output after fallback + assert "" not in result class TestCheckMayAutonomouslyFetchUrl: @@ -324,3 +324,88 @@ async def test_fetch_with_proxy(self): # Verify AsyncClient was called with proxy mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080") + + +class TestExtractContentFromHtmlFallback: + """Tests for extract_content_from_html readability fallback.""" + + def test_normal_html_no_fallback(self): + """Test that normal HTML with visible content works without fallback.""" + html = """ + + Normal Page + +
+

Welcome

+

This is a normal page with plenty of visible content that + readability should have no trouble extracting properly.

+

Here is another paragraph with more content to ensure + we have enough text for the extraction to work well.

+
+ + + """ + result = extract_content_from_html(html) + assert "normal page" in result.lower() + assert "" not in result + + def test_hidden_ssr_content_triggers_fallback(self): + """Test that hidden SSR content triggers fallback extraction.""" + visible_shell = "

Loading...

" + hidden_content = "

{}

".format(" ".join(["word"] * 500)) + html = """ + + +
{}
+ + + + """.format(visible_shell, hidden_content) + result = extract_content_from_html(html) + # Fallback should recover the hidden content + assert len(result.strip()) > len(html) // 100 + + def test_readability_empty_content_triggers_fallback(self): + """Test that readability returning empty content triggers fallback.""" + content_text = " ".join(["meaningful"] * 200) + html = """ + + +

{}

+ + + """.format(content_text) + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + # First call (use_readability=True) returns empty content + # Second call (use_readability=False) returns empty content too + mock_readability.return_value = {"content": None} + result = extract_content_from_html(html) + # Fallback to raw markdownify should recover content + assert "meaningful" in result + assert "" not in result + + def test_small_visible_shell_large_hidden_ssr(self): + """Test realistic SSR pattern: small visible loading shell + large hidden content.""" + ssr_paragraphs = "\n".join( + "

Article paragraph {} with enough text to be meaningful content.

".format(i) + for i in range(50) + ) + html = """ + + SSR App + +
+
Loading application...
+
+
+
+

Full Article Title

+ {} +
+
+ + + """.format(ssr_paragraphs) + result = extract_content_from_html(html) + # Should not return just "Loading application..." — fallback should recover content + assert len(result.strip()) > len(html) // 100 From 98352637ab989a5e2123a5fca179a474c0ebb2c1 Mon Sep 17 00:00:00 2001 From: Raman Marozau Date: Tue, 14 Apr 2026 14:18:51 +0200 Subject: [PATCH 2/2] test(fetch): expand extract_content_from_html fallback test coverage - Replace single generic test with 11 comprehensive test cases covering fallback scenarios - Add tests for readability returning sufficient content without fallback - Add tests for readability stripping content and triggering fallback to non-readability mode - Add tests for both readability modes failing and falling back to raw markdownify - Add tests for None content and missing 'content' key edge cases - Add tests for 1% threshold calculation and boundary conditions - Add tests for whitespace-only content triggering fallback - Add tests for stage 2 (no-readability) failures falling back to raw extraction - Improve test clarity with descriptive names and docstrings explaining each scenario - Ensure comprehensive coverage of the three-stage fallback extraction pipeline --- src/fetch/tests/test_server.py | 223 +++++++++++++++++++++++++++------ 1 file changed, 185 insertions(+), 38 deletions(-) diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py index 9f23e82703..ff43779d43 100644 --- a/src/fetch/tests/test_server.py +++ b/src/fetch/tests/test_server.py @@ -329,24 +329,149 @@ async def test_fetch_with_proxy(self): class TestExtractContentFromHtmlFallback: """Tests for extract_content_from_html readability fallback.""" - def test_normal_html_no_fallback(self): - """Test that normal HTML with visible content works without fallback.""" - html = """ - - Normal Page - -
-

Welcome

-

This is a normal page with plenty of visible content that - readability should have no trouble extracting properly.

-

Here is another paragraph with more content to ensure - we have enough text for the extraction to work well.

-
- - - """ + def test_readability_sufficient_content_no_fallback(self): + """When Readability returns enough content, no fallback is triggered.""" + html = "" + "

word

" * 200 + "" + readability_content = "
" + "

word

" * 200 + "
" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": readability_content} + result = extract_content_from_html(html) + assert mock_readability.call_count == 1 + assert "word" in result + + def test_readability_strips_content_falls_back_to_no_readability(self): + """When Readability returns too little, falls back to non-Readability extraction.""" + html = "" + "

content

" * 500 + "" + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "
Loading...
"} + else: + return {"content": "
" + "

content

" * 500 + "
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + assert "content" in result + assert len(result.strip()) > 100 + + def test_both_readability_modes_fail_falls_back_to_markdownify(self): + """When both readabilipy modes return too little, falls back to raw markdownify.""" + html = "" + "

important data

" * 300 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": ""} + result = extract_content_from_html(html) + assert "important data" in result + assert mock_readability.call_count == 2 + + def test_readability_none_content_triggers_fallback(self): + """When Readability returns None content, fallback is triggered.""" + html = "" + "

real content

" * 200 + "" + + call_count = [0] + + def mock_simple_json(h, use_readability=True): + call_count[0] += 1 + if call_count[0] == 1: + return {"content": None} + else: + return {"content": "
" + "

real content

" * 200 + "
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + assert "real content" in result + + def test_readability_missing_content_key_triggers_fallback(self): + """When Readability returns dict without 'content' key, fallback is triggered.""" + html = "" + "

data

" * 200 + "" + + call_count = [0] + + def mock_simple_json(h, use_readability=True): + call_count[0] += 1 + if call_count[0] == 1: + return {} # no "content" key at all + else: + return {"content": "
" + "

data

" * 200 + "
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + assert "data" in result + + def test_threshold_is_one_percent_of_html(self): + """The fallback threshold is 1% of the input HTML length.""" + padding = "x" * 9000 + html = f'
{padding}

tiny

' + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "

tiny

"} + else: + return {"content": f"
{padding}

tiny

"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + # "tiny" (4 chars) < 1% of ~9100 chars → fallback triggered + assert len(result.strip()) > 50 + + def test_content_at_threshold_boundary_no_fallback(self): + """Content exactly at the 1% threshold does not trigger fallback.""" + # Build HTML of known size, readability returns content exactly at threshold + filler = "a" * 1000 + html = f"

{filler}

" + # 1% of ~1030 chars ≈ 10 chars; return content well above that + readability_content = f"

{filler}

" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": readability_content} + result = extract_content_from_html(html) + assert mock_readability.call_count == 1 + assert "a" in result + + def test_whitespace_only_readability_output_triggers_fallback(self): + """Readability returning whitespace-only content triggers fallback.""" + html = "" + "

payload

" * 300 + "" + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "
\n\t
"} + else: + return {"content": "
" + "

payload

" * 300 + "
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + assert "payload" in result + + def test_no_readability_also_returns_too_little_falls_to_raw(self): + """When stage 2 (no-readability) also returns too little, raw markdownify is used.""" + html = "" + "

deep content

" * 400 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + # Both calls return minimal content + mock_readability.return_value = {"content": "

x

"} + result = extract_content_from_html(html) + assert "deep content" in result + assert mock_readability.call_count == 2 + + def test_no_readability_returns_none_falls_to_raw(self): + """When stage 2 (no-readability) returns None content, raw markdownify is used.""" + html = "" + "

fallback target

" * 200 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": None} + result = extract_content_from_html(html) + assert "fallback target" in result + + def test_empty_html_returns_empty_string(self): + """Empty HTML input produces empty output, no error tags.""" + result = extract_content_from_html("") + assert "" not in result + + def test_small_html_min_length_is_one(self): + """For very small HTML, min_length floors at 1 so fallback still works.""" + html = "

Hi

" result = extract_content_from_html(html) - assert "normal page" in result.lower() assert "" not in result def test_hidden_ssr_content_triggers_fallback(self): @@ -362,28 +487,8 @@ def test_hidden_ssr_content_triggers_fallback(self): """.format(visible_shell, hidden_content) result = extract_content_from_html(html) - # Fallback should recover the hidden content assert len(result.strip()) > len(html) // 100 - def test_readability_empty_content_triggers_fallback(self): - """Test that readability returning empty content triggers fallback.""" - content_text = " ".join(["meaningful"] * 200) - html = """ - - -

{}

- - - """.format(content_text) - with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: - # First call (use_readability=True) returns empty content - # Second call (use_readability=False) returns empty content too - mock_readability.return_value = {"content": None} - result = extract_content_from_html(html) - # Fallback to raw markdownify should recover content - assert "meaningful" in result - assert "" not in result - def test_small_visible_shell_large_hidden_ssr(self): """Test realistic SSR pattern: small visible loading shell + large hidden content.""" ssr_paragraphs = "\n".join( @@ -407,5 +512,47 @@ def test_small_visible_shell_large_hidden_ssr(self): """.format(ssr_paragraphs) result = extract_content_from_html(html) - # Should not return just "Loading application..." — fallback should recover content assert len(result.strip()) > len(html) // 100 + + def test_opacity_zero_hidden_content(self): + """Content hidden via opacity:0 should be recovered by fallback.""" + hidden_text = " ".join(["secret"] * 300) + html = """ + + +
Visible shell
+

{}

+ + + """.format(hidden_text) + result = extract_content_from_html(html) + assert len(result.strip()) > len(html) // 100 + + def test_result_never_contains_error_tags(self): + """No code path in the updated function should produce tags.""" + cases = [ + "", + "", + "", + "

short

", + ] + for html in cases: + result = extract_content_from_html(html) + assert "" not in result, f"Got for input: {html!r}" + + def test_all_three_stages_called_when_needed(self): + """Verify the full three-stage cascade: readability → no-readability → raw markdownify.""" + html = "" + "

cascade

" * 400 + "" + + calls = [] + + def mock_simple_json(h, use_readability=True): + calls.append(use_readability) + return {"content": ""} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + # Stage 1: use_readability=True, Stage 2: use_readability=False + assert calls == [True, False] + # Stage 3: raw markdownify recovers content + assert "cascade" in result