diff --git a/src/fetch/src/mcp_server_fetch/server.py b/src/fetch/src/mcp_server_fetch/server.py index b42c7b1f6b..dc32afa5f9 100644 --- a/src/fetch/src/mcp_server_fetch/server.py +++ b/src/fetch/src/mcp_server_fetch/server.py @@ -27,6 +27,10 @@ def extract_content_from_html(html: str) -> str: """Extract and convert HTML content to Markdown format. + Uses readability for content extraction with fallback mechanisms for cases + where readability strips too much content (e.g. sites using hidden divs + for SSR hydration). + Args: html: Raw HTML content to process @@ -36,12 +40,38 @@ def extract_content_from_html(html: str) -> str: ret = readabilipy.simple_json.simple_json_from_html_string( html, use_readability=True ) - if not ret["content"]: - return "Page failed to be simplified from HTML" - content = markdownify.markdownify( - ret["content"], - heading_style=markdownify.ATX, - ) + content_html = ret.get("content", "") + if content_html: + content = markdownify.markdownify( + content_html, + heading_style=markdownify.ATX, + ) + else: + content = "" + + # If readability extracted very little text compared to the original HTML, + # it likely stripped meaningful content (e.g. hidden SSR hydration divs). + # Fall back to extraction without readability, then raw markdownify. + min_length = max(1, len(html) // 100) + content_text = content.strip() + if len(content_text) < min_length: + ret = readabilipy.simple_json.simple_json_from_html_string( + html, use_readability=False + ) + if ret["content"]: + content = markdownify.markdownify( + ret["content"], + heading_style=markdownify.ATX, + ) + if len(content.strip()) >= min_length: + return content + + # Last resort: convert the raw HTML directly + content = markdownify.markdownify( + html, + heading_style=markdownify.ATX, + ) + return content diff --git a/src/fetch/tests/test_server.py b/src/fetch/tests/test_server.py index 96c1cb38c7..ff43779d43 100644 --- a/src/fetch/tests/test_server.py +++ b/src/fetch/tests/test_server.py @@ -81,11 +81,11 @@ def test_html_with_links(self): result = extract_content_from_html(html) assert "Example" in result - def test_empty_content_returns_error(self): - """Test that empty/invalid HTML returns error message.""" - html = "" - result = extract_content_from_html(html) - assert "" in result + def test_empty_content(self): + """Test that empty HTML is handled gracefully via fallback.""" + result = extract_content_from_html("") + # Empty input produces empty output after fallback + assert "" not in result class TestCheckMayAutonomouslyFetchUrl: @@ -324,3 +324,235 @@ async def test_fetch_with_proxy(self): # Verify AsyncClient was called with proxy mock_client_class.assert_called_once_with(proxy="http://proxy.example.com:8080") + + +class TestExtractContentFromHtmlFallback: + """Tests for extract_content_from_html readability fallback.""" + + def test_readability_sufficient_content_no_fallback(self): + """When Readability returns enough content, no fallback is triggered.""" + html = "" + "

word

" * 200 + "" + readability_content = "
" + "

word

" * 200 + "
" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": readability_content} + result = extract_content_from_html(html) + assert mock_readability.call_count == 1 + assert "word" in result + + def test_readability_strips_content_falls_back_to_no_readability(self): + """When Readability returns too little, falls back to non-Readability extraction.""" + html = "" + "

content

" * 500 + "" + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "
Loading...
"} + else: + return {"content": "
" + "

content

" * 500 + "
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + assert "content" in result + assert len(result.strip()) > 100 + + def test_both_readability_modes_fail_falls_back_to_markdownify(self): + """When both readabilipy modes return too little, falls back to raw markdownify.""" + html = "" + "

important data

" * 300 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": ""} + result = extract_content_from_html(html) + assert "important data" in result + assert mock_readability.call_count == 2 + + def test_readability_none_content_triggers_fallback(self): + """When Readability returns None content, fallback is triggered.""" + html = "" + "

real content

" * 200 + "" + + call_count = [0] + + def mock_simple_json(h, use_readability=True): + call_count[0] += 1 + if call_count[0] == 1: + return {"content": None} + else: + return {"content": "
" + "

real content

" * 200 + "
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + assert "real content" in result + + def test_readability_missing_content_key_triggers_fallback(self): + """When Readability returns dict without 'content' key, fallback is triggered.""" + html = "" + "

data

" * 200 + "" + + call_count = [0] + + def mock_simple_json(h, use_readability=True): + call_count[0] += 1 + if call_count[0] == 1: + return {} # no "content" key at all + else: + return {"content": "
" + "

data

" * 200 + "
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + assert "data" in result + + def test_threshold_is_one_percent_of_html(self): + """The fallback threshold is 1% of the input HTML length.""" + padding = "x" * 9000 + html = f'
{padding}

tiny

' + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "

tiny

"} + else: + return {"content": f"
{padding}

tiny

"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + # "tiny" (4 chars) < 1% of ~9100 chars → fallback triggered + assert len(result.strip()) > 50 + + def test_content_at_threshold_boundary_no_fallback(self): + """Content exactly at the 1% threshold does not trigger fallback.""" + # Build HTML of known size, readability returns content exactly at threshold + filler = "a" * 1000 + html = f"

{filler}

" + # 1% of ~1030 chars ≈ 10 chars; return content well above that + readability_content = f"

{filler}

" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": readability_content} + result = extract_content_from_html(html) + assert mock_readability.call_count == 1 + assert "a" in result + + def test_whitespace_only_readability_output_triggers_fallback(self): + """Readability returning whitespace-only content triggers fallback.""" + html = "" + "

payload

" * 300 + "" + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "
\n\t
"} + else: + return {"content": "
" + "

payload

" * 300 + "
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + assert "payload" in result + + def test_no_readability_also_returns_too_little_falls_to_raw(self): + """When stage 2 (no-readability) also returns too little, raw markdownify is used.""" + html = "" + "

deep content

" * 400 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + # Both calls return minimal content + mock_readability.return_value = {"content": "

x

"} + result = extract_content_from_html(html) + assert "deep content" in result + assert mock_readability.call_count == 2 + + def test_no_readability_returns_none_falls_to_raw(self): + """When stage 2 (no-readability) returns None content, raw markdownify is used.""" + html = "" + "

fallback target

" * 200 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": None} + result = extract_content_from_html(html) + assert "fallback target" in result + + def test_empty_html_returns_empty_string(self): + """Empty HTML input produces empty output, no error tags.""" + result = extract_content_from_html("") + assert "" not in result + + def test_small_html_min_length_is_one(self): + """For very small HTML, min_length floors at 1 so fallback still works.""" + html = "

Hi

" + result = extract_content_from_html(html) + assert "" not in result + + def test_hidden_ssr_content_triggers_fallback(self): + """Test that hidden SSR content triggers fallback extraction.""" + visible_shell = "

Loading...

" + hidden_content = "

{}

".format(" ".join(["word"] * 500)) + html = """ + + +
{}
+ + + + """.format(visible_shell, hidden_content) + result = extract_content_from_html(html) + assert len(result.strip()) > len(html) // 100 + + def test_small_visible_shell_large_hidden_ssr(self): + """Test realistic SSR pattern: small visible loading shell + large hidden content.""" + ssr_paragraphs = "\n".join( + "

Article paragraph {} with enough text to be meaningful content.

".format(i) + for i in range(50) + ) + html = """ + + SSR App + +
+
Loading application...
+
+
+
+

Full Article Title

+ {} +
+
+ + + """.format(ssr_paragraphs) + result = extract_content_from_html(html) + assert len(result.strip()) > len(html) // 100 + + def test_opacity_zero_hidden_content(self): + """Content hidden via opacity:0 should be recovered by fallback.""" + hidden_text = " ".join(["secret"] * 300) + html = """ + + +
Visible shell
+

{}

+ + + """.format(hidden_text) + result = extract_content_from_html(html) + assert len(result.strip()) > len(html) // 100 + + def test_result_never_contains_error_tags(self): + """No code path in the updated function should produce tags.""" + cases = [ + "", + "", + "", + "

short

", + ] + for html in cases: + result = extract_content_from_html(html) + assert "" not in result, f"Got for input: {html!r}" + + def test_all_three_stages_called_when_needed(self): + """Verify the full three-stage cascade: readability → no-readability → raw markdownify.""" + html = "" + "

cascade

" * 400 + "" + + calls = [] + + def mock_simple_json(h, use_readability=True): + calls.append(use_readability) + return {"content": ""} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + # Stage 1: use_readability=True, Stage 2: use_readability=False + assert calls == [True, False] + # Stage 3: raw markdownify recovers content + assert "cascade" in result