word
" * 200 + "" + readability_content = "word
" * 200 + "content
" * 500 + "" + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "content
" * 500 + "important data
" * 300 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": ""} + result = extract_content_from_html(html) + assert "important data" in result + assert mock_readability.call_count == 2 + + def test_readability_none_content_triggers_fallback(self): + """When Readability returns None content, fallback is triggered.""" + html = "" + "real content
" * 200 + "" + + call_count = [0] + + def mock_simple_json(h, use_readability=True): + call_count[0] += 1 + if call_count[0] == 1: + return {"content": None} + else: + return {"content": "real content
" * 200 + "data
" * 200 + "" + + call_count = [0] + + def mock_simple_json(h, use_readability=True): + call_count[0] += 1 + if call_count[0] == 1: + return {} # no "content" key at all + else: + return {"content": "data
" * 200 + "tiny
' + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "tiny
"} + else: + return {"content": f"tiny
"} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + # "tiny" (4 chars) < 1% of ~9100 chars → fallback triggered + assert len(result.strip()) > 50 + + def test_content_at_threshold_boundary_no_fallback(self): + """Content exactly at the 1% threshold does not trigger fallback.""" + # Build HTML of known size, readability returns content exactly at threshold + filler = "a" * 1000 + html = f"{filler}
" + # 1% of ~1030 chars ≈ 10 chars; return content well above that + readability_content = f"{filler}
" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": readability_content} + result = extract_content_from_html(html) + assert mock_readability.call_count == 1 + assert "a" in result + + def test_whitespace_only_readability_output_triggers_fallback(self): + """Readability returning whitespace-only content triggers fallback.""" + html = "" + "payload
" * 300 + "" + + def mock_simple_json(h, use_readability=True): + if use_readability: + return {"content": "payload
" * 300 + "deep content
" * 400 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + # Both calls return minimal content + mock_readability.return_value = {"content": "x
"} + result = extract_content_from_html(html) + assert "deep content" in result + assert mock_readability.call_count == 2 + + def test_no_readability_returns_none_falls_to_raw(self): + """When stage 2 (no-readability) returns None content, raw markdownify is used.""" + html = "" + "fallback target
" * 200 + "" + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string") as mock_readability: + mock_readability.return_value = {"content": None} + result = extract_content_from_html(html) + assert "fallback target" in result + + def test_empty_html_returns_empty_string(self): + """Empty HTML input produces empty output, no error tags.""" + result = extract_content_from_html("") + assert "Hi
" + result = extract_content_from_html(html) + assert "Loading...
" + hidden_content = "{}
".format(" ".join(["word"] * 500)) + html = """ + + +Article paragraph {} with enough text to be meaningful content.
".format(i) + for i in range(50) + ) + html = """ + +{}
short
", + ] + for html in cases: + result = extract_content_from_html(html) + assert "cascade
" * 400 + "" + + calls = [] + + def mock_simple_json(h, use_readability=True): + calls.append(use_readability) + return {"content": ""} + + with patch("mcp_server_fetch.server.readabilipy.simple_json.simple_json_from_html_string", side_effect=mock_simple_json): + result = extract_content_from_html(html) + # Stage 1: use_readability=True, Stage 2: use_readability=False + assert calls == [True, False] + # Stage 3: raw markdownify recovers content + assert "cascade" in result