diff --git a/netlify/edge-functions/markdown-negotiation.ts b/netlify/edge-functions/markdown-negotiation.ts index 3d7c59344..28f388b52 100644 --- a/netlify/edge-functions/markdown-negotiation.ts +++ b/netlify/edge-functions/markdown-negotiation.ts @@ -1,6 +1,34 @@ const MARKDOWN_CONTENT_TYPE = "text/markdown; charset=utf-8"; const TEXT_NODE = 3; const ELEMENT_NODE = 1; +const NOISE_SELECTORS = [ + "script", + "style", + "noscript", + "template", + "svg", + "header", + "footer", + "nav", + "aside", + ".site-header", + ".site-footer", + ".side-bar", + ".secondary-nav", + ".search-input-wrap", + ".social-media-list", + ".footer-nav", + ".skip-to-content-link", + ".header-anchor", + ".next-previous", + ".next-previous--feedback", + ".home-banner-info", + "#secondary-nav", + "#site-nav", + "#toggled-search", + "[aria-label='Main navigation']", + "[aria-label='Footer navigation']", +]; export default async (request: Request, context: { next: () => Promise }) => { const response = await context.next(); @@ -76,18 +104,15 @@ function htmlToMarkdown(html: string, baseUrl: URL): string { return htmlToMarkdownFallback(html, baseUrl); } - for (const selector of ["script", "style", "noscript"]) { - doc.querySelectorAll(selector).forEach((node: any) => node.remove()); - } + pruneDocument(doc); const title = normalizeWhitespace(doc.querySelector("title")?.textContent || ""); - const body = doc.body; - const bodyMarkdown = body ? renderChildren(body, baseUrl) : ""; + const contentRoot = selectContentRoot(doc); + const bodyMarkdown = contentRoot ? renderChildren(contentRoot, baseUrl) : ""; const parts: string[] = []; if (title) { parts.push(`---\ntitle: ${title}\n---`); - parts.push(`# ${title}`); } if (bodyMarkdown) { @@ -102,19 +127,20 @@ function htmlToMarkdownFallback(html: string, baseUrl: URL): string { .replace(//g, "") .replace(/]*>[\s\S]*?<\/script>/gi, "") .replace(/]*>[\s\S]*?<\/style>/gi, "") - .replace(/]*>[\s\S]*?<\/noscript>/gi, ""); + .replace(/]*>[\s\S]*?<\/noscript>/gi, "") + .replace(/]*>[\s\S]*?<\/svg>/gi, ""); const titleMatch = sanitized.match(/]*>([\s\S]*?)<\/title>/i); const title = titleMatch ? normalizeWhitespace(decodeHtmlEntities(stripTags(titleMatch[1]))) : ""; const bodyMatch = sanitized.match(/]*>([\s\S]*?)<\/body>/i); - const bodyHtml = bodyMatch ? bodyMatch[1] : sanitized; + const rawBodyHtml = bodyMatch ? bodyMatch[1] : sanitized; + const bodyHtml = extractPrimaryHtmlFragment(rawBodyHtml); const bodyMarkdown = renderHtmlFragmentToMarkdown(bodyHtml, baseUrl); const parts: string[] = []; if (title) { parts.push(`---\ntitle: ${title}\n---`); - parts.push(`# ${title}`); } if (bodyMarkdown) { @@ -126,7 +152,7 @@ function htmlToMarkdownFallback(html: string, baseUrl: URL): string { function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string { const protectedBlocks: string[] = []; - let content = html; + let content = stripFallbackNoise(html); // Protect code blocks so later tag cleanup does not alter their content. content = content.replace(/]*>([\s\S]*?)<\/pre>/gi, (_match, preContent: string) => { @@ -137,18 +163,10 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string { }); content = content.replace(/]*>([\s\S]*?)<\/h\1>/gi, (_match, level: string, text: string) => { - const clean = decodeHtmlEntities(stripTags(text)).trim(); + const clean = normalizeWhitespace(decodeHtmlEntities(stripTags(text))); return clean ? `\n\n${"#".repeat(Number(level))} ${clean}\n\n` : ""; }); - content = content.replace(/]*)>([\s\S]*?)<\/a>/gi, (_match, attrs: string, text: string) => { - const hrefMatch = attrs.match(/href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i); - const rawHref = hrefMatch ? hrefMatch[1] || hrefMatch[2] || hrefMatch[3] || "" : ""; - const href = toAbsoluteUrl(rawHref, baseUrl); - const label = normalizeWhitespace(decodeHtmlEntities(stripTags(text))) || href; - return href ? `[${label}](${href})` : label; - }); - content = content.replace(/]*)>/gi, (_match, attrs: string) => { const altMatch = attrs.match(/alt\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i); const srcMatch = attrs.match(/src\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i); @@ -159,6 +177,18 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string { return src ? `![${alt}](${src})` : ""; }); + content = content.replace(/]*)>([\s\S]*?)<\/a>/gi, (_match, attrs: string, text: string) => { + const hrefMatch = attrs.match(/href\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s>]+))/i); + const rawHref = hrefMatch ? hrefMatch[1] || hrefMatch[2] || hrefMatch[3] || "" : ""; + const href = toAbsoluteUrl(rawHref, baseUrl); + const label = normalizeWhitespace(decodeHtmlEntities(stripTags(text))); + if (!href || !label) { + return label; + } + + return `[${label}](${href})`; + }); + content = content .replace(/<(strong|b)\b[^>]*>([\s\S]*?)<\/\1>/gi, (_m, _tag: string, text: string) => { const clean = normalizeWhitespace(decodeHtmlEntities(stripTags(text))); @@ -185,7 +215,7 @@ function renderHtmlFragmentToMarkdown(html: string, baseUrl: URL): string { content = decodeHtmlEntities(stripTags(content)) .replace(/\n{3,}/g, "\n\n") .split("\n") - .map((line) => line.trimEnd()) + .map((line) => line.trim()) .join("\n") .trim(); @@ -244,8 +274,12 @@ function renderNode(node: any, baseUrl: URL, listDepth: number): string { } if (tag === "a") { - const text = renderInlineChildren(el, baseUrl) || normalizeWhitespace(el.getAttribute("href") || ""); + const text = renderInlineChildren(el, baseUrl); const href = toAbsoluteUrl(el.getAttribute("href"), baseUrl); + if (!text) { + return ""; + } + return href ? `[${text}](${href})` : text; } @@ -349,9 +383,20 @@ function renderInlineChildren(parent: any, baseUrl: URL): string { const tag = el.tagName.toLowerCase(); if (tag === "a") { - const text = renderInlineChildren(el, baseUrl) || normalizeWhitespace(el.getAttribute("href") || ""); + const text = renderInlineChildren(el, baseUrl); const href = toAbsoluteUrl(el.getAttribute("href"), baseUrl); - out.push(href ? `[${text}](${href})` : text); + if (text) { + out.push(href ? `[${text}](${href})` : text); + } + continue; + } + + if (tag === "img") { + const alt = normalizeWhitespace(el.getAttribute("alt") || "image"); + const src = toAbsoluteUrl(el.getAttribute("src"), baseUrl); + if (src) { + out.push(`![${alt}](${src})`); + } continue; } @@ -408,3 +453,70 @@ function toAbsoluteUrl(href: string | null, baseUrl: URL): string { function normalizeWhitespace(value: string): string { return value.replace(/\s+/g, " ").trim(); } + +function pruneDocument(doc: { querySelectorAll: (selector: string) => any }): void { + for (const selector of NOISE_SELECTORS) { + doc.querySelectorAll(selector).forEach((node: any) => node.remove()); + } +} + +function selectContentRoot(doc: { querySelector: (selector: string) => any; body?: any }): any { + const main = doc.querySelector("main"); + if (main && /\bhome-page\b/.test(main.getAttribute?.("class") || "")) { + return main; + } + + const preferredSelectors = [ + "#main article", + "main article", + "article.guide", + "article", + "#main", + "[role='main']", + "main", + "body", + ]; + + for (const selector of preferredSelectors) { + const found = doc.querySelector(selector); + if (found) { + return found; + } + } + + return doc.body || null; +} + +function extractPrimaryHtmlFragment(html: string): string { + const mainMatch = html.match(/]*>([\s\S]*?)<\/main>/i); + const mainHtml = mainMatch ? mainMatch[1] : ""; + const isHome = /]*class=("|')[^"']*\bhome-page\b[^"']*\1/i.test(html); + + if (isHome && mainHtml) { + return mainHtml; + } + + const articleMatch = html.match(/]*>([\s\S]*?)<\/article>/i); + if (articleMatch) { + return articleMatch[1]; + } + + if (mainHtml) { + return mainHtml; + } + + return html; +} + +function stripFallbackNoise(html: string): string { + return html + .replace(/]*>[\s\S]*?<\/header>/gi, "") + .replace(/]*>[\s\S]*?<\/footer>/gi, "") + .replace(/]*>[\s\S]*?<\/nav>/gi, "") + .replace(/]*>[\s\S]*?<\/aside>/gi, "") + .replace(/]*class=("|')[^"']*\bnext-previous\b[^"']*\1[^>]*>[\s\S]*?<\/div>/gi, "") + .replace(/]*class=("|')[^"']*\bhome-banner-info\b[^"']*\1[^>]*>[\s\S]*?<\/section>/gi, "") + .replace(/]*class=("|')[^"']*\bside-bar\b[^"']*\1[^>]*>[\s\S]*?<\/div>/gi, "") + .replace(/]*class=("|')[^"']*\bskip-to-content-link\b[^"']*\1[^>]*>[\s\S]*?<\/a>/gi, "") + .replace(/]*class=("|')[^"']*\bheader-anchor\b[^"']*\1[^>]*>[\s\S]*?<\/a>/gi, ""); +}