JimLiu-baoyu-skills/packages/baoyu-fetch/src/extract/html-to-markdown.ts

759 lines
21 KiB
TypeScript

import { Readability } from "@mozilla/readability";
import { Defuddle } from "defuddle/node";
import { JSDOM, VirtualConsole } from "jsdom";
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";
import { collectMediaFromMarkdown } from "../media/markdown-media";
import type { MediaAsset } from "../media/types";
import { cleanHtml } from "./html-cleaner";
export interface HtmlConversionMetadata {
url: string;
canonicalUrl?: string;
siteName?: string;
title?: string;
summary?: string;
author?: string;
publishedAt?: string;
coverImage?: string;
language?: string;
capturedAt: string;
}
export interface ConvertHtmlToMarkdownOptions {
enableRemoteMarkdownFallback?: boolean;
preserveBase64Images?: boolean;
}
export interface HtmlToMarkdownResult {
metadata: HtmlConversionMetadata;
markdown: string;
rawHtml: string;
cleanedHtml: string;
media: MediaAsset[];
conversionMethod: string;
fallbackReason?: string;
}
type JsonObject = Record<string, unknown>;
const MIN_CONTENT_LENGTH = 120;
const DEFUDDLE_API_ORIGIN = "https://defuddle.md";
const LOCAL_FALLBACK_SCORE_DELTA = 120;
const REMOTE_FALLBACK_SCORE_DELTA = 20;
const LOW_QUALITY_MARKERS = [
/Join The Conversation/i,
/One Community\. Many Voices/i,
/Read our community guidelines/i,
/Create a free account to share your thoughts/i,
/Become a Forbes Member/i,
/Subscribe to trusted journalism/i,
/\bComments\b/i,
];
const ARTICLE_TYPES = new Set([
"Article",
"NewsArticle",
"BlogPosting",
"WebPage",
"ReportageNewsArticle",
]);
const turndown = new TurndownService({
headingStyle: "atx",
bulletListMarker: "-",
codeBlockStyle: "fenced",
}) as TurndownService & {
remove(selectors: string[]): void;
addRule(
key: string,
rule: {
filter: string | ((node: Node) => boolean);
replacement: (content: string) => string;
},
): void;
};
turndown.use(gfm);
turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]);
turndown.addRule("collapseFigure", {
filter: "figure",
replacement(content: string) {
return `\n\n${content.trim()}\n\n`;
},
});
turndown.addRule("dropInvisibleAnchors", {
filter(node: Node) {
return (
node.nodeName === "A" &&
!(node as Element).textContent?.trim() &&
!(node as Element).querySelector("img, video, picture, source")
);
},
replacement() {
return "";
},
});
function pickString(...values: unknown[]): string | undefined {
for (const value of values) {
if (typeof value !== "string") {
continue;
}
const trimmed = value.trim();
if (trimmed) {
return trimmed;
}
}
return undefined;
}
function normalizeMarkdown(markdown: string): string {
return markdown
.replace(/\r\n/g, "\n")
.replace(/[ \t]+\n/g, "\n")
.replace(/\n{3,}/g, "\n\n")
.trim();
}
function stripWrappingQuotes(value: string): string {
const trimmed = value.trim();
if (
(trimmed.startsWith('"') && trimmed.endsWith('"')) ||
(trimmed.startsWith("'") && trimmed.endsWith("'"))
) {
return trimmed.slice(1, -1).trim();
}
return trimmed;
}
function stripMarkdownFrontmatter(markdown: string): string {
return markdown.replace(/^\uFEFF?---\n[\s\S]*?\n---(?:\n|$)/, "").trim();
}
function cleanMarkdownTitle(value: string): string | undefined {
const cleaned = stripWrappingQuotes(
value
.replace(/\s+#+\s*$/, "")
.replace(/!\[[^\]]*\]\([^)]+\)/g, "")
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
.replace(/[*_`~]/g, "")
.trim(),
);
return cleaned || undefined;
}
export function extractTitleFromMarkdownDocument(markdown: string): string | undefined {
const normalized = markdown.replace(/\r\n/g, "\n").trim();
if (!normalized) {
return undefined;
}
const frontmatterMatch = normalized.match(/^\uFEFF?---\n([\s\S]*?)\n---(?:\n|$)/);
if (frontmatterMatch) {
for (const line of frontmatterMatch[1].split("\n")) {
const match = line.match(/^title:\s*(.+?)\s*$/i);
if (!match) {
continue;
}
const title = cleanMarkdownTitle(match[1]);
if (title) {
return title;
}
}
}
const body = stripMarkdownFrontmatter(normalized);
const headingMatch = body.match(/^#{1,6}\s+(.+)$/m);
if (!headingMatch) {
return undefined;
}
return cleanMarkdownTitle(headingMatch[1]);
}
function trimKnownBoilerplate(markdown: string): string {
const normalized = normalizeMarkdown(markdown);
const lines = normalized.split("\n");
while (lines.length > 0) {
const lastLine = lines[lines.length - 1]?.trim();
if (!lastLine) {
lines.pop();
continue;
}
if (/^继续滑动看下一个$/.test(lastLine) || /^轻触阅读原文$/.test(lastLine)) {
lines.pop();
continue;
}
break;
}
return normalizeMarkdown(lines.join("\n"));
}
function buildDefuddleApiUrl(targetUrl: string): string {
return `${DEFUDDLE_API_ORIGIN}/${encodeURIComponent(targetUrl)}`;
}
async function fetchDefuddleApiMarkdown(
targetUrl: string,
): Promise<{ markdown: string; title?: string }> {
const response = await fetch(buildDefuddleApiUrl(targetUrl), {
headers: {
accept: "text/markdown,text/plain;q=0.9,*/*;q=0.1",
},
redirect: "follow",
});
if (!response.ok) {
throw new Error(`defuddle.md returned ${response.status} ${response.statusText}`);
}
const rawMarkdown = (await response.text()).replace(/\r\n/g, "\n").trim();
if (!rawMarkdown) {
throw new Error("defuddle.md returned empty markdown");
}
const title = extractTitleFromMarkdownDocument(rawMarkdown);
const markdown = trimKnownBoilerplate(stripMarkdownFrontmatter(rawMarkdown));
if (!markdown) {
throw new Error("defuddle.md returned empty markdown");
}
return {
markdown,
title,
};
}
function sanitizeHtmlFragment(html: string): string {
const dom = new JSDOM(`<div id="__root">${html}</div>`);
const root = dom.window.document.querySelector("#__root");
if (!root) {
return html;
}
for (const selector of ["script", "style", "iframe", "noscript", "template", "svg", "path"]) {
root.querySelectorAll(selector).forEach((element) => element.remove());
}
return root.innerHTML;
}
function extractTextFromHtml(html: string): string {
const dom = new JSDOM(`<!doctype html><html><body>${html}</body></html>`);
const { document } = dom.window;
for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
document.querySelectorAll(selector).forEach((element) => element.remove());
}
return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? "";
}
function getMetaContent(document: Document, names: string[]): string | undefined {
for (const name of names) {
const element =
document.querySelector(`meta[name="${name}"]`) ??
document.querySelector(`meta[property="${name}"]`);
const content = element?.getAttribute("content")?.trim();
if (content) {
return content;
}
}
return undefined;
}
function normalizeLanguageTag(value: string | null | undefined): string | undefined {
if (!value) {
return undefined;
}
const trimmed = value.trim();
if (!trimmed) {
return undefined;
}
const primary = trimmed.split(/[,\s;]/, 1)[0]?.trim();
if (!primary) {
return undefined;
}
return primary.replace(/_/g, "-");
}
function flattenJsonLdItems(data: unknown): JsonObject[] {
if (!data || typeof data !== "object") {
return [];
}
if (Array.isArray(data)) {
return data.flatMap(flattenJsonLdItems);
}
const item = data as JsonObject;
if (Array.isArray(item["@graph"])) {
return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
}
return [item];
}
function parseJsonLdScripts(document: Document): JsonObject[] {
const results: JsonObject[] = [];
document.querySelectorAll("script[type='application/ld+json']").forEach((script) => {
try {
const data = JSON.parse(script.textContent ?? "");
results.push(...flattenJsonLdItems(data));
} catch {
// Ignore malformed json-ld blocks.
}
});
return results;
}
function extractAuthorFromJsonLd(authorData: unknown): string | undefined {
if (typeof authorData === "string") {
return authorData.trim() || undefined;
}
if (!authorData || typeof authorData !== "object") {
return undefined;
}
if (Array.isArray(authorData)) {
return authorData
.map((author) => extractAuthorFromJsonLd(author))
.filter((value): value is string => Boolean(value))
.join(", ") || undefined;
}
const author = authorData as JsonObject;
return pickString(author.name);
}
function extractPrimaryJsonLdMeta(document: Document): Partial<HtmlConversionMetadata> {
for (const item of parseJsonLdScripts(document)) {
const type = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"];
if (typeof type !== "string" || !ARTICLE_TYPES.has(type)) {
continue;
}
return {
title: pickString(item.headline, item.name),
summary: pickString(item.description),
author: extractAuthorFromJsonLd(item.author),
publishedAt: pickString(item.datePublished, item.dateCreated),
coverImage: pickString(
item.image,
(item.image as JsonObject | undefined)?.url,
Array.isArray(item.image) ? item.image[0] : undefined,
),
};
}
return {};
}
function extractPageMetadata(
html: string,
url: string,
capturedAt: string,
): HtmlConversionMetadata {
const dom = new JSDOM(html, { url });
const { document } = dom.window;
const jsonLd = extractPrimaryJsonLdMeta(document);
return {
url,
canonicalUrl:
document.querySelector('link[rel="canonical"]')?.getAttribute("href")?.trim() ??
getMetaContent(document, ["og:url"]),
siteName: pickString(
getMetaContent(document, ["og:site_name"]),
document.querySelector('meta[name="application-name"]')?.getAttribute("content"),
),
title: pickString(
getMetaContent(document, ["og:title", "twitter:title"]),
jsonLd.title,
document.querySelector("h1")?.textContent,
document.title,
),
summary: pickString(
getMetaContent(document, ["description", "og:description", "twitter:description"]),
jsonLd.summary,
),
author: pickString(
getMetaContent(document, ["author", "article:author", "twitter:creator"]),
jsonLd.author,
),
publishedAt: pickString(
document.querySelector("time[datetime]")?.getAttribute("datetime"),
getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]),
jsonLd.publishedAt,
),
coverImage: pickString(
getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]),
jsonLd.coverImage,
),
language: pickString(
normalizeLanguageTag(document.documentElement.getAttribute("lang")),
normalizeLanguageTag(
pickString(
getMetaContent(document, ["language", "content-language", "og:locale"]),
document.querySelector("meta[http-equiv='content-language']")?.getAttribute("content"),
),
),
),
capturedAt,
};
}
function isMarkdownUsable(markdown: string, html: string): boolean {
const normalized = normalizeMarkdown(markdown);
if (!normalized) {
return false;
}
const htmlTextLength = extractTextFromHtml(html).length;
if (htmlTextLength < MIN_CONTENT_LENGTH) {
return true;
}
if (normalized.length >= 80) {
return true;
}
return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2));
}
function countMarkerHits(markdown: string, markers: RegExp[]): number {
let hits = 0;
for (const marker of markers) {
if (marker.test(markdown)) {
hits += 1;
}
}
return hits;
}
function countUsefulParagraphs(markdown: string): number {
const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/);
let count = 0;
for (const paragraph of paragraphs) {
const trimmed = paragraph.trim();
if (!trimmed) {
continue;
}
if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) {
continue;
}
if (/^#{1,6}\s+/.test(trimmed)) {
continue;
}
if ((trimmed.match(/\b[\p{L}\p{N}']+\b/gu) || []).length < 8) {
continue;
}
count += 1;
}
return count;
}
function scoreMarkdownQuality(markdown: string): number {
const normalized = normalizeMarkdown(markdown);
const wordCount = (normalized.match(/\b[\p{L}\p{N}']+\b/gu) || []).length;
const usefulParagraphs = countUsefulParagraphs(normalized);
const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length;
const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS);
return Math.min(wordCount, 4000) + usefulParagraphs * 40 + headingCount * 10 - markerHits * 180;
}
function shouldCompareWithFallback(markdown: string): boolean {
const normalized = normalizeMarkdown(markdown);
return countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0 || countUsefulParagraphs(normalized) < 6;
}
function hasMeaningfulMarkdownStructure(markdown: string): boolean {
const normalized = normalizeMarkdown(markdown);
if (!normalized) {
return false;
}
return (
countUsefulParagraphs(normalized) > 0 ||
/^#{1,6}\s+/m.test(normalized) ||
/^[-*]\s+/m.test(normalized) ||
/^\d+\.\s+/m.test(normalized) ||
/!\[[^\]]*\]\([^)]+\)/.test(normalized)
);
}
function shouldTryRemoteMarkdownFallback(
markdown: string,
html: string,
options: ConvertHtmlToMarkdownOptions,
): boolean {
if (!options.enableRemoteMarkdownFallback) {
return false;
}
return !isMarkdownUsable(markdown, html) || shouldCompareWithFallback(markdown);
}
function shouldPreferRemoteMarkdown(
current: HtmlToMarkdownResult,
remote: HtmlToMarkdownResult,
html: string,
): boolean {
if (!isMarkdownUsable(current.markdown, html)) {
return true;
}
if (!hasMeaningfulMarkdownStructure(current.markdown) && hasMeaningfulMarkdownStructure(remote.markdown)) {
return true;
}
return scoreMarkdownQuality(remote.markdown) > scoreMarkdownQuality(current.markdown) + REMOTE_FALLBACK_SCORE_DELTA;
}
function buildRemoteFallbackReason(current: HtmlToMarkdownResult, html: string): string {
if (!isMarkdownUsable(current.markdown, html)) {
return current.fallbackReason
? `Used defuddle.md markdown fallback after local extraction failed: ${current.fallbackReason}`
: "Used defuddle.md markdown fallback after local extraction returned empty or incomplete markdown";
}
return "defuddle.md produced higher-quality markdown than local extraction";
}
async function tryDefuddleConversion(
html: string,
url: string,
baseMetadata: HtmlConversionMetadata,
): Promise<{ ok: true; result: HtmlToMarkdownResult } | { ok: false; reason: string }> {
try {
const virtualConsole = new VirtualConsole();
virtualConsole.on("jsdomError", (error: Error & { type?: string }) => {
if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) {
return;
}
});
const dom = new JSDOM(html, { url, virtualConsole });
const result = await Defuddle(dom, url, { markdown: true });
const markdown = trimKnownBoilerplate(result.content || "");
if (!isMarkdownUsable(markdown, html)) {
return { ok: false, reason: "Defuddle returned empty or incomplete markdown" };
}
const metadata: HtmlConversionMetadata = {
...baseMetadata,
title: pickString(result.title, baseMetadata.title),
summary: pickString(result.description, baseMetadata.summary),
author: pickString(result.author, baseMetadata.author),
publishedAt: pickString(result.published, baseMetadata.publishedAt),
coverImage: pickString(result.image, baseMetadata.coverImage),
language: pickString(result.language, baseMetadata.language),
};
return {
ok: true,
result: {
metadata,
markdown,
rawHtml: html,
cleanedHtml: html,
media: collectMediaFromMarkdown(markdown).concat(
metadata.coverImage
? [{ url: metadata.coverImage, kind: "image", role: "cover" as const }]
: [],
),
conversionMethod: "defuddle",
},
};
} catch (error) {
return {
ok: false,
reason: error instanceof Error ? error.message : String(error),
};
}
}
async function tryDefuddleApiConversion(
html: string,
url: string,
baseMetadata: HtmlConversionMetadata,
): Promise<{ ok: true; result: HtmlToMarkdownResult } | { ok: false; reason: string }> {
try {
const result = await fetchDefuddleApiMarkdown(url);
const markdown = result.markdown;
if (!isMarkdownUsable(markdown, html) && scoreMarkdownQuality(markdown) < 80) {
return { ok: false, reason: "defuddle.md returned empty or incomplete markdown" };
}
const metadata: HtmlConversionMetadata = {
...baseMetadata,
title: pickString(result.title, baseMetadata.title),
};
return {
ok: true,
result: {
metadata,
markdown,
rawHtml: html,
cleanedHtml: html,
media: collectMediaFromMarkdown(markdown).concat(
metadata.coverImage
? [{ url: metadata.coverImage, kind: "image", role: "cover" as const }]
: [],
),
conversionMethod: "defuddle-api",
},
};
} catch (error) {
return {
ok: false,
reason: error instanceof Error ? error.message : String(error),
};
}
}
function convertHtmlFragmentToMarkdown(html: string): string {
if (!html.trim()) {
return "";
}
try {
return turndown.turndown(sanitizeHtmlFragment(html));
} catch {
return "";
}
}
function fallbackPlainText(html: string): string {
return trimKnownBoilerplate(extractTextFromHtml(html));
}
function convertWithReadability(
rawHtml: string,
cleanedHtml: string,
url: string,
baseMetadata: HtmlConversionMetadata,
): HtmlToMarkdownResult {
const dom = new JSDOM(cleanedHtml, { url });
const document = dom.window.document;
const article = new Readability(document).parse();
const contentHtml =
article?.content?.trim() ??
document.querySelector("main")?.innerHTML?.trim() ??
document.body?.innerHTML?.trim() ??
"";
let markdown = contentHtml ? convertHtmlFragmentToMarkdown(contentHtml) : "";
if (!markdown) {
markdown = fallbackPlainText(cleanedHtml);
}
const metadata: HtmlConversionMetadata = {
...baseMetadata,
title: pickString(article?.title, baseMetadata.title),
summary: pickString(article?.excerpt, baseMetadata.summary),
author: pickString(article?.byline, baseMetadata.author),
};
const media = collectMediaFromMarkdown(markdown);
if (metadata.coverImage) {
media.unshift({
url: metadata.coverImage,
kind: "image",
role: "cover",
});
}
return {
metadata,
markdown: trimKnownBoilerplate(markdown),
rawHtml,
cleanedHtml,
media,
conversionMethod: article?.content ? "legacy:readability" : "legacy:body",
};
}
export async function convertHtmlToMarkdown(
html: string,
url: string,
options: ConvertHtmlToMarkdownOptions = {},
): Promise<HtmlToMarkdownResult> {
const capturedAt = new Date().toISOString();
const baseMetadata = extractPageMetadata(html, url, capturedAt);
let cleanedHtml = html;
try {
cleanedHtml = cleanHtml(html, url, {
removeBase64Images: !options.preserveBase64Images,
});
} catch {
cleanedHtml = html;
}
let selectedResult: HtmlToMarkdownResult;
const defuddleResult = await tryDefuddleConversion(cleanedHtml, url, baseMetadata);
if (defuddleResult.ok) {
if (shouldCompareWithFallback(defuddleResult.result.markdown)) {
const fallbackResult = convertWithReadability(html, cleanedHtml, url, baseMetadata);
if (
scoreMarkdownQuality(fallbackResult.markdown) >
scoreMarkdownQuality(defuddleResult.result.markdown) + LOCAL_FALLBACK_SCORE_DELTA
) {
selectedResult = {
...fallbackResult,
fallbackReason: "Readability/Turndown produced higher-quality markdown than Defuddle",
};
} else {
selectedResult = {
...defuddleResult.result,
rawHtml: html,
cleanedHtml,
};
}
} else {
selectedResult = {
...defuddleResult.result,
rawHtml: html,
cleanedHtml,
};
}
} else {
selectedResult = {
...convertWithReadability(html, cleanedHtml, url, baseMetadata),
fallbackReason: defuddleResult.reason,
};
}
if (!shouldTryRemoteMarkdownFallback(selectedResult.markdown, cleanedHtml, options)) {
return selectedResult;
}
const remoteDefuddleResult = await tryDefuddleApiConversion(cleanedHtml, url, baseMetadata);
if (!remoteDefuddleResult.ok || !shouldPreferRemoteMarkdown(selectedResult, remoteDefuddleResult.result, cleanedHtml)) {
return selectedResult;
}
return {
...remoteDefuddleResult.result,
rawHtml: html,
cleanedHtml,
fallbackReason: buildRemoteFallbackReason(selectedResult, cleanedHtml),
};
}