759 lines
21 KiB
TypeScript
759 lines
21 KiB
TypeScript
import { Readability } from "@mozilla/readability";
|
|
import { Defuddle } from "defuddle/node";
|
|
import { JSDOM, VirtualConsole } from "jsdom";
|
|
import TurndownService from "turndown";
|
|
import { gfm } from "turndown-plugin-gfm";
|
|
import { collectMediaFromMarkdown } from "../media/markdown-media";
|
|
import type { MediaAsset } from "../media/types";
|
|
import { cleanHtml } from "./html-cleaner";
|
|
|
|
/**
 * Page-level metadata collected while converting an HTML document.
 *
 * Only `url` and `capturedAt` are guaranteed; every other field is filled in
 * on a best-effort basis from meta tags, JSON-LD and the extractor output.
 */
export interface HtmlConversionMetadata {
  /** URL the conversion was requested for, exactly as passed by the caller. */
  url: string;
  /** Canonical URL advertised by the page (canonical link or `og:url`). */
  canonicalUrl?: string;
  /** Name of the publishing site (`og:site_name` or `application-name`). */
  siteName?: string;
  /** Best available document title. */
  title?: string;
  /** Short description or excerpt of the page. */
  summary?: string;
  /** Author name(s); several JSON-LD authors are joined with ", ". */
  author?: string;
  /** Publication timestamp as found in the page (not re-formatted here). */
  publishedAt?: string;
  /** URL of the image chosen as the cover. */
  coverImage?: string;
  /** Language tag of the page, with `_` separators rewritten to `-`. */
  language?: string;
  /** Timestamp string recording when the conversion was started. */
  capturedAt: string;
}
|
|
|
|
/** Optional switches accepted by `convertHtmlToMarkdown`. */
export interface ConvertHtmlToMarkdownOptions {
  /**
   * When truthy, the remote defuddle.md service may be consulted if the
   * locally produced markdown looks unusable or low quality.
   */
  enableRemoteMarkdownFallback?: boolean;
  /**
   * When truthy, the HTML cleaner is asked to keep base64-embedded images;
   * otherwise it is asked to remove them.
   */
  preserveBase64Images?: boolean;
}
|
|
|
|
/** Outcome of a single HTML → Markdown conversion. */
export interface HtmlToMarkdownResult {
  /** Metadata describing the converted page. */
  metadata: HtmlConversionMetadata;
  /** Final markdown text. */
  markdown: string;
  /** HTML as originally supplied to the converter. */
  rawHtml: string;
  /** HTML after the cleaning pass (equals `rawHtml` when cleaning failed). */
  cleanedHtml: string;
  /** Media assets referenced by the markdown, plus the cover image if any. */
  media: MediaAsset[];
  /**
   * Identifier of the strategy that produced `markdown`. This module emits
   * "defuddle", "defuddle-api", "legacy:readability" and "legacy:body".
   */
  conversionMethod: string;
  /** Human-readable explanation of why a fallback strategy was chosen. */
  fallbackReason?: string;
}
|
|
|
|
/** Loosely typed JSON object, used for parsed JSON-LD nodes. */
type JsonObject = Record<string, unknown>;

/** Source pages with less visible text than this accept any non-empty markdown. */
const MIN_CONTENT_LENGTH = 120;

/** Origin of the remote markdown extraction service. */
const DEFUDDLE_API_ORIGIN = "https://defuddle.md";

/** Score margin the Readability result must exceed to replace the Defuddle result. */
const LOCAL_FALLBACK_SCORE_DELTA = 120;

/** Score margin the remote result must exceed to replace the local result. */
const REMOTE_FALLBACK_SCORE_DELTA = 20;

/**
 * Phrases that indicate comment widgets or subscription prompts leaked into
 * the extracted text. Each matching pattern lowers the quality score.
 */
const LOW_QUALITY_MARKERS = [
  /Join The Conversation/i,
  /One Community\. Many Voices/i,
  /Read our community guidelines/i,
  /Create a free account to share your thoughts/i,
  /Become a Forbes Member/i,
  /Subscribe to trusted journalism/i,
  /\bComments\b/i,
];

/** JSON-LD `@type` values accepted as describing the page's main content. */
const ARTICLE_TYPES = new Set<string>([
  "Article",
  "NewsArticle",
  "BlogPosting",
  "WebPage",
  "ReportageNewsArticle",
]);
|
|
|
|
/**
 * TurndownService widened with the `remove` / `addRule` signatures this
 * module relies on.
 */
type ExtendedTurndownService = TurndownService & {
  remove(selectors: string[]): void;
  addRule(
    key: string,
    rule: {
      filter: string | ((node: Node) => boolean);
      replacement: (content: string) => string;
    },
  ): void;
};

/** Shared HTML → Markdown converter: ATX headings, "-" bullets, fenced code. */
const turndown = new TurndownService({
  headingStyle: "atx",
  bulletListMarker: "-",
  codeBlockStyle: "fenced",
}) as ExtendedTurndownService;

// GitHub-flavoured markdown extensions from turndown-plugin-gfm.
turndown.use(gfm);

// Non-content elements are dropped together with everything inside them.
turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]);

// Emit a <figure> as its own block, surrounded by blank lines.
turndown.addRule("collapseFigure", {
  filter: "figure",
  replacement: (content: string) => `\n\n${content.trim()}\n\n`,
});

// Anchors with neither text nor embedded media would render as empty links.
turndown.addRule("dropInvisibleAnchors", {
  filter: (node: Node) => {
    if (node.nodeName !== "A") {
      return false;
    }
    const anchor = node as Element;
    if (anchor.textContent?.trim()) {
      return false;
    }
    return !anchor.querySelector("img, video, picture, source");
  },
  replacement: () => "",
});
|
|
|
|
/**
 * Returns the first argument that is a string with non-whitespace content,
 * trimmed. Non-string arguments and blank strings are skipped.
 *
 * @param values Candidates in priority order.
 * @returns The first usable string, or `undefined` when there is none.
 */
function pickString(...values: unknown[]): string | undefined {
  return values
    .filter((candidate): candidate is string => typeof candidate === "string")
    .map((candidate) => candidate.trim())
    .find((candidate) => candidate.length > 0);
}
|
|
|
|
/**
 * Canonicalises markdown whitespace: CRLF becomes LF, spaces and tabs before
 * a newline are removed, runs of three or more newlines collapse to two, and
 * the result is trimmed.
 *
 * @param markdown Markdown text to normalise.
 * @returns The normalised markdown.
 */
function normalizeMarkdown(markdown: string): string {
  const unixNewlines = markdown.replace(/\r\n/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+\n/g, "\n");
  const collapsedGaps = withoutTrailingBlanks.replace(/\n{3,}/g, "\n\n");
  return collapsedGaps.trim();
}
|
|
|
|
/**
 * Trims the value and, when it both starts and ends with the same quote
 * character (`"` or `'`), removes that outer pair and trims again.
 *
 * @param value Text that may be wrapped in quotes.
 * @returns The unwrapped, trimmed text.
 */
function stripWrappingQuotes(value: string): string {
  const text = value.trim();
  for (const quote of ['"', "'"]) {
    if (text.startsWith(quote) && text.endsWith(quote)) {
      return text.slice(1, -1).trim();
    }
  }
  return text;
}
|
|
|
|
/**
 * Removes a leading `---` delimited frontmatter block (optionally preceded by
 * a byte-order mark) and trims the remainder. Expects LF line endings.
 *
 * @param markdown Markdown document.
 * @returns The document body without frontmatter.
 */
function stripMarkdownFrontmatter(markdown: string): string {
  const leadingFrontmatter = /^\uFEFF?---\n[\s\S]*?\n---(?:\n|$)/;
  const body = markdown.replace(leadingFrontmatter, "");
  return body.trim();
}
|
|
|
|
/**
 * Reduces a markdown heading or frontmatter value to plain title text.
 *
 * Closing `#` markers and images are removed, links are replaced by their
 * label, emphasis/code/strike characters are dropped, and one pair of
 * wrapping quotes is stripped.
 *
 * @param value Raw title text.
 * @returns The cleaned title, or `undefined` when nothing remains.
 */
function cleanMarkdownTitle(value: string): string | undefined {
  const withoutClosingHashes = value.replace(/\s+#+\s*$/, "");
  const withoutImages = withoutClosingHashes.replace(/!\[[^\]]*\]\([^)]+\)/g, "");
  const linksAsLabels = withoutImages.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1");
  const plainText = linksAsLabels.replace(/[*_`~]/g, "").trim();

  const title = stripWrappingQuotes(plainText);
  return title === "" ? undefined : title;
}
|
|
|
|
/**
 * Derives a title from a markdown document.
 *
 * Resolution order:
 * 1. the first `title:` entry in a leading frontmatter block that still has
 *    content after cleaning;
 * 2. the first ATX heading (`#` to `######`) in the body.
 *
 * Headings are matched one line at a time, so an empty `#` line can no longer
 * capture the paragraph that follows it, and `#` lines inside fenced code
 * blocks (for example shell comments) are ignored.
 *
 * @param markdown Markdown document, with or without frontmatter.
 * @returns The cleaned title, or `undefined` when none can be determined.
 */
export function extractTitleFromMarkdownDocument(markdown: string): string | undefined {
  const normalized = markdown.replace(/\r\n/g, "\n").trim();
  if (!normalized) {
    return undefined;
  }

  const frontmatterMatch = normalized.match(/^\uFEFF?---\n([\s\S]*?)\n---(?:\n|$)/);
  if (frontmatterMatch) {
    for (const line of (frontmatterMatch[1] ?? "").split("\n")) {
      const match = line.match(/^title:\s*(.+?)\s*$/i);
      if (!match) {
        continue;
      }

      const title = cleanMarkdownTitle(match[1] ?? "");
      if (title) {
        return title;
      }
    }
  }

  const body = stripMarkdownFrontmatter(normalized);

  // Marker of the code fence currently open, if any (e.g. "```" or "~~~~").
  let openFence: string | undefined;
  for (const line of body.split("\n")) {
    const fenceMatch = line.match(/^ {0,3}(`{3,}|~{3,})(.*)$/);
    if (fenceMatch) {
      const marker = fenceMatch[1] ?? "";
      const rest = fenceMatch[2] ?? "";
      if (openFence === undefined) {
        openFence = marker;
      } else if (
        marker[0] === openFence[0] &&
        marker.length >= openFence.length &&
        rest.trim() === ""
      ) {
        // A closing fence uses the same character, is at least as long as
        // the opener and carries no info string.
        openFence = undefined;
      }
      continue;
    }

    if (openFence !== undefined) {
      continue;
    }

    // [ \t]+ instead of \s+ keeps the match on a single line.
    const headingMatch = line.match(/^#{1,6}[ \t]+(.+)$/);
    if (headingMatch) {
      return cleanMarkdownTitle(headingMatch[1] ?? "");
    }
  }

  return undefined;
}
|
|
|
|
/**
 * Normalises the markdown and strips trailing lines that are blank or consist
 * solely of a known boilerplate phrase.
 *
 * The recognised phrases are "继续滑动看下一个" (roughly "keep swiping to see the
 * next one") and "轻触阅读原文" (roughly "tap to read the original").
 * NOTE(review): these look like mobile article-reader footers — confirm the
 * source platform.
 *
 * @param markdown Markdown text.
 * @returns Normalised markdown without the trailing boilerplate.
 */
function trimKnownBoilerplate(markdown: string): string {
  const trailingBoilerplate = [/^继续滑动看下一个$/, /^轻触阅读原文$/];
  const lines = normalizeMarkdown(markdown).split("\n");

  // Walk backwards from the end until a line worth keeping is found.
  let keep = lines.length;
  while (keep > 0) {
    const candidate = lines[keep - 1]?.trim();
    const removable = !candidate || trailingBoilerplate.some((pattern) => pattern.test(candidate));
    if (!removable) {
      break;
    }
    keep -= 1;
  }

  return normalizeMarkdown(lines.slice(0, keep).join("\n"));
}
|
|
|
|
/**
 * Builds the defuddle.md request URL for a page: the service origin followed
 * by the URI-component-encoded target URL as a single path segment.
 *
 * @param targetUrl URL of the page to extract.
 * @returns The request URL.
 */
function buildDefuddleApiUrl(targetUrl: string): string {
  const encodedTarget = encodeURIComponent(targetUrl);
  return [DEFUDDLE_API_ORIGIN, encodedTarget].join("/");
}
|
|
|
|
/**
 * Downloads the markdown rendition of a page from defuddle.md.
 *
 * The title is taken from the raw response (frontmatter or first heading);
 * the returned markdown has frontmatter and known trailing boilerplate
 * removed.
 *
 * @param targetUrl URL of the page to extract.
 * @returns The cleaned markdown and, when found, a title.
 * @throws Error when the response status is not OK, or when the body is empty
 *   before or after cleaning.
 */
async function fetchDefuddleApiMarkdown(
  targetUrl: string,
): Promise<{ markdown: string; title?: string }> {
  const endpoint = buildDefuddleApiUrl(targetUrl);
  const response = await fetch(endpoint, {
    headers: {
      accept: "text/markdown,text/plain;q=0.9,*/*;q=0.1",
    },
    redirect: "follow",
  });
  if (!response.ok) {
    throw new Error(`defuddle.md returned ${response.status} ${response.statusText}`);
  }

  const responseBody = await response.text();
  const rawMarkdown = responseBody.replace(/\r\n/g, "\n").trim();
  if (rawMarkdown.length === 0) {
    throw new Error("defuddle.md returned empty markdown");
  }

  const title = extractTitleFromMarkdownDocument(rawMarkdown);
  const bodyOnly = stripMarkdownFrontmatter(rawMarkdown);
  const markdown = trimKnownBoilerplate(bodyOnly);
  if (markdown.length === 0) {
    throw new Error("defuddle.md returned empty markdown");
  }

  return { markdown, title };
}
|
|
|
|
/**
 * Parses an HTML fragment inside a wrapper element, removes script, style,
 * iframe, noscript, template, svg and path elements, and serialises what is
 * left.
 *
 * @param html HTML fragment.
 * @returns The sanitised fragment, or the input unchanged if the wrapper
 *   element cannot be located after parsing.
 */
function sanitizeHtmlFragment(html: string): string {
  const { document } = new JSDOM(`<div id="__root">${html}</div>`).window;
  const wrapper = document.querySelector("#__root");
  if (wrapper === null) {
    return html;
  }

  const unwantedTags = ["script", "style", "iframe", "noscript", "template", "svg", "path"];
  const unwanted = unwantedTags.flatMap((tag) => Array.from(wrapper.querySelectorAll(tag)));
  for (const element of unwanted) {
    element.remove();
  }

  return wrapper.innerHTML;
}
|
|
|
|
/**
 * Returns the text content of an HTML fragment with script, style, noscript,
 * template, iframe, svg and path elements removed and every whitespace run
 * collapsed to a single space.
 *
 * @param html HTML fragment or document markup.
 * @returns The trimmed text, or `""` when there is none.
 */
function extractTextFromHtml(html: string): string {
  const { document } = new JSDOM(`<!doctype html><html><body>${html}</body></html>`).window;

  const nonContentTags = ["script", "style", "noscript", "template", "iframe", "svg", "path"];
  const nonContent = nonContentTags.flatMap((tag) => Array.from(document.querySelectorAll(tag)));
  for (const element of nonContent) {
    element.remove();
  }

  const text = document.body?.textContent;
  return text ? text.replace(/\s+/g, " ").trim() : "";
}
|
|
|
|
/**
 * Looks up `<meta>` content by key. For each key, in order, the first element
 * matching `meta[name=…]` (or, failing that, `meta[property=…]`) is read.
 *
 * @param document Document to query.
 * @param names Meta keys in priority order.
 * @returns The first non-empty trimmed `content` value, or `undefined`.
 */
function getMetaContent(document: Document, names: string[]): string | undefined {
  const readContent = (key: string): string | undefined => {
    const tag =
      document.querySelector(`meta[name="${key}"]`) ??
      document.querySelector(`meta[property="${key}"]`);
    return tag?.getAttribute("content")?.trim();
  };

  return names.map(readContent).find((content) => Boolean(content));
}
|
|
|
|
/**
 * Extracts the first language tag from a raw attribute or header-style value
 * and rewrites `_` separators to `-` (for example `en_US` → `en-US`).
 *
 * Only the text before the first comma, semicolon or whitespace is kept.
 *
 * @param value Raw language value; may be missing.
 * @returns The normalised tag, or `undefined` when no tag is present.
 */
function normalizeLanguageTag(value: string | null | undefined): string | undefined {
  const trimmed = value?.trim();
  if (!trimmed) {
    return undefined;
  }

  const [firstSegment] = trimmed.split(/[,\s;]/, 1);
  const primary = firstSegment?.trim();
  return primary ? primary.replace(/_/g, "-") : undefined;
}
|
|
|
|
/**
 * Flattens parsed JSON-LD into a list of node objects.
 *
 * Arrays are flattened recursively. An object carrying an `@graph` array is
 * replaced by the flattened contents of that graph. Primitives and `null`
 * contribute nothing.
 *
 * @param data Parsed JSON-LD value of any shape.
 * @returns The node objects found, in document order.
 */
function flattenJsonLdItems(data: unknown): JsonObject[] {
  if (Array.isArray(data)) {
    return data.flatMap((entry) => flattenJsonLdItems(entry));
  }
  if (data === null || typeof data !== "object") {
    return [];
  }

  const node = data as JsonObject;
  const graph = node["@graph"];
  return Array.isArray(graph) ? graph.flatMap((entry) => flattenJsonLdItems(entry)) : [node];
}
|
|
|
|
/**
 * Parses every `application/ld+json` script in the document and returns the
 * flattened node objects. Blocks that fail to parse are skipped.
 *
 * @param document Document to scan.
 * @returns JSON-LD nodes in document order.
 */
function parseJsonLdScripts(document: Document): JsonObject[] {
  const scripts = Array.from(document.querySelectorAll("script[type='application/ld+json']"));

  return scripts.flatMap((script) => {
    try {
      const parsed: unknown = JSON.parse(script.textContent ?? "");
      return flattenJsonLdItems(parsed);
    } catch {
      // Malformed JSON-LD is ignored; the remaining blocks may still be valid.
      return [];
    }
  });
}
|
|
|
|
/**
 * Resolves a JSON-LD `author` value to a display string.
 *
 * A string is used as-is (trimmed), an object contributes its `name`, and an
 * array is resolved element by element with the results joined by ", ".
 *
 * @param authorData Raw `author` value.
 * @returns The author name(s), or `undefined` when none can be read.
 */
function extractAuthorFromJsonLd(authorData: unknown): string | undefined {
  if (typeof authorData === "string") {
    const name = authorData.trim();
    return name === "" ? undefined : name;
  }

  if (Array.isArray(authorData)) {
    const names: string[] = [];
    for (const entry of authorData) {
      const name = extractAuthorFromJsonLd(entry);
      if (name) {
        names.push(name);
      }
    }
    return names.length > 0 ? names.join(", ") : undefined;
  }

  if (authorData !== null && typeof authorData === "object") {
    return pickString((authorData as JsonObject).name);
  }

  return undefined;
}
|
|
|
|
/**
 * Reads metadata from the first JSON-LD node whose `@type` is one of
 * `ARTICLE_TYPES`.
 *
 * `@type` may be a string or an array; for an array, any listed type may
 * match (previously only the first entry was checked).
 *
 * @param document Document whose JSON-LD scripts are inspected.
 * @returns Title, summary, author, publication date and cover image where
 *   present, or an empty object when no suitable node exists.
 */
function extractPrimaryJsonLdMeta(document: Document): Partial<HtmlConversionMetadata> {
  for (const item of parseJsonLdScripts(document)) {
    const rawType = item["@type"];
    const types: unknown[] = Array.isArray(rawType) ? rawType : [rawType];
    const isArticleLike = types.some(
      (type) => typeof type === "string" && ARTICLE_TYPES.has(type),
    );
    if (!isArticleLike) {
      continue;
    }

    return {
      title: pickString(item.headline, item.name),
      summary: pickString(item.description),
      author: extractAuthorFromJsonLd(item.author),
      publishedAt: pickString(item.datePublished, item.dateCreated),
      coverImage: extractJsonLdImageUrl(item.image),
    };
  }

  return {};
}

/**
 * Resolves a JSON-LD `image` value to a URL. Accepts a URL string, an object
 * exposing `url` or `contentUrl`, or an array of either (first usable entry
 * wins).
 */
function extractJsonLdImageUrl(image: unknown): string | undefined {
  if (typeof image === "string") {
    return pickString(image);
  }

  if (Array.isArray(image)) {
    for (const entry of image) {
      const url = extractJsonLdImageUrl(entry);
      if (url) {
        return url;
      }
    }
    return undefined;
  }

  if (image !== null && typeof image === "object") {
    const node = image as JsonObject;
    return pickString(node.url, node.contentUrl);
  }

  return undefined;
}
|
|
|
|
/**
 * Collects page metadata from `<link>`/`<meta>` tags, JSON-LD and basic
 * document structure.
 *
 * For each field the sources are tried in the order listed in the code and
 * the first non-blank value wins. The canonical URL goes through the same
 * blank-skipping lookup, so an empty `href=""` on the canonical link falls
 * back to `og:url` instead of yielding an empty string.
 *
 * @param html Full HTML of the page.
 * @param url URL the HTML was loaded from; also handed to JSDOM as the
 *   document URL.
 * @param capturedAt Capture timestamp, copied into the result unchanged.
 * @returns Metadata with `url` and `capturedAt` always set.
 */
function extractPageMetadata(
  html: string,
  url: string,
  capturedAt: string,
): HtmlConversionMetadata {
  const dom = new JSDOM(html, { url });
  const { document } = dom.window;
  const jsonLd = extractPrimaryJsonLdMeta(document);

  const meta = (...names: string[]): string | undefined => getMetaContent(document, names);
  const attribute = (selector: string, name: string): string | null | undefined =>
    document.querySelector(selector)?.getAttribute(name);

  return {
    url,
    canonicalUrl: pickString(attribute('link[rel="canonical"]', "href"), meta("og:url")),
    siteName: pickString(
      meta("og:site_name"),
      attribute('meta[name="application-name"]', "content"),
    ),
    title: pickString(
      meta("og:title", "twitter:title"),
      jsonLd.title,
      document.querySelector("h1")?.textContent,
      document.title,
    ),
    summary: pickString(
      meta("description", "og:description", "twitter:description"),
      jsonLd.summary,
    ),
    author: pickString(meta("author", "article:author", "twitter:creator"), jsonLd.author),
    publishedAt: pickString(
      attribute("time[datetime]", "datetime"),
      meta("article:published_time", "datePublished", "publishdate", "date"),
      jsonLd.publishedAt,
    ),
    coverImage: pickString(
      meta("og:image", "twitter:image", "twitter:image:src"),
      jsonLd.coverImage,
    ),
    // The <html lang> attribute takes precedence over meta-declared languages.
    language: pickString(
      normalizeLanguageTag(document.documentElement.getAttribute("lang")),
      normalizeLanguageTag(
        pickString(
          meta("language", "content-language", "og:locale"),
          attribute("meta[http-equiv='content-language']", "content"),
        ),
      ),
    ),
    capturedAt,
  };
}
|
|
|
|
/**
 * Decides whether extracted markdown is substantial enough to keep.
 *
 * Empty markdown is never usable. Otherwise it is accepted when the source
 * HTML has fewer than `MIN_CONTENT_LENGTH` characters of text, when the
 * markdown has at least 80 characters, or when its length reaches 20% of the
 * source text length (capped at 200).
 *
 * @param markdown Candidate markdown.
 * @param html HTML the markdown was produced from.
 * @returns `true` when the markdown is considered usable.
 */
function isMarkdownUsable(markdown: string, html: string): boolean {
  const markdownLength = normalizeMarkdown(markdown).length;
  if (markdownLength === 0) {
    return false;
  }

  const htmlTextLength = extractTextFromHtml(html).length;
  if (htmlTextLength < MIN_CONTENT_LENGTH || markdownLength >= 80) {
    return true;
  }

  const requiredLength = Math.min(200, Math.floor(htmlTextLength * 0.2));
  return markdownLength >= requiredLength;
}
|
|
|
|
/**
 * Counts how many of the given patterns match the markdown. Each pattern
 * contributes at most one hit, however often it occurs.
 *
 * @param markdown Text to test.
 * @param markers Patterns to look for.
 * @returns Number of patterns with at least one match.
 */
function countMarkerHits(markdown: string, markers: RegExp[]): number {
  return markers.reduce((hits, marker) => (marker.test(markdown) ? hits + 1 : hits), 0);
}
|
|
|
|
/** Single Han, Hiragana or Katakana character; each is counted as one word. */
const PARAGRAPH_CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/gu;

/** A run of letters/digits (apostrophes allowed after the first character). */
const PARAGRAPH_WORD_RUN_PATTERN = /[\p{L}\p{N}][\p{L}\p{N}']*/gu;

/**
 * Counts words in a Unicode-aware way.
 *
 * `\b` only recognises ASCII word characters, so a `\b`-anchored pattern
 * reports zero words for CJK or Cyrillic text and loses words adjacent to
 * `_` emphasis markers. Here CJK characters are counted individually (those
 * scripts are written without spaces) and every other letter/digit run counts
 * as one word.
 */
function countParagraphWords(text: string): number {
  const cjkCount = text.match(PARAGRAPH_CJK_CHAR_PATTERN)?.length ?? 0;
  const withoutCjk = text.replace(PARAGRAPH_CJK_CHAR_PATTERN, " ");
  const runCount = withoutCjk.match(PARAGRAPH_WORD_RUN_PATTERN)?.length ?? 0;
  return cjkCount + runCount;
}

/**
 * Counts paragraphs that look like real prose.
 *
 * Paragraphs are separated by blank lines. A paragraph is skipped when it is
 * empty, consists of a single link or image, starts with an ATX heading
 * marker, or contains fewer than eight words.
 *
 * @param markdown Markdown text.
 * @returns Number of prose-like paragraphs.
 */
function countUsefulParagraphs(markdown: string): number {
  const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/);
  let count = 0;

  for (const paragraph of paragraphs) {
    const trimmed = paragraph.trim();
    if (!trimmed) {
      continue;
    }
    // A lone link or image carries no prose.
    if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) {
      continue;
    }
    if (/^#{1,6}\s+/.test(trimmed)) {
      continue;
    }
    if (countParagraphWords(trimmed) < 8) {
      continue;
    }
    count += 1;
  }

  return count;
}
|
|
|
|
/** Single Han, Hiragana or Katakana character; each is counted as one word. */
const QUALITY_CJK_CHAR_PATTERN = /[\p{Script=Han}\p{Script=Hiragana}\p{Script=Katakana}]/gu;

/** A run of letters/digits (apostrophes allowed after the first character). */
const QUALITY_WORD_RUN_PATTERN = /[\p{L}\p{N}][\p{L}\p{N}']*/gu;

/**
 * Unicode-aware word count used for scoring.
 *
 * `\b` only recognises ASCII word characters, so a `\b`-anchored pattern
 * reports zero words for CJK or Cyrillic text. CJK characters are counted
 * individually; every other letter/digit run counts as one word.
 */
function countQualityWords(text: string): number {
  const cjkCount = text.match(QUALITY_CJK_CHAR_PATTERN)?.length ?? 0;
  const withoutCjk = text.replace(QUALITY_CJK_CHAR_PATTERN, " ");
  const runCount = withoutCjk.match(QUALITY_WORD_RUN_PATTERN)?.length ?? 0;
  return cjkCount + runCount;
}

/**
 * Heuristic quality score for extracted markdown; higher is better.
 *
 * score = min(words, 4000) + 40 × useful paragraphs + 10 × headings
 *         − 180 × low-quality marker hits
 *
 * @param markdown Markdown to score.
 * @returns The score; it can be negative when markers dominate.
 */
function scoreMarkdownQuality(markdown: string): number {
  const normalized = normalizeMarkdown(markdown);
  const wordCount = countQualityWords(normalized);
  const usefulParagraphs = countUsefulParagraphs(normalized);
  const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length;
  const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS);
  return Math.min(wordCount, 4000) + usefulParagraphs * 40 + headingCount * 10 - markerHits * 180;
}
|
|
|
|
/**
 * Tells whether a result is suspicious enough to be compared against another
 * extractor: it contains a low-quality marker or has fewer than six useful
 * paragraphs.
 *
 * @param markdown Markdown to inspect.
 * @returns `true` when a comparison should be made.
 */
function shouldCompareWithFallback(markdown: string): boolean {
  const normalized = normalizeMarkdown(markdown);
  if (countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0) {
    return true;
  }
  return countUsefulParagraphs(normalized) < 6;
}
|
|
|
|
/**
 * Checks whether markdown contains any recognisable content: a useful
 * paragraph, an ATX heading, a bullet or numbered list item, or an image.
 *
 * @param markdown Markdown to inspect.
 * @returns `true` when at least one such element is present.
 */
function hasMeaningfulMarkdownStructure(markdown: string): boolean {
  const normalized = normalizeMarkdown(markdown);
  if (normalized.length === 0) {
    return false;
  }
  if (countUsefulParagraphs(normalized) > 0) {
    return true;
  }

  const structuralPatterns = [
    /^#{1,6}\s+/m, // heading
    /^[-*]\s+/m, // bullet list item
    /^\d+\.\s+/m, // numbered list item
    /!\[[^\]]*\]\([^)]+\)/, // image
  ];
  return structuralPatterns.some((pattern) => pattern.test(normalized));
}
|
|
|
|
/**
 * Decides whether the remote defuddle.md service should be consulted.
 *
 * The option must be enabled, and the local markdown must be either unusable
 * or suspicious enough to warrant a comparison.
 *
 * @param markdown Locally produced markdown.
 * @param html HTML the markdown was produced from.
 * @param options Conversion options.
 * @returns `true` when the remote fallback should be attempted.
 */
function shouldTryRemoteMarkdownFallback(
  markdown: string,
  html: string,
  options: ConvertHtmlToMarkdownOptions,
): boolean {
  if (options.enableRemoteMarkdownFallback) {
    return !isMarkdownUsable(markdown, html) || shouldCompareWithFallback(markdown);
  }
  return false;
}
|
|
|
|
/**
 * Chooses between the local and the remote conversion result.
 *
 * The remote result wins when the local markdown is unusable, when only the
 * remote markdown has meaningful structure, or when the remote score beats
 * the local score by more than `REMOTE_FALLBACK_SCORE_DELTA`.
 *
 * @param current Locally selected result.
 * @param remote Result obtained from defuddle.md.
 * @param html HTML used to judge usability.
 * @returns `true` when the remote result should replace the local one.
 */
function shouldPreferRemoteMarkdown(
  current: HtmlToMarkdownResult,
  remote: HtmlToMarkdownResult,
  html: string,
): boolean {
  const localIsUsable = isMarkdownUsable(current.markdown, html);
  if (!localIsUsable) {
    return true;
  }

  const onlyRemoteHasStructure =
    !hasMeaningfulMarkdownStructure(current.markdown) &&
    hasMeaningfulMarkdownStructure(remote.markdown);
  if (onlyRemoteHasStructure) {
    return true;
  }

  const remoteScore = scoreMarkdownQuality(remote.markdown);
  const localScore = scoreMarkdownQuality(current.markdown);
  return remoteScore > localScore + REMOTE_FALLBACK_SCORE_DELTA;
}
|
|
|
|
/**
 * Produces the `fallbackReason` text recorded when the remote result is used.
 *
 * @param current Local result that is being replaced.
 * @param html HTML used to judge whether the local markdown was usable.
 * @returns A message explaining why the remote markdown was chosen.
 */
function buildRemoteFallbackReason(current: HtmlToMarkdownResult, html: string): string {
  if (isMarkdownUsable(current.markdown, html)) {
    return "defuddle.md produced higher-quality markdown than local extraction";
  }

  if (current.fallbackReason) {
    return `Used defuddle.md markdown fallback after local extraction failed: ${current.fallbackReason}`;
  }
  return "Used defuddle.md markdown fallback after local extraction returned empty or incomplete markdown";
}
|
|
|
|
/**
 * Runs the local Defuddle extractor on the HTML.
 *
 * Never throws: failures, including exceptions from JSDOM or Defuddle, are
 * reported as `{ ok: false, reason }`.
 *
 * @param html HTML to convert; echoed back as both `rawHtml` and
 *   `cleanedHtml` of the result.
 * @param url Page URL, given to JSDOM and Defuddle.
 * @param baseMetadata Metadata used wherever Defuddle supplies no value.
 * @returns The conversion result, or the reason it was rejected.
 */
async function tryDefuddleConversion(
  html: string,
  url: string,
  baseMetadata: HtmlConversionMetadata,
): Promise<{ ok: true; result: HtmlToMarkdownResult } | { ok: false; reason: string }> {
  try {
    const virtualConsole = new VirtualConsole();
    virtualConsole.on("jsdomError", (error: Error & { type?: string }) => {
      const isCssParseNoise =
        error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message);
      if (isCssParseNoise) {
        return;
      }
      // NOTE(review): errors that are not CSS parse noise are dropped here as
      // well — confirm whether they were meant to be logged.
    });

    const dom = new JSDOM(html, { url, virtualConsole });
    const extracted = await Defuddle(dom, url, { markdown: true });
    const markdown = trimKnownBoilerplate(extracted.content || "");
    if (!isMarkdownUsable(markdown, html)) {
      return { ok: false, reason: "Defuddle returned empty or incomplete markdown" };
    }

    // Values reported by Defuddle take precedence over page-level metadata.
    const metadata: HtmlConversionMetadata = {
      ...baseMetadata,
      title: pickString(extracted.title, baseMetadata.title),
      summary: pickString(extracted.description, baseMetadata.summary),
      author: pickString(extracted.author, baseMetadata.author),
      publishedAt: pickString(extracted.published, baseMetadata.publishedAt),
      coverImage: pickString(extracted.image, baseMetadata.coverImage),
      language: pickString(extracted.language, baseMetadata.language),
    };

    // The cover image, when known, is appended after the inline media.
    const coverAssets: MediaAsset[] = metadata.coverImage
      ? [{ url: metadata.coverImage, kind: "image", role: "cover" as const }]
      : [];

    const result: HtmlToMarkdownResult = {
      metadata,
      markdown,
      rawHtml: html,
      cleanedHtml: html,
      media: collectMediaFromMarkdown(markdown).concat(coverAssets),
      conversionMethod: "defuddle",
    };
    return { ok: true, result };
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return { ok: false, reason };
  }
}
|
|
|
|
/**
 * Obtains markdown for the page from the remote defuddle.md service.
 *
 * Never throws: network errors and rejected output are reported as
 * `{ ok: false, reason }`. The remote markdown is rejected only when it is
 * both unusable relative to `html` and scores below 80.
 *
 * @param html Local HTML, used for the usability check and echoed back as
 *   `rawHtml` / `cleanedHtml`.
 * @param url Page URL sent to the service.
 * @param baseMetadata Metadata reused for the result; only the title may be
 *   overridden by the remote document.
 * @returns The conversion result, or the reason it was rejected.
 */
async function tryDefuddleApiConversion(
  html: string,
  url: string,
  baseMetadata: HtmlConversionMetadata,
): Promise<{ ok: true; result: HtmlToMarkdownResult } | { ok: false; reason: string }> {
  try {
    const remote = await fetchDefuddleApiMarkdown(url);
    const { markdown } = remote;

    const rejected = !isMarkdownUsable(markdown, html) && scoreMarkdownQuality(markdown) < 80;
    if (rejected) {
      return { ok: false, reason: "defuddle.md returned empty or incomplete markdown" };
    }

    const metadata: HtmlConversionMetadata = {
      ...baseMetadata,
      title: pickString(remote.title, baseMetadata.title),
    };

    // The cover image, when known, is appended after the inline media.
    const coverAssets: MediaAsset[] = metadata.coverImage
      ? [{ url: metadata.coverImage, kind: "image", role: "cover" as const }]
      : [];

    const result: HtmlToMarkdownResult = {
      metadata,
      markdown,
      rawHtml: html,
      cleanedHtml: html,
      media: collectMediaFromMarkdown(markdown).concat(coverAssets),
      conversionMethod: "defuddle-api",
    };
    return { ok: true, result };
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    return { ok: false, reason };
  }
}
|
|
|
|
/**
 * Converts an HTML fragment to markdown with the shared Turndown instance,
 * after stripping non-content elements.
 *
 * @param html HTML fragment.
 * @returns The markdown, or `""` for blank input or when conversion throws.
 */
function convertHtmlFragmentToMarkdown(html: string): string {
  if (html.trim().length === 0) {
    return "";
  }

  try {
    const sanitized = sanitizeHtmlFragment(html);
    return turndown.turndown(sanitized);
  } catch {
    // Best effort: the caller falls back to plain text on an empty result.
    return "";
  }
}
|
|
|
|
/**
 * Last-resort conversion: the visible text of the HTML with known trailing
 * boilerplate removed.
 *
 * @param html HTML to reduce to text.
 * @returns Plain text suitable for use as markdown.
 */
function fallbackPlainText(html: string): string {
  const visibleText = extractTextFromHtml(html);
  return trimKnownBoilerplate(visibleText);
}
|
|
|
|
/**
 * Converts HTML with Mozilla Readability followed by Turndown.
 *
 * Content is taken from, in order: the Readability article, the page's
 * `<main>` element, the page's `<body>`. If none of these yields markdown,
 * the plain text of the cleaned HTML is used.
 *
 * Fixes over the previous version:
 * - `Readability.parse()` modifies the document it is given, so the
 *   `<main>`/`<body>` fallback now reads from a freshly parsed document
 *   instead of the one Readability has already altered.
 * - An empty or whitespace-only `article.content` no longer short-circuits
 *   the fallback chain.
 *
 * @param rawHtml Original HTML, copied into the result.
 * @param cleanedHtml Cleaned HTML that is actually parsed.
 * @param url Page URL, used as the JSDOM document URL.
 * @param baseMetadata Metadata used wherever Readability supplies no value.
 * @returns The conversion result; `conversionMethod` is "legacy:readability"
 *   when Readability supplied the content and "legacy:body" otherwise.
 */
function convertWithReadability(
  rawHtml: string,
  cleanedHtml: string,
  url: string,
  baseMetadata: HtmlConversionMetadata,
): HtmlToMarkdownResult {
  const readabilityDocument = new JSDOM(cleanedHtml, { url }).window.document;
  const article = new Readability(readabilityDocument).parse();

  const articleHtml = pickString(article?.content);
  let contentHtml = articleHtml;
  if (contentHtml === undefined) {
    // Re-parse: the document handed to Readability may have been stripped.
    const pristineDocument = new JSDOM(cleanedHtml, { url }).window.document;
    contentHtml = pickString(
      pristineDocument.querySelector("main")?.innerHTML,
      pristineDocument.body?.innerHTML,
    );
  }

  let markdown = contentHtml ? convertHtmlFragmentToMarkdown(contentHtml) : "";
  if (!markdown) {
    markdown = fallbackPlainText(cleanedHtml);
  }

  const metadata: HtmlConversionMetadata = {
    ...baseMetadata,
    title: pickString(article?.title, baseMetadata.title),
    summary: pickString(article?.excerpt, baseMetadata.summary),
    author: pickString(article?.byline, baseMetadata.author),
  };

  const media = collectMediaFromMarkdown(markdown);
  if (metadata.coverImage) {
    // The cover image goes first in this code path.
    media.unshift({
      url: metadata.coverImage,
      kind: "image",
      role: "cover",
    });
  }

  return {
    metadata,
    markdown: trimKnownBoilerplate(markdown),
    rawHtml,
    cleanedHtml,
    media,
    conversionMethod: articleHtml ? "legacy:readability" : "legacy:body",
  };
}
|
|
|
|
/**
 * Picks the better of the two local strategies (Defuddle, Readability).
 *
 * Readability is used outright when Defuddle fails. When Defuddle succeeds
 * but its output looks suspicious, Readability replaces it only if its score
 * is higher by more than `LOCAL_FALLBACK_SCORE_DELTA`.
 */
async function selectLocalConversion(
  html: string,
  cleanedHtml: string,
  url: string,
  baseMetadata: HtmlConversionMetadata,
): Promise<HtmlToMarkdownResult> {
  const defuddleResult = await tryDefuddleConversion(cleanedHtml, url, baseMetadata);
  if (!defuddleResult.ok) {
    return {
      ...convertWithReadability(html, cleanedHtml, url, baseMetadata),
      fallbackReason: defuddleResult.reason,
    };
  }

  const defuddleMarkdown = defuddleResult.result.markdown;
  if (shouldCompareWithFallback(defuddleMarkdown)) {
    const fallbackResult = convertWithReadability(html, cleanedHtml, url, baseMetadata);
    const fallbackScore = scoreMarkdownQuality(fallbackResult.markdown);
    const defuddleScore = scoreMarkdownQuality(defuddleMarkdown);
    if (fallbackScore > defuddleScore + LOCAL_FALLBACK_SCORE_DELTA) {
      return {
        ...fallbackResult,
        fallbackReason: "Readability/Turndown produced higher-quality markdown than Defuddle",
      };
    }
  }

  return {
    ...defuddleResult.result,
    rawHtml: html,
    cleanedHtml,
  };
}

/**
 * Converts an HTML page to markdown and collects its metadata and media.
 *
 * Steps:
 * 1. Extract page metadata from the raw HTML.
 * 2. Clean the HTML; if cleaning throws, the raw HTML is used instead.
 * 3. Convert locally with Defuddle, falling back to or comparing against
 *    Readability + Turndown.
 * 4. If enabled and the local result is unusable or suspicious, fetch a
 *    remote conversion from defuddle.md and use it when it is judged better.
 *
 * @param html Raw HTML of the page.
 * @param url URL the HTML was loaded from.
 * @param options Optional behaviour switches.
 * @returns The selected conversion result; `rawHtml` is always the input HTML.
 */
export async function convertHtmlToMarkdown(
  html: string,
  url: string,
  options: ConvertHtmlToMarkdownOptions = {},
): Promise<HtmlToMarkdownResult> {
  const capturedAt = new Date().toISOString();
  const baseMetadata = extractPageMetadata(html, url, capturedAt);

  let cleanedHtml: string;
  try {
    cleanedHtml = cleanHtml(html, url, {
      removeBase64Images: !options.preserveBase64Images,
    });
  } catch {
    // Cleaning is best effort; continue with the untouched HTML.
    cleanedHtml = html;
  }

  const selectedResult = await selectLocalConversion(html, cleanedHtml, url, baseMetadata);

  if (!shouldTryRemoteMarkdownFallback(selectedResult.markdown, cleanedHtml, options)) {
    return selectedResult;
  }

  const remoteDefuddleResult = await tryDefuddleApiConversion(cleanedHtml, url, baseMetadata);
  if (!remoteDefuddleResult.ok) {
    return selectedResult;
  }
  if (!shouldPreferRemoteMarkdown(selectedResult, remoteDefuddleResult.result, cleanedHtml)) {
    return selectedResult;
  }

  return {
    ...remoteDefuddleResult.result,
    rawHtml: html,
    cleanedHtml,
    fallbackReason: buildRemoteFallbackReason(selectedResult, cleanedHtml),
  };
}
|