JimLiu-baoyu-skills/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts

import { parseHTML } from "linkedom";
import { Readability } from "@mozilla/readability";
import TurndownService from "turndown";
import { gfm } from "turndown-plugin-gfm";

/**
 * Metadata describing a captured page. Rendered as YAML front matter by
 * `formatMetadataYaml`.
 */
export interface PageMetadata {
  /** URL of the page, as passed in by the caller. */
  url: string;
  /** Page title; the extraction code falls back to an empty string when none is found. */
  title: string;
  /** Short description or excerpt, when one could be determined. */
  description?: string;
  /** Author / byline, when one could be determined. */
  author?: string;
  /** Publication timestamp as found in the page (kept as the raw string). */
  published?: string;
  /** URL of a representative image, when one could be determined. */
  coverImage?: string;
  /** Capture timestamp; `extractContent` fills it with `new Date().toISOString()`. */
  captured_at: string;
}

/** Outcome of converting one HTML page to Markdown. */
export interface ConversionResult {
  /** Metadata merged from the page head and from the extractor that produced the markdown. */
  metadata: PageMetadata;
  /** Normalized Markdown body (without front matter). */
  markdown: string;
  /** The HTML string that was given to the converter, unchanged. */
  rawHtml: string;
  /** Which pipeline produced the markdown: "defuddle", "legacy:<method>" or "legacy:plain-text". */
  conversionMethod: string;
  /** Set when the legacy extractor was used instead of Defuddle; explains why. */
  fallbackReason?: string;
}

/**
 * One attempt at locating the main content of a page. Each legacy extraction
 * strategy returns one of these (or null), and `pickBestCandidate` chooses
 * between them.
 */
interface ExtractionCandidate {
  /** Title reported by the strategy, if any. */
  title: string | null;
  /** Author / byline reported by the strategy, if any. */
  byline: string | null;
  /** Short summary, if any. */
  excerpt: string | null;
  /** Publication timestamp string, if any. */
  published: string | null;
  /** Content as HTML; null when the strategy only produced plain text. */
  html: string | null;
  /** Plain-text form of the content; its length drives candidate selection. */
  textContent: string;
  /** Strategy identifier, e.g. "readability", "next-data", "json-ld", "selector:<css>", "body-fallback". */
  method: string;
}

/** Loosely typed object used for parsed JSON (JSON-LD items, `__NEXT_DATA__` nodes). */
type AnyRecord = Record<string, unknown>;

/** Minimum plain-text length (in characters) for an extraction candidate to be accepted. */
const MIN_CONTENT_LENGTH = 120;
/** Plain-text length at which `pickBestCandidate` accepts a candidate in its first pass. */
const GOOD_CONTENT_LENGTH = 900;

/**
 * CSS selectors probed, in this order, by `trySelectorExtraction`. The first
 * selector whose cleaned text reaches MIN_CONTENT_LENGTH is used.
 */
const CONTENT_SELECTORS = [
  "article",
  "main article",
  "[role='main'] article",
  "[itemprop='articleBody']",
  ".article-content",
  ".article-body",
  ".post-content",
  ".entry-content",
  ".story-body",
  "main",
  "[role='main']",
  "#content",
  ".content",
];

/**
 * CSS selectors for elements stripped from a cloned content subtree before it
 * is measured and converted (used by `trySelectorExtraction` and
 * `tryBodyExtraction`).
 */
const REMOVE_SELECTORS = [
  "script",
  "style",
  "noscript",
  "template",
  "iframe",
  "svg",
  "path",
  "nav",
  "aside",
  "footer",
  "header",
  "form",
  ".advertisement",
  ".ads",
  ".social-share",
  ".related-articles",
  ".comments",
  ".newsletter",
  ".cookie-banner",
  ".cookie-consent",
  "[role='navigation']",
  "[aria-label*='cookie' i]",
];

/**
 * Selectors checked in order by `extractPublishedTime`; the value is read from
 * the element's `content` attribute, or `datetime` when `content` is absent.
 */
const PUBLISHED_TIME_SELECTORS = [
  "meta[property='article:published_time']",
  "meta[name='pubdate']",
  "meta[name='publishdate']",
  "meta[name='date']",
  "time[datetime]",
];

/** JSON-LD `@type` values that `isArticleType` accepts as describing the page's article. */
const ARTICLE_TYPES = new Set([
  "Article",
  "NewsArticle",
  "BlogPosting",
  "WebPage",
  "ReportageNewsArticle",
]);

/**
 * Dot-separated paths probed, in order, inside the parsed `__NEXT_DATA__`
 * JSON by `tryNextDataExtraction` when looking for the article body.
 */
const NEXT_DATA_CONTENT_PATHS = [
  "props.pageProps.content.body",
  "props.pageProps.article.body",
  "props.pageProps.article.content",
  "props.pageProps.post.body",
  "props.pageProps.post.content",
  "props.pageProps.data.body",
  "props.pageProps.story.body.content",
];

/**
 * Patterns whose presence in the markdown counts against it: each matching
 * pattern is penalized in `scoreMarkdownQuality`, and any match makes
 * `shouldCompareWithLegacy` return true.
 */
const LOW_QUALITY_MARKERS = [
  /Join The Conversation/i,
  /One Community\. Many Voices/i,
  /Read our community guidelines/i,
  /Create a free account to share your thoughts/i,
  /Become a Forbes Member/i,
  /Subscribe to trusted journalism/i,
  /\bComments\b/i,
];

/**
 * Source text of a self-invoking JavaScript snippet that works on a live
 * `document`. (Presumably the caller evaluates it inside a browser page;
 * that call site is not in this file.)
 *
 * When run, the snippet:
 *  - copies `data-src` into `src` on img/video/audio/source elements whose
 *    `src` is missing, empty, or a `data:` URI;
 *  - rewrites `href` on anchors and `src` on media elements to absolute URLs,
 *    resolved against `document.baseURI || location.href`;
 *  - rewrites every URL in `srcset` attributes the same way (candidates are
 *    split on ","; descriptors after the URL are preserved);
 *  - evaluates to `{ html: document.documentElement.outerHTML }`.
 */
export const absolutizeUrlsScript = String.raw`
(function() {
  const baseUrl = document.baseURI || location.href;
  function toAbsolute(url) {
    if (!url) return url;
    try { return new URL(url, baseUrl).href; } catch { return url; }
  }
  function absAttr(sel, attr) {
    document.querySelectorAll(sel).forEach(el => {
      const v = el.getAttribute(attr);
      if (v) { const a = toAbsolute(v); if (a) el.setAttribute(attr, a); }
    });
  }
  function absSrcset(sel) {
    document.querySelectorAll(sel).forEach(el => {
      const s = el.getAttribute("srcset");
      if (!s) return;
      el.setAttribute("srcset", s.split(",").map(p => {
        const t = p.trim(); if (!t) return "";
        const [url, ...d] = t.split(/\s+/);
        return d.length ? toAbsolute(url) + " " + d.join(" ") : toAbsolute(url);
      }).filter(Boolean).join(", "));
    });
  }
  document.querySelectorAll("img[data-src], video[data-src], audio[data-src], source[data-src]").forEach(el => {
    const ds = el.getAttribute("data-src");
    if (ds && (!el.getAttribute("src") || el.getAttribute("src") === "" || el.getAttribute("src")?.startsWith("data:"))) {
      el.setAttribute("src", ds);
    }
  });
  absAttr("a[href]", "href");
  absAttr("img[src], video[src], audio[src], source[src]", "src");
  absSrcset("img[srcset], source[srcset]");
  return { html: document.documentElement.outerHTML };
})()
`;

/**
 * Returns the first argument that is a string with non-whitespace content,
 * trimmed. Non-string arguments and blank strings are skipped.
 *
 * @returns The trimmed string, or null when no argument qualifies.
 */
function pickString(...values: unknown[]): string | null {
  const firstMatch = values.find(
    (candidate): candidate is string => typeof candidate === "string" && candidate.trim() !== ""
  );
  return firstMatch === undefined ? null : firstMatch.trim();
}

/**
 * Tidies markdown whitespace: CRLF becomes LF, spaces/tabs before a newline
 * are dropped, runs of three or more newlines collapse to a single blank
 * line, and the result is trimmed.
 */
function normalizeMarkdown(markdown: string): string {
  const unixNewlines = markdown.replace(/\r\n/g, "\n");
  const withoutTrailingBlanks = unixNewlines.replace(/[ \t]+\n/g, "\n");
  const singleBlankLines = withoutTrailingBlanks.replace(/\n{3,}/g, "\n\n");
  return singleBlankLines.trim();
}

/**
 * Parses an HTML string with linkedom. Input without an `<html>` tag is
 * treated as a fragment and wrapped in a minimal document first.
 */
function parseDocument(html: string): Document {
  const hasHtmlTag = /<\s*html[\s>]/i.test(html);
  const source = hasHtmlTag ? html : `<!doctype html><html><body>${html}</body></html>`;
  const { document } = parseHTML(source);
  return document as unknown as Document;
}

/**
 * Removes non-content elements (script, style, iframe, noscript, template,
 * svg, path) from an HTML fragment and returns the remaining markup.
 *
 * The fragment is parsed inside a wrapper `<div>`; if that wrapper cannot be
 * found again after parsing, the input is returned unchanged.
 */
function sanitizeHtml(html: string): string {
  const parsed = parseHTML(`<div id="__root">${html}</div>`);
  const container = parsed.document.querySelector("#__root");
  if (!container) return html;

  const unwantedTags = ["script", "style", "iframe", "noscript", "template", "svg", "path"];
  unwantedTags.forEach((tag) => {
    for (const node of container.querySelectorAll(tag)) {
      node.remove();
    }
  });

  return container.innerHTML;
}

/**
 * Returns the visible text of an HTML fragment: non-content elements are
 * removed, whitespace runs collapse to single spaces, and the result is
 * trimmed. Yields "" when the parsed body has no text.
 */
function extractTextFromHtml(html: string): string {
  const parsed = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
  const doc = parsed.document;

  const unwantedTags = ["script", "style", "noscript", "template", "iframe", "svg", "path"];
  unwantedTags.forEach((tag) => {
    for (const node of doc.querySelectorAll(tag)) {
      node.remove();
    }
  });

  const rawText = doc.body?.textContent;
  if (rawText == null) return "";
  return rawText.replace(/\s+/g, " ").trim();
}

/**
 * Looks up `<meta>` content for the first of `names` that yields a non-blank
 * value. For each name, a `meta[name=...]` element is preferred; only when no
 * such element exists is `meta[property=...]` consulted.
 *
 * @returns The trimmed `content` attribute, or null when nothing matches.
 */
function getMetaContent(document: Document, names: string[]): string | null {
  for (const key of names) {
    const byName = document.querySelector(`meta[name="${key}"]`);
    const match = byName ?? document.querySelector(`meta[property="${key}"]`);
    const trimmed = match?.getAttribute("content")?.trim();
    if (trimmed) return trimmed;
  }
  return null;
}

/**
 * Flattens a parsed JSON-LD value into a list of item records.
 *
 * Arrays are flattened recursively, an object carrying an `@graph` array is
 * replaced by the (flattened) graph members, any other object is returned as
 * a single item, and non-object values contribute nothing.
 */
function flattenJsonLdItems(data: unknown): AnyRecord[] {
  if (typeof data !== "object" || data === null) return [];

  if (Array.isArray(data)) {
    return data.flatMap((entry) => flattenJsonLdItems(entry));
  }

  const record = data as AnyRecord;
  const graph = record["@graph"];
  return Array.isArray(graph)
    ? (graph as unknown[]).flatMap((entry) => flattenJsonLdItems(entry))
    : [record];
}

/**
 * Collects every JSON-LD item declared in the document's
 * `<script type="application/ld+json">` blocks, in document order.
 * Blocks that fail to parse are skipped.
 */
function parseJsonLdScripts(document: Document): AnyRecord[] {
  const items: AnyRecord[] = [];

  document.querySelectorAll("script[type='application/ld+json']").forEach((node) => {
    try {
      const parsed: unknown = JSON.parse(node.textContent ?? "");
      items.push(...flattenJsonLdItems(parsed));
    } catch {
      // Malformed JSON-LD is ignored on purpose; other blocks may still be usable.
    }
  });

  return items;
}

/**
 * Tells whether a JSON-LD item declares one of the accepted article types.
 *
 * `@type` may be a single string or an array of strings; the item qualifies
 * when ANY declared type is in ARTICLE_TYPES (so `["Thing", "NewsArticle"]`
 * is accepted, not just arrays whose first entry matches).
 */
function isArticleType(item: AnyRecord): boolean {
  const declared = item["@type"];
  const types: unknown[] = Array.isArray(declared) ? declared : [declared];
  return types.some((type) => typeof type === "string" && ARTICLE_TYPES.has(type));
}

/**
 * Resolves a JSON-LD `author` value to a display string.
 *
 * - a string is returned as-is;
 * - an object contributes its `name` when that is a string;
 * - an array is resolved element by element and the non-empty names are
 *   joined with ", ".
 *
 * @returns The author text, or null when nothing usable is present.
 */
function extractAuthorFromJsonLd(authorData: unknown): string | null {
  if (typeof authorData === "string") return authorData;
  if (authorData === null || typeof authorData !== "object") return null;

  if (!Array.isArray(authorData)) {
    const { name } = authorData as AnyRecord;
    return typeof name === "string" ? name : null;
  }

  const collected: string[] = [];
  for (const entry of authorData) {
    const resolved = extractAuthorFromJsonLd(entry);
    if (resolved) collected.push(resolved);
  }
  return collected.length === 0 ? null : collected.join(", ");
}

/**
 * Resolves a JSON-LD `image` value to a URL string.
 *
 * Accepts a plain string, an object exposing `url` (or `contentUrl`), or an
 * array of either; for arrays the first entry that resolves is used.
 */
function extractJsonLdImageUrl(image: unknown): string | null {
  if (Array.isArray(image)) {
    for (const entry of image) {
      const resolved = extractJsonLdImageUrl(entry);
      if (resolved) return resolved;
    }
    return null;
  }

  if (image && typeof image === "object") {
    const record = image as AnyRecord;
    return pickString(record.url, record.contentUrl);
  }

  return pickString(image);
}

/**
 * Reads title, description, author, publication date and cover image from
 * the first JSON-LD item in the document whose type is an accepted article
 * type.
 *
 * @returns The fields that could be resolved (missing ones are undefined),
 *          or an empty object when the document has no qualifying item.
 */
function extractPrimaryJsonLdMeta(document: Document): Partial<PageMetadata> {
  for (const item of parseJsonLdScripts(document)) {
    if (!isArticleType(item)) continue;

    return {
      title: pickString(item.headline, item.name) ?? undefined,
      description: pickString(item.description) ?? undefined,
      author: extractAuthorFromJsonLd(item.author) ?? undefined,
      published: pickString(item.datePublished, item.dateCreated) ?? undefined,
      // `image` may be a string, an ImageObject-like record, or an array of either.
      coverImage: extractJsonLdImageUrl(item.image) ?? undefined,
    };
  }

  return {};
}

/**
 * Finds a publication timestamp by walking PUBLISHED_TIME_SELECTORS in order.
 * For the first matching element of each selector, the `content` attribute is
 * read (falling back to `datetime` when `content` is absent); blank values
 * are skipped and the next selector is tried.
 *
 * @returns The trimmed timestamp string, or null when none is found.
 */
function extractPublishedTime(document: Document): string | null {
  for (const selector of PUBLISHED_TIME_SELECTORS) {
    const match = document.querySelector(selector);
    if (!match) continue;

    const raw = match.getAttribute("content") ?? match.getAttribute("datetime");
    const trimmed = raw?.trim();
    if (trimmed) return trimmed;
  }
  return null;
}

/**
 * Determines the page title, trying in order:
 *  1. `og:title` meta content,
 *  2. `twitter:title` meta content,
 *  3. the `<title>` text with a trailing site-name suffix removed,
 *  4. the text of the first `<h1>`.
 *
 * @returns The trimmed title, or null when none of the sources has text.
 */
function extractTitle(document: Document): string | null {
  const ogTitle = document.querySelector("meta[property='og:title']")?.getAttribute("content");
  if (ogTitle && ogTitle.trim()) return ogTitle.trim();

  const twitterTitle = document.querySelector("meta[name='twitter:title']")?.getAttribute("content");
  if (twitterTitle && twitterTitle.trim()) return twitterTitle.trim();

  const title = document.querySelector("title")?.textContent?.trim();
  if (title) {
    // A dash only counts as a "Title - Site" separator when surrounded by
    // whitespace, so in-word hyphens ("Self-driving", "COVID-19") survive.
    // A pipe is treated as a separator with or without surrounding spaces.
    const cleaned = title.split(/\s+[-–—]\s+|\s*\|\s*/)[0]?.trim();
    if (cleaned) return cleaned;
  }

  const h1 = document.querySelector("h1")?.textContent?.trim();
  return h1 || null;
}

/**
 * Builds the baseline page metadata from the raw HTML.
 *
 * Each field takes the first non-blank value from its sources, in this order:
 *  - title: og/twitter meta, JSON-LD, first `<h1>`, `document.title` (else "");
 *  - description: description/og/twitter meta, JSON-LD;
 *  - author: author/article:author/twitter:creator meta, JSON-LD;
 *  - published: first `time[datetime]`, date meta tags, JSON-LD,
 *    `extractPublishedTime`;
 *  - coverImage: og/twitter image meta, JSON-LD.
 *
 * @param html       Page HTML.
 * @param url        Page URL, copied into the result.
 * @param capturedAt Capture timestamp, copied into `captured_at`.
 */
function extractMetadataFromHtml(html: string, url: string, capturedAt: string): PageMetadata {
  const document = parseDocument(html);
  const jsonLd = extractPrimaryJsonLdMeta(document);
  const timeEl = document.querySelector("time[datetime]");
  const meta = (...names: string[]): string | null => getMetaContent(document, names);

  const title = pickString(
    meta("og:title", "twitter:title"),
    jsonLd.title,
    document.querySelector("h1")?.textContent,
    document.title
  );
  const description = pickString(
    meta("description", "og:description", "twitter:description"),
    jsonLd.description
  );
  const author = pickString(meta("author", "article:author", "twitter:creator"), jsonLd.author);
  const published = pickString(
    timeEl?.getAttribute("datetime"),
    meta("article:published_time", "datePublished", "publishdate", "date"),
    jsonLd.published,
    extractPublishedTime(document)
  );
  const coverImage = pickString(
    meta("og:image", "twitter:image", "twitter:image:src"),
    jsonLd.coverImage
  );

  return {
    url,
    title: title ?? "",
    description: description ?? undefined,
    author: author ?? undefined,
    published: published ?? undefined,
    coverImage: coverImage ?? undefined,
    captured_at: capturedAt,
  };
}

/**
 * Returns `excerpt` when it is non-empty; otherwise derives one from
 * `textContent` by trimming it and cutting it to 200 characters followed by
 * "..." when it is longer.
 *
 * @returns The excerpt, or null when neither input has usable text.
 */
function generateExcerpt(excerpt: string | null, textContent: string | null): string | null {
  if (excerpt) return excerpt;

  const body = textContent ? textContent.trim() : "";
  if (body === "") return null;
  if (body.length <= 200) return body;
  return `${body.slice(0, 200)}...`;
}

/**
 * Turns a JSON-LD item into an extraction candidate.
 *
 * The body is the first non-empty string among `articleBody`, `text` and
 * `description`. When it looks like markup it is kept as HTML and its text is
 * extracted for measuring; otherwise it is used as plain text.
 *
 * @returns A "json-ld" candidate, or null when the item is not an article
 *          type, has no body, or its text is shorter than MIN_CONTENT_LENGTH.
 */
function parseJsonLdItem(item: AnyRecord): ExtractionCandidate | null {
  if (!isArticleType(item)) return null;

  const body = [item.articleBody, item.text, item.description].find(
    (field): field is string => typeof field === "string" && field !== ""
  );
  if (!body) return null;

  const content = body.trim();
  const looksLikeHtml = /<\/?[a-z][\s\S]*>/i.test(content);
  const textContent = looksLikeHtml ? extractTextFromHtml(content) : content;
  if (textContent.length < MIN_CONTENT_LENGTH) return null;

  const html = looksLikeHtml ? content : null;
  return {
    title: pickString(item.headline, item.name),
    byline: extractAuthorFromJsonLd(item.author),
    excerpt: pickString(item.description),
    published: pickString(item.datePublished, item.dateCreated),
    html,
    textContent,
    method: "json-ld",
  };
}

/**
 * Returns the candidate built from the first JSON-LD item in the document
 * that `parseJsonLdItem` accepts, or null when no item qualifies.
 */
function tryJsonLdExtraction(document: Document): ExtractionCandidate | null {
  const items = parseJsonLdScripts(document);
  for (const entry of items) {
    const candidate = parseJsonLdItem(entry);
    if (candidate) {
      return candidate;
    }
  }
  return null;
}

/**
 * Reads a nested property using a dot-separated path (e.g. "a.b.0.c").
 *
 * @returns The value at the path, or undefined as soon as a step has to be
 *          taken on something that is not a non-null object.
 */
function getByPath(value: unknown, path: string): unknown {
  return path.split(".").reduce<unknown>((node, key) => {
    if (!node || typeof node !== "object") return undefined;
    return (node as AnyRecord)[key];
  }, value);
}

/**
 * Heuristic check for a structured-content array: the value must be a
 * non-empty array in which at least one of the first five entries is an
 * object exposing a `type`, `text`, `textHtml` or `content` key.
 */
function isContentBlockArray(value: unknown): value is AnyRecord[] {
  if (!Array.isArray(value) || value.length === 0) return false;

  const blockKeys = ["type", "text", "textHtml", "content"];
  for (const sample of value.slice(0, 5)) {
    if (sample === null || typeof sample !== "object") continue;
    if (blockKeys.some((key) => key in (sample as AnyRecord))) return true;
  }
  return false;
}

/**
 * Renders an array of structured content blocks as plain text.
 *
 * Each node is handled as follows:
 *  - a string `text` becomes a paragraph (and the node is not descended into);
 *  - otherwise a string `textHtml` is reduced to text and becomes a paragraph;
 *  - otherwise `items` entries with text become "- " list lines, and the
 *    `components` and `content` arrays are walked recursively.
 *
 * Paragraphs are separated by blank lines; the result is trimmed.
 */
function extractTextFromContentBlocks(blocks: AnyRecord[]): string {
  const chunks: string[] = [];

  const addParagraph = (text: string): void => {
    const body = text.trim();
    if (body) chunks.push(body, "\n\n");
  };

  const visit = (node: unknown): void => {
    if (!node || typeof node !== "object") return;
    const { text, textHtml, items, components, content } = node as AnyRecord;

    if (typeof text === "string") {
      addParagraph(text);
      return;
    }
    if (typeof textHtml === "string") {
      addParagraph(extractTextFromHtml(textHtml));
      return;
    }

    if (Array.isArray(items)) {
      for (const entry of items) {
        if (!entry || typeof entry !== "object") continue;
        const label = pickString((entry as AnyRecord).text);
        if (label) chunks.push(`- ${label}\n`);
      }
      // Terminates the list so following text starts a new paragraph.
      chunks.push("\n");
    }

    if (Array.isArray(components)) {
      components.forEach((component) => visit(component));
    }
    if (Array.isArray(content)) {
      content.forEach((child) => visit(child));
    }
  };

  blocks.forEach((block) => visit(block));

  return chunks.join("").replace(/\n{3,}/g, "\n\n").trim();
}

/**
 * Builds an extraction candidate from an article body supplied as a string
 * (either HTML markup or plain text).
 *
 * @param content  The body string.
 * @param meta     Record consulted for headline/title, byline/author,
 *                 description/excerpt and datePublished/publishedAt.
 * @param document Parsed page, used as a fallback source for title and date.
 * @param method   Value stored in the candidate's `method` field.
 * @returns The candidate, or null when the body (or its extracted text) is
 *          shorter than MIN_CONTENT_LENGTH.
 */
function tryStringBodyExtraction(
  content: string,
  meta: AnyRecord,
  document: Document,
  method: string
): ExtractionCandidate | null {
  if (!content || content.length < MIN_CONTENT_LENGTH) return null;

  // `html` is non-null exactly when the body looks like markup; deriving the
  // text from that same value keeps the null-narrowing visible to the compiler.
  const html = /<\/?[a-z][\s\S]*>/i.test(content) ? sanitizeHtml(content) : null;
  const textContent = html !== null ? extractTextFromHtml(html) : content.trim();

  if (textContent.length < MIN_CONTENT_LENGTH) return null;

  return {
    title: pickString(meta.headline, meta.title, extractTitle(document)),
    byline: pickString(meta.byline, meta.author),
    excerpt: pickString(meta.description, meta.excerpt, generateExcerpt(null, textContent)),
    published: pickString(meta.datePublished, meta.publishedAt, extractPublishedTime(document)),
    html,
    textContent,
    method,
  };
}

/**
 * Attempts to extract the article from the JSON payload in
 * `<script id="__NEXT_DATA__">`.
 *
 * Every path in NEXT_DATA_CONTENT_PATHS is probed in order. A string value is
 * handed to `tryStringBodyExtraction`; an array that looks like content blocks
 * is flattened to plain text. The first path that yields a long-enough body
 * wins.
 *
 * @returns A "next-data" candidate, or null when the script is missing, its
 *          JSON cannot be parsed, or no path yields enough content.
 */
function tryNextDataExtraction(document: Document): ExtractionCandidate | null {
  try {
    const script = document.querySelector("script#__NEXT_DATA__");
    if (!script?.textContent) return null;

    const data = JSON.parse(script.textContent) as AnyRecord;
    const pageProps = (getByPath(data, "props.pageProps") ?? {}) as AnyRecord;

    for (const path of NEXT_DATA_CONTENT_PATHS) {
      const value = getByPath(data, path);

      if (typeof value === "string") {
        // Metadata is read from the object that directly holds the body,
        // layered over pageProps so the nearer object wins on key clashes.
        const parentPath = path.split(".").slice(0, -1).join(".");
        const parent = (getByPath(data, parentPath) ?? {}) as AnyRecord;
        const meta = {
          ...pageProps,
          ...parent,
          title: parent.title ?? (pageProps.title as string | undefined),
        };

        const candidate = tryStringBodyExtraction(value, meta, document, "next-data");
        if (candidate) return candidate;
      }

      if (isContentBlockArray(value)) {
        const textContent = extractTextFromContentBlocks(value);
        // Too short to be the article: keep probing the remaining paths.
        if (textContent.length < MIN_CONTENT_LENGTH) continue;

        return {
          title: pickString(
            getByPath(data, "props.pageProps.content.headline"),
            getByPath(data, "props.pageProps.article.headline"),
            getByPath(data, "props.pageProps.article.title"),
            getByPath(data, "props.pageProps.post.title"),
            pageProps.title,
            extractTitle(document)
          ),
          byline: pickString(
            getByPath(data, "props.pageProps.author.name"),
            getByPath(data, "props.pageProps.article.author.name")
          ),
          excerpt: pickString(
            getByPath(data, "props.pageProps.content.description"),
            getByPath(data, "props.pageProps.article.description"),
            pageProps.description,
            generateExcerpt(null, textContent)
          ),
          published: pickString(
            getByPath(data, "props.pageProps.content.datePublished"),
            getByPath(data, "props.pageProps.article.datePublished"),
            getByPath(data, "props.pageProps.publishedAt"),
            extractPublishedTime(document)
          ),
          // Block arrays are reduced to plain text, so there is no HTML form.
          html: null,
          textContent,
          method: "next-data",
        };
      }
    }
  } catch {
    // Any failure (e.g. invalid JSON) means this strategy has nothing to offer.
    return null;
  }

  return null;
}

/**
 * Converts a Readability parse result into an extraction candidate.
 *
 * @param article  Result of `Readability.parse()` (may be null).
 * @param document Original page, used as a fallback for title and date.
 * @param method   Value stored in the candidate's `method` field.
 * @returns The candidate, or null when the article text is shorter than
 *          MIN_CONTENT_LENGTH.
 */
function buildReadabilityCandidate(
  article: ReturnType<Readability["parse"]>,
  document: Document,
  method: string
): ExtractionCandidate | null {
  const textContent = (article?.textContent ?? "").trim();
  if (textContent.length < MIN_CONTENT_LENGTH) return null;

  // `byline` and `publishedTime` are read through narrow casts, as in the
  // rest of this module's handling of the Readability result.
  const byline = (article as { byline?: string } | null)?.byline;
  const publishedTime = (article as { publishedTime?: string } | null)?.publishedTime;
  const html = article?.content ? sanitizeHtml(article.content) : null;

  return {
    title: pickString(article?.title, extractTitle(document)),
    byline: pickString(byline),
    excerpt: pickString(article?.excerpt, generateExcerpt(null, textContent)),
    published: pickString(publishedTime, extractPublishedTime(document)),
    html,
    textContent,
    method,
  };
}

/**
 * Runs Mozilla Readability on a clone of the document: first with default
 * options ("readability"), then, if that yields no candidate, with
 * `charThreshold: 120` ("readability-relaxed").
 *
 * @returns The first candidate produced, or null when both passes fail or
 *          Readability throws.
 */
function tryReadability(document: Document): ExtractionCandidate | null {
  const attempt = (
    method: string,
    options?: { charThreshold: number }
  ): ExtractionCandidate | null => {
    // Each pass gets its own clone so the caller's document stays intact.
    const clone = document.cloneNode(true) as Document;
    const reader = options ? new Readability(clone, options) : new Readability(clone);
    return buildReadabilityCandidate(reader.parse(), document, method);
  };

  try {
    return attempt("readability") ?? attempt("readability-relaxed", { charThreshold: 120 });
  } catch {
    return null;
  }
}

/**
 * Extracts content by probing CONTENT_SELECTORS in order. For each selector
 * the first matching element is cloned, stripped of REMOVE_SELECTORS matches
 * and sanitized; the first one whose text reaches MIN_CONTENT_LENGTH wins.
 *
 * @returns A "selector:<css>" candidate, or null when no selector qualifies.
 */
function trySelectorExtraction(document: Document): ExtractionCandidate | null {
  for (const selector of CONTENT_SELECTORS) {
    const match = document.querySelector(selector);
    if (!match) continue;

    const copy = match.cloneNode(true) as Element;
    REMOVE_SELECTORS.forEach((unwanted) => {
      for (const node of copy.querySelectorAll(unwanted)) {
        node.remove();
      }
    });

    const html = sanitizeHtml(copy.innerHTML);
    const textContent = extractTextFromHtml(html);
    if (textContent.length >= MIN_CONTENT_LENGTH) {
      return {
        title: extractTitle(document),
        byline: null,
        excerpt: generateExcerpt(null, textContent),
        published: extractPublishedTime(document),
        html,
        textContent,
        method: `selector:${selector}`,
      };
    }
  }

  return null;
}

/**
 * Last-resort extraction: clones `<body>`, strips REMOVE_SELECTORS matches,
 * sanitizes the markup and uses whatever text remains.
 *
 * @returns A "body-fallback" candidate, or null when there is no body or no
 *          text is left after cleaning.
 */
function tryBodyExtraction(document: Document): ExtractionCandidate | null {
  if (!document.body) return null;

  const copy = document.body.cloneNode(true) as Element;
  REMOVE_SELECTORS.forEach((unwanted) => {
    for (const node of copy.querySelectorAll(unwanted)) {
      node.remove();
    }
  });

  const html = sanitizeHtml(copy.innerHTML);
  const textContent = extractTextFromHtml(html);
  if (textContent === "") return null;

  return {
    title: extractTitle(document),
    byline: null,
    excerpt: generateExcerpt(null, textContent),
    published: extractPublishedTime(document),
    html,
    textContent,
    method: "body-fallback",
  };
}

/**
 * Chooses the preferred extraction candidate.
 *
 * Candidates are ordered by strategy priority (readability,
 * readability-relaxed, next-data, json-ld, selector:*, body-fallback, then
 * anything else) and, within the same priority, by longer text first. The
 * result is the first candidate with at least GOOD_CONTENT_LENGTH characters;
 * failing that, the first with at least MIN_CONTENT_LENGTH; failing that, the
 * top-ranked candidate.
 *
 * @returns The chosen candidate, or null for an empty list. The input array
 *          is not modified.
 */
function pickBestCandidate(candidates: ExtractionCandidate[]): ExtractionCandidate | null {
  if (candidates.length === 0) return null;

  const priority = [
    "readability",
    "readability-relaxed",
    "next-data",
    "json-ld",
    "selector:",
    "body-fallback",
  ];

  // Entries ending in ":" are prefixes; the others must match exactly.
  const rankOf = (method: string): number => {
    for (const [position, entry] of priority.entries()) {
      const matches = entry.endsWith(":") ? method.startsWith(entry) : method === entry;
      if (matches) return position;
    }
    return priority.length;
  };

  const ordered = candidates.slice().sort((left, right) => {
    const byRank = rankOf(left.method) - rankOf(right.method);
    return byRank !== 0 ? byRank : right.textContent.length - left.textContent.length;
  });

  const longEnough = (minimum: number): ExtractionCandidate | undefined =>
    ordered.find((candidate) => candidate.textContent.length >= minimum);

  return longEnough(GOOD_CONTENT_LENGTH) ?? longEnough(MIN_CONTENT_LENGTH) ?? ordered[0];
}

/**
 * Runs every legacy extraction strategy on the page and returns the best
 * candidate, with missing title, publication date and excerpt filled in from
 * the document or the candidate's own text.
 *
 * @returns The winning candidate, or null when no strategy produced one.
 */
function extractFromHtml(html: string): ExtractionCandidate | null {
  const document = parseDocument(html);

  const strategies = [
    tryReadability,
    tryNextDataExtraction,
    tryJsonLdExtraction,
    trySelectorExtraction,
    tryBodyExtraction,
  ];
  const candidates = strategies
    .map((run) => run(document))
    .filter((candidate): candidate is ExtractionCandidate => candidate !== null);

  const winner = pickBestCandidate(candidates);
  if (winner === null) return null;

  const title = winner.title ?? extractTitle(document);
  const published = winner.published ?? extractPublishedTime(document);
  const excerpt = winner.excerpt ?? generateExcerpt(null, winner.textContent);
  return { ...winner, title, published, excerpt };
}

// Shared HTML-to-Markdown converter: ATX headings, "---" rules, "-" bullets,
// fenced code blocks, "*"/"**" emphasis and inline links.
const turndown = new TurndownService({
  headingStyle: "atx",
  hr: "---",
  bulletListMarker: "-",
  codeBlockStyle: "fenced",
  emDelimiter: "*",
  strongDelimiter: "**",
  linkStyle: "inlined",
});

// GitHub-flavored Markdown plugin from turndown-plugin-gfm.
turndown.use(gfm);
// Drop non-content elements entirely instead of converting their contents.
turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]);

// Emit a <figure>'s converted content as its own block, surrounded by blank lines.
turndown.addRule("collapseFigure", {
  filter: "figure",
  replacement(content) {
    return `\n\n${content.trim()}\n\n`;
  },
});

// Anchors without any text content produce no output.
turndown.addRule("dropInvisibleAnchors", {
  filter(node) {
    return node.nodeName === "A" && !(node as Element).textContent?.trim();
  },
  replacement() {
    return "";
  },
});

/**
 * Converts an HTML fragment to Markdown with the shared Turndown instance,
 * sanitizing it first.
 *
 * @returns The Markdown, or "" for blank input or when conversion throws
 *          (callers treat "" as "fall back to plain text").
 */
function convertHtmlToMarkdown(html: string): string {
  const isBlank = !html || html.trim() === "";
  if (isBlank) return "";

  try {
    return turndown.turndown(sanitizeHtml(html));
  } catch {
    return "";
  }
}

/**
 * Produces a plain-text rendering of a whole page: non-content elements are
 * removed, the body text (or the root element's text when there is no body)
 * has its whitespace collapsed to single spaces, and the result is normalized.
 */
function fallbackPlainText(html: string): string {
  const doc = parseDocument(html);

  const unwantedTags = ["script", "style", "noscript", "template", "iframe", "svg", "path"];
  unwantedTags.forEach((tag) => {
    for (const node of doc.querySelectorAll(tag)) {
      node.remove();
    }
  });

  const rawText = doc.body?.textContent ?? doc.documentElement?.textContent ?? "";
  const singleSpaced = rawText.replace(/\s+/g, " ");
  return normalizeMarkdown(singleSpaced);
}

/**
 * Counts occurrences of "By" followed by whitespace at the very start of the
 * text or directly after a newline.
 */
function countBylines(markdown: string): number {
  const hits = markdown.match(/(^|\n)By\s+/g);
  return hits === null ? 0 : hits.length;
}

/**
 * Counts paragraphs (blank-line separated) that look like real prose. A
 * paragraph is ignored when it is empty, consists solely of one link or
 * image, starts with a heading marker, or contains fewer than 8 words.
 */
function countUsefulParagraphs(markdown: string): number {
  const isUseful = (paragraph: string): boolean => {
    const body = paragraph.trim();
    if (body === "") return false;
    if (/^!?\[[^\]]*\]\([^)]+\)$/.test(body)) return false;
    if (/^#{1,6}\s+/.test(body)) return false;
    const words = body.match(/\b[\p{L}\p{N}']+\b/gu) ?? [];
    return words.length >= 8;
  };

  return normalizeMarkdown(markdown).split(/\n{2,}/).filter(isUseful).length;
}

/**
 * Counts how many of the given patterns match the markdown at least once
 * (each pattern contributes at most 1).
 */
function countMarkerHits(markdown: string, markers: RegExp[]): number {
  return markers.reduce((hits, marker) => (marker.test(markdown) ? hits + 1 : hits), 0);
}

/**
 * Heuristic quality score for extracted markdown; higher is better.
 *
 * Rewards: word count (capped at 4000), 40 per useful paragraph, 10 per
 * heading. Penalties: 180 per matching low-quality marker, 120 per byline
 * beyond the first, 80 per "Forbes Staff" mention beyond the first.
 */
function scoreMarkdownQuality(markdown: string): number {
  const text = normalizeMarkdown(markdown);
  const occurrences = (pattern: RegExp): number => (text.match(pattern) || []).length;

  const wordCount = occurrences(/\b[\p{L}\p{N}']+\b/gu);
  const paragraphCount = countUsefulParagraphs(text);
  const headingCount = occurrences(/^#{1,6}\s+/gm);
  const markerHits = countMarkerHits(text, LOW_QUALITY_MARKERS);
  const extraBylines = Math.max(0, countBylines(text) - 1);
  const extraStaffMentions = Math.max(0, occurrences(/\bForbes Staff\b/gi) - 1);

  const reward = Math.min(wordCount, 4000) + paragraphCount * 40 + headingCount * 10;
  return reward - markerHits * 180 - extraBylines * 120 - extraStaffMentions * 80;
}

/**
 * Decides whether a Defuddle result looks suspicious enough to be compared
 * against the legacy extractor: it contains a low-quality marker, has more
 * than one byline, or has fewer than 6 useful paragraphs.
 */
function shouldCompareWithLegacy(markdown: string): boolean {
  const text = normalizeMarkdown(markdown);
  if (countMarkerHits(text, LOW_QUALITY_MARKERS) > 0) return true;
  if (countBylines(text) > 1) return true;
  return countUsefulParagraphs(text) < 6;
}

/**
 * Sanity check that converted markdown is not empty or implausibly short
 * relative to the source page.
 *
 * Empty markdown is never usable. Otherwise it is accepted when the page text
 * itself is shorter than MIN_CONTENT_LENGTH, when the markdown has at least
 * 80 characters, or when it reaches min(200, 20% of the page text length).
 */
function isMarkdownUsable(markdown: string, html: string): boolean {
  const markdownLength = normalizeMarkdown(markdown).length;
  if (markdownLength === 0) return false;

  const sourceTextLength = extractTextFromHtml(html).length;
  if (sourceTextLength < MIN_CONTENT_LENGTH || markdownLength >= 80) return true;

  const required = Math.min(200, Math.floor(sourceTextLength * 0.2));
  return markdownLength >= required;
}

/**
 * Converts the page with Defuddle (running on a jsdom DOM) and reports
 * whether the result is usable.
 *
 * @param html         Page HTML.
 * @param url          Page URL; passed to both jsdom and Defuddle.
 * @param baseMetadata Metadata used for any field Defuddle leaves blank.
 * @returns `{ ok: true, result }` on success; `{ ok: false, reason }` when the
 *          markdown fails `isMarkdownUsable` or anything throws (including
 *          the dynamic imports). This function does not reject.
 */
async function tryDefuddleConversion(
  html: string,
  url: string,
  baseMetadata: PageMetadata
): Promise<{ ok: true; result: ConversionResult } | { ok: false; reason: string }> {
  try {
    // Both modules are imported dynamically, inside the try, so a failure to
    // load either one is reported as `{ ok: false }` rather than thrown.
    const [{ JSDOM, VirtualConsole }, { Defuddle }] = await Promise.all([
      import("jsdom"),
      import("defuddle/node"),
    ]);

    const virtualConsole = new VirtualConsole();
    virtualConsole.on("jsdomError", (error: Error & { type?: string }) => {
      // CSS parse errors are suppressed; other jsdom errors are logged as warnings.
      if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) {
        return;
      }
      console.warn(`[url-to-markdown] jsdom: ${error.message}`);
    });

    const dom = new JSDOM(html, { url, virtualConsole });
    const result = await Defuddle(dom, url, { markdown: true });
    const markdown = normalizeMarkdown(result.content || "");

    if (!isMarkdownUsable(markdown, html)) {
      return { ok: false, reason: "Defuddle returned empty or incomplete markdown" };
    }

    return {
      ok: true,
      result: {
        metadata: {
          ...baseMetadata,
          // Defuddle's values take precedence; baseMetadata fills the gaps.
          title: pickString(result.title, baseMetadata.title) ?? "",
          description: pickString(result.description, baseMetadata.description) ?? undefined,
          author: pickString(result.author, baseMetadata.author) ?? undefined,
          published: pickString(result.published, baseMetadata.published) ?? undefined,
          coverImage: pickString(result.image, baseMetadata.coverImage) ?? undefined,
        },
        markdown,
        rawHtml: html,
        conversionMethod: "defuddle",
      },
    };
  } catch (error) {
    return {
      ok: false,
      reason: error instanceof Error ? error.message : String(error),
    };
  }
}

/**
 * Converts the page with the built-in extraction strategies.
 *
 * The winning candidate's HTML is turned into Markdown; when that yields
 * nothing, the candidate's plain text is used, and as a last resort the
 * plain text of the whole page. Candidate metadata overrides `baseMetadata`
 * where present.
 *
 * @returns A result whose `conversionMethod` is "legacy:<method>", or
 *          "legacy:plain-text" when no candidate was found.
 */
function convertWithLegacyExtractor(html: string, baseMetadata: PageMetadata): ConversionResult {
  const extracted = extractFromHtml(html);

  const converted = extracted?.html ? convertHtmlToMarkdown(extracted.html) : "";
  const markdown = converted.trim()
    ? converted
    : (extracted?.textContent?.trim() || fallbackPlainText(html));

  const metadata: PageMetadata = {
    ...baseMetadata,
    title: pickString(extracted?.title, baseMetadata.title) ?? "",
    description: pickString(extracted?.excerpt, baseMetadata.description) ?? undefined,
    author: pickString(extracted?.byline, baseMetadata.author) ?? undefined,
    published: pickString(extracted?.published, baseMetadata.published) ?? undefined,
  };
  const conversionMethod = extracted ? `legacy:${extracted.method}` : "legacy:plain-text";

  return {
    metadata,
    markdown: normalizeMarkdown(markdown),
    rawHtml: html,
    conversionMethod,
  };
}

/**
 * Converts a page's HTML to Markdown plus metadata.
 *
 * Defuddle is tried first. If it fails, the legacy extractor is used and the
 * failure reason is recorded in `fallbackReason`. If it succeeds but its
 * output looks suspicious (see `shouldCompareWithLegacy`), the legacy output
 * is produced as well and replaces Defuddle's only when it scores more than
 * 120 points higher.
 *
 * @param html Page HTML.
 * @param url  Page URL.
 */
export async function extractContent(html: string, url: string): Promise<ConversionResult> {
  const baseMetadata = extractMetadataFromHtml(html, url, new Date().toISOString());

  const attempt = await tryDefuddleConversion(html, url, baseMetadata);
  if (!attempt.ok) {
    return {
      ...convertWithLegacyExtractor(html, baseMetadata),
      fallbackReason: attempt.reason,
    };
  }

  const primary = attempt.result;
  if (!shouldCompareWithLegacy(primary.markdown)) return primary;

  const legacy = convertWithLegacyExtractor(html, baseMetadata);
  const legacyScore = scoreMarkdownQuality(legacy.markdown);
  const defuddleScore = scoreMarkdownQuality(primary.markdown);
  if (legacyScore > defuddleScore + 120) {
    return {
      ...legacy,
      fallbackReason: "Legacy extractor produced higher-quality markdown than Defuddle",
    };
  }

  return primary;
}

/**
 * Escapes a string for use inside a double-quoted YAML scalar.
 *
 * Backslashes and double quotes are backslash-escaped; every line break
 * (CRLF, lone CR, or LF) becomes the two-character sequence `\n`; remaining
 * control characters (C0 except tab, plus DEL), which may not appear raw in
 * YAML, are written as `\xNN`. Tabs are left as-is (they are legal inside
 * double-quoted scalars).
 */
function escapeYamlValue(value: string): string {
  return value
    .replace(/\\/g, "\\\\")
    .replace(/"/g, '\\"')
    .replace(/\r\n|\r|\n/g, "\\n")
    .replace(
      /[\u0000-\u0008\u000b\u000c\u000e-\u001f\u007f]/g,
      (ch) => `\\x${ch.charCodeAt(0).toString(16).padStart(2, "0")}`
    );
}

/**
 * Renders page metadata as a YAML front-matter block delimited by "---".
 *
 * `url`, `title` and `captured_at` are always written; `description`,
 * `author`, `published` and `coverImage` only when non-empty. All values
 * except `url` are double-quoted and escaped; `url` is written verbatim.
 *
 * @returns The block without a trailing newline.
 */
export function formatMetadataYaml(meta: PageMetadata): string {
  const quoted = (key: string, value: string): string => `${key}: "${escapeYamlValue(value)}"`;

  const optionalFields: Array<[string, string | undefined]> = [
    ["description", meta.description],
    ["author", meta.author],
    ["published", meta.published],
    ["coverImage", meta.coverImage],
  ];
  const optionalLines = optionalFields.flatMap(([key, value]) =>
    value ? [quoted(key, value)] : []
  );

  return [
    "---",
    `url: ${meta.url}`,
    quoted("title", meta.title),
    ...optionalLines,
    quoted("captured_at", meta.captured_at),
    "---",
  ].join("\n");
}

/**
 * Assembles the final document: YAML front matter, then a `# Title` heading,
 * then the markdown body.
 *
 * The heading is omitted when the title is empty or when the body already
 * starts with a level-1 heading equal to the title (case-insensitive).
 */
export function createMarkdownDocument(result: ConversionResult): string {
  const { metadata, markdown } = result;
  const frontMatter = formatMetadataYaml(metadata);

  // Escape regex metacharacters so the title is matched literally.
  const literalTitle = metadata.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const leadingHeading = new RegExp(`^#\\s+${literalTitle}\\s*(\\n|$)`, "i");
  const alreadyTitled = leadingHeading.test(markdown.trimStart());

  const separator =
    metadata.title && !alreadyTitled ? `\n\n# ${metadata.title}\n\n` : "\n\n";
  return frontMatter + separator + markdown;
}