From 0279fa403dcbd8b81e0176e8c0eb5ac52a023585 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jim=20Liu=20=E5=AE=9D=E7=8E=89?= <junminliu@gmail.com>
Date: Fri, 13 Mar 2026 00:22:03 -0500
Subject: [PATCH] feat(baoyu-url-to-markdown): add defuddle.md API fallback,
 YouTube transcripts, and modular converter architecture

---
 skills/baoyu-url-to-markdown/SKILL.md         |  36 +-
 skills/baoyu-url-to-markdown/scripts/bun.lock |   4 +-
 .../scripts/defuddle-converter.ts             |  58 ++
 .../scripts/html-to-markdown.ts               | 975 ++----------------
 .../scripts/legacy-converter.ts               | 629 +++++++++++
 skills/baoyu-url-to-markdown/scripts/main.ts  | 104 +-
 .../scripts/markdown-conversion-shared.ts     | 305 ++++++
 .../scripts/package.json                      |   2 +-
 8 files changed, 1190 insertions(+), 923 deletions(-)
 create mode 100644 skills/baoyu-url-to-markdown/scripts/defuddle-converter.ts
 create mode 100644 skills/baoyu-url-to-markdown/scripts/legacy-converter.ts
 create mode 100644 skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts
diff --git a/skills/baoyu-url-to-markdown/SKILL.md b/skills/baoyu-url-to-markdown/SKILL.md
index 1f382c2..48881b9 100644
--- a/skills/baoyu-url-to-markdown/SKILL.md
+++ b/skills/baoyu-url-to-markdown/SKILL.md
@@ -1,7 +1,7 @@
 ---
 name: baoyu-url-to-markdown
-description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when Defuddle fails. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown.
-version: 1.56.1
+description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, uses an upgraded Defuddle pipeline with better web-component handling and YouTube transcript extraction, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when needed. If local browser capture fails entirely, it can fall back to the hosted defuddle.md API. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown.
+version: 1.58.1
 metadata:
   openclaw:
     homepage: https://github.com/JimLiu/baoyu-skills#baoyu-url-to-markdown
@@ -29,7 +29,10 @@ Fetches any URL via Chrome CDP, saves the rendered HTML snapshot, and converts i
 | Script | Purpose |
 |--------|---------|
 | `scripts/main.ts` | CLI entry point for URL fetching |
-| `scripts/html-to-markdown.ts` | Defuddle-first conversion with automatic legacy fallback |
+| `scripts/html-to-markdown.ts` | Markdown conversion entry point and converter selection |
+| `scripts/defuddle-converter.ts` | Defuddle-based conversion |
+| `scripts/legacy-converter.ts` | Pre-Defuddle legacy extraction and markdown conversion |
+| `scripts/markdown-conversion-shared.ts` | Shared metadata parsing and markdown document helpers |
 
 ## Preferences (EXTEND.md)
 
@@ -115,7 +118,10 @@ Full reference: [references/config/first-time-setup.md](references/config/first-
 - Two capture modes: auto or wait-for-user
 - Save rendered HTML as a sibling `-captured.html` file
 - Clean markdown output with metadata
-- Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history
+- Upgraded Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history
+- Materializes shadow DOM content before conversion so web-component pages survive serialization better
+- YouTube pages can include transcript/caption text in the markdown when YouTube exposes a caption track
+- If local browser capture fails completely, can fall back to `defuddle.md/<url>` and still save markdown
 - Handles login-required pages via wait mode
 - Download images and videos to local directories
 
@@ -168,7 +174,10 @@ Each run saves two files side by side:
 - Markdown: YAML front matter with `url`, `title`, `description`, `author`, `published`, optional `coverImage`, and `captured_at`, followed by converted markdown content
 - HTML snapshot: `*-captured.html`, containing the rendered page HTML captured from Chrome
 
+When Defuddle or page metadata provides a language hint, the markdown front matter also includes `language`.
+
 The HTML snapshot is saved before any markdown media localization, so it stays a faithful capture of the page DOM used for conversion.
+If the hosted `defuddle.md` API fallback is used, markdown is still saved, but there is no local `-captured.html` snapshot for that run.
 
 ## Output Directory
 
@@ -193,13 +202,16 @@ When `--download-media` is enabled:
 Conversion order:
 
 1. Try Defuddle first
-2. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor
-3. The fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history
+2. For rich pages such as YouTube, prefer Defuddle's extractor-specific output (including transcripts when available) instead of replacing it with the legacy pipeline
+3. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor
+4. If the entire local browser capture flow fails before markdown can be produced, try the hosted `https://defuddle.md/<url>` API and save its markdown output directly
+5. The legacy fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history
 
 CLI output will show:
 
 - `Converter: defuddle` when Defuddle succeeds
 - `Converter: legacy:...` plus `Fallback used: ...` when fallback was needed
+- `Converter: defuddle-api` when local browser capture failed and the hosted API was used instead
 
 ## Media Download Workflow
 
@@ -232,6 +244,18 @@ Based on `download_media` setting in EXTEND.md:
 
 **Troubleshooting**: Chrome not found → set `URL_CHROME_PATH`. Timeout → increase `--timeout`. Complex pages → try `--wait` mode. If markdown quality is poor, inspect the saved `-captured.html` and check whether the run logged a legacy fallback.
 
+### YouTube Notes
+
+- The upgraded Defuddle path uses async extractors, so YouTube pages can include transcript text directly in the markdown body.
+- Transcript availability depends on YouTube exposing a caption track. Videos with captions disabled, restricted playback, or blocked regional access may still produce description-only output.
+- If the page needs time to finish loading descriptions, chapters, or player metadata, prefer `--wait` and capture after the watch page is fully hydrated.
+
+### Hosted API Fallback
+
+- The hosted fallback endpoint is `https://defuddle.md/<url>`. In shell form: `curl https://defuddle.md/stephango.com`
+- Use it only when the local Chrome/CDP capture path fails outright. The local path still has higher fidelity because it can save the captured HTML and handle authenticated pages.
+- The hosted API already returns Markdown with YAML frontmatter, so save that response as-is and then apply the normal media-localization step if requested.
+
 ## Extension Support
 
 Custom configurations via EXTEND.md. See **Preferences** section for paths and supported options.
diff --git a/skills/baoyu-url-to-markdown/scripts/bun.lock b/skills/baoyu-url-to-markdown/scripts/bun.lock
index 50109d2..de2f3dc 100644
--- a/skills/baoyu-url-to-markdown/scripts/bun.lock
+++ b/skills/baoyu-url-to-markdown/scripts/bun.lock
@@ -6,7 +6,7 @@
       "dependencies": {
         "@mozilla/readability": "^0.6.0",
         "baoyu-chrome-cdp": "file:./vendor/baoyu-chrome-cdp",
-        "defuddle": "^0.10.0",
+        "defuddle": "^0.12.0",
         "jsdom": "^24.1.3",
         "linkedom": "^0.18.12",
         "turndown": "^7.2.2",
@@ -61,7 +61,7 @@
 
     "decimal.js": ["decimal.js@10.6.0", "", {}, "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg=="],
 
-    "defuddle": ["defuddle@0.10.0", "", { "dependencies": { "commander": "^12.1.0" }, "optionalDependencies": { "mathml-to-latex": "^1.5.0", "temml": "^0.13.1", "turndown": "^7.2.0" }, "peerDependencies": { "jsdom": "^24.0.0" }, "bin": { "defuddle": "dist/cli.js" } }, "sha512-a43juTtHv6Vs4+sxvahVLM5NxoyDsarO1Ag3UxLORI4Fo/nsNFwzDxuQBvosKVGTIRxCwN/mfnWAzNXmQfieqw=="],
+    "defuddle": ["defuddle@0.12.0", "", { "dependencies": { "commander": "^12.1.0" }, "optionalDependencies": { "mathml-to-latex": "^1.5.0", "temml": "^0.13.1", "turndown": "^7.2.0" }, "peerDependencies": { "jsdom": "^24.0.0" }, "bin": { "defuddle": "dist/cli.js" } }, "sha512-Y/WgyGKBxwxFir+hWNth4nmWDDDb8BzQi3qASS2NWYPXsKU42Ku49/3M5yFYefnRef9prynnmasfnXjk99EWgA=="],
 
     "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="],
 
diff --git a/skills/baoyu-url-to-markdown/scripts/defuddle-converter.ts b/skills/baoyu-url-to-markdown/scripts/defuddle-converter.ts
new file mode 100644
index 0000000..15a265a
--- /dev/null
+++ b/skills/baoyu-url-to-markdown/scripts/defuddle-converter.ts
@@ -0,0 +1,58 @@
+import { JSDOM, VirtualConsole } from "jsdom";
+import { Defuddle } from "defuddle/node";
+
+import {
+  type ConversionResult,
+  type PageMetadata,
+  isMarkdownUsable,
+  normalizeMarkdown,
+  pickString,
+} from "./markdown-conversion-shared.js";
+
+export async function tryDefuddleConversion(
+  html: string,
+  url: string,
+  baseMetadata: PageMetadata
+): Promise<{ ok: true; result: ConversionResult } | { ok: false; reason: string }> {
+  try {
+    const virtualConsole = new VirtualConsole();
+    virtualConsole.on("jsdomError", (error: Error & { type?: string }) => {
+      if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) {
+        return;
+      }
+      console.warn(`[url-to-markdown] jsdom: ${error.message}`);
+    });
+
+    const dom = new JSDOM(html, { url, virtualConsole });
+    const result = await Defuddle(dom, url, { markdown: true });
+    const markdown = normalizeMarkdown(result.content || "");
+
+    if (!isMarkdownUsable(markdown, html)) {
+      return { ok: false, reason: "Defuddle returned empty or incomplete markdown" };
+    }
+
+    return {
+      ok: true,
+      result: {
+        metadata: {
+          ...baseMetadata,
+          title: pickString(result.title, baseMetadata.title) ?? "",
+          description: pickString(result.description, baseMetadata.description) ?? undefined,
+          author: pickString(result.author, baseMetadata.author) ?? undefined,
+          published: pickString(result.published, baseMetadata.published) ?? undefined,
+          coverImage: pickString(result.image, baseMetadata.coverImage) ?? undefined,
+          language: pickString(result.language, baseMetadata.language) ?? undefined,
+        },
+        markdown,
+        rawHtml: html,
+        conversionMethod: "defuddle",
+        variables: result.variables,
+      },
+    };
+  } catch (error) {
+    return {
+      ok: false,
+      reason: error instanceof Error ? error.message : String(error),
+    };
+  }
+}
diff --git a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts
index f93af43..34667b6 100644
--- a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts
+++ b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts
@@ -1,911 +1,104 @@
-import { parseHTML } from "linkedom";
-import { Readability } from "@mozilla/readability";
-import TurndownService from "turndown";
-import { gfm } from "turndown-plugin-gfm";
+import {
+  createMarkdownDocument,
+  extractMetadataFromHtml,
+  formatMetadataYaml,
+  type ConversionResult,
+  type PageMetadata,
+  isYouTubeUrl,
+} from "./markdown-conversion-shared.js";
+import { tryDefuddleConversion } from "./defuddle-converter.js";
+import {
+  convertWithLegacyExtractor,
+  scoreMarkdownQuality,
+  shouldCompareWithLegacy,
+} from "./legacy-converter.js";
 
-export interface PageMetadata {
-  url: string;
-  title: string;
-  description?: string;
-  author?: string;
-  published?: string;
-  coverImage?: string;
-  captured_at: string;
-}
-
-export interface ConversionResult {
-  metadata: PageMetadata;
-  markdown: string;
-  rawHtml: string;
-  conversionMethod: string;
-  fallbackReason?: string;
-}
-
-interface ExtractionCandidate {
-  title: string | null;
-  byline: string | null;
-  excerpt: string | null;
-  published: string | null;
-  html: string | null;
-  textContent: string;
-  method: string;
-}
-
-type AnyRecord = Record<string, unknown>;
-
-const MIN_CONTENT_LENGTH = 120;
-const GOOD_CONTENT_LENGTH = 900;
-
-const CONTENT_SELECTORS = [
-  "article",
-  "main article",
-  "[role='main'] article",
-  "[itemprop='articleBody']",
-  ".article-content",
-  ".article-body",
-  ".post-content",
-  ".entry-content",
-  ".story-body",
-  "main",
-  "[role='main']",
-  "#content",
-  ".content",
-];
-
-const REMOVE_SELECTORS = [
-  "script",
-  "style",
-  "noscript",
-  "template",
-  "iframe",
-  "svg",
-  "path",
-  "nav",
-  "aside",
-  "footer",
-  "header",
-  "form",
-  ".advertisement",
-  ".ads",
-  ".social-share",
-  ".related-articles",
-  ".comments",
-  ".newsletter",
-  ".cookie-banner",
-  ".cookie-consent",
-  "[role='navigation']",
-  "[aria-label*='cookie' i]",
-];
-
-const PUBLISHED_TIME_SELECTORS = [
-  "meta[property='article:published_time']",
-  "meta[name='pubdate']",
-  "meta[name='publishdate']",
-  "meta[name='date']",
-  "time[datetime]",
-];
-
-const ARTICLE_TYPES = new Set([
-  "Article",
-  "NewsArticle",
-  "BlogPosting",
-  "WebPage",
-  "ReportageNewsArticle",
-]);
-
-const NEXT_DATA_CONTENT_PATHS = [
-  "props.pageProps.content.body",
-  "props.pageProps.article.body",
-  "props.pageProps.article.content",
-  "props.pageProps.post.body",
-  "props.pageProps.post.content",
-  "props.pageProps.data.body",
-  "props.pageProps.story.body.content",
-];
-
-const LOW_QUALITY_MARKERS = [
-  /Join The Conversation/i,
-  /One Community\. Many Voices/i,
-  /Read our community guidelines/i,
-  /Create a free account to share your thoughts/i,
-  /Become a Forbes Member/i,
-  /Subscribe to trusted journalism/i,
-  /\bComments\b/i,
-];
+export type { ConversionResult, PageMetadata };
+export { createMarkdownDocument, formatMetadataYaml };
 
 export const absolutizeUrlsScript = String.raw`
 (function() {
   const baseUrl = document.baseURI || location.href;
+  const htmlClone = document.documentElement.cloneNode(true);
+
+  function materializeShadowDom(sourceRoot, cloneRoot) {
+    const sourceElements = Array.from(sourceRoot.querySelectorAll("*"));
+    const cloneElements = Array.from(cloneRoot.querySelectorAll("*"));
+
+    for (let i = sourceElements.length - 1; i >= 0; i--) {
+      const sourceEl = sourceElements[i];
+      const cloneEl = cloneElements[i];
+      const shadowRoot = sourceEl && sourceEl.shadowRoot;
+      if (!shadowRoot || !cloneEl || !shadowRoot.innerHTML) continue;
+
+      if (cloneEl.tagName && cloneEl.tagName.includes("-")) {
+        const wrapper = document.createElement("div");
+        wrapper.setAttribute("data-shadow-host", cloneEl.tagName.toLowerCase());
+        wrapper.innerHTML = shadowRoot.innerHTML;
+        cloneEl.replaceWith(wrapper);
+      } else {
+        cloneEl.innerHTML = shadowRoot.innerHTML;
+      }
+    }
+  }
+
   function toAbsolute(url) {
     if (!url) return url;
     try { return new URL(url, baseUrl).href; } catch { return url; }
   }
-  function absAttr(sel, attr) {
-    document.querySelectorAll(sel).forEach(el => {
+
+  function absAttr(root, sel, attr) {
+    root.querySelectorAll(sel).forEach(el => {
       const v = el.getAttribute(attr);
-      if (v) { const a = toAbsolute(v); if (a) el.setAttribute(attr, a); }
+      if (v) {
+        const a = toAbsolute(v);
+        if (a) el.setAttribute(attr, a);
+      }
     });
   }
-  function absSrcset(sel) {
-    document.querySelectorAll(sel).forEach(el => {
+
+  function absSrcset(root, sel) {
+    root.querySelectorAll(sel).forEach(el => {
       const s = el.getAttribute("srcset");
       if (!s) return;
       el.setAttribute("srcset", s.split(",").map(p => {
-        const t = p.trim(); if (!t) return "";
+        const t = p.trim();
+        if (!t) return "";
         const [url, ...d] = t.split(/\s+/);
         return d.length ? toAbsolute(url) + " " + d.join(" ") : toAbsolute(url);
       }).filter(Boolean).join(", "));
     });
   }
-  document.querySelectorAll("img[data-src], video[data-src], audio[data-src], source[data-src]").forEach(el => {
+
+  materializeShadowDom(document.documentElement, htmlClone);
+
+  htmlClone.querySelectorAll("img[data-src], video[data-src], audio[data-src], source[data-src]").forEach(el => {
     const ds = el.getAttribute("data-src");
     if (ds && (!el.getAttribute("src") || el.getAttribute("src") === "" || el.getAttribute("src")?.startsWith("data:"))) {
       el.setAttribute("src", ds);
     }
   });
-  absAttr("a[href]", "href");
-  absAttr("img[src], video[src], audio[src], source[src]", "src");
-  absSrcset("img[srcset], source[srcset]");
-  return { html: document.documentElement.outerHTML };
+
+  absAttr(htmlClone, "a[href]", "href");
+  absAttr(htmlClone, "img[src], video[src], audio[src], source[src], iframe[src]", "src");
+  absAttr(htmlClone, "video[poster]", "poster");
+  absSrcset(htmlClone, "img[srcset], source[srcset]");
+
+  return { html: "<!doctype html>\n" + htmlClone.outerHTML };
 })()
 `;
 
-function pickString(...values: unknown[]): string | null {
-  for (const value of values) {
-    if (typeof value === "string") {
-      const trimmed = value.trim();
-      if (trimmed) return trimmed;
-    }
-  }
-  return null;
-}
-
-function normalizeMarkdown(markdown: string): string {
-  return markdown
-    .replace(/\r\n/g, "\n")
-    .replace(/[ \t]+\n/g, "\n")
-    .replace(/\n{3,}/g, "\n\n")
-    .trim();
-}
-
-function parseDocument(html: string): Document {
-  const normalized = /<\s*html[\s>]/i.test(html)
-    ? html
-    : `<!doctype html><html><body>${html}</body></html>`;
-  return parseHTML(normalized).document as unknown as Document;
-}
-
-function sanitizeHtml(html: string): string {
-  const { document } = parseHTML(`<div id="__root">${html}</div>`);
-  const root = document.querySelector("#__root");
-  if (!root) return html;
-
-  for (const selector of ["script", "style", "iframe", "noscript", "template", "svg", "path"]) {
-    for (const el of root.querySelectorAll(selector)) {
-      el.remove();
-    }
+function shouldPreferDefuddle(result: ConversionResult): boolean {
+  if (isYouTubeUrl(result.metadata.url)) {
+    return true;
   }
 
-  return root.innerHTML;
-}
-
-function extractTextFromHtml(html: string): string {
-  const { document } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
-  for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
-    for (const el of document.querySelectorAll(selector)) {
-      el.remove();
-    }
-  }
-  return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? "";
-}
-
-function getMetaContent(document: Document, names: string[]): string | null {
-  for (const name of names) {
-    const element =
-      document.querySelector(`meta[name="${name}"]`) ??
-      document.querySelector(`meta[property="${name}"]`);
-    const content = element?.getAttribute("content");
-    if (content && content.trim()) return content.trim();
-  }
-  return null;
-}
-
-function flattenJsonLdItems(data: unknown): AnyRecord[] {
-  if (!data || typeof data !== "object") return [];
-  if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems);
-
-  const item = data as AnyRecord;
-  if (Array.isArray(item["@graph"])) {
-    return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
+  const transcript = result.variables?.transcript?.trim();
+  if (transcript) {
+    return true;
   }
 
-  return [item];
-}
-
-function parseJsonLdScripts(document: Document): AnyRecord[] {
-  const results: AnyRecord[] = [];
-  const scripts = document.querySelectorAll("script[type='application/ld+json']");
-
-  for (const script of scripts) {
-    try {
-      const data = JSON.parse(script.textContent ?? "");
-      results.push(...flattenJsonLdItems(data));
-    } catch {
-      // Ignore malformed blocks.
-    }
-  }
-
-  return results;
-}
-
-function isArticleType(item: AnyRecord): boolean {
-  const value = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"];
-  return typeof value === "string" && ARTICLE_TYPES.has(value);
-}
-
-function extractAuthorFromJsonLd(authorData: unknown): string | null {
-  if (typeof authorData === "string") return authorData;
-  if (!authorData || typeof authorData !== "object") return null;
-
-  if (Array.isArray(authorData)) {
-    const names = authorData
-      .map((author) => extractAuthorFromJsonLd(author))
-      .filter((name): name is string => Boolean(name));
-    return names.length > 0 ? names.join(", ") : null;
-  }
-
-  const author = authorData as AnyRecord;
-  return typeof author.name === "string" ? author.name : null;
-}
-
-function extractPrimaryJsonLdMeta(document: Document): Partial<PageMetadata> {
-  for (const item of parseJsonLdScripts(document)) {
-    if (!isArticleType(item)) continue;
-
-    return {
-      title: pickString(item.headline, item.name) ?? undefined,
-      description: pickString(item.description) ?? undefined,
-      author: extractAuthorFromJsonLd(item.author) ?? undefined,
-      published: pickString(item.datePublished, item.dateCreated) ?? undefined,
-      coverImage:
-        pickString(
-          item.image,
-          (item.image as AnyRecord | undefined)?.url,
-          (Array.isArray(item.image) ? item.image[0] : undefined) as unknown
-        ) ?? undefined,
-    };
-  }
-
-  return {};
-}
-
-function extractPublishedTime(document: Document): string | null {
-  for (const selector of PUBLISHED_TIME_SELECTORS) {
-    const el = document.querySelector(selector);
-    if (!el) continue;
-    const value = el.getAttribute("content") ?? el.getAttribute("datetime");
-    if (value && value.trim()) return value.trim();
-  }
-  return null;
-}
-
-function extractTitle(document: Document): string | null {
-  const ogTitle = document.querySelector("meta[property='og:title']")?.getAttribute("content");
-  if (ogTitle && ogTitle.trim()) return ogTitle.trim();
-
-  const twitterTitle = document.querySelector("meta[name='twitter:title']")?.getAttribute("content");
-  if (twitterTitle && twitterTitle.trim()) return twitterTitle.trim();
-
-  const title = document.querySelector("title")?.textContent?.trim();
-  if (title) {
-    const cleaned = title.split(/\s*[-|–—]\s*/)[0]?.trim();
-    if (cleaned) return cleaned;
-  }
-
-  const h1 = document.querySelector("h1")?.textContent?.trim();
-  return h1 || null;
-}
-
-function extractMetadataFromHtml(html: string, url: string, capturedAt: string): PageMetadata {
-  const document = parseDocument(html);
-  const jsonLd = extractPrimaryJsonLdMeta(document);
-  const timeEl = document.querySelector("time[datetime]");
-
-  return {
-    url,
-    title:
-      pickString(
-        getMetaContent(document, ["og:title", "twitter:title"]),
-        jsonLd.title,
-        document.querySelector("h1")?.textContent,
-        document.title
-      ) ?? "",
-    description:
-      pickString(
-        getMetaContent(document, ["description", "og:description", "twitter:description"]),
-        jsonLd.description
-      ) ?? undefined,
-    author:
-      pickString(
-        getMetaContent(document, ["author", "article:author", "twitter:creator"]),
-        jsonLd.author
-      ) ?? undefined,
-    published:
-      pickString(
-        timeEl?.getAttribute("datetime"),
-        getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]),
-        jsonLd.published,
-        extractPublishedTime(document)
-      ) ?? undefined,
-    coverImage:
-      pickString(
-        getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]),
-        jsonLd.coverImage
-      ) ?? undefined,
-    captured_at: capturedAt,
-  };
-}
-
-function generateExcerpt(excerpt: string | null, textContent: string | null): string | null {
-  if (excerpt) return excerpt;
-  if (!textContent) return null;
-  const trimmed = textContent.trim();
-  if (!trimmed) return null;
-  return trimmed.length > 200 ? `${trimmed.slice(0, 200)}...` : trimmed;
-}
-
-function parseJsonLdItem(item: AnyRecord): ExtractionCandidate | null {
-  if (!isArticleType(item)) return null;
-
-  const rawContent =
-    (typeof item.articleBody === "string" && item.articleBody) ||
-    (typeof item.text === "string" && item.text) ||
-    (typeof item.description === "string" && item.description) ||
-    null;
-
-  if (!rawContent) return null;
-
-  const content = rawContent.trim();
-  const htmlLike = /<\/?[a-z][\s\S]*>/i.test(content);
-  const textContent = htmlLike ? extractTextFromHtml(content) : content;
-
-  if (textContent.length < MIN_CONTENT_LENGTH) return null;
-
-  return {
-    title: pickString(item.headline, item.name),
-    byline: extractAuthorFromJsonLd(item.author),
-    excerpt: pickString(item.description),
-    published: pickString(item.datePublished, item.dateCreated),
-    html: htmlLike ? content : null,
-    textContent,
-    method: "json-ld",
-  };
-}
-
-function tryJsonLdExtraction(document: Document): ExtractionCandidate | null {
-  for (const item of parseJsonLdScripts(document)) {
-    const extracted = parseJsonLdItem(item);
-    if (extracted) return extracted;
-  }
-  return null;
-}
-
-function getByPath(value: unknown, path: string): unknown {
-  let current = value;
-  for (const part of path.split(".")) {
-    if (!current || typeof current !== "object") return undefined;
-    current = (current as AnyRecord)[part];
-  }
-  return current;
-}
-
-function isContentBlockArray(value: unknown): value is AnyRecord[] {
-  if (!Array.isArray(value) || value.length === 0) return false;
-  return value.slice(0, 5).some((item) => {
-    if (!item || typeof item !== "object") return false;
-    const obj = item as AnyRecord;
-    return "type" in obj || "text" in obj || "textHtml" in obj || "content" in obj;
-  });
-}
-
-function extractTextFromContentBlocks(blocks: AnyRecord[]): string {
-  const parts: string[] = [];
-
-  function pushParagraph(text: string): void {
-    const trimmed = text.trim();
-    if (!trimmed) return;
-    parts.push(trimmed, "\n\n");
-  }
-
-  function walk(node: unknown): void {
-    if (!node || typeof node !== "object") return;
-    const block = node as AnyRecord;
-
-    if (typeof block.text === "string") {
-      pushParagraph(block.text);
-      return;
-    }
-
-    if (typeof block.textHtml === "string") {
-      pushParagraph(extractTextFromHtml(block.textHtml));
-      return;
-    }
-
-    if (Array.isArray(block.items)) {
-      for (const item of block.items) {
-        if (item && typeof item === "object") {
-          const text = pickString((item as AnyRecord).text);
-          if (text) parts.push(`- ${text}\n`);
-        }
-      }
-      parts.push("\n");
-    }
-
-    if (Array.isArray(block.components)) {
-      for (const component of block.components) {
-        walk(component);
-      }
-    }
-
-    if (Array.isArray(block.content)) {
-      for (const child of block.content) {
-        walk(child);
-      }
-    }
-  }
-
-  for (const block of blocks) {
-    walk(block);
-  }
-
-  return parts.join("").replace(/\n{3,}/g, "\n\n").trim();
-}
-
-function tryStringBodyExtraction(
-  content: string,
-  meta: AnyRecord,
-  document: Document,
-  method: string
-): ExtractionCandidate | null {
-  if (!content || content.length < MIN_CONTENT_LENGTH) return null;
-
-  const isHtml = /<\/?[a-z][\s\S]*>/i.test(content);
-  const html = isHtml ? sanitizeHtml(content) : null;
-  const textContent = isHtml ? extractTextFromHtml(html) : content.trim();
-
-  if (textContent.length < MIN_CONTENT_LENGTH) return null;
-
-  return {
-    title: pickString(meta.headline, meta.title, extractTitle(document)),
-    byline: pickString(meta.byline, meta.author),
-    excerpt: pickString(meta.description, meta.excerpt, generateExcerpt(null, textContent)),
-    published: pickString(meta.datePublished, meta.publishedAt, extractPublishedTime(document)),
-    html,
-    textContent,
-    method,
-  };
-}
-
-function tryNextDataExtraction(document: Document): ExtractionCandidate | null {
-  try {
-    const script = document.querySelector("script#__NEXT_DATA__");
-    if (!script?.textContent) return null;
-
-    const data = JSON.parse(script.textContent) as AnyRecord;
-    const pageProps = (getByPath(data, "props.pageProps") ?? {}) as AnyRecord;
-
-    for (const path of NEXT_DATA_CONTENT_PATHS) {
-      const value = getByPath(data, path);
-
-      if (typeof value === "string") {
-        const parentPath = path.split(".").slice(0, -1).join(".");
-        const parent = (getByPath(data, parentPath) ?? {}) as AnyRecord;
-        const meta = {
-          ...pageProps,
-          ...parent,
-          title: parent.title ?? (pageProps.title as string | undefined),
-        };
-
-        const candidate = tryStringBodyExtraction(value, meta, document, "next-data");
-        if (candidate) return candidate;
-      }
-
-      if (isContentBlockArray(value)) {
-        const textContent = extractTextFromContentBlocks(value);
-        if (textContent.length < MIN_CONTENT_LENGTH) continue;
-
-        return {
-          title: pickString(
-            getByPath(data, "props.pageProps.content.headline"),
-            getByPath(data, "props.pageProps.article.headline"),
-            getByPath(data, "props.pageProps.article.title"),
-            getByPath(data, "props.pageProps.post.title"),
-            pageProps.title,
-            extractTitle(document)
-          ),
-          byline: pickString(
-            getByPath(data, "props.pageProps.author.name"),
-            getByPath(data, "props.pageProps.article.author.name")
-          ),
-          excerpt: pickString(
-            getByPath(data, "props.pageProps.content.description"),
-            getByPath(data, "props.pageProps.article.description"),
-            pageProps.description,
-            generateExcerpt(null, textContent)
-          ),
-          published: pickString(
-            getByPath(data, "props.pageProps.content.datePublished"),
-            getByPath(data, "props.pageProps.article.datePublished"),
-            getByPath(data, "props.pageProps.publishedAt"),
-            extractPublishedTime(document)
-          ),
-          html: null,
-          textContent,
-          method: "next-data",
-        };
-      }
-    }
-  } catch {
-    return null;
-  }
-
-  return null;
-}
-
-function buildReadabilityCandidate(
-  article: ReturnType<Readability["parse"]>,
-  document: Document,
-  method: string
-): ExtractionCandidate | null {
-  const textContent = article?.textContent?.trim() ?? "";
-  if (textContent.length < MIN_CONTENT_LENGTH) return null;
-
-  return {
-    title: pickString(article?.title, extractTitle(document)),
-    byline: pickString((article as { byline?: string } | null)?.byline),
-    excerpt: pickString(article?.excerpt, generateExcerpt(null, textContent)),
-    published: pickString((article as { publishedTime?: string } | null)?.publishedTime, extractPublishedTime(document)),
-    html: article?.content ? sanitizeHtml(article.content) : null,
-    textContent,
-    method,
-  };
-}
-
-function tryReadability(document: Document): ExtractionCandidate | null {
-  try {
-    const strictClone = document.cloneNode(true) as Document;
-    const strictResult = buildReadabilityCandidate(
-      new Readability(strictClone).parse(),
-      document,
-      "readability"
-    );
-    if (strictResult) return strictResult;
-
-    const relaxedClone = document.cloneNode(true) as Document;
-    return buildReadabilityCandidate(
-      new Readability(relaxedClone, { charThreshold: 120 }).parse(),
-      document,
-      "readability-relaxed"
-    );
-  } catch {
-    return null;
-  }
-}
-
-function trySelectorExtraction(document: Document): ExtractionCandidate | null {
-  for (const selector of CONTENT_SELECTORS) {
-    const element = document.querySelector(selector);
-    if (!element) continue;
-
-    const clone = element.cloneNode(true) as Element;
-    for (const removeSelector of REMOVE_SELECTORS) {
-      for (const node of clone.querySelectorAll(removeSelector)) {
-        node.remove();
-      }
-    }
-
-    const html = sanitizeHtml(clone.innerHTML);
-    const textContent = extractTextFromHtml(html);
-    if (textContent.length < MIN_CONTENT_LENGTH) continue;
-
-    return {
-      title: extractTitle(document),
-      byline: null,
-      excerpt: generateExcerpt(null, textContent),
-      published: extractPublishedTime(document),
-      html,
-      textContent,
-      method: `selector:${selector}`,
-    };
-  }
-
-  return null;
-}
-
-function tryBodyExtraction(document: Document): ExtractionCandidate | null {
-  const body = document.body;
-  if (!body) return null;
-
-  const clone = body.cloneNode(true) as Element;
-  for (const removeSelector of REMOVE_SELECTORS) {
-    for (const node of clone.querySelectorAll(removeSelector)) {
-      node.remove();
-    }
-  }
-
-  const html = sanitizeHtml(clone.innerHTML);
-  const textContent = extractTextFromHtml(html);
-  if (!textContent) return null;
-
-  return {
-    title: extractTitle(document),
-    byline: null,
-    excerpt: generateExcerpt(null, textContent),
-    published: extractPublishedTime(document),
-    html,
-    textContent,
-    method: "body-fallback",
-  };
-}
-
-function pickBestCandidate(candidates: ExtractionCandidate[]): ExtractionCandidate | null {
-  if (candidates.length === 0) return null;
-
-  const methodOrder = [
-    "readability",
-    "readability-relaxed",
-    "next-data",
-    "json-ld",
-    "selector:",
-    "body-fallback",
-  ];
-
-  function methodRank(method: string): number {
-    const idx = methodOrder.findIndex((entry) =>
-      entry.endsWith(":") ? method.startsWith(entry) : method === entry
-    );
-    return idx === -1 ? methodOrder.length : idx;
-  }
-
-  const ranked = [...candidates].sort((a, b) => {
-    const rankA = methodRank(a.method);
-    const rankB = methodRank(b.method);
-    if (rankA !== rankB) return rankA - rankB;
-    return (b.textContent.length ?? 0) - (a.textContent.length ?? 0);
-  });
-
-  for (const candidate of ranked) {
-    if (candidate.textContent.length >= GOOD_CONTENT_LENGTH) {
-      return candidate;
-    }
-  }
-
-  for (const candidate of ranked) {
-    if (candidate.textContent.length >= MIN_CONTENT_LENGTH) {
-      return candidate;
-    }
-  }
-
-  return ranked[0];
-}
-
-function extractFromHtml(html: string): ExtractionCandidate | null {
-  const document = parseDocument(html);
-
-  const readabilityCandidate = tryReadability(document);
-  const nextDataCandidate = tryNextDataExtraction(document);
-  const jsonLdCandidate = tryJsonLdExtraction(document);
-  const selectorCandidate = trySelectorExtraction(document);
-  const bodyCandidate = tryBodyExtraction(document);
-
-  const candidates = [
-    readabilityCandidate,
-    nextDataCandidate,
-    jsonLdCandidate,
-    selectorCandidate,
-    bodyCandidate,
-  ].filter((candidate): candidate is ExtractionCandidate => Boolean(candidate));
-
-  const winner = pickBestCandidate(candidates);
-  if (!winner) return null;
-
-  return {
-    ...winner,
-    title: winner.title ?? extractTitle(document),
-    published: winner.published ?? extractPublishedTime(document),
-    excerpt: winner.excerpt ?? generateExcerpt(null, winner.textContent),
-  };
-}
-
-const turndown = new TurndownService({
-  headingStyle: "atx",
-  hr: "---",
-  bulletListMarker: "-",
-  codeBlockStyle: "fenced",
-  emDelimiter: "*",
-  strongDelimiter: "**",
-  linkStyle: "inlined",
-});
-
-turndown.use(gfm);
-turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]);
-
-turndown.addRule("collapseFigure", {
-  filter: "figure",
-  replacement(content) {
-    return `\n\n${content.trim()}\n\n`;
-  },
-});
-
-turndown.addRule("dropInvisibleAnchors", {
-  filter(node) {
-    return node.nodeName === "A" && !(node as Element).textContent?.trim();
-  },
-  replacement() {
-    return "";
-  },
-});
-
-function convertHtmlToMarkdown(html: string): string {
-  if (!html || !html.trim()) return "";
-
-  try {
-    const sanitized = sanitizeHtml(html);
-    return turndown.turndown(sanitized);
-  } catch {
-    return "";
-  }
-}
-
-function fallbackPlainText(html: string): string {
-  const document = parseDocument(html);
-  for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
-    for (const el of document.querySelectorAll(selector)) {
-      el.remove();
-    }
-  }
-  const text = document.body?.textContent ?? document.documentElement?.textContent ?? "";
-  return normalizeMarkdown(text.replace(/\s+/g, " "));
-}
-
-function countBylines(markdown: string): number {
-  return (markdown.match(/(^|\n)By\s+/g) || []).length;
-}
-
-function countUsefulParagraphs(markdown: string): number {
-  const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/);
-  let count = 0;
-
-  for (const paragraph of paragraphs) {
-    const trimmed = paragraph.trim();
-    if (!trimmed) continue;
-    if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) continue;
-    if (/^#{1,6}\s+/.test(trimmed)) continue;
-    if ((trimmed.match(/\b[\p{L}\p{N}']+\b/gu) || []).length < 8) continue;
-    count++;
-  }
-
-  return count;
-}
-
-function countMarkerHits(markdown: string, markers: RegExp[]): number {
-  let hits = 0;
-  for (const marker of markers) {
-    if (marker.test(markdown)) hits++;
-  }
-  return hits;
-}
-
-function scoreMarkdownQuality(markdown: string): number {
-  const normalized = normalizeMarkdown(markdown);
-  const wordCount = (normalized.match(/\b[\p{L}\p{N}']+\b/gu) || []).length;
-  const usefulParagraphs = countUsefulParagraphs(normalized);
-  const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length;
-  const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS);
-  const bylineCount = countBylines(normalized);
-  const staffCount = (normalized.match(/\bForbes Staff\b/gi) || []).length;
-
-  return (
-    Math.min(wordCount, 4000) +
-    usefulParagraphs * 40 +
-    headingCount * 10 -
-    markerHits * 180 -
-    Math.max(0, bylineCount - 1) * 120 -
-    Math.max(0, staffCount - 1) * 80
-  );
-}
-
-function shouldCompareWithLegacy(markdown: string): boolean {
-  const normalized = normalizeMarkdown(markdown);
-  return (
-    countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0 ||
-    countBylines(normalized) > 1 ||
-    countUsefulParagraphs(normalized) < 6
-  );
-}
-
-function isMarkdownUsable(markdown: string, html: string): boolean {
-  const normalized = normalizeMarkdown(markdown);
-  if (!normalized) return false;
-
-  const htmlTextLength = extractTextFromHtml(html).length;
-  if (htmlTextLength < MIN_CONTENT_LENGTH) return true;
-
-  if (normalized.length >= 80) return true;
-  return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2));
-}
-
-async function tryDefuddleConversion(
-  html: string,
-  url: string,
-  baseMetadata: PageMetadata
-): Promise<{ ok: true; result: ConversionResult } | { ok: false; reason: string }> {
-  try {
-    const [{ JSDOM, VirtualConsole }, { Defuddle }] = await Promise.all([
-      import("jsdom"),
-      import("defuddle/node"),
-    ]);
-
-    const virtualConsole = new VirtualConsole();
-    virtualConsole.on("jsdomError", (error: Error & { type?: string }) => {
-      if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) {
-        return;
-      }
-      console.warn(`[url-to-markdown] jsdom: ${error.message}`);
-    });
-
-    const dom = new JSDOM(html, { url, virtualConsole });
-    const result = await Defuddle(dom, url, { markdown: true });
-    const markdown = normalizeMarkdown(result.content || "");
-
-    if (!isMarkdownUsable(markdown, html)) {
-      return { ok: false, reason: "Defuddle returned empty or incomplete markdown" };
-    }
-
-    return {
-      ok: true,
-      result: {
-        metadata: {
-          ...baseMetadata,
-          title: pickString(result.title, baseMetadata.title) ?? "",
-          description: pickString(result.description, baseMetadata.description) ?? undefined,
-          author: pickString(result.author, baseMetadata.author) ?? undefined,
-          published: pickString(result.published, baseMetadata.published) ?? undefined,
-          coverImage: pickString(result.image, baseMetadata.coverImage) ?? undefined,
-        },
-        markdown,
-        rawHtml: html,
-        conversionMethod: "defuddle",
-      },
-    };
-  } catch (error) {
-    return {
-      ok: false,
-      reason: error instanceof Error ? error.message : String(error),
-    };
-  }
-}
-
-function convertWithLegacyExtractor(html: string, baseMetadata: PageMetadata): ConversionResult {
-  const extracted = extractFromHtml(html);
-
-  let markdown = extracted?.html ? convertHtmlToMarkdown(extracted.html) : "";
-  if (!markdown.trim()) {
-    markdown = extracted?.textContent?.trim() || fallbackPlainText(html);
-  }
-
-  return {
-    metadata: {
-      ...baseMetadata,
-      title: pickString(extracted?.title, baseMetadata.title) ?? "",
-      description: pickString(extracted?.excerpt, baseMetadata.description) ?? undefined,
-      author: pickString(extracted?.byline, baseMetadata.author) ?? undefined,
-      published: pickString(extracted?.published, baseMetadata.published) ?? undefined,
-    },
-    markdown: normalizeMarkdown(markdown),
-    rawHtml: html,
-    conversionMethod: extracted ? `legacy:${extracted.method}` : "legacy:plain-text",
-  };
+  return /^##?\s+transcript\b/im.test(result.markdown);
 }
 
 export async function extractContent(html: string, url: string): Promise<ConversionResult> {
@@ -914,6 +107,10 @@ export async function extractContent(html: string, url: string): Promise<Convers
 
   const defuddleResult = await tryDefuddleConversion(html, url, baseMetadata);
   if (defuddleResult.ok) {
+    if (shouldPreferDefuddle(defuddleResult.result)) {
+      return defuddleResult.result;
+    }
+
     if (shouldCompareWithLegacy(defuddleResult.result.markdown)) {
       const legacyResult = convertWithLegacyExtractor(html, baseMetadata);
       const legacyScore = scoreMarkdownQuality(legacyResult.markdown);
@@ -936,29 +133,3 @@ export async function extractContent(html: string, url: string): Promise<Convers
     fallbackReason: defuddleResult.reason,
   };
 }
-
-function escapeYamlValue(value: string): string {
-  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\r?\n/g, "\\n");
-}
-
-export function formatMetadataYaml(meta: PageMetadata): string {
-  const lines = ["---"];
-  lines.push(`url: ${meta.url}`);
-  lines.push(`title: "${escapeYamlValue(meta.title)}"`);
-  if (meta.description) lines.push(`description: "${escapeYamlValue(meta.description)}"`);
-  if (meta.author) lines.push(`author: "${escapeYamlValue(meta.author)}"`);
-  if (meta.published) lines.push(`published: "${escapeYamlValue(meta.published)}"`);
-  if (meta.coverImage) lines.push(`coverImage: "${escapeYamlValue(meta.coverImage)}"`);
-  lines.push(`captured_at: "${escapeYamlValue(meta.captured_at)}"`);
-  lines.push("---");
-  return lines.join("\n");
-}
-
-export function createMarkdownDocument(result: ConversionResult): string {
-  const yaml = formatMetadataYaml(result.metadata);
-  const escapedTitle = result.metadata.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
-  const titleRegex = new RegExp(`^#\\s+${escapedTitle}\\s*(\\n|$)`, "i");
-  const hasTitle = titleRegex.test(result.markdown.trimStart());
-  const title = result.metadata.title && !hasTitle ? `\n\n# ${result.metadata.title}\n\n` : "\n\n";
-  return yaml + title + result.markdown;
-}
diff --git a/skills/baoyu-url-to-markdown/scripts/legacy-converter.ts b/skills/baoyu-url-to-markdown/scripts/legacy-converter.ts
new file mode 100644
index 0000000..bb4612c
--- /dev/null
+++ b/skills/baoyu-url-to-markdown/scripts/legacy-converter.ts
@@ -0,0 +1,629 @@
+import { Readability } from "@mozilla/readability";
+import TurndownService from "turndown";
+import { gfm } from "turndown-plugin-gfm";
+
+import {
+  type AnyRecord,
+  type ConversionResult,
+  type PageMetadata,
+  GOOD_CONTENT_LENGTH,
+  MIN_CONTENT_LENGTH,
+  extractPublishedTime,
+  extractTextFromHtml,
+  extractTitle,
+  normalizeMarkdown,
+  parseDocument,
+  pickString,
+  sanitizeHtml,
+} from "./markdown-conversion-shared.js";
+
+interface ExtractionCandidate {
+  title: string | null;
+  byline: string | null;
+  excerpt: string | null;
+  published: string | null;
+  html: string | null;
+  textContent: string;
+  method: string;
+}
+
+const CONTENT_SELECTORS = [
+  "article",
+  "main article",
+  "[role='main'] article",
+  "[itemprop='articleBody']",
+  ".article-content",
+  ".article-body",
+  ".post-content",
+  ".entry-content",
+  ".story-body",
+  "main",
+  "[role='main']",
+  "#content",
+  ".content",
+];
+
+const REMOVE_SELECTORS = [
+  "script",
+  "style",
+  "noscript",
+  "template",
+  "iframe",
+  "svg",
+  "path",
+  "nav",
+  "aside",
+  "footer",
+  "header",
+  "form",
+  ".advertisement",
+  ".ads",
+  ".social-share",
+  ".related-articles",
+  ".comments",
+  ".newsletter",
+  ".cookie-banner",
+  ".cookie-consent",
+  "[role='navigation']",
+  "[aria-label*='cookie' i]",
+];
+
+const NEXT_DATA_CONTENT_PATHS = [
+  "props.pageProps.content.body",
+  "props.pageProps.article.body",
+  "props.pageProps.article.content",
+  "props.pageProps.post.body",
+  "props.pageProps.post.content",
+  "props.pageProps.data.body",
+  "props.pageProps.story.body.content",
+];
+
+const LOW_QUALITY_MARKERS = [
+  /Join The Conversation/i,
+  /One Community\. Many Voices/i,
+  /Read our community guidelines/i,
+  /Create a free account to share your thoughts/i,
+  /Become a Forbes Member/i,
+  /Subscribe to trusted journalism/i,
+  /\bComments\b/i,
+];
+
+function generateExcerpt(excerpt: string | null, textContent: string | null): string | null {
+  if (excerpt) return excerpt;
+  if (!textContent) return null;
+  const trimmed = textContent.trim();
+  if (!trimmed) return null;
+  return trimmed.length > 200 ? `${trimmed.slice(0, 200)}...` : trimmed;
+}
+
+function parseJsonLdItem(item: AnyRecord): ExtractionCandidate | null {
+  const type = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"];
+  if (typeof type !== "string" || !["Article", "NewsArticle", "BlogPosting", "WebPage", "ReportageNewsArticle"].includes(type)) {
+    return null;
+  }
+
+  const rawContent =
+    (typeof item.articleBody === "string" && item.articleBody) ||
+    (typeof item.text === "string" && item.text) ||
+    (typeof item.description === "string" && item.description) ||
+    null;
+
+  if (!rawContent) return null;
+
+  const content = rawContent.trim();
+  const htmlLike = /<\/?[a-z][\s\S]*>/i.test(content);
+  const textContent = htmlLike ? extractTextFromHtml(content) : content;
+
+  if (textContent.length < MIN_CONTENT_LENGTH) return null;
+
+  return {
+    title: pickString(item.headline, item.name),
+    byline: extractAuthorFromJsonLd(item.author),
+    excerpt: pickString(item.description),
+    published: pickString(item.datePublished, item.dateCreated),
+    html: htmlLike ? content : null,
+    textContent,
+    method: "json-ld",
+  };
+}
+
+function extractAuthorFromJsonLd(authorData: unknown): string | null {
+  if (typeof authorData === "string") return authorData;
+  if (!authorData || typeof authorData !== "object") return null;
+
+  if (Array.isArray(authorData)) {
+    const names = authorData
+      .map((author) => extractAuthorFromJsonLd(author))
+      .filter((name): name is string => Boolean(name));
+    return names.length > 0 ? names.join(", ") : null;
+  }
+
+  const author = authorData as AnyRecord;
+  return typeof author.name === "string" ? author.name : null;
+}
+
+function flattenJsonLdItems(data: unknown): AnyRecord[] {
+  if (!data || typeof data !== "object") return [];
+  if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems);
+
+  const item = data as AnyRecord;
+  if (Array.isArray(item["@graph"])) {
+    return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
+  }
+
+  return [item];
+}
+
+function tryJsonLdExtraction(document: Document): ExtractionCandidate | null {
+  const scripts = document.querySelectorAll("script[type='application/ld+json']");
+
+  for (const script of scripts) {
+    try {
+      const data = JSON.parse(script.textContent ?? "");
+      for (const item of flattenJsonLdItems(data)) {
+        const extracted = parseJsonLdItem(item);
+        if (extracted) return extracted;
+      }
+    } catch {
+      // Ignore malformed blocks.
+    }
+  }
+
+  return null;
+}
+
+function getByPath(value: unknown, path: string): unknown {
+  let current = value;
+  for (const part of path.split(".")) {
+    if (!current || typeof current !== "object") return undefined;
+    current = (current as AnyRecord)[part];
+  }
+  return current;
+}
+
+function isContentBlockArray(value: unknown): value is AnyRecord[] {
+  if (!Array.isArray(value) || value.length === 0) return false;
+  return value.slice(0, 5).some((item) => {
+    if (!item || typeof item !== "object") return false;
+    const obj = item as AnyRecord;
+    return "type" in obj || "text" in obj || "textHtml" in obj || "content" in obj;
+  });
+}
+
+function extractTextFromContentBlocks(blocks: AnyRecord[]): string {
+  const parts: string[] = [];
+
+  function pushParagraph(text: string): void {
+    const trimmed = text.trim();
+    if (!trimmed) return;
+    parts.push(trimmed, "\n\n");
+  }
+
+  function walk(node: unknown): void {
+    if (!node || typeof node !== "object") return;
+    const block = node as AnyRecord;
+
+    if (typeof block.text === "string") {
+      pushParagraph(block.text);
+      return;
+    }
+
+    if (typeof block.textHtml === "string") {
+      pushParagraph(extractTextFromHtml(block.textHtml));
+      return;
+    }
+
+    if (Array.isArray(block.items)) {
+      for (const item of block.items) {
+        if (item && typeof item === "object") {
+          const text = pickString((item as AnyRecord).text);
+          if (text) parts.push(`- ${text}\n`);
+        }
+      }
+      parts.push("\n");
+    }
+
+    if (Array.isArray(block.components)) {
+      for (const component of block.components) {
+        walk(component);
+      }
+    }
+
+    if (Array.isArray(block.content)) {
+      for (const child of block.content) {
+        walk(child);
+      }
+    }
+  }
+
+  for (const block of blocks) {
+    walk(block);
+  }
+
+  return parts.join("").replace(/\n{3,}/g, "\n\n").trim();
+}
+
+function tryStringBodyExtraction(
+  content: string,
+  meta: AnyRecord,
+  document: Document,
+  method: string
+): ExtractionCandidate | null {
+  if (!content || content.length < MIN_CONTENT_LENGTH) return null;
+
+  const isHtml = /<\/?[a-z][\s\S]*>/i.test(content);
+  const html = isHtml ? sanitizeHtml(content) : null;
+  const textContent = isHtml ? extractTextFromHtml(html) : content.trim();
+
+  if (textContent.length < MIN_CONTENT_LENGTH) return null;
+
+  return {
+    title: pickString(meta.headline, meta.title, extractTitle(document)),
+    byline: pickString(meta.byline, meta.author),
+    excerpt: pickString(meta.description, meta.excerpt, generateExcerpt(null, textContent)),
+    published: pickString(meta.datePublished, meta.publishedAt, extractPublishedTime(document)),
+    html,
+    textContent,
+    method,
+  };
+}
+
+function tryNextDataExtraction(document: Document): ExtractionCandidate | null {
+  try {
+    const script = document.querySelector("script#__NEXT_DATA__");
+    if (!script?.textContent) return null;
+
+    const data = JSON.parse(script.textContent) as AnyRecord;
+    const pageProps = (getByPath(data, "props.pageProps") ?? {}) as AnyRecord;
+
+    for (const path of NEXT_DATA_CONTENT_PATHS) {
+      const value = getByPath(data, path);
+
+      if (typeof value === "string") {
+        const parentPath = path.split(".").slice(0, -1).join(".");
+        const parent = (getByPath(data, parentPath) ?? {}) as AnyRecord;
+        const meta = {
+          ...pageProps,
+          ...parent,
+          title: parent.title ?? (pageProps.title as string | undefined),
+        };
+
+        const candidate = tryStringBodyExtraction(value, meta, document, "next-data");
+        if (candidate) return candidate;
+      }
+
+      if (isContentBlockArray(value)) {
+        const textContent = extractTextFromContentBlocks(value);
+        if (textContent.length < MIN_CONTENT_LENGTH) continue;
+
+        return {
+          title: pickString(
+            getByPath(data, "props.pageProps.content.headline"),
+            getByPath(data, "props.pageProps.article.headline"),
+            getByPath(data, "props.pageProps.article.title"),
+            getByPath(data, "props.pageProps.post.title"),
+            pageProps.title,
+            extractTitle(document)
+          ),
+          byline: pickString(
+            getByPath(data, "props.pageProps.author.name"),
+            getByPath(data, "props.pageProps.article.author.name")
+          ),
+          excerpt: pickString(
+            getByPath(data, "props.pageProps.content.description"),
+            getByPath(data, "props.pageProps.article.description"),
+            pageProps.description,
+            generateExcerpt(null, textContent)
+          ),
+          published: pickString(
+            getByPath(data, "props.pageProps.content.datePublished"),
+            getByPath(data, "props.pageProps.article.datePublished"),
+            getByPath(data, "props.pageProps.publishedAt"),
+            extractPublishedTime(document)
+          ),
+          html: null,
+          textContent,
+          method: "next-data",
+        };
+      }
+    }
+  } catch {
+    return null;
+  }
+
+  return null;
+}
+
+function buildReadabilityCandidate(
+  article: ReturnType<Readability["parse"]>,
+  document: Document,
+  method: string
+): ExtractionCandidate | null {
+  const textContent = article?.textContent?.trim() ?? "";
+  if (textContent.length < MIN_CONTENT_LENGTH) return null;
+
+  return {
+    title: pickString(article?.title, extractTitle(document)),
+    byline: pickString((article as { byline?: string } | null)?.byline),
+    excerpt: pickString(article?.excerpt, generateExcerpt(null, textContent)),
+    published: pickString((article as { publishedTime?: string } | null)?.publishedTime, extractPublishedTime(document)),
+    html: article?.content ? sanitizeHtml(article.content) : null,
+    textContent,
+    method,
+  };
+}
+
+function tryReadability(document: Document): ExtractionCandidate | null {
+  try {
+    const strictClone = document.cloneNode(true) as Document;
+    const strictResult = buildReadabilityCandidate(
+      new Readability(strictClone).parse(),
+      document,
+      "readability"
+    );
+    if (strictResult) return strictResult;
+
+    const relaxedClone = document.cloneNode(true) as Document;
+    return buildReadabilityCandidate(
+      new Readability(relaxedClone, { charThreshold: 120 }).parse(),
+      document,
+      "readability-relaxed"
+    );
+  } catch {
+    return null;
+  }
+}
+
+function trySelectorExtraction(document: Document): ExtractionCandidate | null {
+  for (const selector of CONTENT_SELECTORS) {
+    const element = document.querySelector(selector);
+    if (!element) continue;
+
+    const clone = element.cloneNode(true) as Element;
+    for (const removeSelector of REMOVE_SELECTORS) {
+      for (const node of clone.querySelectorAll(removeSelector)) {
+        node.remove();
+      }
+    }
+
+    const html = sanitizeHtml(clone.innerHTML);
+    const textContent = extractTextFromHtml(html);
+    if (textContent.length < MIN_CONTENT_LENGTH) continue;
+
+    return {
+      title: extractTitle(document),
+      byline: null,
+      excerpt: generateExcerpt(null, textContent),
+      published: extractPublishedTime(document),
+      html,
+      textContent,
+      method: `selector:${selector}`,
+    };
+  }
+
+  return null;
+}
+
+function tryBodyExtraction(document: Document): ExtractionCandidate | null {
+  const body = document.body;
+  if (!body) return null;
+
+  const clone = body.cloneNode(true) as Element;
+  for (const removeSelector of REMOVE_SELECTORS) {
+    for (const node of clone.querySelectorAll(removeSelector)) {
+      node.remove();
+    }
+  }
+
+  const html = sanitizeHtml(clone.innerHTML);
+  const textContent = extractTextFromHtml(html);
+  if (!textContent) return null;
+
+  return {
+    title: extractTitle(document),
+    byline: null,
+    excerpt: generateExcerpt(null, textContent),
+    published: extractPublishedTime(document),
+    html,
+    textContent,
+    method: "body-fallback",
+  };
+}
+
+function pickBestCandidate(candidates: ExtractionCandidate[]): ExtractionCandidate | null {
+  if (candidates.length === 0) return null;
+
+  const methodOrder = [
+    "readability",
+    "readability-relaxed",
+    "next-data",
+    "json-ld",
+    "selector:",
+    "body-fallback",
+  ];
+
+  function methodRank(method: string): number {
+    const idx = methodOrder.findIndex((entry) =>
+      entry.endsWith(":") ? method.startsWith(entry) : method === entry
+    );
+    return idx === -1 ? methodOrder.length : idx;
+  }
+
+  const ranked = [...candidates].sort((a, b) => {
+    const rankA = methodRank(a.method);
+    const rankB = methodRank(b.method);
+    if (rankA !== rankB) return rankA - rankB;
+    return (b.textContent.length ?? 0) - (a.textContent.length ?? 0);
+  });
+
+  for (const candidate of ranked) {
+    if (candidate.textContent.length >= GOOD_CONTENT_LENGTH) {
+      return candidate;
+    }
+  }
+
+  for (const candidate of ranked) {
+    if (candidate.textContent.length >= MIN_CONTENT_LENGTH) {
+      return candidate;
+    }
+  }
+
+  return ranked[0];
+}
+
+function extractFromHtml(html: string): ExtractionCandidate | null {
+  const document = parseDocument(html);
+
+  const readabilityCandidate = tryReadability(document);
+  const nextDataCandidate = tryNextDataExtraction(document);
+  const jsonLdCandidate = tryJsonLdExtraction(document);
+  const selectorCandidate = trySelectorExtraction(document);
+  const bodyCandidate = tryBodyExtraction(document);
+
+  const candidates = [
+    readabilityCandidate,
+    nextDataCandidate,
+    jsonLdCandidate,
+    selectorCandidate,
+    bodyCandidate,
+  ].filter((candidate): candidate is ExtractionCandidate => Boolean(candidate));
+
+  const winner = pickBestCandidate(candidates);
+  if (!winner) return null;
+
+  return {
+    ...winner,
+    title: winner.title ?? extractTitle(document),
+    published: winner.published ?? extractPublishedTime(document),
+    excerpt: winner.excerpt ?? generateExcerpt(null, winner.textContent),
+  };
+}
+
+const turndown = new TurndownService({
+  headingStyle: "atx",
+  hr: "---",
+  bulletListMarker: "-",
+  codeBlockStyle: "fenced",
+  emDelimiter: "*",
+  strongDelimiter: "**",
+  linkStyle: "inlined",
+});
+
+turndown.use(gfm);
+turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]);
+
+turndown.addRule("collapseFigure", {
+  filter: "figure",
+  replacement(content) {
+    return `\n\n${content.trim()}\n\n`;
+  },
+});
+
+turndown.addRule("dropInvisibleAnchors", {
+  filter(node) {
+    return node.nodeName === "A" && !(node as Element).textContent?.trim();
+  },
+  replacement() {
+    return "";
+  },
+});
+
+function convertHtmlToMarkdown(html: string): string {
+  if (!html || !html.trim()) return "";
+
+  try {
+    const sanitized = sanitizeHtml(html);
+    return turndown.turndown(sanitized);
+  } catch {
+    return "";
+  }
+}
+
+function fallbackPlainText(html: string): string {
+  const document = parseDocument(html);
+  for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
+    for (const el of document.querySelectorAll(selector)) {
+      el.remove();
+    }
+  }
+  const text = document.body?.textContent ?? document.documentElement?.textContent ?? "";
+  return normalizeMarkdown(text.replace(/\s+/g, " "));
+}
+
+function countBylines(markdown: string): number {
+  return (markdown.match(/(^|\n)By\s+/g) || []).length;
+}
+
+function countUsefulParagraphs(markdown: string): number {
+  const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/);
+  let count = 0;
+
+  for (const paragraph of paragraphs) {
+    const trimmed = paragraph.trim();
+    if (!trimmed) continue;
+    if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) continue;
+    if (/^#{1,6}\s+/.test(trimmed)) continue;
+    if ((trimmed.match(/\b[\p{L}\p{N}']+\b/gu) || []).length < 8) continue;
+    count++;
+  }
+
+  return count;
+}
+
+function countMarkerHits(markdown: string, markers: RegExp[]): number {
+  let hits = 0;
+  for (const marker of markers) {
+    if (marker.test(markdown)) hits++;
+  }
+  return hits;
+}
+
+export function scoreMarkdownQuality(markdown: string): number {
+  const normalized = normalizeMarkdown(markdown);
+  const wordCount = (normalized.match(/\b[\p{L}\p{N}']+\b/gu) || []).length;
+  const usefulParagraphs = countUsefulParagraphs(normalized);
+  const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length;
+  const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS);
+  const bylineCount = countBylines(normalized);
+  const staffCount = (normalized.match(/\bForbes Staff\b/gi) || []).length;
+
+  return (
+    Math.min(wordCount, 4000) +
+    usefulParagraphs * 40 +
+    headingCount * 10 -
+    markerHits * 180 -
+    Math.max(0, bylineCount - 1) * 120 -
+    Math.max(0, staffCount - 1) * 80
+  );
+}
+
+export function shouldCompareWithLegacy(markdown: string): boolean {
+  const normalized = normalizeMarkdown(markdown);
+  return (
+    countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0 ||
+    countBylines(normalized) > 1 ||
+    countUsefulParagraphs(normalized) < 6
+  );
+}
+
+export function convertWithLegacyExtractor(html: string, baseMetadata: PageMetadata): ConversionResult {
+  const extracted = extractFromHtml(html);
+
+  let markdown = extracted?.html ? convertHtmlToMarkdown(extracted.html) : "";
+  if (!markdown.trim()) {
+    markdown = extracted?.textContent?.trim() || fallbackPlainText(html);
+  }
+
+  return {
+    metadata: {
+      ...baseMetadata,
+      title: pickString(extracted?.title, baseMetadata.title) ?? "",
+      description: pickString(extracted?.excerpt, baseMetadata.description) ?? undefined,
+      author: pickString(extracted?.byline, baseMetadata.author) ?? undefined,
+      published: pickString(extracted?.published, baseMetadata.published) ?? undefined,
+    },
+    markdown: normalizeMarkdown(markdown),
+    rawHtml: html,
+    conversionMethod: extracted ? `legacy:${extracted.method}` : "legacy:plain-text",
+  };
+}
diff --git a/skills/baoyu-url-to-markdown/scripts/main.ts b/skills/baoyu-url-to-markdown/scripts/main.ts
index 31d948f..b246c84 100644
--- a/skills/baoyu-url-to-markdown/scripts/main.ts
+++ b/skills/baoyu-url-to-markdown/scripts/main.ts
@@ -75,6 +75,55 @@ function deriveHtmlSnapshotPath(markdownPath: string): string {
   return path.join(parsed.dir, `${basename}-captured.html`);
 }
 
+function extractTitleFromMarkdownDocument(document: string): string {
+  const normalized = document.replace(/\r\n/g, "\n");
+  const frontmatterMatch = normalized.match(/^---\n([\s\S]*?)\n---\n?/);
+  if (frontmatterMatch) {
+    const titleLine = frontmatterMatch[1]
+      .split("\n")
+      .find((line) => /^title:\s*/i.test(line));
+
+    if (titleLine) {
+      const rawValue = titleLine.replace(/^title:\s*/i, "").trim();
+      const unquoted = rawValue
+        .replace(/^"(.*)"$/, "$1")
+        .replace(/^'(.*)'$/, "$1")
+        .replace(/\\"/g, '"');
+      if (unquoted) return unquoted;
+    }
+  }
+
+  const headingMatch = normalized.match(/^#\s+(.+)$/m);
+  return headingMatch?.[1]?.trim() ?? "";
+}
+
+function buildDefuddleApiUrl(targetUrl: string): string {
+  return `https://defuddle.md/${encodeURIComponent(targetUrl)}`;
+}
+
+async function fetchDefuddleApiMarkdown(targetUrl: string): Promise<{ markdown: string; title: string }> {
+  const apiUrl = buildDefuddleApiUrl(targetUrl);
+  const response = await fetch(apiUrl, {
+    headers: {
+      accept: "text/markdown,text/plain;q=0.9,*/*;q=0.1",
+    },
+  });
+
+  if (!response.ok) {
+    throw new Error(`defuddle.md returned ${response.status} ${response.statusText}`);
+  }
+
+  const markdown = (await response.text()).replace(/\r\n/g, "\n").trim();
+  if (!markdown) {
+    throw new Error("defuddle.md returned empty markdown");
+  }
+
+  return {
+    markdown,
+    title: extractTitleFromMarkdownDocument(markdown),
+  };
+}
+
 async function generateOutputPath(url: string, title: string, outputDir?: string): Promise<string> {
   const domain = new URL(url).hostname.replace(/^www\./, "");
   const slug = generateSlug(title, url);
@@ -192,14 +241,41 @@ async function main(): Promise<void> {
   console.log(`Fetching: ${args.url}`);
   console.log(`Mode: ${args.wait ? "wait" : "auto"}`);
 
-  const result = await captureUrl(args);
-  const outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir);
-  const outputDir = path.dirname(outputPath);
-  const htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath);
-  await mkdir(outputDir, { recursive: true });
-  await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8");
+  let outputPath: string;
+  let htmlSnapshotPath: string | null = null;
+  let document: string;
+  let conversionMethod: string;
+  let fallbackReason: string | undefined;
 
-  let document = createMarkdownDocument(result);
+  try {
+    const result = await captureUrl(args);
+    outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir);
+    const outputDir = path.dirname(outputPath);
+    htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath);
+    await mkdir(outputDir, { recursive: true });
+    await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8");
+
+    document = createMarkdownDocument(result);
+    conversionMethod = result.conversionMethod;
+    fallbackReason = result.fallbackReason;
+  } catch (error) {
+    const primaryError = error instanceof Error ? error.message : String(error);
+    console.warn(`Primary capture failed: ${primaryError}`);
+    console.warn("Trying defuddle.md API fallback...");
+
+    try {
+      const remoteResult = await fetchDefuddleApiMarkdown(args.url);
+      outputPath = args.output || await generateOutputPath(args.url, remoteResult.title, args.outputDir);
+      await mkdir(path.dirname(outputPath), { recursive: true });
+
+      document = remoteResult.markdown;
+      conversionMethod = "defuddle-api";
+      fallbackReason = `Local browser capture failed: ${primaryError}`;
+    } catch (remoteError) {
+      const remoteMessage = remoteError instanceof Error ? remoteError.message : String(remoteError);
+      throw new Error(`Local browser capture failed (${primaryError}); defuddle.md fallback failed (${remoteMessage})`);
+    }
+  }
 
   if (args.downloadMedia) {
     const mediaResult = await localizeMarkdownMedia(document, {
@@ -220,11 +296,15 @@ async function main(): Promise<void> {
   await writeFile(outputPath, document, "utf-8");
 
   console.log(`Saved: ${outputPath}`);
-  console.log(`Saved HTML: ${htmlSnapshotPath}`);
-  console.log(`Title: ${result.metadata.title || "(no title)"}`);
-  console.log(`Converter: ${result.conversionMethod}`);
-  if (result.fallbackReason) {
-    console.warn(`Fallback used: ${result.fallbackReason}`);
+  if (htmlSnapshotPath) {
+    console.log(`Saved HTML: ${htmlSnapshotPath}`);
+  } else {
+    console.log("Saved HTML: unavailable (defuddle.md fallback)");
+  }
+  console.log(`Title: ${extractTitleFromMarkdownDocument(document) || "(no title)"}`);
+  console.log(`Converter: ${conversionMethod}`);
+  if (fallbackReason) {
+    console.warn(`Fallback used: ${fallbackReason}`);
   }
 }
 
diff --git a/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts b/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts
new file mode 100644
index 0000000..25ed29b
--- /dev/null
+++ b/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts
@@ -0,0 +1,305 @@
+import { parseHTML } from "linkedom";
+
+export interface PageMetadata {
+  url: string;
+  title: string;
+  description?: string;
+  author?: string;
+  published?: string;
+  coverImage?: string;
+  language?: string;
+  captured_at: string;
+}
+
+export interface ConversionResult {
+  metadata: PageMetadata;
+  markdown: string;
+  rawHtml: string;
+  conversionMethod: string;
+  fallbackReason?: string;
+  variables?: Record<string, string>;
+}
+
+export type AnyRecord = Record<string, unknown>;
+
+export const MIN_CONTENT_LENGTH = 120;
+export const GOOD_CONTENT_LENGTH = 900;
+
+const PUBLISHED_TIME_SELECTORS = [
+  "meta[property='article:published_time']",
+  "meta[name='pubdate']",
+  "meta[name='publishdate']",
+  "meta[name='date']",
+  "time[datetime]",
+];
+
+const ARTICLE_TYPES = new Set([
+  "Article",
+  "NewsArticle",
+  "BlogPosting",
+  "WebPage",
+  "ReportageNewsArticle",
+]);
+
+export function pickString(...values: unknown[]): string | null {
+  for (const value of values) {
+    if (typeof value === "string") {
+      const trimmed = value.trim();
+      if (trimmed) return trimmed;
+    }
+  }
+  return null;
+}
+
+export function normalizeMarkdown(markdown: string): string {
+  return markdown
+    .replace(/\r\n/g, "\n")
+    .replace(/[ \t]+\n/g, "\n")
+    .replace(/\n{3,}/g, "\n\n")
+    .trim();
+}
+
+export function parseDocument(html: string): Document {
+  const normalized = /<\s*html[\s>]/i.test(html)
+    ? html
+    : `<!doctype html><html><body>${html}</body></html>`;
+  return parseHTML(normalized).document as unknown as Document;
+}
+
+export function sanitizeHtml(html: string): string {
+  const { document } = parseHTML(`<div id="__root">${html}</div>`);
+  const root = document.querySelector("#__root");
+  if (!root) return html;
+
+  for (const selector of ["script", "style", "iframe", "noscript", "template", "svg", "path"]) {
+    for (const el of root.querySelectorAll(selector)) {
+      el.remove();
+    }
+  }
+
+  return root.innerHTML;
+}
+
+export function extractTextFromHtml(html: string): string {
+  const { document } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
+  for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
+    for (const el of document.querySelectorAll(selector)) {
+      el.remove();
+    }
+  }
+  return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? "";
+}
+
+export function getMetaContent(document: Document, names: string[]): string | null {
+  for (const name of names) {
+    const element =
+      document.querySelector(`meta[name="${name}"]`) ??
+      document.querySelector(`meta[property="${name}"]`);
+    const content = element?.getAttribute("content");
+    if (content && content.trim()) return content.trim();
+  }
+  return null;
+}
+
+function normalizeLanguageTag(value: string | null): string | null {
+  if (!value) return null;
+
+  const trimmed = value.trim();
+  if (!trimmed) return null;
+
+  const primary = trimmed.split(/[,\s;]/, 1)[0]?.trim();
+  if (!primary) return null;
+
+  return primary.replace(/_/g, "-");
+}
+
+function flattenJsonLdItems(data: unknown): AnyRecord[] {
+  if (!data || typeof data !== "object") return [];
+  if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems);
+
+  const item = data as AnyRecord;
+  if (Array.isArray(item["@graph"])) {
+    return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
+  }
+
+  return [item];
+}
+
+function parseJsonLdScripts(document: Document): AnyRecord[] {
+  const results: AnyRecord[] = [];
+  const scripts = document.querySelectorAll("script[type='application/ld+json']");
+
+  for (const script of scripts) {
+    try {
+      const data = JSON.parse(script.textContent ?? "");
+      results.push(...flattenJsonLdItems(data));
+    } catch {
+      // Ignore malformed blocks.
+    }
+  }
+
+  return results;
+}
+
+function isArticleType(item: AnyRecord): boolean {
+  const value = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"];
+  return typeof value === "string" && ARTICLE_TYPES.has(value);
+}
+
+function extractAuthorFromJsonLd(authorData: unknown): string | null {
+  if (typeof authorData === "string") return authorData;
+  if (!authorData || typeof authorData !== "object") return null;
+
+  if (Array.isArray(authorData)) {
+    const names = authorData
+      .map((author) => extractAuthorFromJsonLd(author))
+      .filter((name): name is string => Boolean(name));
+    return names.length > 0 ? names.join(", ") : null;
+  }
+
+  const author = authorData as AnyRecord;
+  return typeof author.name === "string" ? author.name : null;
+}
+
+function extractPrimaryJsonLdMeta(document: Document): Partial<PageMetadata> {
+  for (const item of parseJsonLdScripts(document)) {
+    if (!isArticleType(item)) continue;
+
+    return {
+      title: pickString(item.headline, item.name) ?? undefined,
+      description: pickString(item.description) ?? undefined,
+      author: extractAuthorFromJsonLd(item.author) ?? undefined,
+      published: pickString(item.datePublished, item.dateCreated) ?? undefined,
+      coverImage:
+        pickString(
+          item.image,
+          (item.image as AnyRecord | undefined)?.url,
+          (Array.isArray(item.image) ? item.image[0] : undefined) as unknown
+        ) ?? undefined,
+    };
+  }
+
+  return {};
+}
+
+export function extractPublishedTime(document: Document): string | null {
+  for (const selector of PUBLISHED_TIME_SELECTORS) {
+    const el = document.querySelector(selector);
+    if (!el) continue;
+    const value = el.getAttribute("content") ?? el.getAttribute("datetime");
+    if (value && value.trim()) return value.trim();
+  }
+  return null;
+}
+
+export function extractTitle(document: Document): string | null {
+  const ogTitle = document.querySelector("meta[property='og:title']")?.getAttribute("content");
+  if (ogTitle && ogTitle.trim()) return ogTitle.trim();
+
+  const twitterTitle = document.querySelector("meta[name='twitter:title']")?.getAttribute("content");
+  if (twitterTitle && twitterTitle.trim()) return twitterTitle.trim();
+
+  const title = document.querySelector("title")?.textContent?.trim();
+  if (title) {
+    const cleaned = title.split(/\s*[-|–—]\s*/)[0]?.trim();
+    if (cleaned) return cleaned;
+  }
+
+  const h1 = document.querySelector("h1")?.textContent?.trim();
+  return h1 || null;
+}
+
+export function extractMetadataFromHtml(html: string, url: string, capturedAt: string): PageMetadata {
+  const document = parseDocument(html);
+  const jsonLd = extractPrimaryJsonLdMeta(document);
+  const timeEl = document.querySelector("time[datetime]");
+  const htmlLang = normalizeLanguageTag(document.documentElement?.getAttribute("lang"));
+  const metaLanguage = normalizeLanguageTag(
+    pickString(
+      getMetaContent(document, ["language", "content-language", "og:locale"]),
+      document.querySelector("meta[http-equiv='content-language']")?.getAttribute("content")
+    )
+  );
+
+  return {
+    url,
+    title:
+      pickString(
+        getMetaContent(document, ["og:title", "twitter:title"]),
+        jsonLd.title,
+        document.querySelector("h1")?.textContent,
+        document.title
+      ) ?? "",
+    description:
+      pickString(
+        getMetaContent(document, ["description", "og:description", "twitter:description"]),
+        jsonLd.description
+      ) ?? undefined,
+    author:
+      pickString(
+        getMetaContent(document, ["author", "article:author", "twitter:creator"]),
+        jsonLd.author
+      ) ?? undefined,
+    published:
+      pickString(
+        timeEl?.getAttribute("datetime"),
+        getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]),
+        jsonLd.published,
+        extractPublishedTime(document)
+      ) ?? undefined,
+    coverImage:
+      pickString(
+        getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]),
+        jsonLd.coverImage
+      ) ?? undefined,
+    language: pickString(htmlLang, metaLanguage) ?? undefined,
+    captured_at: capturedAt,
+  };
+}
+
+export function isMarkdownUsable(markdown: string, html: string): boolean {
+  const normalized = normalizeMarkdown(markdown);
+  if (!normalized) return false;
+
+  const htmlTextLength = extractTextFromHtml(html).length;
+  if (htmlTextLength < MIN_CONTENT_LENGTH) return true;
+
+  if (normalized.length >= 80) return true;
+  return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2));
+}
+
+export function isYouTubeUrl(url: string): boolean {
+  try {
+    const hostname = new URL(url).hostname.toLowerCase();
+    return hostname === "youtu.be" || hostname.endsWith(".youtube.com") || hostname === "youtube.com";
+  } catch {
+    return false;
+  }
+}
+
+function escapeYamlValue(value: string): string {
+  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\r?\n/g, "\\n");
+}
+
+export function formatMetadataYaml(meta: PageMetadata): string {
+  const lines = ["---"];
+  lines.push(`url: ${meta.url}`);
+  lines.push(`title: "${escapeYamlValue(meta.title)}"`);
+  if (meta.description) lines.push(`description: "${escapeYamlValue(meta.description)}"`);
+  if (meta.author) lines.push(`author: "${escapeYamlValue(meta.author)}"`);
+  if (meta.published) lines.push(`published: "${escapeYamlValue(meta.published)}"`);
+  if (meta.coverImage) lines.push(`coverImage: "${escapeYamlValue(meta.coverImage)}"`);
+  if (meta.language) lines.push(`language: "${escapeYamlValue(meta.language)}"`);
+  lines.push(`captured_at: "${escapeYamlValue(meta.captured_at)}"`);
+  lines.push("---");
+  return lines.join("\n");
+}
+
+export function createMarkdownDocument(result: ConversionResult): string {
+  const yaml = formatMetadataYaml(result.metadata);
+  const escapedTitle = result.metadata.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
+  const titleRegex = new RegExp(`^#\\s+${escapedTitle}\\s*(\\n|$)`, "i");
+  const hasTitle = titleRegex.test(result.markdown.trimStart());
+  const title = result.metadata.title && !hasTitle ? `\n\n# ${result.metadata.title}\n\n` : "\n\n";
+  return yaml + title + result.markdown;
+}
diff --git a/skills/baoyu-url-to-markdown/scripts/package.json b/skills/baoyu-url-to-markdown/scripts/package.json
index e37fdae..ac8aac0 100644
--- a/skills/baoyu-url-to-markdown/scripts/package.json
+++ b/skills/baoyu-url-to-markdown/scripts/package.json
@@ -5,7 +5,7 @@
   "dependencies": {
     "@mozilla/readability": "^0.6.0",
     "baoyu-chrome-cdp": "file:./vendor/baoyu-chrome-cdp",
-    "defuddle": "^0.10.0",
+    "defuddle": "^0.12.0",
     "jsdom": "^24.1.3",
     "linkedom": "^0.18.12",
     "turndown": "^7.2.2",