From 0279fa403dcbd8b81e0176e8c0eb5ac52a023585 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20Liu=20=E5=AE=9D=E7=8E=89?= Date: Fri, 13 Mar 2026 00:22:03 -0500 Subject: [PATCH] feat(baoyu-url-to-markdown): add defuddle.md API fallback, YouTube transcripts, and modular converter architecture --- skills/baoyu-url-to-markdown/SKILL.md | 36 +- skills/baoyu-url-to-markdown/scripts/bun.lock | 4 +- .../scripts/defuddle-converter.ts | 58 ++ .../scripts/html-to-markdown.ts | 975 ++---------------- .../scripts/legacy-converter.ts | 629 +++++++++++ skills/baoyu-url-to-markdown/scripts/main.ts | 104 +- .../scripts/markdown-conversion-shared.ts | 305 ++++++ .../scripts/package.json | 2 +- 8 files changed, 1190 insertions(+), 923 deletions(-) create mode 100644 skills/baoyu-url-to-markdown/scripts/defuddle-converter.ts create mode 100644 skills/baoyu-url-to-markdown/scripts/legacy-converter.ts create mode 100644 skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts diff --git a/skills/baoyu-url-to-markdown/SKILL.md b/skills/baoyu-url-to-markdown/SKILL.md index 1f382c2..48881b9 100644 --- a/skills/baoyu-url-to-markdown/SKILL.md +++ b/skills/baoyu-url-to-markdown/SKILL.md @@ -1,7 +1,7 @@ --- name: baoyu-url-to-markdown -description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when Defuddle fails. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown. -version: 1.56.1 +description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, uses an upgraded Defuddle pipeline with better web-component handling and YouTube transcript extraction, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when needed. 
If local browser capture fails entirely, it can fall back to the hosted defuddle.md API. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown. +version: 1.58.1 metadata: openclaw: homepage: https://github.com/JimLiu/baoyu-skills#baoyu-url-to-markdown @@ -29,7 +29,10 @@ Fetches any URL via Chrome CDP, saves the rendered HTML snapshot, and converts i | Script | Purpose | |--------|---------| | `scripts/main.ts` | CLI entry point for URL fetching | -| `scripts/html-to-markdown.ts` | Defuddle-first conversion with automatic legacy fallback | +| `scripts/html-to-markdown.ts` | Markdown conversion entry point and converter selection | +| `scripts/defuddle-converter.ts` | Defuddle-based conversion | +| `scripts/legacy-converter.ts` | Pre-Defuddle legacy extraction and markdown conversion | +| `scripts/markdown-conversion-shared.ts` | Shared metadata parsing and markdown document helpers | ## Preferences (EXTEND.md) @@ -115,7 +118,10 @@ Full reference: [references/config/first-time-setup.md](references/config/first- - Two capture modes: auto or wait-for-user - Save rendered HTML as a sibling `-captured.html` file - Clean markdown output with metadata -- Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history +- Upgraded Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history +- Materializes shadow DOM content before conversion so web-component pages survive serialization better +- YouTube pages can include transcript/caption text in the markdown when YouTube exposes a caption track +- If local browser capture fails completely, can fall back to `defuddle.md/` and still save markdown - Handles login-required pages via wait mode - Download images and videos to local directories @@ -168,7 +174,10 @@ Each run saves two files side by side: - Markdown: YAML front matter with `url`, 
`title`, `description`, `author`, `published`, optional `coverImage`, and `captured_at`, followed by converted markdown content - HTML snapshot: `*-captured.html`, containing the rendered page HTML captured from Chrome +When Defuddle or page metadata provides a language hint, the markdown front matter also includes `language`. + The HTML snapshot is saved before any markdown media localization, so it stays a faithful capture of the page DOM used for conversion. +If the hosted `defuddle.md` API fallback is used, markdown is still saved, but there is no local `-captured.html` snapshot for that run. ## Output Directory @@ -193,13 +202,16 @@ When `--download-media` is enabled: Conversion order: 1. Try Defuddle first -2. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor -3. The fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history +2. For rich pages such as YouTube, prefer Defuddle's extractor-specific output (including transcripts when available) instead of replacing it with the legacy pipeline +3. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor +4. If the entire local browser capture flow fails before markdown can be produced, try the hosted `https://defuddle.md/` API and save its markdown output directly +5. 
The legacy fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history CLI output will show: - `Converter: defuddle` when Defuddle succeeds - `Converter: legacy:...` plus `Fallback used: ...` when fallback was needed +- `Converter: defuddle-api` when local browser capture failed and the hosted API was used instead ## Media Download Workflow @@ -232,6 +244,18 @@ Based on `download_media` setting in EXTEND.md: **Troubleshooting**: Chrome not found → set `URL_CHROME_PATH`. Timeout → increase `--timeout`. Complex pages → try `--wait` mode. If markdown quality is poor, inspect the saved `-captured.html` and check whether the run logged a legacy fallback. +### YouTube Notes + +- The upgraded Defuddle path uses async extractors, so YouTube pages can include transcript text directly in the markdown body. +- Transcript availability depends on YouTube exposing a caption track. Videos with captions disabled, restricted playback, or blocked regional access may still produce description-only output. +- If the page needs time to finish loading descriptions, chapters, or player metadata, prefer `--wait` and capture after the watch page is fully hydrated. + +### Hosted API Fallback + +- The hosted fallback endpoint is `https://defuddle.md/`. In shell form: `curl https://defuddle.md/stephango.com` +- Use it only when the local Chrome/CDP capture path fails outright. The local path still has higher fidelity because it can save the captured HTML and handle authenticated pages. +- The hosted API already returns Markdown with YAML frontmatter, so save that response as-is and then apply the normal media-localization step if requested. + ## Extension Support Custom configurations via EXTEND.md. See **Preferences** section for paths and supported options. 
diff --git a/skills/baoyu-url-to-markdown/scripts/bun.lock b/skills/baoyu-url-to-markdown/scripts/bun.lock index 50109d2..de2f3dc 100644 --- a/skills/baoyu-url-to-markdown/scripts/bun.lock +++ b/skills/baoyu-url-to-markdown/scripts/bun.lock @@ -6,7 +6,7 @@ "dependencies": { "@mozilla/readability": "^0.6.0", "baoyu-chrome-cdp": "file:./vendor/baoyu-chrome-cdp", - "defuddle": "^0.10.0", + "defuddle": "^0.12.0", "jsdom": "^24.1.3", "linkedom": "^0.18.12", "turndown": "^7.2.2", @@ -61,7 +61,7 @@ "decimal.js": ["decimal.js@10.6.0", "", {}, "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg=="], - "defuddle": ["defuddle@0.10.0", "", { "dependencies": { "commander": "^12.1.0" }, "optionalDependencies": { "mathml-to-latex": "^1.5.0", "temml": "^0.13.1", "turndown": "^7.2.0" }, "peerDependencies": { "jsdom": "^24.0.0" }, "bin": { "defuddle": "dist/cli.js" } }, "sha512-a43juTtHv6Vs4+sxvahVLM5NxoyDsarO1Ag3UxLORI4Fo/nsNFwzDxuQBvosKVGTIRxCwN/mfnWAzNXmQfieqw=="], + "defuddle": ["defuddle@0.12.0", "", { "dependencies": { "commander": "^12.1.0" }, "optionalDependencies": { "mathml-to-latex": "^1.5.0", "temml": "^0.13.1", "turndown": "^7.2.0" }, "peerDependencies": { "jsdom": "^24.0.0" }, "bin": { "defuddle": "dist/cli.js" } }, "sha512-Y/WgyGKBxwxFir+hWNth4nmWDDDb8BzQi3qASS2NWYPXsKU42Ku49/3M5yFYefnRef9prynnmasfnXjk99EWgA=="], "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], diff --git a/skills/baoyu-url-to-markdown/scripts/defuddle-converter.ts b/skills/baoyu-url-to-markdown/scripts/defuddle-converter.ts new file mode 100644 index 0000000..15a265a --- /dev/null +++ b/skills/baoyu-url-to-markdown/scripts/defuddle-converter.ts @@ -0,0 +1,58 @@ +import { JSDOM, VirtualConsole } from "jsdom"; +import { Defuddle } from "defuddle/node"; + +import { + type ConversionResult, + type PageMetadata, + isMarkdownUsable, + normalizeMarkdown, + 
pickString, +} from "./markdown-conversion-shared.js"; + +export async function tryDefuddleConversion( + html: string, + url: string, + baseMetadata: PageMetadata +): Promise<{ ok: true; result: ConversionResult } | { ok: false; reason: string }> { + try { + const virtualConsole = new VirtualConsole(); + virtualConsole.on("jsdomError", (error: Error & { type?: string }) => { + if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) { + return; + } + console.warn(`[url-to-markdown] jsdom: ${error.message}`); + }); + + const dom = new JSDOM(html, { url, virtualConsole }); + const result = await Defuddle(dom, url, { markdown: true }); + const markdown = normalizeMarkdown(result.content || ""); + + if (!isMarkdownUsable(markdown, html)) { + return { ok: false, reason: "Defuddle returned empty or incomplete markdown" }; + } + + return { + ok: true, + result: { + metadata: { + ...baseMetadata, + title: pickString(result.title, baseMetadata.title) ?? "", + description: pickString(result.description, baseMetadata.description) ?? undefined, + author: pickString(result.author, baseMetadata.author) ?? undefined, + published: pickString(result.published, baseMetadata.published) ?? undefined, + coverImage: pickString(result.image, baseMetadata.coverImage) ?? undefined, + language: pickString(result.language, baseMetadata.language) ?? undefined, + }, + markdown, + rawHtml: html, + conversionMethod: "defuddle", + variables: result.variables, + }, + }; + } catch (error) { + return { + ok: false, + reason: error instanceof Error ? 
error.message : String(error), + }; + } +} diff --git a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts index f93af43..34667b6 100644 --- a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts +++ b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts @@ -1,911 +1,104 @@ -import { parseHTML } from "linkedom"; -import { Readability } from "@mozilla/readability"; -import TurndownService from "turndown"; -import { gfm } from "turndown-plugin-gfm"; +import { + createMarkdownDocument, + extractMetadataFromHtml, + formatMetadataYaml, + type ConversionResult, + type PageMetadata, + isYouTubeUrl, +} from "./markdown-conversion-shared.js"; +import { tryDefuddleConversion } from "./defuddle-converter.js"; +import { + convertWithLegacyExtractor, + scoreMarkdownQuality, + shouldCompareWithLegacy, +} from "./legacy-converter.js"; -export interface PageMetadata { - url: string; - title: string; - description?: string; - author?: string; - published?: string; - coverImage?: string; - captured_at: string; -} - -export interface ConversionResult { - metadata: PageMetadata; - markdown: string; - rawHtml: string; - conversionMethod: string; - fallbackReason?: string; -} - -interface ExtractionCandidate { - title: string | null; - byline: string | null; - excerpt: string | null; - published: string | null; - html: string | null; - textContent: string; - method: string; -} - -type AnyRecord = Record; - -const MIN_CONTENT_LENGTH = 120; -const GOOD_CONTENT_LENGTH = 900; - -const CONTENT_SELECTORS = [ - "article", - "main article", - "[role='main'] article", - "[itemprop='articleBody']", - ".article-content", - ".article-body", - ".post-content", - ".entry-content", - ".story-body", - "main", - "[role='main']", - "#content", - ".content", -]; - -const REMOVE_SELECTORS = [ - "script", - "style", - "noscript", - "template", - "iframe", - "svg", - "path", - "nav", - "aside", - "footer", - "header", - "form", - 
".advertisement", - ".ads", - ".social-share", - ".related-articles", - ".comments", - ".newsletter", - ".cookie-banner", - ".cookie-consent", - "[role='navigation']", - "[aria-label*='cookie' i]", -]; - -const PUBLISHED_TIME_SELECTORS = [ - "meta[property='article:published_time']", - "meta[name='pubdate']", - "meta[name='publishdate']", - "meta[name='date']", - "time[datetime]", -]; - -const ARTICLE_TYPES = new Set([ - "Article", - "NewsArticle", - "BlogPosting", - "WebPage", - "ReportageNewsArticle", -]); - -const NEXT_DATA_CONTENT_PATHS = [ - "props.pageProps.content.body", - "props.pageProps.article.body", - "props.pageProps.article.content", - "props.pageProps.post.body", - "props.pageProps.post.content", - "props.pageProps.data.body", - "props.pageProps.story.body.content", -]; - -const LOW_QUALITY_MARKERS = [ - /Join The Conversation/i, - /One Community\. Many Voices/i, - /Read our community guidelines/i, - /Create a free account to share your thoughts/i, - /Become a Forbes Member/i, - /Subscribe to trusted journalism/i, - /\bComments\b/i, -]; +export type { ConversionResult, PageMetadata }; +export { createMarkdownDocument, formatMetadataYaml }; export const absolutizeUrlsScript = String.raw` (function() { const baseUrl = document.baseURI || location.href; + const htmlClone = document.documentElement.cloneNode(true); + + function materializeShadowDom(sourceRoot, cloneRoot) { + const sourceElements = Array.from(sourceRoot.querySelectorAll("*")); + const cloneElements = Array.from(cloneRoot.querySelectorAll("*")); + + for (let i = sourceElements.length - 1; i >= 0; i--) { + const sourceEl = sourceElements[i]; + const cloneEl = cloneElements[i]; + const shadowRoot = sourceEl && sourceEl.shadowRoot; + if (!shadowRoot || !cloneEl || !shadowRoot.innerHTML) continue; + + if (cloneEl.tagName && cloneEl.tagName.includes("-")) { + const wrapper = document.createElement("div"); + wrapper.setAttribute("data-shadow-host", cloneEl.tagName.toLowerCase()); + 
wrapper.innerHTML = shadowRoot.innerHTML; + cloneEl.replaceWith(wrapper); + } else { + cloneEl.innerHTML = shadowRoot.innerHTML; + } + } + } + function toAbsolute(url) { if (!url) return url; try { return new URL(url, baseUrl).href; } catch { return url; } } - function absAttr(sel, attr) { - document.querySelectorAll(sel).forEach(el => { + + function absAttr(root, sel, attr) { + root.querySelectorAll(sel).forEach(el => { const v = el.getAttribute(attr); - if (v) { const a = toAbsolute(v); if (a) el.setAttribute(attr, a); } + if (v) { + const a = toAbsolute(v); + if (a) el.setAttribute(attr, a); + } }); } - function absSrcset(sel) { - document.querySelectorAll(sel).forEach(el => { + + function absSrcset(root, sel) { + root.querySelectorAll(sel).forEach(el => { const s = el.getAttribute("srcset"); if (!s) return; el.setAttribute("srcset", s.split(",").map(p => { - const t = p.trim(); if (!t) return ""; + const t = p.trim(); + if (!t) return ""; const [url, ...d] = t.split(/\s+/); return d.length ? 
toAbsolute(url) + " " + d.join(" ") : toAbsolute(url); }).filter(Boolean).join(", ")); }); } - document.querySelectorAll("img[data-src], video[data-src], audio[data-src], source[data-src]").forEach(el => { + + materializeShadowDom(document.documentElement, htmlClone); + + htmlClone.querySelectorAll("img[data-src], video[data-src], audio[data-src], source[data-src]").forEach(el => { const ds = el.getAttribute("data-src"); if (ds && (!el.getAttribute("src") || el.getAttribute("src") === "" || el.getAttribute("src")?.startsWith("data:"))) { el.setAttribute("src", ds); } }); - absAttr("a[href]", "href"); - absAttr("img[src], video[src], audio[src], source[src]", "src"); - absSrcset("img[srcset], source[srcset]"); - return { html: document.documentElement.outerHTML }; + + absAttr(htmlClone, "a[href]", "href"); + absAttr(htmlClone, "img[src], video[src], audio[src], source[src], iframe[src]", "src"); + absAttr(htmlClone, "video[poster]", "poster"); + absSrcset(htmlClone, "img[srcset], source[srcset]"); + + return { html: "\n" + htmlClone.outerHTML }; })() `; -function pickString(...values: unknown[]): string | null { - for (const value of values) { - if (typeof value === "string") { - const trimmed = value.trim(); - if (trimmed) return trimmed; - } - } - return null; -} - -function normalizeMarkdown(markdown: string): string { - return markdown - .replace(/\r\n/g, "\n") - .replace(/[ \t]+\n/g, "\n") - .replace(/\n{3,}/g, "\n\n") - .trim(); -} - -function parseDocument(html: string): Document { - const normalized = /<\s*html[\s>]/i.test(html) - ? html - : `${html}`; - return parseHTML(normalized).document as unknown as Document; -} - -function sanitizeHtml(html: string): string { - const { document } = parseHTML(`
${html}
`); - const root = document.querySelector("#__root"); - if (!root) return html; - - for (const selector of ["script", "style", "iframe", "noscript", "template", "svg", "path"]) { - for (const el of root.querySelectorAll(selector)) { - el.remove(); - } +function shouldPreferDefuddle(result: ConversionResult): boolean { + if (isYouTubeUrl(result.metadata.url)) { + return true; } - return root.innerHTML; -} - -function extractTextFromHtml(html: string): string { - const { document } = parseHTML(`${html}`); - for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) { - for (const el of document.querySelectorAll(selector)) { - el.remove(); - } - } - return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? ""; -} - -function getMetaContent(document: Document, names: string[]): string | null { - for (const name of names) { - const element = - document.querySelector(`meta[name="${name}"]`) ?? - document.querySelector(`meta[property="${name}"]`); - const content = element?.getAttribute("content"); - if (content && content.trim()) return content.trim(); - } - return null; -} - -function flattenJsonLdItems(data: unknown): AnyRecord[] { - if (!data || typeof data !== "object") return []; - if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems); - - const item = data as AnyRecord; - if (Array.isArray(item["@graph"])) { - return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems); + const transcript = result.variables?.transcript?.trim(); + if (transcript) { + return true; } - return [item]; -} - -function parseJsonLdScripts(document: Document): AnyRecord[] { - const results: AnyRecord[] = []; - const scripts = document.querySelectorAll("script[type='application/ld+json']"); - - for (const script of scripts) { - try { - const data = JSON.parse(script.textContent ?? ""); - results.push(...flattenJsonLdItems(data)); - } catch { - // Ignore malformed blocks. 
- } - } - - return results; -} - -function isArticleType(item: AnyRecord): boolean { - const value = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"]; - return typeof value === "string" && ARTICLE_TYPES.has(value); -} - -function extractAuthorFromJsonLd(authorData: unknown): string | null { - if (typeof authorData === "string") return authorData; - if (!authorData || typeof authorData !== "object") return null; - - if (Array.isArray(authorData)) { - const names = authorData - .map((author) => extractAuthorFromJsonLd(author)) - .filter((name): name is string => Boolean(name)); - return names.length > 0 ? names.join(", ") : null; - } - - const author = authorData as AnyRecord; - return typeof author.name === "string" ? author.name : null; -} - -function extractPrimaryJsonLdMeta(document: Document): Partial { - for (const item of parseJsonLdScripts(document)) { - if (!isArticleType(item)) continue; - - return { - title: pickString(item.headline, item.name) ?? undefined, - description: pickString(item.description) ?? undefined, - author: extractAuthorFromJsonLd(item.author) ?? undefined, - published: pickString(item.datePublished, item.dateCreated) ?? undefined, - coverImage: - pickString( - item.image, - (item.image as AnyRecord | undefined)?.url, - (Array.isArray(item.image) ? item.image[0] : undefined) as unknown - ) ?? undefined, - }; - } - - return {}; -} - -function extractPublishedTime(document: Document): string | null { - for (const selector of PUBLISHED_TIME_SELECTORS) { - const el = document.querySelector(selector); - if (!el) continue; - const value = el.getAttribute("content") ?? 
el.getAttribute("datetime"); - if (value && value.trim()) return value.trim(); - } - return null; -} - -function extractTitle(document: Document): string | null { - const ogTitle = document.querySelector("meta[property='og:title']")?.getAttribute("content"); - if (ogTitle && ogTitle.trim()) return ogTitle.trim(); - - const twitterTitle = document.querySelector("meta[name='twitter:title']")?.getAttribute("content"); - if (twitterTitle && twitterTitle.trim()) return twitterTitle.trim(); - - const title = document.querySelector("title")?.textContent?.trim(); - if (title) { - const cleaned = title.split(/\s*[-|–—]\s*/)[0]?.trim(); - if (cleaned) return cleaned; - } - - const h1 = document.querySelector("h1")?.textContent?.trim(); - return h1 || null; -} - -function extractMetadataFromHtml(html: string, url: string, capturedAt: string): PageMetadata { - const document = parseDocument(html); - const jsonLd = extractPrimaryJsonLdMeta(document); - const timeEl = document.querySelector("time[datetime]"); - - return { - url, - title: - pickString( - getMetaContent(document, ["og:title", "twitter:title"]), - jsonLd.title, - document.querySelector("h1")?.textContent, - document.title - ) ?? "", - description: - pickString( - getMetaContent(document, ["description", "og:description", "twitter:description"]), - jsonLd.description - ) ?? undefined, - author: - pickString( - getMetaContent(document, ["author", "article:author", "twitter:creator"]), - jsonLd.author - ) ?? undefined, - published: - pickString( - timeEl?.getAttribute("datetime"), - getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]), - jsonLd.published, - extractPublishedTime(document) - ) ?? undefined, - coverImage: - pickString( - getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]), - jsonLd.coverImage - ) ?? 
undefined, - captured_at: capturedAt, - }; -} - -function generateExcerpt(excerpt: string | null, textContent: string | null): string | null { - if (excerpt) return excerpt; - if (!textContent) return null; - const trimmed = textContent.trim(); - if (!trimmed) return null; - return trimmed.length > 200 ? `${trimmed.slice(0, 200)}...` : trimmed; -} - -function parseJsonLdItem(item: AnyRecord): ExtractionCandidate | null { - if (!isArticleType(item)) return null; - - const rawContent = - (typeof item.articleBody === "string" && item.articleBody) || - (typeof item.text === "string" && item.text) || - (typeof item.description === "string" && item.description) || - null; - - if (!rawContent) return null; - - const content = rawContent.trim(); - const htmlLike = /<\/?[a-z][\s\S]*>/i.test(content); - const textContent = htmlLike ? extractTextFromHtml(content) : content; - - if (textContent.length < MIN_CONTENT_LENGTH) return null; - - return { - title: pickString(item.headline, item.name), - byline: extractAuthorFromJsonLd(item.author), - excerpt: pickString(item.description), - published: pickString(item.datePublished, item.dateCreated), - html: htmlLike ? 
content : null, - textContent, - method: "json-ld", - }; -} - -function tryJsonLdExtraction(document: Document): ExtractionCandidate | null { - for (const item of parseJsonLdScripts(document)) { - const extracted = parseJsonLdItem(item); - if (extracted) return extracted; - } - return null; -} - -function getByPath(value: unknown, path: string): unknown { - let current = value; - for (const part of path.split(".")) { - if (!current || typeof current !== "object") return undefined; - current = (current as AnyRecord)[part]; - } - return current; -} - -function isContentBlockArray(value: unknown): value is AnyRecord[] { - if (!Array.isArray(value) || value.length === 0) return false; - return value.slice(0, 5).some((item) => { - if (!item || typeof item !== "object") return false; - const obj = item as AnyRecord; - return "type" in obj || "text" in obj || "textHtml" in obj || "content" in obj; - }); -} - -function extractTextFromContentBlocks(blocks: AnyRecord[]): string { - const parts: string[] = []; - - function pushParagraph(text: string): void { - const trimmed = text.trim(); - if (!trimmed) return; - parts.push(trimmed, "\n\n"); - } - - function walk(node: unknown): void { - if (!node || typeof node !== "object") return; - const block = node as AnyRecord; - - if (typeof block.text === "string") { - pushParagraph(block.text); - return; - } - - if (typeof block.textHtml === "string") { - pushParagraph(extractTextFromHtml(block.textHtml)); - return; - } - - if (Array.isArray(block.items)) { - for (const item of block.items) { - if (item && typeof item === "object") { - const text = pickString((item as AnyRecord).text); - if (text) parts.push(`- ${text}\n`); - } - } - parts.push("\n"); - } - - if (Array.isArray(block.components)) { - for (const component of block.components) { - walk(component); - } - } - - if (Array.isArray(block.content)) { - for (const child of block.content) { - walk(child); - } - } - } - - for (const block of blocks) { - walk(block); - } - - 
return parts.join("").replace(/\n{3,}/g, "\n\n").trim(); -} - -function tryStringBodyExtraction( - content: string, - meta: AnyRecord, - document: Document, - method: string -): ExtractionCandidate | null { - if (!content || content.length < MIN_CONTENT_LENGTH) return null; - - const isHtml = /<\/?[a-z][\s\S]*>/i.test(content); - const html = isHtml ? sanitizeHtml(content) : null; - const textContent = isHtml ? extractTextFromHtml(html) : content.trim(); - - if (textContent.length < MIN_CONTENT_LENGTH) return null; - - return { - title: pickString(meta.headline, meta.title, extractTitle(document)), - byline: pickString(meta.byline, meta.author), - excerpt: pickString(meta.description, meta.excerpt, generateExcerpt(null, textContent)), - published: pickString(meta.datePublished, meta.publishedAt, extractPublishedTime(document)), - html, - textContent, - method, - }; -} - -function tryNextDataExtraction(document: Document): ExtractionCandidate | null { - try { - const script = document.querySelector("script#__NEXT_DATA__"); - if (!script?.textContent) return null; - - const data = JSON.parse(script.textContent) as AnyRecord; - const pageProps = (getByPath(data, "props.pageProps") ?? {}) as AnyRecord; - - for (const path of NEXT_DATA_CONTENT_PATHS) { - const value = getByPath(data, path); - - if (typeof value === "string") { - const parentPath = path.split(".").slice(0, -1).join("."); - const parent = (getByPath(data, parentPath) ?? {}) as AnyRecord; - const meta = { - ...pageProps, - ...parent, - title: parent.title ?? 
(pageProps.title as string | undefined), - }; - - const candidate = tryStringBodyExtraction(value, meta, document, "next-data"); - if (candidate) return candidate; - } - - if (isContentBlockArray(value)) { - const textContent = extractTextFromContentBlocks(value); - if (textContent.length < MIN_CONTENT_LENGTH) continue; - - return { - title: pickString( - getByPath(data, "props.pageProps.content.headline"), - getByPath(data, "props.pageProps.article.headline"), - getByPath(data, "props.pageProps.article.title"), - getByPath(data, "props.pageProps.post.title"), - pageProps.title, - extractTitle(document) - ), - byline: pickString( - getByPath(data, "props.pageProps.author.name"), - getByPath(data, "props.pageProps.article.author.name") - ), - excerpt: pickString( - getByPath(data, "props.pageProps.content.description"), - getByPath(data, "props.pageProps.article.description"), - pageProps.description, - generateExcerpt(null, textContent) - ), - published: pickString( - getByPath(data, "props.pageProps.content.datePublished"), - getByPath(data, "props.pageProps.article.datePublished"), - getByPath(data, "props.pageProps.publishedAt"), - extractPublishedTime(document) - ), - html: null, - textContent, - method: "next-data", - }; - } - } - } catch { - return null; - } - - return null; -} - -function buildReadabilityCandidate( - article: ReturnType, - document: Document, - method: string -): ExtractionCandidate | null { - const textContent = article?.textContent?.trim() ?? ""; - if (textContent.length < MIN_CONTENT_LENGTH) return null; - - return { - title: pickString(article?.title, extractTitle(document)), - byline: pickString((article as { byline?: string } | null)?.byline), - excerpt: pickString(article?.excerpt, generateExcerpt(null, textContent)), - published: pickString((article as { publishedTime?: string } | null)?.publishedTime, extractPublishedTime(document)), - html: article?.content ? 
sanitizeHtml(article.content) : null, - textContent, - method, - }; -} - -function tryReadability(document: Document): ExtractionCandidate | null { - try { - const strictClone = document.cloneNode(true) as Document; - const strictResult = buildReadabilityCandidate( - new Readability(strictClone).parse(), - document, - "readability" - ); - if (strictResult) return strictResult; - - const relaxedClone = document.cloneNode(true) as Document; - return buildReadabilityCandidate( - new Readability(relaxedClone, { charThreshold: 120 }).parse(), - document, - "readability-relaxed" - ); - } catch { - return null; - } -} - -function trySelectorExtraction(document: Document): ExtractionCandidate | null { - for (const selector of CONTENT_SELECTORS) { - const element = document.querySelector(selector); - if (!element) continue; - - const clone = element.cloneNode(true) as Element; - for (const removeSelector of REMOVE_SELECTORS) { - for (const node of clone.querySelectorAll(removeSelector)) { - node.remove(); - } - } - - const html = sanitizeHtml(clone.innerHTML); - const textContent = extractTextFromHtml(html); - if (textContent.length < MIN_CONTENT_LENGTH) continue; - - return { - title: extractTitle(document), - byline: null, - excerpt: generateExcerpt(null, textContent), - published: extractPublishedTime(document), - html, - textContent, - method: `selector:${selector}`, - }; - } - - return null; -} - -function tryBodyExtraction(document: Document): ExtractionCandidate | null { - const body = document.body; - if (!body) return null; - - const clone = body.cloneNode(true) as Element; - for (const removeSelector of REMOVE_SELECTORS) { - for (const node of clone.querySelectorAll(removeSelector)) { - node.remove(); - } - } - - const html = sanitizeHtml(clone.innerHTML); - const textContent = extractTextFromHtml(html); - if (!textContent) return null; - - return { - title: extractTitle(document), - byline: null, - excerpt: generateExcerpt(null, textContent), - published: 
extractPublishedTime(document), - html, - textContent, - method: "body-fallback", - }; -} - -function pickBestCandidate(candidates: ExtractionCandidate[]): ExtractionCandidate | null { - if (candidates.length === 0) return null; - - const methodOrder = [ - "readability", - "readability-relaxed", - "next-data", - "json-ld", - "selector:", - "body-fallback", - ]; - - function methodRank(method: string): number { - const idx = methodOrder.findIndex((entry) => - entry.endsWith(":") ? method.startsWith(entry) : method === entry - ); - return idx === -1 ? methodOrder.length : idx; - } - - const ranked = [...candidates].sort((a, b) => { - const rankA = methodRank(a.method); - const rankB = methodRank(b.method); - if (rankA !== rankB) return rankA - rankB; - return (b.textContent.length ?? 0) - (a.textContent.length ?? 0); - }); - - for (const candidate of ranked) { - if (candidate.textContent.length >= GOOD_CONTENT_LENGTH) { - return candidate; - } - } - - for (const candidate of ranked) { - if (candidate.textContent.length >= MIN_CONTENT_LENGTH) { - return candidate; - } - } - - return ranked[0]; -} - -function extractFromHtml(html: string): ExtractionCandidate | null { - const document = parseDocument(html); - - const readabilityCandidate = tryReadability(document); - const nextDataCandidate = tryNextDataExtraction(document); - const jsonLdCandidate = tryJsonLdExtraction(document); - const selectorCandidate = trySelectorExtraction(document); - const bodyCandidate = tryBodyExtraction(document); - - const candidates = [ - readabilityCandidate, - nextDataCandidate, - jsonLdCandidate, - selectorCandidate, - bodyCandidate, - ].filter((candidate): candidate is ExtractionCandidate => Boolean(candidate)); - - const winner = pickBestCandidate(candidates); - if (!winner) return null; - - return { - ...winner, - title: winner.title ?? extractTitle(document), - published: winner.published ?? extractPublishedTime(document), - excerpt: winner.excerpt ?? 
generateExcerpt(null, winner.textContent), - }; -} - -const turndown = new TurndownService({ - headingStyle: "atx", - hr: "---", - bulletListMarker: "-", - codeBlockStyle: "fenced", - emDelimiter: "*", - strongDelimiter: "**", - linkStyle: "inlined", -}); - -turndown.use(gfm); -turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]); - -turndown.addRule("collapseFigure", { - filter: "figure", - replacement(content) { - return `\n\n${content.trim()}\n\n`; - }, -}); - -turndown.addRule("dropInvisibleAnchors", { - filter(node) { - return node.nodeName === "A" && !(node as Element).textContent?.trim(); - }, - replacement() { - return ""; - }, -}); - -function convertHtmlToMarkdown(html: string): string { - if (!html || !html.trim()) return ""; - - try { - const sanitized = sanitizeHtml(html); - return turndown.turndown(sanitized); - } catch { - return ""; - } -} - -function fallbackPlainText(html: string): string { - const document = parseDocument(html); - for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) { - for (const el of document.querySelectorAll(selector)) { - el.remove(); - } - } - const text = document.body?.textContent ?? document.documentElement?.textContent ?? 
""; - return normalizeMarkdown(text.replace(/\s+/g, " ")); -} - -function countBylines(markdown: string): number { - return (markdown.match(/(^|\n)By\s+/g) || []).length; -} - -function countUsefulParagraphs(markdown: string): number { - const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/); - let count = 0; - - for (const paragraph of paragraphs) { - const trimmed = paragraph.trim(); - if (!trimmed) continue; - if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) continue; - if (/^#{1,6}\s+/.test(trimmed)) continue; - if ((trimmed.match(/\b[\p{L}\p{N}']+\b/gu) || []).length < 8) continue; - count++; - } - - return count; -} - -function countMarkerHits(markdown: string, markers: RegExp[]): number { - let hits = 0; - for (const marker of markers) { - if (marker.test(markdown)) hits++; - } - return hits; -} - -function scoreMarkdownQuality(markdown: string): number { - const normalized = normalizeMarkdown(markdown); - const wordCount = (normalized.match(/\b[\p{L}\p{N}']+\b/gu) || []).length; - const usefulParagraphs = countUsefulParagraphs(normalized); - const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length; - const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS); - const bylineCount = countBylines(normalized); - const staffCount = (normalized.match(/\bForbes Staff\b/gi) || []).length; - - return ( - Math.min(wordCount, 4000) + - usefulParagraphs * 40 + - headingCount * 10 - - markerHits * 180 - - Math.max(0, bylineCount - 1) * 120 - - Math.max(0, staffCount - 1) * 80 - ); -} - -function shouldCompareWithLegacy(markdown: string): boolean { - const normalized = normalizeMarkdown(markdown); - return ( - countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0 || - countBylines(normalized) > 1 || - countUsefulParagraphs(normalized) < 6 - ); -} - -function isMarkdownUsable(markdown: string, html: string): boolean { - const normalized = normalizeMarkdown(markdown); - if (!normalized) return false; - - const htmlTextLength = 
extractTextFromHtml(html).length; - if (htmlTextLength < MIN_CONTENT_LENGTH) return true; - - if (normalized.length >= 80) return true; - return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2)); -} - -async function tryDefuddleConversion( - html: string, - url: string, - baseMetadata: PageMetadata -): Promise<{ ok: true; result: ConversionResult } | { ok: false; reason: string }> { - try { - const [{ JSDOM, VirtualConsole }, { Defuddle }] = await Promise.all([ - import("jsdom"), - import("defuddle/node"), - ]); - - const virtualConsole = new VirtualConsole(); - virtualConsole.on("jsdomError", (error: Error & { type?: string }) => { - if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) { - return; - } - console.warn(`[url-to-markdown] jsdom: ${error.message}`); - }); - - const dom = new JSDOM(html, { url, virtualConsole }); - const result = await Defuddle(dom, url, { markdown: true }); - const markdown = normalizeMarkdown(result.content || ""); - - if (!isMarkdownUsable(markdown, html)) { - return { ok: false, reason: "Defuddle returned empty or incomplete markdown" }; - } - - return { - ok: true, - result: { - metadata: { - ...baseMetadata, - title: pickString(result.title, baseMetadata.title) ?? "", - description: pickString(result.description, baseMetadata.description) ?? undefined, - author: pickString(result.author, baseMetadata.author) ?? undefined, - published: pickString(result.published, baseMetadata.published) ?? undefined, - coverImage: pickString(result.image, baseMetadata.coverImage) ?? undefined, - }, - markdown, - rawHtml: html, - conversionMethod: "defuddle", - }, - }; - } catch (error) { - return { - ok: false, - reason: error instanceof Error ? error.message : String(error), - }; - } -} - -function convertWithLegacyExtractor(html: string, baseMetadata: PageMetadata): ConversionResult { - const extracted = extractFromHtml(html); - - let markdown = extracted?.html ? 
convertHtmlToMarkdown(extracted.html) : ""; - if (!markdown.trim()) { - markdown = extracted?.textContent?.trim() || fallbackPlainText(html); - } - - return { - metadata: { - ...baseMetadata, - title: pickString(extracted?.title, baseMetadata.title) ?? "", - description: pickString(extracted?.excerpt, baseMetadata.description) ?? undefined, - author: pickString(extracted?.byline, baseMetadata.author) ?? undefined, - published: pickString(extracted?.published, baseMetadata.published) ?? undefined, - }, - markdown: normalizeMarkdown(markdown), - rawHtml: html, - conversionMethod: extracted ? `legacy:${extracted.method}` : "legacy:plain-text", - }; + return /^##?\s+transcript\b/im.test(result.markdown); } export async function extractContent(html: string, url: string): Promise { @@ -914,6 +107,10 @@ export async function extractContent(html: string, url: string): Promise 200 ? `${trimmed.slice(0, 200)}...` : trimmed; +} + +function parseJsonLdItem(item: AnyRecord): ExtractionCandidate | null { + const type = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"]; + if (typeof type !== "string" || !["Article", "NewsArticle", "BlogPosting", "WebPage", "ReportageNewsArticle"].includes(type)) { + return null; + } + + const rawContent = + (typeof item.articleBody === "string" && item.articleBody) || + (typeof item.text === "string" && item.text) || + (typeof item.description === "string" && item.description) || + null; + + if (!rawContent) return null; + + const content = rawContent.trim(); + const htmlLike = /<\/?[a-z][\s\S]*>/i.test(content); + const textContent = htmlLike ? extractTextFromHtml(content) : content; + + if (textContent.length < MIN_CONTENT_LENGTH) return null; + + return { + title: pickString(item.headline, item.name), + byline: extractAuthorFromJsonLd(item.author), + excerpt: pickString(item.description), + published: pickString(item.datePublished, item.dateCreated), + html: htmlLike ? 
content : null, + textContent, + method: "json-ld", + }; +} + +function extractAuthorFromJsonLd(authorData: unknown): string | null { + if (typeof authorData === "string") return authorData; + if (!authorData || typeof authorData !== "object") return null; + + if (Array.isArray(authorData)) { + const names = authorData + .map((author) => extractAuthorFromJsonLd(author)) + .filter((name): name is string => Boolean(name)); + return names.length > 0 ? names.join(", ") : null; + } + + const author = authorData as AnyRecord; + return typeof author.name === "string" ? author.name : null; +} + +function flattenJsonLdItems(data: unknown): AnyRecord[] { + if (!data || typeof data !== "object") return []; + if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems); + + const item = data as AnyRecord; + if (Array.isArray(item["@graph"])) { + return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems); + } + + return [item]; +} + +function tryJsonLdExtraction(document: Document): ExtractionCandidate | null { + const scripts = document.querySelectorAll("script[type='application/ld+json']"); + + for (const script of scripts) { + try { + const data = JSON.parse(script.textContent ?? ""); + for (const item of flattenJsonLdItems(data)) { + const extracted = parseJsonLdItem(item); + if (extracted) return extracted; + } + } catch { + // Ignore malformed blocks. 
+ } + } + + return null; +} + +function getByPath(value: unknown, path: string): unknown { + let current = value; + for (const part of path.split(".")) { + if (!current || typeof current !== "object") return undefined; + current = (current as AnyRecord)[part]; + } + return current; +} + +function isContentBlockArray(value: unknown): value is AnyRecord[] { + if (!Array.isArray(value) || value.length === 0) return false; + return value.slice(0, 5).some((item) => { + if (!item || typeof item !== "object") return false; + const obj = item as AnyRecord; + return "type" in obj || "text" in obj || "textHtml" in obj || "content" in obj; + }); +} + +function extractTextFromContentBlocks(blocks: AnyRecord[]): string { + const parts: string[] = []; + + function pushParagraph(text: string): void { + const trimmed = text.trim(); + if (!trimmed) return; + parts.push(trimmed, "\n\n"); + } + + function walk(node: unknown): void { + if (!node || typeof node !== "object") return; + const block = node as AnyRecord; + + if (typeof block.text === "string") { + pushParagraph(block.text); + return; + } + + if (typeof block.textHtml === "string") { + pushParagraph(extractTextFromHtml(block.textHtml)); + return; + } + + if (Array.isArray(block.items)) { + for (const item of block.items) { + if (item && typeof item === "object") { + const text = pickString((item as AnyRecord).text); + if (text) parts.push(`- ${text}\n`); + } + } + parts.push("\n"); + } + + if (Array.isArray(block.components)) { + for (const component of block.components) { + walk(component); + } + } + + if (Array.isArray(block.content)) { + for (const child of block.content) { + walk(child); + } + } + } + + for (const block of blocks) { + walk(block); + } + + return parts.join("").replace(/\n{3,}/g, "\n\n").trim(); +} + +function tryStringBodyExtraction( + content: string, + meta: AnyRecord, + document: Document, + method: string +): ExtractionCandidate | null { + if (!content || content.length < MIN_CONTENT_LENGTH) return 
null; + + const isHtml = /<\/?[a-z][\s\S]*>/i.test(content); + const html = isHtml ? sanitizeHtml(content) : null; + const textContent = isHtml ? extractTextFromHtml(html) : content.trim(); + + if (textContent.length < MIN_CONTENT_LENGTH) return null; + + return { + title: pickString(meta.headline, meta.title, extractTitle(document)), + byline: pickString(meta.byline, meta.author), + excerpt: pickString(meta.description, meta.excerpt, generateExcerpt(null, textContent)), + published: pickString(meta.datePublished, meta.publishedAt, extractPublishedTime(document)), + html, + textContent, + method, + }; +} + +function tryNextDataExtraction(document: Document): ExtractionCandidate | null { + try { + const script = document.querySelector("script#__NEXT_DATA__"); + if (!script?.textContent) return null; + + const data = JSON.parse(script.textContent) as AnyRecord; + const pageProps = (getByPath(data, "props.pageProps") ?? {}) as AnyRecord; + + for (const path of NEXT_DATA_CONTENT_PATHS) { + const value = getByPath(data, path); + + if (typeof value === "string") { + const parentPath = path.split(".").slice(0, -1).join("."); + const parent = (getByPath(data, parentPath) ?? {}) as AnyRecord; + const meta = { + ...pageProps, + ...parent, + title: parent.title ?? 
(pageProps.title as string | undefined), + }; + + const candidate = tryStringBodyExtraction(value, meta, document, "next-data"); + if (candidate) return candidate; + } + + if (isContentBlockArray(value)) { + const textContent = extractTextFromContentBlocks(value); + if (textContent.length < MIN_CONTENT_LENGTH) continue; + + return { + title: pickString( + getByPath(data, "props.pageProps.content.headline"), + getByPath(data, "props.pageProps.article.headline"), + getByPath(data, "props.pageProps.article.title"), + getByPath(data, "props.pageProps.post.title"), + pageProps.title, + extractTitle(document) + ), + byline: pickString( + getByPath(data, "props.pageProps.author.name"), + getByPath(data, "props.pageProps.article.author.name") + ), + excerpt: pickString( + getByPath(data, "props.pageProps.content.description"), + getByPath(data, "props.pageProps.article.description"), + pageProps.description, + generateExcerpt(null, textContent) + ), + published: pickString( + getByPath(data, "props.pageProps.content.datePublished"), + getByPath(data, "props.pageProps.article.datePublished"), + getByPath(data, "props.pageProps.publishedAt"), + extractPublishedTime(document) + ), + html: null, + textContent, + method: "next-data", + }; + } + } + } catch { + return null; + } + + return null; +} + +function buildReadabilityCandidate( + article: ReturnType, + document: Document, + method: string +): ExtractionCandidate | null { + const textContent = article?.textContent?.trim() ?? ""; + if (textContent.length < MIN_CONTENT_LENGTH) return null; + + return { + title: pickString(article?.title, extractTitle(document)), + byline: pickString((article as { byline?: string } | null)?.byline), + excerpt: pickString(article?.excerpt, generateExcerpt(null, textContent)), + published: pickString((article as { publishedTime?: string } | null)?.publishedTime, extractPublishedTime(document)), + html: article?.content ? 
sanitizeHtml(article.content) : null, + textContent, + method, + }; +} + +function tryReadability(document: Document): ExtractionCandidate | null { + try { + const strictClone = document.cloneNode(true) as Document; + const strictResult = buildReadabilityCandidate( + new Readability(strictClone).parse(), + document, + "readability" + ); + if (strictResult) return strictResult; + + const relaxedClone = document.cloneNode(true) as Document; + return buildReadabilityCandidate( + new Readability(relaxedClone, { charThreshold: 120 }).parse(), + document, + "readability-relaxed" + ); + } catch { + return null; + } +} + +function trySelectorExtraction(document: Document): ExtractionCandidate | null { + for (const selector of CONTENT_SELECTORS) { + const element = document.querySelector(selector); + if (!element) continue; + + const clone = element.cloneNode(true) as Element; + for (const removeSelector of REMOVE_SELECTORS) { + for (const node of clone.querySelectorAll(removeSelector)) { + node.remove(); + } + } + + const html = sanitizeHtml(clone.innerHTML); + const textContent = extractTextFromHtml(html); + if (textContent.length < MIN_CONTENT_LENGTH) continue; + + return { + title: extractTitle(document), + byline: null, + excerpt: generateExcerpt(null, textContent), + published: extractPublishedTime(document), + html, + textContent, + method: `selector:${selector}`, + }; + } + + return null; +} + +function tryBodyExtraction(document: Document): ExtractionCandidate | null { + const body = document.body; + if (!body) return null; + + const clone = body.cloneNode(true) as Element; + for (const removeSelector of REMOVE_SELECTORS) { + for (const node of clone.querySelectorAll(removeSelector)) { + node.remove(); + } + } + + const html = sanitizeHtml(clone.innerHTML); + const textContent = extractTextFromHtml(html); + if (!textContent) return null; + + return { + title: extractTitle(document), + byline: null, + excerpt: generateExcerpt(null, textContent), + published: 
extractPublishedTime(document), + html, + textContent, + method: "body-fallback", + }; +} + +function pickBestCandidate(candidates: ExtractionCandidate[]): ExtractionCandidate | null { + if (candidates.length === 0) return null; + + const methodOrder = [ + "readability", + "readability-relaxed", + "next-data", + "json-ld", + "selector:", + "body-fallback", + ]; + + function methodRank(method: string): number { + const idx = methodOrder.findIndex((entry) => + entry.endsWith(":") ? method.startsWith(entry) : method === entry + ); + return idx === -1 ? methodOrder.length : idx; + } + + const ranked = [...candidates].sort((a, b) => { + const rankA = methodRank(a.method); + const rankB = methodRank(b.method); + if (rankA !== rankB) return rankA - rankB; + return (b.textContent.length ?? 0) - (a.textContent.length ?? 0); + }); + + for (const candidate of ranked) { + if (candidate.textContent.length >= GOOD_CONTENT_LENGTH) { + return candidate; + } + } + + for (const candidate of ranked) { + if (candidate.textContent.length >= MIN_CONTENT_LENGTH) { + return candidate; + } + } + + return ranked[0]; +} + +function extractFromHtml(html: string): ExtractionCandidate | null { + const document = parseDocument(html); + + const readabilityCandidate = tryReadability(document); + const nextDataCandidate = tryNextDataExtraction(document); + const jsonLdCandidate = tryJsonLdExtraction(document); + const selectorCandidate = trySelectorExtraction(document); + const bodyCandidate = tryBodyExtraction(document); + + const candidates = [ + readabilityCandidate, + nextDataCandidate, + jsonLdCandidate, + selectorCandidate, + bodyCandidate, + ].filter((candidate): candidate is ExtractionCandidate => Boolean(candidate)); + + const winner = pickBestCandidate(candidates); + if (!winner) return null; + + return { + ...winner, + title: winner.title ?? extractTitle(document), + published: winner.published ?? extractPublishedTime(document), + excerpt: winner.excerpt ?? 
generateExcerpt(null, winner.textContent), + }; +} + +const turndown = new TurndownService({ + headingStyle: "atx", + hr: "---", + bulletListMarker: "-", + codeBlockStyle: "fenced", + emDelimiter: "*", + strongDelimiter: "**", + linkStyle: "inlined", +}); + +turndown.use(gfm); +turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]); + +turndown.addRule("collapseFigure", { + filter: "figure", + replacement(content) { + return `\n\n${content.trim()}\n\n`; + }, +}); + +turndown.addRule("dropInvisibleAnchors", { + filter(node) { + return node.nodeName === "A" && !(node as Element).textContent?.trim(); + }, + replacement() { + return ""; + }, +}); + +function convertHtmlToMarkdown(html: string): string { + if (!html || !html.trim()) return ""; + + try { + const sanitized = sanitizeHtml(html); + return turndown.turndown(sanitized); + } catch { + return ""; + } +} + +function fallbackPlainText(html: string): string { + const document = parseDocument(html); + for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) { + for (const el of document.querySelectorAll(selector)) { + el.remove(); + } + } + const text = document.body?.textContent ?? document.documentElement?.textContent ?? 
""; + return normalizeMarkdown(text.replace(/\s+/g, " ")); +} + +function countBylines(markdown: string): number { + return (markdown.match(/(^|\n)By\s+/g) || []).length; +} + +function countUsefulParagraphs(markdown: string): number { + const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/); + let count = 0; + + for (const paragraph of paragraphs) { + const trimmed = paragraph.trim(); + if (!trimmed) continue; + if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) continue; + if (/^#{1,6}\s+/.test(trimmed)) continue; + if ((trimmed.match(/\b[\p{L}\p{N}']+\b/gu) || []).length < 8) continue; + count++; + } + + return count; +} + +function countMarkerHits(markdown: string, markers: RegExp[]): number { + let hits = 0; + for (const marker of markers) { + if (marker.test(markdown)) hits++; + } + return hits; +} + +export function scoreMarkdownQuality(markdown: string): number { + const normalized = normalizeMarkdown(markdown); + const wordCount = (normalized.match(/\b[\p{L}\p{N}']+\b/gu) || []).length; + const usefulParagraphs = countUsefulParagraphs(normalized); + const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length; + const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS); + const bylineCount = countBylines(normalized); + const staffCount = (normalized.match(/\bForbes Staff\b/gi) || []).length; + + return ( + Math.min(wordCount, 4000) + + usefulParagraphs * 40 + + headingCount * 10 - + markerHits * 180 - + Math.max(0, bylineCount - 1) * 120 - + Math.max(0, staffCount - 1) * 80 + ); +} + +export function shouldCompareWithLegacy(markdown: string): boolean { + const normalized = normalizeMarkdown(markdown); + return ( + countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0 || + countBylines(normalized) > 1 || + countUsefulParagraphs(normalized) < 6 + ); +} + +export function convertWithLegacyExtractor(html: string, baseMetadata: PageMetadata): ConversionResult { + const extracted = extractFromHtml(html); + + let markdown = extracted?.html ? 
convertHtmlToMarkdown(extracted.html) : ""; + if (!markdown.trim()) { + markdown = extracted?.textContent?.trim() || fallbackPlainText(html); + } + + return { + metadata: { + ...baseMetadata, + title: pickString(extracted?.title, baseMetadata.title) ?? "", + description: pickString(extracted?.excerpt, baseMetadata.description) ?? undefined, + author: pickString(extracted?.byline, baseMetadata.author) ?? undefined, + published: pickString(extracted?.published, baseMetadata.published) ?? undefined, + }, + markdown: normalizeMarkdown(markdown), + rawHtml: html, + conversionMethod: extracted ? `legacy:${extracted.method}` : "legacy:plain-text", + }; +} diff --git a/skills/baoyu-url-to-markdown/scripts/main.ts b/skills/baoyu-url-to-markdown/scripts/main.ts index 31d948f..b246c84 100644 --- a/skills/baoyu-url-to-markdown/scripts/main.ts +++ b/skills/baoyu-url-to-markdown/scripts/main.ts @@ -75,6 +75,55 @@ function deriveHtmlSnapshotPath(markdownPath: string): string { return path.join(parsed.dir, `${basename}-captured.html`); } +function extractTitleFromMarkdownDocument(document: string): string { + const normalized = document.replace(/\r\n/g, "\n"); + const frontmatterMatch = normalized.match(/^---\n([\s\S]*?)\n---\n?/); + if (frontmatterMatch) { + const titleLine = frontmatterMatch[1] + .split("\n") + .find((line) => /^title:\s*/i.test(line)); + + if (titleLine) { + const rawValue = titleLine.replace(/^title:\s*/i, "").trim(); + const unquoted = rawValue + .replace(/^"(.*)"$/, "$1") + .replace(/^'(.*)'$/, "$1") + .replace(/\\"/g, '"'); + if (unquoted) return unquoted; + } + } + + const headingMatch = normalized.match(/^#\s+(.+)$/m); + return headingMatch?.[1]?.trim() ?? 
""; +} + +function buildDefuddleApiUrl(targetUrl: string): string { + return `https://defuddle.md/${encodeURIComponent(targetUrl)}`; +} + +async function fetchDefuddleApiMarkdown(targetUrl: string): Promise<{ markdown: string; title: string }> { + const apiUrl = buildDefuddleApiUrl(targetUrl); + const response = await fetch(apiUrl, { + headers: { + accept: "text/markdown,text/plain;q=0.9,*/*;q=0.1", + }, + }); + + if (!response.ok) { + throw new Error(`defuddle.md returned ${response.status} ${response.statusText}`); + } + + const markdown = (await response.text()).replace(/\r\n/g, "\n").trim(); + if (!markdown) { + throw new Error("defuddle.md returned empty markdown"); + } + + return { + markdown, + title: extractTitleFromMarkdownDocument(markdown), + }; +} + async function generateOutputPath(url: string, title: string, outputDir?: string): Promise { const domain = new URL(url).hostname.replace(/^www\./, ""); const slug = generateSlug(title, url); @@ -192,14 +241,41 @@ async function main(): Promise { console.log(`Fetching: ${args.url}`); console.log(`Mode: ${args.wait ? 
"wait" : "auto"}`); - const result = await captureUrl(args); - const outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir); - const outputDir = path.dirname(outputPath); - const htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath); - await mkdir(outputDir, { recursive: true }); - await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8"); + let outputPath: string; + let htmlSnapshotPath: string | null = null; + let document: string; + let conversionMethod: string; + let fallbackReason: string | undefined; - let document = createMarkdownDocument(result); + try { + const result = await captureUrl(args); + outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir); + const outputDir = path.dirname(outputPath); + htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath); + await mkdir(outputDir, { recursive: true }); + await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8"); + + document = createMarkdownDocument(result); + conversionMethod = result.conversionMethod; + fallbackReason = result.fallbackReason; + } catch (error) { + const primaryError = error instanceof Error ? error.message : String(error); + console.warn(`Primary capture failed: ${primaryError}`); + console.warn("Trying defuddle.md API fallback..."); + + try { + const remoteResult = await fetchDefuddleApiMarkdown(args.url); + outputPath = args.output || await generateOutputPath(args.url, remoteResult.title, args.outputDir); + await mkdir(path.dirname(outputPath), { recursive: true }); + + document = remoteResult.markdown; + conversionMethod = "defuddle-api"; + fallbackReason = `Local browser capture failed: ${primaryError}`; + } catch (remoteError) { + const remoteMessage = remoteError instanceof Error ? 
remoteError.message : String(remoteError); + throw new Error(`Local browser capture failed (${primaryError}); defuddle.md fallback failed (${remoteMessage})`); + } + } if (args.downloadMedia) { const mediaResult = await localizeMarkdownMedia(document, { @@ -220,11 +296,15 @@ async function main(): Promise { await writeFile(outputPath, document, "utf-8"); console.log(`Saved: ${outputPath}`); - console.log(`Saved HTML: ${htmlSnapshotPath}`); - console.log(`Title: ${result.metadata.title || "(no title)"}`); - console.log(`Converter: ${result.conversionMethod}`); - if (result.fallbackReason) { - console.warn(`Fallback used: ${result.fallbackReason}`); + if (htmlSnapshotPath) { + console.log(`Saved HTML: ${htmlSnapshotPath}`); + } else { + console.log("Saved HTML: unavailable (defuddle.md fallback)"); + } + console.log(`Title: ${extractTitleFromMarkdownDocument(document) || "(no title)"}`); + console.log(`Converter: ${conversionMethod}`); + if (fallbackReason) { + console.warn(`Fallback used: ${fallbackReason}`); } } diff --git a/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts b/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts new file mode 100644 index 0000000..25ed29b --- /dev/null +++ b/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts @@ -0,0 +1,305 @@ +import { parseHTML } from "linkedom"; + +export interface PageMetadata { + url: string; + title: string; + description?: string; + author?: string; + published?: string; + coverImage?: string; + language?: string; + captured_at: string; +} + +export interface ConversionResult { + metadata: PageMetadata; + markdown: string; + rawHtml: string; + conversionMethod: string; + fallbackReason?: string; + variables?: Record; +} + +export type AnyRecord = Record; + +export const MIN_CONTENT_LENGTH = 120; +export const GOOD_CONTENT_LENGTH = 900; + +const PUBLISHED_TIME_SELECTORS = [ + "meta[property='article:published_time']", + "meta[name='pubdate']", + 
"meta[name='publishdate']", + "meta[name='date']", + "time[datetime]", +]; + +const ARTICLE_TYPES = new Set([ + "Article", + "NewsArticle", + "BlogPosting", + "WebPage", + "ReportageNewsArticle", +]); + +export function pickString(...values: unknown[]): string | null { + for (const value of values) { + if (typeof value === "string") { + const trimmed = value.trim(); + if (trimmed) return trimmed; + } + } + return null; +} + +export function normalizeMarkdown(markdown: string): string { + return markdown + .replace(/\r\n/g, "\n") + .replace(/[ \t]+\n/g, "\n") + .replace(/\n{3,}/g, "\n\n") + .trim(); +} + +export function parseDocument(html: string): Document { + const normalized = /<\s*html[\s>]/i.test(html) + ? html + : `${html}`; + return parseHTML(normalized).document as unknown as Document; +} + +export function sanitizeHtml(html: string): string { + const { document } = parseHTML(`
${html}
/**
 * Reduces an HTML string to its visible text.
 *
 * Non-content elements (scripts, styles, frames, templates, vector graphics)
 * are removed before the text is read, and every whitespace run is collapsed
 * to a single space.
 *
 * @param html - Markup to extract text from.
 * @returns The trimmed text, or `""` when the parsed document exposes no body text.
 */
export function extractTextFromHtml(html: string): string {
  // NOTE(review): the template literal interpolates only `html`; it looks like
  // wrapper markup may have been lost here — confirm against the upstream file.
  const { document } = parseHTML(`${html}`);
  for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
    for (const el of document.querySelectorAll(selector)) {
      el.remove();
    }
  }
  return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? "";
}

/**
 * Looks up the first non-blank `<meta>` content among several candidate keys.
 *
 * Each key is tried as a `name` attribute first and as a `property` attribute
 * second; keys are tried in the order given.
 *
 * @param document - Parsed document to query.
 * @param names - Candidate meta keys, highest priority first.
 * @returns The trimmed `content` value, or `null` when no key yields one.
 */
export function getMetaContent(document: Document, names: string[]): string | null {
  for (const name of names) {
    const element =
      document.querySelector(`meta[name="${name}"]`) ??
      document.querySelector(`meta[property="${name}"]`);
    const content = element?.getAttribute("content")?.trim();
    if (content) return content;
  }
  return null;
}

/**
 * Normalizes a language declaration to a single hyphenated tag.
 *
 * Only the first entry of a comma/space/semicolon separated list is kept, and
 * underscores are converted to hyphens (`en_US` becomes `en-US`).
 *
 * @param value - Raw attribute or meta value; may be `null`.
 * @returns The normalized tag, or `null` for missing or blank input.
 */
function normalizeLanguageTag(value: string | null): string | null {
  if (!value) return null;

  const trimmed = value.trim();
  if (!trimmed) return null;

  const primary = trimmed.split(/[,\s;]/, 1)[0]?.trim();
  if (!primary) return null;

  return primary.replace(/_/g, "-");
}

/**
 * Flattens parsed JSON-LD into a flat list of object nodes.
 *
 * Arrays are flattened recursively. An object carrying an `@graph` array is
 * replaced by the flattened contents of that array (the wrapper itself is not
 * returned). Non-object values are dropped.
 *
 * @param data - Result of `JSON.parse` on a JSON-LD script block.
 * @returns Every object node found, in document order.
 */
function flattenJsonLdItems(data: unknown): AnyRecord[] {
  if (!data || typeof data !== "object") return [];
  if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems);

  const item = data as AnyRecord;
  if (Array.isArray(item["@graph"])) {
    return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
  }

  return [item];
}

/**
 * Collects every JSON-LD node declared in `application/ld+json` script tags.
 *
 * @param document - Parsed document to scan.
 * @returns Flattened JSON-LD nodes; blocks that fail to parse contribute nothing.
 */
function parseJsonLdScripts(document: Document): AnyRecord[] {
  const results: AnyRecord[] = [];
  const scripts = document.querySelectorAll("script[type='application/ld+json']");

  for (const script of scripts) {
    try {
      const data = JSON.parse(script.textContent ?? "");
      results.push(...flattenJsonLdItems(data));
    } catch {
      // Best effort: a malformed block must not abort metadata extraction.
    }
  }

  return results;
}

/**
 * Tells whether a JSON-LD node declares one of the types in `ARTICLE_TYPES`.
 *
 * `@type` may be a single string or an array of strings; every array entry is
 * considered, so a node typed `["WebPage", "Article"]` still qualifies.
 *
 * @param item - JSON-LD node to inspect.
 * @returns `true` when any declared type is a member of `ARTICLE_TYPES`.
 */
function isArticleType(item: AnyRecord): boolean {
  const declared: unknown = item["@type"];
  const candidates: unknown[] = Array.isArray(declared) ? declared : [declared];
  return candidates.some((type) => typeof type === "string" && ARTICLE_TYPES.has(type));
}

/**
 * Extracts a display name from a JSON-LD `author` value.
 *
 * Accepts a plain string, an object with a string `name`, or an array mixing
 * both. Names are trimmed; blank names are treated as missing. Multiple names
 * are joined with `", "`.
 *
 * @param authorData - Raw `author` value from a JSON-LD node.
 * @returns The author name(s), or `null` when none can be extracted.
 */
function extractAuthorFromJsonLd(authorData: unknown): string | null {
  // `||` is intentional: an empty string after trimming means "no author".
  if (typeof authorData === "string") return authorData.trim() || null;
  if (!authorData || typeof authorData !== "object") return null;

  if (Array.isArray(authorData)) {
    const names = authorData
      .map((author) => extractAuthorFromJsonLd(author))
      .filter((name): name is string => Boolean(name));
    return names.length > 0 ? names.join(", ") : null;
  }

  const { name } = authorData as AnyRecord;
  return typeof name === "string" ? name.trim() || null : null;
}

/**
 * Builds partial page metadata from the first article-typed JSON-LD node.
 *
 * Field candidates are resolved through `pickString` (presumably "first usable
 * string wins" — confirm against its definition). `image` is probed as a
 * string, as an object with `url`, and as the first element of an array.
 *
 * @param document - Parsed document to scan.
 * @returns The fields found, or an empty object when no article node exists.
 */
function extractPrimaryJsonLdMeta(document: Document): Partial<PageMetadata> {
  for (const item of parseJsonLdScripts(document)) {
    if (!isArticleType(item)) continue;

    return {
      title: pickString(item.headline, item.name) ?? undefined,
      description: pickString(item.description) ?? undefined,
      author: extractAuthorFromJsonLd(item.author) ?? undefined,
      published: pickString(item.datePublished, item.dateCreated) ?? undefined,
      coverImage:
        pickString(
          item.image,
          (item.image as AnyRecord | undefined)?.url,
          (Array.isArray(item.image) ? item.image[0] : undefined) as unknown
        ) ?? undefined,
    };
  }

  return {};
}

/**
 * Finds a publication timestamp using the `PUBLISHED_TIME_SELECTORS` list.
 *
 * For each selector, the first matching element is read through its `content`
 * attribute, falling back to `datetime`.
 *
 * @param document - Parsed document to query.
 * @returns The trimmed timestamp string, or `null` when nothing matches.
 */
export function extractPublishedTime(document: Document): string | null {
  for (const selector of PUBLISHED_TIME_SELECTORS) {
    const el = document.querySelector(selector);
    if (!el) continue;
    const value = el.getAttribute("content") ?? el.getAttribute("datetime");
    if (value && value.trim()) return value.trim();
  }
  return null;
}

/**
 * Derives a page title, trying in order: `og:title`, `twitter:title`, the
 * `<title>` element (with a trailing site-name suffix removed), then the
 * first `<h1>`.
 *
 * The `<title>` text is cut at the first separator. Pipes and en/em dashes
 * separate with or without surrounding whitespace; a plain hyphen only counts
 * when it has whitespace on both sides, so hyphenated words such as
 * "Self-hosting" stay intact.
 *
 * @param document - Parsed document to query.
 * @returns The title, or `null` when no source yields text.
 */
export function extractTitle(document: Document): string | null {
  const ogTitle = document.querySelector("meta[property='og:title']")?.getAttribute("content");
  if (ogTitle && ogTitle.trim()) return ogTitle.trim();

  const twitterTitle = document.querySelector("meta[name='twitter:title']")?.getAttribute("content");
  if (twitterTitle && twitterTitle.trim()) return twitterTitle.trim();

  const title = document.querySelector("title")?.textContent?.trim();
  if (title) {
    const cleaned = title.split(/\s+-\s+|\s*[|–—]\s*/)[0]?.trim();
    if (cleaned) return cleaned;
  }

  const h1 = document.querySelector("h1")?.textContent?.trim();
  return h1 || null;
}

/**
 * Assembles the metadata record for a captured page.
 *
 * Each field is resolved from several sources in priority order via
 * `pickString`: meta tags first, then JSON-LD, then DOM fallbacks. `published`
 * prefers the first `time[datetime]` element; `language` prefers the root
 * element's `lang` attribute over meta declarations.
 *
 * @param html - Captured page markup.
 * @param url - Address the page was captured from; copied to the result.
 * @param capturedAt - Capture timestamp; copied to `captured_at`.
 * @returns The populated metadata; `title` falls back to `""`.
 */
export function extractMetadataFromHtml(html: string, url: string, capturedAt: string): PageMetadata {
  const document = parseDocument(html);
  const jsonLd = extractPrimaryJsonLdMeta(document);
  const timeEl = document.querySelector("time[datetime]");
  const htmlLang = normalizeLanguageTag(document.documentElement?.getAttribute("lang"));
  const metaLanguage = normalizeLanguageTag(
    pickString(
      getMetaContent(document, ["language", "content-language", "og:locale"]),
      document.querySelector("meta[http-equiv='content-language']")?.getAttribute("content")
    )
  );

  return {
    url,
    title:
      pickString(
        getMetaContent(document, ["og:title", "twitter:title"]),
        jsonLd.title,
        document.querySelector("h1")?.textContent,
        document.title
      ) ?? "",
    description:
      pickString(
        getMetaContent(document, ["description", "og:description", "twitter:description"]),
        jsonLd.description
      ) ?? undefined,
    author:
      pickString(
        getMetaContent(document, ["author", "article:author", "twitter:creator"]),
        jsonLd.author
      ) ?? undefined,
    published:
      pickString(
        timeEl?.getAttribute("datetime"),
        getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]),
        jsonLd.published,
        extractPublishedTime(document)
      ) ?? undefined,
    coverImage:
      pickString(
        getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]),
        jsonLd.coverImage
      ) ?? undefined,
    language: pickString(htmlLang, metaLanguage) ?? undefined,
    captured_at: capturedAt,
  };
}

/**
 * Decides whether converted markdown is substantial enough to keep.
 *
 * Empty markdown is rejected. When the page itself holds less text than
 * `MIN_CONTENT_LENGTH`, any non-empty markdown passes. Otherwise the markdown
 * passes at 80 characters or more, or when it reaches the smaller of 200
 * characters and 20% of the page text length.
 *
 * @param markdown - Converter output.
 * @param html - Source markup the markdown was produced from.
 * @returns `true` when the markdown should be accepted.
 */
export function isMarkdownUsable(markdown: string, html: string): boolean {
  const normalized = normalizeMarkdown(markdown);
  if (!normalized) return false;

  const htmlTextLength = extractTextFromHtml(html).length;
  if (htmlTextLength < MIN_CONTENT_LENGTH) return true;

  if (normalized.length >= 80) return true;
  return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2));
}

/**
 * Tells whether a URL points at YouTube (`youtu.be`, `youtube.com`, or any
 * `*.youtube.com` subdomain).
 *
 * @param url - Absolute URL to test.
 * @returns `false` for other hosts and for strings that do not parse as URLs.
 */
export function isYouTubeUrl(url: string): boolean {
  try {
    const hostname = new URL(url).hostname.toLowerCase();
    return hostname === "youtu.be" || hostname.endsWith(".youtube.com") || hostname === "youtube.com";
  } catch {
    return false;
  }
}

/**
 * Escapes a value for use inside a double-quoted YAML scalar.
 *
 * Backslashes and double quotes are backslash-escaped; every line break
 * (`\r\n`, lone `\r`, or `\n`) becomes the two-character sequence `\n` so the
 * scalar always stays on one line.
 *
 * @param value - Raw text.
 * @returns Text safe to place between double quotes.
 */
function escapeYamlValue(value: string): string {
  return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\r\n?|\n/g, "\\n");
}

/**
 * Renders page metadata as a YAML front matter block.
 *
 * `url` is written unquoted; every other value is double-quoted and escaped.
 * Optional fields are emitted only when truthy.
 *
 * @param meta - Metadata to serialize.
 * @returns The block, delimited by `---` lines, without a trailing newline.
 */
export function formatMetadataYaml(meta: PageMetadata): string {
  const lines = ["---"];
  lines.push(`url: ${meta.url}`);
  lines.push(`title: "${escapeYamlValue(meta.title)}"`);
  if (meta.description) lines.push(`description: "${escapeYamlValue(meta.description)}"`);
  if (meta.author) lines.push(`author: "${escapeYamlValue(meta.author)}"`);
  if (meta.published) lines.push(`published: "${escapeYamlValue(meta.published)}"`);
  if (meta.coverImage) lines.push(`coverImage: "${escapeYamlValue(meta.coverImage)}"`);
  if (meta.language) lines.push(`language: "${escapeYamlValue(meta.language)}"`);
  lines.push(`captured_at: "${escapeYamlValue(meta.captured_at)}"`);
  lines.push("---");
  return lines.join("\n");
}

/**
 * Produces the final markdown file: front matter, an H1 title, then the body.
 *
 * The H1 is inserted only when a title exists and the body does not already
 * open with a matching `# title` line (compared case-insensitively).
 *
 * @param result - Conversion output holding `metadata` and `markdown`.
 * @returns The complete markdown document.
 */
export function createMarkdownDocument(result: ConversionResult): string {
  const yaml = formatMetadataYaml(result.metadata);
  // Escape regex metacharacters so the title is matched literally.
  const escapedTitle = result.metadata.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const titleRegex = new RegExp(`^#\\s+${escapedTitle}\\s*(\\n|$)`, "i");
  const hasTitle = titleRegex.test(result.markdown.trimStart());
  const title = result.metadata.title && !hasTitle ? `\n\n# ${result.metadata.title}\n\n` : "\n\n";
  return yaml + title + result.markdown;
}