From e5d6c8ec688ec0637ab96a53185eba36896a7637 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20Liu=20=E5=AE=9D=E7=8E=89?= Date: Sun, 22 Mar 2026 15:18:46 -0500 Subject: [PATCH] feat(baoyu-url-to-markdown): add URL-specific parser layer for X/Twitter and archive.ph - New parsers/ module with pluggable rule system for site-specific HTML extraction - X status parser: extract tweet text, media, quotes, author from data-testid elements - X article parser: extract long-form article content with inline media - archive.ph parser: restore original URL and prefer #CONTENT container - Improved slug generation with stop words and content-aware slugs - Output path uses subdirectory structure (domain/slug/slug.md) - Fix: preserve anchor elements containing media in legacy converter - Fix: smarter title deduplication in markdown document builder --- skills/baoyu-url-to-markdown/SKILL.md | 20 +- .../scripts/html-to-markdown.ts | 6 + .../scripts/legacy-converter.ts | 10 +- skills/baoyu-url-to-markdown/scripts/main.ts | 86 ++++-- .../scripts/markdown-conversion-shared.ts | 20 +- .../scripts/parsers/index.test.ts | 201 ++++++++++++++ .../scripts/parsers/index.ts | 47 ++++ .../scripts/parsers/rules/archive-ph.ts | 97 +++++++ .../scripts/parsers/rules/index.ts | 10 + .../scripts/parsers/rules/x-article.ts | 137 ++++++++++ .../scripts/parsers/rules/x-shared.ts | 249 ++++++++++++++++++ .../scripts/parsers/rules/x-status.ts | 82 ++++++ .../scripts/parsers/types.ts | 14 + 13 files changed, 953 insertions(+), 26 deletions(-) create mode 100644 skills/baoyu-url-to-markdown/scripts/parsers/index.test.ts create mode 100644 skills/baoyu-url-to-markdown/scripts/parsers/index.ts create mode 100644 skills/baoyu-url-to-markdown/scripts/parsers/rules/archive-ph.ts create mode 100644 skills/baoyu-url-to-markdown/scripts/parsers/rules/index.ts create mode 100644 skills/baoyu-url-to-markdown/scripts/parsers/rules/x-article.ts create mode 100644 
skills/baoyu-url-to-markdown/scripts/parsers/rules/x-shared.ts create mode 100644 skills/baoyu-url-to-markdown/scripts/parsers/rules/x-status.ts create mode 100644 skills/baoyu-url-to-markdown/scripts/parsers/types.ts diff --git a/skills/baoyu-url-to-markdown/SKILL.md b/skills/baoyu-url-to-markdown/SKILL.md index 48881b9..efc71c8 100644 --- a/skills/baoyu-url-to-markdown/SKILL.md +++ b/skills/baoyu-url-to-markdown/SKILL.md @@ -1,7 +1,7 @@ --- name: baoyu-url-to-markdown description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, uses an upgraded Defuddle pipeline with better web-component handling and YouTube transcript extraction, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when needed. If local browser capture fails entirely, it can fall back to the hosted defuddle.md API. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown. 
-version: 1.58.1 +version: 1.59.0 metadata: openclaw: homepage: https://github.com/JimLiu/baoyu-skills#baoyu-url-to-markdown @@ -30,6 +30,9 @@ Fetches any URL via Chrome CDP, saves the rendered HTML snapshot, and converts i |--------|---------| | `scripts/main.ts` | CLI entry point for URL fetching | | `scripts/html-to-markdown.ts` | Markdown conversion entry point and converter selection | +| `scripts/parsers/index.ts` | Unified parser entry: dispatches URL-specific rules before generic converters | +| `scripts/parsers/types.ts` | Unified parser interface shared by all rule files | +| `scripts/parsers/rules/*.ts` | One file per URL rule, for example X status and X article | | `scripts/defuddle-converter.ts` | Defuddle-based conversion | | `scripts/legacy-converter.ts` | Pre-Defuddle legacy extraction and markdown conversion | | `scripts/markdown-conversion-shared.ts` | Shared metadata parsing and markdown document helpers | @@ -115,10 +118,13 @@ Full reference: [references/config/first-time-setup.md](references/config/first- ## Features - Chrome CDP for full JavaScript rendering +- URL-specific parser layer for sites that need custom HTML rules before generic extraction - Two capture modes: auto or wait-for-user - Save rendered HTML as a sibling `-captured.html` file - Clean markdown output with metadata - Upgraded Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history +- X/Twitter pages can use HTML-specific parsing for Tweets and Articles, which improves title/body/media extraction on `x.com` / `twitter.com` +- `archive.ph` / related archive mirrors can restore the original URL from `input[name=q]` and prefer `#CONTENT` before falling back to the page body - Materializes shadow DOM content before conversion so web-component pages survive serialization better - YouTube pages can include transcript/caption text in the markdown when YouTube exposes a caption track - If local browser capture fails completely, can 
fall back to `defuddle.md/` and still save markdown @@ -201,14 +207,16 @@ When `--download-media` is enabled: Conversion order: -1. Try Defuddle first -2. For rich pages such as YouTube, prefer Defuddle's extractor-specific output (including transcripts when available) instead of replacing it with the legacy pipeline -3. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor -4. If the entire local browser capture flow fails before markdown can be produced, try the hosted `https://defuddle.md/` API and save its markdown output directly -5. The legacy fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history +1. Try the URL-specific parser layer first when a site rule matches +2. If no specialized parser matches, try Defuddle +3. For rich pages such as YouTube, prefer Defuddle's extractor-specific output (including transcripts when available) instead of replacing it with the legacy pipeline +4. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor +5. If the entire local browser capture flow fails before markdown can be produced, try the hosted `https://defuddle.md/` API and save its markdown output directly +6. 
The legacy fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history CLI output will show: +- `Converter: parser:...` when a URL-specific parser succeeded - `Converter: defuddle` when Defuddle succeeds - `Converter: legacy:...` plus `Fallback used: ...` when fallback was needed - `Converter: defuddle-api` when local browser capture failed and the hosted API was used instead diff --git a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts index 34667b6..af57808 100644 --- a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts +++ b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts @@ -12,6 +12,7 @@ import { scoreMarkdownQuality, shouldCompareWithLegacy, } from "./legacy-converter.js"; +import { tryUrlRuleParsers } from "./parsers/index.js"; export type { ConversionResult, PageMetadata }; export { createMarkdownDocument, formatMetadataYaml }; @@ -105,6 +106,11 @@ export async function extractContent(html: string, url: string): Promise /^[a-zA-Z]/.test(w) && w.length >= 2 && !SLUG_STOP_WORDS.has(w.toLowerCase())) + .map((w) => w.toLowerCase()); + + const unique: string[] = []; + const seen = new Set(); + for (const w of words) { + if (!seen.has(w)) { + seen.add(w); + unique.push(w); + if (unique.length >= 6) break; + } + } + return unique.length >= 2 ? 
unique.join("-").slice(0, 50) : null; +} + +function generateSlug(title: string, url: string, content?: string): string { + const asciiWords = title + .replace(/[^\w\s]/g, " ") + .split(/\s+/) + .filter((w) => /[a-zA-Z]/.test(w) && w.length >= 2 && !SLUG_STOP_WORDS.has(w.toLowerCase())) + .map((w) => w.toLowerCase()); + + if (asciiWords.length >= 2) { + return asciiWords.slice(0, 6).join("-").slice(0, 50); + } + + if (content) { + const contentSlug = extractSlugFromContent(content); + if (contentSlug) return contentSlug; + } + + const GENERIC_PATH_SEGMENTS = new Set(["status", "article", "post", "posts", "p", "blog", "news", "articles"]); + const parsed = new URL(url); + const pathSlug = parsed.pathname + .split("/") + .filter((s) => s.length > 0 && !/^\d{10,}$/.test(s) && !GENERIC_PATH_SEGMENTS.has(s.toLowerCase())) + .join("-") .toLowerCase() - .replace(/[^\w\s-]/g, "") - .replace(/\s+/g, "-") + .replace(/[^\w-]/g, "-") .replace(/-+/g, "-") .replace(/^-|-$/g, "") - .slice(0, 50) || "page"; + .slice(0, 40); + + const prefix = asciiWords.slice(0, 2).join("-"); + const combined = prefix ? `${prefix}-${pathSlug}` : pathSlug; + return combined.slice(0, 50) || "page"; } function formatTimestamp(): string { @@ -124,18 +180,18 @@ async function fetchDefuddleApiMarkdown(targetUrl: string): Promise<{ markdown: }; } -async function generateOutputPath(url: string, title: string, outputDir?: string): Promise { +async function generateOutputPath(url: string, title: string, outputDir?: string, content?: string): Promise { const domain = new URL(url).hostname.replace(/^www\./, ""); - const slug = generateSlug(title, url); + const slug = generateSlug(title, url, content); const dataDir = outputDir ? 
path.resolve(outputDir) : resolveUrlToMarkdownDataDir(); - const basePath = path.join(dataDir, domain, `${slug}.md`); + const basePath = path.join(dataDir, domain, slug, `${slug}.md`); if (!(await fileExists(basePath))) { return basePath; } const timestampSlug = `${slug}-${formatTimestamp()}`; - return path.join(dataDir, domain, `${timestampSlug}.md`); + return path.join(dataDir, domain, timestampSlug, `${timestampSlug}.md`); } async function waitForUserSignal(): Promise { @@ -249,13 +305,12 @@ async function main(): Promise { try { const result = await captureUrl(args); - outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir); + document = createMarkdownDocument(result); + outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir, document); const outputDir = path.dirname(outputPath); htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath); await mkdir(outputDir, { recursive: true }); await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8"); - - document = createMarkdownDocument(result); conversionMethod = result.conversionMethod; fallbackReason = result.fallbackReason; } catch (error) { @@ -265,10 +320,10 @@ async function main(): Promise { try { const remoteResult = await fetchDefuddleApiMarkdown(args.url); - outputPath = args.output || await generateOutputPath(args.url, remoteResult.title, args.outputDir); - await mkdir(path.dirname(outputPath), { recursive: true }); - document = remoteResult.markdown; + document = remoteResult.markdown; + outputPath = args.output || await generateOutputPath(args.url, remoteResult.title, args.outputDir, document); + await mkdir(path.dirname(outputPath), { recursive: true }); conversionMethod = "defuddle-api"; fallbackReason = `Local browser capture failed: ${primaryError}`; } catch (remoteError) { diff --git a/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts b/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts index 25ed29b..500ccca 100644
--- a/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts +++ b/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts @@ -300,6 +300,24 @@ export function createMarkdownDocument(result: ConversionResult): string { const escapedTitle = result.metadata.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&"); const titleRegex = new RegExp(`^#\\s+${escapedTitle}\\s*(\\n|$)`, "i"); const hasTitle = titleRegex.test(result.markdown.trimStart()); - const title = result.metadata.title && !hasTitle ? `\n\n# ${result.metadata.title}\n\n` : "\n\n"; + const firstMeaningfulLine = result.markdown + .replace(/\r\n/g, "\n") + .split("\n") + .map((line) => line.trim()) + .find((line) => line && !/^!?\[[^\]]*\]\([^)]+\)$/.test(line)) + ?.replace(/^>\s*/, "") + ?.replace(/^#+\s+/, "") + ?.trim(); + const comparableTitle = result.metadata.title.toLowerCase().replace(/(?:\.{3}|…)\s*$/, ""); + const comparableFirstLine = firstMeaningfulLine?.toLowerCase() ?? ""; + const titleRepeatsContent = + comparableTitle !== "" && + comparableFirstLine !== "" && + (comparableFirstLine === comparableTitle || + comparableFirstLine.startsWith(comparableTitle) || + comparableTitle.startsWith(comparableFirstLine)); + const title = result.metadata.title && !hasTitle && !titleRepeatsContent + ? `\n\n# ${result.metadata.title}\n\n` + : "\n\n"; return yaml + title + result.markdown; } diff --git a/skills/baoyu-url-to-markdown/scripts/parsers/index.test.ts b/skills/baoyu-url-to-markdown/scripts/parsers/index.test.ts new file mode 100644 index 0000000..b1e224f --- /dev/null +++ b/skills/baoyu-url-to-markdown/scripts/parsers/index.test.ts @@ -0,0 +1,201 @@ +import { describe, expect, test } from "bun:test"; + +import { + createMarkdownDocument, + extractMetadataFromHtml, +} from "../markdown-conversion-shared.js"; +import { tryUrlRuleParsers } from "./index.js"; + +const CAPTURED_AT = "2026-03-22T06:00:00.000Z"; + +const ARTICLE_HTML = ` + + +
+ +
+ Image +
+
+
Karpathy:"写代码"已经不是对的动词了
+ +
+

Andrej Karpathy 说他从 2024 年 12 月起就基本没手写过一行代码。

+ +
+
+
+ Image +
+
+
+
+

要点速览

+
    +
  • 核心焦虑从 GPU 利用率转向 Token 吞吐量
  • +
+
+

写代码已经不是对的动词了。

+
+
+
+ +`; + +const STATUS_HTML = ` + + + + +`; + +const ARCHIVE_HTML = ` + + + archive.ph + + +
+ +
+ +
+

Major leap towards reanimation after death as mammal brain preserved

+

+ Researchers say the preserved structure and activity markers suggest a significant step + forward in keeping delicate brain tissue viable after clinical death. +

+

+ The archive wrapper should not take precedence over the actual article body when the + CONTENT container is available for parsing. +

+ Brain tissue +
+ +`; + +const ARCHIVE_FALLBACK_HTML = ` + + + archive.ph + + + +
+

Fallback body parsing still works

+

+ When CONTENT is absent, the parser should fall back to the body content instead of + returning null or keeping the archive wrapper as the final URL. +

+

+ This ensures archived pages with slightly different layouts still produce usable markdown. +

+
+ +`; + +function parse(html: string, url: string) { + const baseMetadata = extractMetadataFromHtml(html, url, CAPTURED_AT); + return tryUrlRuleParsers(html, url, baseMetadata); +} + +describe("url rule parsers", () => { + test("parses archive.ph pages from CONTENT and restores the original URL", () => { + const result = parse(ARCHIVE_HTML, "https://archive.ph/SMcX5"); + + expect(result).not.toBeNull(); + expect(result?.conversionMethod).toBe("parser:archive-ph"); + expect(result?.metadata.url).toBe( + "https://www.newscientist.com/article/2520204-major-leap-towards-reanimation-after-death-as-mammals-brain-preserved/" + ); + expect(result?.metadata.title).toBe( + "Major leap towards reanimation after death as mammal brain preserved" + ); + expect(result?.metadata.coverImage).toBe("https://cdn.example.com/brain.jpg"); + expect(result?.markdown).toContain("Researchers say the preserved structure"); + expect(result?.markdown).toContain("![Brain tissue](https://cdn.example.com/brain.jpg)"); + expect(result?.markdown).not.toContain("Archive shell text that should be ignored"); + }); + + test("falls back to body when archive.ph CONTENT is missing", () => { + const result = parse(ARCHIVE_FALLBACK_HTML, "https://archive.ph/fallback"); + + expect(result).not.toBeNull(); + expect(result?.conversionMethod).toBe("parser:archive-ph"); + expect(result?.metadata.url).toBe("https://example.com/fallback-story"); + expect(result?.metadata.title).toBe("Fallback body parsing still works"); + expect(result?.markdown).toContain("When CONTENT is absent"); + }); + + test("parses X article pages from HTML", () => { + const result = parse( + ARTICLE_HTML, + "https://x.com/dotey/article/2035141635713941927" + ); + + expect(result).not.toBeNull(); + expect(result?.conversionMethod).toBe("parser:x-article"); + expect(result?.metadata.title).toBe("Karpathy:\"写代码\"已经不是对的动词了"); + expect(result?.metadata.author).toBe("宝玉 (@dotey)"); + 
expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/article-cover.jpg"); + expect(result?.metadata.published).toBe("2026-03-20T23:49:11.000Z"); + expect(result?.metadata.language).toBe("zh"); + expect(result?.markdown).toContain("## 要点速览"); + expect(result?.markdown).toContain( + "[![](https://pbs.twimg.com/media/article-inline.jpg)](/dotey/article/2035141635713941927/media/2)" + ); + expect(result?.markdown).toContain("写代码已经不是对的动词了。"); + + const document = createMarkdownDocument(result!); + expect(document).toContain("# Karpathy:\"写代码\"已经不是对的动词了"); + }); + + test("parses X status pages from HTML without duplicating the title heading", () => { + const result = parse( + STATUS_HTML, + "https://x.com/dotey/status/2035590649081196710" + ); + + expect(result).not.toBeNull(); + expect(result?.conversionMethod).toBe("parser:x-status"); + expect(result?.metadata.author).toBe("宝玉 (@dotey)"); + expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/tweet-main.jpg"); + expect(result?.metadata.language).toBe("zh"); + expect(result?.markdown).toContain("转译:把下面这段加到你的 Codex 自定义指令里"); + expect(result?.markdown).toContain("> Quote from Matt Shumer (@mattshumer_)"); + expect(result?.markdown).toContain("!["); + + const document = createMarkdownDocument(result!); + expect(document).not.toContain("\n\n# 转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:\n\n"); + }); +}); diff --git a/skills/baoyu-url-to-markdown/scripts/parsers/index.ts b/skills/baoyu-url-to-markdown/scripts/parsers/index.ts new file mode 100644 index 0000000..c6cac09 --- /dev/null +++ b/skills/baoyu-url-to-markdown/scripts/parsers/index.ts @@ -0,0 +1,47 @@ +import { + isMarkdownUsable, + normalizeMarkdown, + parseDocument, + type ConversionResult, + type PageMetadata, +} from "../markdown-conversion-shared.js"; +import { URL_RULE_PARSERS } from "./rules/index.js"; +import type { UrlRuleParserContext } from "./types.js"; + +export type { UrlRuleParser, UrlRuleParserContext } from "./types.js"; + 
+export function tryUrlRuleParsers( + html: string, + url: string, + baseMetadata: PageMetadata +): ConversionResult | null { + const document = parseDocument(html); + const context: UrlRuleParserContext = { + html, + url, + document, + baseMetadata, + }; + + for (const parser of URL_RULE_PARSERS) { + if (!parser.supports(context)) continue; + + try { + const result = parser.parse(context); + if (!result) continue; + + const markdown = normalizeMarkdown(result.markdown); + if (!isMarkdownUsable(markdown, html)) continue; + + return { + ...result, + markdown, + }; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.warn(`[url-to-markdown] parser ${parser.id} failed: ${message}`); + } + } + + return null; +} diff --git a/skills/baoyu-url-to-markdown/scripts/parsers/rules/archive-ph.ts b/skills/baoyu-url-to-markdown/scripts/parsers/rules/archive-ph.ts new file mode 100644 index 0000000..f3191f9 --- /dev/null +++ b/skills/baoyu-url-to-markdown/scripts/parsers/rules/archive-ph.ts @@ -0,0 +1,97 @@ +import { convertHtmlFragmentToMarkdown } from "../../legacy-converter.js"; +import { + normalizeMarkdown, + pickString, + type ConversionResult, +} from "../../markdown-conversion-shared.js"; +import type { UrlRuleParser, UrlRuleParserContext } from "../types.js"; + +const ARCHIVE_HOSTS = new Set([ + "archive.ph", + "archive.is", + "archive.today", + "archive.md", + "archive.vn", + "archive.li", + "archive.fo", +]); + +function isArchiveHost(url: string): boolean { + try { + return ARCHIVE_HOSTS.has(new URL(url).hostname.toLowerCase()); + } catch { + return false; + } +} + +function readOriginalUrl(document: Document): string | undefined { + const value = document.querySelector("input[name='q']")?.getAttribute("value")?.trim(); + if (!value) return undefined; + + try { + return new URL(value).href; + } catch { + return undefined; + } +} + +function summarize(text: string, maxLength: number): string | undefined { + const 
normalized = text.replace(/\s+/g, " ").trim(); + if (!normalized) return undefined; + if (normalized.length <= maxLength) return normalized; + return `${normalized.slice(0, Math.max(0, maxLength - 1)).trimEnd()}…`; +} + +function pickContentRoot(document: Document): Element | null { + return ( + document.querySelector("#CONTENT") ?? + document.querySelector("#content") ?? + document.body + ); +} + +function pickContentTitle(root: Element, fallbackTitle: string): string { + const contentTitle = pickString( + root.querySelector("h1")?.textContent, + root.querySelector("[itemprop='headline']")?.textContent, + root.querySelector("article h2")?.textContent + ); + if (contentTitle) return contentTitle; + if (fallbackTitle && !/^archive\./i.test(fallbackTitle.trim())) return fallbackTitle; + return ""; +} + +function parseArchivePage(context: UrlRuleParserContext): ConversionResult | null { + const root = pickContentRoot(context.document); + if (!root) return null; + + const markdown = normalizeMarkdown(convertHtmlFragmentToMarkdown(root.innerHTML)); + if (!markdown) return null; + + const originalUrl = readOriginalUrl(context.document) ?? context.baseMetadata.url; + const bodyText = root.textContent?.replace(/\s+/g, " ").trim() ?? ""; + const published = root.querySelector("time[datetime]")?.getAttribute("datetime") ?? undefined; + const coverImage = root.querySelector("img[src]")?.getAttribute("src") ?? undefined; + + return { + metadata: { + ...context.baseMetadata, + url: originalUrl, + title: pickContentTitle(root, context.baseMetadata.title), + description: summarize(bodyText, 220) ?? context.baseMetadata.description, + published: pickString(published, context.baseMetadata.published) ?? undefined, + coverImage: pickString(coverImage, context.baseMetadata.coverImage) ?? 
undefined, + }, + markdown, + rawHtml: context.html, + conversionMethod: "parser:archive-ph", + }; +} + +export const archivePhRuleParser: UrlRuleParser = { + id: "archive-ph", + supports(context) { + return isArchiveHost(context.url); + }, + parse: parseArchivePage, +}; diff --git a/skills/baoyu-url-to-markdown/scripts/parsers/rules/index.ts b/skills/baoyu-url-to-markdown/scripts/parsers/rules/index.ts new file mode 100644 index 0000000..fe504be --- /dev/null +++ b/skills/baoyu-url-to-markdown/scripts/parsers/rules/index.ts @@ -0,0 +1,10 @@ +import { archivePhRuleParser } from "./archive-ph.js"; +import { xArticleRuleParser } from "./x-article.js"; +import { xStatusRuleParser } from "./x-status.js"; +import type { UrlRuleParser } from "../types.js"; + +export const URL_RULE_PARSERS: UrlRuleParser[] = [ + archivePhRuleParser, + xArticleRuleParser, + xStatusRuleParser, +]; diff --git a/skills/baoyu-url-to-markdown/scripts/parsers/rules/x-article.ts b/skills/baoyu-url-to-markdown/scripts/parsers/rules/x-article.ts new file mode 100644 index 0000000..b065182 --- /dev/null +++ b/skills/baoyu-url-to-markdown/scripts/parsers/rules/x-article.ts @@ -0,0 +1,137 @@ +import { + normalizeMarkdown, + pickString, + type ConversionResult, +} from "../../markdown-conversion-shared.js"; +import type { UrlRuleParser, UrlRuleParserContext } from "../types.js"; +import { + cleanText, + collectMediaMarkdown, + convertXRichTextElementToMarkdown, + extractPublishedForCurrentUrl, + inferLanguage, + isXArticlePath, + isXHost, + normalizeXMarkdown, + parseUrl, + pickFirstValidLinkText, + sanitizeCoverImage, + summarizeText, +} from "./x-shared.js"; + +function collectArticleMarkdown(root: Element): { markdown: string; mediaUrls: string[] } { + const parts: string[] = []; + const seenMedia = new Set(); + const mediaUrls: string[] = []; + + function pushPart(value: string): void { + const normalized = normalizeMarkdown(value); + if (!normalized) return; + parts.push(normalized); + } + + 
function walk(node: Element): void { + const testId = node.getAttribute("data-testid"); + + if (testId === "twitterArticleRichTextView" || testId === "longformRichTextComponent") { + const bodyMedia = collectMediaMarkdown(node, seenMedia); + mediaUrls.push(...bodyMedia.urls.filter((url) => !mediaUrls.includes(url))); + pushPart(convertXRichTextElementToMarkdown(node)); + return; + } + + if (testId === "tweetPhoto") { + const media = collectMediaMarkdown(node, seenMedia); + mediaUrls.push(...media.urls.filter((url) => !mediaUrls.includes(url))); + for (const line of media.lines) pushPart(line); + return; + } + + if ( + testId === "twitter-article-title" || + testId === "User-Name" || + testId === "Tweet-User-Avatar" || + testId === "reply" || + testId === "retweet" || + testId === "like" || + testId === "bookmark" || + testId === "caret" || + testId === "app-text-transition-container" + ) { + return; + } + + if (node.tagName === "TIME" || node.tagName === "BUTTON") { + return; + } + + for (const child of Array.from(node.children)) { + walk(child); + } + } + + for (const child of Array.from(root.children)) { + walk(child); + } + + return { + markdown: normalizeXMarkdown(parts.join("\n\n")), + mediaUrls, + }; +} + +function parseXArticle(context: UrlRuleParserContext): ConversionResult | null { + const articleRoot = context.document.querySelector("[data-testid='twitterArticleReadView']") as Element | null; + if (!articleRoot) return null; + + const title = cleanText( + context.document.querySelector("[data-testid='twitter-article-title']")?.textContent + ); + const identity = pickFirstValidLinkText( + context.document.querySelector("[data-testid='User-Name']") + ); + const published = extractPublishedForCurrentUrl(articleRoot, context.url); + const { markdown, mediaUrls } = collectArticleMarkdown(articleRoot); + if (!markdown) return null; + + const bodyText = cleanText( + context.document.querySelector("[data-testid='twitterArticleRichTextView']")?.textContent ?? 
export const xArticleRuleParser: UrlRuleParser = {
  id: "x-article",
  // Match by URL shape first; also accept any page that actually renders the
  // article read-view, since the canonical path may be masked by redirects.
  supports(context) {
    const parsed = parseUrl(context.url);
    if (!parsed || !isXHost(parsed.hostname)) {
      return false;
    }

    return (
      isXArticlePath(parsed.pathname) ||
      Boolean(
        context.document.querySelector("[data-testid='twitterArticleReadView']") ||
        context.document.querySelector("[data-testid='twitterArticleRichTextView']")
      )
    );
  },
  parse(context) {
    return parseXArticle(context);
  },
};

// --- parsers/rules/x-shared.ts ---
import { convertHtmlFragmentToMarkdown } from "../../legacy-converter.js";
import { normalizeMarkdown } from "../../markdown-conversion-shared.js";

// X's generic fallback og:image; treated as "no cover image" by
// sanitizeCoverImage below.
export const DEFAULT_X_OG_IMAGE = "https://abs.twimg.com/rweb/ssr/default/v2/og/image.png";

// Markdown lines ready to append, plus the bare media URLs in the same order.
export type MediaResult = {
  lines: string[];
  urls: string[];
};

// True for x.com / twitter.com and any of their subdomains.
export function isXHost(hostname: string): boolean {
  const normalized = hostname.toLowerCase();
  return (
    normalized === "x.com" ||
    normalized === "twitter.com" ||
    normalized.endsWith(".x.com") ||
    normalized.endsWith(".twitter.com")
  );
}
+ normalized.endsWith(".twitter.com") + ); +} + +export function parseUrl(input: string): URL | null { + try { + return new URL(input); + } catch { + return null; + } +} + +export function isXStatusPath(pathname: string): boolean { + return /^\/[^/]+\/status(?:es)?\/\d+$/i.test(pathname) || /^\/i\/web\/status\/\d+$/i.test(pathname); +} + +export function isXArticlePath(pathname: string): boolean { + return /^\/[^/]+\/article\/\d+$/i.test(pathname) || /^\/(?:i\/)?article\/\d+$/i.test(pathname); +} + +export function cleanText(value: string | null | undefined): string { + return (value ?? "").replace(/\s+/g, " ").trim(); +} + +export function cleanUserLabel(value: string | null | undefined): string { + return cleanText(value).replace(/\bVerified account\b/gi, "").replace(/\s{2,}/g, " ").trim(); +} + +export function escapeMarkdownAlt(text: string): string { + return text.replace(/[\[\]]/g, "\\$&"); +} + +export function normalizeAlt(text: string | null | undefined): string { + const cleaned = cleanText(text); + if (!cleaned || /^(image|photo)$/i.test(cleaned)) return ""; + return escapeMarkdownAlt(cleaned); +} + +export function summarizeText(text: string, maxLength: number): string | undefined { + const normalized = cleanText(text); + if (!normalized) return undefined; + return normalized.length > maxLength + ? `${normalized.slice(0, maxLength - 3)}...` + : normalized; +} + +export function buildTweetTitle(text: string, fallback: string): string { + return summarizeText(text, 80) ?? 
fallback; +} + +export function normalizeXMarkdown(markdown: string): string { + return normalizeMarkdown(markdown.replace(/^(#{1,6})\s*\n+([^\n])/gm, "$1 $2")); +} + +export function inferLanguage(text: string, fallback?: string): string | undefined { + const normalized = cleanText(text); + if (!normalized) return fallback; + + const han = (normalized.match(/\p{Script=Han}/gu) || []).length; + const hiragana = (normalized.match(/\p{Script=Hiragana}/gu) || []).length; + const katakana = (normalized.match(/\p{Script=Katakana}/gu) || []).length; + const hangul = (normalized.match(/\p{Script=Hangul}/gu) || []).length; + + if (hangul >= 8) return "ko"; + if (hiragana + katakana >= 8) return "ja"; + if (han >= 16) return "zh"; + return fallback; +} + +export function buildQuoteMarkdown(markdown: string, author?: string): string { + const normalized = normalizeMarkdown(markdown); + if (!normalized) return ""; + + const lines = normalized.split("\n"); + const prefixed = lines.map((line) => (line ? `> ${line}` : ">")).join("\n"); + const header = author ? `> Quote from ${author}` : "> Quote"; + return `${header}\n${prefixed}`; +} + +export function pickFirstValidLinkText(userNameEl: Element | null | undefined): { + name?: string; + username?: string; + author?: string; +} { + if (!userNameEl) return {}; + + const linkTexts = Array.from(userNameEl.querySelectorAll("a[href]")) + .map((link) => cleanUserLabel(link.textContent)) + .filter(Boolean); + + let username = linkTexts.find((text) => text.startsWith("@")); + let name = linkTexts.find((text) => !text.startsWith("@") && !/^(promote|more)$/i.test(text)); + + if (!username || !name) { + const text = cleanUserLabel(userNameEl.textContent); + const fallbackMatch = text.match(/^(.*?)\s*(@[A-Za-z0-9_]+)(?:\s*·.*)?$/); + if (fallbackMatch) { + name = name ?? cleanText(fallbackMatch[1]); + username = username ?? cleanText(fallbackMatch[2]); + } + } + + const author = name && username ? `${name} (${username})` : username ?? 
name; + return { name, username, author }; +} + +export function extractPublishedForCurrentUrl(root: ParentNode, url: string): string | undefined { + const parsed = parseUrl(url); + if (!parsed) return undefined; + const currentPath = parsed.pathname.toLowerCase(); + + for (const timeElement of root.querySelectorAll("a[href] time[datetime]")) { + const href = timeElement.closest("a")?.getAttribute("href"); + const hrefUrl = href ? parseUrl(href.startsWith("http") ? href : `${parsed.origin}${href}`) : null; + if (hrefUrl?.pathname.toLowerCase() === currentPath) { + return timeElement.getAttribute("datetime") ?? undefined; + } + } + + return root.querySelector("time[datetime]")?.getAttribute("datetime") ?? undefined; +} + +export function collectMediaMarkdown(root: ParentNode, seen: Set): MediaResult { + const lines: string[] = []; + const urls: string[] = []; + const rootElement = root as Element & { + getAttribute?: (name: string) => string | null; + }; + const photoNodes = [ + ...(typeof rootElement.getAttribute === "function" && + rootElement.getAttribute("data-testid") === "tweetPhoto" + ? [rootElement] + : []), + ...Array.from(root.querySelectorAll("[data-testid='tweetPhoto']")), + ]; + + for (const node of photoNodes) { + const img = node.querySelector("img"); + const imageUrl = img?.getAttribute("src"); + if (imageUrl && !seen.has(imageUrl)) { + seen.add(imageUrl); + urls.push(imageUrl); + lines.push(`![${normalizeAlt(img?.getAttribute("alt"))}](${imageUrl})`); + } + + const video = node.querySelector("video"); + const posterUrl = video?.getAttribute("poster"); + if (posterUrl && !seen.has(posterUrl)) { + seen.add(posterUrl); + urls.push(posterUrl); + lines.push(`![video](${posterUrl})`); + } + + const videoUrl = video?.getAttribute("src") ?? 
// Rewrite every tweetPhoto widget into plain <img>/<a> elements so the
// generic HTML-to-markdown converter can handle them: an image keeps its
// (normalized) alt text, a video becomes poster image + "video" link, and
// widgets with no extractable media are removed entirely.
export function materializeTweetPhotoNodes(root: Element): void {
  for (const photo of Array.from(root.querySelectorAll("[data-testid='tweetPhoto']"))) {
    const document = photo.ownerDocument;
    const container = document.createElement("span");

    const img = photo.querySelector("img");
    const imageUrl = img?.getAttribute("src");
    if (imageUrl) {
      const image = document.createElement("img");
      image.setAttribute("src", imageUrl);
      // Placeholder alts ("Image"/"photo") normalize to "" and are omitted.
      const alt = normalizeAlt(img?.getAttribute("alt"));
      if (alt) {
        image.setAttribute("alt", alt);
      }
      container.appendChild(image);
    }

    const video = photo.querySelector("video");
    const posterUrl = video?.getAttribute("poster");
    if (posterUrl) {
      const poster = document.createElement("img");
      poster.setAttribute("src", posterUrl);
      poster.setAttribute("alt", "video");
      container.appendChild(poster);
    }

    // The stream URL may be on the <video> itself or a nested <source>.
    const videoUrl = video?.getAttribute("src") ?? video?.querySelector("source")?.getAttribute("src");
    if (videoUrl) {
      // Space-separate the link from any preceding image element.
      if (container.childNodes.length > 0) {
        container.appendChild(document.createTextNode(" "));
      }
      const link = document.createElement("a");
      link.setAttribute("href", videoUrl);
      link.textContent = "video";
      container.appendChild(link);
    }

    // Nothing extractable: drop the widget rather than leave an empty span.
    if (container.childNodes.length === 0) {
      photo.remove();
      continue;
    }

    photo.replaceWith(container);
  }
}
// X wraps media in anchor cards; when an anchor holds exactly one image and
// no visible text, keep just the image so the markdown output is not a
// link-wrapped image.
function collapseLinkedMediaContainers(root: Element): void {
  for (const anchor of Array.from(root.querySelectorAll("a[href]"))) {
    const images = Array.from(anchor.querySelectorAll("img"));
    if (images.length !== 1) continue;
    if (cleanText(anchor.textContent)) continue;

    const image = images[0].cloneNode(true);
    anchor.replaceChildren(image);
  }
}

// Convert one X rich-text subtree to markdown. Operates on a deep clone so
// the live DOM (consulted by other passes) is never mutated.
export function convertXRichTextElementToMarkdown(node: Element): string {
  const clone = node.cloneNode(true) as Element;
  materializeTweetPhotoNodes(clone);
  collapseLinkedMediaContainers(clone);
  return normalizeXMarkdown(convertHtmlFragmentToMarkdown(clone.innerHTML));
}

// Choose a cover image: the first real media URL wins; X's generic fallback
// og:image counts as "none".
export function sanitizeCoverImage(primary?: string, fallback?: string): string | undefined {
  if (primary) return primary;
  if (!fallback || fallback === DEFAULT_X_OG_IMAGE) return undefined;
  return fallback;
}

// --- parsers/rules/x-status.ts ---
import type { ConversionResult } from "../../markdown-conversion-shared.js";
import type { UrlRuleParser, UrlRuleParserContext } from "../types.js";
import {
  buildQuoteMarkdown,
  buildTweetTitle,
  cleanText,
  collectMediaMarkdown,
  convertXRichTextElementToMarkdown,
  extractPublishedForCurrentUrl,
  inferLanguage,
  isXHost,
  isXStatusPath,
  normalizeXMarkdown,
  parseUrl,
  pickFirstValidLinkText,
  sanitizeCoverImage,
  summarizeText,
} from "./x-shared.js";
pickFirstValidLinkText, + sanitizeCoverImage, + summarizeText, +} from "./x-shared.js"; + +function parseXStatus(context: UrlRuleParserContext): ConversionResult | null { + const article = context.document.querySelector("article[data-testid='tweet'], article") as Element | null; + if (!article) return null; + + const tweetTextElements = Array.from(article.querySelectorAll("[data-testid='tweetText']")) as Element[]; + if (tweetTextElements.length === 0) return null; + + const userNameElements = Array.from(article.querySelectorAll("[data-testid='User-Name']")) as Element[]; + const mainTextElement = tweetTextElements[0]; + const mainIdentity = pickFirstValidLinkText(userNameElements[0]); + const published = extractPublishedForCurrentUrl(article, context.url); + const mainMarkdown = normalizeXMarkdown(convertXRichTextElementToMarkdown(mainTextElement)); + if (!mainMarkdown) return null; + + const parts = [mainMarkdown]; + const quotedTextElements = tweetTextElements.slice(1); + const quotedUserNameElements = userNameElements.slice(1); + + quotedTextElements.forEach((element, index) => { + const quoteMarkdown = normalizeXMarkdown(convertXRichTextElementToMarkdown(element)); + if (!quoteMarkdown) return; + const quoteIdentity = pickFirstValidLinkText(quotedUserNameElements[index]); + parts.push(buildQuoteMarkdown(quoteMarkdown, quoteIdentity.author)); + }); + + const media = collectMediaMarkdown(article, new Set()); + if (media.lines.length > 0) { + parts.push(media.lines.join("\n\n")); + } + + const mainText = cleanText(mainTextElement.textContent); + const markdown = normalizeXMarkdown(parts.join("\n\n")); + + return { + metadata: { + ...context.baseMetadata, + title: buildTweetTitle(mainText, context.baseMetadata.title), + description: summarizeText(mainText, 220) ?? context.baseMetadata.description, + author: mainIdentity.author ?? context.baseMetadata.author, + published: published ?? 
export const xStatusRuleParser: UrlRuleParser = {
  id: "x-status",
  // Require both a status-shaped path AND rendered tweet text, so error and
  // login-wall pages fall through to the generic pipeline.
  supports(context) {
    const parsed = parseUrl(context.url);
    if (!parsed || !isXHost(parsed.hostname)) {
      return false;
    }

    return isXStatusPath(parsed.pathname) && Boolean(context.document.querySelector("[data-testid='tweetText']"));
  },
  parse(context): ConversionResult | null {
    return parseXStatus(context);
  },
};

// --- parsers/types.ts ---
import type { ConversionResult, PageMetadata } from "../markdown-conversion-shared.js";

// Everything a rule parser receives: the raw captured HTML, the fetched URL,
// the parsed DOM, and the metadata already extracted by the generic pipeline
// (used as fallback values by the parsers).
export interface UrlRuleParserContext {
  html: string;
  url: string;
  document: Document;
  baseMetadata: PageMetadata;
}

// A site-specific parser: supports() decides applicability cheaply; parse()
// may still return null to fall back to the generic converter.
export interface UrlRuleParser {
  id: string;
  supports(context: UrlRuleParserContext): boolean;
  parse(context: UrlRuleParserContext): ConversionResult | null;
}