feat(baoyu-url-to-markdown): add URL-specific parser layer for X/Twitter and archive.ph
- New parsers/ module with pluggable rule system for site-specific HTML extraction
- X status parser: extract tweet text, media, quotes, author from data-testid elements
- X article parser: extract long-form article content with inline media
- archive.ph parser: restore original URL and prefer #CONTENT container
- Improved slug generation with stop words and content-aware slugs
- Output path uses subdirectory structure (domain/slug/slug.md)
- Fix: preserve anchor elements containing media in legacy converter
- Fix: smarter title deduplication in markdown document builder
This commit is contained in:
parent
6a4b312146
commit
e5d6c8ec68
|
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
name: baoyu-url-to-markdown
|
||||
description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, uses an upgraded Defuddle pipeline with better web-component handling and YouTube transcript extraction, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when needed. If local browser capture fails entirely, it can fall back to the hosted defuddle.md API. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown.
|
||||
version: 1.58.1
|
||||
version: 1.59.0
|
||||
metadata:
|
||||
openclaw:
|
||||
homepage: https://github.com/JimLiu/baoyu-skills#baoyu-url-to-markdown
|
||||
|
|
@ -30,6 +30,9 @@ Fetches any URL via Chrome CDP, saves the rendered HTML snapshot, and converts i
|
|||
|--------|---------|
|
||||
| `scripts/main.ts` | CLI entry point for URL fetching |
|
||||
| `scripts/html-to-markdown.ts` | Markdown conversion entry point and converter selection |
|
||||
| `scripts/parsers/index.ts` | Unified parser entry: dispatches URL-specific rules before generic converters |
|
||||
| `scripts/parsers/types.ts` | Unified parser interface shared by all rule files |
|
||||
| `scripts/parsers/rules/*.ts` | One file per URL rule, for example X status and X article |
|
||||
| `scripts/defuddle-converter.ts` | Defuddle-based conversion |
|
||||
| `scripts/legacy-converter.ts` | Pre-Defuddle legacy extraction and markdown conversion |
|
||||
| `scripts/markdown-conversion-shared.ts` | Shared metadata parsing and markdown document helpers |
|
||||
|
|
@ -115,10 +118,13 @@ Full reference: [references/config/first-time-setup.md](references/config/first-
|
|||
## Features
|
||||
|
||||
- Chrome CDP for full JavaScript rendering
|
||||
- URL-specific parser layer for sites that need custom HTML rules before generic extraction
|
||||
- Two capture modes: auto or wait-for-user
|
||||
- Save rendered HTML as a sibling `-captured.html` file
|
||||
- Clean markdown output with metadata
|
||||
- Upgraded Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history
|
||||
- X/Twitter pages can use HTML-specific parsing for Tweets and Articles, which improves title/body/media extraction on `x.com` / `twitter.com`
|
||||
- `archive.ph` / related archive mirrors can restore the original URL from `input[name=q]` and prefer `#CONTENT` before falling back to the page body
|
||||
- Materializes shadow DOM content before conversion so web-component pages survive serialization better
|
||||
- YouTube pages can include transcript/caption text in the markdown when YouTube exposes a caption track
|
||||
- If local browser capture fails completely, can fall back to `defuddle.md/<url>` and still save markdown
|
||||
|
|
@ -201,14 +207,16 @@ When `--download-media` is enabled:
|
|||
|
||||
Conversion order:
|
||||
|
||||
1. Try Defuddle first
|
||||
2. For rich pages such as YouTube, prefer Defuddle's extractor-specific output (including transcripts when available) instead of replacing it with the legacy pipeline
|
||||
3. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor
|
||||
4. If the entire local browser capture flow fails before markdown can be produced, try the hosted `https://defuddle.md/<url>` API and save its markdown output directly
|
||||
5. The legacy fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history
|
||||
1. Try the URL-specific parser layer first when a site rule matches
|
||||
2. If no specialized parser matches, try Defuddle
|
||||
3. For rich pages such as YouTube, prefer Defuddle's extractor-specific output (including transcripts when available) instead of replacing it with the legacy pipeline
|
||||
4. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor
|
||||
5. If the entire local browser capture flow fails before markdown can be produced, try the hosted `https://defuddle.md/<url>` API and save its markdown output directly
|
||||
6. The legacy fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history
|
||||
|
||||
CLI output will show:
|
||||
|
||||
- `Converter: parser:...` when a URL-specific parser succeeded
|
||||
- `Converter: defuddle` when Defuddle succeeds
|
||||
- `Converter: legacy:...` plus `Fallback used: ...` when fallback was needed
|
||||
- `Converter: defuddle-api` when local browser capture failed and the hosted API was used instead
|
||||
|
|
|
|||
|
|
@ -12,6 +12,7 @@ import {
|
|||
scoreMarkdownQuality,
|
||||
shouldCompareWithLegacy,
|
||||
} from "./legacy-converter.js";
|
||||
import { tryUrlRuleParsers } from "./parsers/index.js";
|
||||
|
||||
export type { ConversionResult, PageMetadata };
|
||||
export { createMarkdownDocument, formatMetadataYaml };
|
||||
|
|
@ -105,6 +106,11 @@ export async function extractContent(html: string, url: string): Promise<Convers
|
|||
const capturedAt = new Date().toISOString();
|
||||
const baseMetadata = extractMetadataFromHtml(html, url, capturedAt);
|
||||
|
||||
const specializedResult = tryUrlRuleParsers(html, url, baseMetadata);
|
||||
if (specializedResult) {
|
||||
return specializedResult;
|
||||
}
|
||||
|
||||
const defuddleResult = await tryDefuddleConversion(html, url, baseMetadata);
|
||||
if (defuddleResult.ok) {
|
||||
if (shouldPreferDefuddle(defuddleResult.result)) {
|
||||
|
|
|
|||
|
|
@ -521,14 +521,18 @@ turndown.addRule("collapseFigure", {
|
|||
|
||||
turndown.addRule("dropInvisibleAnchors", {
|
||||
filter(node) {
|
||||
return node.nodeName === "A" && !(node as Element).textContent?.trim();
|
||||
return (
|
||||
node.nodeName === "A" &&
|
||||
!(node as Element).textContent?.trim() &&
|
||||
!(node as Element).querySelector("img, video, picture, source")
|
||||
);
|
||||
},
|
||||
replacement() {
|
||||
return "";
|
||||
},
|
||||
});
|
||||
|
||||
function convertHtmlToMarkdown(html: string): string {
|
||||
export function convertHtmlFragmentToMarkdown(html: string): string {
|
||||
if (!html || !html.trim()) return "";
|
||||
|
||||
try {
|
||||
|
|
@ -609,7 +613,7 @@ export function shouldCompareWithLegacy(markdown: string): boolean {
|
|||
export function convertWithLegacyExtractor(html: string, baseMetadata: PageMetadata): ConversionResult {
|
||||
const extracted = extractFromHtml(html);
|
||||
|
||||
let markdown = extracted?.html ? convertHtmlToMarkdown(extracted.html) : "";
|
||||
let markdown = extracted?.html ? convertHtmlFragmentToMarkdown(extracted.html) : "";
|
||||
if (!markdown.trim()) {
|
||||
markdown = extracted?.textContent?.trim() || fallbackPlainText(html);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -52,15 +52,71 @@ function parseArgs(argv: string[]): Args {
|
|||
return args;
|
||||
}
|
||||
|
||||
function generateSlug(title: string, url: string): string {
|
||||
const text = title || new URL(url).pathname.replace(/\//g, "-");
|
||||
return text
|
||||
const SLUG_STOP_WORDS = new Set([
|
||||
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
|
||||
"have", "has", "had", "do", "does", "did", "will", "would", "shall",
|
||||
"should", "may", "might", "must", "can", "could", "to", "of", "in",
|
||||
"for", "on", "with", "at", "by", "from", "as", "into", "through",
|
||||
"during", "before", "after", "above", "below", "between", "out",
|
||||
"off", "over", "under", "again", "further", "then", "once", "here",
|
||||
"there", "when", "where", "why", "how", "all", "both", "each",
|
||||
"few", "more", "most", "other", "some", "such", "no", "nor", "not",
|
||||
"only", "own", "same", "so", "than", "too", "very", "just", "but",
|
||||
"and", "or", "if", "this", "that", "these", "those", "it", "its",
|
||||
"http", "https", "www", "com", "org", "net", "post", "article",
|
||||
]);
|
||||
|
||||
function extractSlugFromContent(content: string): string | null {
|
||||
const body = content.replace(/^---\n[\s\S]*?\n---\n?/, "").slice(0, 1000);
|
||||
const words = body
|
||||
.replace(/[^\w\s-]/g, " ")
|
||||
.split(/\s+/)
|
||||
.filter((w) => /^[a-zA-Z]/.test(w) && w.length >= 2 && !SLUG_STOP_WORDS.has(w.toLowerCase()))
|
||||
.map((w) => w.toLowerCase());
|
||||
|
||||
const unique: string[] = [];
|
||||
const seen = new Set<string>();
|
||||
for (const w of words) {
|
||||
if (!seen.has(w)) {
|
||||
seen.add(w);
|
||||
unique.push(w);
|
||||
if (unique.length >= 6) break;
|
||||
}
|
||||
}
|
||||
return unique.length >= 2 ? unique.join("-").slice(0, 50) : null;
|
||||
}
|
||||
|
||||
function generateSlug(title: string, url: string, content?: string): string {
|
||||
const asciiWords = title
|
||||
.replace(/[^\w\s]/g, " ")
|
||||
.split(/\s+/)
|
||||
.filter((w) => /[a-zA-Z]/.test(w) && w.length >= 2 && !SLUG_STOP_WORDS.has(w.toLowerCase()))
|
||||
.map((w) => w.toLowerCase());
|
||||
|
||||
if (asciiWords.length >= 2) {
|
||||
return asciiWords.slice(0, 6).join("-").slice(0, 50);
|
||||
}
|
||||
|
||||
if (content) {
|
||||
const contentSlug = extractSlugFromContent(content);
|
||||
if (contentSlug) return contentSlug;
|
||||
}
|
||||
|
||||
const GENERIC_PATH_SEGMENTS = new Set(["status", "article", "post", "posts", "p", "blog", "news", "articles"]);
|
||||
const parsed = new URL(url);
|
||||
const pathSlug = parsed.pathname
|
||||
.split("/")
|
||||
.filter((s) => s.length > 0 && !/^\d{10,}$/.test(s) && !GENERIC_PATH_SEGMENTS.has(s.toLowerCase()))
|
||||
.join("-")
|
||||
.toLowerCase()
|
||||
.replace(/[^\w\s-]/g, "")
|
||||
.replace(/\s+/g, "-")
|
||||
.replace(/[^\w-]/g, "-")
|
||||
.replace(/-+/g, "-")
|
||||
.replace(/^-|-$/g, "")
|
||||
.slice(0, 50) || "page";
|
||||
.slice(0, 40);
|
||||
|
||||
const prefix = asciiWords.slice(0, 2).join("-");
|
||||
const combined = prefix ? `${prefix}-${pathSlug}` : pathSlug;
|
||||
return combined.slice(0, 50) || "page";
|
||||
}
|
||||
|
||||
function formatTimestamp(): string {
|
||||
|
|
@ -124,18 +180,18 @@ async function fetchDefuddleApiMarkdown(targetUrl: string): Promise<{ markdown:
|
|||
};
|
||||
}
|
||||
|
||||
async function generateOutputPath(url: string, title: string, outputDir?: string): Promise<string> {
|
||||
async function generateOutputPath(url: string, title: string, outputDir?: string, content?: string): Promise<string> {
|
||||
const domain = new URL(url).hostname.replace(/^www\./, "");
|
||||
const slug = generateSlug(title, url);
|
||||
const slug = generateSlug(title, url, content);
|
||||
const dataDir = outputDir ? path.resolve(outputDir) : resolveUrlToMarkdownDataDir();
|
||||
const basePath = path.join(dataDir, domain, `${slug}.md`);
|
||||
const basePath = path.join(dataDir, domain, slug, `${slug}.md`);
|
||||
|
||||
if (!(await fileExists(basePath))) {
|
||||
return basePath;
|
||||
}
|
||||
|
||||
const timestampSlug = `${slug}-${formatTimestamp()}`;
|
||||
return path.join(dataDir, domain, `${timestampSlug}.md`);
|
||||
return path.join(dataDir, domain, timestampSlug, `${timestampSlug}.md`);
|
||||
}
|
||||
|
||||
async function waitForUserSignal(): Promise<void> {
|
||||
|
|
@ -249,13 +305,12 @@ async function main(): Promise<void> {
|
|||
|
||||
try {
|
||||
const result = await captureUrl(args);
|
||||
outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir);
|
||||
document = createMarkdownDocument(result);
|
||||
outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir, document);
|
||||
const outputDir = path.dirname(outputPath);
|
||||
htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath);
|
||||
await mkdir(outputDir, { recursive: true });
|
||||
await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8");
|
||||
|
||||
document = createMarkdownDocument(result);
|
||||
conversionMethod = result.conversionMethod;
|
||||
fallbackReason = result.fallbackReason;
|
||||
} catch (error) {
|
||||
|
|
@ -265,10 +320,9 @@ async function main(): Promise<void> {
|
|||
|
||||
try {
|
||||
const remoteResult = await fetchDefuddleApiMarkdown(args.url);
|
||||
outputPath = args.output || await generateOutputPath(args.url, remoteResult.title, args.outputDir);
|
||||
await mkdir(path.dirname(outputPath), { recursive: true });
|
||||
|
||||
document = remoteResult.markdown;
|
||||
outputPath = args.output || await generateOutputPath(args.url, remoteResult.title, args.outputDir, document);
|
||||
await mkdir(path.dirname(outputPath), { recursive: true });
|
||||
conversionMethod = "defuddle-api";
|
||||
fallbackReason = `Local browser capture failed: ${primaryError}`;
|
||||
} catch (remoteError) {
|
||||
|
|
|
|||
|
|
@ -300,6 +300,24 @@ export function createMarkdownDocument(result: ConversionResult): string {
|
|||
const escapedTitle = result.metadata.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
const titleRegex = new RegExp(`^#\\s+${escapedTitle}\\s*(\\n|$)`, "i");
|
||||
const hasTitle = titleRegex.test(result.markdown.trimStart());
|
||||
const title = result.metadata.title && !hasTitle ? `\n\n# ${result.metadata.title}\n\n` : "\n\n";
|
||||
const firstMeaningfulLine = result.markdown
|
||||
.replace(/\r\n/g, "\n")
|
||||
.split("\n")
|
||||
.map((line) => line.trim())
|
||||
.find((line) => line && !/^!?\[[^\]]*\]\([^)]+\)$/.test(line))
|
||||
?.replace(/^>\s*/, "")
|
||||
?.replace(/^#+\s+/, "")
|
||||
?.trim();
|
||||
const comparableTitle = result.metadata.title.toLowerCase().replace(/(?:\.{3}|…)\s*$/, "");
|
||||
const comparableFirstLine = firstMeaningfulLine?.toLowerCase() ?? "";
|
||||
const titleRepeatsContent =
|
||||
comparableTitle !== "" &&
|
||||
comparableFirstLine !== "" &&
|
||||
(comparableFirstLine === comparableTitle ||
|
||||
comparableFirstLine.startsWith(comparableTitle) ||
|
||||
comparableTitle.startsWith(comparableFirstLine));
|
||||
const title = result.metadata.title && !hasTitle && !titleRepeatsContent
|
||||
? `\n\n# ${result.metadata.title}\n\n`
|
||||
: "\n\n";
|
||||
return yaml + title + result.markdown;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,201 @@
|
|||
import { describe, expect, test } from "bun:test";
|
||||
|
||||
import {
|
||||
createMarkdownDocument,
|
||||
extractMetadataFromHtml,
|
||||
} from "../markdown-conversion-shared.js";
|
||||
import { tryUrlRuleParsers } from "./index.js";
|
||||
|
||||
const CAPTURED_AT = "2026-03-22T06:00:00.000Z";
|
||||
|
||||
const ARTICLE_HTML = `<!doctype html>
|
||||
<html lang="zh-CN">
|
||||
<body>
|
||||
<div data-testid="twitterArticleReadView">
|
||||
<a href="/dotey/article/2035141635713941927/media/1">
|
||||
<div data-testid="tweetPhoto">
|
||||
<img src="https://pbs.twimg.com/media/article-cover.jpg" alt="Image">
|
||||
</div>
|
||||
</a>
|
||||
<div data-testid="twitter-article-title">Karpathy:"写代码"已经不是对的动词了</div>
|
||||
<div data-testid="User-Name">
|
||||
<a href="/dotey">宝玉 Verified account</a>
|
||||
<a href="/dotey">@dotey</a>
|
||||
<time datetime="2026-03-20T23:49:11.000Z">Mar 20</time>
|
||||
</div>
|
||||
<div data-testid="twitterArticleRichTextView">
|
||||
<p>Andrej Karpathy 说他从 2024 年 12 月起就基本没手写过一行代码。</p>
|
||||
<a href="/dotey/article/2035141635713941927/media/2">
|
||||
<div>
|
||||
<div>
|
||||
<div data-testid="tweetPhoto">
|
||||
<img src="https://pbs.twimg.com/media/article-inline.jpg" alt="Image">
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</a>
|
||||
<h2>要点速览</h2>
|
||||
<ul>
|
||||
<li>核心焦虑从 GPU 利用率转向 Token 吞吐量</li>
|
||||
</ul>
|
||||
<blockquote>
|
||||
<p>写代码已经不是对的动词了。</p>
|
||||
</blockquote>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>`;
|
||||
|
||||
const STATUS_HTML = `<!doctype html>
|
||||
<html lang="en">
|
||||
<body>
|
||||
<article data-testid="tweet">
|
||||
<div data-testid="User-Name">
|
||||
<a href="/dotey">宝玉 Verified account</a>
|
||||
<a href="/dotey">@dotey</a>
|
||||
<time datetime="2026-03-22T05:33:00.000Z">Mar 22</time>
|
||||
</div>
|
||||
<div data-testid="tweetText">
|
||||
<span>转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:</span>
|
||||
</div>
|
||||
<div data-testid="tweetPhoto">
|
||||
<img src="https://pbs.twimg.com/media/tweet-main.jpg" alt="Image">
|
||||
</div>
|
||||
<div data-testid="User-Name">
|
||||
<a href="/mattshumer_">Matt Shumer Verified account</a>
|
||||
<a href="/mattshumer_">@mattshumer_</a>
|
||||
<time datetime="2026-03-17T00:00:00.000Z">Mar 17</time>
|
||||
</div>
|
||||
<div data-testid="tweetText">
|
||||
<span>Add this to your Codex custom instructions for a way better experience.</span>
|
||||
</div>
|
||||
</article>
|
||||
</body>
|
||||
</html>`;
|
||||
|
||||
const ARCHIVE_HTML = `<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<title>archive.ph</title>
|
||||
</head>
|
||||
<body>
|
||||
<form>
|
||||
<input
|
||||
type="text"
|
||||
name="q"
|
||||
value="https://www.newscientist.com/article/2520204-major-leap-towards-reanimation-after-death-as-mammals-brain-preserved/"
|
||||
>
|
||||
</form>
|
||||
<div id="HEADER">
|
||||
Archive shell text that should be ignored when CONTENT exists.
|
||||
</div>
|
||||
<div id="CONTENT">
|
||||
<h1>Major leap towards reanimation after death as mammal brain preserved</h1>
|
||||
<p>
|
||||
Researchers say the preserved structure and activity markers suggest a significant step
|
||||
forward in keeping delicate brain tissue viable after clinical death.
|
||||
</p>
|
||||
<p>
|
||||
The archive wrapper should not take precedence over the actual article body when the
|
||||
CONTENT container is available for parsing.
|
||||
</p>
|
||||
<img src="https://cdn.example.com/brain.jpg" alt="Brain tissue">
|
||||
</div>
|
||||
</body>
|
||||
</html>`;
|
||||
|
||||
const ARCHIVE_FALLBACK_HTML = `<!doctype html>
|
||||
<html>
|
||||
<head>
|
||||
<title>archive.ph</title>
|
||||
</head>
|
||||
<body>
|
||||
<input type="text" name="q" value="https://example.com/fallback-story">
|
||||
<main>
|
||||
<h1>Fallback body parsing still works</h1>
|
||||
<p>
|
||||
When CONTENT is absent, the parser should fall back to the body content instead of
|
||||
returning null or keeping the archive wrapper as the final URL.
|
||||
</p>
|
||||
<p>
|
||||
This ensures archived pages with slightly different layouts still produce usable markdown.
|
||||
</p>
|
||||
</main>
|
||||
</body>
|
||||
</html>`;
|
||||
|
||||
function parse(html: string, url: string) {
|
||||
const baseMetadata = extractMetadataFromHtml(html, url, CAPTURED_AT);
|
||||
return tryUrlRuleParsers(html, url, baseMetadata);
|
||||
}
|
||||
|
||||
describe("url rule parsers", () => {
|
||||
test("parses archive.ph pages from CONTENT and restores the original URL", () => {
|
||||
const result = parse(ARCHIVE_HTML, "https://archive.ph/SMcX5");
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.conversionMethod).toBe("parser:archive-ph");
|
||||
expect(result?.metadata.url).toBe(
|
||||
"https://www.newscientist.com/article/2520204-major-leap-towards-reanimation-after-death-as-mammals-brain-preserved/"
|
||||
);
|
||||
expect(result?.metadata.title).toBe(
|
||||
"Major leap towards reanimation after death as mammal brain preserved"
|
||||
);
|
||||
expect(result?.metadata.coverImage).toBe("https://cdn.example.com/brain.jpg");
|
||||
expect(result?.markdown).toContain("Researchers say the preserved structure");
|
||||
expect(result?.markdown).toContain("");
|
||||
expect(result?.markdown).not.toContain("Archive shell text that should be ignored");
|
||||
});
|
||||
|
||||
test("falls back to body when archive.ph CONTENT is missing", () => {
|
||||
const result = parse(ARCHIVE_FALLBACK_HTML, "https://archive.ph/fallback");
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.conversionMethod).toBe("parser:archive-ph");
|
||||
expect(result?.metadata.url).toBe("https://example.com/fallback-story");
|
||||
expect(result?.metadata.title).toBe("Fallback body parsing still works");
|
||||
expect(result?.markdown).toContain("When CONTENT is absent");
|
||||
});
|
||||
|
||||
test("parses X article pages from HTML", () => {
|
||||
const result = parse(
|
||||
ARTICLE_HTML,
|
||||
"https://x.com/dotey/article/2035141635713941927"
|
||||
);
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.conversionMethod).toBe("parser:x-article");
|
||||
expect(result?.metadata.title).toBe("Karpathy:\"写代码\"已经不是对的动词了");
|
||||
expect(result?.metadata.author).toBe("宝玉 (@dotey)");
|
||||
expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/article-cover.jpg");
|
||||
expect(result?.metadata.published).toBe("2026-03-20T23:49:11.000Z");
|
||||
expect(result?.metadata.language).toBe("zh");
|
||||
expect(result?.markdown).toContain("## 要点速览");
|
||||
expect(result?.markdown).toContain(
|
||||
"[](/dotey/article/2035141635713941927/media/2)"
|
||||
);
|
||||
expect(result?.markdown).toContain("写代码已经不是对的动词了。");
|
||||
|
||||
const document = createMarkdownDocument(result!);
|
||||
expect(document).toContain("# Karpathy:\"写代码\"已经不是对的动词了");
|
||||
});
|
||||
|
||||
test("parses X status pages from HTML without duplicating the title heading", () => {
|
||||
const result = parse(
|
||||
STATUS_HTML,
|
||||
"https://x.com/dotey/status/2035590649081196710"
|
||||
);
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.conversionMethod).toBe("parser:x-status");
|
||||
expect(result?.metadata.author).toBe("宝玉 (@dotey)");
|
||||
expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/tweet-main.jpg");
|
||||
expect(result?.metadata.language).toBe("zh");
|
||||
expect(result?.markdown).toContain("转译:把下面这段加到你的 Codex 自定义指令里");
|
||||
expect(result?.markdown).toContain("> Quote from Matt Shumer (@mattshumer_)");
|
||||
expect(result?.markdown).toContain("![");
|
||||
|
||||
const document = createMarkdownDocument(result!);
|
||||
expect(document).not.toContain("\n\n# 转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:\n\n");
|
||||
});
|
||||
});
|
||||
|
|
@ -0,0 +1,47 @@
|
|||
import {
|
||||
isMarkdownUsable,
|
||||
normalizeMarkdown,
|
||||
parseDocument,
|
||||
type ConversionResult,
|
||||
type PageMetadata,
|
||||
} from "../markdown-conversion-shared.js";
|
||||
import { URL_RULE_PARSERS } from "./rules/index.js";
|
||||
import type { UrlRuleParserContext } from "./types.js";
|
||||
|
||||
export type { UrlRuleParser, UrlRuleParserContext } from "./types.js";
|
||||
|
||||
export function tryUrlRuleParsers(
|
||||
html: string,
|
||||
url: string,
|
||||
baseMetadata: PageMetadata
|
||||
): ConversionResult | null {
|
||||
const document = parseDocument(html);
|
||||
const context: UrlRuleParserContext = {
|
||||
html,
|
||||
url,
|
||||
document,
|
||||
baseMetadata,
|
||||
};
|
||||
|
||||
for (const parser of URL_RULE_PARSERS) {
|
||||
if (!parser.supports(context)) continue;
|
||||
|
||||
try {
|
||||
const result = parser.parse(context);
|
||||
if (!result) continue;
|
||||
|
||||
const markdown = normalizeMarkdown(result.markdown);
|
||||
if (!isMarkdownUsable(markdown, html)) continue;
|
||||
|
||||
return {
|
||||
...result,
|
||||
markdown,
|
||||
};
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
console.warn(`[url-to-markdown] parser ${parser.id} failed: ${message}`);
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
|
@ -0,0 +1,97 @@
|
|||
import { convertHtmlFragmentToMarkdown } from "../../legacy-converter.js";
|
||||
import {
|
||||
normalizeMarkdown,
|
||||
pickString,
|
||||
type ConversionResult,
|
||||
} from "../../markdown-conversion-shared.js";
|
||||
import type { UrlRuleParser, UrlRuleParserContext } from "../types.js";
|
||||
|
||||
const ARCHIVE_HOSTS = new Set([
|
||||
"archive.ph",
|
||||
"archive.is",
|
||||
"archive.today",
|
||||
"archive.md",
|
||||
"archive.vn",
|
||||
"archive.li",
|
||||
"archive.fo",
|
||||
]);
|
||||
|
||||
function isArchiveHost(url: string): boolean {
|
||||
try {
|
||||
return ARCHIVE_HOSTS.has(new URL(url).hostname.toLowerCase());
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function readOriginalUrl(document: Document): string | undefined {
|
||||
const value = document.querySelector("input[name='q']")?.getAttribute("value")?.trim();
|
||||
if (!value) return undefined;
|
||||
|
||||
try {
|
||||
return new URL(value).href;
|
||||
} catch {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
function summarize(text: string, maxLength: number): string | undefined {
|
||||
const normalized = text.replace(/\s+/g, " ").trim();
|
||||
if (!normalized) return undefined;
|
||||
if (normalized.length <= maxLength) return normalized;
|
||||
return `${normalized.slice(0, Math.max(0, maxLength - 1)).trimEnd()}…`;
|
||||
}
|
||||
|
||||
function pickContentRoot(document: Document): Element | null {
|
||||
return (
|
||||
document.querySelector("#CONTENT") ??
|
||||
document.querySelector("#content") ??
|
||||
document.body
|
||||
);
|
||||
}
|
||||
|
||||
function pickContentTitle(root: Element, fallbackTitle: string): string {
|
||||
const contentTitle = pickString(
|
||||
root.querySelector("h1")?.textContent,
|
||||
root.querySelector("[itemprop='headline']")?.textContent,
|
||||
root.querySelector("article h2")?.textContent
|
||||
);
|
||||
if (contentTitle) return contentTitle;
|
||||
if (fallbackTitle && !/^archive\./i.test(fallbackTitle.trim())) return fallbackTitle;
|
||||
return "";
|
||||
}
|
||||
|
||||
function parseArchivePage(context: UrlRuleParserContext): ConversionResult | null {
|
||||
const root = pickContentRoot(context.document);
|
||||
if (!root) return null;
|
||||
|
||||
const markdown = normalizeMarkdown(convertHtmlFragmentToMarkdown(root.innerHTML));
|
||||
if (!markdown) return null;
|
||||
|
||||
const originalUrl = readOriginalUrl(context.document) ?? context.baseMetadata.url;
|
||||
const bodyText = root.textContent?.replace(/\s+/g, " ").trim() ?? "";
|
||||
const published = root.querySelector("time[datetime]")?.getAttribute("datetime") ?? undefined;
|
||||
const coverImage = root.querySelector("img[src]")?.getAttribute("src") ?? undefined;
|
||||
|
||||
return {
|
||||
metadata: {
|
||||
...context.baseMetadata,
|
||||
url: originalUrl,
|
||||
title: pickContentTitle(root, context.baseMetadata.title),
|
||||
description: summarize(bodyText, 220) ?? context.baseMetadata.description,
|
||||
published: pickString(published, context.baseMetadata.published) ?? undefined,
|
||||
coverImage: pickString(coverImage, context.baseMetadata.coverImage) ?? undefined,
|
||||
},
|
||||
markdown,
|
||||
rawHtml: context.html,
|
||||
conversionMethod: "parser:archive-ph",
|
||||
};
|
||||
}
|
||||
|
||||
export const archivePhRuleParser: UrlRuleParser = {
|
||||
id: "archive-ph",
|
||||
supports(context) {
|
||||
return isArchiveHost(context.url);
|
||||
},
|
||||
parse: parseArchivePage,
|
||||
};
|
||||
|
|
@ -0,0 +1,10 @@
|
|||
import { archivePhRuleParser } from "./archive-ph.js";
|
||||
import { xArticleRuleParser } from "./x-article.js";
|
||||
import { xStatusRuleParser } from "./x-status.js";
|
||||
import type { UrlRuleParser } from "../types.js";
|
||||
|
||||
export const URL_RULE_PARSERS: UrlRuleParser[] = [
|
||||
archivePhRuleParser,
|
||||
xArticleRuleParser,
|
||||
xStatusRuleParser,
|
||||
];
|
||||
|
|
@ -0,0 +1,137 @@
|
|||
import {
|
||||
normalizeMarkdown,
|
||||
pickString,
|
||||
type ConversionResult,
|
||||
} from "../../markdown-conversion-shared.js";
|
||||
import type { UrlRuleParser, UrlRuleParserContext } from "../types.js";
|
||||
import {
|
||||
cleanText,
|
||||
collectMediaMarkdown,
|
||||
convertXRichTextElementToMarkdown,
|
||||
extractPublishedForCurrentUrl,
|
||||
inferLanguage,
|
||||
isXArticlePath,
|
||||
isXHost,
|
||||
normalizeXMarkdown,
|
||||
parseUrl,
|
||||
pickFirstValidLinkText,
|
||||
sanitizeCoverImage,
|
||||
summarizeText,
|
||||
} from "./x-shared.js";
|
||||
|
||||
function collectArticleMarkdown(root: Element): { markdown: string; mediaUrls: string[] } {
|
||||
const parts: string[] = [];
|
||||
const seenMedia = new Set<string>();
|
||||
const mediaUrls: string[] = [];
|
||||
|
||||
function pushPart(value: string): void {
|
||||
const normalized = normalizeMarkdown(value);
|
||||
if (!normalized) return;
|
||||
parts.push(normalized);
|
||||
}
|
||||
|
||||
function walk(node: Element): void {
|
||||
const testId = node.getAttribute("data-testid");
|
||||
|
||||
if (testId === "twitterArticleRichTextView" || testId === "longformRichTextComponent") {
|
||||
const bodyMedia = collectMediaMarkdown(node, seenMedia);
|
||||
mediaUrls.push(...bodyMedia.urls.filter((url) => !mediaUrls.includes(url)));
|
||||
pushPart(convertXRichTextElementToMarkdown(node));
|
||||
return;
|
||||
}
|
||||
|
||||
if (testId === "tweetPhoto") {
|
||||
const media = collectMediaMarkdown(node, seenMedia);
|
||||
mediaUrls.push(...media.urls.filter((url) => !mediaUrls.includes(url)));
|
||||
for (const line of media.lines) pushPart(line);
|
||||
return;
|
||||
}
|
||||
|
||||
if (
|
||||
testId === "twitter-article-title" ||
|
||||
testId === "User-Name" ||
|
||||
testId === "Tweet-User-Avatar" ||
|
||||
testId === "reply" ||
|
||||
testId === "retweet" ||
|
||||
testId === "like" ||
|
||||
testId === "bookmark" ||
|
||||
testId === "caret" ||
|
||||
testId === "app-text-transition-container"
|
||||
) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (node.tagName === "TIME" || node.tagName === "BUTTON") {
|
||||
return;
|
||||
}
|
||||
|
||||
for (const child of Array.from(node.children)) {
|
||||
walk(child);
|
||||
}
|
||||
}
|
||||
|
||||
for (const child of Array.from(root.children)) {
|
||||
walk(child);
|
||||
}
|
||||
|
||||
return {
|
||||
markdown: normalizeXMarkdown(parts.join("\n\n")),
|
||||
mediaUrls,
|
||||
};
|
||||
}
|
||||
|
||||
function parseXArticle(context: UrlRuleParserContext): ConversionResult | null {
|
||||
const articleRoot = context.document.querySelector("[data-testid='twitterArticleReadView']") as Element | null;
|
||||
if (!articleRoot) return null;
|
||||
|
||||
const title = cleanText(
|
||||
context.document.querySelector("[data-testid='twitter-article-title']")?.textContent
|
||||
);
|
||||
const identity = pickFirstValidLinkText(
|
||||
context.document.querySelector("[data-testid='User-Name']")
|
||||
);
|
||||
const published = extractPublishedForCurrentUrl(articleRoot, context.url);
|
||||
const { markdown, mediaUrls } = collectArticleMarkdown(articleRoot);
|
||||
if (!markdown) return null;
|
||||
|
||||
const bodyText = cleanText(
|
||||
context.document.querySelector("[data-testid='twitterArticleRichTextView']")?.textContent ??
|
||||
context.document.querySelector("[data-testid='longformRichTextComponent']")?.textContent
|
||||
);
|
||||
|
||||
return {
|
||||
metadata: {
|
||||
...context.baseMetadata,
|
||||
title: pickString(title, context.baseMetadata.title) ?? "",
|
||||
description: summarizeText(bodyText, 220) ?? context.baseMetadata.description,
|
||||
author: pickString(identity.author, context.baseMetadata.author) ?? undefined,
|
||||
published: pickString(published, context.baseMetadata.published) ?? undefined,
|
||||
coverImage: sanitizeCoverImage(mediaUrls[0], context.baseMetadata.coverImage),
|
||||
language: inferLanguage(bodyText, context.baseMetadata.language),
|
||||
},
|
||||
markdown,
|
||||
rawHtml: context.html,
|
||||
conversionMethod: "parser:x-article",
|
||||
};
|
||||
}
|
||||
|
||||
export const xArticleRuleParser: UrlRuleParser = {
|
||||
id: "x-article",
|
||||
supports(context) {
|
||||
const parsed = parseUrl(context.url);
|
||||
if (!parsed || !isXHost(parsed.hostname)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return (
|
||||
isXArticlePath(parsed.pathname) ||
|
||||
Boolean(
|
||||
context.document.querySelector("[data-testid='twitterArticleReadView']") ||
|
||||
context.document.querySelector("[data-testid='twitterArticleRichTextView']")
|
||||
)
|
||||
);
|
||||
},
|
||||
parse(context) {
|
||||
return parseXArticle(context);
|
||||
},
|
||||
};
|
||||
|
|
@ -0,0 +1,249 @@
|
|||
import { convertHtmlFragmentToMarkdown } from "../../legacy-converter.js";
|
||||
import { normalizeMarkdown } from "../../markdown-conversion-shared.js";
|
||||
|
||||
// Placeholder Open Graph image X serves for pages without a specific preview;
// sanitizeCoverImage treats this value as "no cover image".
export const DEFAULT_X_OG_IMAGE = "https://abs.twimg.com/rweb/ssr/default/v2/og/image.png";

// Markdown lines for media plus the raw media URLs they reference, in order.
export type MediaResult = {
  lines: string[];
  urls: string[];
};
|
||||
|
||||
export function isXHost(hostname: string): boolean {
|
||||
const normalized = hostname.toLowerCase();
|
||||
return (
|
||||
normalized === "x.com" ||
|
||||
normalized === "twitter.com" ||
|
||||
normalized.endsWith(".x.com") ||
|
||||
normalized.endsWith(".twitter.com")
|
||||
);
|
||||
}
|
||||
|
||||
export function parseUrl(input: string): URL | null {
|
||||
try {
|
||||
return new URL(input);
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
export function isXStatusPath(pathname: string): boolean {
|
||||
return /^\/[^/]+\/status(?:es)?\/\d+$/i.test(pathname) || /^\/i\/web\/status\/\d+$/i.test(pathname);
|
||||
}
|
||||
|
||||
export function isXArticlePath(pathname: string): boolean {
|
||||
return /^\/[^/]+\/article\/\d+$/i.test(pathname) || /^\/(?:i\/)?article\/\d+$/i.test(pathname);
|
||||
}
|
||||
|
||||
export function cleanText(value: string | null | undefined): string {
|
||||
return (value ?? "").replace(/\s+/g, " ").trim();
|
||||
}
|
||||
|
||||
export function cleanUserLabel(value: string | null | undefined): string {
|
||||
return cleanText(value).replace(/\bVerified account\b/gi, "").replace(/\s{2,}/g, " ").trim();
|
||||
}
|
||||
|
||||
export function escapeMarkdownAlt(text: string): string {
|
||||
return text.replace(/[\[\]]/g, "\\$&");
|
||||
}
|
||||
|
||||
export function normalizeAlt(text: string | null | undefined): string {
|
||||
const cleaned = cleanText(text);
|
||||
if (!cleaned || /^(image|photo)$/i.test(cleaned)) return "";
|
||||
return escapeMarkdownAlt(cleaned);
|
||||
}
|
||||
|
||||
export function summarizeText(text: string, maxLength: number): string | undefined {
|
||||
const normalized = cleanText(text);
|
||||
if (!normalized) return undefined;
|
||||
return normalized.length > maxLength
|
||||
? `${normalized.slice(0, maxLength - 3)}...`
|
||||
: normalized;
|
||||
}
|
||||
|
||||
export function buildTweetTitle(text: string, fallback: string): string {
|
||||
return summarizeText(text, 80) ?? fallback;
|
||||
}
|
||||
|
||||
export function normalizeXMarkdown(markdown: string): string {
|
||||
return normalizeMarkdown(markdown.replace(/^(#{1,6})\s*\n+([^\n])/gm, "$1 $2"));
|
||||
}
|
||||
|
||||
export function inferLanguage(text: string, fallback?: string): string | undefined {
|
||||
const normalized = cleanText(text);
|
||||
if (!normalized) return fallback;
|
||||
|
||||
const han = (normalized.match(/\p{Script=Han}/gu) || []).length;
|
||||
const hiragana = (normalized.match(/\p{Script=Hiragana}/gu) || []).length;
|
||||
const katakana = (normalized.match(/\p{Script=Katakana}/gu) || []).length;
|
||||
const hangul = (normalized.match(/\p{Script=Hangul}/gu) || []).length;
|
||||
|
||||
if (hangul >= 8) return "ko";
|
||||
if (hiragana + katakana >= 8) return "ja";
|
||||
if (han >= 16) return "zh";
|
||||
return fallback;
|
||||
}
|
||||
|
||||
export function buildQuoteMarkdown(markdown: string, author?: string): string {
|
||||
const normalized = normalizeMarkdown(markdown);
|
||||
if (!normalized) return "";
|
||||
|
||||
const lines = normalized.split("\n");
|
||||
const prefixed = lines.map((line) => (line ? `> ${line}` : ">")).join("\n");
|
||||
const header = author ? `> Quote from ${author}` : "> Quote";
|
||||
return `${header}\n${prefixed}`;
|
||||
}
|
||||
|
||||
export function pickFirstValidLinkText(userNameEl: Element | null | undefined): {
|
||||
name?: string;
|
||||
username?: string;
|
||||
author?: string;
|
||||
} {
|
||||
if (!userNameEl) return {};
|
||||
|
||||
const linkTexts = Array.from(userNameEl.querySelectorAll("a[href]"))
|
||||
.map((link) => cleanUserLabel(link.textContent))
|
||||
.filter(Boolean);
|
||||
|
||||
let username = linkTexts.find((text) => text.startsWith("@"));
|
||||
let name = linkTexts.find((text) => !text.startsWith("@") && !/^(promote|more)$/i.test(text));
|
||||
|
||||
if (!username || !name) {
|
||||
const text = cleanUserLabel(userNameEl.textContent);
|
||||
const fallbackMatch = text.match(/^(.*?)\s*(@[A-Za-z0-9_]+)(?:\s*·.*)?$/);
|
||||
if (fallbackMatch) {
|
||||
name = name ?? cleanText(fallbackMatch[1]);
|
||||
username = username ?? cleanText(fallbackMatch[2]);
|
||||
}
|
||||
}
|
||||
|
||||
const author = name && username ? `${name} (${username})` : username ?? name;
|
||||
return { name, username, author };
|
||||
}
|
||||
|
||||
export function extractPublishedForCurrentUrl(root: ParentNode, url: string): string | undefined {
|
||||
const parsed = parseUrl(url);
|
||||
if (!parsed) return undefined;
|
||||
const currentPath = parsed.pathname.toLowerCase();
|
||||
|
||||
for (const timeElement of root.querySelectorAll("a[href] time[datetime]")) {
|
||||
const href = timeElement.closest("a")?.getAttribute("href");
|
||||
const hrefUrl = href ? parseUrl(href.startsWith("http") ? href : `${parsed.origin}${href}`) : null;
|
||||
if (hrefUrl?.pathname.toLowerCase() === currentPath) {
|
||||
return timeElement.getAttribute("datetime") ?? undefined;
|
||||
}
|
||||
}
|
||||
|
||||
return root.querySelector("time[datetime]")?.getAttribute("datetime") ?? undefined;
|
||||
}
|
||||
|
||||
export function collectMediaMarkdown(root: ParentNode, seen: Set<string>): MediaResult {
|
||||
const lines: string[] = [];
|
||||
const urls: string[] = [];
|
||||
const rootElement = root as Element & {
|
||||
getAttribute?: (name: string) => string | null;
|
||||
};
|
||||
const photoNodes = [
|
||||
...(typeof rootElement.getAttribute === "function" &&
|
||||
rootElement.getAttribute("data-testid") === "tweetPhoto"
|
||||
? [rootElement]
|
||||
: []),
|
||||
...Array.from(root.querySelectorAll("[data-testid='tweetPhoto']")),
|
||||
];
|
||||
|
||||
for (const node of photoNodes) {
|
||||
const img = node.querySelector("img");
|
||||
const imageUrl = img?.getAttribute("src");
|
||||
if (imageUrl && !seen.has(imageUrl)) {
|
||||
seen.add(imageUrl);
|
||||
urls.push(imageUrl);
|
||||
lines.push(``);
|
||||
}
|
||||
|
||||
const video = node.querySelector("video");
|
||||
const posterUrl = video?.getAttribute("poster");
|
||||
if (posterUrl && !seen.has(posterUrl)) {
|
||||
seen.add(posterUrl);
|
||||
urls.push(posterUrl);
|
||||
lines.push(``);
|
||||
}
|
||||
|
||||
const videoUrl = video?.getAttribute("src") ?? video?.querySelector("source")?.getAttribute("src");
|
||||
if (videoUrl && !seen.has(videoUrl)) {
|
||||
seen.add(videoUrl);
|
||||
urls.push(videoUrl);
|
||||
lines.push(`[video](${videoUrl})`);
|
||||
}
|
||||
}
|
||||
|
||||
return { lines, urls };
|
||||
}
|
||||
|
||||
/**
 * Replace each [data-testid='tweetPhoto'] element under `root` with a plain
 * <span> containing materialized <img>/<a> children, so a generic
 * HTML-to-markdown converter can handle the media without X-specific logic.
 * Mutates `root` in place.
 */
export function materializeTweetPhotoNodes(root: Element): void {
  for (const photo of Array.from(root.querySelectorAll("[data-testid='tweetPhoto']"))) {
    const document = photo.ownerDocument;
    const container = document.createElement("span");

    // Still image: copy src (and cleaned alt, when meaningful) to a fresh <img>.
    const img = photo.querySelector("img");
    const imageUrl = img?.getAttribute("src");
    if (imageUrl) {
      const image = document.createElement("img");
      image.setAttribute("src", imageUrl);
      const alt = normalizeAlt(img?.getAttribute("alt"));
      if (alt) {
        image.setAttribute("alt", alt);
      }
      container.appendChild(image);
    }

    // Video poster frame becomes an <img alt="video">.
    const video = photo.querySelector("video");
    const posterUrl = video?.getAttribute("poster");
    if (posterUrl) {
      const poster = document.createElement("img");
      poster.setAttribute("src", posterUrl);
      poster.setAttribute("alt", "video");
      container.appendChild(poster);
    }

    // Video source (direct src or nested <source>) becomes a "video" link,
    // space-separated from any preceding image children.
    const videoUrl = video?.getAttribute("src") ?? video?.querySelector("source")?.getAttribute("src");
    if (videoUrl) {
      if (container.childNodes.length > 0) {
        container.appendChild(document.createTextNode(" "));
      }
      const link = document.createElement("a");
      link.setAttribute("href", videoUrl);
      link.textContent = "video";
      container.appendChild(link);
    }

    // No media found at all: drop the empty placeholder entirely.
    if (container.childNodes.length === 0) {
      photo.remove();
      continue;
    }

    photo.replaceWith(container);
  }
}
|
||||
|
||||
function collapseLinkedMediaContainers(root: Element): void {
|
||||
for (const anchor of Array.from(root.querySelectorAll("a[href]"))) {
|
||||
const images = Array.from(anchor.querySelectorAll("img"));
|
||||
if (images.length !== 1) continue;
|
||||
if (cleanText(anchor.textContent)) continue;
|
||||
|
||||
const image = images[0].cloneNode(true);
|
||||
anchor.replaceChildren(image);
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Convert an X rich-text element to markdown: clone the node (so the live DOM
 * is untouched), materialize tweetPhoto media into plain img/a elements,
 * collapse image-only anchors, then run the legacy HTML-fragment converter
 * followed by X-specific markdown normalization.
 */
export function convertXRichTextElementToMarkdown(node: Element): string {
  const clone = node.cloneNode(true) as Element;
  materializeTweetPhotoNodes(clone);
  collapseLinkedMediaContainers(clone);
  return normalizeXMarkdown(convertHtmlFragmentToMarkdown(clone.innerHTML));
}
|
||||
|
||||
export function sanitizeCoverImage(primary?: string, fallback?: string): string | undefined {
|
||||
if (primary) return primary;
|
||||
if (!fallback || fallback === DEFAULT_X_OG_IMAGE) return undefined;
|
||||
return fallback;
|
||||
}
|
||||
|
|
@ -0,0 +1,82 @@
|
|||
import type { ConversionResult } from "../../markdown-conversion-shared.js";
|
||||
import type { UrlRuleParser, UrlRuleParserContext } from "../types.js";
|
||||
import {
|
||||
buildQuoteMarkdown,
|
||||
buildTweetTitle,
|
||||
cleanText,
|
||||
collectMediaMarkdown,
|
||||
convertXRichTextElementToMarkdown,
|
||||
extractPublishedForCurrentUrl,
|
||||
inferLanguage,
|
||||
isXHost,
|
||||
isXStatusPath,
|
||||
normalizeXMarkdown,
|
||||
parseUrl,
|
||||
pickFirstValidLinkText,
|
||||
sanitizeCoverImage,
|
||||
summarizeText,
|
||||
} from "./x-shared.js";
|
||||
|
||||
/**
 * Parse a single X status (tweet) page into a ConversionResult: the main
 * tweet text, quoted tweets rendered as blockquotes, and attached media.
 * Returns null when no tweet article or tweet text is present, or when the
 * main tweet converts to empty markdown.
 */
function parseXStatus(context: UrlRuleParserContext): ConversionResult | null {
  const article = context.document.querySelector("article[data-testid='tweet'], article") as Element | null;
  if (!article) return null;

  const tweetTextElements = Array.from(article.querySelectorAll("[data-testid='tweetText']")) as Element[];
  if (tweetTextElements.length === 0) return null;

  // First tweetText / User-Name pair is the main tweet; the rest are quotes.
  const userNameElements = Array.from(article.querySelectorAll("[data-testid='User-Name']")) as Element[];
  const mainTextElement = tweetTextElements[0];
  const mainIdentity = pickFirstValidLinkText(userNameElements[0]);
  const published = extractPublishedForCurrentUrl(article, context.url);
  const mainMarkdown = normalizeXMarkdown(convertXRichTextElementToMarkdown(mainTextElement));
  if (!mainMarkdown) return null;

  const parts = [mainMarkdown];
  const quotedTextElements = tweetTextElements.slice(1);
  const quotedUserNameElements = userNameElements.slice(1);

  // NOTE(review): assumes quoted text[i] pairs with quoted User-Name[i] in
  // document order — holds for X's current DOM; verify if the layout changes.
  quotedTextElements.forEach((element, index) => {
    const quoteMarkdown = normalizeXMarkdown(convertXRichTextElementToMarkdown(element));
    if (!quoteMarkdown) return;
    const quoteIdentity = pickFirstValidLinkText(quotedUserNameElements[index]);
    parts.push(buildQuoteMarkdown(quoteMarkdown, quoteIdentity.author));
  });

  // Media from the whole article; fresh Set since nothing was seen before.
  const media = collectMediaMarkdown(article, new Set<string>());
  if (media.lines.length > 0) {
    parts.push(media.lines.join("\n\n"));
  }

  const mainText = cleanText(mainTextElement.textContent);
  const markdown = normalizeXMarkdown(parts.join("\n\n"));

  return {
    metadata: {
      ...context.baseMetadata,
      title: buildTweetTitle(mainText, context.baseMetadata.title),
      description: summarizeText(mainText, 220) ?? context.baseMetadata.description,
      author: mainIdentity.author ?? context.baseMetadata.author,
      published: published ?? context.baseMetadata.published,
      coverImage: sanitizeCoverImage(media.urls[0], context.baseMetadata.coverImage),
      language: inferLanguage(mainText, context.baseMetadata.language),
    },
    markdown,
    rawHtml: context.html,
    conversionMethod: "parser:x-status",
  };
}
|
||||
|
||||
export const xStatusRuleParser: UrlRuleParser = {
|
||||
id: "x-status",
|
||||
supports(context) {
|
||||
const parsed = parseUrl(context.url);
|
||||
if (!parsed || !isXHost(parsed.hostname)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return isXStatusPath(parsed.pathname) && Boolean(context.document.querySelector("[data-testid='tweetText']"));
|
||||
},
|
||||
parse(context): ConversionResult | null {
|
||||
return parseXStatus(context);
|
||||
},
|
||||
};
|
||||
|
|
@ -0,0 +1,14 @@
|
|||
import type { ConversionResult, PageMetadata } from "../markdown-conversion-shared.js";
|
||||
|
||||
/**
 * Inputs handed to every URL rule parser: the captured page HTML, the page
 * URL, a parsed Document, and metadata produced by the generic pipeline that
 * a rule may refine or override.
 */
export interface UrlRuleParserContext {
  html: string;
  url: string;
  document: Document;
  baseMetadata: PageMetadata;
}

/**
 * A site-specific parser rule. `supports` cheaply decides whether the rule
 * applies to a page; `parse` returns a full conversion result, or null to
 * let the caller fall through to the next rule / generic converter.
 */
export interface UrlRuleParser {
  id: string;
  supports(context: UrlRuleParserContext): boolean;
  parse(context: UrlRuleParserContext): ConversionResult | null;
}
|
||||
Loading…
Reference in New Issue