202 lines
7.6 KiB
TypeScript
202 lines
7.6 KiB
TypeScript
import { describe, expect, test } from "bun:test";
|
||
|
||
import {
|
||
createMarkdownDocument,
|
||
extractMetadataFromHtml,
|
||
} from "../markdown-conversion-shared.js";
|
||
import { tryUrlRuleParsers } from "./index.js";
|
||
|
||
// Fixed capture timestamp that the parse() helper passes to extractMetadataFromHtml.
const CAPTURED_AT = "2026-03-22T06:00:00.000Z";
|
||
|
||
/**
 * HTML fixture for the "parses X article pages from HTML" test.
 *
 * Contains a cover-image link, a `twitter-article-title` element, a `User-Name`
 * block (display name, handle, `<time>`), and a `twitterArticleRichTextView` body
 * with a paragraph, a linked inline image, an `<h2>`, a list, and a blockquote.
 */
const ARTICLE_HTML = `<!doctype html>
<html lang="zh-CN">
<body>
<div data-testid="twitterArticleReadView">
<a href="/dotey/article/2035141635713941927/media/1">
<div data-testid="tweetPhoto">
<img src="https://pbs.twimg.com/media/article-cover.jpg" alt="Image">
</div>
</a>
<div data-testid="twitter-article-title">Karpathy:"写代码"已经不是对的动词了</div>
<div data-testid="User-Name">
<a href="/dotey">宝玉 Verified account</a>
<a href="/dotey">@dotey</a>
<time datetime="2026-03-20T23:49:11.000Z">Mar 20</time>
</div>
<div data-testid="twitterArticleRichTextView">
<p>Andrej Karpathy 说他从 2024 年 12 月起就基本没手写过一行代码。</p>
<a href="/dotey/article/2035141635713941927/media/2">
<div>
<div>
<div data-testid="tweetPhoto">
<img src="https://pbs.twimg.com/media/article-inline.jpg" alt="Image">
</div>
</div>
</div>
</a>
<h2>要点速览</h2>
<ul>
<li>核心焦虑从 GPU 利用率转向 Token 吞吐量</li>
</ul>
<blockquote>
<p>写代码已经不是对的动词了。</p>
</blockquote>
</div>
</div>
</body>
</html>`;
|
||
|
||
/**
 * HTML fixture for the "parses X status pages from HTML" test.
 *
 * A single `article[data-testid="tweet"]` holding two `User-Name` / `tweetText`
 * pairs: the first from @dotey (with a `tweetPhoto` image), the second from
 * @mattshumer_, which the test expects to surface as a quoted tweet.
 */
const STATUS_HTML = `<!doctype html>
<html lang="en">
<body>
<article data-testid="tweet">
<div data-testid="User-Name">
<a href="/dotey">宝玉 Verified account</a>
<a href="/dotey">@dotey</a>
<time datetime="2026-03-22T05:33:00.000Z">Mar 22</time>
</div>
<div data-testid="tweetText">
<span>转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:</span>
</div>
<div data-testid="tweetPhoto">
<img src="https://pbs.twimg.com/media/tweet-main.jpg" alt="Image">
</div>
<div data-testid="User-Name">
<a href="/mattshumer_">Matt Shumer Verified account</a>
<a href="/mattshumer_">@mattshumer_</a>
<time datetime="2026-03-17T00:00:00.000Z">Mar 17</time>
</div>
<div data-testid="tweetText">
<span>Add this to your Codex custom instructions for a way better experience.</span>
</div>
</article>
</body>
</html>`;
|
||
|
||
/**
 * HTML fixture for the "parses archive.ph pages from CONTENT" test.
 *
 * The `q` input carries the original article URL, `#HEADER` holds archive shell
 * text the test expects to be excluded, and `#CONTENT` holds the article itself
 * (heading, two paragraphs, one image).
 */
const ARCHIVE_HTML = `<!doctype html>
<html>
<head>
<title>archive.ph</title>
</head>
<body>
<form>
<input
type="text"
name="q"
value="https://www.newscientist.com/article/2520204-major-leap-towards-reanimation-after-death-as-mammals-brain-preserved/"
>
</form>
<div id="HEADER">
Archive shell text that should be ignored when CONTENT exists.
</div>
<div id="CONTENT">
<h1>Major leap towards reanimation after death as mammal brain preserved</h1>
<p>
Researchers say the preserved structure and activity markers suggest a significant step
forward in keeping delicate brain tissue viable after clinical death.
</p>
<p>
The archive wrapper should not take precedence over the actual article body when the
CONTENT container is available for parsing.
</p>
<img src="https://cdn.example.com/brain.jpg" alt="Brain tissue">
</div>
</body>
</html>`;
|
||
|
||
/**
 * HTML fixture for the "falls back to body when archive.ph CONTENT is missing" test.
 *
 * Same archive.ph shell as the main archive fixture (title plus a `q` input with
 * the original URL) but with no `#CONTENT` element; the article lives in `<main>`.
 */
const ARCHIVE_FALLBACK_HTML = `<!doctype html>
<html>
<head>
<title>archive.ph</title>
</head>
<body>
<input type="text" name="q" value="https://example.com/fallback-story">
<main>
<h1>Fallback body parsing still works</h1>
<p>
When CONTENT is absent, the parser should fall back to the body content instead of
returning null or keeping the archive wrapper as the final URL.
</p>
<p>
This ensures archived pages with slightly different layouts still produce usable markdown.
</p>
</main>
</body>
</html>`;
|
||
|
||
/**
 * Runs one HTML fixture through the URL rule parsers.
 *
 * Base metadata is extracted first with `extractMetadataFromHtml`, using the fixed
 * `CAPTURED_AT` timestamp, and is then passed to `tryUrlRuleParsers` together with
 * the same HTML and URL.
 *
 * @param html - Raw page HTML to parse.
 * @param url - URL the HTML is treated as having been loaded from.
 * @returns Whatever `tryUrlRuleParsers` produces for this page.
 */
function parse(html: string, url: string) {
  const baseMetadata = extractMetadataFromHtml(html, url, CAPTURED_AT);
  return tryUrlRuleParsers(html, url, baseMetadata);
}
|
||
|
||
// Exercises tryUrlRuleParsers (via the parse() helper) against the inline HTML
// fixtures: two archive.ph layouts, an X article page, and an X status page.
describe("url rule parsers", () => {
  test("parses archive.ph pages from CONTENT and restores the original URL", () => {
    const result = parse(ARCHIVE_HTML, "https://archive.ph/SMcX5");

    expect(result).not.toBeNull();
    expect(result?.conversionMethod).toBe("parser:archive-ph");
    expect(result?.metadata.url).toBe(
      "https://www.newscientist.com/article/2520204-major-leap-towards-reanimation-after-death-as-mammals-brain-preserved/"
    );
    expect(result?.metadata.title).toBe(
      "Major leap towards reanimation after death as mammal brain preserved"
    );
    expect(result?.metadata.coverImage).toBe("https://cdn.example.com/brain.jpg");
    expect(result?.markdown).toContain("Researchers say the preserved structure");
    // An empty-string needle matches every string, so it could never fail.
    // Assert on the CONTENT image URL from the fixture instead.
    // NOTE(review): assumes the parser keeps the CONTENT <img> in the markdown — confirm.
    expect(result?.markdown).toContain("https://cdn.example.com/brain.jpg");
    expect(result?.markdown).not.toContain("Archive shell text that should be ignored");
  });

  test("falls back to body when archive.ph CONTENT is missing", () => {
    const result = parse(ARCHIVE_FALLBACK_HTML, "https://archive.ph/fallback");

    expect(result).not.toBeNull();
    expect(result?.conversionMethod).toBe("parser:archive-ph");
    expect(result?.metadata.url).toBe("https://example.com/fallback-story");
    expect(result?.metadata.title).toBe("Fallback body parsing still works");
    expect(result?.markdown).toContain("When CONTENT is absent");
  });

  test("parses X article pages from HTML", () => {
    const result = parse(
      ARTICLE_HTML,
      "https://x.com/dotey/article/2035141635713941927"
    );

    expect(result).not.toBeNull();
    expect(result?.conversionMethod).toBe("parser:x-article");
    expect(result?.metadata.title).toBe("Karpathy:\"写代码\"已经不是对的动词了");
    expect(result?.metadata.author).toBe("宝玉 (@dotey)");
    expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/article-cover.jpg");
    expect(result?.metadata.published).toBe("2026-03-20T23:49:11.000Z");
    expect(result?.metadata.language).toBe("zh");
    expect(result?.markdown).toContain("## 要点速览");
    // NOTE(review): this needle is a link with empty text. If the parser wraps the
    // inline image inside the link, the expected string needs the image markdown
    // between the brackets — confirm against real parser output.
    expect(result?.markdown).toContain(
      "[](/dotey/article/2035141635713941927/media/2)"
    );
    expect(result?.markdown).toContain("写代码已经不是对的动词了。");

    // Narrow explicitly instead of using a non-null assertion.
    if (!result) {
      throw new Error("expected the X article parser to return a result");
    }
    const document = createMarkdownDocument(result);
    expect(document).toContain("# Karpathy:\"写代码\"已经不是对的动词了");
  });

  test("parses X status pages from HTML without duplicating the title heading", () => {
    const result = parse(
      STATUS_HTML,
      "https://x.com/dotey/status/2035590649081196710"
    );

    expect(result).not.toBeNull();
    expect(result?.conversionMethod).toBe("parser:x-status");
    expect(result?.metadata.author).toBe("宝玉 (@dotey)");
    expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/tweet-main.jpg");
    expect(result?.metadata.language).toBe("zh");
    expect(result?.markdown).toContain("转译:把下面这段加到你的 Codex 自定义指令里");
    expect(result?.markdown).toContain("> Quote from Matt Shumer (@mattshumer_)");
    expect(result?.markdown).toContain("![");

    // Narrow explicitly instead of using a non-null assertion.
    if (!result) {
      throw new Error("expected the X status parser to return a result");
    }
    const document = createMarkdownDocument(result);
    // The tweet text must not be repeated as a standalone H1 in the final document.
    expect(document).not.toContain("\n\n# 转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:\n\n");
  });
});
|