Fix Node-compatible parser tests

This commit is contained in:
Jim Liu 宝玉 2026-03-23 12:08:51 -05:00
parent a5761dc71a
commit be601a6fd5
1 changed file with 74 additions and 69 deletions

View File

@@ -1,4 +1,5 @@
import { describe, expect, test } from "bun:test";
import assert from "node:assert/strict";
import test from "node:test";
import {
createMarkdownDocument,
@@ -129,73 +130,77 @@ function parse(html: string, url: string) {
return tryUrlRuleParsers(html, url, baseMetadata);
}
describe("url rule parsers", () => {
test("parses archive.ph pages from CONTENT and restores the original URL", () => {
const result = parse(ARCHIVE_HTML, "https://archive.ph/SMcX5");
test("parses archive.ph pages from CONTENT and restores the original URL", () => {
const result = parse(ARCHIVE_HTML, "https://archive.ph/SMcX5");
expect(result).not.toBeNull();
expect(result?.conversionMethod).toBe("parser:archive-ph");
expect(result?.metadata.url).toBe(
"https://www.newscientist.com/article/2520204-major-leap-towards-reanimation-after-death-as-mammals-brain-preserved/"
);
expect(result?.metadata.title).toBe(
"Major leap towards reanimation after death as mammal brain preserved"
);
expect(result?.metadata.coverImage).toBe("https://cdn.example.com/brain.jpg");
expect(result?.markdown).toContain("Researchers say the preserved structure");
expect(result?.markdown).toContain("![Brain tissue](https://cdn.example.com/brain.jpg)");
expect(result?.markdown).not.toContain("Archive shell text that should be ignored");
});
// expect()-style version of the archive.ph fallback case (appears to be the
// removed side of this diff — confirm against the commit).
// Feeds ARCHIVE_FALLBACK_HTML through parse() and checks the reported parser id,
// the metadata URL and title, and one expected snippet in the markdown output.
test("falls back to body when archive.ph CONTENT is missing", () => {
const result = parse(ARCHIVE_FALLBACK_HTML, "https://archive.ph/fallback");
// The expect() call does not narrow the type of `result`, so every later read
// goes through optional chaining.
expect(result).not.toBeNull();
expect(result?.conversionMethod).toBe("parser:archive-ph");
expect(result?.metadata.url).toBe("https://example.com/fallback-story");
expect(result?.metadata.title).toBe("Fallback body parsing still works");
expect(result?.markdown).toContain("When CONTENT is absent");
});
// expect()-style version of the X article case (appears to be the removed side
// of this diff — confirm against the commit).
// Parses ARTICLE_HTML for an x.com article URL, then checks the parser id, the
// extracted metadata fields, several markdown fragments, and the heading in the
// document produced by createMarkdownDocument().
test("parses X article pages from HTML", () => {
const result = parse(
ARTICLE_HTML,
"https://x.com/dotey/article/2035141635713941927"
);
// The expect() call does not narrow the type of `result`, so every later read
// goes through optional chaining.
expect(result).not.toBeNull();
expect(result?.conversionMethod).toBe("parser:x-article");
expect(result?.metadata.title).toBe("Karpathy\"写代码\"已经不是对的动词了");
expect(result?.metadata.author).toBe("宝玉 (@dotey)");
expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/article-cover.jpg");
expect(result?.metadata.published).toBe("2026-03-20T23:49:11.000Z");
expect(result?.metadata.language).toBe("zh");
expect(result?.markdown).toContain("## 要点速览");
expect(result?.markdown).toContain(
"[![](https://pbs.twimg.com/media/article-inline.jpg)](/dotey/article/2035141635713941927/media/2)"
);
expect(result?.markdown).toContain("写代码已经不是对的动词了。");
// Non-null assertion: the compiler still sees `result` as possibly null here.
const document = createMarkdownDocument(result!);
expect(document).toContain("# Karpathy\"写代码\"已经不是对的动词了");
});
// expect()-style version of the X status case (appears to be the removed side
// of this diff — confirm against the commit).
// Parses STATUS_HTML for an x.com status URL, checks the parser id, metadata and
// markdown fragments, then verifies the rendered document does not contain the
// status text as a standalone "# ..." heading.
test("parses X status pages from HTML without duplicating the title heading", () => {
const result = parse(
STATUS_HTML,
"https://x.com/dotey/status/2035590649081196710"
);
// The expect() call does not narrow the type of `result`, so every later read
// goes through optional chaining.
expect(result).not.toBeNull();
expect(result?.conversionMethod).toBe("parser:x-status");
expect(result?.metadata.author).toBe("宝玉 (@dotey)");
expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/tweet-main.jpg");
expect(result?.metadata.language).toBe("zh");
expect(result?.markdown).toContain("转译:把下面这段加到你的 Codex 自定义指令里");
expect(result?.markdown).toContain("> Quote from Matt Shumer (@mattshumer_)");
// Only checks that some image markup is present; no specific image is pinned.
expect(result?.markdown).toContain("![");
// Non-null assertion: the compiler still sees `result` as possibly null here.
const document = createMarkdownDocument(result!);
expect(document).not.toContain("\n\n# 转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:\n\n");
});
assert.ok(result);
assert.equal(result.conversionMethod, "parser:archive-ph");
assert.equal(
result.metadata.url,
"https://www.newscientist.com/article/2520204-major-leap-towards-reanimation-after-death-as-mammals-brain-preserved/"
);
assert.equal(
result.metadata.title,
"Major leap towards reanimation after death as mammal brain preserved"
);
assert.equal(result.metadata.coverImage, "https://cdn.example.com/brain.jpg");
assert.ok(result.markdown.includes("Researchers say the preserved structure"));
assert.ok(result.markdown.includes("![Brain tissue](https://cdn.example.com/brain.jpg)"));
assert.ok(!result.markdown.includes("Archive shell text that should be ignored"));
});
// archive.ph fallback case: runs the ARCHIVE_FALLBACK_HTML fixture through
// parse() and checks the parser id, the metadata URL/title, and one markdown
// snippet.
test("falls back to body when archive.ph CONTENT is missing", () => {
  const parsed = parse(ARCHIVE_FALLBACK_HTML, "https://archive.ph/fallback");
  // assert.ok narrows `parsed` to a non-nullish value for the reads below.
  assert.ok(parsed);

  const { metadata, markdown } = parsed;
  assert.equal(parsed.conversionMethod, "parser:archive-ph");
  assert.equal(metadata.url, "https://example.com/fallback-story");
  assert.equal(metadata.title, "Fallback body parsing still works");
  assert.ok(markdown.includes("When CONTENT is absent"));
});
// X article case: parses ARTICLE_HTML for an x.com article URL, then checks the
// parser id, extracted metadata, a fixed list of markdown fragments, and the
// heading in the document built by createMarkdownDocument().
test("parses X article pages from HTML", () => {
  const articleUrl = "https://x.com/dotey/article/2035141635713941927";
  const parsed = parse(ARTICLE_HTML, articleUrl);
  // assert.ok narrows `parsed` to a non-nullish value for the reads below.
  assert.ok(parsed);

  const { metadata, markdown } = parsed;
  assert.equal(parsed.conversionMethod, "parser:x-article");
  assert.equal(metadata.title, "Karpathy\"写代码\"已经不是对的动词了");
  assert.equal(metadata.author, "宝玉 (@dotey)");
  assert.equal(metadata.coverImage, "https://pbs.twimg.com/media/article-cover.jpg");
  assert.equal(metadata.published, "2026-03-20T23:49:11.000Z");
  assert.equal(metadata.language, "zh");

  // Checked in this order so the first failing fragment is reported first.
  const expectedFragments = [
    "## 要点速览",
    "[![](https://pbs.twimg.com/media/article-inline.jpg)](/dotey/article/2035141635713941927/media/2)",
    "写代码已经不是对的动词了。",
  ];
  for (const fragment of expectedFragments) {
    assert.ok(markdown.includes(fragment));
  }

  const rendered = createMarkdownDocument(parsed);
  assert.ok(rendered.includes("# Karpathy\"写代码\"已经不是对的动词了"));
});
// X status case: parses STATUS_HTML for an x.com status URL, checks the parser
// id, metadata and markdown fragments, then verifies the rendered document does
// not contain the status text as a standalone "# ..." heading.
test("parses X status pages from HTML without duplicating the title heading", () => {
  const statusUrl = "https://x.com/dotey/status/2035590649081196710";
  const parsed = parse(STATUS_HTML, statusUrl);
  // assert.ok narrows `parsed` to a non-nullish value for the reads below.
  assert.ok(parsed);

  const { metadata, markdown } = parsed;
  assert.equal(parsed.conversionMethod, "parser:x-status");
  assert.equal(metadata.author, "宝玉 (@dotey)");
  assert.equal(metadata.coverImage, "https://pbs.twimg.com/media/tweet-main.jpg");
  assert.equal(metadata.language, "zh");

  // The last entry only checks that some image markup is present.
  const expectedFragments = [
    "转译:把下面这段加到你的 Codex 自定义指令里",
    "> Quote from Matt Shumer (@mattshumer_)",
    "![",
  ];
  for (const fragment of expectedFragments) {
    assert.ok(markdown.includes(fragment));
  }

  const rendered = createMarkdownDocument(parsed);
  const duplicatedHeading =
    "\n\n# 转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:\n\n";
  assert.ok(!rendered.includes(duplicatedHeading));
});