Fix Node-compatible parser tests (#107)
* Fix Node-compatible parser tests
* Add parser test dependencies to root test env
This commit is contained in:
parent
a5761dc71a
commit
d4e80b1bc3
|
|
@@ -9,7 +9,11 @@
|
|||
"packages/*"
|
||||
],
|
||||
"devDependencies": {
|
||||
"tsx": "^4.20.5"
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"linkedom": "^0.18.12",
|
||||
"tsx": "^4.20.5",
|
||||
"turndown": "^7.2.2",
|
||||
"turndown-plugin-gfm": "^1.0.2"
|
||||
}
|
||||
},
|
||||
"node_modules/@esbuild/aix-ppc64": {
|
||||
|
|
@@ -454,6 +458,23 @@
|
|||
"node": ">=18"
|
||||
}
|
||||
},
|
||||
"node_modules/@mixmark-io/domino": {
|
||||
"version": "2.2.0",
|
||||
"resolved": "https://registry.npmjs.org/@mixmark-io/domino/-/domino-2.2.0.tgz",
|
||||
"integrity": "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw==",
|
||||
"dev": true,
|
||||
"license": "BSD-2-Clause"
|
||||
},
|
||||
"node_modules/@mozilla/readability": {
|
||||
"version": "0.6.0",
|
||||
"resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.6.0.tgz",
|
||||
"integrity": "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==",
|
||||
"dev": true,
|
||||
"license": "Apache-2.0",
|
||||
"engines": {
|
||||
"node": ">=14.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/@types/debug": {
|
||||
"version": "4.1.12",
|
||||
"resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz",
|
||||
|
|
@@ -615,6 +636,13 @@
|
|||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
},
|
||||
"node_modules/cssom": {
|
||||
"version": "0.5.0",
|
||||
"resolved": "https://registry.npmjs.org/cssom/-/cssom-0.5.0.tgz",
|
||||
"integrity": "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/debug": {
|
||||
"version": "4.4.3",
|
||||
"resolved": "https://registry.npmjs.org/debug/-/debug-4.4.3.tgz",
|
||||
|
|
@@ -896,6 +924,13 @@
|
|||
"node": ">=12.0.0"
|
||||
}
|
||||
},
|
||||
"node_modules/html-escaper": {
|
||||
"version": "3.0.3",
|
||||
"resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-3.0.3.tgz",
|
||||
"integrity": "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/htmlparser2": {
|
||||
"version": "9.1.0",
|
||||
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-9.1.0.tgz",
|
||||
|
|
@@ -984,6 +1019,51 @@
|
|||
"node": ">=18.17"
|
||||
}
|
||||
},
|
||||
"node_modules/linkedom": {
|
||||
"version": "0.18.12",
|
||||
"resolved": "https://registry.npmjs.org/linkedom/-/linkedom-0.18.12.tgz",
|
||||
"integrity": "sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q==",
|
||||
"dev": true,
|
||||
"license": "ISC",
|
||||
"dependencies": {
|
||||
"css-select": "^5.1.0",
|
||||
"cssom": "^0.5.0",
|
||||
"html-escaper": "^3.0.3",
|
||||
"htmlparser2": "^10.0.0",
|
||||
"uhyphen": "^0.2.0"
|
||||
},
|
||||
"engines": {
|
||||
"node": ">=16"
|
||||
},
|
||||
"peerDependencies": {
|
||||
"canvas": ">= 2"
|
||||
},
|
||||
"peerDependenciesMeta": {
|
||||
"canvas": {
|
||||
"optional": true
|
||||
}
|
||||
}
|
||||
},
|
||||
"node_modules/linkedom/node_modules/htmlparser2": {
|
||||
"version": "10.1.0",
|
||||
"resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-10.1.0.tgz",
|
||||
"integrity": "sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ==",
|
||||
"dev": true,
|
||||
"funding": [
|
||||
"https://github.com/fb55/htmlparser2?sponsor=1",
|
||||
{
|
||||
"type": "github",
|
||||
"url": "https://github.com/sponsors/fb55"
|
||||
}
|
||||
],
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"domelementtype": "^2.3.0",
|
||||
"domhandler": "^5.0.3",
|
||||
"domutils": "^3.2.2",
|
||||
"entities": "^7.0.1"
|
||||
}
|
||||
},
|
||||
"node_modules/longest-streak": {
|
||||
"version": "3.1.0",
|
||||
"resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz",
|
||||
|
|
@@ -1768,6 +1848,30 @@
|
|||
"fsevents": "~2.3.3"
|
||||
}
|
||||
},
|
||||
"node_modules/turndown": {
|
||||
"version": "7.2.2",
|
||||
"resolved": "https://registry.npmjs.org/turndown/-/turndown-7.2.2.tgz",
|
||||
"integrity": "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ==",
|
||||
"dev": true,
|
||||
"license": "MIT",
|
||||
"dependencies": {
|
||||
"@mixmark-io/domino": "^2.2.0"
|
||||
}
|
||||
},
|
||||
"node_modules/turndown-plugin-gfm": {
|
||||
"version": "1.0.2",
|
||||
"resolved": "https://registry.npmjs.org/turndown-plugin-gfm/-/turndown-plugin-gfm-1.0.2.tgz",
|
||||
"integrity": "sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg==",
|
||||
"dev": true,
|
||||
"license": "MIT"
|
||||
},
|
||||
"node_modules/uhyphen": {
|
||||
"version": "0.2.0",
|
||||
"resolved": "https://registry.npmjs.org/uhyphen/-/uhyphen-0.2.0.tgz",
|
||||
"integrity": "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA==",
|
||||
"dev": true,
|
||||
"license": "ISC"
|
||||
},
|
||||
"node_modules/undici": {
|
||||
"version": "6.24.0",
|
||||
"resolved": "https://registry.npmjs.org/undici/-/undici-6.24.0.tgz",
|
||||
|
|
|
|||
|
|
@@ -10,6 +10,10 @@
|
|||
"test:coverage": "node --import tsx --experimental-test-coverage --test"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"linkedom": "^0.18.12",
|
||||
"turndown": "^7.2.2",
|
||||
"turndown-plugin-gfm": "^1.0.2",
|
||||
"tsx": "^4.20.5"
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@@ -1,4 +1,5 @@
|
|||
import { describe, expect, test } from "bun:test";
|
||||
import assert from "node:assert/strict";
|
||||
import test from "node:test";
|
||||
|
||||
import {
|
||||
createMarkdownDocument,
|
||||
|
|
@@ -129,73 +130,77 @@ function parse(html: string, url: string) {
|
|||
return tryUrlRuleParsers(html, url, baseMetadata);
|
||||
}
|
||||
|
||||
describe("url rule parsers", () => {
|
||||
test("parses archive.ph pages from CONTENT and restores the original URL", () => {
|
||||
test("parses archive.ph pages from CONTENT and restores the original URL", () => {
|
||||
const result = parse(ARCHIVE_HTML, "https://archive.ph/SMcX5");
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.conversionMethod).toBe("parser:archive-ph");
|
||||
expect(result?.metadata.url).toBe(
|
||||
assert.ok(result);
|
||||
assert.equal(result.conversionMethod, "parser:archive-ph");
|
||||
assert.equal(
|
||||
result.metadata.url,
|
||||
"https://www.newscientist.com/article/2520204-major-leap-towards-reanimation-after-death-as-mammals-brain-preserved/"
|
||||
);
|
||||
expect(result?.metadata.title).toBe(
|
||||
assert.equal(
|
||||
result.metadata.title,
|
||||
"Major leap towards reanimation after death as mammal brain preserved"
|
||||
);
|
||||
expect(result?.metadata.coverImage).toBe("https://cdn.example.com/brain.jpg");
|
||||
expect(result?.markdown).toContain("Researchers say the preserved structure");
|
||||
expect(result?.markdown).toContain("");
|
||||
expect(result?.markdown).not.toContain("Archive shell text that should be ignored");
|
||||
});
|
||||
assert.equal(result.metadata.coverImage, "https://cdn.example.com/brain.jpg");
|
||||
assert.ok(result.markdown.includes("Researchers say the preserved structure"));
|
||||
assert.ok(result.markdown.includes(""));
|
||||
assert.ok(!result.markdown.includes("Archive shell text that should be ignored"));
|
||||
});
|
||||
|
||||
test("falls back to body when archive.ph CONTENT is missing", () => {
|
||||
test("falls back to body when archive.ph CONTENT is missing", () => {
|
||||
const result = parse(ARCHIVE_FALLBACK_HTML, "https://archive.ph/fallback");
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.conversionMethod).toBe("parser:archive-ph");
|
||||
expect(result?.metadata.url).toBe("https://example.com/fallback-story");
|
||||
expect(result?.metadata.title).toBe("Fallback body parsing still works");
|
||||
expect(result?.markdown).toContain("When CONTENT is absent");
|
||||
});
|
||||
assert.ok(result);
|
||||
assert.equal(result.conversionMethod, "parser:archive-ph");
|
||||
assert.equal(result.metadata.url, "https://example.com/fallback-story");
|
||||
assert.equal(result.metadata.title, "Fallback body parsing still works");
|
||||
assert.ok(result.markdown.includes("When CONTENT is absent"));
|
||||
});
|
||||
|
||||
test("parses X article pages from HTML", () => {
|
||||
test("parses X article pages from HTML", () => {
|
||||
const result = parse(
|
||||
ARTICLE_HTML,
|
||||
"https://x.com/dotey/article/2035141635713941927"
|
||||
);
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.conversionMethod).toBe("parser:x-article");
|
||||
expect(result?.metadata.title).toBe("Karpathy:\"写代码\"已经不是对的动词了");
|
||||
expect(result?.metadata.author).toBe("宝玉 (@dotey)");
|
||||
expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/article-cover.jpg");
|
||||
expect(result?.metadata.published).toBe("2026-03-20T23:49:11.000Z");
|
||||
expect(result?.metadata.language).toBe("zh");
|
||||
expect(result?.markdown).toContain("## 要点速览");
|
||||
expect(result?.markdown).toContain(
|
||||
assert.ok(result);
|
||||
assert.equal(result.conversionMethod, "parser:x-article");
|
||||
assert.equal(result.metadata.title, "Karpathy:\"写代码\"已经不是对的动词了");
|
||||
assert.equal(result.metadata.author, "宝玉 (@dotey)");
|
||||
assert.equal(result.metadata.coverImage, "https://pbs.twimg.com/media/article-cover.jpg");
|
||||
assert.equal(result.metadata.published, "2026-03-20T23:49:11.000Z");
|
||||
assert.equal(result.metadata.language, "zh");
|
||||
assert.ok(result.markdown.includes("## 要点速览"));
|
||||
assert.ok(
|
||||
result.markdown.includes(
|
||||
"[](/dotey/article/2035141635713941927/media/2)"
|
||||
)
|
||||
);
|
||||
expect(result?.markdown).toContain("写代码已经不是对的动词了。");
|
||||
assert.ok(result.markdown.includes("写代码已经不是对的动词了。"));
|
||||
|
||||
const document = createMarkdownDocument(result!);
|
||||
expect(document).toContain("# Karpathy:\"写代码\"已经不是对的动词了");
|
||||
});
|
||||
const document = createMarkdownDocument(result);
|
||||
assert.ok(document.includes("# Karpathy:\"写代码\"已经不是对的动词了"));
|
||||
});
|
||||
|
||||
test("parses X status pages from HTML without duplicating the title heading", () => {
|
||||
test("parses X status pages from HTML without duplicating the title heading", () => {
|
||||
const result = parse(
|
||||
STATUS_HTML,
|
||||
"https://x.com/dotey/status/2035590649081196710"
|
||||
);
|
||||
|
||||
expect(result).not.toBeNull();
|
||||
expect(result?.conversionMethod).toBe("parser:x-status");
|
||||
expect(result?.metadata.author).toBe("宝玉 (@dotey)");
|
||||
expect(result?.metadata.coverImage).toBe("https://pbs.twimg.com/media/tweet-main.jpg");
|
||||
expect(result?.metadata.language).toBe("zh");
|
||||
expect(result?.markdown).toContain("转译:把下面这段加到你的 Codex 自定义指令里");
|
||||
expect(result?.markdown).toContain("> Quote from Matt Shumer (@mattshumer_)");
|
||||
expect(result?.markdown).toContain("![");
|
||||
assert.ok(result);
|
||||
assert.equal(result.conversionMethod, "parser:x-status");
|
||||
assert.equal(result.metadata.author, "宝玉 (@dotey)");
|
||||
assert.equal(result.metadata.coverImage, "https://pbs.twimg.com/media/tweet-main.jpg");
|
||||
assert.equal(result.metadata.language, "zh");
|
||||
assert.ok(result.markdown.includes("转译:把下面这段加到你的 Codex 自定义指令里"));
|
||||
assert.ok(result.markdown.includes("> Quote from Matt Shumer (@mattshumer_)"));
|
||||
assert.ok(result.markdown.includes("!["));
|
||||
|
||||
const document = createMarkdownDocument(result!);
|
||||
expect(document).not.toContain("\n\n# 转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:\n\n");
|
||||
});
|
||||
const document = createMarkdownDocument(result);
|
||||
assert.ok(
|
||||
!document.includes("\n\n# 转译:把下面这段加到你的 Codex 自定义指令里,体验会好太多:\n\n")
|
||||
);
|
||||
});
|
||||
|
|
|
|||
Loading…
Reference in New Issue