feat(baoyu-url-to-markdown): add defuddle.md API fallback, YouTube transcripts, and modular converter architecture
This commit is contained in:
parent
994e47d1be
commit
0279fa403d
|
|
@ -1,7 +1,7 @@
|
|||
---
|
||||
name: baoyu-url-to-markdown
|
||||
description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when Defuddle fails. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown.
|
||||
version: 1.56.1
|
||||
description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, uses an upgraded Defuddle pipeline with better web-component handling and YouTube transcript extraction, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when needed. If local browser capture fails entirely, it can fall back to the hosted defuddle.md API. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown.
|
||||
version: 1.58.1
|
||||
metadata:
|
||||
openclaw:
|
||||
homepage: https://github.com/JimLiu/baoyu-skills#baoyu-url-to-markdown
|
||||
|
|
@ -29,7 +29,10 @@ Fetches any URL via Chrome CDP, saves the rendered HTML snapshot, and converts i
|
|||
| Script | Purpose |
|
||||
|--------|---------|
|
||||
| `scripts/main.ts` | CLI entry point for URL fetching |
|
||||
| `scripts/html-to-markdown.ts` | Defuddle-first conversion with automatic legacy fallback |
|
||||
| `scripts/html-to-markdown.ts` | Markdown conversion entry point and converter selection |
|
||||
| `scripts/defuddle-converter.ts` | Defuddle-based conversion |
|
||||
| `scripts/legacy-converter.ts` | Pre-Defuddle legacy extraction and markdown conversion |
|
||||
| `scripts/markdown-conversion-shared.ts` | Shared metadata parsing and markdown document helpers |
|
||||
|
||||
## Preferences (EXTEND.md)
|
||||
|
||||
|
|
@ -115,7 +118,10 @@ Full reference: [references/config/first-time-setup.md](references/config/first-
|
|||
- Two capture modes: auto or wait-for-user
|
||||
- Save rendered HTML as a sibling `-captured.html` file
|
||||
- Clean markdown output with metadata
|
||||
- Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history
|
||||
- Upgraded Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history
|
||||
- Materializes shadow DOM content before conversion so web-component pages survive serialization better
|
||||
- YouTube pages can include transcript/caption text in the markdown when YouTube exposes a caption track
|
||||
- If local browser capture fails completely, can fall back to `defuddle.md/<url>` and still save markdown
|
||||
- Handles login-required pages via wait mode
|
||||
- Download images and videos to local directories
|
||||
|
||||
|
|
@ -168,7 +174,10 @@ Each run saves two files side by side:
|
|||
- Markdown: YAML front matter with `url`, `title`, `description`, `author`, `published`, optional `coverImage`, and `captured_at`, followed by converted markdown content
|
||||
- HTML snapshot: `*-captured.html`, containing the rendered page HTML captured from Chrome
|
||||
|
||||
When Defuddle or page metadata provides a language hint, the markdown front matter also includes `language`.
|
||||
|
||||
The HTML snapshot is saved before any markdown media localization, so it stays a faithful capture of the page DOM used for conversion.
|
||||
If the hosted `defuddle.md` API fallback is used, markdown is still saved, but there is no local `-captured.html` snapshot for that run.
|
||||
|
||||
## Output Directory
|
||||
|
||||
|
|
@ -193,13 +202,16 @@ When `--download-media` is enabled:
|
|||
Conversion order:
|
||||
|
||||
1. Try Defuddle first
|
||||
2. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor
|
||||
3. The fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history
|
||||
2. For rich pages such as YouTube, prefer Defuddle's extractor-specific output (including transcripts when available) instead of replacing it with the legacy pipeline
|
||||
3. If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor
|
||||
4. If the entire local browser capture flow fails before markdown can be produced, try the hosted `https://defuddle.md/<url>` API and save its markdown output directly
|
||||
5. The legacy fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history
|
||||
|
||||
CLI output will show:
|
||||
|
||||
- `Converter: defuddle` when Defuddle succeeds
|
||||
- `Converter: legacy:...` plus `Fallback used: ...` when fallback was needed
|
||||
- `Converter: defuddle-api` when local browser capture failed and the hosted API was used instead
|
||||
|
||||
## Media Download Workflow
|
||||
|
||||
|
|
@ -232,6 +244,18 @@ Based on `download_media` setting in EXTEND.md:
|
|||
|
||||
**Troubleshooting**: Chrome not found → set `URL_CHROME_PATH`. Timeout → increase `--timeout`. Complex pages → try `--wait` mode. If markdown quality is poor, inspect the saved `-captured.html` and check whether the run logged a legacy fallback.
|
||||
|
||||
### YouTube Notes
|
||||
|
||||
- The upgraded Defuddle path uses async extractors, so YouTube pages can include transcript text directly in the markdown body.
|
||||
- Transcript availability depends on YouTube exposing a caption track. Videos with captions disabled, restricted playback, or blocked regional access may still produce description-only output.
|
||||
- If the page needs time to finish loading descriptions, chapters, or player metadata, prefer `--wait` and capture after the watch page is fully hydrated.
|
||||
|
||||
### Hosted API Fallback
|
||||
|
||||
- The hosted fallback endpoint is `https://defuddle.md/<url>`. In shell form: `curl https://defuddle.md/stephango.com`
|
||||
- Use it only when the local Chrome/CDP capture path fails outright. The local path still has higher fidelity because it can save the captured HTML and handle authenticated pages.
|
||||
- The hosted API already returns Markdown with YAML frontmatter, so save that response as-is and then apply the normal media-localization step if requested.
|
||||
|
||||
## Extension Support
|
||||
|
||||
Custom configurations via EXTEND.md. See **Preferences** section for paths and supported options.
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@
|
|||
"dependencies": {
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"baoyu-chrome-cdp": "file:./vendor/baoyu-chrome-cdp",
|
||||
"defuddle": "^0.10.0",
|
||||
"defuddle": "^0.12.0",
|
||||
"jsdom": "^24.1.3",
|
||||
"linkedom": "^0.18.12",
|
||||
"turndown": "^7.2.2",
|
||||
|
|
@ -61,7 +61,7 @@
|
|||
|
||||
"decimal.js": ["decimal.js@10.6.0", "", {}, "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg=="],
|
||||
|
||||
"defuddle": ["defuddle@0.10.0", "", { "dependencies": { "commander": "^12.1.0" }, "optionalDependencies": { "mathml-to-latex": "^1.5.0", "temml": "^0.13.1", "turndown": "^7.2.0" }, "peerDependencies": { "jsdom": "^24.0.0" }, "bin": { "defuddle": "dist/cli.js" } }, "sha512-a43juTtHv6Vs4+sxvahVLM5NxoyDsarO1Ag3UxLORI4Fo/nsNFwzDxuQBvosKVGTIRxCwN/mfnWAzNXmQfieqw=="],
|
||||
"defuddle": ["defuddle@0.12.0", "", { "dependencies": { "commander": "^12.1.0" }, "optionalDependencies": { "mathml-to-latex": "^1.5.0", "temml": "^0.13.1", "turndown": "^7.2.0" }, "peerDependencies": { "jsdom": "^24.0.0" }, "bin": { "defuddle": "dist/cli.js" } }, "sha512-Y/WgyGKBxwxFir+hWNth4nmWDDDb8BzQi3qASS2NWYPXsKU42Ku49/3M5yFYefnRef9prynnmasfnXjk99EWgA=="],
|
||||
|
||||
"delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="],
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,58 @@
|
|||
import { JSDOM, VirtualConsole } from "jsdom";
|
||||
import { Defuddle } from "defuddle/node";
|
||||
|
||||
import {
|
||||
type ConversionResult,
|
||||
type PageMetadata,
|
||||
isMarkdownUsable,
|
||||
normalizeMarkdown,
|
||||
pickString,
|
||||
} from "./markdown-conversion-shared.js";
|
||||
|
||||
export async function tryDefuddleConversion(
|
||||
html: string,
|
||||
url: string,
|
||||
baseMetadata: PageMetadata
|
||||
): Promise<{ ok: true; result: ConversionResult } | { ok: false; reason: string }> {
|
||||
try {
|
||||
const virtualConsole = new VirtualConsole();
|
||||
virtualConsole.on("jsdomError", (error: Error & { type?: string }) => {
|
||||
if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) {
|
||||
return;
|
||||
}
|
||||
console.warn(`[url-to-markdown] jsdom: ${error.message}`);
|
||||
});
|
||||
|
||||
const dom = new JSDOM(html, { url, virtualConsole });
|
||||
const result = await Defuddle(dom, url, { markdown: true });
|
||||
const markdown = normalizeMarkdown(result.content || "");
|
||||
|
||||
if (!isMarkdownUsable(markdown, html)) {
|
||||
return { ok: false, reason: "Defuddle returned empty or incomplete markdown" };
|
||||
}
|
||||
|
||||
return {
|
||||
ok: true,
|
||||
result: {
|
||||
metadata: {
|
||||
...baseMetadata,
|
||||
title: pickString(result.title, baseMetadata.title) ?? "",
|
||||
description: pickString(result.description, baseMetadata.description) ?? undefined,
|
||||
author: pickString(result.author, baseMetadata.author) ?? undefined,
|
||||
published: pickString(result.published, baseMetadata.published) ?? undefined,
|
||||
coverImage: pickString(result.image, baseMetadata.coverImage) ?? undefined,
|
||||
language: pickString(result.language, baseMetadata.language) ?? undefined,
|
||||
},
|
||||
markdown,
|
||||
rawHtml: html,
|
||||
conversionMethod: "defuddle",
|
||||
variables: result.variables,
|
||||
},
|
||||
};
|
||||
} catch (error) {
|
||||
return {
|
||||
ok: false,
|
||||
reason: error instanceof Error ? error.message : String(error),
|
||||
};
|
||||
}
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,629 @@
|
|||
import { Readability } from "@mozilla/readability";
|
||||
import TurndownService from "turndown";
|
||||
import { gfm } from "turndown-plugin-gfm";
|
||||
|
||||
import {
|
||||
type AnyRecord,
|
||||
type ConversionResult,
|
||||
type PageMetadata,
|
||||
GOOD_CONTENT_LENGTH,
|
||||
MIN_CONTENT_LENGTH,
|
||||
extractPublishedTime,
|
||||
extractTextFromHtml,
|
||||
extractTitle,
|
||||
normalizeMarkdown,
|
||||
parseDocument,
|
||||
pickString,
|
||||
sanitizeHtml,
|
||||
} from "./markdown-conversion-shared.js";
|
||||
|
||||
interface ExtractionCandidate {
|
||||
title: string | null;
|
||||
byline: string | null;
|
||||
excerpt: string | null;
|
||||
published: string | null;
|
||||
html: string | null;
|
||||
textContent: string;
|
||||
method: string;
|
||||
}
|
||||
|
||||
const CONTENT_SELECTORS = [
|
||||
"article",
|
||||
"main article",
|
||||
"[role='main'] article",
|
||||
"[itemprop='articleBody']",
|
||||
".article-content",
|
||||
".article-body",
|
||||
".post-content",
|
||||
".entry-content",
|
||||
".story-body",
|
||||
"main",
|
||||
"[role='main']",
|
||||
"#content",
|
||||
".content",
|
||||
];
|
||||
|
||||
const REMOVE_SELECTORS = [
|
||||
"script",
|
||||
"style",
|
||||
"noscript",
|
||||
"template",
|
||||
"iframe",
|
||||
"svg",
|
||||
"path",
|
||||
"nav",
|
||||
"aside",
|
||||
"footer",
|
||||
"header",
|
||||
"form",
|
||||
".advertisement",
|
||||
".ads",
|
||||
".social-share",
|
||||
".related-articles",
|
||||
".comments",
|
||||
".newsletter",
|
||||
".cookie-banner",
|
||||
".cookie-consent",
|
||||
"[role='navigation']",
|
||||
"[aria-label*='cookie' i]",
|
||||
];
|
||||
|
||||
const NEXT_DATA_CONTENT_PATHS = [
|
||||
"props.pageProps.content.body",
|
||||
"props.pageProps.article.body",
|
||||
"props.pageProps.article.content",
|
||||
"props.pageProps.post.body",
|
||||
"props.pageProps.post.content",
|
||||
"props.pageProps.data.body",
|
||||
"props.pageProps.story.body.content",
|
||||
];
|
||||
|
||||
const LOW_QUALITY_MARKERS = [
|
||||
/Join The Conversation/i,
|
||||
/One Community\. Many Voices/i,
|
||||
/Read our community guidelines/i,
|
||||
/Create a free account to share your thoughts/i,
|
||||
/Become a Forbes Member/i,
|
||||
/Subscribe to trusted journalism/i,
|
||||
/\bComments\b/i,
|
||||
];
|
||||
|
||||
function generateExcerpt(excerpt: string | null, textContent: string | null): string | null {
|
||||
if (excerpt) return excerpt;
|
||||
if (!textContent) return null;
|
||||
const trimmed = textContent.trim();
|
||||
if (!trimmed) return null;
|
||||
return trimmed.length > 200 ? `${trimmed.slice(0, 200)}...` : trimmed;
|
||||
}
|
||||
|
||||
function parseJsonLdItem(item: AnyRecord): ExtractionCandidate | null {
|
||||
const type = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"];
|
||||
if (typeof type !== "string" || !["Article", "NewsArticle", "BlogPosting", "WebPage", "ReportageNewsArticle"].includes(type)) {
|
||||
return null;
|
||||
}
|
||||
|
||||
const rawContent =
|
||||
(typeof item.articleBody === "string" && item.articleBody) ||
|
||||
(typeof item.text === "string" && item.text) ||
|
||||
(typeof item.description === "string" && item.description) ||
|
||||
null;
|
||||
|
||||
if (!rawContent) return null;
|
||||
|
||||
const content = rawContent.trim();
|
||||
const htmlLike = /<\/?[a-z][\s\S]*>/i.test(content);
|
||||
const textContent = htmlLike ? extractTextFromHtml(content) : content;
|
||||
|
||||
if (textContent.length < MIN_CONTENT_LENGTH) return null;
|
||||
|
||||
return {
|
||||
title: pickString(item.headline, item.name),
|
||||
byline: extractAuthorFromJsonLd(item.author),
|
||||
excerpt: pickString(item.description),
|
||||
published: pickString(item.datePublished, item.dateCreated),
|
||||
html: htmlLike ? content : null,
|
||||
textContent,
|
||||
method: "json-ld",
|
||||
};
|
||||
}
|
||||
|
||||
function extractAuthorFromJsonLd(authorData: unknown): string | null {
|
||||
if (typeof authorData === "string") return authorData;
|
||||
if (!authorData || typeof authorData !== "object") return null;
|
||||
|
||||
if (Array.isArray(authorData)) {
|
||||
const names = authorData
|
||||
.map((author) => extractAuthorFromJsonLd(author))
|
||||
.filter((name): name is string => Boolean(name));
|
||||
return names.length > 0 ? names.join(", ") : null;
|
||||
}
|
||||
|
||||
const author = authorData as AnyRecord;
|
||||
return typeof author.name === "string" ? author.name : null;
|
||||
}
|
||||
|
||||
function flattenJsonLdItems(data: unknown): AnyRecord[] {
|
||||
if (!data || typeof data !== "object") return [];
|
||||
if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems);
|
||||
|
||||
const item = data as AnyRecord;
|
||||
if (Array.isArray(item["@graph"])) {
|
||||
return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
|
||||
}
|
||||
|
||||
return [item];
|
||||
}
|
||||
|
||||
function tryJsonLdExtraction(document: Document): ExtractionCandidate | null {
|
||||
const scripts = document.querySelectorAll("script[type='application/ld+json']");
|
||||
|
||||
for (const script of scripts) {
|
||||
try {
|
||||
const data = JSON.parse(script.textContent ?? "");
|
||||
for (const item of flattenJsonLdItems(data)) {
|
||||
const extracted = parseJsonLdItem(item);
|
||||
if (extracted) return extracted;
|
||||
}
|
||||
} catch {
|
||||
// Ignore malformed blocks.
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
function getByPath(value: unknown, path: string): unknown {
|
||||
let current = value;
|
||||
for (const part of path.split(".")) {
|
||||
if (!current || typeof current !== "object") return undefined;
|
||||
current = (current as AnyRecord)[part];
|
||||
}
|
||||
return current;
|
||||
}
|
||||
|
||||
function isContentBlockArray(value: unknown): value is AnyRecord[] {
|
||||
if (!Array.isArray(value) || value.length === 0) return false;
|
||||
return value.slice(0, 5).some((item) => {
|
||||
if (!item || typeof item !== "object") return false;
|
||||
const obj = item as AnyRecord;
|
||||
return "type" in obj || "text" in obj || "textHtml" in obj || "content" in obj;
|
||||
});
|
||||
}
|
||||
|
||||
function extractTextFromContentBlocks(blocks: AnyRecord[]): string {
|
||||
const parts: string[] = [];
|
||||
|
||||
function pushParagraph(text: string): void {
|
||||
const trimmed = text.trim();
|
||||
if (!trimmed) return;
|
||||
parts.push(trimmed, "\n\n");
|
||||
}
|
||||
|
||||
function walk(node: unknown): void {
|
||||
if (!node || typeof node !== "object") return;
|
||||
const block = node as AnyRecord;
|
||||
|
||||
if (typeof block.text === "string") {
|
||||
pushParagraph(block.text);
|
||||
return;
|
||||
}
|
||||
|
||||
if (typeof block.textHtml === "string") {
|
||||
pushParagraph(extractTextFromHtml(block.textHtml));
|
||||
return;
|
||||
}
|
||||
|
||||
if (Array.isArray(block.items)) {
|
||||
for (const item of block.items) {
|
||||
if (item && typeof item === "object") {
|
||||
const text = pickString((item as AnyRecord).text);
|
||||
if (text) parts.push(`- ${text}\n`);
|
||||
}
|
||||
}
|
||||
parts.push("\n");
|
||||
}
|
||||
|
||||
if (Array.isArray(block.components)) {
|
||||
for (const component of block.components) {
|
||||
walk(component);
|
||||
}
|
||||
}
|
||||
|
||||
if (Array.isArray(block.content)) {
|
||||
for (const child of block.content) {
|
||||
walk(child);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (const block of blocks) {
|
||||
walk(block);
|
||||
}
|
||||
|
||||
return parts.join("").replace(/\n{3,}/g, "\n\n").trim();
|
||||
}
|
||||
|
||||
function tryStringBodyExtraction(
|
||||
content: string,
|
||||
meta: AnyRecord,
|
||||
document: Document,
|
||||
method: string
|
||||
): ExtractionCandidate | null {
|
||||
if (!content || content.length < MIN_CONTENT_LENGTH) return null;
|
||||
|
||||
const isHtml = /<\/?[a-z][\s\S]*>/i.test(content);
|
||||
const html = isHtml ? sanitizeHtml(content) : null;
|
||||
const textContent = isHtml ? extractTextFromHtml(html) : content.trim();
|
||||
|
||||
if (textContent.length < MIN_CONTENT_LENGTH) return null;
|
||||
|
||||
return {
|
||||
title: pickString(meta.headline, meta.title, extractTitle(document)),
|
||||
byline: pickString(meta.byline, meta.author),
|
||||
excerpt: pickString(meta.description, meta.excerpt, generateExcerpt(null, textContent)),
|
||||
published: pickString(meta.datePublished, meta.publishedAt, extractPublishedTime(document)),
|
||||
html,
|
||||
textContent,
|
||||
method,
|
||||
};
|
||||
}
|
||||
|
||||
function tryNextDataExtraction(document: Document): ExtractionCandidate | null {
|
||||
try {
|
||||
const script = document.querySelector("script#__NEXT_DATA__");
|
||||
if (!script?.textContent) return null;
|
||||
|
||||
const data = JSON.parse(script.textContent) as AnyRecord;
|
||||
const pageProps = (getByPath(data, "props.pageProps") ?? {}) as AnyRecord;
|
||||
|
||||
for (const path of NEXT_DATA_CONTENT_PATHS) {
|
||||
const value = getByPath(data, path);
|
||||
|
||||
if (typeof value === "string") {
|
||||
const parentPath = path.split(".").slice(0, -1).join(".");
|
||||
const parent = (getByPath(data, parentPath) ?? {}) as AnyRecord;
|
||||
const meta = {
|
||||
...pageProps,
|
||||
...parent,
|
||||
title: parent.title ?? (pageProps.title as string | undefined),
|
||||
};
|
||||
|
||||
const candidate = tryStringBodyExtraction(value, meta, document, "next-data");
|
||||
if (candidate) return candidate;
|
||||
}
|
||||
|
||||
if (isContentBlockArray(value)) {
|
||||
const textContent = extractTextFromContentBlocks(value);
|
||||
if (textContent.length < MIN_CONTENT_LENGTH) continue;
|
||||
|
||||
return {
|
||||
title: pickString(
|
||||
getByPath(data, "props.pageProps.content.headline"),
|
||||
getByPath(data, "props.pageProps.article.headline"),
|
||||
getByPath(data, "props.pageProps.article.title"),
|
||||
getByPath(data, "props.pageProps.post.title"),
|
||||
pageProps.title,
|
||||
extractTitle(document)
|
||||
),
|
||||
byline: pickString(
|
||||
getByPath(data, "props.pageProps.author.name"),
|
||||
getByPath(data, "props.pageProps.article.author.name")
|
||||
),
|
||||
excerpt: pickString(
|
||||
getByPath(data, "props.pageProps.content.description"),
|
||||
getByPath(data, "props.pageProps.article.description"),
|
||||
pageProps.description,
|
||||
generateExcerpt(null, textContent)
|
||||
),
|
||||
published: pickString(
|
||||
getByPath(data, "props.pageProps.content.datePublished"),
|
||||
getByPath(data, "props.pageProps.article.datePublished"),
|
||||
getByPath(data, "props.pageProps.publishedAt"),
|
||||
extractPublishedTime(document)
|
||||
),
|
||||
html: null,
|
||||
textContent,
|
||||
method: "next-data",
|
||||
};
|
||||
}
|
||||
}
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
function buildReadabilityCandidate(
|
||||
article: ReturnType<Readability["parse"]>,
|
||||
document: Document,
|
||||
method: string
|
||||
): ExtractionCandidate | null {
|
||||
const textContent = article?.textContent?.trim() ?? "";
|
||||
if (textContent.length < MIN_CONTENT_LENGTH) return null;
|
||||
|
||||
return {
|
||||
title: pickString(article?.title, extractTitle(document)),
|
||||
byline: pickString((article as { byline?: string } | null)?.byline),
|
||||
excerpt: pickString(article?.excerpt, generateExcerpt(null, textContent)),
|
||||
published: pickString((article as { publishedTime?: string } | null)?.publishedTime, extractPublishedTime(document)),
|
||||
html: article?.content ? sanitizeHtml(article.content) : null,
|
||||
textContent,
|
||||
method,
|
||||
};
|
||||
}
|
||||
|
||||
function tryReadability(document: Document): ExtractionCandidate | null {
|
||||
try {
|
||||
const strictClone = document.cloneNode(true) as Document;
|
||||
const strictResult = buildReadabilityCandidate(
|
||||
new Readability(strictClone).parse(),
|
||||
document,
|
||||
"readability"
|
||||
);
|
||||
if (strictResult) return strictResult;
|
||||
|
||||
const relaxedClone = document.cloneNode(true) as Document;
|
||||
return buildReadabilityCandidate(
|
||||
new Readability(relaxedClone, { charThreshold: 120 }).parse(),
|
||||
document,
|
||||
"readability-relaxed"
|
||||
);
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
function trySelectorExtraction(document: Document): ExtractionCandidate | null {
|
||||
for (const selector of CONTENT_SELECTORS) {
|
||||
const element = document.querySelector(selector);
|
||||
if (!element) continue;
|
||||
|
||||
const clone = element.cloneNode(true) as Element;
|
||||
for (const removeSelector of REMOVE_SELECTORS) {
|
||||
for (const node of clone.querySelectorAll(removeSelector)) {
|
||||
node.remove();
|
||||
}
|
||||
}
|
||||
|
||||
const html = sanitizeHtml(clone.innerHTML);
|
||||
const textContent = extractTextFromHtml(html);
|
||||
if (textContent.length < MIN_CONTENT_LENGTH) continue;
|
||||
|
||||
return {
|
||||
title: extractTitle(document),
|
||||
byline: null,
|
||||
excerpt: generateExcerpt(null, textContent),
|
||||
published: extractPublishedTime(document),
|
||||
html,
|
||||
textContent,
|
||||
method: `selector:${selector}`,
|
||||
};
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
function tryBodyExtraction(document: Document): ExtractionCandidate | null {
|
||||
const body = document.body;
|
||||
if (!body) return null;
|
||||
|
||||
const clone = body.cloneNode(true) as Element;
|
||||
for (const removeSelector of REMOVE_SELECTORS) {
|
||||
for (const node of clone.querySelectorAll(removeSelector)) {
|
||||
node.remove();
|
||||
}
|
||||
}
|
||||
|
||||
const html = sanitizeHtml(clone.innerHTML);
|
||||
const textContent = extractTextFromHtml(html);
|
||||
if (!textContent) return null;
|
||||
|
||||
return {
|
||||
title: extractTitle(document),
|
||||
byline: null,
|
||||
excerpt: generateExcerpt(null, textContent),
|
||||
published: extractPublishedTime(document),
|
||||
html,
|
||||
textContent,
|
||||
method: "body-fallback",
|
||||
};
|
||||
}
|
||||
|
||||
function pickBestCandidate(candidates: ExtractionCandidate[]): ExtractionCandidate | null {
|
||||
if (candidates.length === 0) return null;
|
||||
|
||||
const methodOrder = [
|
||||
"readability",
|
||||
"readability-relaxed",
|
||||
"next-data",
|
||||
"json-ld",
|
||||
"selector:",
|
||||
"body-fallback",
|
||||
];
|
||||
|
||||
function methodRank(method: string): number {
|
||||
const idx = methodOrder.findIndex((entry) =>
|
||||
entry.endsWith(":") ? method.startsWith(entry) : method === entry
|
||||
);
|
||||
return idx === -1 ? methodOrder.length : idx;
|
||||
}
|
||||
|
||||
const ranked = [...candidates].sort((a, b) => {
|
||||
const rankA = methodRank(a.method);
|
||||
const rankB = methodRank(b.method);
|
||||
if (rankA !== rankB) return rankA - rankB;
|
||||
return (b.textContent.length ?? 0) - (a.textContent.length ?? 0);
|
||||
});
|
||||
|
||||
for (const candidate of ranked) {
|
||||
if (candidate.textContent.length >= GOOD_CONTENT_LENGTH) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
for (const candidate of ranked) {
|
||||
if (candidate.textContent.length >= MIN_CONTENT_LENGTH) {
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
return ranked[0];
|
||||
}
|
||||
|
||||
function extractFromHtml(html: string): ExtractionCandidate | null {
|
||||
const document = parseDocument(html);
|
||||
|
||||
const readabilityCandidate = tryReadability(document);
|
||||
const nextDataCandidate = tryNextDataExtraction(document);
|
||||
const jsonLdCandidate = tryJsonLdExtraction(document);
|
||||
const selectorCandidate = trySelectorExtraction(document);
|
||||
const bodyCandidate = tryBodyExtraction(document);
|
||||
|
||||
const candidates = [
|
||||
readabilityCandidate,
|
||||
nextDataCandidate,
|
||||
jsonLdCandidate,
|
||||
selectorCandidate,
|
||||
bodyCandidate,
|
||||
].filter((candidate): candidate is ExtractionCandidate => Boolean(candidate));
|
||||
|
||||
const winner = pickBestCandidate(candidates);
|
||||
if (!winner) return null;
|
||||
|
||||
return {
|
||||
...winner,
|
||||
title: winner.title ?? extractTitle(document),
|
||||
published: winner.published ?? extractPublishedTime(document),
|
||||
excerpt: winner.excerpt ?? generateExcerpt(null, winner.textContent),
|
||||
};
|
||||
}
|
||||
|
||||
const turndown = new TurndownService({
|
||||
headingStyle: "atx",
|
||||
hr: "---",
|
||||
bulletListMarker: "-",
|
||||
codeBlockStyle: "fenced",
|
||||
emDelimiter: "*",
|
||||
strongDelimiter: "**",
|
||||
linkStyle: "inlined",
|
||||
});
|
||||
|
||||
turndown.use(gfm);
|
||||
turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]);
|
||||
|
||||
turndown.addRule("collapseFigure", {
|
||||
filter: "figure",
|
||||
replacement(content) {
|
||||
return `\n\n${content.trim()}\n\n`;
|
||||
},
|
||||
});
|
||||
|
||||
turndown.addRule("dropInvisibleAnchors", {
|
||||
filter(node) {
|
||||
return node.nodeName === "A" && !(node as Element).textContent?.trim();
|
||||
},
|
||||
replacement() {
|
||||
return "";
|
||||
},
|
||||
});
|
||||
|
||||
function convertHtmlToMarkdown(html: string): string {
|
||||
if (!html || !html.trim()) return "";
|
||||
|
||||
try {
|
||||
const sanitized = sanitizeHtml(html);
|
||||
return turndown.turndown(sanitized);
|
||||
} catch {
|
||||
return "";
|
||||
}
|
||||
}
|
||||
|
||||
function fallbackPlainText(html: string): string {
|
||||
const document = parseDocument(html);
|
||||
for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
|
||||
for (const el of document.querySelectorAll(selector)) {
|
||||
el.remove();
|
||||
}
|
||||
}
|
||||
const text = document.body?.textContent ?? document.documentElement?.textContent ?? "";
|
||||
return normalizeMarkdown(text.replace(/\s+/g, " "));
|
||||
}
|
||||
|
||||
function countBylines(markdown: string): number {
|
||||
return (markdown.match(/(^|\n)By\s+/g) || []).length;
|
||||
}
|
||||
|
||||
function countUsefulParagraphs(markdown: string): number {
|
||||
const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/);
|
||||
let count = 0;
|
||||
|
||||
for (const paragraph of paragraphs) {
|
||||
const trimmed = paragraph.trim();
|
||||
if (!trimmed) continue;
|
||||
if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) continue;
|
||||
if (/^#{1,6}\s+/.test(trimmed)) continue;
|
||||
if ((trimmed.match(/\b[\p{L}\p{N}']+\b/gu) || []).length < 8) continue;
|
||||
count++;
|
||||
}
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
function countMarkerHits(markdown: string, markers: RegExp[]): number {
|
||||
let hits = 0;
|
||||
for (const marker of markers) {
|
||||
if (marker.test(markdown)) hits++;
|
||||
}
|
||||
return hits;
|
||||
}
|
||||
|
||||
export function scoreMarkdownQuality(markdown: string): number {
|
||||
const normalized = normalizeMarkdown(markdown);
|
||||
const wordCount = (normalized.match(/\b[\p{L}\p{N}']+\b/gu) || []).length;
|
||||
const usefulParagraphs = countUsefulParagraphs(normalized);
|
||||
const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length;
|
||||
const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS);
|
||||
const bylineCount = countBylines(normalized);
|
||||
const staffCount = (normalized.match(/\bForbes Staff\b/gi) || []).length;
|
||||
|
||||
return (
|
||||
Math.min(wordCount, 4000) +
|
||||
usefulParagraphs * 40 +
|
||||
headingCount * 10 -
|
||||
markerHits * 180 -
|
||||
Math.max(0, bylineCount - 1) * 120 -
|
||||
Math.max(0, staffCount - 1) * 80
|
||||
);
|
||||
}
|
||||
|
||||
export function shouldCompareWithLegacy(markdown: string): boolean {
|
||||
const normalized = normalizeMarkdown(markdown);
|
||||
return (
|
||||
countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0 ||
|
||||
countBylines(normalized) > 1 ||
|
||||
countUsefulParagraphs(normalized) < 6
|
||||
);
|
||||
}
|
||||
|
||||
export function convertWithLegacyExtractor(html: string, baseMetadata: PageMetadata): ConversionResult {
|
||||
const extracted = extractFromHtml(html);
|
||||
|
||||
let markdown = extracted?.html ? convertHtmlToMarkdown(extracted.html) : "";
|
||||
if (!markdown.trim()) {
|
||||
markdown = extracted?.textContent?.trim() || fallbackPlainText(html);
|
||||
}
|
||||
|
||||
return {
|
||||
metadata: {
|
||||
...baseMetadata,
|
||||
title: pickString(extracted?.title, baseMetadata.title) ?? "",
|
||||
description: pickString(extracted?.excerpt, baseMetadata.description) ?? undefined,
|
||||
author: pickString(extracted?.byline, baseMetadata.author) ?? undefined,
|
||||
published: pickString(extracted?.published, baseMetadata.published) ?? undefined,
|
||||
},
|
||||
markdown: normalizeMarkdown(markdown),
|
||||
rawHtml: html,
|
||||
conversionMethod: extracted ? `legacy:${extracted.method}` : "legacy:plain-text",
|
||||
};
|
||||
}
|
||||
|
|
@ -75,6 +75,55 @@ function deriveHtmlSnapshotPath(markdownPath: string): string {
|
|||
return path.join(parsed.dir, `${basename}-captured.html`);
|
||||
}
|
||||
|
||||
function extractTitleFromMarkdownDocument(document: string): string {
|
||||
const normalized = document.replace(/\r\n/g, "\n");
|
||||
const frontmatterMatch = normalized.match(/^---\n([\s\S]*?)\n---\n?/);
|
||||
if (frontmatterMatch) {
|
||||
const titleLine = frontmatterMatch[1]
|
||||
.split("\n")
|
||||
.find((line) => /^title:\s*/i.test(line));
|
||||
|
||||
if (titleLine) {
|
||||
const rawValue = titleLine.replace(/^title:\s*/i, "").trim();
|
||||
const unquoted = rawValue
|
||||
.replace(/^"(.*)"$/, "$1")
|
||||
.replace(/^'(.*)'$/, "$1")
|
||||
.replace(/\\"/g, '"');
|
||||
if (unquoted) return unquoted;
|
||||
}
|
||||
}
|
||||
|
||||
const headingMatch = normalized.match(/^#\s+(.+)$/m);
|
||||
return headingMatch?.[1]?.trim() ?? "";
|
||||
}
|
||||
|
||||
function buildDefuddleApiUrl(targetUrl: string): string {
|
||||
return `https://defuddle.md/${encodeURIComponent(targetUrl)}`;
|
||||
}
|
||||
|
||||
async function fetchDefuddleApiMarkdown(targetUrl: string): Promise<{ markdown: string; title: string }> {
|
||||
const apiUrl = buildDefuddleApiUrl(targetUrl);
|
||||
const response = await fetch(apiUrl, {
|
||||
headers: {
|
||||
accept: "text/markdown,text/plain;q=0.9,*/*;q=0.1",
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`defuddle.md returned ${response.status} ${response.statusText}`);
|
||||
}
|
||||
|
||||
const markdown = (await response.text()).replace(/\r\n/g, "\n").trim();
|
||||
if (!markdown) {
|
||||
throw new Error("defuddle.md returned empty markdown");
|
||||
}
|
||||
|
||||
return {
|
||||
markdown,
|
||||
title: extractTitleFromMarkdownDocument(markdown),
|
||||
};
|
||||
}
|
||||
|
||||
async function generateOutputPath(url: string, title: string, outputDir?: string): Promise<string> {
|
||||
const domain = new URL(url).hostname.replace(/^www\./, "");
|
||||
const slug = generateSlug(title, url);
|
||||
|
|
@ -192,14 +241,41 @@ async function main(): Promise<void> {
|
|||
console.log(`Fetching: ${args.url}`);
|
||||
console.log(`Mode: ${args.wait ? "wait" : "auto"}`);
|
||||
|
||||
const result = await captureUrl(args);
|
||||
const outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir);
|
||||
const outputDir = path.dirname(outputPath);
|
||||
const htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath);
|
||||
await mkdir(outputDir, { recursive: true });
|
||||
await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8");
|
||||
let outputPath: string;
|
||||
let htmlSnapshotPath: string | null = null;
|
||||
let document: string;
|
||||
let conversionMethod: string;
|
||||
let fallbackReason: string | undefined;
|
||||
|
||||
let document = createMarkdownDocument(result);
|
||||
try {
|
||||
const result = await captureUrl(args);
|
||||
outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir);
|
||||
const outputDir = path.dirname(outputPath);
|
||||
htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath);
|
||||
await mkdir(outputDir, { recursive: true });
|
||||
await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8");
|
||||
|
||||
document = createMarkdownDocument(result);
|
||||
conversionMethod = result.conversionMethod;
|
||||
fallbackReason = result.fallbackReason;
|
||||
} catch (error) {
|
||||
const primaryError = error instanceof Error ? error.message : String(error);
|
||||
console.warn(`Primary capture failed: ${primaryError}`);
|
||||
console.warn("Trying defuddle.md API fallback...");
|
||||
|
||||
try {
|
||||
const remoteResult = await fetchDefuddleApiMarkdown(args.url);
|
||||
outputPath = args.output || await generateOutputPath(args.url, remoteResult.title, args.outputDir);
|
||||
await mkdir(path.dirname(outputPath), { recursive: true });
|
||||
|
||||
document = remoteResult.markdown;
|
||||
conversionMethod = "defuddle-api";
|
||||
fallbackReason = `Local browser capture failed: ${primaryError}`;
|
||||
} catch (remoteError) {
|
||||
const remoteMessage = remoteError instanceof Error ? remoteError.message : String(remoteError);
|
||||
throw new Error(`Local browser capture failed (${primaryError}); defuddle.md fallback failed (${remoteMessage})`);
|
||||
}
|
||||
}
|
||||
|
||||
if (args.downloadMedia) {
|
||||
const mediaResult = await localizeMarkdownMedia(document, {
|
||||
|
|
@ -220,11 +296,15 @@ async function main(): Promise<void> {
|
|||
await writeFile(outputPath, document, "utf-8");
|
||||
|
||||
console.log(`Saved: ${outputPath}`);
|
||||
console.log(`Saved HTML: ${htmlSnapshotPath}`);
|
||||
console.log(`Title: ${result.metadata.title || "(no title)"}`);
|
||||
console.log(`Converter: ${result.conversionMethod}`);
|
||||
if (result.fallbackReason) {
|
||||
console.warn(`Fallback used: ${result.fallbackReason}`);
|
||||
if (htmlSnapshotPath) {
|
||||
console.log(`Saved HTML: ${htmlSnapshotPath}`);
|
||||
} else {
|
||||
console.log("Saved HTML: unavailable (defuddle.md fallback)");
|
||||
}
|
||||
console.log(`Title: ${extractTitleFromMarkdownDocument(document) || "(no title)"}`);
|
||||
console.log(`Converter: ${conversionMethod}`);
|
||||
if (fallbackReason) {
|
||||
console.warn(`Fallback used: ${fallbackReason}`);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,305 @@
|
|||
import { parseHTML } from "linkedom";
|
||||
|
||||
/** Normalized page metadata captured for a fetched URL; serialized into YAML frontmatter. */
export interface PageMetadata {
  url: string;            // URL the page was fetched from
  title: string;          // best-effort title; "" when none could be found
  description?: string;   // meta/JSON-LD description when present
  author?: string;        // byline / author name when present
  published?: string;     // publish date/time as found in the page (format not normalized)
  coverImage?: string;    // URL of a representative image when present
  language?: string;      // normalized language tag, e.g. "en-US"
  captured_at: string;    // capture timestamp supplied by the caller
}
|
||||
|
||||
/** Output of a single HTML-to-markdown conversion attempt. */
export interface ConversionResult {
  metadata: PageMetadata;
  markdown: string;                    // converted body text (frontmatter is added later)
  rawHtml: string;                     // the HTML the conversion ran against
  conversionMethod: string;            // converter identifier, e.g. "legacy:<method>" or "legacy:plain-text"
  fallbackReason?: string;             // set when a fallback pipeline produced this result
  variables?: Record<string, string>;  // extra converter-specific values — presumably template variables; confirm with consumers
}
|
||||
|
||||
/** Loose record type for JSON-LD payloads and other data of unknown shape. */
export type AnyRecord = Record<string, unknown>;

// Extracted-text length thresholds shared by conversion quality checks
// (MIN_CONTENT_LENGTH is used by isMarkdownUsable below; GOOD_CONTENT_LENGTH
// is exported for converters — verify usage at call sites).
export const MIN_CONTENT_LENGTH = 120;
export const GOOD_CONTENT_LENGTH = 900;

// Selectors probed, in priority order, for a publish timestamp.
const PUBLISHED_TIME_SELECTORS = [
  "meta[property='article:published_time']",
  "meta[name='pubdate']",
  "meta[name='publishdate']",
  "meta[name='date']",
  "time[datetime]",
];

// JSON-LD @type values treated as article-like content.
const ARTICLE_TYPES = new Set([
  "Article",
  "NewsArticle",
  "BlogPosting",
  "WebPage",
  "ReportageNewsArticle",
]);
|
||||
|
||||
export function pickString(...values: unknown[]): string | null {
|
||||
for (const value of values) {
|
||||
if (typeof value === "string") {
|
||||
const trimmed = value.trim();
|
||||
if (trimmed) return trimmed;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
export function normalizeMarkdown(markdown: string): string {
|
||||
return markdown
|
||||
.replace(/\r\n/g, "\n")
|
||||
.replace(/[ \t]+\n/g, "\n")
|
||||
.replace(/\n{3,}/g, "\n\n")
|
||||
.trim();
|
||||
}
|
||||
|
||||
export function parseDocument(html: string): Document {
|
||||
const normalized = /<\s*html[\s>]/i.test(html)
|
||||
? html
|
||||
: `<!doctype html><html><body>${html}</body></html>`;
|
||||
return parseHTML(normalized).document as unknown as Document;
|
||||
}
|
||||
|
||||
export function sanitizeHtml(html: string): string {
|
||||
const { document } = parseHTML(`<div id="__root">${html}</div>`);
|
||||
const root = document.querySelector("#__root");
|
||||
if (!root) return html;
|
||||
|
||||
for (const selector of ["script", "style", "iframe", "noscript", "template", "svg", "path"]) {
|
||||
for (const el of root.querySelectorAll(selector)) {
|
||||
el.remove();
|
||||
}
|
||||
}
|
||||
|
||||
return root.innerHTML;
|
||||
}
|
||||
|
||||
export function extractTextFromHtml(html: string): string {
|
||||
const { document } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
|
||||
for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
|
||||
for (const el of document.querySelectorAll(selector)) {
|
||||
el.remove();
|
||||
}
|
||||
}
|
||||
return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? "";
|
||||
}
|
||||
|
||||
export function getMetaContent(document: Document, names: string[]): string | null {
|
||||
for (const name of names) {
|
||||
const element =
|
||||
document.querySelector(`meta[name="${name}"]`) ??
|
||||
document.querySelector(`meta[property="${name}"]`);
|
||||
const content = element?.getAttribute("content");
|
||||
if (content && content.trim()) return content.trim();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
function normalizeLanguageTag(value: string | null): string | null {
|
||||
if (!value) return null;
|
||||
|
||||
const trimmed = value.trim();
|
||||
if (!trimmed) return null;
|
||||
|
||||
const primary = trimmed.split(/[,\s;]/, 1)[0]?.trim();
|
||||
if (!primary) return null;
|
||||
|
||||
return primary.replace(/_/g, "-");
|
||||
}
|
||||
|
||||
function flattenJsonLdItems(data: unknown): AnyRecord[] {
|
||||
if (!data || typeof data !== "object") return [];
|
||||
if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems);
|
||||
|
||||
const item = data as AnyRecord;
|
||||
if (Array.isArray(item["@graph"])) {
|
||||
return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
|
||||
}
|
||||
|
||||
return [item];
|
||||
}
|
||||
|
||||
function parseJsonLdScripts(document: Document): AnyRecord[] {
|
||||
const results: AnyRecord[] = [];
|
||||
const scripts = document.querySelectorAll("script[type='application/ld+json']");
|
||||
|
||||
for (const script of scripts) {
|
||||
try {
|
||||
const data = JSON.parse(script.textContent ?? "");
|
||||
results.push(...flattenJsonLdItems(data));
|
||||
} catch {
|
||||
// Ignore malformed blocks.
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
function isArticleType(item: AnyRecord): boolean {
|
||||
const value = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"];
|
||||
return typeof value === "string" && ARTICLE_TYPES.has(value);
|
||||
}
|
||||
|
||||
function extractAuthorFromJsonLd(authorData: unknown): string | null {
|
||||
if (typeof authorData === "string") return authorData;
|
||||
if (!authorData || typeof authorData !== "object") return null;
|
||||
|
||||
if (Array.isArray(authorData)) {
|
||||
const names = authorData
|
||||
.map((author) => extractAuthorFromJsonLd(author))
|
||||
.filter((name): name is string => Boolean(name));
|
||||
return names.length > 0 ? names.join(", ") : null;
|
||||
}
|
||||
|
||||
const author = authorData as AnyRecord;
|
||||
return typeof author.name === "string" ? author.name : null;
|
||||
}
|
||||
|
||||
function extractPrimaryJsonLdMeta(document: Document): Partial<PageMetadata> {
|
||||
for (const item of parseJsonLdScripts(document)) {
|
||||
if (!isArticleType(item)) continue;
|
||||
|
||||
return {
|
||||
title: pickString(item.headline, item.name) ?? undefined,
|
||||
description: pickString(item.description) ?? undefined,
|
||||
author: extractAuthorFromJsonLd(item.author) ?? undefined,
|
||||
published: pickString(item.datePublished, item.dateCreated) ?? undefined,
|
||||
coverImage:
|
||||
pickString(
|
||||
item.image,
|
||||
(item.image as AnyRecord | undefined)?.url,
|
||||
(Array.isArray(item.image) ? item.image[0] : undefined) as unknown
|
||||
) ?? undefined,
|
||||
};
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
export function extractPublishedTime(document: Document): string | null {
|
||||
for (const selector of PUBLISHED_TIME_SELECTORS) {
|
||||
const el = document.querySelector(selector);
|
||||
if (!el) continue;
|
||||
const value = el.getAttribute("content") ?? el.getAttribute("datetime");
|
||||
if (value && value.trim()) return value.trim();
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
export function extractTitle(document: Document): string | null {
|
||||
const ogTitle = document.querySelector("meta[property='og:title']")?.getAttribute("content");
|
||||
if (ogTitle && ogTitle.trim()) return ogTitle.trim();
|
||||
|
||||
const twitterTitle = document.querySelector("meta[name='twitter:title']")?.getAttribute("content");
|
||||
if (twitterTitle && twitterTitle.trim()) return twitterTitle.trim();
|
||||
|
||||
const title = document.querySelector("title")?.textContent?.trim();
|
||||
if (title) {
|
||||
const cleaned = title.split(/\s*[-|–—]\s*/)[0]?.trim();
|
||||
if (cleaned) return cleaned;
|
||||
}
|
||||
|
||||
const h1 = document.querySelector("h1")?.textContent?.trim();
|
||||
return h1 || null;
|
||||
}
|
||||
|
||||
/**
 * Build a PageMetadata record from raw HTML by merging, in priority order,
 * standard meta tags, JSON-LD article data, and document structure
 * (<time datetime>, <h1>, <title>, lang attributes). Each field below is a
 * precedence cascade: earlier sources win.
 */
export function extractMetadataFromHtml(html: string, url: string, capturedAt: string): PageMetadata {
  const document = parseDocument(html);
  const jsonLd = extractPrimaryJsonLdMeta(document);
  const timeEl = document.querySelector("time[datetime]");
  // Language: <html lang> attribute first, then language-related meta tags.
  const htmlLang = normalizeLanguageTag(document.documentElement?.getAttribute("lang"));
  const metaLanguage = normalizeLanguageTag(
    pickString(
      getMetaContent(document, ["language", "content-language", "og:locale"]),
      document.querySelector("meta[http-equiv='content-language']")?.getAttribute("content")
    )
  );

  return {
    url,
    // Title: social meta > JSON-LD > first <h1> > <title>; "" when none.
    title:
      pickString(
        getMetaContent(document, ["og:title", "twitter:title"]),
        jsonLd.title,
        document.querySelector("h1")?.textContent,
        document.title
      ) ?? "",
    description:
      pickString(
        getMetaContent(document, ["description", "og:description", "twitter:description"]),
        jsonLd.description
      ) ?? undefined,
    author:
      pickString(
        getMetaContent(document, ["author", "article:author", "twitter:creator"]),
        jsonLd.author
      ) ?? undefined,
    // Published: explicit <time datetime> wins, then meta tags, then JSON-LD,
    // then the shared selector sweep in extractPublishedTime.
    published:
      pickString(
        timeEl?.getAttribute("datetime"),
        getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]),
        jsonLd.published,
        extractPublishedTime(document)
      ) ?? undefined,
    coverImage:
      pickString(
        getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]),
        jsonLd.coverImage
      ) ?? undefined,
    language: pickString(htmlLang, metaLanguage) ?? undefined,
    captured_at: capturedAt,
  };
}
|
||||
|
||||
export function isMarkdownUsable(markdown: string, html: string): boolean {
|
||||
const normalized = normalizeMarkdown(markdown);
|
||||
if (!normalized) return false;
|
||||
|
||||
const htmlTextLength = extractTextFromHtml(html).length;
|
||||
if (htmlTextLength < MIN_CONTENT_LENGTH) return true;
|
||||
|
||||
if (normalized.length >= 80) return true;
|
||||
return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2));
|
||||
}
|
||||
|
||||
export function isYouTubeUrl(url: string): boolean {
|
||||
try {
|
||||
const hostname = new URL(url).hostname.toLowerCase();
|
||||
return hostname === "youtu.be" || hostname.endsWith(".youtube.com") || hostname === "youtube.com";
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function escapeYamlValue(value: string): string {
|
||||
return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\r?\n/g, "\\n");
|
||||
}
|
||||
|
||||
export function formatMetadataYaml(meta: PageMetadata): string {
|
||||
const lines = ["---"];
|
||||
lines.push(`url: ${meta.url}`);
|
||||
lines.push(`title: "${escapeYamlValue(meta.title)}"`);
|
||||
if (meta.description) lines.push(`description: "${escapeYamlValue(meta.description)}"`);
|
||||
if (meta.author) lines.push(`author: "${escapeYamlValue(meta.author)}"`);
|
||||
if (meta.published) lines.push(`published: "${escapeYamlValue(meta.published)}"`);
|
||||
if (meta.coverImage) lines.push(`coverImage: "${escapeYamlValue(meta.coverImage)}"`);
|
||||
if (meta.language) lines.push(`language: "${escapeYamlValue(meta.language)}"`);
|
||||
lines.push(`captured_at: "${escapeYamlValue(meta.captured_at)}"`);
|
||||
lines.push("---");
|
||||
return lines.join("\n");
|
||||
}
|
||||
|
||||
export function createMarkdownDocument(result: ConversionResult): string {
|
||||
const yaml = formatMetadataYaml(result.metadata);
|
||||
const escapedTitle = result.metadata.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
|
||||
const titleRegex = new RegExp(`^#\\s+${escapedTitle}\\s*(\\n|$)`, "i");
|
||||
const hasTitle = titleRegex.test(result.markdown.trimStart());
|
||||
const title = result.metadata.title && !hasTitle ? `\n\n# ${result.metadata.title}\n\n` : "\n\n";
|
||||
return yaml + title + result.markdown;
|
||||
}
|
||||
|
|
@ -5,7 +5,7 @@
|
|||
"dependencies": {
|
||||
"@mozilla/readability": "^0.6.0",
|
||||
"baoyu-chrome-cdp": "file:./vendor/baoyu-chrome-cdp",
|
||||
"defuddle": "^0.10.0",
|
||||
"defuddle": "^0.12.0",
|
||||
"jsdom": "^24.1.3",
|
||||
"linkedom": "^0.18.12",
|
||||
"turndown": "^7.2.2",
|
||||
|
|
|
|||
Loading…
Reference in New Issue