JimLiu-baoyu-skills/skills/baoyu-url-to-markdown/scripts/markdown-conversion-shared.ts

306 lines
9.7 KiB
TypeScript

import { parseHTML } from "linkedom";
export interface PageMetadata {
url: string;
title: string;
description?: string;
author?: string;
published?: string;
coverImage?: string;
language?: string;
captured_at: string;
}
export interface ConversionResult {
metadata: PageMetadata;
markdown: string;
rawHtml: string;
conversionMethod: string;
fallbackReason?: string;
variables?: Record<string, string>;
}
export type AnyRecord = Record<string, unknown>;
export const MIN_CONTENT_LENGTH = 120;
export const GOOD_CONTENT_LENGTH = 900;
const PUBLISHED_TIME_SELECTORS = [
"meta[property='article:published_time']",
"meta[name='pubdate']",
"meta[name='publishdate']",
"meta[name='date']",
"time[datetime]",
];
const ARTICLE_TYPES = new Set([
"Article",
"NewsArticle",
"BlogPosting",
"WebPage",
"ReportageNewsArticle",
]);
export function pickString(...values: unknown[]): string | null {
for (const value of values) {
if (typeof value === "string") {
const trimmed = value.trim();
if (trimmed) return trimmed;
}
}
return null;
}
export function normalizeMarkdown(markdown: string): string {
return markdown
.replace(/\r\n/g, "\n")
.replace(/[ \t]+\n/g, "\n")
.replace(/\n{3,}/g, "\n\n")
.trim();
}
export function parseDocument(html: string): Document {
const normalized = /<\s*html[\s>]/i.test(html)
? html
: `<!doctype html><html><body>${html}</body></html>`;
return parseHTML(normalized).document as unknown as Document;
}
export function sanitizeHtml(html: string): string {
const { document } = parseHTML(`<div id="__root">${html}</div>`);
const root = document.querySelector("#__root");
if (!root) return html;
for (const selector of ["script", "style", "iframe", "noscript", "template", "svg", "path"]) {
for (const el of root.querySelectorAll(selector)) {
el.remove();
}
}
return root.innerHTML;
}
export function extractTextFromHtml(html: string): string {
const { document } = parseHTML(`<!doctype html><html><body>${html}</body></html>`);
for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) {
for (const el of document.querySelectorAll(selector)) {
el.remove();
}
}
return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? "";
}
export function getMetaContent(document: Document, names: string[]): string | null {
for (const name of names) {
const element =
document.querySelector(`meta[name="${name}"]`) ??
document.querySelector(`meta[property="${name}"]`);
const content = element?.getAttribute("content");
if (content && content.trim()) return content.trim();
}
return null;
}
function normalizeLanguageTag(value: string | null): string | null {
if (!value) return null;
const trimmed = value.trim();
if (!trimmed) return null;
const primary = trimmed.split(/[,\s;]/, 1)[0]?.trim();
if (!primary) return null;
return primary.replace(/_/g, "-");
}
function flattenJsonLdItems(data: unknown): AnyRecord[] {
if (!data || typeof data !== "object") return [];
if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems);
const item = data as AnyRecord;
if (Array.isArray(item["@graph"])) {
return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems);
}
return [item];
}
function parseJsonLdScripts(document: Document): AnyRecord[] {
const results: AnyRecord[] = [];
const scripts = document.querySelectorAll("script[type='application/ld+json']");
for (const script of scripts) {
try {
const data = JSON.parse(script.textContent ?? "");
results.push(...flattenJsonLdItems(data));
} catch {
// Ignore malformed blocks.
}
}
return results;
}
function isArticleType(item: AnyRecord): boolean {
const value = Array.isArray(item["@type"]) ? item["@type"][0] : item["@type"];
return typeof value === "string" && ARTICLE_TYPES.has(value);
}
function extractAuthorFromJsonLd(authorData: unknown): string | null {
if (typeof authorData === "string") return authorData;
if (!authorData || typeof authorData !== "object") return null;
if (Array.isArray(authorData)) {
const names = authorData
.map((author) => extractAuthorFromJsonLd(author))
.filter((name): name is string => Boolean(name));
return names.length > 0 ? names.join(", ") : null;
}
const author = authorData as AnyRecord;
return typeof author.name === "string" ? author.name : null;
}
function extractPrimaryJsonLdMeta(document: Document): Partial<PageMetadata> {
for (const item of parseJsonLdScripts(document)) {
if (!isArticleType(item)) continue;
return {
title: pickString(item.headline, item.name) ?? undefined,
description: pickString(item.description) ?? undefined,
author: extractAuthorFromJsonLd(item.author) ?? undefined,
published: pickString(item.datePublished, item.dateCreated) ?? undefined,
coverImage:
pickString(
item.image,
(item.image as AnyRecord | undefined)?.url,
(Array.isArray(item.image) ? item.image[0] : undefined) as unknown
) ?? undefined,
};
}
return {};
}
export function extractPublishedTime(document: Document): string | null {
for (const selector of PUBLISHED_TIME_SELECTORS) {
const el = document.querySelector(selector);
if (!el) continue;
const value = el.getAttribute("content") ?? el.getAttribute("datetime");
if (value && value.trim()) return value.trim();
}
return null;
}
export function extractTitle(document: Document): string | null {
const ogTitle = document.querySelector("meta[property='og:title']")?.getAttribute("content");
if (ogTitle && ogTitle.trim()) return ogTitle.trim();
const twitterTitle = document.querySelector("meta[name='twitter:title']")?.getAttribute("content");
if (twitterTitle && twitterTitle.trim()) return twitterTitle.trim();
const title = document.querySelector("title")?.textContent?.trim();
if (title) {
const cleaned = title.split(/\s*[-|–—]\s*/)[0]?.trim();
if (cleaned) return cleaned;
}
const h1 = document.querySelector("h1")?.textContent?.trim();
return h1 || null;
}
export function extractMetadataFromHtml(html: string, url: string, capturedAt: string): PageMetadata {
const document = parseDocument(html);
const jsonLd = extractPrimaryJsonLdMeta(document);
const timeEl = document.querySelector("time[datetime]");
const htmlLang = normalizeLanguageTag(document.documentElement?.getAttribute("lang"));
const metaLanguage = normalizeLanguageTag(
pickString(
getMetaContent(document, ["language", "content-language", "og:locale"]),
document.querySelector("meta[http-equiv='content-language']")?.getAttribute("content")
)
);
return {
url,
title:
pickString(
getMetaContent(document, ["og:title", "twitter:title"]),
jsonLd.title,
document.querySelector("h1")?.textContent,
document.title
) ?? "",
description:
pickString(
getMetaContent(document, ["description", "og:description", "twitter:description"]),
jsonLd.description
) ?? undefined,
author:
pickString(
getMetaContent(document, ["author", "article:author", "twitter:creator"]),
jsonLd.author
) ?? undefined,
published:
pickString(
timeEl?.getAttribute("datetime"),
getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]),
jsonLd.published,
extractPublishedTime(document)
) ?? undefined,
coverImage:
pickString(
getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]),
jsonLd.coverImage
) ?? undefined,
language: pickString(htmlLang, metaLanguage) ?? undefined,
captured_at: capturedAt,
};
}
export function isMarkdownUsable(markdown: string, html: string): boolean {
const normalized = normalizeMarkdown(markdown);
if (!normalized) return false;
const htmlTextLength = extractTextFromHtml(html).length;
if (htmlTextLength < MIN_CONTENT_LENGTH) return true;
if (normalized.length >= 80) return true;
return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2));
}
export function isYouTubeUrl(url: string): boolean {
try {
const hostname = new URL(url).hostname.toLowerCase();
return hostname === "youtu.be" || hostname.endsWith(".youtube.com") || hostname === "youtube.com";
} catch {
return false;
}
}
function escapeYamlValue(value: string): string {
return value.replace(/\\/g, "\\\\").replace(/"/g, '\\"').replace(/\r?\n/g, "\\n");
}
export function formatMetadataYaml(meta: PageMetadata): string {
const lines = ["---"];
lines.push(`url: ${meta.url}`);
lines.push(`title: "${escapeYamlValue(meta.title)}"`);
if (meta.description) lines.push(`description: "${escapeYamlValue(meta.description)}"`);
if (meta.author) lines.push(`author: "${escapeYamlValue(meta.author)}"`);
if (meta.published) lines.push(`published: "${escapeYamlValue(meta.published)}"`);
if (meta.coverImage) lines.push(`coverImage: "${escapeYamlValue(meta.coverImage)}"`);
if (meta.language) lines.push(`language: "${escapeYamlValue(meta.language)}"`);
lines.push(`captured_at: "${escapeYamlValue(meta.captured_at)}"`);
lines.push("---");
return lines.join("\n");
}
export function createMarkdownDocument(result: ConversionResult): string {
const yaml = formatMetadataYaml(result.metadata);
const escapedTitle = result.metadata.title.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
const titleRegex = new RegExp(`^#\\s+${escapedTitle}\\s*(\\n|$)`, "i");
const hasTitle = titleRegex.test(result.markdown.trimStart());
const title = result.metadata.title && !hasTitle ? `\n\n# ${result.metadata.title}\n\n` : "\n\n";
return yaml + title + result.markdown;
}