JimLiu-baoyu-skills/skills/baoyu-url-to-markdown/scripts/content-cleaner.ts

433 lines
11 KiB
TypeScript

import { parseHTML } from "linkedom";
/**
 * Options accepted by `cleanHtml` / `cleanContent`.
 *
 * Every field is optional. `cleanHtml` defaults the three boolean flags to
 * `true` when they are left `undefined`.
 */
export interface CleaningOptions {
/** When true, elements matching `AD_SELECTORS` are removed. */
removeAds?: boolean;
/** When true, `data:` images, `data:` sources and inline `data:image` backgrounds are stripped. */
removeBase64Images?: boolean;
/** When true, navigation-like elements are pruned and the body is reduced to the detected main content, if one is found. */
onlyMainContent?: boolean;
/** CSS selectors; when at least one element matches, the body is replaced by clones of the matched elements. */
includeTags?: string[];
/** CSS selectors whose matching elements are removed from the document. */
excludeTags?: string[];
}
/**
 * Selectors that `cleanHtml` removes unconditionally, regardless of options:
 * scripts and styles, hidden elements, icon SVGs, embedded objects/frames,
 * and form controls.
 */
const ALWAYS_REMOVE_SELECTORS = [
"script",
"style",
"noscript",
"link[rel='stylesheet']",
"[hidden]",
"[aria-hidden='true']",
// Inline styles are matched both with and without a space after the colon.
"[style*='display: none']",
"[style*='display:none']",
"[style*='visibility: hidden']",
"[style*='visibility:hidden']",
"svg[aria-hidden='true']",
"svg.icon",
"svg[class*='icon']",
"template",
"meta",
"iframe",
"canvas",
"object",
"embed",
"form",
"input",
"select",
"textarea",
"button",
];
/**
 * Selectors for modal/popup/dialog containers, cookie and consent banners,
 * and elements with inline fixed or sticky positioning. `cleanHtml` removes
 * every match unconditionally.
 */
const OVERLAY_SELECTORS = [
// Substring matches on the class attribute.
"[class*='modal']",
"[class*='popup']",
"[class*='overlay']",
"[class*='dialog']",
"[role='dialog']",
"[role='alertdialog']",
"[class*='cookie']",
"[class*='consent']",
"[class*='gdpr']",
"[class*='privacy-banner']",
"[class*='notification-bar']",
// Substring matches on the id attribute.
"[id*='cookie']",
"[id*='consent']",
"[id*='gdpr']",
// Inline positioning, matched with and without a space after the colon.
"[style*='position: fixed']",
"[style*='position:fixed']",
"[style*='position: sticky']",
"[style*='position:sticky']",
];
/**
 * Selectors for page chrome (headers, footers, sidebars, menus, ads, social
 * and share widgets, breadcrumbs, language pickers, cookie boxes).
 *
 * Used by `cleanHtml` only when `onlyMainContent` is enabled, and always via
 * `removeWithProtection` with `FORCE_INCLUDE_SELECTORS` as the protected set,
 * so a match that is or contains a protected element is left in place.
 */
const NAVIGATION_SELECTORS = [
"header",
"footer",
"nav",
"aside",
".header",
".top",
".navbar",
"#header",
".footer",
".bottom",
"#footer",
".sidebar",
".side",
".aside",
"#sidebar",
".modal",
".popup",
"#modal",
".overlay",
".ad",
".ads",
".advert",
"#ad",
".lang-selector",
".language",
"#language-selector",
".social",
".social-media",
".social-links",
"#social",
".menu",
".navigation",
"#nav",
".breadcrumbs",
"#breadcrumbs",
".share",
"#share",
".widget",
"#widget",
".cookie",
"#cookie",
];
/**
 * Selectors identifying likely main-content containers.
 *
 * `cleanHtml` passes this list to `removeWithProtection` as the protected
 * set: an element targeted by `NAVIGATION_SELECTORS` is kept if it matches
 * one of these selectors or has a descendant that does.
 */
const FORCE_INCLUDE_SELECTORS = [
"#main",
"#content",
"#main-content",
"#article",
"#post",
"#page-content",
"main",
"article",
"[role='main']",
".main-content",
".content",
".post-content",
".article-content",
".entry-content",
".page-content",
".article-body",
".post-body",
".story-content",
".blog-content",
];
/**
 * Selectors for advertising containers and tracking images. `cleanHtml`
 * removes every match when the `removeAds` option is enabled (the default).
 */
const AD_SELECTORS = [
"ins.adsbygoogle",
".google-ad",
".adsense",
"[data-ad]",
"[data-ads]",
"[data-ad-slot]",
"[data-ad-client]",
".ad-container",
".ad-wrapper",
".advertisement",
".sponsored-content",
// Images declared as 1x1 via their width/height attributes.
"img[width='1'][height='1']",
// Images whose URL contains one of these substrings.
"img[src*='pixel']",
"img[src*='tracking']",
"img[src*='analytics']",
];
/**
 * Measures how much of an element's text sits inside `<a>` descendants.
 *
 * @param element - Element whose text and anchors are measured.
 * @returns The summed trimmed anchor-text length divided by the element's
 *   trimmed text length, or `1` when the element has no text at all.
 */
function getLinkDensity(element: Element): number {
  const totalChars = (element.textContent || "").trim().length;
  if (totalChars === 0) {
    // An element without text is treated as maximally link-dense.
    return 1;
  }

  const anchorChars = Array.from(element.querySelectorAll("a")).reduce(
    (sum: number, anchor: Element) => sum + (anchor.textContent || "").trim().length,
    0
  );
  return anchorChars / totalChars;
}
/**
 * Matches an advertising token (`ad`, `ads`, `adsense`, `adsbygoogle`,
 * `advert…`) only when it is delimited by non-letters, so words that merely
 * contain the letters "ad" ("heading", "thread", "readme", "shadow") do not match.
 */
const AD_TOKEN_PATTERN = /(?:^|[^a-z])ad(?:s|sense|sbygoogle|vert[a-z]*)?(?![a-z])/i;

/** Matches camelCase ad tokens such as `topAd` or `sidebarAds` (case-sensitive on purpose). */
const AD_CAMEL_PATTERN = /[a-z0-9]Ad(?:s|vert[a-z]*)?(?![a-z])/;

/**
 * Heuristically scores how likely an element is to be the page's main content.
 *
 * Positive signals: text length (1 point per 100 trimmed characters, capped at
 * 50), paragraphs (3 each), headings (2 each), images (1 each), and a class/id
 * that names content (+25). Negative signals: links (0.5 each), list items
 * (0.2 each), high link density (-15 above 0.3, -30 above 0.5), and a class/id
 * that names page chrome or advertising (-25).
 *
 * @param element - Candidate container.
 * @returns The score; higher means more content-like. May be negative.
 */
function getContentScore(element: Element): number {
  let score = 0;
  const textLength = (element.textContent || "").trim().length;
  score += Math.min(textLength / 100, 50);
  score += element.querySelectorAll("p").length * 3;
  score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
  score += element.querySelectorAll("img").length;
  score -= element.querySelectorAll("a").length * 0.5;
  score -= element.querySelectorAll("li").length * 0.2;

  const linkDensity = getLinkDensity(element);
  if (linkDensity > 0.5) score -= 30;
  else if (linkDensity > 0.3) score -= 15;

  // className is not always a string (e.g. on SVG elements), so guard before using it.
  const className = typeof element.className === "string" ? element.className : "";
  const classAndId = `${className} ${element.id || ""}`;

  if (/article|content|post|body|main|entry/i.test(classAndId)) score += 25;

  // A bare /ad/ alternative would penalise any name containing those two
  // letters ("heading", "loading", "download"); ad tokens are therefore
  // matched separately and only as delimited words.
  const namesPageChrome = /comment|sidebar|footer|nav|menu|header|widget/i.test(classAndId);
  const namesAdvert = AD_TOKEN_PATTERN.test(classAndId) || AD_CAMEL_PATTERN.test(classAndId);
  if (namesPageChrome || namesAdvert) score -= 25;

  return score;
}
/**
 * Decides whether an element reads like a navigation block rather than prose.
 *
 * @param element - Element to inspect.
 * @returns `true` when more than half of the text is link text, or when the
 *   element has more than five list items and its link count exceeds 80% of
 *   its list-item count.
 */
function looksLikeNavigation(element: Element): boolean {
  if (getLinkDensity(element) > 0.5) {
    return true;
  }

  const itemCount = element.querySelectorAll("li").length;
  const anchorCount = element.querySelectorAll("a").length;
  const isLongList = itemCount > 5;
  const isMostlyLinks = anchorCount > itemCount * 0.8;
  return isLongList && isMostlyLinks;
}
/**
 * Detaches every element that matches any of the given selectors.
 *
 * @param document - Document to prune in place.
 * @param selectors - CSS selectors, processed in order. A selector that
 *   throws while being queried or applied is skipped.
 */
function removeElements(document: Document, selectors: string[]): void {
  selectors.forEach((selector) => {
    try {
      const matches = document.querySelectorAll(selector);
      matches.forEach((match: Element) => {
        match.remove();
      });
    } catch {
      // Ignore unsupported selectors from linkedom/jsdom differences.
    }
  });
}
/**
 * Removes elements matching `selectorsToRemove`, sparing any element that
 * itself matches a protected selector or has a descendant that does.
 *
 * @param document - Document to prune in place.
 * @param selectorsToRemove - CSS selectors whose matches are removal candidates.
 * @param protectedSelectors - CSS selectors marking content that must survive.
 *   A protected selector that throws is treated as "no match".
 */
function removeWithProtection(
  document: Document,
  selectorsToRemove: string[],
  protectedSelectors: string[]
): void {
  // Evaluates a DOM predicate, mapping a thrown selector error to `false`.
  const guarded = (predicate: () => boolean): boolean => {
    try {
      return predicate();
    } catch {
      return false;
    }
  };

  const matchesProtected = (candidate: Element): boolean =>
    protectedSelectors.some((guard) => guarded(() => candidate.matches(guard)));

  const wrapsProtected = (candidate: Element): boolean =>
    protectedSelectors.some((guard) => guarded(() => candidate.querySelector(guard) !== null));

  selectorsToRemove.forEach((selector) => {
    try {
      document.querySelectorAll(selector).forEach((candidate: Element) => {
        // Self-match is checked first; descendants are only searched if it fails.
        if (matchesProtected(candidate) || wrapsProtected(candidate)) {
          return;
        }
        candidate.remove();
      });
    } catch {
      // Ignore unsupported selectors from linkedom/jsdom differences.
    }
  });
}
/**
 * Locates the element most likely to hold the page's main content.
 *
 * Strategies are tried in order and the first hit wins:
 *  1. `<main>`, then `[role="main"]`, each requiring link density below 0.4.
 *  2. A lone `<article>` (only when the document has exactly one).
 *  3. Well-known content ids/classes, each requiring link density below 0.4.
 *  4. The best-scoring `div`/`section`/`article` with at least 200 characters
 *     of trimmed text, accepted only if its score exceeds 20.
 *
 * Every candidate from steps 1-3 must also have at least 100 characters of
 * trimmed text and must not look like navigation.
 *
 * @param document - Document to search.
 * @returns The chosen element, or `null` when no strategy succeeds.
 */
function findMainContent(document: Document): Element | null {
  const hasSubstance = (element: Element | null): element is Element => {
    if (!element) return false;
    const trimmedLength = (element.textContent || "").trim().length;
    return trimmedLength >= 100 && !looksLikeNavigation(element);
  };

  const isProseContainer = (element: Element | null): element is Element =>
    hasSubstance(element) && getLinkDensity(element) < 0.4;

  const mainLandmark = document.querySelector("main");
  if (isProseContainer(mainLandmark)) return mainLandmark;

  const mainRole = document.querySelector('[role="main"]');
  if (isProseContainer(mainRole)) return mainRole;

  const articles = document.querySelectorAll("article");
  if (articles.length === 1) {
    const soleArticle = articles[0] ?? null;
    if (hasSubstance(soleArticle)) return soleArticle;
  }

  const contentSelectors = [
    "#content",
    "#main-content",
    "#main",
    ".content",
    ".main-content",
    ".post-content",
    ".article-content",
    ".entry-content",
    ".page-content",
    ".article-body",
    ".post-body",
    ".story-content",
    ".blog-content",
  ];
  for (const selector of contentSelectors) {
    try {
      const hit = document.querySelector(selector);
      if (isProseContainer(hit)) {
        return hit;
      }
    } catch {
      // Ignore invalid selectors.
    }
  }

  // Fallback: keep the first container with the highest positive score.
  let bestElement: Element | null = null;
  let bestScore = 0;
  for (const container of Array.from(document.querySelectorAll("div, section, article"))) {
    if ((container.textContent || "").trim().length < 200) continue;
    const score = getContentScore(container);
    if (score > bestScore) {
      bestElement = container;
      bestScore = score;
    }
  }

  return bestScore > 20 ? bestElement : null;
}
/**
 * Strips inline `data:` image payloads from the document.
 *
 * Removes `<img>` elements whose `src` starts with `data:`, deletes inline
 * `background` / `background-image` declarations that reference a
 * `data:image` URL (dropping the `style` attribute if nothing else remains),
 * and removes `<source>` elements whose `src` starts with, or whose `srcset`
 * contains, `data:`.
 *
 * @param document - Document to modify in place.
 */
function removeBase64ImagesFromDocument(document: Document): void {
  const detachAll = (selector: string): void => {
    document.querySelectorAll(selector).forEach((node: Element) => {
      node.remove();
    });
  };

  detachAll("img[src^='data:']");

  const dataBackground = /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi;
  document.querySelectorAll("[style*='data:image']").forEach((node: Element) => {
    const inlineStyle = node.getAttribute("style");
    if (!inlineStyle) return;

    const remaining = inlineStyle.replace(dataBackground, "");
    if (remaining.trim() === "") {
      // Nothing but the data background was declared; drop the attribute entirely.
      node.removeAttribute("style");
    } else {
      node.setAttribute("style", remaining);
    }
  });

  detachAll("source[src^='data:'], source[srcset*='data:']");
}
/**
 * Resolves `value` against `baseUrl`.
 *
 * @param value - URL reference taken from an attribute.
 * @param baseUrl - URL to resolve relative references against.
 * @returns The serialized absolute URL, or `null` when the `URL` constructor
 *   rejects the input.
 */
function makeAbsoluteUrl(value: string, baseUrl: string): string | null {
  try {
    return new URL(value, baseUrl).toString();
  } catch {
    return null;
  }
}

/**
 * Rewrites relative `src` and `href` attribute values to absolute URLs.
 *
 * Values left untouched:
 *  - `src`: `http://` / `https://` URLs, protocol-relative `//…`, `data:` URLs.
 *  - `href`: `http://` / `https://` URLs, protocol-relative `//…`, `#fragment`
 *    links, and `mailto:`, `tel:`, `javascript:` links.
 *
 * Scheme checks are case-insensitive and require the full `http://` or
 * `https://` prefix, so relative paths that merely begin with the letters
 * "http" (e.g. `httpdocs/logo.png`) are resolved like any other relative path.
 * Values that cannot be resolved are left unchanged.
 *
 * @param document - Document whose attributes are rewritten in place.
 * @param baseUrl - URL that relative references are resolved against.
 */
function convertRelativeUrls(document: Document, baseUrl: string): void {
  const absolutize = (attribute: "src" | "href", alreadyFinal: RegExp): void => {
    document.querySelectorAll(`[${attribute}]`).forEach((element: Element) => {
      const value = element.getAttribute(attribute);
      if (!value || alreadyFinal.test(value)) return;
      const absolute = makeAbsoluteUrl(value, baseUrl);
      if (absolute) element.setAttribute(attribute, absolute);
    });
  };

  absolutize("src", /^(?:https?:\/\/|\/\/|data:)/i);
  absolutize("href", /^(?:https?:\/\/|\/\/|#|mailto:|tel:|javascript:)/i);
}
/**
 * Parses an HTML string, prunes non-content markup, and returns the cleaned HTML.
 *
 * Processing order:
 *  1. Remove always-unwanted and overlay elements, then ads (if `removeAds`)
 *     and caller-supplied `excludeTags`.
 *  2. If `onlyMainContent`: remove navigation-like elements that do not hold
 *     protected content, then replace the body with a clone of the detected
 *     main content when one is found.
 *  3. If `includeTags` match anything: replace the body with clones of the matches.
 *  4. Strip `data:` images (if `removeBase64Images`), delete comment nodes,
 *     and rewrite relative URLs against `baseUrl`.
 *
 * @param html - Raw HTML source.
 * @param baseUrl - URL used to resolve relative `src` / `href` values.
 * @param options - Cleaning switches; the boolean flags default to `true`.
 * @returns The serialized document element, or the original `html` when the
 *   serialization is empty or the document has no root element.
 */
export function cleanHtml(html: string, baseUrl: string, options: CleaningOptions = {}): string {
  const {
    removeAds = true,
    removeBase64Images = true,
    onlyMainContent = true,
    includeTags,
    excludeTags,
  } = options;

  const { document } = parseHTML(html);

  // Unconditional passes first, then the optional ones, in a fixed order.
  const removalPasses: string[][] = [ALWAYS_REMOVE_SELECTORS, OVERLAY_SELECTORS];
  if (removeAds) {
    removalPasses.push(AD_SELECTORS);
  }
  if (excludeTags?.length) {
    removalPasses.push(excludeTags);
  }
  removalPasses.forEach((pass) => removeElements(document, pass));

  // Empties the body and appends the given (already detached) nodes.
  const fillBody = (nodes: Element[]): void => {
    if (!document.body) return;
    document.body.innerHTML = "";
    nodes.forEach((node) => document.body?.appendChild(node));
  };

  if (onlyMainContent) {
    removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);
    const mainContent = findMainContent(document);
    if (mainContent && document.body) {
      // Clone before clearing: the original node lives inside the body being emptied.
      const mainClone = mainContent.cloneNode(true) as Element;
      fillBody([mainClone]);
    }
  }

  if (includeTags?.length && document.body) {
    const includedClones: Element[] = [];
    includeTags.forEach((selector) => {
      try {
        document.querySelectorAll(selector).forEach((match: Element) => {
          includedClones.push(match.cloneNode(true) as Element);
        });
      } catch {
        // Ignore invalid selectors.
      }
    });
    // Only replace the body when at least one selector matched something.
    if (includedClones.length > 0) {
      fillBody(includedClones);
    }
  }

  if (removeBase64Images) {
    removeBase64ImagesFromDocument(document);
  }

  // 128 is the numeric value of NodeFilter.SHOW_COMMENT; comments are collected
  // first and detached afterwards so the walk is not disturbed by removals.
  const commentWalker = document.createTreeWalker(document, 128);
  const commentNodes: Node[] = [];
  while (commentWalker.nextNode()) {
    commentNodes.push(commentWalker.currentNode);
  }
  for (const commentNode of commentNodes) {
    commentNode.parentNode?.removeChild(commentNode);
  }

  convertRelativeUrls(document, baseUrl);

  const serialized = document.documentElement?.outerHTML;
  return serialized || html;
}
/**
 * Alias for `cleanHtml`: forwards all arguments unchanged and returns its result.
 *
 * @param html - Raw HTML source.
 * @param baseUrl - URL used to resolve relative `src` / `href` values.
 * @param options - Cleaning switches forwarded to `cleanHtml`.
 * @returns The cleaned HTML string produced by `cleanHtml`.
 */
export function cleanContent(html: string, baseUrl: string, options: CleaningOptions = {}): string {
return cleanHtml(html, baseUrl, options);
}