import { parseHTML } from "linkedom";

/** Options controlling which parts of a document {@link cleanHtml} strips or keeps. */
export interface CleaningOptions {
  /** Remove elements matching the ad/tracking selectors. Defaults to `true`. */
  removeAds?: boolean;
  /** Remove inline `data:` images and `data:image` backgrounds. Defaults to `true`. */
  removeBase64Images?: boolean;
  /** Strip navigation chrome and reduce the body to the detected main content. Defaults to `true`. */
  onlyMainContent?: boolean;
  /** CSS selectors; when any element matches, the body is replaced by clones of the matches. */
  includeTags?: string[];
  /** CSS selectors whose matching elements are removed. */
  excludeTags?: string[];
}

/** Elements that are always removed: scripts, styles, hidden nodes, embeds and form controls. */
const ALWAYS_REMOVE_SELECTORS = [
  "script",
  "style",
  "noscript",
  "link[rel='stylesheet']",
  "[hidden]",
  "[aria-hidden='true']",
  "[style*='display: none']",
  "[style*='display:none']",
  "[style*='visibility: hidden']",
  "[style*='visibility:hidden']",
  "svg[aria-hidden='true']",
  "svg.icon",
  "svg[class*='icon']",
  "template",
  "meta",
  "iframe",
  "canvas",
  "object",
  "embed",
  "form",
  "input",
  "select",
  "textarea",
  "button",
];

/** Modals, popups, cookie/consent banners and fixed/sticky positioned elements. */
const OVERLAY_SELECTORS = [
  "[class*='modal']",
  "[class*='popup']",
  "[class*='overlay']",
  "[class*='dialog']",
  "[role='dialog']",
  "[role='alertdialog']",
  "[class*='cookie']",
  "[class*='consent']",
  "[class*='gdpr']",
  "[class*='privacy-banner']",
  "[class*='notification-bar']",
  "[id*='cookie']",
  "[id*='consent']",
  "[id*='gdpr']",
  "[style*='position: fixed']",
  "[style*='position:fixed']",
  "[style*='position: sticky']",
  "[style*='position:sticky']",
];

/** Page chrome removed when only the main content is requested. */
const NAVIGATION_SELECTORS = [
  "header",
  "footer",
  "nav",
  "aside",
  ".header",
  ".top",
  ".navbar",
  "#header",
  ".footer",
  ".bottom",
  "#footer",
  ".sidebar",
  ".side",
  ".aside",
  "#sidebar",
  ".modal",
  ".popup",
  "#modal",
  ".overlay",
  ".ad",
  ".ads",
  ".advert",
  "#ad",
  ".lang-selector",
  ".language",
  "#language-selector",
  ".social",
  ".social-media",
  ".social-links",
  "#social",
  ".menu",
  ".navigation",
  "#nav",
  ".breadcrumbs",
  "#breadcrumbs",
  ".share",
  "#share",
  ".widget",
  "#widget",
  ".cookie",
  "#cookie",
];

/** Elements matching (or containing a match for) these are never removed as navigation. */
const FORCE_INCLUDE_SELECTORS = [
  "#main",
  "#content",
  "#main-content",
  "#article",
  "#post",
  "#page-content",
  "main",
  "article",
  "[role='main']",
  ".main-content",
  ".content",
  ".post-content",
  ".article-content",
  ".entry-content",
  ".page-content",
  ".article-body",
  ".post-body",
  ".story-content",
  ".blog-content",
];

/** Ad containers and tracking-pixel images removed when `removeAds` is enabled. */
const AD_SELECTORS = [
  "ins.adsbygoogle",
  ".google-ad",
  ".adsense",
  "[data-ad]",
  "[data-ads]",
  "[data-ad-slot]",
  "[data-ad-client]",
  ".ad-container",
  ".ad-wrapper",
  ".advertisement",
  ".sponsored-content",
  "img[width='1'][height='1']",
  "img[src*='pixel']",
  "img[src*='tracking']",
  "img[src*='analytics']",
];

/** Class/id fragments that suggest an element holds primary content. */
const POSITIVE_HINT_PATTERN = /article|content|post|body|main|entry/i;

/**
 * Class/id fragments that suggest boilerplate. The ad alternative is anchored
 * to a token boundary so that names merely containing the letters "ad"
 * (e.g. "lazyload", "heading", "thread") are not penalised.
 */
const NEGATIVE_HINT_PATTERN =
  /comment|sidebar|footer|nav|menu|header|widget|(?:^|[^a-z0-9])ad(?:s|vert[a-z]*|sense)?(?![a-z0-9])/i;

/** Matches values that start with a URL scheme such as `https:`, `data:` or `mailto:`. */
const URL_SCHEME_PATTERN = /^[a-z][a-z0-9+.-]*:/i;

/** Numeric value of `NodeFilter.SHOW_COMMENT`, used as the TreeWalker `whatToShow` mask. */
const SHOW_COMMENT = 128;

/**
 * Reports whether the element is the `<html>` or `<body>` element.
 *
 * Broad attribute selectors (e.g. `[class*='modal']`) can match these when a
 * page puts state classes such as `modal-open` on them; removing them would
 * discard the entire document, so removal helpers skip them.
 */
function isDocumentRoot(element: Element): boolean {
  const tagName = (element.tagName || "").toUpperCase();
  return tagName === "HTML" || tagName === "BODY";
}

/**
 * Computes the share of an element's trimmed text that sits inside `<a>` descendants.
 *
 * @returns A ratio of link text length to total text length; `1` when the element has no text.
 */
function getLinkDensity(element: Element): number {
  const text = element.textContent || "";
  const textLength = text.trim().length;
  if (textLength === 0) return 1;

  let linkLength = 0;
  element.querySelectorAll("a").forEach((link: Element) => {
    linkLength += (link.textContent || "").trim().length;
  });
  return linkLength / textLength;
}

/**
 * Scores how likely an element is to be the main content container.
 *
 * Text length (capped), paragraphs, headings and images raise the score;
 * links, list items, high link density and boilerplate-looking class/id
 * names lower it.
 */
function getContentScore(element: Element): number {
  let score = 0;
  const text = element.textContent || "";
  const textLength = text.trim().length;

  score += Math.min(textLength / 100, 50);
  score += element.querySelectorAll("p").length * 3;
  score += element.querySelectorAll("h1, h2, h3, h4, h5, h6").length * 2;
  score += element.querySelectorAll("img").length;
  score -= element.querySelectorAll("a").length * 0.5;
  score -= element.querySelectorAll("li").length * 0.2;

  const linkDensity = getLinkDensity(element);
  if (linkDensity > 0.5) score -= 30;
  else if (linkDensity > 0.3) score -= 15;

  // className is not guaranteed to be a string on every element type, so guard it.
  const className = typeof element.className === "string" ? element.className : "";
  const classAndId = `${className} ${element.id || ""}`;
  if (POSITIVE_HINT_PATTERN.test(classAndId)) score += 25;
  if (NEGATIVE_HINT_PATTERN.test(classAndId)) score -= 25;

  return score;
}

/**
 * Heuristic check for navigation-like blocks: either mostly link text, or a
 * list of more than five items where links outnumber 80% of the items.
 */
function looksLikeNavigation(element: Element): boolean {
  const linkDensity = getLinkDensity(element);
  if (linkDensity > 0.5) return true;

  const listItems = element.querySelectorAll("li");
  const links = element.querySelectorAll("a");
  return listItems.length > 5 && links.length > listItems.length * 0.8;
}

/**
 * Removes every element matching any of the selectors.
 *
 * The `<html>` and `<body>` elements are never removed, and selectors that
 * throw are skipped.
 */
function removeElements(document: Document, selectors: string[]): void {
  for (const selector of selectors) {
    try {
      document.querySelectorAll(selector).forEach((element: Element) => {
        if (isDocumentRoot(element)) return;
        element.remove();
      });
    } catch {
      // Ignore unsupported selectors from linkedom/jsdom differences.
    }
  }
}

/**
 * Removes elements matching `selectorsToRemove` unless the element itself
 * matches, or contains a descendant matching, one of `protectedSelectors`.
 *
 * The `<html>` and `<body>` elements are never removed.
 */
function removeWithProtection(
  document: Document,
  selectorsToRemove: string[],
  protectedSelectors: string[]
): void {
  for (const selector of selectorsToRemove) {
    try {
      document.querySelectorAll(selector).forEach((element: Element) => {
        if (isDocumentRoot(element)) return;

        const isProtected = protectedSelectors.some((protectedSelector) => {
          try {
            return element.matches(protectedSelector);
          } catch {
            return false;
          }
        });
        if (isProtected) return;

        const containsProtected = protectedSelectors.some((protectedSelector) => {
          try {
            return element.querySelector(protectedSelector) !== null;
          } catch {
            return false;
          }
        });
        if (containsProtected) return;

        element.remove();
      });
    } catch {
      // Ignore unsupported selectors from linkedom/jsdom differences.
    }
  }
}

/**
 * Locates the element most likely to hold the page's main content.
 *
 * Tries, in order: `<main>`, `[role="main"]`, a sole `<article>`, a list of
 * well-known content ids/classes, and finally the best-scoring
 * `div`/`section`/`article` container.
 *
 * @returns The chosen element, or `null` when nothing qualifies.
 */
function findMainContent(document: Document): Element | null {
  // A candidate needs at least 100 characters of text and must not look like navigation.
  const isValidContent = (element: Element | null): element is Element => {
    if (!element) return false;
    const text = element.textContent || "";
    if (text.trim().length < 100) return false;
    return !looksLikeNavigation(element);
  };

  const main = document.querySelector("main");
  if (isValidContent(main) && getLinkDensity(main) < 0.4) return main;

  const roleMain = document.querySelector('[role="main"]');
  if (isValidContent(roleMain) && getLinkDensity(roleMain) < 0.4) return roleMain;

  // Only a single <article> is accepted here; multiple articles fall through.
  const articles = document.querySelectorAll("article");
  if (articles.length === 1 && isValidContent(articles[0] ?? null)) {
    return articles[0] ?? null;
  }

  const contentSelectors = [
    "#content",
    "#main-content",
    "#main",
    ".content",
    ".main-content",
    ".post-content",
    ".article-content",
    ".entry-content",
    ".page-content",
    ".article-body",
    ".post-body",
    ".story-content",
    ".blog-content",
  ];
  for (const selector of contentSelectors) {
    try {
      const element = document.querySelector(selector);
      if (isValidContent(element) && getLinkDensity(element) < 0.4) {
        return element;
      }
    } catch {
      // Ignore invalid selectors.
    }
  }

  // Fallback: score generic containers with at least 200 characters of text.
  const candidates: Array<{ element: Element; score: number }> = [];
  const containers = document.querySelectorAll("div, section, article");
  containers.forEach((element: Element) => {
    const text = element.textContent || "";
    if (text.trim().length < 200) return;
    const score = getContentScore(element);
    if (score > 0) {
      candidates.push({ element, score });
    }
  });

  candidates.sort((left, right) => right.score - left.score);
  if ((candidates[0]?.score ?? 0) > 20) {
    return candidates[0]?.element ?? null;
  }
  return null;
}

/**
 * Strips inline base64 payloads: `<img>` elements with `data:` sources,
 * `data:image` backgrounds inside `style` attributes, and `<source>`
 * elements whose `src`/`srcset` use `data:`.
 */
function removeBase64ImagesFromDocument(document: Document): void {
  document.querySelectorAll("img[src^='data:']").forEach((element: Element) => {
    element.remove();
  });

  document.querySelectorAll("[style*='data:image']").forEach((element: Element) => {
    const style = element.getAttribute("style");
    if (!style) return;

    const cleanedStyle = style.replace(
      /background(-image)?:\s*url\([^)]*data:image[^)]*\)[^;]*;?/gi,
      ""
    );
    // Drop the attribute entirely when nothing but the background was declared.
    if (cleanedStyle.trim()) {
      element.setAttribute("style", cleanedStyle);
    } else {
      element.removeAttribute("style");
    }
  });

  document
    .querySelectorAll("source[src^='data:'], source[srcset*='data:']")
    .forEach((element: Element) => {
      element.remove();
    });
}

/**
 * Resolves `value` against `baseUrl`.
 *
 * @returns The absolute URL string, or `null` when the URL constructor rejects the input.
 */
function makeAbsoluteUrl(value: string, baseUrl: string): string | null {
  try {
    return new URL(value, baseUrl).toString();
  } catch {
    return null;
  }
}

/**
 * Rewrites relative `src` and `href` attributes to absolute URLs.
 *
 * Values that already carry a scheme (`https:`, `data:`, `mailto:`, ...),
 * protocol-relative values (`//host/...`) and fragment-only hrefs (`#id`)
 * are left untouched. Attributes that cannot be resolved keep their
 * original value.
 */
function convertRelativeUrls(document: Document, baseUrl: string): void {
  document.querySelectorAll("[src]").forEach((element: Element) => {
    const src = element.getAttribute("src");
    if (!src || URL_SCHEME_PATTERN.test(src) || src.startsWith("//")) return;

    const absolute = makeAbsoluteUrl(src, baseUrl);
    if (absolute) element.setAttribute("src", absolute);
  });

  document.querySelectorAll("[href]").forEach((element: Element) => {
    const href = element.getAttribute("href");
    if (
      !href ||
      URL_SCHEME_PATTERN.test(href) ||
      href.startsWith("//") ||
      href.startsWith("#")
    ) {
      return;
    }

    const absolute = makeAbsoluteUrl(href, baseUrl);
    if (absolute) element.setAttribute("href", absolute);
  });
}

/**
 * Parses `html`, strips boilerplate and returns the cleaned document markup.
 *
 * Steps, in order: remove always-unwanted and overlay elements; optionally
 * remove ads and `excludeTags`; optionally reduce the body to the detected
 * main content; optionally replace the body with clones of `includeTags`
 * matches; optionally strip base64 images; remove comments; make relative
 * URLs absolute.
 *
 * @param html - Raw HTML to clean.
 * @param baseUrl - Base URL used to resolve relative `src`/`href` values.
 * @param options - Cleaning switches; see {@link CleaningOptions} for defaults.
 * @returns The serialized document element, or the input `html` when that
 *   serialization is empty or unavailable.
 */
export function cleanHtml(html: string, baseUrl: string, options: CleaningOptions = {}): string {
  const {
    removeAds = true,
    removeBase64Images = true,
    onlyMainContent = true,
    includeTags,
    excludeTags,
  } = options;

  const { document } = parseHTML(html);

  removeElements(document, ALWAYS_REMOVE_SELECTORS);
  removeElements(document, OVERLAY_SELECTORS);

  if (removeAds) {
    removeElements(document, AD_SELECTORS);
  }

  if (excludeTags?.length) {
    removeElements(document, excludeTags);
  }

  if (onlyMainContent) {
    removeWithProtection(document, NAVIGATION_SELECTORS, FORCE_INCLUDE_SELECTORS);

    const mainContent = findMainContent(document);
    if (mainContent && document.body) {
      // Clone before clearing the body: the original node lives inside it.
      const clone = mainContent.cloneNode(true) as Element;
      document.body.innerHTML = "";
      document.body.appendChild(clone);
    }
  }

  if (includeTags?.length && document.body) {
    const matchedElements: Element[] = [];
    for (const selector of includeTags) {
      try {
        document.querySelectorAll(selector).forEach((element: Element) => {
          matchedElements.push(element.cloneNode(true) as Element);
        });
      } catch {
        // Ignore invalid selectors.
      }
    }

    // When nothing matched, the body is left as it is.
    if (matchedElements.length > 0) {
      document.body.innerHTML = "";
      matchedElements.forEach((element) => document.body?.appendChild(element));
    }
  }

  if (removeBase64Images) {
    removeBase64ImagesFromDocument(document);
  }

  // Collect comments first, then detach them, so the walker never iterates a mutating tree.
  const walker = document.createTreeWalker(document, SHOW_COMMENT);
  const comments: Node[] = [];
  while (walker.nextNode()) {
    comments.push(walker.currentNode);
  }
  comments.forEach((comment) => comment.parentNode?.removeChild(comment));

  convertRelativeUrls(document, baseUrl);

  return document.documentElement?.outerHTML || html;
}

/**
 * Alias of {@link cleanHtml}; forwards all arguments unchanged.
 */
export function cleanContent(html: string, baseUrl: string, options: CleaningOptions = {}): string {
  return cleanHtml(html, baseUrl, options);
}