fix(baoyu-post-to-wechat): decode HTML entities and strip tags from article summary
Add cleanSummaryText() to the baoyu-md package: it decodes HTML entities (&amp;, &lt;, &#x...;, etc.) and strips HTML tags before the frontmatter description/summary is used as the WeChat article digest.
This commit is contained in:
parent
517ff566a1
commit
990fea4f7b
|
|
@ -2,6 +2,7 @@ import assert from "node:assert/strict";
|
|||
import test from "node:test";
|
||||
|
||||
import {
|
||||
cleanSummaryText,
|
||||
extractSummaryFromBody,
|
||||
extractTitleFromMarkdown,
|
||||
parseFrontmatter,
|
||||
|
|
@ -91,3 +92,19 @@ This is **the first paragraph** with [a link](https://example.com) and \`inline
|
|||
"This is the first paragraph with a link and inline code that should...",
|
||||
);
|
||||
});
|
||||
|
||||
// Verifies that a raw HTML paragraph in the markdown body is reduced to plain
// text by extractSummaryFromBody, and that cleanSummaryText both decodes
// entity-encoded markup and strips the resulting tags.
test("summary extraction normalizes raw HTML paragraphs to plain text", () => {
  const summary = extractSummaryFromBody(
    `
# Heading
<p style="font-size: 16px; color: #666; margin-bottom: 20px;">2026年初,一只“龙虾”搅动了整个科技圈。腾讯楼下排起近千人长队,只为让工程师领取一份福利。</p>
`,
    120,
  );

  assert.equal(
    summary,
    "2026年初,一只“龙虾”搅动了整个科技圈。腾讯楼下排起近千人长队,只为让工程师领取一份福利。",
  );
  // The input is entity-encoded on purpose: named (&lt; &gt; &nbsp;) and
  // numeric (&#39;) references must be decoded before the tags are removed.
  assert.equal(
    cleanSummaryText("&lt;strong&gt;Good&nbsp;text!&#39;&lt;/strong&gt;"),
    "Good text!'",
  );
});
|
||||
|
|
|
|||
|
|
@ -46,6 +46,45 @@ export function stripWrappingQuotes(value: string): string {
|
|||
return value.trim();
|
||||
}
|
||||
|
||||
/**
 * Named character references understood by decodeHtmlEntities, keyed by the
 * lower-cased entity name (without the leading "&" and trailing ";").
 */
const HTML_ENTITIES: Record<string, string> = {
  amp: "&",
  apos: "'",
  gt: ">",
  lt: "<",
  nbsp: " ",
  quot: '"',
};

/**
 * Converts the numeric value of a character reference into its character.
 *
 * @param codePoint - Parsed value of the reference; may be NaN when parsing failed.
 * @param fallback - Text returned unchanged when the value cannot be decoded.
 * @returns The decoded character, or `fallback` when `codePoint` is not a
 *   usable Unicode scalar value (NaN, NUL, a surrogate, or above U+10FFFF).
 */
function decodeHtmlCodePoint(codePoint: number, fallback: string): string {
  // Surrogates would yield an ill-formed string and NUL is never meaningful in
  // summary text, so both are left as the original entity text.
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (!Number.isFinite(codePoint) || codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) {
    return fallback;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Replaces HTML character references in `value` with the characters they denote.
 *
 * Handles hexadecimal (`&#x4E2D;`), decimal (`&#39;`) and the named references
 * listed in HTML_ENTITIES (matched case-insensitively). Anything that is not
 * recognised is left exactly as written.
 *
 * @param value - Text that may contain character references.
 * @returns The text with every recognised reference decoded.
 */
function decodeHtmlEntities(value: string): string {
  // Decimal references only accept decimal digits; a shared hex digit class
  // would let "&#12ab;" be parsed as code point 12.
  return value.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (entity: string, body: string) => {
    const normalized = body.toLowerCase();
    if (normalized.startsWith("#x")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(2), 16), entity);
    }
    if (normalized.startsWith("#")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(1), 10), entity);
    }
    // Own-property check: a plain index lookup would also find inherited
    // members such as "constructor" and substitute a stringified function.
    if (!Object.prototype.hasOwnProperty.call(HTML_ENTITIES, normalized)) {
      return entity;
    }
    return HTML_ENTITIES[normalized] ?? entity;
  });
}
|
||||
|
||||
/**
 * Reduces a summary/description string that may contain HTML to plain text.
 *
 * The value is passed through stripWrappingQuotes and decodeHtmlEntities
 * first, so entity-encoded markup (`&lt;p&gt;`) is treated like literal markup.
 * Then comments, `<script>`/`<style>` elements (including their content) and
 * all remaining tags are replaced by a space, whitespace runs are collapsed to
 * a single space, and the result is trimmed.
 *
 * @param value - Raw summary text, e.g. a frontmatter description.
 * @returns The plain-text summary; empty when nothing but markup was present.
 */
export function cleanSummaryText(value: string): string {
  return decodeHtmlEntities(stripWrappingQuotes(value))
    // Comments such as "<!-- more -->" carry no readable text.
    .replace(/<!--[\s\S]*?-->/g, " ")
    // Drop script/style elements together with their content.
    .replace(/<script\b[\s\S]*?<\/script>/gi, " ")
    .replace(/<style\b[\s\S]*?<\/style>/gi, " ")
    .replace(/<br\s*\/?>/gi, " ")
    // Any other tag; the "\s*\/" branch covers self-closing tags written
    // without whitespace, e.g. "<hr/>" or "<img/>".
    .replace(/<\/?[a-z][a-z0-9:-]*(?:\s+[^>]*|\s*\/)?>/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
||||
|
||||
export function toFrontmatterString(value: unknown): string | undefined {
|
||||
if (typeof value === "string") {
|
||||
return stripWrappingQuotes(value);
|
||||
|
|
@ -94,10 +133,11 @@ export function extractSummaryFromBody(body: string, maxLen: number): string {
|
|||
.replace(/\*(.+?)\*/g, "$1")
|
||||
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
||||
.replace(/`([^`]+)`/g, "$1");
|
||||
const summaryText = cleanSummaryText(cleanText);
|
||||
|
||||
if (cleanText.length > 20) {
|
||||
if (cleanText.length <= maxLen) return cleanText;
|
||||
return `${cleanText.slice(0, maxLen - 3)}...`;
|
||||
if (summaryText.length > 20) {
|
||||
if (summaryText.length <= maxLen) return summaryText;
|
||||
return `${summaryText.slice(0, maxLen - 3)}...`;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -39,6 +39,22 @@ test("buildHtmlDocument includes optional meta tags and code theme CSS", () => {
|
|||
assert.match(html, /<article>Hello<\/article>/);
|
||||
});
|
||||
|
||||
// Verifies that buildHtmlDocument HTML-escapes the title, author and
// description before writing them into <head>: the raw "<", ">", '"' and "&"
// characters of the inputs must appear as entities in the output.
test("buildHtmlDocument escapes head metadata attributes", () => {
  const html = buildHtmlDocument(
    {
      title: `Doc <draft>`,
      author: `Bao"yu`,
      description: `<p style="color: red">Summary & notes</p>`,
    },
    "",
    "",
  );

  assert.match(html, /<title>Doc &lt;draft&gt;<\/title>/);
  assert.match(html, /meta name="author" content="Bao&quot;yu"/);
  assert.match(
    html,
    /meta name="description" content="&lt;p style=&quot;color: red&quot;&gt;Summary &amp; notes&lt;\/p&gt;"/,
  );
});
|
||||
|
||||
test("normalizeCssText and normalizeInlineCss replace variables and strip declarations", () => {
|
||||
const rawCss = `
|
||||
:root { --md-primary-color: #000; --md-font-size: 12px; --foreground: 0 0% 5%; }
|
||||
|
|
|
|||
|
|
@ -45,19 +45,24 @@ export function loadCodeThemeCss(themeName: string): string {
|
|||
}
|
||||
|
||||
export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string {
|
||||
// Escapes the characters that are unsafe in HTML text and in double-quoted
// attribute values. "&" is replaced first so the entities produced by the
// later replacements are not escaped a second time.
const escapeHtmlAttribute = (value: string): string => value
  .replace(/&/g, "&amp;")
  .replace(/"/g, "&quot;")
  .replace(/</g, "&lt;")
  .replace(/>/g, "&gt;");
|
||||
const lines = [
|
||||
"<!doctype html>",
|
||||
"<html>",
|
||||
"<head>",
|
||||
' <meta charset="utf-8" />',
|
||||
' <meta name="viewport" content="width=device-width, initial-scale=1" />',
|
||||
` <title>${meta.title}</title>`,
|
||||
` <title>${escapeHtmlAttribute(meta.title)}</title>`,
|
||||
];
|
||||
if (meta.author) {
|
||||
lines.push(` <meta name="author" content="${meta.author}" />`);
|
||||
lines.push(` <meta name="author" content="${escapeHtmlAttribute(meta.author)}" />`);
|
||||
}
|
||||
if (meta.description) {
|
||||
lines.push(` <meta name="description" content="${meta.description}" />`);
|
||||
lines.push(` <meta name="description" content="${escapeHtmlAttribute(meta.description)}" />`);
|
||||
}
|
||||
lines.push(` <style>${css}</style>`);
|
||||
if (codeThemeCss) {
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ import path from "node:path";
|
|||
import process from "node:process";
|
||||
|
||||
import {
|
||||
cleanSummaryText,
|
||||
extractSummaryFromBody,
|
||||
extractTitleFromMarkdown,
|
||||
parseFrontmatter,
|
||||
|
|
@ -47,8 +48,9 @@ export async function convertMarkdown(
|
|||
}
|
||||
|
||||
const author = stripWrappingQuotes(frontmatter.author ?? "");
|
||||
let summary = stripWrappingQuotes(frontmatter.description ?? "")
|
||||
const frontmatterSummary = stripWrappingQuotes(frontmatter.description ?? "")
|
||||
|| stripWrappingQuotes(frontmatter.summary ?? "");
|
||||
let summary = cleanSummaryText(frontmatterSummary);
|
||||
if (!summary) {
|
||||
summary = extractSummaryFromBody(body, 120);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -46,6 +46,45 @@ export function stripWrappingQuotes(value: string): string {
|
|||
return value.trim();
|
||||
}
|
||||
|
||||
/**
 * Named character references understood by decodeHtmlEntities, keyed by the
 * lower-cased entity name (without the leading "&" and trailing ";").
 */
const HTML_ENTITIES: Record<string, string> = {
  amp: "&",
  apos: "'",
  gt: ">",
  lt: "<",
  nbsp: " ",
  quot: '"',
};

/**
 * Converts the numeric value of a character reference into its character.
 *
 * @param codePoint - Parsed value of the reference; may be NaN when parsing failed.
 * @param fallback - Text returned unchanged when the value cannot be decoded.
 * @returns The decoded character, or `fallback` when `codePoint` is not a
 *   usable Unicode scalar value (NaN, NUL, a surrogate, or above U+10FFFF).
 */
function decodeHtmlCodePoint(codePoint: number, fallback: string): string {
  // Surrogates would yield an ill-formed string and NUL is never meaningful in
  // summary text, so both are left as the original entity text.
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (!Number.isFinite(codePoint) || codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) {
    return fallback;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Replaces HTML character references in `value` with the characters they denote.
 *
 * Handles hexadecimal (`&#x4E2D;`), decimal (`&#39;`) and the named references
 * listed in HTML_ENTITIES (matched case-insensitively). Anything that is not
 * recognised is left exactly as written.
 *
 * @param value - Text that may contain character references.
 * @returns The text with every recognised reference decoded.
 */
function decodeHtmlEntities(value: string): string {
  // Decimal references only accept decimal digits; a shared hex digit class
  // would let "&#12ab;" be parsed as code point 12.
  return value.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (entity: string, body: string) => {
    const normalized = body.toLowerCase();
    if (normalized.startsWith("#x")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(2), 16), entity);
    }
    if (normalized.startsWith("#")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(1), 10), entity);
    }
    // Own-property check: a plain index lookup would also find inherited
    // members such as "constructor" and substitute a stringified function.
    if (!Object.prototype.hasOwnProperty.call(HTML_ENTITIES, normalized)) {
      return entity;
    }
    return HTML_ENTITIES[normalized] ?? entity;
  });
}
|
||||
|
||||
/**
 * Reduces a summary/description string that may contain HTML to plain text.
 *
 * The value is passed through stripWrappingQuotes and decodeHtmlEntities
 * first, so entity-encoded markup (`&lt;p&gt;`) is treated like literal markup.
 * Then comments, `<script>`/`<style>` elements (including their content) and
 * all remaining tags are replaced by a space, whitespace runs are collapsed to
 * a single space, and the result is trimmed.
 *
 * @param value - Raw summary text, e.g. a frontmatter description.
 * @returns The plain-text summary; empty when nothing but markup was present.
 */
export function cleanSummaryText(value: string): string {
  return decodeHtmlEntities(stripWrappingQuotes(value))
    // Comments such as "<!-- more -->" carry no readable text.
    .replace(/<!--[\s\S]*?-->/g, " ")
    // Drop script/style elements together with their content.
    .replace(/<script\b[\s\S]*?<\/script>/gi, " ")
    .replace(/<style\b[\s\S]*?<\/style>/gi, " ")
    .replace(/<br\s*\/?>/gi, " ")
    // Any other tag; the "\s*\/" branch covers self-closing tags written
    // without whitespace, e.g. "<hr/>" or "<img/>".
    .replace(/<\/?[a-z][a-z0-9:-]*(?:\s+[^>]*|\s*\/)?>/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
||||
|
||||
export function toFrontmatterString(value: unknown): string | undefined {
|
||||
if (typeof value === "string") {
|
||||
return stripWrappingQuotes(value);
|
||||
|
|
@ -94,10 +133,11 @@ export function extractSummaryFromBody(body: string, maxLen: number): string {
|
|||
.replace(/\*(.+?)\*/g, "$1")
|
||||
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
||||
.replace(/`([^`]+)`/g, "$1");
|
||||
const summaryText = cleanSummaryText(cleanText);
|
||||
|
||||
if (cleanText.length > 20) {
|
||||
if (cleanText.length <= maxLen) return cleanText;
|
||||
return `${cleanText.slice(0, maxLen - 3)}...`;
|
||||
if (summaryText.length > 20) {
|
||||
if (summaryText.length <= maxLen) return summaryText;
|
||||
return `${summaryText.slice(0, maxLen - 3)}...`;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -45,19 +45,24 @@ export function loadCodeThemeCss(themeName: string): string {
|
|||
}
|
||||
|
||||
export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string {
|
||||
// Escapes the characters that are unsafe in HTML text and in double-quoted
// attribute values. "&" is replaced first so the entities produced by the
// later replacements are not escaped a second time.
const escapeHtmlAttribute = (value: string): string => value
  .replace(/&/g, "&amp;")
  .replace(/"/g, "&quot;")
  .replace(/</g, "&lt;")
  .replace(/>/g, "&gt;");
|
||||
const lines = [
|
||||
"<!doctype html>",
|
||||
"<html>",
|
||||
"<head>",
|
||||
' <meta charset="utf-8" />',
|
||||
' <meta name="viewport" content="width=device-width, initial-scale=1" />',
|
||||
` <title>${meta.title}</title>`,
|
||||
` <title>${escapeHtmlAttribute(meta.title)}</title>`,
|
||||
];
|
||||
if (meta.author) {
|
||||
lines.push(` <meta name="author" content="${meta.author}" />`);
|
||||
lines.push(` <meta name="author" content="${escapeHtmlAttribute(meta.author)}" />`);
|
||||
}
|
||||
if (meta.description) {
|
||||
lines.push(` <meta name="description" content="${meta.description}" />`);
|
||||
lines.push(` <meta name="description" content="${escapeHtmlAttribute(meta.description)}" />`);
|
||||
}
|
||||
lines.push(` <style>${css}</style>`);
|
||||
if (codeThemeCss) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue