fix(baoyu-post-to-wechat): decode HTML entities and strip tags from article summary

Add cleanSummaryText() to the baoyu-md package: it decodes HTML entities (&amp;, &lt;, &#x..., etc.)
and strips HTML tags before the frontmatter description/summary is used as the WeChat article digest.
This commit is contained in:
Jim Liu 宝玉 2026-04-12 20:16:53 -05:00
parent 517ff566a1
commit 990fea4f7b
7 changed files with 138 additions and 13 deletions

View File

@ -2,6 +2,7 @@ import assert from "node:assert/strict";
import test from "node:test"; import test from "node:test";
import { import {
cleanSummaryText,
extractSummaryFromBody, extractSummaryFromBody,
extractTitleFromMarkdown, extractTitleFromMarkdown,
parseFrontmatter, parseFrontmatter,
@ -91,3 +92,19 @@ This is **the first paragraph** with [a link](https://example.com) and \`inline
"This is the first paragraph with a link and inline code that should...", "This is the first paragraph with a link and inline code that should...",
); );
}); });
// Checks that extractSummaryFromBody yields plain text when the first paragraph of the
// body is a raw HTML <p> element, and that cleanSummaryText removes tags and decodes
// named (&nbsp;, &apos;) and numeric (&#33;) entities.
// NOTE(review): the <p> fixture below contains only "2026" while the expected summary is a
// full sentence — the fixture text looks truncated in this view; confirm against the repository.
test("summary extraction normalizes raw HTML paragraphs to plain text", () => {
const summary = extractSummaryFromBody(
`
# Heading
<p style="font-size: 16px; color: #666; margin-bottom: 20px;">2026</p>
`,
120,
);
// The heading is not expected in the summary; only the paragraph text is.
assert.equal(
summary,
"2026年初一只“龙虾”搅动了整个科技圈。腾讯楼下排起近千人长队只为让工程师领取一份福利。",
);
// Tags are stripped and entities decoded in a single call.
assert.equal(cleanSummaryText("<strong>Good&nbsp;text&#33;&apos;</strong>"), "Good text!'");
});

View File

@ -46,6 +46,45 @@ export function stripWrappingQuotes(value: string): string {
return value.trim(); return value.trim();
} }
/**
 * Named character references recognised by `decodeHtmlEntities`, keyed by the
 * lowercase entity name without the surrounding `&` and `;`.
 */
const HTML_ENTITIES: Record<string, string> = {
  amp: "&",
  apos: "'",
  gt: ">",
  lt: "<",
  nbsp: " ",
  quot: '"',
};

/**
 * Converts the code point of a numeric character reference to a string.
 *
 * @param codePoint - Parsed numeric value of the reference (may be `NaN`).
 * @param fallback - Text returned unchanged when the code point is unusable.
 * @returns The decoded character, or `fallback` when the value is not an
 *   integer, is NUL, is outside the Unicode range, or is a lone surrogate.
 */
function decodeHtmlCodePoint(codePoint: number, fallback: string): string {
  // Surrogates are not scalar values; emitting one alone yields an ill-formed string.
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) {
    return fallback;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Decodes `&name;`, `&#NNN;` and `&#xHHH;` references in `value`.
 *
 * Matching is case-insensitive. References that are unknown, malformed or
 * that decode to an unusable code point are left exactly as written.
 *
 * @param value - Text that may contain HTML character references.
 * @returns `value` with every recognised reference replaced by its character.
 */
function decodeHtmlEntities(value: string): string {
  return value.replace(/&(#x?[0-9a-f]+|[a-z]+);/gi, (entity, body: string) => {
    const normalized = body.toLowerCase();
    if (normalized.startsWith("#x")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(2), 16), entity);
    }
    if (normalized.startsWith("#")) {
      const digits = normalized.slice(1);
      // The pattern admits hex letters after "#"; parseInt would silently stop at
      // the first one ("12ab" -> 12), so reject anything that is not pure decimal.
      if (!/^[0-9]+$/.test(digits)) {
        return entity;
      }
      return decodeHtmlCodePoint(Number.parseInt(digits, 10), entity);
    }
    // Own-property check: a plain index would also find inherited members such
    // as `constructor` and substitute a stringified function into the text.
    const named = Object.prototype.hasOwnProperty.call(HTML_ENTITIES, normalized)
      ? HTML_ENTITIES[normalized]
      : undefined;
    return named ?? entity;
  });
}
/**
 * Reduces summary text that may contain HTML to a single line of plain text.
 *
 * Steps, in order: strip wrapping quotes, decode character references, drop
 * HTML comments and `<script>`/`<style>` elements together with their content,
 * replace every remaining tag with a space, then collapse whitespace and trim.
 *
 * Entities are decoded before tags are removed, so entity-escaped markup such
 * as `&lt;b&gt;` is stripped as well.
 *
 * @param value - Raw summary text.
 * @returns The plain-text summary; an empty string when nothing remains.
 */
export function cleanSummaryText(value: string): string {
  return decodeHtmlEntities(stripWrappingQuotes(value))
    // Comment bodies are markup, not reader-visible text.
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<script\b[\s\S]*?<\/script\s*>/gi, " ")
    .replace(/<style\b[\s\S]*?<\/style\s*>/gi, " ")
    // The `\/` alternative covers self-closing tags written without whitespace
    // (`<br/>`, `<hr/>`, `<img/>`), which would otherwise be left in the text.
    .replace(/<\/?[a-z][a-z0-9:-]*(?:\s+[^>]*|\/)?>/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
export function toFrontmatterString(value: unknown): string | undefined { export function toFrontmatterString(value: unknown): string | undefined {
if (typeof value === "string") { if (typeof value === "string") {
return stripWrappingQuotes(value); return stripWrappingQuotes(value);
@ -94,10 +133,11 @@ export function extractSummaryFromBody(body: string, maxLen: number): string {
.replace(/\*(.+?)\*/g, "$1") .replace(/\*(.+?)\*/g, "$1")
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
.replace(/`([^`]+)`/g, "$1"); .replace(/`([^`]+)`/g, "$1");
const summaryText = cleanSummaryText(cleanText);
if (cleanText.length > 20) { if (summaryText.length > 20) {
if (cleanText.length <= maxLen) return cleanText; if (summaryText.length <= maxLen) return summaryText;
return `${cleanText.slice(0, maxLen - 3)}...`; return `${summaryText.slice(0, maxLen - 3)}...`;
} }
} }

View File

@ -39,6 +39,22 @@ test("buildHtmlDocument includes optional meta tags and code theme CSS", () => {
assert.match(html, /<article>Hello<\/article>/); assert.match(html, /<article>Hello<\/article>/);
}); });
// Metadata containing markup-significant characters (<, >, ", &) must appear
// entity-escaped in the generated <title> and <meta content="..."> values.
test("buildHtmlDocument escapes head metadata attributes", () => {
  const meta = {
    title: `Doc <draft>`,
    author: `Bao"yu`,
    description: `<p style="color: red">Summary & notes</p>`,
  };
  const rendered = buildHtmlDocument(meta, "", "");

  const expectedFragments = [
    /<title>Doc &lt;draft&gt;<\/title>/,
    /meta name="author" content="Bao&quot;yu"/,
    /meta name="description" content="&lt;p style=&quot;color: red&quot;&gt;Summary &amp; notes&lt;\/p&gt;"/,
  ];
  for (const fragment of expectedFragments) {
    assert.match(rendered, fragment);
  }
});
test("normalizeCssText and normalizeInlineCss replace variables and strip declarations", () => { test("normalizeCssText and normalizeInlineCss replace variables and strip declarations", () => {
const rawCss = ` const rawCss = `
:root { --md-primary-color: #000; --md-font-size: 12px; --foreground: 0 0% 5%; } :root { --md-primary-color: #000; --md-font-size: 12px; --foreground: 0 0% 5%; }

View File

@ -45,19 +45,24 @@ export function loadCodeThemeCss(themeName: string): string {
} }
export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string { export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string {
// Entity-escapes &, ", < and > so the value can be placed inside a
// double-quoted attribute or element text without being parsed as markup.
const escapeHtmlAttribute = (value: string) => value.replace(/[&"<>]/g, (ch) => {
  switch (ch) {
    case "&":
      return "&amp;";
    case '"':
      return "&quot;";
    case "<":
      return "&lt;";
    case ">":
      return "&gt;";
    default:
      return ch;
  }
});
const lines = [ const lines = [
"<!doctype html>", "<!doctype html>",
"<html>", "<html>",
"<head>", "<head>",
' <meta charset="utf-8" />', ' <meta charset="utf-8" />',
' <meta name="viewport" content="width=device-width, initial-scale=1" />', ' <meta name="viewport" content="width=device-width, initial-scale=1" />',
` <title>${meta.title}</title>`, ` <title>${escapeHtmlAttribute(meta.title)}</title>`,
]; ];
if (meta.author) { if (meta.author) {
lines.push(` <meta name="author" content="${meta.author}" />`); lines.push(` <meta name="author" content="${escapeHtmlAttribute(meta.author)}" />`);
} }
if (meta.description) { if (meta.description) {
lines.push(` <meta name="description" content="${meta.description}" />`); lines.push(` <meta name="description" content="${escapeHtmlAttribute(meta.description)}" />`);
} }
lines.push(` <style>${css}</style>`); lines.push(` <style>${css}</style>`);
if (codeThemeCss) { if (codeThemeCss) {

View File

@ -4,6 +4,7 @@ import path from "node:path";
import process from "node:process"; import process from "node:process";
import { import {
cleanSummaryText,
extractSummaryFromBody, extractSummaryFromBody,
extractTitleFromMarkdown, extractTitleFromMarkdown,
parseFrontmatter, parseFrontmatter,
@ -47,8 +48,9 @@ export async function convertMarkdown(
} }
const author = stripWrappingQuotes(frontmatter.author ?? ""); const author = stripWrappingQuotes(frontmatter.author ?? "");
let summary = stripWrappingQuotes(frontmatter.description ?? "") const frontmatterSummary = stripWrappingQuotes(frontmatter.description ?? "")
|| stripWrappingQuotes(frontmatter.summary ?? ""); || stripWrappingQuotes(frontmatter.summary ?? "");
let summary = cleanSummaryText(frontmatterSummary);
if (!summary) { if (!summary) {
summary = extractSummaryFromBody(body, 120); summary = extractSummaryFromBody(body, 120);
} }

View File

@ -46,6 +46,45 @@ export function stripWrappingQuotes(value: string): string {
return value.trim(); return value.trim();
} }
/**
 * Named character references recognised by `decodeHtmlEntities`, keyed by the
 * lowercase entity name without the surrounding `&` and `;`.
 */
const HTML_ENTITIES: Record<string, string> = {
  amp: "&",
  apos: "'",
  gt: ">",
  lt: "<",
  nbsp: " ",
  quot: '"',
};

/**
 * Converts the code point of a numeric character reference to a string.
 *
 * @param codePoint - Parsed numeric value of the reference (may be `NaN`).
 * @param fallback - Text returned unchanged when the code point is unusable.
 * @returns The decoded character, or `fallback` when the value is not an
 *   integer, is NUL, is outside the Unicode range, or is a lone surrogate.
 */
function decodeHtmlCodePoint(codePoint: number, fallback: string): string {
  // Surrogates are not scalar values; emitting one alone yields an ill-formed string.
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) {
    return fallback;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Decodes `&name;`, `&#NNN;` and `&#xHHH;` references in `value`.
 *
 * Matching is case-insensitive. References that are unknown, malformed or
 * that decode to an unusable code point are left exactly as written.
 *
 * @param value - Text that may contain HTML character references.
 * @returns `value` with every recognised reference replaced by its character.
 */
function decodeHtmlEntities(value: string): string {
  return value.replace(/&(#x?[0-9a-f]+|[a-z]+);/gi, (entity, body: string) => {
    const normalized = body.toLowerCase();
    if (normalized.startsWith("#x")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(2), 16), entity);
    }
    if (normalized.startsWith("#")) {
      const digits = normalized.slice(1);
      // The pattern admits hex letters after "#"; parseInt would silently stop at
      // the first one ("12ab" -> 12), so reject anything that is not pure decimal.
      if (!/^[0-9]+$/.test(digits)) {
        return entity;
      }
      return decodeHtmlCodePoint(Number.parseInt(digits, 10), entity);
    }
    // Own-property check: a plain index would also find inherited members such
    // as `constructor` and substitute a stringified function into the text.
    const named = Object.prototype.hasOwnProperty.call(HTML_ENTITIES, normalized)
      ? HTML_ENTITIES[normalized]
      : undefined;
    return named ?? entity;
  });
}
/**
 * Reduces summary text that may contain HTML to a single line of plain text.
 *
 * Steps, in order: strip wrapping quotes, decode character references, drop
 * HTML comments and `<script>`/`<style>` elements together with their content,
 * replace every remaining tag with a space, then collapse whitespace and trim.
 *
 * Entities are decoded before tags are removed, so entity-escaped markup such
 * as `&lt;b&gt;` is stripped as well.
 *
 * @param value - Raw summary text.
 * @returns The plain-text summary; an empty string when nothing remains.
 */
export function cleanSummaryText(value: string): string {
  return decodeHtmlEntities(stripWrappingQuotes(value))
    // Comment bodies are markup, not reader-visible text.
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<script\b[\s\S]*?<\/script\s*>/gi, " ")
    .replace(/<style\b[\s\S]*?<\/style\s*>/gi, " ")
    // The `\/` alternative covers self-closing tags written without whitespace
    // (`<br/>`, `<hr/>`, `<img/>`), which would otherwise be left in the text.
    .replace(/<\/?[a-z][a-z0-9:-]*(?:\s+[^>]*|\/)?>/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
export function toFrontmatterString(value: unknown): string | undefined { export function toFrontmatterString(value: unknown): string | undefined {
if (typeof value === "string") { if (typeof value === "string") {
return stripWrappingQuotes(value); return stripWrappingQuotes(value);
@ -94,10 +133,11 @@ export function extractSummaryFromBody(body: string, maxLen: number): string {
.replace(/\*(.+?)\*/g, "$1") .replace(/\*(.+?)\*/g, "$1")
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1") .replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
.replace(/`([^`]+)`/g, "$1"); .replace(/`([^`]+)`/g, "$1");
const summaryText = cleanSummaryText(cleanText);
if (cleanText.length > 20) { if (summaryText.length > 20) {
if (cleanText.length <= maxLen) return cleanText; if (summaryText.length <= maxLen) return summaryText;
return `${cleanText.slice(0, maxLen - 3)}...`; return `${summaryText.slice(0, maxLen - 3)}...`;
} }
} }

View File

@ -45,19 +45,24 @@ export function loadCodeThemeCss(themeName: string): string {
} }
export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string { export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string {
// Entity-escapes &, ", < and > so the value can be placed inside a
// double-quoted attribute or element text without being parsed as markup.
const escapeHtmlAttribute = (value: string) => value.replace(/[&"<>]/g, (ch) => {
  switch (ch) {
    case "&":
      return "&amp;";
    case '"':
      return "&quot;";
    case "<":
      return "&lt;";
    case ">":
      return "&gt;";
    default:
      return ch;
  }
});
const lines = [ const lines = [
"<!doctype html>", "<!doctype html>",
"<html>", "<html>",
"<head>", "<head>",
' <meta charset="utf-8" />', ' <meta charset="utf-8" />',
' <meta name="viewport" content="width=device-width, initial-scale=1" />', ' <meta name="viewport" content="width=device-width, initial-scale=1" />',
` <title>${meta.title}</title>`, ` <title>${escapeHtmlAttribute(meta.title)}</title>`,
]; ];
if (meta.author) { if (meta.author) {
lines.push(` <meta name="author" content="${meta.author}" />`); lines.push(` <meta name="author" content="${escapeHtmlAttribute(meta.author)}" />`);
} }
if (meta.description) { if (meta.description) {
lines.push(` <meta name="description" content="${meta.description}" />`); lines.push(` <meta name="description" content="${escapeHtmlAttribute(meta.description)}" />`);
} }
lines.push(` <style>${css}</style>`); lines.push(` <style>${css}</style>`);
if (codeThemeCss) { if (codeThemeCss) {