fix(baoyu-post-to-wechat): decode HTML entities and strip tags from article summary
Add cleanSummaryText() to the baoyu-md package: it decodes HTML entities (&amp;, &lt;, &#x..., etc.) and strips HTML tags before the frontmatter description/summary is used as the WeChat article digest.
This commit is contained in:
parent
517ff566a1
commit
990fea4f7b
|
|
@ -2,6 +2,7 @@ import assert from "node:assert/strict";
|
||||||
import test from "node:test";
|
import test from "node:test";
|
||||||
|
|
||||||
import {
|
import {
|
||||||
|
cleanSummaryText,
|
||||||
extractSummaryFromBody,
|
extractSummaryFromBody,
|
||||||
extractTitleFromMarkdown,
|
extractTitleFromMarkdown,
|
||||||
parseFrontmatter,
|
parseFrontmatter,
|
||||||
|
|
@ -91,3 +92,19 @@ This is **the first paragraph** with [a link](https://example.com) and \`inline
|
||||||
"This is the first paragraph with a link and inline code that should...",
|
"This is the first paragraph with a link and inline code that should...",
|
||||||
);
|
);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Checks two things: extractSummaryFromBody(body, 120) returns only the text
// content of a raw HTML <p> paragraph that follows a markdown heading, and
// cleanSummaryText removes inline tags while keeping the text between them.
test("summary extraction normalizes raw HTML paragraphs to plain text", () => {
|
||||||
|
const summary = extractSummaryFromBody(
|
||||||
|
`
|
||||||
|
# Heading
|
||||||
|
<p style="font-size: 16px; color: #666; margin-bottom: 20px;">2026年初,一只“龙虾”搅动了整个科技圈。腾讯楼下排起近千人长队,只为让工程师领取一份福利。</p>
|
||||||
|
`,
|
||||||
|
120,
|
||||||
|
);
|
||||||
|
|
||||||
|
assert.equal(
|
||||||
|
summary,
|
||||||
|
"2026年初,一只“龙虾”搅动了整个科技圈。腾讯楼下排起近千人长队,只为让工程师领取一份福利。",
|
||||||
|
);
|
||||||
|
assert.equal(cleanSummaryText("<strong>Good text!'</strong>"), "Good text!'");
|
||||||
|
});
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,45 @@ export function stripWrappingQuotes(value: string): string {
|
||||||
return value.trim();
|
return value.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Named character references that decodeHtmlEntities understands, keyed by lowercase name. */
const HTML_ENTITIES: Record<string, string> = {
  amp: "&",
  apos: "'",
  gt: ">",
  lt: "<",
  nbsp: " ",
  quot: '"',
};

/**
 * Converts a numeric character reference's code point into its character.
 *
 * @param codePoint - Parsed numeric value of the reference (may be NaN).
 * @param fallback - Text to return when the code point must not be decoded.
 * @returns The decoded character, or `fallback` when `codePoint` is not an
 *   integer, is NUL, is a UTF-16 surrogate, or lies outside U+0001..U+10FFFF.
 */
function decodeHtmlCodePoint(codePoint: number, fallback: string): string {
  // Surrogates would produce a lone, ill-formed UTF-16 unit; NUL is never
  // meaningful in summary text. Leave such references undecoded instead.
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) {
    return fallback;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Decodes HTML character references in `value`.
 *
 * Handles hexadecimal (`&#x41;`), decimal (`&#65;`) and the named references
 * listed in HTML_ENTITIES (case-insensitively). Anything that is not
 * recognized is left exactly as written.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The text with recognized references replaced by their characters.
 */
function decodeHtmlEntities(value: string): string {
  // Decimal references only accept decimal digits, so malformed input such as
  // "&#12ab;" is left untouched rather than being partially parsed.
  return value.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (entity: string, body: string) => {
    const normalized = body.toLowerCase();
    if (normalized.startsWith("#x")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(2), 16), entity);
    }
    if (normalized.startsWith("#")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(1), 10), entity);
    }
    // Own-property check: a plain lookup would also find inherited members
    // (e.g. "constructor") and splice a stringified function into the text.
    const named = Object.prototype.hasOwnProperty.call(HTML_ENTITIES, normalized)
      ? HTML_ENTITIES[normalized]
      : undefined;
    return named ?? entity;
  });
}
|
||||||
|
|
||||||
|
/**
 * Reduces a summary/description string to single-line plain text.
 *
 * Steps, in order: strip wrapping quotes, decode HTML character references,
 * drop `<script>`/`<style>` elements with their contents, drop HTML comments,
 * replace `<br>` and every other tag with a space, then collapse whitespace
 * runs and trim. Because decoding happens before tag stripping, markup that
 * was written entity-encoded (`&lt;b&gt;`) is removed as well.
 *
 * @param value - Raw summary text, possibly containing HTML.
 * @returns The cleaned plain-text summary (may be empty).
 */
export function cleanSummaryText(value: string): string {
  return decodeHtmlEntities(stripWrappingQuotes(value))
    .replace(/<script\b[\s\S]*?<\/script>/gi, " ")
    .replace(/<style\b[\s\S]*?<\/style>/gi, " ")
    // HTML comments are not matched by the tag pattern below.
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<br\s*\/?>/gi, " ")
    // Allow "/" directly after the tag name so "<hr/>" and "<img/>" are removed too.
    .replace(/<\/?[a-z][a-z0-9:-]*(?:[\s/][^>]*)?>/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
||||||
|
|
||||||
export function toFrontmatterString(value: unknown): string | undefined {
|
export function toFrontmatterString(value: unknown): string | undefined {
|
||||||
if (typeof value === "string") {
|
if (typeof value === "string") {
|
||||||
return stripWrappingQuotes(value);
|
return stripWrappingQuotes(value);
|
||||||
|
|
@ -94,10 +133,11 @@ export function extractSummaryFromBody(body: string, maxLen: number): string {
|
||||||
.replace(/\*(.+?)\*/g, "$1")
|
.replace(/\*(.+?)\*/g, "$1")
|
||||||
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
||||||
.replace(/`([^`]+)`/g, "$1");
|
.replace(/`([^`]+)`/g, "$1");
|
||||||
|
const summaryText = cleanSummaryText(cleanText);
|
||||||
|
|
||||||
if (cleanText.length > 20) {
|
if (summaryText.length > 20) {
|
||||||
if (cleanText.length <= maxLen) return cleanText;
|
if (summaryText.length <= maxLen) return summaryText;
|
||||||
return `${cleanText.slice(0, maxLen - 3)}...`;
|
return `${summaryText.slice(0, maxLen - 3)}...`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -39,6 +39,22 @@ test("buildHtmlDocument includes optional meta tags and code theme CSS", () => {
|
||||||
assert.match(html, /<article>Hello<\/article>/);
|
assert.match(html, /<article>Hello<\/article>/);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Verifies that title, author and description are HTML-escaped when
// buildHtmlDocument writes them into <head>. The expected patterns are the
// fixture values with &, ", < and > replaced by their entities.
test("buildHtmlDocument escapes head metadata attributes", () => {
  const html = buildHtmlDocument(
    {
      title: `Doc <draft>`,
      author: `Bao"yu`,
      description: `<p style="color: red">Summary & notes</p>`,
    },
    "",
    "",
  );

  assert.match(html, /<title>Doc &lt;draft&gt;<\/title>/);
  assert.match(html, /meta name="author" content="Bao&quot;yu"/);
  assert.match(
    html,
    /meta name="description" content="&lt;p style=&quot;color: red&quot;&gt;Summary &amp; notes&lt;\/p&gt;"/,
  );
});
|
||||||
|
|
||||||
test("normalizeCssText and normalizeInlineCss replace variables and strip declarations", () => {
|
test("normalizeCssText and normalizeInlineCss replace variables and strip declarations", () => {
|
||||||
const rawCss = `
|
const rawCss = `
|
||||||
:root { --md-primary-color: #000; --md-font-size: 12px; --foreground: 0 0% 5%; }
|
:root { --md-primary-color: #000; --md-font-size: 12px; --foreground: 0 0% 5%; }
|
||||||
|
|
|
||||||
|
|
@ -45,19 +45,24 @@ export function loadCodeThemeCss(themeName: string): string {
|
||||||
}
|
}
|
||||||
|
|
||||||
export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string {
|
export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string {
|
||||||
|
// Escapes text for use inside a double-quoted HTML attribute or element body.
// "&" is replaced first so the entities produced by the later steps are not
// escaped a second time.
const escapeHtmlAttribute = (value: string) => value
  .replace(/&/g, "&amp;")
  .replace(/"/g, "&quot;")
  .replace(/</g, "&lt;")
  .replace(/>/g, "&gt;");
|
||||||
const lines = [
|
const lines = [
|
||||||
"<!doctype html>",
|
"<!doctype html>",
|
||||||
"<html>",
|
"<html>",
|
||||||
"<head>",
|
"<head>",
|
||||||
' <meta charset="utf-8" />',
|
' <meta charset="utf-8" />',
|
||||||
' <meta name="viewport" content="width=device-width, initial-scale=1" />',
|
' <meta name="viewport" content="width=device-width, initial-scale=1" />',
|
||||||
` <title>${meta.title}</title>`,
|
` <title>${escapeHtmlAttribute(meta.title)}</title>`,
|
||||||
];
|
];
|
||||||
if (meta.author) {
|
if (meta.author) {
|
||||||
lines.push(` <meta name="author" content="${meta.author}" />`);
|
lines.push(` <meta name="author" content="${escapeHtmlAttribute(meta.author)}" />`);
|
||||||
}
|
}
|
||||||
if (meta.description) {
|
if (meta.description) {
|
||||||
lines.push(` <meta name="description" content="${meta.description}" />`);
|
lines.push(` <meta name="description" content="${escapeHtmlAttribute(meta.description)}" />`);
|
||||||
}
|
}
|
||||||
lines.push(` <style>${css}</style>`);
|
lines.push(` <style>${css}</style>`);
|
||||||
if (codeThemeCss) {
|
if (codeThemeCss) {
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import path from "node:path";
|
||||||
import process from "node:process";
|
import process from "node:process";
|
||||||
|
|
||||||
import {
|
import {
|
||||||
|
cleanSummaryText,
|
||||||
extractSummaryFromBody,
|
extractSummaryFromBody,
|
||||||
extractTitleFromMarkdown,
|
extractTitleFromMarkdown,
|
||||||
parseFrontmatter,
|
parseFrontmatter,
|
||||||
|
|
@ -47,8 +48,9 @@ export async function convertMarkdown(
|
||||||
}
|
}
|
||||||
|
|
||||||
const author = stripWrappingQuotes(frontmatter.author ?? "");
|
const author = stripWrappingQuotes(frontmatter.author ?? "");
|
||||||
let summary = stripWrappingQuotes(frontmatter.description ?? "")
|
const frontmatterSummary = stripWrappingQuotes(frontmatter.description ?? "")
|
||||||
|| stripWrappingQuotes(frontmatter.summary ?? "");
|
|| stripWrappingQuotes(frontmatter.summary ?? "");
|
||||||
|
let summary = cleanSummaryText(frontmatterSummary);
|
||||||
if (!summary) {
|
if (!summary) {
|
||||||
summary = extractSummaryFromBody(body, 120);
|
summary = extractSummaryFromBody(body, 120);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,45 @@ export function stripWrappingQuotes(value: string): string {
|
||||||
return value.trim();
|
return value.trim();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/** Named character references that decodeHtmlEntities understands, keyed by lowercase name. */
const HTML_ENTITIES: Record<string, string> = {
  amp: "&",
  apos: "'",
  gt: ">",
  lt: "<",
  nbsp: " ",
  quot: '"',
};

/**
 * Converts a numeric character reference's code point into its character.
 *
 * @param codePoint - Parsed numeric value of the reference (may be NaN).
 * @param fallback - Text to return when the code point must not be decoded.
 * @returns The decoded character, or `fallback` when `codePoint` is not an
 *   integer, is NUL, is a UTF-16 surrogate, or lies outside U+0001..U+10FFFF.
 */
function decodeHtmlCodePoint(codePoint: number, fallback: string): string {
  // Surrogates would produce a lone, ill-formed UTF-16 unit; NUL is never
  // meaningful in summary text. Leave such references undecoded instead.
  const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
  if (!Number.isInteger(codePoint) || codePoint <= 0 || codePoint > 0x10ffff || isSurrogate) {
    return fallback;
  }
  return String.fromCodePoint(codePoint);
}

/**
 * Decodes HTML character references in `value`.
 *
 * Handles hexadecimal (`&#x41;`), decimal (`&#65;`) and the named references
 * listed in HTML_ENTITIES (case-insensitively). Anything that is not
 * recognized is left exactly as written.
 *
 * @param value - Text that may contain HTML character references.
 * @returns The text with recognized references replaced by their characters.
 */
function decodeHtmlEntities(value: string): string {
  // Decimal references only accept decimal digits, so malformed input such as
  // "&#12ab;" is left untouched rather than being partially parsed.
  return value.replace(/&(#x[0-9a-f]+|#[0-9]+|[a-z]+);/gi, (entity: string, body: string) => {
    const normalized = body.toLowerCase();
    if (normalized.startsWith("#x")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(2), 16), entity);
    }
    if (normalized.startsWith("#")) {
      return decodeHtmlCodePoint(Number.parseInt(normalized.slice(1), 10), entity);
    }
    // Own-property check: a plain lookup would also find inherited members
    // (e.g. "constructor") and splice a stringified function into the text.
    const named = Object.prototype.hasOwnProperty.call(HTML_ENTITIES, normalized)
      ? HTML_ENTITIES[normalized]
      : undefined;
    return named ?? entity;
  });
}
|
||||||
|
|
||||||
|
/**
 * Reduces a summary/description string to single-line plain text.
 *
 * Steps, in order: strip wrapping quotes, decode HTML character references,
 * drop `<script>`/`<style>` elements with their contents, drop HTML comments,
 * replace `<br>` and every other tag with a space, then collapse whitespace
 * runs and trim. Because decoding happens before tag stripping, markup that
 * was written entity-encoded (`&lt;b&gt;`) is removed as well.
 *
 * @param value - Raw summary text, possibly containing HTML.
 * @returns The cleaned plain-text summary (may be empty).
 */
export function cleanSummaryText(value: string): string {
  return decodeHtmlEntities(stripWrappingQuotes(value))
    .replace(/<script\b[\s\S]*?<\/script>/gi, " ")
    .replace(/<style\b[\s\S]*?<\/style>/gi, " ")
    // HTML comments are not matched by the tag pattern below.
    .replace(/<!--[\s\S]*?-->/g, " ")
    .replace(/<br\s*\/?>/gi, " ")
    // Allow "/" directly after the tag name so "<hr/>" and "<img/>" are removed too.
    .replace(/<\/?[a-z][a-z0-9:-]*(?:[\s/][^>]*)?>/gi, " ")
    .replace(/\s+/g, " ")
    .trim();
}
|
||||||
|
|
||||||
export function toFrontmatterString(value: unknown): string | undefined {
|
export function toFrontmatterString(value: unknown): string | undefined {
|
||||||
if (typeof value === "string") {
|
if (typeof value === "string") {
|
||||||
return stripWrappingQuotes(value);
|
return stripWrappingQuotes(value);
|
||||||
|
|
@ -94,10 +133,11 @@ export function extractSummaryFromBody(body: string, maxLen: number): string {
|
||||||
.replace(/\*(.+?)\*/g, "$1")
|
.replace(/\*(.+?)\*/g, "$1")
|
||||||
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
.replace(/\[([^\]]+)\]\([^)]+\)/g, "$1")
|
||||||
.replace(/`([^`]+)`/g, "$1");
|
.replace(/`([^`]+)`/g, "$1");
|
||||||
|
const summaryText = cleanSummaryText(cleanText);
|
||||||
|
|
||||||
if (cleanText.length > 20) {
|
if (summaryText.length > 20) {
|
||||||
if (cleanText.length <= maxLen) return cleanText;
|
if (summaryText.length <= maxLen) return summaryText;
|
||||||
return `${cleanText.slice(0, maxLen - 3)}...`;
|
return `${summaryText.slice(0, maxLen - 3)}...`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -45,19 +45,24 @@ export function loadCodeThemeCss(themeName: string): string {
|
||||||
}
|
}
|
||||||
|
|
||||||
export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string {
|
export function buildHtmlDocument(meta: HtmlDocumentMeta, css: string, html: string, codeThemeCss?: string): string {
|
||||||
|
// Escapes text for use inside a double-quoted HTML attribute or element body.
// "&" is replaced first so the entities produced by the later steps are not
// escaped a second time.
const escapeHtmlAttribute = (value: string) => value
  .replace(/&/g, "&amp;")
  .replace(/"/g, "&quot;")
  .replace(/</g, "&lt;")
  .replace(/>/g, "&gt;");
|
||||||
const lines = [
|
const lines = [
|
||||||
"<!doctype html>",
|
"<!doctype html>",
|
||||||
"<html>",
|
"<html>",
|
||||||
"<head>",
|
"<head>",
|
||||||
' <meta charset="utf-8" />',
|
' <meta charset="utf-8" />',
|
||||||
' <meta name="viewport" content="width=device-width, initial-scale=1" />',
|
' <meta name="viewport" content="width=device-width, initial-scale=1" />',
|
||||||
` <title>${meta.title}</title>`,
|
` <title>${escapeHtmlAttribute(meta.title)}</title>`,
|
||||||
];
|
];
|
||||||
if (meta.author) {
|
if (meta.author) {
|
||||||
lines.push(` <meta name="author" content="${meta.author}" />`);
|
lines.push(` <meta name="author" content="${escapeHtmlAttribute(meta.author)}" />`);
|
||||||
}
|
}
|
||||||
if (meta.description) {
|
if (meta.description) {
|
||||||
lines.push(` <meta name="description" content="${meta.description}" />`);
|
lines.push(` <meta name="description" content="${escapeHtmlAttribute(meta.description)}" />`);
|
||||||
}
|
}
|
||||||
lines.push(` <style>${css}</style>`);
|
lines.push(` <style>${css}</style>`);
|
||||||
if (codeThemeCss) {
|
if (codeThemeCss) {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue