refactor(baoyu-format-markdown): use remark-cjk-friendly for CJK emphasis

Replace the custom CJK emphasis handling with the remark-cjk-friendly library,
significantly simplifying the codebase.
This commit is contained in:
Jim Liu 宝玉 2026-02-01 02:13:58 -06:00
parent c0162bb3af
commit 3a5866eb4b
5 changed files with 1137 additions and 371 deletions

View File

@ -13,8 +13,7 @@ Scripts in `scripts/` subdirectory. Replace `${SKILL_DIR}` with this SKILL.md's
| Script | Purpose |
|--------|---------|
| `scripts/main.ts` | Main entry point with CLI options |
| `scripts/cjk-emphasis.ts` | Fix CJK emphasis/bold punctuation issues |
| `scripts/main.ts` | Main entry point with CLI options (uses remark-cjk-friendly for CJK emphasis) |
| `scripts/quotes.ts` | Replace ASCII quotes with fullwidth quotes |
| `scripts/autocorrect.ts` | Add CJK/English spacing via autocorrect |

View File

@ -1,314 +0,0 @@
/** Sentence-level punctuation that is treated as "closing" punctuation. */
const CJK_PUNCT_COMMON = "。.,、?!:;";
/** Opening brackets and quotes (U+201C and U+2018 are the curly opening quotes). */
const CJK_OPENING_PUNCT = "(〔〖〘〚「『〈《【\u201C\u2018";
/** Closing brackets and quotes, plus the sentence-level punctuation above. */
const CJK_CLOSING_PUNCT = ")〕〗〙〛」』〉》】\u201D\u2019" + CJK_PUNCT_COMMON;
/** Character-class fragment covering the Han, Hiragana, Katakana and Hangul scripts. */
const CJK_SCRIPTS =
  "\\p{Script=Han}\\p{Script=Hiragana}\\p{Script=Katakana}\\p{Script=Hangul}";

/** Matches a closing punctuation character anywhere in the tested string. */
export const CJK_CLOSING_PUNCT_RE = new RegExp(`[${CJK_CLOSING_PUNCT}]`);
/** Matches only when the tested string starts with an opening punctuation character. */
export const CJK_OPENING_PUNCT_RE = new RegExp(`^[${CJK_OPENING_PUNCT}]`);
/** Matches any character belonging to one of the CJK scripts listed above. */
export const CJK_CHAR_RE = new RegExp(`[${CJK_SCRIPTS}]`, "u");

/** Any Unicode punctuation or symbol character. */
const PUNCT_OR_SYMBOL_RE = /[\p{P}\p{S}]/u;
/** Any Unicode letter or number. */
const WORD_CHAR_RE = /[\p{L}\p{N}]/u;

/**
 * Maps every opener in CJK_OPENING_PUNCT to its closing counterpart.
 *
 * Three entries had degenerated into duplicate empty-string keys, which can
 * never match a one-character lookup; they are restored here so that each
 * opener listed in CJK_OPENING_PUNCT has exactly one partner.
 */
const CJK_PUNCT_PAIRS: Record<string, string> = {
  "\u201C": "\u201D", // curly double quotes
  "\u2018": "\u2019", // curly single quotes
  "(": ")",
  "〔": "〕",
  "〖": "〗",
  "〘": "〙",
  "〚": "〛",
  "「": "」",
  "『": "』",
  "〈": "〉",
  "《": "》",
  "【": "】",
};
/**
 * Locates inline code spans delimited by backtick runs.
 *
 * A span opens with a run of N backticks and closes at the next run of exactly
 * N backticks. Runs of a different length inside the span are skipped over.
 * An opening run with no matching closer is ignored and scanning resumes right
 * after it.
 *
 * @param text Text to scan.
 * @returns Inclusive `[start, end]` index pairs, in order of appearance.
 */
function findInlineCodeRanges(text: string): Array<[number, number]> {
  // Length of the backtick run beginning at `from` (0 when there is none).
  const runLengthAt = (from: number): number => {
    let end = from;
    while (end < text.length && text[end] === "`") {
      end += 1;
    }
    return end - from;
  };

  const spans: Array<[number, number]> = [];
  let pos = 0;
  while (pos < text.length) {
    const openLen = runLengthAt(pos);
    if (openLen === 0) {
      pos += 1;
      continue;
    }

    // Look for a closing run of the same length.
    let closeEnd = -1;
    let scan = pos + openLen;
    while (scan < text.length && closeEnd === -1) {
      const len = runLengthAt(scan);
      if (len === 0) {
        scan += 1;
      } else if (len === openLen) {
        closeEnd = scan + len;
      } else {
        scan += len;
      }
    }

    if (closeEnd === -1) {
      pos += openLen;
    } else {
      spans.push([pos, closeEnd - 1]);
      pos = closeEnd;
    }
  }
  return spans;
}
/**
 * Tells whether the character at `pos` is backslash-escaped.
 *
 * @param text Text containing the character.
 * @param pos Index of the character to inspect.
 * @returns `true` when an odd number of consecutive backslashes sits
 *   immediately before `pos`.
 */
function isEscaped(text: string, pos: number): boolean {
  let escaped = false;
  let idx = pos;
  // Each backslash walking leftwards flips the parity.
  while (idx > 0 && text[idx - 1] === "\\") {
    escaped = !escaped;
    idx -= 1;
  }
  return escaped;
}
/**
 * Treats a missing or empty character (e.g. beyond either end of the text) as
 * whitespace, alongside anything matching `\s`.
 *
 * @param ch Character to test, or `undefined`.
 * @returns `true` for missing/empty input or whitespace.
 */
function isWhitespaceChar(ch: string | undefined): boolean {
  if (!ch) {
    return true;
  }
  return ch.search(/\s/u) !== -1;
}
/**
 * Tests a character against PUNCT_OR_SYMBOL_RE.
 *
 * @param ch Character to test, or `undefined`.
 * @returns `false` for missing/empty input; otherwise whether the pattern matches.
 */
function isPunctuationOrSymbol(ch: string | undefined): boolean {
  if (!ch) {
    return false;
  }
  return PUNCT_OR_SYMBOL_RE.test(ch);
}
/**
 * Checks whether `close` is the registered partner of `open` in CJK_PUNCT_PAIRS.
 *
 * @param open Candidate opening punctuation character.
 * @param close Candidate closing punctuation character.
 * @returns `true` only when the table maps `open` to exactly `close`.
 */
function isMatchingCjkPunct(open: string, close: string): boolean {
  const expectedClose = CJK_PUNCT_PAIRS[open];
  return expectedClose === close;
}
/**
 * Applies `mapper` to every stretch of `block` that lies outside inline code
 * spans, copying the code spans through verbatim.
 *
 * @param block Text that may contain backtick-delimited inline code.
 * @param mapper Transformation for each non-code stretch.
 * @returns The reassembled text. When there is no inline code at all, the
 *   whole block is passed to `mapper` in a single call.
 */
function mapNonCodeSegments(
  block: string,
  mapper: (segment: string) => string
): string {
  const codeRanges = findInlineCodeRanges(block);
  if (codeRanges.length === 0) {
    return mapper(block);
  }

  let assembled = "";
  let consumed = 0;
  codeRanges.forEach(([from, to]) => {
    // Non-empty gap before this code span goes through the mapper.
    if (from > consumed) {
      assembled += mapper(block.slice(consumed, from));
    }
    assembled += block.slice(from, to + 1);
    consumed = to + 1;
  });

  // Trailing text after the last code span.
  if (consumed < block.length) {
    assembled += mapper(block.slice(consumed));
  }
  return assembled;
}
/**
 * Moves a matching pair of CJK brackets/quotes that sits just inside a `**…**`
 * span to just outside it, e.g. `**「text」**` becomes `「**text**」`.
 *
 * Inline code spans are left untouched because the work is done per non-code
 * segment via mapNonCodeSegments.
 *
 * @param block Text of one block.
 * @returns The block with qualifying punctuation pairs relocated; segments
 *   with nothing to move are returned unchanged.
 */
function moveCjkPunctuationOutsideEmphasis(block: string): string {
return mapNonCodeSegments(block, (segment) => {
// Step 1: collect the start index of every unescaped run of exactly two
// asterisks (a neighbouring third `*` disqualifies the run).
const delimiterPositions: number[] = [];
let cursor = 0;
while (cursor < segment.length - 1) {
if (
segment[cursor] === "*" &&
segment[cursor + 1] === "*" &&
segment[cursor - 1] !== "*" &&
segment[cursor + 2] !== "*" &&
!isEscaped(segment, cursor)
) {
delimiterPositions.push(cursor);
cursor += 2;
continue;
}
cursor += 1;
}
// A pair needs at least two delimiters.
if (delimiterPositions.length < 2) return segment;
// Step 2: pair delimiters in order of appearance. The stack never holds more
// than one entry, so the 1st pairs with the 2nd, the 3rd with the 4th, etc.
const stack: number[] = [];
const pairs: Array<{ open: number; close: number }> = [];
for (const pos of delimiterPositions) {
if (stack.length === 0) {
stack.push(pos);
} else {
const open = stack.pop() as number;
pairs.push({ open, close: pos });
}
}
// Step 3: plan the edits against ORIGINAL segment indices.
// `skip` holds indices of characters to drop; `insertBefore` holds text to
// emit immediately before a given index.
const skip = new Set<number>();
const insertBefore = new Map<number, string>();
for (const pair of pairs) {
// First character after the opening `**` and last character before the
// closing `**`.
const openPunctPos = pair.open + 2;
const closePunctPos = pair.close - 1;
if (openPunctPos >= closePunctPos) continue;
const openPunct = segment[openPunctPos];
const closePunct = segment[closePunctPos];
if (!openPunct || !closePunct) continue;
if (!CJK_OPENING_PUNCT_RE.test(openPunct)) continue;
if (!CJK_CLOSING_PUNCT_RE.test(closePunct)) continue;
// Only move punctuation that forms a registered open/close pair.
if (!isMatchingCjkPunct(openPunct, closePunct)) continue;
// Require at least one character between the two punctuation marks.
if (openPunctPos + 1 >= closePunctPos) continue;
const inner = segment.slice(openPunctPos + 1, closePunctPos);
if (inner.length === 0) continue;
skip.add(openPunctPos);
skip.add(closePunctPos);
// Opening punctuation goes in front of the opening `**`.
insertBefore.set(pair.open, (insertBefore.get(pair.open) ?? "") + openPunct);
// Closing punctuation goes right after the closing `**`.
const afterClose = pair.close + 2;
insertBefore.set(
afterClose,
(insertBefore.get(afterClose) ?? "") + closePunct
);
}
if (skip.size === 0) return segment;
// Step 4: rebuild the segment, applying insertions and skips.
let result = "";
for (let idx = 0; idx < segment.length; idx += 1) {
const insert = insertBefore.get(idx);
if (insert) {
result += insert;
}
if (skip.has(idx)) {
continue;
}
result += segment[idx];
}
// An insertion scheduled at the very end (closing `**` was the last thing
// in the segment) has no index in the loop above.
const tailInsert = insertBefore.get(segment.length);
if (tailInsert) {
result += tailInsert;
}
return result;
});
}
/**
 * Inserts a space after a closing `**` when the emphasized text ends with CJK
 * closing punctuation and is immediately followed by a letter or number.
 *
 * The block is first passed through moveCjkPunctuationOutsideEmphasis; `**`
 * runs inside inline code spans are ignored.
 *
 * @param block Text of one block.
 * @returns The normalized block, with spaces inserted where needed.
 */
function fixCjkEmphasisSpacingInBlock(block: string): string {
const normalized = moveCjkPunctuationOutsideEmphasis(block);
const codeRanges = findInlineCodeRanges(normalized);
// Index of the code range the cursor has not yet moved past.
let rangeIndex = 0;
const delimiters: Array<{
pos: number;
canOpen: boolean;
canClose: boolean;
}> = [];
let cursor = 0;
while (cursor < normalized.length - 1) {
// Jump over inline code: when inside the current range, move just past it;
// when already beyond it, advance to the next range and re-check.
if (rangeIndex < codeRanges.length && cursor >= codeRanges[rangeIndex][0]) {
if (cursor <= codeRanges[rangeIndex][1]) {
cursor = codeRanges[rangeIndex][1] + 1;
continue;
}
rangeIndex += 1;
continue;
}
// Only unescaped runs of exactly two asterisks count as delimiters.
if (
normalized[cursor] === "*" &&
normalized[cursor + 1] === "*" &&
normalized[cursor - 1] !== "*" &&
normalized[cursor + 2] !== "*" &&
!isEscaped(normalized, cursor)
) {
const before = normalized[cursor - 1];
const after = normalized[cursor + 2];
const beforeIsSpace = isWhitespaceChar(before);
const afterIsSpace = isWhitespaceChar(after);
const beforeIsPunct = isPunctuationOrSymbol(before);
const afterIsPunct = isPunctuationOrSymbol(after);
// Flanking tests based on the neighbouring characters; these appear to
// follow the shape of CommonMark's left/right-flanking delimiter rules.
const leftFlanking =
!afterIsSpace && (!afterIsPunct || beforeIsSpace || beforeIsPunct);
const rightFlanking =
!beforeIsSpace && (!beforeIsPunct || afterIsSpace || afterIsPunct);
// Extra allowance: a run preceded by CJK closing punctuation and followed
// by a letter/number may close even when it is not right-flanking.
const cjkPunctBefore = !!before && CJK_CLOSING_PUNCT_RE.test(before);
const wordAfter = !!after && WORD_CHAR_RE.test(after);
delimiters.push({
pos: cursor,
canOpen: leftFlanking,
canClose: rightFlanking || (cjkPunctBefore && wordAfter),
});
cursor += 2;
continue;
}
cursor += 1;
}
// Pair each closer with the most recently pushed opener.
const stack: Array<{ pos: number }> = [];
const pairs: Array<{ open: number; close: number }> = [];
for (const delimiter of delimiters) {
if (delimiter.canClose) {
let openerIndex = -1;
// This loop exits on its first iteration, so it simply selects the top of
// the stack (or leaves -1 when the stack is empty).
for (let j = stack.length - 1; j >= 0; j -= 1) {
openerIndex = j;
break;
}
if (openerIndex !== -1) {
const opener = stack.splice(openerIndex, 1)[0];
pairs.push({ open: opener.pos, close: delimiter.pos });
}
}
// A delimiter that can both close and open is also pushed as an opener.
if (delimiter.canOpen) {
stack.push({ pos: delimiter.pos });
}
}
if (pairs.length === 0) return normalized;
// Indices (in `normalized`) before which a space must be emitted.
const insertPositions = new Set<number>();
for (const pair of pairs) {
const insideLast = normalized[pair.close - 1];
const afterClose = normalized[pair.close + 2];
// Nothing follows the closing `**`, so no space is needed.
if (!afterClose) continue;
if (
CJK_CLOSING_PUNCT_RE.test(insideLast) &&
WORD_CHAR_RE.test(afterClose)
) {
insertPositions.add(pair.close + 2);
}
}
if (insertPositions.size === 0) return normalized;
// Rebuild the text, emitting a space ahead of each recorded index.
let result = "";
for (let idx = 0; idx < normalized.length; idx += 1) {
if (insertPositions.has(idx)) {
result += " ";
}
result += normalized[idx];
}
if (insertPositions.has(normalized.length)) {
result += " ";
}
return result;
}
/**
 * Runs fixCjkEmphasisSpacingInBlock over every blank-line-separated block of
 * `content`, leaving fenced code blocks (``` or ~~~) and the blank-line
 * separators themselves untouched.
 *
 * @param content Full Markdown text.
 * @returns The text with each non-fenced block processed.
 */
export function fixCjkEmphasisSpacing(content: string): string {
  // Both patterns capture their match, so `split` yields the matched text at
  // odd indices and the text between matches at even indices.
  const fencedCode = /(^```[\s\S]*?^```|^~~~[\s\S]*?^~~~)/m;
  const blankLines = /(\n\s*\n+)/;

  const rebuilt: string[] = [];
  for (const [chunkIndex, chunk] of content.split(fencedCode).entries()) {
    if (chunkIndex % 2 === 1) {
      // Fenced code block: copy through verbatim.
      rebuilt.push(chunk);
      continue;
    }
    for (const [pieceIndex, piece] of chunk.split(blankLines).entries()) {
      const isSeparator = pieceIndex % 2 === 1;
      rebuilt.push(isSeparator ? piece : fixCjkEmphasisSpacingInBlock(piece));
    }
  }
  return rebuilt.join("");
}

View File

@ -1,17 +1,12 @@
import { readFileSync, writeFileSync } from "fs";
import { unified } from "unified";
import remarkParse from "remark-parse";
import remarkCjkFriendly from "remark-cjk-friendly";
import remarkGfm from "remark-gfm";
import remarkFrontmatter from "remark-frontmatter";
import remarkStringify from "remark-stringify";
import { visit } from "unist-util-visit";
import YAML from "yaml";
import {
fixCjkEmphasisSpacing,
CJK_CLOSING_PUNCT_RE,
CJK_OPENING_PUNCT_RE,
CJK_CHAR_RE,
} from "./cjk-emphasis";
import { replaceQuotes } from "./quotes";
import { applyAutocorrect } from "./autocorrect";
@ -36,6 +31,12 @@ const DEFAULT_OPTIONS: Required<FormatOptions> = {
emphasis: true,
};
/**
 * Replaces hexadecimal numeric character references (`&#x…;`) with the
 * characters they denote.
 *
 * References whose value lies above U+10FFFF are left as written:
 * `String.fromCodePoint` throws a `RangeError` for such values, and one stray
 * reference should not abort formatting of the whole document.
 *
 * @param text Text that may contain `&#x…;` references.
 * @returns The text with every valid hexadecimal reference decoded.
 */
function decodeHtmlEntities(text: string): string {
  const MAX_CODE_POINT = 0x10ffff;
  return text.replace(
    /&#x([0-9A-Fa-f]+);/g,
    (entity: string, hex: string): string => {
      const codePoint = Number.parseInt(hex, 16);
      return codePoint <= MAX_CODE_POINT
        ? String.fromCodePoint(codePoint)
        : entity;
    }
  );
}
function formatFrontmatter(value: string): string | null {
try {
const doc = YAML.parseDocument(value);
@ -49,12 +50,9 @@ function formatMarkdownContent(
content: string,
options: Required<FormatOptions>
): string {
if (options.emphasis) {
content = fixCjkEmphasisSpacing(content);
}
const processor = unified()
.use(remarkParse)
.use(options.emphasis ? remarkCjkFriendly : [])
.use(remarkGfm)
.use(remarkFrontmatter, ["yaml"])
.use(remarkStringify, {
@ -63,7 +61,7 @@ function formatMarkdownContent(
const tree = processor.parse(content);
visit(tree, (node, _index, parent) => {
visit(tree, (node) => {
if (node.type === "text" && options.quotes) {
const textNode = node as { value: string };
textNode.value = replaceQuotes(textNode.value);
@ -77,54 +75,11 @@ function formatMarkdownContent(
}
return;
}
if (
options.emphasis &&
(node.type === "strong" ||
node.type === "emphasis" ||
node.type === "delete") &&
parent
) {
const siblings = (parent as { children: typeof node[] }).children;
const idx = siblings.indexOf(node);
const children = (node as { children: typeof node[] }).children;
if (!children || children.length === 0) return;
const lastChild = children[children.length - 1];
if (lastChild.type === "text") {
const lastText = (lastChild as { value: string }).value;
if (
CJK_CLOSING_PUNCT_RE.test(lastText.slice(-1)) &&
idx + 1 < siblings.length
) {
const nextSib = siblings[idx + 1];
if (nextSib.type === "text") {
const nextText = (nextSib as { value: string }).value;
if (CJK_CHAR_RE.test(nextText.charAt(0))) {
(nextSib as { value: string }).value = " " + nextText;
}
}
}
}
const firstChild = children[0];
if (firstChild.type === "text") {
const firstText = (firstChild as { value: string }).value;
if (CJK_OPENING_PUNCT_RE.test(firstText) && idx > 0) {
const prevSib = siblings[idx - 1];
if (prevSib.type === "text") {
const prevText = (prevSib as { value: string }).value;
if (CJK_CHAR_RE.test(prevText.charAt(prevText.length - 1))) {
(prevSib as { value: string }).value = prevText + " ";
}
}
}
}
}
});
let result = processor.stringify(tree);
if (options.emphasis) {
result = fixCjkEmphasisSpacing(result);
result = decodeHtmlEntities(result);
}
return result;
}

File diff suppressed because it is too large Load Diff

View File

@ -1,5 +1,6 @@
{
"dependencies": {
"remark-cjk-friendly": "^1.1.0",
"remark-frontmatter": "^5.0.0",
"remark-gfm": "^4.0.1",
"remark-parse": "^11.0.0",