refactor(baoyu-format-markdown): use remark-cjk-friendly for CJK emphasis

Replace the custom CJK emphasis handling with the remark-cjk-friendly library, significantly simplifying the codebase.
2026-02-01 02:13:58 -06:00 · 2026-02-01 02:13:58 -06:00 · 3a5866eb4b
parent c0162bb3af
commit 3a5866eb4b
5 changed files with 1137 additions and 371 deletions
--- a/skills/baoyu-format-markdown/SKILL.md
+++ b/skills/baoyu-format-markdown/SKILL.md
@ -13,8 +13,7 @@ Scripts in `scripts/` subdirectory. Replace `${SKILL_DIR}` with this SKILL.md's

 | Script | Purpose |
 |--------|---------|
-| `scripts/main.ts` | Main entry point with CLI options |
-| `scripts/cjk-emphasis.ts` | Fix CJK emphasis/bold punctuation issues |
+| `scripts/main.ts` | Main entry point with CLI options (uses remark-cjk-friendly for CJK emphasis) |
 | `scripts/quotes.ts` | Replace ASCII quotes with fullwidth quotes |
 | `scripts/autocorrect.ts` | Add CJK/English spacing via autocorrect |

--- a/skills/baoyu-format-markdown/scripts/cjk-emphasis.ts
+++ b/skills/baoyu-format-markdown/scripts/cjk-emphasis.ts
@ -1,314 +0,0 @@
-const CJK_PUNCT_COMMON = "。．，、？！：；";
-const CJK_OPENING_PUNCT = "（〔〖〘〚「『〈《【\u201C\u2018";
-const CJK_CLOSING_PUNCT = "）〕〗〙〛」』〉》】\u201D\u2019" + CJK_PUNCT_COMMON;
-const CJK_SCRIPTS =
-  "\\p{Script=Han}\\p{Script=Hiragana}\\p{Script=Katakana}\\p{Script=Hangul}";
-
-export const CJK_CLOSING_PUNCT_RE = new RegExp(`[${CJK_CLOSING_PUNCT}]`);
-export const CJK_OPENING_PUNCT_RE = new RegExp(`^[${CJK_OPENING_PUNCT}]`);
-export const CJK_CHAR_RE = new RegExp(`[${CJK_SCRIPTS}]`, "u");
-
-const PUNCT_OR_SYMBOL_RE = /[\p{P}\p{S}]/u;
-const WORD_CHAR_RE = /[\p{L}\p{N}]/u;
-
-const CJK_PUNCT_PAIRS: Record<string, string> = {
-  "“": "”",
-  "‘": "’",
-  "（": "）",
-  "〔": "〕",
-  "〖": "〗",
-  "〘": "〙",
-  "〚": "〛",
-  "「": "」",
-  "『": "』",
-  "〈": "〉",
-  "《": "》",
-  "【": "】",
-};
-
-function findInlineCodeRanges(text: string): Array<[number, number]> {
-  const ranges: Array<[number, number]> = [];
-  let i = 0;
-  while (i < text.length) {
-    if (text[i] !== "`") {
-      i += 1;
-      continue;
-    }
-
-    let run = 1;
-    while (i + run < text.length && text[i + run] === "`") {
-      run += 1;
-    }
-
-    const start = i;
-    let j = i + run;
-    let found = false;
-    while (j < text.length) {
-      if (text[j] !== "`") {
-        j += 1;
-        continue;
-      }
-      let closeRun = 1;
-      while (j + closeRun < text.length && text[j + closeRun] === "`") {
-        closeRun += 1;
-      }
-      if (closeRun === run) {
-        ranges.push([start, j + closeRun - 1]);
-        i = j + closeRun;
-        found = true;
-        break;
-      }
-      j += closeRun;
-    }
-
-    if (!found) {
-      i = start + run;
-    }
-  }
-  return ranges;
-}
-
-function isEscaped(text: string, pos: number): boolean {
-  let count = 0;
-  for (let i = pos - 1; i >= 0 && text[i] === "\\"; i -= 1) {
-    count += 1;
-  }
-  return count % 2 === 1;
-}
-
-function isWhitespaceChar(ch: string | undefined): boolean {
-  return !ch || /\s/u.test(ch);
-}
-
-function isPunctuationOrSymbol(ch: string | undefined): boolean {
-  return !!ch && PUNCT_OR_SYMBOL_RE.test(ch);
-}
-
-function isMatchingCjkPunct(open: string, close: string): boolean {
-  return CJK_PUNCT_PAIRS[open] === close;
-}
-
-function mapNonCodeSegments(
-  block: string,
-  mapper: (segment: string) => string
-): string {
-  const codeRanges = findInlineCodeRanges(block);
-  if (codeRanges.length === 0) {
-    return mapper(block);
-  }
-
-  let result = "";
-  let lastIndex = 0;
-  for (const [start, end] of codeRanges) {
-    if (start > lastIndex) {
-      result += mapper(block.slice(lastIndex, start));
-    }
-    result += block.slice(start, end + 1);
-    lastIndex = end + 1;
-  }
-  if (lastIndex < block.length) {
-    result += mapper(block.slice(lastIndex));
-  }
-  return result;
-}
-
-function moveCjkPunctuationOutsideEmphasis(block: string): string {
-  return mapNonCodeSegments(block, (segment) => {
-    const delimiterPositions: number[] = [];
-    let cursor = 0;
-    while (cursor < segment.length - 1) {
-      if (
-        segment[cursor] === "*" &&
-        segment[cursor + 1] === "*" &&
-        segment[cursor - 1] !== "*" &&
-        segment[cursor + 2] !== "*" &&
-        !isEscaped(segment, cursor)
-      ) {
-        delimiterPositions.push(cursor);
-        cursor += 2;
-        continue;
-      }
-      cursor += 1;
-    }
-
-    if (delimiterPositions.length < 2) return segment;
-
-    const stack: number[] = [];
-    const pairs: Array<{ open: number; close: number }> = [];
-    for (const pos of delimiterPositions) {
-      if (stack.length === 0) {
-        stack.push(pos);
-      } else {
-        const open = stack.pop() as number;
-        pairs.push({ open, close: pos });
-      }
-    }
-
-    const skip = new Set<number>();
-    const insertBefore = new Map<number, string>();
-
-    for (const pair of pairs) {
-      const openPunctPos = pair.open + 2;
-      const closePunctPos = pair.close - 1;
-      if (openPunctPos >= closePunctPos) continue;
-
-      const openPunct = segment[openPunctPos];
-      const closePunct = segment[closePunctPos];
-      if (!openPunct || !closePunct) continue;
-      if (!CJK_OPENING_PUNCT_RE.test(openPunct)) continue;
-      if (!CJK_CLOSING_PUNCT_RE.test(closePunct)) continue;
-      if (!isMatchingCjkPunct(openPunct, closePunct)) continue;
-      if (openPunctPos + 1 >= closePunctPos) continue;
-
-      const inner = segment.slice(openPunctPos + 1, closePunctPos);
-      if (inner.length === 0) continue;
-
-      skip.add(openPunctPos);
-      skip.add(closePunctPos);
-
-      insertBefore.set(pair.open, (insertBefore.get(pair.open) ?? "") + openPunct);
-      const afterClose = pair.close + 2;
-      insertBefore.set(
-        afterClose,
-        (insertBefore.get(afterClose) ?? "") + closePunct
-      );
-    }
-
-    if (skip.size === 0) return segment;
-
-    let result = "";
-    for (let idx = 0; idx < segment.length; idx += 1) {
-      const insert = insertBefore.get(idx);
-      if (insert) {
-        result += insert;
-      }
-      if (skip.has(idx)) {
-        continue;
-      }
-      result += segment[idx];
-    }
-    const tailInsert = insertBefore.get(segment.length);
-    if (tailInsert) {
-      result += tailInsert;
-    }
-    return result;
-  });
-}
-
-function fixCjkEmphasisSpacingInBlock(block: string): string {
-  const normalized = moveCjkPunctuationOutsideEmphasis(block);
-  const codeRanges = findInlineCodeRanges(normalized);
-  let rangeIndex = 0;
-
-  const delimiters: Array<{
-    pos: number;
-    canOpen: boolean;
-    canClose: boolean;
-  }> = [];
-  let cursor = 0;
-  while (cursor < normalized.length - 1) {
-    if (rangeIndex < codeRanges.length && cursor >= codeRanges[rangeIndex][0]) {
-      if (cursor <= codeRanges[rangeIndex][1]) {
-        cursor = codeRanges[rangeIndex][1] + 1;
-        continue;
-      }
-      rangeIndex += 1;
-      continue;
-    }
-
-    if (
-      normalized[cursor] === "*" &&
-      normalized[cursor + 1] === "*" &&
-      normalized[cursor - 1] !== "*" &&
-      normalized[cursor + 2] !== "*" &&
-      !isEscaped(normalized, cursor)
-    ) {
-      const before = normalized[cursor - 1];
-      const after = normalized[cursor + 2];
-      const beforeIsSpace = isWhitespaceChar(before);
-      const afterIsSpace = isWhitespaceChar(after);
-      const beforeIsPunct = isPunctuationOrSymbol(before);
-      const afterIsPunct = isPunctuationOrSymbol(after);
-      const leftFlanking =
-        !afterIsSpace && (!afterIsPunct || beforeIsSpace || beforeIsPunct);
-      const rightFlanking =
-        !beforeIsSpace && (!beforeIsPunct || afterIsSpace || afterIsPunct);
-      const cjkPunctBefore = !!before && CJK_CLOSING_PUNCT_RE.test(before);
-      const wordAfter = !!after && WORD_CHAR_RE.test(after);
-
-      delimiters.push({
-        pos: cursor,
-        canOpen: leftFlanking,
-        canClose: rightFlanking || (cjkPunctBefore && wordAfter),
-      });
-      cursor += 2;
-      continue;
-    }
-
-    cursor += 1;
-  }
-
-  const stack: Array<{ pos: number }> = [];
-  const pairs: Array<{ open: number; close: number }> = [];
-  for (const delimiter of delimiters) {
-    if (delimiter.canClose) {
-      let openerIndex = -1;
-      for (let j = stack.length - 1; j >= 0; j -= 1) {
-        openerIndex = j;
-        break;
-      }
-      if (openerIndex !== -1) {
-        const opener = stack.splice(openerIndex, 1)[0];
-        pairs.push({ open: opener.pos, close: delimiter.pos });
-      }
-    }
-    if (delimiter.canOpen) {
-      stack.push({ pos: delimiter.pos });
-    }
-  }
-
-  if (pairs.length === 0) return normalized;
-
-  const insertPositions = new Set<number>();
-  for (const pair of pairs) {
-    const insideLast = normalized[pair.close - 1];
-    const afterClose = normalized[pair.close + 2];
-    if (!afterClose) continue;
-    if (
-      CJK_CLOSING_PUNCT_RE.test(insideLast) &&
-      WORD_CHAR_RE.test(afterClose)
-    ) {
-      insertPositions.add(pair.close + 2);
-    }
-  }
-
-  if (insertPositions.size === 0) return normalized;
-
-  let result = "";
-  for (let idx = 0; idx < normalized.length; idx += 1) {
-    if (insertPositions.has(idx)) {
-      result += " ";
-    }
-    result += normalized[idx];
-  }
-  if (insertPositions.has(normalized.length)) {
-    result += " ";
-  }
-  return result;
-}
-
-export function fixCjkEmphasisSpacing(content: string): string {
-  const parts = content.split(/(^```[\s\S]*?^```|^~~~[\s\S]*?^~~~)/m);
-  return parts
-    .map((part, i) => {
-      if (i % 2 === 1) return part;
-      const blocks = part.split(/(\n\s*\n+)/);
-      return blocks
-        .map((block, index) => {
-          if (index % 2 === 1) return block;
-          return fixCjkEmphasisSpacingInBlock(block);
-        })
-        .join("");
-    })
-    .join("");
-}
--- a/skills/baoyu-format-markdown/scripts/main.ts
+++ b/skills/baoyu-format-markdown/scripts/main.ts
@ -1,17 +1,12 @@
 import { readFileSync, writeFileSync } from "fs";
 import { unified } from "unified";
 import remarkParse from "remark-parse";
+import remarkCjkFriendly from "remark-cjk-friendly";
 import remarkGfm from "remark-gfm";
 import remarkFrontmatter from "remark-frontmatter";
 import remarkStringify from "remark-stringify";
 import { visit } from "unist-util-visit";
 import YAML from "yaml";
-import {
-  fixCjkEmphasisSpacing,
-  CJK_CLOSING_PUNCT_RE,
-  CJK_OPENING_PUNCT_RE,
-  CJK_CHAR_RE,
-} from "./cjk-emphasis";
 import { replaceQuotes } from "./quotes";
 import { applyAutocorrect } from "./autocorrect";

@ -36,6 +31,12 @@ const DEFAULT_OPTIONS: Required<FormatOptions> = {
  emphasis: true,
 };

+function decodeHtmlEntities(text: string): string {
+  return text.replace(/&#x([0-9A-Fa-f]+);/g, (_, hex) =>
+    String.fromCodePoint(parseInt(hex, 16))
+  );
+}
+
 function formatFrontmatter(value: string): string | null {
  try {
    const doc = YAML.parseDocument(value);
@ -49,12 +50,9 @@ function formatMarkdownContent(
  content: string,
  options: Required<FormatOptions>
 ): string {
-  if (options.emphasis) {
-    content = fixCjkEmphasisSpacing(content);
-  }
-
  const processor = unified()
    .use(remarkParse)
+    .use(options.emphasis ? remarkCjkFriendly : [])
    .use(remarkGfm)
    .use(remarkFrontmatter, ["yaml"])
    .use(remarkStringify, {
@ -63,7 +61,7 @@ function formatMarkdownContent(

  const tree = processor.parse(content);

-  visit(tree, (node, _index, parent) => {
+  visit(tree, (node) => {
    if (node.type === "text" && options.quotes) {
      const textNode = node as { value: string };
      textNode.value = replaceQuotes(textNode.value);
@ -77,54 +75,11 @@ function formatMarkdownContent(
      }
      return;
    }
-    if (
-      options.emphasis &&
-      (node.type === "strong" ||
-        node.type === "emphasis" ||
-        node.type === "delete") &&
-      parent
-    ) {
-      const siblings = (parent as { children: typeof node[] }).children;
-      const idx = siblings.indexOf(node);
-      const children = (node as { children: typeof node[] }).children;
-      if (!children || children.length === 0) return;
-
-      const lastChild = children[children.length - 1];
-      if (lastChild.type === "text") {
-        const lastText = (lastChild as { value: string }).value;
-        if (
-          CJK_CLOSING_PUNCT_RE.test(lastText.slice(-1)) &&
-          idx + 1 < siblings.length
-        ) {
-          const nextSib = siblings[idx + 1];
-          if (nextSib.type === "text") {
-            const nextText = (nextSib as { value: string }).value;
-            if (CJK_CHAR_RE.test(nextText.charAt(0))) {
-              (nextSib as { value: string }).value = " " + nextText;
-            }
-          }
-        }
-      }
-
-      const firstChild = children[0];
-      if (firstChild.type === "text") {
-        const firstText = (firstChild as { value: string }).value;
-        if (CJK_OPENING_PUNCT_RE.test(firstText) && idx > 0) {
-          const prevSib = siblings[idx - 1];
-          if (prevSib.type === "text") {
-            const prevText = (prevSib as { value: string }).value;
-            if (CJK_CHAR_RE.test(prevText.charAt(prevText.length - 1))) {
-              (prevSib as { value: string }).value = prevText + " ";
-            }
-          }
-        }
-      }
-    }
  });

  let result = processor.stringify(tree);
  if (options.emphasis) {
-    result = fixCjkEmphasisSpacing(result);
+    result = decodeHtmlEntities(result);
  }
  return result;
 }
--- a/skills/baoyu-format-markdown/scripts/package-lock.json
+++ b/skills/baoyu-format-markdown/scripts/package-lock.json
--- a/skills/baoyu-format-markdown/scripts/package.json
+++ b/skills/baoyu-format-markdown/scripts/package.json
@ -1,5 +1,6 @@
 {
  "dependencies": {
+    "remark-cjk-friendly": "^1.1.0",
    "remark-frontmatter": "^5.0.0",
    "remark-gfm": "^4.0.1",
    "remark-parse": "^11.0.0",