From 7e07c1bb8425fcc710ac1769abd922f281c064fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20Liu=20=E5=AE=9D=E7=8E=89?= Date: Fri, 13 Mar 2026 15:27:25 -0500 Subject: [PATCH] refactor(baoyu-translate): replace remark/unified with markdown-it for chunk parsing Simplifies dependencies and adds main.ts CLI entry point with exported functions for programmatic reuse. --- skills/baoyu-translate/SKILL.md | 7 +- skills/baoyu-translate/scripts/bun.lock | 153 +------ skills/baoyu-translate/scripts/chunk.ts | 440 ++++++++++++++------ skills/baoyu-translate/scripts/main.ts | 55 +++ skills/baoyu-translate/scripts/package.json | 10 +- 5 files changed, 391 insertions(+), 274 deletions(-) create mode 100644 skills/baoyu-translate/scripts/main.ts diff --git a/skills/baoyu-translate/SKILL.md b/skills/baoyu-translate/SKILL.md index 1941744..feaa4f6 100644 --- a/skills/baoyu-translate/SKILL.md +++ b/skills/baoyu-translate/SKILL.md @@ -21,7 +21,8 @@ Scripts in `scripts/` subdirectory. `{baseDir}` = this SKILL.md's directory path | Script | Purpose | |--------|---------| -| `scripts/chunk.ts` | Split markdown into chunks by AST blocks (sections, headings, paragraphs), with line/word fallback for oversized blocks. Use `--output-dir ` to write chunks into `/chunks/` instead of `/chunks/` | +| `scripts/main.ts` | CLI entry point. Default action splits markdown into chunks; also supports explicit `chunk` subcommand | +| `scripts/chunk.ts` | Markdown chunking implementation used by `main.ts` and kept compatible for direct invocation | ## Preferences (EXTEND.md) @@ -183,8 +184,8 @@ Before translating chunks: 1. **Extract terminology**: Scan entire document for proper nouns, technical terms, recurring phrases 2. **Build session glossary**: Merge extracted terms with loaded glossaries, establish consistent translations -3. **Split into chunks**: Use `${BUN_X} {baseDir}/scripts/chunk.ts [--max-words ] [--output-dir ]` - - Parses markdown AST (headings, paragraphs, lists, code blocks, tables, etc.) +3. **Split into chunks**: Use `${BUN_X} {baseDir}/scripts/main.ts [--max-words ] [--output-dir ]` + - Parses markdown blocks (headings, paragraphs, lists, code blocks, tables, etc.) - Splits at markdown block boundaries to preserve structure - If a single block exceeds the threshold, falls back to line splitting, then word splitting 4. **Assemble translation prompt**: diff --git a/skills/baoyu-translate/scripts/bun.lock b/skills/baoyu-translate/scripts/bun.lock index 824cd43..0dbc2e7 100644 --- a/skills/baoyu-translate/scripts/bun.lock +++ b/skills/baoyu-translate/scripts/bun.lock @@ -2,160 +2,25 @@ "lockfileVersion": 1, "workspaces": { "": { + "name": "baoyu-translate-chunk", "dependencies": { - "remark-frontmatter": "^5.0.0", - "remark-gfm": "^4.0.1", - "remark-parse": "^11.0.0", - "remark-stringify": "^11.0.0", - "unified": "^11.0.5", + "markdown-it": "14.1.1", }, }, }, "packages": { - "@types/debug": ["@types/debug@4.1.12", "", { "dependencies": { "@types/ms": "*" } }, "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ=="], + "argparse": ["argparse@2.0.1", "", {}, "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q=="], - "@types/mdast": ["@types/mdast@4.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA=="], + "entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], - "@types/ms": ["@types/ms@2.1.0", "", {}, "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA=="], + "linkify-it": ["linkify-it@5.0.0", "", { "dependencies": { "uc.micro": "^2.0.0" } }, "sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ=="], - "@types/unist": ["@types/unist@3.0.3", "", {}, "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q=="], + "markdown-it": ["markdown-it@14.1.1", "", { "dependencies": { "argparse": "^2.0.1", "entities": "^4.4.0", "linkify-it": "^5.0.0", "mdurl": "^2.0.0", "punycode.js": "^2.3.1", "uc.micro": "^2.1.0" }, "bin": { "markdown-it": "bin/markdown-it.mjs" } }, "sha512-BuU2qnTti9YKgK5N+IeMubp14ZUKUUw7yeJbkjtosvHiP0AZ5c8IAgEMk79D0eC8F23r4Ac/q8cAIFdm2FtyoA=="], - "bail": ["bail@2.0.2", "", {}, "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw=="], + "mdurl": ["mdurl@2.0.0", "", {}, "sha512-Lf+9+2r+Tdp5wXDXC4PcIBjTDtq4UKjCPMQhKIuzpJNW0b96kVqSwW0bT7FhRSfmAiFYgP+SCRvdrDozfh0U5w=="], - "ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="], + "punycode.js": ["punycode.js@2.3.1", "", {}, "sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA=="], - "character-entities": ["character-entities@2.0.2", "", {}, "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ=="], - - "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], - - "decode-named-character-reference": ["decode-named-character-reference@1.3.0", "", { "dependencies": { "character-entities": "^2.0.0" } }, "sha512-GtpQYB283KrPp6nRw50q3U9/VfOutZOe103qlN7BPP6Ad27xYnOIWv4lPzo8HCAL+mMZofJ9KEy30fq6MfaK6Q=="], - - "dequal": ["dequal@2.0.3", "", {}, "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA=="], - - "devlop": ["devlop@1.1.0", "", { "dependencies": { "dequal": "^2.0.0" } }, "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA=="], - - "escape-string-regexp": ["escape-string-regexp@5.0.0", "", {}, "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw=="], - - "extend": ["extend@3.0.2", "", {}, "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g=="], - - "fault": ["fault@2.0.1", "", { "dependencies": { "format": "^0.2.0" } }, "sha512-WtySTkS4OKev5JtpHXnib4Gxiurzh5NCGvWrFaZ34m6JehfTUhKZvn9njTfw48t6JumVQOmrKqpmGcdwxnhqBQ=="], - - "format": ["format@0.2.2", "", {}, "sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww=="], - - "is-plain-obj": ["is-plain-obj@4.1.0", "", {}, "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg=="], - - "longest-streak": ["longest-streak@3.1.0", "", {}, "sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g=="], - - "markdown-table": ["markdown-table@3.0.4", "", {}, "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw=="], - - "mdast-util-find-and-replace": ["mdast-util-find-and-replace@3.0.2", "", { "dependencies": { "@types/mdast": "^4.0.0", "escape-string-regexp": "^5.0.0", "unist-util-is": "^6.0.0", "unist-util-visit-parents": "^6.0.0" } }, "sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg=="], - - "mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.3", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-W4mAWTvSlKvf8L6J+VN9yLSqQ9AOAAvHuoDAmPkz4dHf553m5gVj2ejadHJhoJmcmxEnOv6Pa8XJhpxE93kb8Q=="], - - "mdast-util-frontmatter": ["mdast-util-frontmatter@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.0.0", "escape-string-regexp": "^5.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0", "micromark-extension-frontmatter": "^2.0.0" } }, "sha512-LRqI9+wdgC25P0URIJY9vwocIzCcksduHQ9OF2joxQoyTNVduwLAFUzjoopuRJbJAReaKrNQKAZKL3uCMugWJA=="], - - "mdast-util-gfm": ["mdast-util-gfm@3.1.0", "", { "dependencies": { "mdast-util-from-markdown": "^2.0.0", "mdast-util-gfm-autolink-literal": "^2.0.0", "mdast-util-gfm-footnote": "^2.0.0", "mdast-util-gfm-strikethrough": "^2.0.0", "mdast-util-gfm-table": "^2.0.0", "mdast-util-gfm-task-list-item": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ=="], - - "mdast-util-gfm-autolink-literal": ["mdast-util-gfm-autolink-literal@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "ccount": "^2.0.0", "devlop": "^1.0.0", "mdast-util-find-and-replace": "^3.0.0", "micromark-util-character": "^2.0.0" } }, "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ=="], - - "mdast-util-gfm-footnote": ["mdast-util-gfm-footnote@2.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.1.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0" } }, "sha512-sqpDWlsHn7Ac9GNZQMeUzPQSMzR6Wv0WKRNvQRg0KqHh02fpTz69Qc1QSseNX29bhz1ROIyNyxExfawVKTm1GQ=="], - - "mdast-util-gfm-strikethrough": ["mdast-util-gfm-strikethrough@2.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg=="], - - "mdast-util-gfm-table": ["mdast-util-gfm-table@2.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.0.0", "markdown-table": "^3.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg=="], - - "mdast-util-gfm-task-list-item": ["mdast-util-gfm-task-list-item@2.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ=="], - - "mdast-util-phrasing": ["mdast-util-phrasing@4.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "unist-util-is": "^6.0.0" } }, "sha512-TqICwyvJJpBwvGAMZjj4J2n0X8QWp21b9l0o7eXyVJ25YNWYbJDVIyD1bZXE6WtV6RmKJVYmQAKWa0zWOABz2w=="], - - "mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.2", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", "micromark-util-classify-character": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, "sha512-xj68wMTvGXVOKonmog6LwyJKrYXZPvlwabaryTjLh9LuvovB/KAH+kvi8Gjj+7rJjsFi23nkUxRQv1KqSroMqA=="], - - "mdast-util-to-string": ["mdast-util-to-string@4.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0" } }, "sha512-0H44vDimn51F0YwvxSJSm0eCDOJTRlmN0R1yBh4HLj9wiV1Dn0QoXGbvFAWj2hSItVTlCmBF1hqKlIyUBVFLPg=="], - - "micromark": ["micromark@4.0.2", "", { "dependencies": { "@types/debug": "^4.0.0", "debug": "^4.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "micromark-core-commonmark": "^2.0.0", "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-combine-extensions": "^2.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-encode": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-resolve-all": "^2.0.0", "micromark-util-sanitize-uri": "^2.0.0", "micromark-util-subtokenize": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-zpe98Q6kvavpCr1NPVSCMebCKfD7CA2NqZ+rykeNhONIJBpc1tFKt9hucLGwha3jNTNI8lHpctWJWoimVF4PfA=="], - - "micromark-core-commonmark": ["micromark-core-commonmark@2.0.3", "", { "dependencies": { "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "micromark-factory-destination": "^2.0.0", "micromark-factory-label": "^2.0.0", "micromark-factory-space": "^2.0.0", "micromark-factory-title": "^2.0.0", "micromark-factory-whitespace": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-classify-character": "^2.0.0", "micromark-util-html-tag-name": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-resolve-all": "^2.0.0", "micromark-util-subtokenize": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-RDBrHEMSxVFLg6xvnXmb1Ayr2WzLAWjeSATAoxwKYJV94TeNavgoIdA0a9ytzDSVzBy2YKFK+emCPOEibLeCrg=="], - - "micromark-extension-frontmatter": ["micromark-extension-frontmatter@2.0.0", "", { "dependencies": { "fault": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-C4AkuM3dA58cgZha7zVnuVxBhDsbttIMiytjgsM2XbHAB2faRVaHRle40558FBN+DJcrLNCoqG5mlrpdU4cRtg=="], - - "micromark-extension-gfm": ["micromark-extension-gfm@3.0.0", "", { "dependencies": { "micromark-extension-gfm-autolink-literal": "^2.0.0", "micromark-extension-gfm-footnote": "^2.0.0", "micromark-extension-gfm-strikethrough": "^2.0.0", "micromark-extension-gfm-table": "^2.0.0", "micromark-extension-gfm-tagfilter": "^2.0.0", "micromark-extension-gfm-task-list-item": "^2.0.0", "micromark-util-combine-extensions": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w=="], - - "micromark-extension-gfm-autolink-literal": ["micromark-extension-gfm-autolink-literal@2.1.0", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-sanitize-uri": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw=="], - - "micromark-extension-gfm-footnote": ["micromark-extension-gfm-footnote@2.1.0", "", { "dependencies": { "devlop": "^1.0.0", "micromark-core-commonmark": "^2.0.0", "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-sanitize-uri": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw=="], - - "micromark-extension-gfm-strikethrough": ["micromark-extension-gfm-strikethrough@2.1.0", "", { "dependencies": { "devlop": "^1.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-classify-character": "^2.0.0", "micromark-util-resolve-all": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw=="], - - "micromark-extension-gfm-table": ["micromark-extension-gfm-table@2.1.1", "", { "dependencies": { "devlop": "^1.0.0", "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg=="], - - "micromark-extension-gfm-tagfilter": ["micromark-extension-gfm-tagfilter@2.0.0", "", { "dependencies": { "micromark-util-types": "^2.0.0" } }, "sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg=="], - - "micromark-extension-gfm-task-list-item": ["micromark-extension-gfm-task-list-item@2.1.0", "", { "dependencies": { "devlop": "^1.0.0", "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw=="], - - "micromark-factory-destination": ["micromark-factory-destination@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-Xe6rDdJlkmbFRExpTOmRj9N3MaWmbAgdpSrBQvCFqhezUn4AHqJHbaEnfbVYYiexVSs//tqOdY/DxhjdCiJnIA=="], - - "micromark-factory-label": ["micromark-factory-label@2.0.1", "", { "dependencies": { "devlop": "^1.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-VFMekyQExqIW7xIChcXn4ok29YE3rnuyveW3wZQWWqF4Nv9Wk5rgJ99KzPvHjkmPXF93FXIbBp6YdW3t71/7Vg=="], - - "micromark-factory-space": ["micromark-factory-space@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-zRkxjtBxxLd2Sc0d+fbnEunsTj46SWXgXciZmHq0kDYGnck/ZSGj9/wULTV95uoeYiK5hRXP2mJ98Uo4cq/LQg=="], - - "micromark-factory-title": ["micromark-factory-title@2.0.1", "", { "dependencies": { "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-5bZ+3CjhAd9eChYTHsjy6TGxpOFSKgKKJPJxr293jTbfry2KDoWkhBb6TcPVB4NmzaPhMs1Frm9AZH7OD4Cjzw=="], - - "micromark-factory-whitespace": ["micromark-factory-whitespace@2.0.1", "", { "dependencies": { "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-Ob0nuZ3PKt/n0hORHyvoD9uZhr+Za8sFoP+OnMcnWK5lngSzALgQYKMr9RJVOWLqQYuyn6ulqGWSXdwf6F80lQ=="], - - "micromark-util-character": ["micromark-util-character@2.1.1", "", { "dependencies": { "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-wv8tdUTJ3thSFFFJKtpYKOYiGP2+v96Hvk4Tu8KpCAsTMs6yi+nVmGh1syvSCsaxz45J6Jbw+9DD6g97+NV67Q=="], - - "micromark-util-chunked": ["micromark-util-chunked@2.0.1", "", { "dependencies": { "micromark-util-symbol": "^2.0.0" } }, "sha512-QUNFEOPELfmvv+4xiNg2sRYeS/P84pTW0TCgP5zc9FpXetHY0ab7SxKyAQCNCc1eK0459uoLI1y5oO5Vc1dbhA=="], - - "micromark-util-classify-character": ["micromark-util-classify-character@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-K0kHzM6afW/MbeWYWLjoHQv1sgg2Q9EccHEDzSkxiP/EaagNzCm7T/WMKZ3rjMbvIpvBiZgwR3dKMygtA4mG1Q=="], - - "micromark-util-combine-extensions": ["micromark-util-combine-extensions@2.0.1", "", { "dependencies": { "micromark-util-chunked": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-OnAnH8Ujmy59JcyZw8JSbK9cGpdVY44NKgSM7E9Eh7DiLS2E9RNQf0dONaGDzEG9yjEl5hcqeIsj4hfRkLH/Bg=="], - - "micromark-util-decode-numeric-character-reference": ["micromark-util-decode-numeric-character-reference@2.0.2", "", { "dependencies": { "micromark-util-symbol": "^2.0.0" } }, "sha512-ccUbYk6CwVdkmCQMyr64dXz42EfHGkPQlBj5p7YVGzq8I7CtjXZJrubAYezf7Rp+bjPseiROqe7G6foFd+lEuw=="], - - "micromark-util-decode-string": ["micromark-util-decode-string@2.0.1", "", { "dependencies": { "decode-named-character-reference": "^1.0.0", "micromark-util-character": "^2.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-symbol": "^2.0.0" } }, "sha512-nDV/77Fj6eH1ynwscYTOsbK7rR//Uj0bZXBwJZRfaLEJ1iGBR6kIfNmlNqaqJf649EP0F3NWNdeJi03elllNUQ=="], - - "micromark-util-encode": ["micromark-util-encode@2.0.1", "", {}, "sha512-c3cVx2y4KqUnwopcO9b/SCdo2O67LwJJ/UyqGfbigahfegL9myoEFoDYZgkT7f36T0bLrM9hZTAaAyH+PCAXjw=="], - - "micromark-util-html-tag-name": ["micromark-util-html-tag-name@2.0.1", "", {}, "sha512-2cNEiYDhCWKI+Gs9T0Tiysk136SnR13hhO8yW6BGNyhOC4qYFnwF1nKfD3HFAIXA5c45RrIG1ub11GiXeYd1xA=="], - - "micromark-util-normalize-identifier": ["micromark-util-normalize-identifier@2.0.1", "", { "dependencies": { "micromark-util-symbol": "^2.0.0" } }, "sha512-sxPqmo70LyARJs0w2UclACPUUEqltCkJ6PhKdMIDuJ3gSf/Q+/GIe3WKl0Ijb/GyH9lOpUkRAO2wp0GVkLvS9Q=="], - - "micromark-util-resolve-all": ["micromark-util-resolve-all@2.0.1", "", { "dependencies": { "micromark-util-types": "^2.0.0" } }, "sha512-VdQyxFWFT2/FGJgwQnJYbe1jjQoNTS4RjglmSjTUlpUMa95Htx9NHeYW4rGDJzbjvCsl9eLjMQwGeElsqmzcHg=="], - - "micromark-util-sanitize-uri": ["micromark-util-sanitize-uri@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-encode": "^2.0.0", "micromark-util-symbol": "^2.0.0" } }, "sha512-9N9IomZ/YuGGZZmQec1MbgxtlgougxTodVwDzzEouPKo3qFWvymFHWcnDi2vzV1ff6kas9ucW+o3yzJK9YB1AQ=="], - - "micromark-util-subtokenize": ["micromark-util-subtokenize@2.1.0", "", { "dependencies": { "devlop": "^1.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-XQLu552iSctvnEcgXw6+Sx75GflAPNED1qx7eBJ+wydBb2KCbRZe+NwvIEEMM83uml1+2WSXpBAcp9IUCgCYWA=="], - - "micromark-util-symbol": ["micromark-util-symbol@2.0.1", "", {}, "sha512-vs5t8Apaud9N28kgCrRUdEed4UJ+wWNvicHLPxCa9ENlYuAY31M0ETy5y1vA33YoNPDFTghEbnh6efaE8h4x0Q=="], - - "micromark-util-types": ["micromark-util-types@2.0.2", "", {}, "sha512-Yw0ECSpJoViF1qTU4DC6NwtC4aWGt1EkzaQB8KPPyCRR8z9TWeV0HbEFGTO+ZY1wB22zmxnJqhPyTpOVCpeHTA=="], - - "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], - - "remark-frontmatter": ["remark-frontmatter@5.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-frontmatter": "^2.0.0", "micromark-extension-frontmatter": "^2.0.0", "unified": "^11.0.0" } }, "sha512-XTFYvNASMe5iPN0719nPrdItC9aU0ssC4v14mH1BCi1u0n1gAocqcujWUrByftZTbLhRtiKRyjYTSIOcr69UVQ=="], - - "remark-gfm": ["remark-gfm@4.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-gfm": "^3.0.0", "micromark-extension-gfm": "^3.0.0", "remark-parse": "^11.0.0", "remark-stringify": "^11.0.0", "unified": "^11.0.0" } }, "sha512-1quofZ2RQ9EWdeN34S79+KExV1764+wCUGop5CPL1WGdD0ocPpu91lzPGbwWMECpEpd42kJGQwzRfyov9j4yNg=="], - - "remark-parse": ["remark-parse@11.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-from-markdown": "^2.0.0", "micromark-util-types": "^2.0.0", "unified": "^11.0.0" } }, "sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA=="], - - "remark-stringify": ["remark-stringify@11.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-to-markdown": "^2.0.0", "unified": "^11.0.0" } }, "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw=="], - - "trough": ["trough@2.2.0", "", {}, "sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw=="], - - "unified": ["unified@11.0.5", "", { "dependencies": { "@types/unist": "^3.0.0", "bail": "^2.0.0", "devlop": "^1.0.0", "extend": "^3.0.0", "is-plain-obj": "^4.0.0", "trough": "^2.0.0", "vfile": "^6.0.0" } }, "sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA=="], - - "unist-util-is": ["unist-util-is@6.0.1", "", { "dependencies": { "@types/unist": "^3.0.0" } }, "sha512-LsiILbtBETkDz8I9p1dQ0uyRUWuaQzd/cuEeS1hoRSyW5E5XGmTzlwY1OrNzzakGowI9Dr/I8HVaw4hTtnxy8g=="], - - "unist-util-stringify-position": ["unist-util-stringify-position@4.0.0", "", { "dependencies": { "@types/unist": "^3.0.0" } }, "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ=="], - - "unist-util-visit": ["unist-util-visit@5.1.0", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0", "unist-util-visit-parents": "^6.0.0" } }, "sha512-m+vIdyeCOpdr/QeQCu2EzxX/ohgS8KbnPDgFni4dQsfSCtpz8UqDyY5GjRru8PDKuYn7Fq19j1CQ+nJSsGKOzg=="], - - "unist-util-visit-parents": ["unist-util-visit-parents@6.0.2", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0" } }, "sha512-goh1s1TBrqSqukSc8wrjwWhL0hiJxgA8m4kFxGlQ+8FYQ3C/m11FcTs4YYem7V664AhHVvgoQLk890Ssdsr2IQ=="], - - "vfile": ["vfile@6.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "vfile-message": "^4.0.0" } }, "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q=="], - - "vfile-message": ["vfile-message@4.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw=="], - - "zwitch": ["zwitch@2.0.4", "", {}, "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A=="], + "uc.micro": ["uc.micro@2.1.0", "", {}, "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A=="], } } diff --git a/skills/baoyu-translate/scripts/chunk.ts b/skills/baoyu-translate/scripts/chunk.ts index e49d982..0291330 100644 --- a/skills/baoyu-translate/scripts/chunk.ts +++ b/skills/baoyu-translate/scripts/chunk.ts @@ -1,137 +1,335 @@ -import { readFileSync, writeFileSync, mkdirSync } from "fs" -import { basename, dirname, join } from "path" -import { unified } from "unified" -import remarkParse from "remark-parse" -import remarkGfm from "remark-gfm" -import remarkFrontmatter from "remark-frontmatter" -import remarkStringify from "remark-stringify" -import type { Root, Content } from "mdast" +import { mkdirSync, readFileSync, writeFileSync } from "fs" +import { dirname, join } from "path" +import MarkdownIt from "markdown-it" -const args = process.argv.slice(2) -const file = args.find(a => !a.startsWith("--")) -const maxWords = parseInt(args[args.indexOf("--max-words") + 1] || "5000") -const outputDir = args.indexOf("--output-dir") !== -1 ? args[args.indexOf("--output-dir") + 1] : "" +type BlockKind = + | "heading" + | "thematicBreak" + | "html" + | "code" + | "flow" -if (!file) { - console.error("Usage: chunk.ts [--max-words 5000]") - process.exit(1) +interface Block { + kind: BlockKind + md: string + words: number } -const content = readFileSync(file, "utf-8") +interface Chunk { + blocks: Block[] + words: number +} -const tree = unified() - .use(remarkParse) - .use(remarkGfm) - .use(remarkFrontmatter, ["yaml"]) - .parse(content) +export interface ChunkCliOptions { + file: string + maxWords: number + outputDir: string +} -const stringify = unified() - .use(remarkStringify, { bullet: "-", emphasis: "*", strong: "*" }) - .use(remarkGfm) - .use(remarkFrontmatter, ["yaml"]) +export interface ChunkResult { + source: string + chunks: number + output_dir: string + frontmatter: boolean + words_per_chunk: number[] +} -function nodeToMd(node: Content): string { - const root: Root = { type: "root", children: [node] } - return stringify.stringify(root).trim() +const parser = new MarkdownIt({ html: true }) + +export function formatChunkUsage(command: string): string { + return `Usage: ${command} [--max-words 5000] [--output-dir ]` +} + +export function runChunkCli(args: string[], command = "chunk.ts"): number { + const parsed = parseChunkCliArgs(args) + + if ("help" in parsed) { + console.log(formatChunkUsage(command)) + return 0 + } + + if ("error" in parsed) { + console.error(parsed.error) + console.error(formatChunkUsage(command)) + return 1 + } + + const result = chunkMarkdownFile(parsed.file, { + maxWords: parsed.maxWords, + outputDir: parsed.outputDir, + }) + + console.log(JSON.stringify(result)) + return 0 +} + +export function chunkMarkdownFile( + file: string, + options: { maxWords?: number; outputDir?: string } = {} +): ChunkResult { + const maxWords = options.maxWords ?? 5000 + const outputDir = options.outputDir ?? "" + + const rawContent = normalizeNewlines(readFileSync(file, "utf-8")) + const { frontmatter, body } = extractFrontmatter(rawContent) + const chunks = buildChunks(parseMarkdown(body), maxWords) + + const dir = outputDir ? join(outputDir, "chunks") : join(dirname(file), "chunks") + mkdirSync(dir, { recursive: true }) + + if (frontmatter) { + writeFileSync(join(dir, "frontmatter.md"), frontmatter) + } + + chunks.forEach((chunk, index) => { + const num = String(index + 1).padStart(2, "0") + writeFileSync(join(dir, `chunk-${num}.md`), chunk.blocks.map(block => block.md).join("\n\n")) + }) + + return { + source: file, + chunks: chunks.length, + output_dir: dir, + frontmatter: Boolean(frontmatter), + words_per_chunk: chunks.map(chunk => chunk.words), + } +} + +function parseChunkCliArgs(args: string[]): + | ChunkCliOptions + | { help: true } + | { error: string } { + let file = "" + let maxWords = 5000 + let outputDir = "" + + for (let index = 0; index < args.length; index += 1) { + const arg = args[index] + + if (arg === "-h" || arg === "--help") { + return { help: true } + } + + if (arg === "--max-words") { + const value = args[index + 1] + if (!value) return { error: "Missing value for --max-words" } + maxWords = parsePositiveInt(value, 0) + if (maxWords <= 0) return { error: `Invalid --max-words value: ${value}` } + index += 1 + continue + } + + if (arg === "--output-dir") { + const value = args[index + 1] + if (!value) return { error: "Missing value for --output-dir" } + outputDir = value + index += 1 + continue + } + + if (arg.startsWith("-")) { + return { error: `Unknown option: ${arg}` } + } + + if (!file) { + file = arg + continue + } + + return { error: `Unexpected positional argument: ${arg}` } + } + + if (!file) { + return { error: "Missing input file" } + } + + return { file, maxWords, outputDir } +} + +function parsePositiveInt(value: string | undefined, fallback: number): number { + if (!value) return fallback + const parsed = Number.parseInt(value, 10) + return Number.isFinite(parsed) && parsed > 0 ? parsed : fallback +} + +function normalizeNewlines(text: string): string { + return text.replace(/^\uFEFF/, "").replace(/\r\n?/g, "\n") +} + +function trimBoundaryBlankLines(text: string): string { + return text.replace(/^\n+/, "").replace(/\n+$/, "") +} + +function extractFrontmatter(content: string): { frontmatter: string; body: string } { + const lines = content.split("\n") + if (lines[0] !== "---") { + return { frontmatter: "", body: content } + } + + for (let index = 1; index < lines.length; index += 1) { + if (lines[index] === "---" || lines[index] === "...") { + return { + frontmatter: lines.slice(0, index + 1).join("\n"), + body: lines.slice(index + 1).join("\n").replace(/^\n+/, ""), + } + } + } + + return { frontmatter: "", body: content } +} + +function parseMarkdown(content: string): Block[] { + if (!content.trim()) return [] + + const lines = content.split("\n") + const tokens = parser.parse(content, {}) + const blocks: Block[] = [] + + for (const token of tokens) { + if (!token.map || token.level !== 0) continue + if (token.nesting !== 1 && token.nesting !== 0) continue + + const [startLine, endLine] = token.map + const md = trimBoundaryBlankLines(lines.slice(startLine, endLine).join("\n")) + if (!md) continue + + blocks.push(makeBlock(tokenTypeToBlockKind(token.type), md)) + } + + if (blocks.length === 0) { + const body = trimBoundaryBlankLines(content) + if (body) { + blocks.push(makeBlock("flow", body)) + } + } + + return blocks +} + +function tokenTypeToBlockKind(tokenType: string): BlockKind { + if (tokenType === "heading_open") return "heading" + if (tokenType === "hr") return "thematicBreak" + if (tokenType === "html_block") return "html" + if (tokenType === "fence" || tokenType === "code_block") return "code" + return "flow" +} + +function makeBlock(kind: BlockKind, md: string): Block { + return { + kind, + md: trimBoundaryBlankLines(md), + words: countWords(md), + } +} + +function buildChunks(blocks: Block[], maxWordsPerChunk: number): Chunk[] { + const sections = splitIntoSections(blocks) + const normalizedBlocks: Block[] = [] + + for (const section of sections) { + const sectionWords = section.reduce((sum, block) => sum + block.words, 0) + if (sectionWords <= maxWordsPerChunk) { + normalizedBlocks.push(makeBlock("flow", section.map(block => block.md).join("\n\n"))) + continue + } + + for (const block of section) { + normalizedBlocks.push(...splitOversizedBlock(block, maxWordsPerChunk)) + } + } + + const chunks: Chunk[] = [] + let currentBlocks: Block[] = [] + let currentWords = 0 + + for (const block of normalizedBlocks) { + if (currentWords + block.words > maxWordsPerChunk && currentBlocks.length > 0) { + chunks.push({ blocks: currentBlocks, words: currentWords }) + currentBlocks = [block] + currentWords = block.words + continue + } + + currentBlocks.push(block) + currentWords += block.words + } + + if (currentBlocks.length > 0) { + chunks.push({ blocks: currentBlocks, words: currentWords }) + } + + return chunks +} + +function splitIntoSections(blocks: Block[]): Block[][] { + const sections: Block[][] = [] + let current: Block[] = [] + + for (const block of blocks) { + if (block.kind === "heading" && current.length > 0) { + sections.push(current) + current = [block] + continue + } + + current.push(block) + } + + if (current.length > 0) { + sections.push(current) + } + + return sections +} + +function splitOversizedBlock(block: Block, maxWordsPerChunk: number): Block[] { + if (block.words <= maxWordsPerChunk) return [block] + + if ( + block.kind === "heading" + || block.kind === "thematicBreak" + || block.kind === "html" + || block.kind === "code" + ) { + return [block] + } + + const lines = block.md.split("\n") + if (lines.length <= 1) { + return [block] + } + + const splitBlocks: Block[] = [] + let buffer: string[] = [] + let bufferWords = 0 + + for (const line of lines) { + const lineWords = countWords(line) + if (bufferWords + lineWords > maxWordsPerChunk && buffer.length > 0) { + splitBlocks.push(makeBlock(block.kind, buffer.join("\n"))) + buffer = [line] + bufferWords = lineWords + continue + } + + buffer.push(line) + bufferWords += lineWords + } + + if (buffer.length > 0) { + splitBlocks.push(makeBlock(block.kind, buffer.join("\n"))) + } + + return splitBlocks } function countWords(text: string): number { const cleaned = text.replace(/[#*`\[\]()>|_~-]/g, " ") const cjk = cleaned.match(/[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]/g) const latin = cleaned.match(/[a-zA-Z0-9]+/g) - return (cjk?.length || 0) + (latin?.length || 0) + return (cjk?.length ?? 0) + (latin?.length ?? 0) } -interface Block { - md: string - words: number +if (import.meta.main) { + process.exit(runChunkCli(process.argv.slice(2), process.argv[1] ?? "chunk.ts")) } - -function splitNodeToBlocks(node: Content): Block[] { - const md = nodeToMd(node) - const words = countWords(md) - - if (words <= maxWords) return [{ md, words }] - - if (node.type === "heading" || node.type === "thematicBreak" || node.type === "html") { - return [{ md, words }] - } - - if ("children" in node && Array.isArray(node.children)) { - const blocks: Block[] = [] - for (const child of node.children as Content[]) { - blocks.push(...splitNodeToBlocks(child)) - } - return blocks - } - - const lines = md.split("\n") - if (lines.length > 1) { - const blocks: Block[] = [] - let buf: string[] = [] - let bufWords = 0 - for (const line of lines) { - const lw = countWords(line) - if (bufWords + lw > maxWords && buf.length > 0) { - blocks.push({ md: buf.join("\n"), words: bufWords }) - buf = [line] - bufWords = lw - } else { - buf.push(line) - bufWords += lw - } - } - if (buf.length > 0) blocks.push({ md: buf.join("\n"), words: bufWords }) - return blocks - } - - return [{ md, words }] -} - -let frontmatter = "" -const blocks: Block[] = [] - -for (const node of tree.children) { - if (node.type === "yaml") { - frontmatter = `---\n${node.value}\n---` - continue - } - blocks.push(...splitNodeToBlocks(node as Content)) -} - -const chunks: { blocks: Block[]; words: number }[] = [] -let cur: Block[] = [] -let curWords = 0 - -for (const b of blocks) { - if (curWords + b.words > maxWords && cur.length > 0) { - chunks.push({ blocks: cur, words: curWords }) - cur = [b] - curWords = b.words - } else { - cur.push(b) - curWords += b.words - } -} -if (cur.length > 0) chunks.push({ blocks: cur, words: curWords }) - -const dir = outputDir ? join(outputDir, "chunks") : join(dirname(file), "chunks") -mkdirSync(dir, { recursive: true }) - -if (frontmatter) { - writeFileSync(join(dir, "frontmatter.md"), frontmatter) -} - -chunks.forEach((chunk, i) => { - const num = String(i + 1).padStart(2, "0") - const out = join(dir, `chunk-${num}.md`) - writeFileSync(out, chunk.blocks.map(b => b.md).join("\n\n")) -}) - -console.log(JSON.stringify({ - source: file, - chunks: chunks.length, - output_dir: dir, - frontmatter: !!frontmatter, - words_per_chunk: chunks.map(c => c.words) -})) diff --git a/skills/baoyu-translate/scripts/main.ts b/skills/baoyu-translate/scripts/main.ts new file mode 100644 index 0000000..24f8e46 --- /dev/null +++ b/skills/baoyu-translate/scripts/main.ts @@ -0,0 +1,55 @@ +#!/usr/bin/env bun +import path from "node:path" +import process from "node:process" +import { runChunkCli } from "./chunk.js" + +function formatScriptCommand(fallback: string): string { + const raw = process.argv[1] + const displayPath = raw + ? (() => { + const relative = path.relative(process.cwd(), raw) + return relative && !relative.startsWith("..") ? relative : raw + })() + : fallback + + const quotedPath = displayPath.includes(" ") + ? `"${displayPath.replace(/"/g, '\\"')}"` + : displayPath + + return `npx -y bun ${quotedPath}` +} + +function printUsage(exitCode: number): never { + const cmd = formatScriptCommand("scripts/main.ts") + console.log(`Baoyu Translate CLI + +Usage: + ${cmd} [--max-words 5000] [--output-dir ] + ${cmd} chunk [--max-words 5000] [--output-dir ] + +Commands: + chunk Split markdown into chunks + +Options: + --max-words Maximum words per chunk (default: 5000) + --output-dir Write chunks into /chunks/ + -h, --help Show help +`) + process.exit(exitCode) +} + +const args = process.argv.slice(2) + +if (args.length === 0) { + printUsage(1) +} + +if (args[0] === "-h" || args[0] === "--help") { + printUsage(0) +} + +if (args[0] === "chunk") { + process.exit(runChunkCli(args.slice(1), `${formatScriptCommand("scripts/main.ts")} chunk`)) +} + +process.exit(runChunkCli(args, formatScriptCommand("scripts/main.ts"))) diff --git a/skills/baoyu-translate/scripts/package.json b/skills/baoyu-translate/scripts/package.json index 290b644..e19c9bb 100644 --- a/skills/baoyu-translate/scripts/package.json +++ b/skills/baoyu-translate/scripts/package.json @@ -1,9 +1,7 @@ { + "name": "baoyu-translate-chunk", + "private": true, "dependencies": { - "remark-frontmatter": "^5.0.0", - "remark-gfm": "^4.0.1", - "remark-parse": "^11.0.0", - "remark-stringify": "^11.0.0", - "unified": "^11.0.5" + "markdown-it": "14.1.1" } -} \ No newline at end of file +}