refactor(baoyu-translate): replace remark/unified with markdown-it for chunk parsing
Simplifies dependencies and adds main.ts CLI entry point with exported functions for programmatic reuse.
This commit is contained in:
parent
6f38724163
commit
7e07c1bb84
|
|
@ -21,7 +21,8 @@ Scripts in `scripts/` subdirectory. `{baseDir}` = this SKILL.md's directory path
|
|||
|
||||
| Script | Purpose |
|
||||
|--------|---------|
|
||||
| `scripts/chunk.ts` | Split markdown into chunks by AST blocks (sections, headings, paragraphs), with line/word fallback for oversized blocks. Use `--output-dir <dir>` to write chunks into `<dir>/chunks/` instead of `<source-dir>/chunks/` |
|
||||
| `scripts/main.ts` | CLI entry point. Default action splits markdown into chunks; also supports explicit `chunk` subcommand |
|
||||
| `scripts/chunk.ts` | Markdown chunking implementation used by `main.ts` and kept compatible for direct invocation |
|
||||
|
||||
## Preferences (EXTEND.md)
|
||||
|
||||
|
|
@ -183,8 +184,8 @@ Before translating chunks:
|
|||
|
||||
1. **Extract terminology**: Scan entire document for proper nouns, technical terms, recurring phrases
|
||||
2. **Build session glossary**: Merge extracted terms with loaded glossaries, establish consistent translations
|
||||
3. **Split into chunks**: Use `${BUN_X} {baseDir}/scripts/chunk.ts <file> [--max-words <chunk_max_words>] [--output-dir <output-dir>]`
|
||||
- Parses markdown AST (headings, paragraphs, lists, code blocks, tables, etc.)
|
||||
3. **Split into chunks**: Use `${BUN_X} {baseDir}/scripts/main.ts <file> [--max-words <chunk_max_words>] [--output-dir <output-dir>]`
|
||||
- Parses markdown blocks (headings, paragraphs, lists, code blocks, tables, etc.)
|
||||
- Splits at markdown block boundaries to preserve structure
|
||||
- If a single block exceeds the threshold, falls back to line splitting, then word splitting
|
||||
4. **Assemble translation prompt**:
|
||||
|
|
|
|||
|
|
@ -2,160 +2,25 @@
|
|||
"lockfileVersion": 1,
|
||||
"workspaces": {
|
||||
"": {
|
||||
"name": "baoyu-translate-chunk",
|
||||
"dependencies": {
|
||||
"remark-frontmatter": "^5.0.0",
|
||||
"remark-gfm": "^4.0.1",
|
||||
"remark-parse": "^11.0.0",
|
||||
"remark-stringify": "^11.0.0",
|
||||
"unified": "^11.0.5",
|
||||
"markdown-it": "14.1.1",
|
||||
},
|
||||
},
|
||||
},
|
||||
"packages": {
|
||||
"@types/debug": ["@types/debug@4.1.12", "", { "dependencies": { "@types/ms": "*" } }, "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ=="],
|
||||
"argparse": ["argparse@2.0.1", "", {}, "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q=="],
|
||||
|
||||
"@types/mdast": ["@types/mdast@4.0.4", "", { "dependencies": { "@types/unist": "*" } }, "sha512-kGaNbPh1k7AFzgpud/gMdvIm5xuECykRR+JnWKQno9TAXVa6WIVCGTPvYGekIDL4uwCZQSYbUxNBSb1aUo79oA=="],
|
||||
"entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="],
|
||||
|
||||
"@types/ms": ["@types/ms@2.1.0", "", {}, "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA=="],
|
||||
"linkify-it": ["linkify-it@5.0.0", "", { "dependencies": { "uc.micro": "^2.0.0" } }, "sha512-5aHCbzQRADcdP+ATqnDuhhJ/MRIqDkZX5pyjFHRRysS8vZ5AbqGEoFIb6pYHPZ+L/OC2Lc+xT8uHVVR5CAK/wQ=="],
|
||||
|
||||
"@types/unist": ["@types/unist@3.0.3", "", {}, "sha512-ko/gIFJRv177XgZsZcBwnqJN5x/Gien8qNOn0D5bQU/zAzVf9Zt3BlcUiLqhV9y4ARk0GbT3tnUiPNgnTXzc/Q=="],
|
||||
"markdown-it": ["markdown-it@14.1.1", "", { "dependencies": { "argparse": "^2.0.1", "entities": "^4.4.0", "linkify-it": "^5.0.0", "mdurl": "^2.0.0", "punycode.js": "^2.3.1", "uc.micro": "^2.1.0" }, "bin": { "markdown-it": "bin/markdown-it.mjs" } }, "sha512-BuU2qnTti9YKgK5N+IeMubp14ZUKUUw7yeJbkjtosvHiP0AZ5c8IAgEMk79D0eC8F23r4Ac/q8cAIFdm2FtyoA=="],
|
||||
|
||||
"bail": ["bail@2.0.2", "", {}, "sha512-0xO6mYd7JB2YesxDKplafRpsiOzPt9V02ddPCLbY1xYGPOX24NTyN50qnUxgCPcSoYMhKpAuBTjQoRZCAkUDRw=="],
|
||||
"mdurl": ["mdurl@2.0.0", "", {}, "sha512-Lf+9+2r+Tdp5wXDXC4PcIBjTDtq4UKjCPMQhKIuzpJNW0b96kVqSwW0bT7FhRSfmAiFYgP+SCRvdrDozfh0U5w=="],
|
||||
|
||||
"ccount": ["ccount@2.0.1", "", {}, "sha512-eyrF0jiFpY+3drT6383f1qhkbGsLSifNAjA61IUjZjmLCWjItY6LB9ft9YhoDgwfmclB2zhu51Lc7+95b8NRAg=="],
|
||||
"punycode.js": ["punycode.js@2.3.1", "", {}, "sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA=="],
|
||||
|
||||
"character-entities": ["character-entities@2.0.2", "", {}, "sha512-shx7oQ0Awen/BRIdkjkvz54PnEEI/EjwXDSIZp86/KKdbafHh1Df/RYGBhn4hbe2+uKC9FnT5UCEdyPz3ai9hQ=="],
|
||||
|
||||
"debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="],
|
||||
|
||||
"decode-named-character-reference": ["decode-named-character-reference@1.3.0", "", { "dependencies": { "character-entities": "^2.0.0" } }, "sha512-GtpQYB283KrPp6nRw50q3U9/VfOutZOe103qlN7BPP6Ad27xYnOIWv4lPzo8HCAL+mMZofJ9KEy30fq6MfaK6Q=="],
|
||||
|
||||
"dequal": ["dequal@2.0.3", "", {}, "sha512-0je+qPKHEMohvfRTCEo3CrPG6cAzAYgmzKyxRiYSSDkS6eGJdyVJm7WaYA5ECaAD9wLB2T4EEeymA5aFVcYXCA=="],
|
||||
|
||||
"devlop": ["devlop@1.1.0", "", { "dependencies": { "dequal": "^2.0.0" } }, "sha512-RWmIqhcFf1lRYBvNmr7qTNuyCt/7/ns2jbpp1+PalgE/rDQcBT0fioSMUpJ93irlUhC5hrg4cYqe6U+0ImW0rA=="],
|
||||
|
||||
"escape-string-regexp": ["escape-string-regexp@5.0.0", "", {}, "sha512-/veY75JbMK4j1yjvuUxuVsiS/hr/4iHs9FTT6cgTexxdE0Ly/glccBAkloH/DofkjRbZU3bnoj38mOmhkZ0lHw=="],
|
||||
|
||||
"extend": ["extend@3.0.2", "", {}, "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g=="],
|
||||
|
||||
"fault": ["fault@2.0.1", "", { "dependencies": { "format": "^0.2.0" } }, "sha512-WtySTkS4OKev5JtpHXnib4Gxiurzh5NCGvWrFaZ34m6JehfTUhKZvn9njTfw48t6JumVQOmrKqpmGcdwxnhqBQ=="],
|
||||
|
||||
"format": ["format@0.2.2", "", {}, "sha512-wzsgA6WOq+09wrU1tsJ09udeR/YZRaeArL9e1wPbFg3GG2yDnC2ldKpxs4xunpFF9DgqCqOIra3bc1HWrJ37Ww=="],
|
||||
|
||||
"is-plain-obj": ["is-plain-obj@4.1.0", "", {}, "sha512-+Pgi+vMuUNkJyExiMBt5IlFoMyKnr5zhJ4Uspz58WOhBF5QoIZkFyNHIbBAtHwzVAgk5RtndVNsDRN61/mmDqg=="],
|
||||
|
||||
"longest-streak": ["longest-streak@3.1.0", "", {}, "sha512-9Ri+o0JYgehTaVBBDoMqIl8GXtbWg711O3srftcHhZ0dqnETqLaoIK0x17fUw9rFSlK/0NlsKe0Ahhyl5pXE2g=="],
|
||||
|
||||
"markdown-table": ["markdown-table@3.0.4", "", {}, "sha512-wiYz4+JrLyb/DqW2hkFJxP7Vd7JuTDm77fvbM8VfEQdmSMqcImWeeRbHwZjBjIFki/VaMK2BhFi7oUUZeM5bqw=="],
|
||||
|
||||
"mdast-util-find-and-replace": ["mdast-util-find-and-replace@3.0.2", "", { "dependencies": { "@types/mdast": "^4.0.0", "escape-string-regexp": "^5.0.0", "unist-util-is": "^6.0.0", "unist-util-visit-parents": "^6.0.0" } }, "sha512-Tmd1Vg/m3Xz43afeNxDIhWRtFZgM2VLyaf4vSTYwudTyeuTneoL3qtWMA5jeLyz/O1vDJmmV4QuScFCA2tBPwg=="],
|
||||
|
||||
"mdast-util-from-markdown": ["mdast-util-from-markdown@2.0.3", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "mdast-util-to-string": "^4.0.0", "micromark": "^4.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-W4mAWTvSlKvf8L6J+VN9yLSqQ9AOAAvHuoDAmPkz4dHf553m5gVj2ejadHJhoJmcmxEnOv6Pa8XJhpxE93kb8Q=="],
|
||||
|
||||
"mdast-util-frontmatter": ["mdast-util-frontmatter@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.0.0", "escape-string-regexp": "^5.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0", "micromark-extension-frontmatter": "^2.0.0" } }, "sha512-LRqI9+wdgC25P0URIJY9vwocIzCcksduHQ9OF2joxQoyTNVduwLAFUzjoopuRJbJAReaKrNQKAZKL3uCMugWJA=="],
|
||||
|
||||
"mdast-util-gfm": ["mdast-util-gfm@3.1.0", "", { "dependencies": { "mdast-util-from-markdown": "^2.0.0", "mdast-util-gfm-autolink-literal": "^2.0.0", "mdast-util-gfm-footnote": "^2.0.0", "mdast-util-gfm-strikethrough": "^2.0.0", "mdast-util-gfm-table": "^2.0.0", "mdast-util-gfm-task-list-item": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-0ulfdQOM3ysHhCJ1p06l0b0VKlhU0wuQs3thxZQagjcjPrlFRqY215uZGHHJan9GEAXd9MbfPjFJz+qMkVR6zQ=="],
|
||||
|
||||
"mdast-util-gfm-autolink-literal": ["mdast-util-gfm-autolink-literal@2.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "ccount": "^2.0.0", "devlop": "^1.0.0", "mdast-util-find-and-replace": "^3.0.0", "micromark-util-character": "^2.0.0" } }, "sha512-5HVP2MKaP6L+G6YaxPNjuL0BPrq9orG3TsrZ9YXbA3vDw/ACI4MEsnoDpn6ZNm7GnZgtAcONJyPhOP8tNJQavQ=="],
|
||||
|
||||
"mdast-util-gfm-footnote": ["mdast-util-gfm-footnote@2.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.1.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0" } }, "sha512-sqpDWlsHn7Ac9GNZQMeUzPQSMzR6Wv0WKRNvQRg0KqHh02fpTz69Qc1QSseNX29bhz1ROIyNyxExfawVKTm1GQ=="],
|
||||
|
||||
"mdast-util-gfm-strikethrough": ["mdast-util-gfm-strikethrough@2.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-mKKb915TF+OC5ptj5bJ7WFRPdYtuHv0yTRxK2tJvi+BDqbkiG7h7u/9SI89nRAYcmap2xHQL9D+QG/6wSrTtXg=="],
|
||||
|
||||
"mdast-util-gfm-table": ["mdast-util-gfm-table@2.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.0.0", "markdown-table": "^3.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-78UEvebzz/rJIxLvE7ZtDd/vIQ0RHv+3Mh5DR96p7cS7HsBhYIICDBCu8csTNWNO6tBWfqXPWekRuj2FNOGOZg=="],
|
||||
|
||||
"mdast-util-gfm-task-list-item": ["mdast-util-gfm-task-list-item@2.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "devlop": "^1.0.0", "mdast-util-from-markdown": "^2.0.0", "mdast-util-to-markdown": "^2.0.0" } }, "sha512-IrtvNvjxC1o06taBAVJznEnkiHxLFTzgonUdy8hzFVeDun0uTjxxrRGVaNFqkU1wJR3RBPEfsxmU6jDWPofrTQ=="],
|
||||
|
||||
"mdast-util-phrasing": ["mdast-util-phrasing@4.1.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "unist-util-is": "^6.0.0" } }, "sha512-TqICwyvJJpBwvGAMZjj4J2n0X8QWp21b9l0o7eXyVJ25YNWYbJDVIyD1bZXE6WtV6RmKJVYmQAKWa0zWOABz2w=="],
|
||||
|
||||
"mdast-util-to-markdown": ["mdast-util-to-markdown@2.1.2", "", { "dependencies": { "@types/mdast": "^4.0.0", "@types/unist": "^3.0.0", "longest-streak": "^3.0.0", "mdast-util-phrasing": "^4.0.0", "mdast-util-to-string": "^4.0.0", "micromark-util-classify-character": "^2.0.0", "micromark-util-decode-string": "^2.0.0", "unist-util-visit": "^5.0.0", "zwitch": "^2.0.0" } }, "sha512-xj68wMTvGXVOKonmog6LwyJKrYXZPvlwabaryTjLh9LuvovB/KAH+kvi8Gjj+7rJjsFi23nkUxRQv1KqSroMqA=="],
|
||||
|
||||
"mdast-util-to-string": ["mdast-util-to-string@4.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0" } }, "sha512-0H44vDimn51F0YwvxSJSm0eCDOJTRlmN0R1yBh4HLj9wiV1Dn0QoXGbvFAWj2hSItVTlCmBF1hqKlIyUBVFLPg=="],
|
||||
|
||||
"micromark": ["micromark@4.0.2", "", { "dependencies": { "@types/debug": "^4.0.0", "debug": "^4.0.0", "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "micromark-core-commonmark": "^2.0.0", "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-combine-extensions": "^2.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-encode": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-resolve-all": "^2.0.0", "micromark-util-sanitize-uri": "^2.0.0", "micromark-util-subtokenize": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-zpe98Q6kvavpCr1NPVSCMebCKfD7CA2NqZ+rykeNhONIJBpc1tFKt9hucLGwha3jNTNI8lHpctWJWoimVF4PfA=="],
|
||||
|
||||
"micromark-core-commonmark": ["micromark-core-commonmark@2.0.3", "", { "dependencies": { "decode-named-character-reference": "^1.0.0", "devlop": "^1.0.0", "micromark-factory-destination": "^2.0.0", "micromark-factory-label": "^2.0.0", "micromark-factory-space": "^2.0.0", "micromark-factory-title": "^2.0.0", "micromark-factory-whitespace": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-classify-character": "^2.0.0", "micromark-util-html-tag-name": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-resolve-all": "^2.0.0", "micromark-util-subtokenize": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-RDBrHEMSxVFLg6xvnXmb1Ayr2WzLAWjeSATAoxwKYJV94TeNavgoIdA0a9ytzDSVzBy2YKFK+emCPOEibLeCrg=="],
|
||||
|
||||
"micromark-extension-frontmatter": ["micromark-extension-frontmatter@2.0.0", "", { "dependencies": { "fault": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-C4AkuM3dA58cgZha7zVnuVxBhDsbttIMiytjgsM2XbHAB2faRVaHRle40558FBN+DJcrLNCoqG5mlrpdU4cRtg=="],
|
||||
|
||||
"micromark-extension-gfm": ["micromark-extension-gfm@3.0.0", "", { "dependencies": { "micromark-extension-gfm-autolink-literal": "^2.0.0", "micromark-extension-gfm-footnote": "^2.0.0", "micromark-extension-gfm-strikethrough": "^2.0.0", "micromark-extension-gfm-table": "^2.0.0", "micromark-extension-gfm-tagfilter": "^2.0.0", "micromark-extension-gfm-task-list-item": "^2.0.0", "micromark-util-combine-extensions": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-vsKArQsicm7t0z2GugkCKtZehqUm31oeGBV/KVSorWSy8ZlNAv7ytjFhvaryUiCUJYqs+NoE6AFhpQvBTM6Q4w=="],
|
||||
|
||||
"micromark-extension-gfm-autolink-literal": ["micromark-extension-gfm-autolink-literal@2.1.0", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-sanitize-uri": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-oOg7knzhicgQ3t4QCjCWgTmfNhvQbDDnJeVu9v81r7NltNCVmhPy1fJRX27pISafdjL+SVc4d3l48Gb6pbRypw=="],
|
||||
|
||||
"micromark-extension-gfm-footnote": ["micromark-extension-gfm-footnote@2.1.0", "", { "dependencies": { "devlop": "^1.0.0", "micromark-core-commonmark": "^2.0.0", "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-normalize-identifier": "^2.0.0", "micromark-util-sanitize-uri": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-/yPhxI1ntnDNsiHtzLKYnE3vf9JZ6cAisqVDauhp4CEHxlb4uoOTxOCJ+9s51bIB8U1N1FJ1RXOKTIlD5B/gqw=="],
|
||||
|
||||
"micromark-extension-gfm-strikethrough": ["micromark-extension-gfm-strikethrough@2.1.0", "", { "dependencies": { "devlop": "^1.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-classify-character": "^2.0.0", "micromark-util-resolve-all": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-ADVjpOOkjz1hhkZLlBiYA9cR2Anf8F4HqZUO6e5eDcPQd0Txw5fxLzzxnEkSkfnD0wziSGiv7sYhk/ktvbf1uw=="],
|
||||
|
||||
"micromark-extension-gfm-table": ["micromark-extension-gfm-table@2.1.1", "", { "dependencies": { "devlop": "^1.0.0", "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-t2OU/dXXioARrC6yWfJ4hqB7rct14e8f7m0cbI5hUmDyyIlwv5vEtooptH8INkbLzOatzKuVbQmAYcbWoyz6Dg=="],
|
||||
|
||||
"micromark-extension-gfm-tagfilter": ["micromark-extension-gfm-tagfilter@2.0.0", "", { "dependencies": { "micromark-util-types": "^2.0.0" } }, "sha512-xHlTOmuCSotIA8TW1mDIM6X2O1SiX5P9IuDtqGonFhEK0qgRI4yeC6vMxEV2dgyr2TiD+2PQ10o+cOhdVAcwfg=="],
|
||||
|
||||
"micromark-extension-gfm-task-list-item": ["micromark-extension-gfm-task-list-item@2.1.0", "", { "dependencies": { "devlop": "^1.0.0", "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-qIBZhqxqI6fjLDYFTBIa4eivDMnP+OZqsNwmQ3xNLE4Cxwc+zfQEfbs6tzAo2Hjq+bh6q5F+Z8/cksrLFYWQQw=="],
|
||||
|
||||
"micromark-factory-destination": ["micromark-factory-destination@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-Xe6rDdJlkmbFRExpTOmRj9N3MaWmbAgdpSrBQvCFqhezUn4AHqJHbaEnfbVYYiexVSs//tqOdY/DxhjdCiJnIA=="],
|
||||
|
||||
"micromark-factory-label": ["micromark-factory-label@2.0.1", "", { "dependencies": { "devlop": "^1.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-VFMekyQExqIW7xIChcXn4ok29YE3rnuyveW3wZQWWqF4Nv9Wk5rgJ99KzPvHjkmPXF93FXIbBp6YdW3t71/7Vg=="],
|
||||
|
||||
"micromark-factory-space": ["micromark-factory-space@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-zRkxjtBxxLd2Sc0d+fbnEunsTj46SWXgXciZmHq0kDYGnck/ZSGj9/wULTV95uoeYiK5hRXP2mJ98Uo4cq/LQg=="],
|
||||
|
||||
"micromark-factory-title": ["micromark-factory-title@2.0.1", "", { "dependencies": { "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-5bZ+3CjhAd9eChYTHsjy6TGxpOFSKgKKJPJxr293jTbfry2KDoWkhBb6TcPVB4NmzaPhMs1Frm9AZH7OD4Cjzw=="],
|
||||
|
||||
"micromark-factory-whitespace": ["micromark-factory-whitespace@2.0.1", "", { "dependencies": { "micromark-factory-space": "^2.0.0", "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-Ob0nuZ3PKt/n0hORHyvoD9uZhr+Za8sFoP+OnMcnWK5lngSzALgQYKMr9RJVOWLqQYuyn6ulqGWSXdwf6F80lQ=="],
|
||||
|
||||
"micromark-util-character": ["micromark-util-character@2.1.1", "", { "dependencies": { "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-wv8tdUTJ3thSFFFJKtpYKOYiGP2+v96Hvk4Tu8KpCAsTMs6yi+nVmGh1syvSCsaxz45J6Jbw+9DD6g97+NV67Q=="],
|
||||
|
||||
"micromark-util-chunked": ["micromark-util-chunked@2.0.1", "", { "dependencies": { "micromark-util-symbol": "^2.0.0" } }, "sha512-QUNFEOPELfmvv+4xiNg2sRYeS/P84pTW0TCgP5zc9FpXetHY0ab7SxKyAQCNCc1eK0459uoLI1y5oO5Vc1dbhA=="],
|
||||
|
||||
"micromark-util-classify-character": ["micromark-util-classify-character@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-K0kHzM6afW/MbeWYWLjoHQv1sgg2Q9EccHEDzSkxiP/EaagNzCm7T/WMKZ3rjMbvIpvBiZgwR3dKMygtA4mG1Q=="],
|
||||
|
||||
"micromark-util-combine-extensions": ["micromark-util-combine-extensions@2.0.1", "", { "dependencies": { "micromark-util-chunked": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-OnAnH8Ujmy59JcyZw8JSbK9cGpdVY44NKgSM7E9Eh7DiLS2E9RNQf0dONaGDzEG9yjEl5hcqeIsj4hfRkLH/Bg=="],
|
||||
|
||||
"micromark-util-decode-numeric-character-reference": ["micromark-util-decode-numeric-character-reference@2.0.2", "", { "dependencies": { "micromark-util-symbol": "^2.0.0" } }, "sha512-ccUbYk6CwVdkmCQMyr64dXz42EfHGkPQlBj5p7YVGzq8I7CtjXZJrubAYezf7Rp+bjPseiROqe7G6foFd+lEuw=="],
|
||||
|
||||
"micromark-util-decode-string": ["micromark-util-decode-string@2.0.1", "", { "dependencies": { "decode-named-character-reference": "^1.0.0", "micromark-util-character": "^2.0.0", "micromark-util-decode-numeric-character-reference": "^2.0.0", "micromark-util-symbol": "^2.0.0" } }, "sha512-nDV/77Fj6eH1ynwscYTOsbK7rR//Uj0bZXBwJZRfaLEJ1iGBR6kIfNmlNqaqJf649EP0F3NWNdeJi03elllNUQ=="],
|
||||
|
||||
"micromark-util-encode": ["micromark-util-encode@2.0.1", "", {}, "sha512-c3cVx2y4KqUnwopcO9b/SCdo2O67LwJJ/UyqGfbigahfegL9myoEFoDYZgkT7f36T0bLrM9hZTAaAyH+PCAXjw=="],
|
||||
|
||||
"micromark-util-html-tag-name": ["micromark-util-html-tag-name@2.0.1", "", {}, "sha512-2cNEiYDhCWKI+Gs9T0Tiysk136SnR13hhO8yW6BGNyhOC4qYFnwF1nKfD3HFAIXA5c45RrIG1ub11GiXeYd1xA=="],
|
||||
|
||||
"micromark-util-normalize-identifier": ["micromark-util-normalize-identifier@2.0.1", "", { "dependencies": { "micromark-util-symbol": "^2.0.0" } }, "sha512-sxPqmo70LyARJs0w2UclACPUUEqltCkJ6PhKdMIDuJ3gSf/Q+/GIe3WKl0Ijb/GyH9lOpUkRAO2wp0GVkLvS9Q=="],
|
||||
|
||||
"micromark-util-resolve-all": ["micromark-util-resolve-all@2.0.1", "", { "dependencies": { "micromark-util-types": "^2.0.0" } }, "sha512-VdQyxFWFT2/FGJgwQnJYbe1jjQoNTS4RjglmSjTUlpUMa95Htx9NHeYW4rGDJzbjvCsl9eLjMQwGeElsqmzcHg=="],
|
||||
|
||||
"micromark-util-sanitize-uri": ["micromark-util-sanitize-uri@2.0.1", "", { "dependencies": { "micromark-util-character": "^2.0.0", "micromark-util-encode": "^2.0.0", "micromark-util-symbol": "^2.0.0" } }, "sha512-9N9IomZ/YuGGZZmQec1MbgxtlgougxTodVwDzzEouPKo3qFWvymFHWcnDi2vzV1ff6kas9ucW+o3yzJK9YB1AQ=="],
|
||||
|
||||
"micromark-util-subtokenize": ["micromark-util-subtokenize@2.1.0", "", { "dependencies": { "devlop": "^1.0.0", "micromark-util-chunked": "^2.0.0", "micromark-util-symbol": "^2.0.0", "micromark-util-types": "^2.0.0" } }, "sha512-XQLu552iSctvnEcgXw6+Sx75GflAPNED1qx7eBJ+wydBb2KCbRZe+NwvIEEMM83uml1+2WSXpBAcp9IUCgCYWA=="],
|
||||
|
||||
"micromark-util-symbol": ["micromark-util-symbol@2.0.1", "", {}, "sha512-vs5t8Apaud9N28kgCrRUdEed4UJ+wWNvicHLPxCa9ENlYuAY31M0ETy5y1vA33YoNPDFTghEbnh6efaE8h4x0Q=="],
|
||||
|
||||
"micromark-util-types": ["micromark-util-types@2.0.2", "", {}, "sha512-Yw0ECSpJoViF1qTU4DC6NwtC4aWGt1EkzaQB8KPPyCRR8z9TWeV0HbEFGTO+ZY1wB22zmxnJqhPyTpOVCpeHTA=="],
|
||||
|
||||
"ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="],
|
||||
|
||||
"remark-frontmatter": ["remark-frontmatter@5.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-frontmatter": "^2.0.0", "micromark-extension-frontmatter": "^2.0.0", "unified": "^11.0.0" } }, "sha512-XTFYvNASMe5iPN0719nPrdItC9aU0ssC4v14mH1BCi1u0n1gAocqcujWUrByftZTbLhRtiKRyjYTSIOcr69UVQ=="],
|
||||
|
||||
"remark-gfm": ["remark-gfm@4.0.1", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-gfm": "^3.0.0", "micromark-extension-gfm": "^3.0.0", "remark-parse": "^11.0.0", "remark-stringify": "^11.0.0", "unified": "^11.0.0" } }, "sha512-1quofZ2RQ9EWdeN34S79+KExV1764+wCUGop5CPL1WGdD0ocPpu91lzPGbwWMECpEpd42kJGQwzRfyov9j4yNg=="],
|
||||
|
||||
"remark-parse": ["remark-parse@11.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-from-markdown": "^2.0.0", "micromark-util-types": "^2.0.0", "unified": "^11.0.0" } }, "sha512-FCxlKLNGknS5ba/1lmpYijMUzX2esxW5xQqjWxw2eHFfS2MSdaHVINFmhjo+qN1WhZhNimq0dZATN9pH0IDrpA=="],
|
||||
|
||||
"remark-stringify": ["remark-stringify@11.0.0", "", { "dependencies": { "@types/mdast": "^4.0.0", "mdast-util-to-markdown": "^2.0.0", "unified": "^11.0.0" } }, "sha512-1OSmLd3awB/t8qdoEOMazZkNsfVTeY4fTsgzcQFdXNq8ToTN4ZGwrMnlda4K6smTFKD+GRV6O48i6Z4iKgPPpw=="],
|
||||
|
||||
"trough": ["trough@2.2.0", "", {}, "sha512-tmMpK00BjZiUyVyvrBK7knerNgmgvcV/KLVyuma/SC+TQN167GrMRciANTz09+k3zW8L8t60jWO1GpfkZdjTaw=="],
|
||||
|
||||
"unified": ["unified@11.0.5", "", { "dependencies": { "@types/unist": "^3.0.0", "bail": "^2.0.0", "devlop": "^1.0.0", "extend": "^3.0.0", "is-plain-obj": "^4.0.0", "trough": "^2.0.0", "vfile": "^6.0.0" } }, "sha512-xKvGhPWw3k84Qjh8bI3ZeJjqnyadK+GEFtazSfZv/rKeTkTjOJho6mFqh2SM96iIcZokxiOpg78GazTSg8+KHA=="],
|
||||
|
||||
"unist-util-is": ["unist-util-is@6.0.1", "", { "dependencies": { "@types/unist": "^3.0.0" } }, "sha512-LsiILbtBETkDz8I9p1dQ0uyRUWuaQzd/cuEeS1hoRSyW5E5XGmTzlwY1OrNzzakGowI9Dr/I8HVaw4hTtnxy8g=="],
|
||||
|
||||
"unist-util-stringify-position": ["unist-util-stringify-position@4.0.0", "", { "dependencies": { "@types/unist": "^3.0.0" } }, "sha512-0ASV06AAoKCDkS2+xw5RXJywruurpbC4JZSm7nr7MOt1ojAzvyyaO+UxZf18j8FCF6kmzCZKcAgN/yu2gm2XgQ=="],
|
||||
|
||||
"unist-util-visit": ["unist-util-visit@5.1.0", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0", "unist-util-visit-parents": "^6.0.0" } }, "sha512-m+vIdyeCOpdr/QeQCu2EzxX/ohgS8KbnPDgFni4dQsfSCtpz8UqDyY5GjRru8PDKuYn7Fq19j1CQ+nJSsGKOzg=="],
|
||||
|
||||
"unist-util-visit-parents": ["unist-util-visit-parents@6.0.2", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-is": "^6.0.0" } }, "sha512-goh1s1TBrqSqukSc8wrjwWhL0hiJxgA8m4kFxGlQ+8FYQ3C/m11FcTs4YYem7V664AhHVvgoQLk890Ssdsr2IQ=="],
|
||||
|
||||
"vfile": ["vfile@6.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "vfile-message": "^4.0.0" } }, "sha512-KzIbH/9tXat2u30jf+smMwFCsno4wHVdNmzFyL+T/L3UGqqk6JKfVqOFOZEpZSHADH1k40ab6NUIXZq422ov3Q=="],
|
||||
|
||||
"vfile-message": ["vfile-message@4.0.3", "", { "dependencies": { "@types/unist": "^3.0.0", "unist-util-stringify-position": "^4.0.0" } }, "sha512-QTHzsGd1EhbZs4AsQ20JX1rC3cOlt/IWJruk893DfLRr57lcnOeMaWG4K0JrRta4mIJZKth2Au3mM3u03/JWKw=="],
|
||||
|
||||
"zwitch": ["zwitch@2.0.4", "", {}, "sha512-bXE4cR/kVZhKZX/RjPEflHaKVhUVl85noU3v6b8apfQEc1x4A+zBxjZ4lN8LqGd6WZ3dl98pY4o717VFmoPp+A=="],
|
||||
"uc.micro": ["uc.micro@2.1.0", "", {}, "sha512-ARDJmphmdvUk6Glw7y9DQ2bFkKBHwQHLi2lsaH6PPmz/Ka9sFOBsBluozhDltWmnv9u/cF6Rt87znRTPV+yp/A=="],
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,137 +1,335 @@
|
|||
import { readFileSync, writeFileSync, mkdirSync } from "fs"
|
||||
import { basename, dirname, join } from "path"
|
||||
import { unified } from "unified"
|
||||
import remarkParse from "remark-parse"
|
||||
import remarkGfm from "remark-gfm"
|
||||
import remarkFrontmatter from "remark-frontmatter"
|
||||
import remarkStringify from "remark-stringify"
|
||||
import type { Root, Content } from "mdast"
|
||||
import { mkdirSync, readFileSync, writeFileSync } from "fs"
|
||||
import { dirname, join } from "path"
|
||||
import MarkdownIt from "markdown-it"
|
||||
|
||||
const args = process.argv.slice(2)
|
||||
const file = args.find(a => !a.startsWith("--"))
|
||||
const maxWords = parseInt(args[args.indexOf("--max-words") + 1] || "5000")
|
||||
const outputDir = args.indexOf("--output-dir") !== -1 ? args[args.indexOf("--output-dir") + 1] : ""
|
||||
/**
 * Coarse classification of a top-level markdown block.
 * "flow" is the catch-all used for every block that is not a heading,
 * thematic break, raw HTML block, or code block.
 */
type BlockKind = "heading" | "thematicBreak" | "html" | "code" | "flow"
|
||||
|
||||
if (!file) {
|
||||
console.error("Usage: chunk.ts <file> [--max-words 5000]")
|
||||
process.exit(1)
|
||||
/** A contiguous piece of markdown source together with its word count. */
interface Block {
  /** Classification of the block. */
  kind: BlockKind
  /** Markdown source text of the block. */
  md: string
  /** Word count computed for the block's text. */
  words: number
}
|
||||
|
||||
const content = readFileSync(file, "utf-8")
|
||||
/** A group of blocks that is written out as one chunk file. */
interface Chunk {
  /** Blocks in document order; joined with blank lines when written. */
  blocks: Block[]
  /** Total word count of the chunk. */
  words: number
}
|
||||
|
||||
const tree = unified()
|
||||
.use(remarkParse)
|
||||
.use(remarkGfm)
|
||||
.use(remarkFrontmatter, ["yaml"])
|
||||
.parse(content)
|
||||
/** Options produced by parsing the chunk command line. */
export interface ChunkCliOptions {
  /** Path of the markdown file to split. */
  file: string
  /** Maximum number of words per chunk (the CLI default is 5000). */
  maxWords: number
  /** Base directory for the `chunks/` folder; empty means "next to the source file". */
  outputDir: string
}
|
||||
|
||||
const stringify = unified()
|
||||
.use(remarkStringify, { bullet: "-", emphasis: "*", strong: "*" })
|
||||
.use(remarkGfm)
|
||||
.use(remarkFrontmatter, ["yaml"])
|
||||
/** Summary returned after a markdown file has been split and written to disk. */
export interface ChunkResult {
  /** Path of the input file, as passed by the caller. */
  source: string
  /** Number of chunk files written. */
  chunks: number
  /** Directory the chunk files were written to. */
  output_dir: string
  /** Whether a frontmatter block was found (and written to `frontmatter.md`). */
  frontmatter: boolean
  /** Word count of each chunk, in chunk order. */
  words_per_chunk: number[]
}
|
||||
|
||||
function nodeToMd(node: Content): string {
|
||||
const root: Root = { type: "root", children: [node] }
|
||||
return stringify.stringify(root).trim()
|
||||
// Shared markdown-it instance used by parseMarkdown to tokenize document bodies.
// `html: true` is markdown-it's option for enabling HTML tags in the source.
const parser = new MarkdownIt({ html: true })
|
||||
|
||||
/**
 * Builds the one-line usage text for the chunk command.
 *
 * @param command Name of the command to show after "Usage:".
 * @returns The usage line, without a trailing newline.
 */
export function formatChunkUsage(command: string): string {
  const prefix = "Usage: "
  const synopsis = " <file> [--max-words 5000] [--output-dir <dir>]"
  return prefix + command + synopsis
}
|
||||
|
||||
/**
 * Runs the chunk command: parses the arguments, splits the file and prints
 * the result summary as a single JSON line on stdout.
 *
 * @param args Command-line arguments to parse.
 * @param command Command name shown in the usage text.
 * @returns Exit code: 0 on success or when help was requested, 1 when the
 *   arguments are invalid or chunking fails (for example, an unreadable file).
 */
export function runChunkCli(args: string[], command = "chunk.ts"): number {
  const parsed = parseChunkCliArgs(args)

  if ("help" in parsed) {
    console.log(formatChunkUsage(command))
    return 0
  }

  if ("error" in parsed) {
    console.error(parsed.error)
    console.error(formatChunkUsage(command))
    return 1
  }

  try {
    const result = chunkMarkdownFile(parsed.file, {
      maxWords: parsed.maxWords,
      outputDir: parsed.outputDir,
    })

    console.log(JSON.stringify(result))
    return 0
  } catch (error: unknown) {
    // Report I/O and parsing failures through the exit code, like argument
    // errors, instead of letting an uncaught exception escape the CLI.
    console.error(error instanceof Error ? error.message : String(error))
    return 1
  }
}
|
||||
|
||||
/**
 * Splits a markdown file into chunk files on disk.
 *
 * The file is read as UTF-8, its newlines are normalized, a leading
 * frontmatter block is separated out, and the remaining body is split into
 * chunks. Chunks are written as `chunk-NN.md` (1-based, zero-padded to two
 * digits) inside a `chunks/` directory; frontmatter, when present, is written
 * to `frontmatter.md` in the same directory.
 *
 * @param file Path of the markdown file to split.
 * @param options `maxWords` defaults to 5000; `outputDir` is the parent of the
 *   `chunks/` directory and defaults to the directory of `file`.
 * @returns A summary of what was written.
 */
export function chunkMarkdownFile(
  file: string,
  options: { maxWords?: number; outputDir?: string } = {}
): ChunkResult {
  const wordLimit = options.maxWords ?? 5000
  const requestedDir = options.outputDir ?? ""

  const source = normalizeNewlines(readFileSync(file, "utf-8"))
  const split = extractFrontmatter(source)
  const chunks = buildChunks(parseMarkdown(split.body), wordLimit)

  const parentDir = requestedDir ? requestedDir : dirname(file)
  const chunkDir = join(parentDir, "chunks")
  mkdirSync(chunkDir, { recursive: true })

  if (split.frontmatter) {
    writeFileSync(join(chunkDir, "frontmatter.md"), split.frontmatter)
  }

  const wordsPerChunk: number[] = []
  let sequence = 0
  for (const chunk of chunks) {
    sequence += 1
    const label = String(sequence).padStart(2, "0")
    const text = chunk.blocks.map(block => block.md).join("\n\n")
    writeFileSync(join(chunkDir, `chunk-${label}.md`), text)
    wordsPerChunk.push(chunk.words)
  }

  return {
    source: file,
    chunks: chunks.length,
    output_dir: chunkDir,
    frontmatter: Boolean(split.frontmatter),
    words_per_chunk: wordsPerChunk,
  }
}
|
||||
|
||||
/**
 * Parses the chunk command line.
 *
 * Recognizes `-h`/`--help`, `--max-words <n>` and `--output-dir <dir>`, plus
 * exactly one positional argument (the input file). Parsing stops at the
 * first problem encountered.
 *
 * @param args Arguments to parse.
 * @returns The parsed options, `{ help: true }` when help was requested, or
 *   `{ error }` describing the first invalid argument.
 */
function parseChunkCliArgs(args: string[]):
  | ChunkCliOptions
  | { help: true }
  | { error: string } {
  const options: ChunkCliOptions = { file: "", maxWords: 5000, outputDir: "" }
  let cursor = 0

  while (cursor < args.length) {
    const token = args[cursor]
    cursor += 1

    switch (token) {
      case "-h":
      case "--help":
        return { help: true }

      case "--max-words": {
        const raw = args[cursor]
        if (!raw) return { error: "Missing value for --max-words" }
        // A fallback of 0 marks "not a positive integer".
        const limit = parsePositiveInt(raw, 0)
        if (limit <= 0) return { error: `Invalid --max-words value: ${raw}` }
        options.maxWords = limit
        cursor += 1
        break
      }

      case "--output-dir": {
        const raw = args[cursor]
        if (!raw) return { error: "Missing value for --output-dir" }
        options.outputDir = raw
        cursor += 1
        break
      }

      default:
        if (token.startsWith("-")) {
          return { error: `Unknown option: ${token}` }
        }
        if (options.file) {
          return { error: `Unexpected positional argument: ${token}` }
        }
        options.file = token
    }
  }

  if (!options.file) {
    return { error: "Missing input file" }
  }

  return options
}
|
||||
|
||||
/**
 * Parses a strictly positive decimal integer.
 *
 * The whole string (ignoring surrounding whitespace and an optional leading
 * "+") must consist of decimal digits; inputs such as "12abc", "5.9" or "1e3"
 * are rejected rather than being truncated to their numeric prefix.
 *
 * @param value Text to parse; may be undefined or empty.
 * @param fallback Value returned when `value` is missing or is not a
 *   positive safe integer.
 * @returns The parsed integer, or `fallback`.
 */
function parsePositiveInt(value: string | undefined, fallback: number): number {
  if (!value) return fallback

  const text = value.trim()
  // Number.parseInt alone would accept trailing garbage ("12abc" -> 12).
  if (!/^\+?\d+$/.test(text)) return fallback

  const parsed = Number.parseInt(text, 10)
  return Number.isSafeInteger(parsed) && parsed > 0 ? parsed : fallback
}
|
||||
|
||||
/**
 * Removes a leading byte-order mark and converts CRLF and lone CR line
 * endings to LF.
 */
function normalizeNewlines(text: string): string {
  const withoutBom = text.startsWith("\uFEFF") ? text.slice(1) : text
  return withoutBom.replace(/\r\n|\r/g, "\n")
}
|
||||
|
||||
/**
 * Strips newline characters from the start and end of the text.
 * Only "\n" is removed; spaces and tabs at the boundaries are kept.
 */
function trimBoundaryBlankLines(text: string): string {
  let start = 0
  let end = text.length

  while (start < end && text[start] === "\n") start += 1
  while (end > start && text[end - 1] === "\n") end -= 1

  return text.slice(start, end)
}
|
||||
|
||||
/**
 * Splits a leading frontmatter block off a document.
 *
 * Frontmatter must start on the first line with a `---` fence and end at the
 * next line that is `---` or `...`. Trailing spaces or tabs after a fence are
 * tolerated, so a stray space does not push the frontmatter into the body.
 *
 * @param content Document text using "\n" line endings.
 * @returns `frontmatter` holds the block including both fence lines exactly as
 *   written ("" when there is none or it is never closed); `body` is the rest
 *   of the document with leading blank lines removed, or the whole input when
 *   no frontmatter was found.
 */
function extractFrontmatter(content: string): { frontmatter: string; body: string } {
  const openingFence = /^---[ \t]*$/
  const closingFence = /^(?:---|\.\.\.)[ \t]*$/

  const lines = content.split("\n")
  if (!openingFence.test(lines[0] ?? "")) {
    return { frontmatter: "", body: content }
  }

  for (let index = 1; index < lines.length; index += 1) {
    if (closingFence.test(lines[index] ?? "")) {
      return {
        frontmatter: lines.slice(0, index + 1).join("\n"),
        body: lines.slice(index + 1).join("\n").replace(/^\n+/, ""),
      }
    }
  }

  // Opening fence without a closing one: treat the whole input as body.
  return { frontmatter: "", body: content }
}
|
||||
|
||||
/**
 * Splits a markdown body into top-level blocks.
 *
 * Each top-level markdown-it token that carries a source line map becomes one
 * block whose text is taken verbatim from the source lines. Source lines that
 * no top-level token covers (for example link reference definitions, which
 * markdown-it consumes without emitting a token) are kept as "flow" blocks so
 * that no content is lost from the chunks.
 *
 * @param content Markdown text using "\n" line endings, without frontmatter.
 * @returns Blocks in document order; empty when the content is blank.
 */
function parseMarkdown(content: string): Block[] {
  if (!content.trim()) return []

  const lines = content.split("\n")
  const tokens = parser.parse(content, {})
  const blocks: Block[] = []

  // Appends source lines [from, to) as a block unless the span is empty.
  const pushSpan = (kind: Block["kind"], from: number, to: number, skipBlank: boolean): void => {
    const md = trimBoundaryBlankLines(lines.slice(from, to).join("\n"))
    if (!md || (skipBlank && !md.trim())) return
    blocks.push(makeBlock(kind, md))
  }

  // Index of the first source line not yet covered by an emitted token.
  let cursor = 0

  for (const token of tokens) {
    // Only top-level opening or self-contained tokens delimit blocks.
    if (!token.map || token.level !== 0) continue
    if (token.nesting !== 1 && token.nesting !== 0) continue

    const [startLine, endLine] = token.map

    // Preserve anything the tokenizer skipped between two blocks.
    if (startLine > cursor) pushSpan("flow", cursor, startLine, true)

    pushSpan(tokenTypeToBlockKind(token.type), startLine, endLine, false)
    cursor = Math.max(cursor, endLine)
  }

  // Trailing uncovered lines; this also covers the case of no usable tokens.
  if (cursor < lines.length) pushSpan("flow", cursor, lines.length, true)

  return blocks
}
|
||||
|
||||
/**
 * Map a parser token type to the block kind used by the chunker.
 *
 * @param tokenType - Token `type` string, e.g. `"heading_open"` or `"fence"`.
 * @returns `"heading"`, `"thematicBreak"`, `"html"` or `"code"` for the
 *   matching token types, and `"flow"` for every other type.
 */
function tokenTypeToBlockKind(tokenType: string): BlockKind {
  switch (tokenType) {
    case "heading_open":
      return "heading"
    case "hr":
      return "thematicBreak"
    case "html_block":
      return "html"
    case "fence":
    case "code_block":
      return "code"
    default:
      return "flow"
  }
}
|
||||
|
||||
/**
 * Build a block record from its kind and markdown text.
 *
 * @param kind - Block kind to store on the record.
 * @param md - Markdown text; leading and trailing newlines are removed
 *   before it is stored.
 * @returns A block with the trimmed text and the word count of `md`.
 */
function makeBlock(kind: BlockKind, md: string): Block {
  const trimmed = trimBoundaryBlankLines(md)
  const words = countWords(md)
  return { kind, md: trimmed, words }
}
|
||||
|
||||
/**
 * Group blocks into chunks whose word totals stay within the limit where
 * possible.
 *
 * Phase 1 normalises the input section by section: a section that fits in
 * the limit is merged into a single "flow" block (joined by blank lines);
 * otherwise each of its blocks is passed through `splitOversizedBlock`.
 * Phase 2 packs the resulting units greedily, closing the current chunk as
 * soon as adding the next unit would exceed the limit.
 *
 * @param blocks - Blocks in document order.
 * @param maxWordsPerChunk - Word limit per chunk; a single unit larger than
 *   the limit still becomes a chunk of its own.
 * @returns Chunks in document order, each with its blocks and word total.
 */
function buildChunks(blocks: Block[], maxWordsPerChunk: number): Chunk[] {
  // Phase 1: sections that fit stay together; oversized ones are broken up.
  const units: Block[] = splitIntoSections(blocks).flatMap((section) => {
    let sectionTotal = 0
    for (const member of section) sectionTotal += member.words

    if (sectionTotal <= maxWordsPerChunk) {
      return [makeBlock("flow", section.map(member => member.md).join("\n\n"))]
    }
    return section.flatMap(member => splitOversizedBlock(member, maxWordsPerChunk))
  })

  // Phase 2: greedy packing.
  const chunks: Chunk[] = []
  let pending: Block[] = []
  let pendingWords = 0

  for (const unit of units) {
    const wouldOverflow = pendingWords + unit.words > maxWordsPerChunk
    if (pending.length > 0 && wouldOverflow) {
      chunks.push({ blocks: pending, words: pendingWords })
      pending = []
      pendingWords = 0
    }
    pending.push(unit)
    pendingWords += unit.words
  }

  if (pending.length > 0) {
    chunks.push({ blocks: pending, words: pendingWords })
  }

  return chunks
}
|
||||
|
||||
/**
 * Partition blocks into sections, starting a new section at every heading.
 *
 * Blocks that appear before the first heading form their own leading
 * section. Block objects are reused, not copied.
 *
 * @param blocks - Blocks in document order.
 * @returns Non-empty sections in document order; `[]` for empty input.
 */
function splitIntoSections(blocks: Block[]): Block[][] {
  const sections: Block[][] = []

  for (const block of blocks) {
    const open = sections[sections.length - 1]
    if (open === undefined || block.kind === "heading") {
      // First block overall, or a heading: open a fresh section.
      sections.push([block])
    } else {
      open.push(block)
    }
  }

  return sections
}
|
||||
|
||||
/**
 * Break a block that exceeds the word limit into smaller blocks by lines.
 *
 * The block is returned unchanged (as a one-element array) when it fits,
 * when its kind is heading, thematicBreak, html or code, or when it has only
 * one line. Otherwise consecutive lines are accumulated until adding the
 * next line would exceed the limit; each group becomes a block of the same
 * kind. A single line larger than the limit is kept whole.
 *
 * @param block - Block to split.
 * @param maxWordsPerChunk - Word limit used when grouping lines.
 * @returns The resulting blocks in order.
 */
function splitOversizedBlock(block: Block, maxWordsPerChunk: number): Block[] {
  const atomicKinds: BlockKind[] = ["heading", "thematicBreak", "html", "code"]
  const fits = block.words <= maxWordsPerChunk
  if (fits || atomicKinds.includes(block.kind)) return [block]

  const sourceLines = block.md.split("\n")
  if (sourceLines.length < 2) return [block]

  const pieces: Block[] = []
  let pending: string[] = []
  let pendingWords = 0

  for (const text of sourceLines) {
    const count = countWords(text)
    if (pending.length > 0 && pendingWords + count > maxWordsPerChunk) {
      pieces.push(makeBlock(block.kind, pending.join("\n")))
      pending = []
      pendingWords = 0
    }
    pending.push(text)
    pendingWords += count
  }

  if (pending.length > 0) {
    pieces.push(makeBlock(block.kind, pending.join("\n")))
  }

  return pieces
}
|
||||
|
||||
/**
 * Estimate the number of words in a piece of markdown.
 *
 * Common markdown punctuation is replaced by spaces first. The count is then
 * the number of CJK ideographs (each counted as one word) plus the number of
 * runs of ASCII letters and digits.
 *
 * @param text - Markdown text.
 * @returns The estimated word count; 0 when nothing matches.
 */
function countWords(text: string): number {
  // Turn markup characters into separators so "snake_case" or "a-b" split
  // into their parts and fences/emphasis markers are not counted.
  const cleaned = text.replace(/[#*`\[\]()>|_~-]/g, " ")
  const cjk = cleaned.match(/[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff]/g)
  const latin = cleaned.match(/[a-zA-Z0-9]+/g)
  // match() returns null when there are no matches.
  return (cjk?.length ?? 0) + (latin?.length ?? 0)
}
|
||||
|
||||
interface Block {
|
||||
md: string
|
||||
words: number
|
||||
// Run the chunk CLI only when this module is the entry point
// (`import.meta.main`); importing it for reuse does nothing here.
if (import.meta.main) {
  // argv[0] is the runtime, argv[1] the script path, the rest are CLI args.
  const [, scriptPath = "chunk.ts", ...cliArgs] = process.argv
  process.exit(runChunkCli(cliArgs, scriptPath))
}
|
||||
|
||||
/**
 * Turn one AST node into blocks that fit the module-level `maxWords` limit
 * where possible.
 *
 * A node that fits, or whose type is heading, thematicBreak or html, becomes
 * a single block. Otherwise a node with children is split recursively per
 * child; a childless node is split by lines, and kept whole when it has only
 * one line.
 *
 * @param node - AST node to convert; its markdown text comes from
 *   `nodeToMd`.
 * @returns Blocks in order, each with its markdown text and word count.
 */
function splitNodeToBlocks(node: Content): Block[] {
  const md = nodeToMd(node)
  const words = countWords(md)
  const asSingleBlock = (): Block[] => [{ md, words }]

  const atomicTypes = ["heading", "thematicBreak", "html"]
  if (words <= maxWords || atomicTypes.includes(node.type)) {
    return asSingleBlock()
  }

  if ("children" in node && Array.isArray(node.children)) {
    return (node.children as Content[]).flatMap(child => splitNodeToBlocks(child))
  }

  const lines = md.split("\n")
  if (lines.length <= 1) return asSingleBlock()

  // Line fallback: group consecutive lines without exceeding the limit.
  const pieces: Block[] = []
  let pending: string[] = []
  let pendingWords = 0

  for (const line of lines) {
    const lineWords = countWords(line)
    if (pending.length > 0 && pendingWords + lineWords > maxWords) {
      pieces.push({ md: pending.join("\n"), words: pendingWords })
      pending = []
      pendingWords = 0
    }
    pending.push(line)
    pendingWords += lineWords
  }

  if (pending.length > 0) {
    pieces.push({ md: pending.join("\n"), words: pendingWords })
  }

  return pieces
}
|
||||
|
||||
// Separate the YAML frontmatter node from the content nodes; content nodes
// are converted to blocks in document order.
let frontmatter = ""
const blocks: Block[] = []

for (const node of tree.children) {
  if (node.type !== "yaml") {
    blocks.push(...splitNodeToBlocks(node as Content))
    continue
  }
  frontmatter = `---\n${node.value}\n---`
}

// Greedily pack blocks into chunks, closing a chunk as soon as the next
// block would push it past `maxWords`.
const chunks: { blocks: Block[]; words: number }[] = []
let pending: Block[] = []
let pendingWords = 0

for (const block of blocks) {
  if (pending.length > 0 && pendingWords + block.words > maxWords) {
    chunks.push({ blocks: pending, words: pendingWords })
    pending = []
    pendingWords = 0
  }
  pending.push(block)
  pendingWords += block.words
}

if (pending.length > 0) {
  chunks.push({ blocks: pending, words: pendingWords })
}

// Chunks go to <outputDir>/chunks when an output directory was given,
// otherwise to a "chunks" directory next to the source file.
const dir = join(outputDir ? outputDir : dirname(file), "chunks")
mkdirSync(dir, { recursive: true })

if (frontmatter !== "") {
  writeFileSync(join(dir, "frontmatter.md"), frontmatter)
}

// Files are numbered from 01 with at least two digits.
for (const [index, chunk] of chunks.entries()) {
  const label = String(index + 1).padStart(2, "0")
  const target = join(dir, `chunk-${label}.md`)
  writeFileSync(target, chunk.blocks.map(block => block.md).join("\n\n"))
}

// Single-line JSON summary on stdout.
const summary = {
  source: file,
  chunks: chunks.length,
  output_dir: dir,
  frontmatter: frontmatter !== "",
  words_per_chunk: chunks.map(chunk => chunk.words),
}
console.log(JSON.stringify(summary))
|
||||
|
|
|
|||
|
|
@ -0,0 +1,55 @@
|
|||
#!/usr/bin/env bun
|
||||
import path from "node:path"
|
||||
import process from "node:process"
|
||||
import { runChunkCli } from "./chunk.js"
|
||||
|
||||
/**
 * Build the command line shown in help text for invoking this script.
 *
 * The script path comes from `process.argv[1]`. It is shown relative to the
 * current working directory when that relative form is non-empty and does
 * not start with `..`, otherwise as given. A path containing a space is
 * wrapped in double quotes with embedded double quotes backslash-escaped.
 *
 * @param fallback - Path to display when `process.argv[1]` is not set.
 * @returns The command string, prefixed with `npx -y bun `.
 */
function formatScriptCommand(fallback: string): string {
  const scriptPath = process.argv[1]

  let shown = fallback
  if (scriptPath) {
    const fromCwd = path.relative(process.cwd(), scriptPath)
    const usableRelative = fromCwd !== "" && !fromCwd.startsWith("..")
    shown = usableRelative ? fromCwd : scriptPath
  }

  const rendered = shown.includes(" ")
    ? `"${shown.replace(/"/g, '\\"')}"`
    : shown

  return `npx -y bun ${rendered}`
}
|
||||
|
||||
/**
 * Print the CLI help text to stdout and terminate the process.
 *
 * @param exitCode - Process exit status to terminate with.
 * @returns Never returns; the process exits.
 */
function printUsage(exitCode: number): never {
  const cmd = formatScriptCommand("scripts/main.ts")

  const helpLines = [
    "Baoyu Translate CLI",
    "",
    "Usage:",
    `${cmd} <file> [--max-words 5000] [--output-dir <dir>]`,
    `${cmd} chunk <file> [--max-words 5000] [--output-dir <dir>]`,
    "",
    "Commands:",
    "chunk Split markdown into chunks",
    "",
    "Options:",
    "--max-words <n> Maximum words per chunk (default: 5000)",
    "--output-dir <dir> Write chunks into <dir>/chunks/",
    "-h, --help Show help",
    // Trailing empty entry keeps the final newline of the help text.
    "",
  ]

  console.log(helpLines.join("\n"))
  process.exit(exitCode)
}
|
||||
|
||||
// CLI dispatch: no arguments prints usage and exits 1, -h/--help prints usage
// and exits 0, an explicit "chunk" subcommand or a bare file argument runs
// the chunk CLI and exits with its return value.
const args = process.argv.slice(2)
const [firstArg] = args

if (firstArg === undefined) {
  printUsage(1)
}

if (firstArg === "-h" || firstArg === "--help") {
  printUsage(0)
}

const scriptCommand = formatScriptCommand("scripts/main.ts")
const exitCode = firstArg === "chunk"
  ? runChunkCli(args.slice(1), `${scriptCommand} chunk`)
  : runChunkCli(args, scriptCommand)

process.exit(exitCode)
|
||||
|
|
@ -1,9 +1,7 @@
|
|||
{
|
||||
"name": "baoyu-translate-chunk",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"remark-frontmatter": "^5.0.0",
|
||||
"remark-gfm": "^4.0.1",
|
||||
"remark-parse": "^11.0.0",
|
||||
"remark-stringify": "^11.0.0",
|
||||
"unified": "^11.0.5"
|
||||
"markdown-it": "14.1.1"
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue