From 5560db595ac0286176d08a9c0cd420568be4450e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20Liu=20=E5=AE=9D=E7=8E=89?= Date: Fri, 6 Mar 2026 21:18:21 -0600 Subject: [PATCH] feat(baoyu-url-to-markdown): add HTML snapshot saving and Defuddle fallback pipeline - Save rendered HTML as sibling -captured.html file alongside markdown - Defuddle-first conversion with automatic fallback to legacy Readability/selector extractor - Add rawHtml, conversionMethod, fallbackReason to ConversionResult - Log converter method and fallback reason in CLI output --- README.md | 2 +- README.zh.md | 2 +- skills/baoyu-url-to-markdown/SKILL.md | 34 +- skills/baoyu-url-to-markdown/scripts/bun.lock | 191 ++++ .../scripts/html-to-markdown.ts | 893 +++++++++++++++++- skills/baoyu-url-to-markdown/scripts/main.ts | 15 +- .../scripts/package.json | 13 + 7 files changed, 1130 insertions(+), 20 deletions(-) create mode 100644 skills/baoyu-url-to-markdown/scripts/bun.lock create mode 100644 skills/baoyu-url-to-markdown/scripts/package.json diff --git a/README.md b/README.md index ccfe2d3..9ad3043 100644 --- a/README.md +++ b/README.md @@ -677,7 +677,7 @@ Utility tools for content processing. #### baoyu-url-to-markdown -Fetch any URL via Chrome CDP and convert to clean markdown. Supports two capture modes for different scenarios. +Fetch any URL via Chrome CDP and convert to clean markdown. Saves rendered HTML snapshot alongside the markdown, and automatically falls back to a legacy extractor when Defuddle fails. 
```bash # Auto mode (default) - capture when page loads diff --git a/README.zh.md b/README.zh.md index c593550..5861d51 100644 --- a/README.zh.md +++ b/README.zh.md @@ -677,7 +677,7 @@ AI 驱动的生成后端。 #### baoyu-url-to-markdown -通过 Chrome CDP 抓取任意 URL 并转换为干净的 Markdown。支持两种抓取模式,适应不同场景。 +通过 Chrome CDP 抓取任意 URL 并转换为 Markdown。同时保存渲染后的 HTML 快照,Defuddle 失败时自动回退到旧版提取器。 ```bash # 自动模式(默认)- 页面加载后立即抓取 diff --git a/skills/baoyu-url-to-markdown/SKILL.md b/skills/baoyu-url-to-markdown/SKILL.md index 2f5bf05..7b3d72d 100644 --- a/skills/baoyu-url-to-markdown/SKILL.md +++ b/skills/baoyu-url-to-markdown/SKILL.md @@ -1,11 +1,11 @@ --- name: baoyu-url-to-markdown -description: Fetch any URL and convert to markdown using Chrome CDP. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown. +description: Fetch any URL and convert to markdown using Chrome CDP. Saves the rendered HTML snapshot alongside the markdown, and automatically falls back to the pre-Defuddle HTML-to-Markdown pipeline when Defuddle fails. Supports two modes - auto-capture on page load, or wait for user signal (for pages requiring login). Use when user wants to save a webpage as markdown. --- # URL to Markdown -Fetches any URL via Chrome CDP and converts HTML to clean markdown. +Fetches any URL via Chrome CDP, saves the rendered HTML snapshot, and converts it to clean markdown. ## Script Directory @@ -21,6 +21,7 @@ Fetches any URL via Chrome CDP and converts HTML to clean markdown. 
| Script | Purpose | |--------|---------| | `scripts/main.ts` | CLI entry point for URL fetching | +| `scripts/html-to-markdown.ts` | Defuddle-first conversion with automatic legacy fallback | ## Preferences (EXTEND.md) @@ -101,7 +102,9 @@ Full reference: [references/config/first-time-setup.md](references/config/first- - Chrome CDP for full JavaScript rendering - Two capture modes: auto or wait-for-user +- Save rendered HTML as a sibling `-captured.html` file - Clean markdown output with metadata +- Defuddle-first markdown conversion with automatic fallback to the pre-Defuddle extractor from git history - Handles login-required pages via wait mode - Download images and videos to local directories @@ -149,13 +152,23 @@ ${BUN_X} ${SKILL_DIR}/scripts/main.ts --download-media ## Output Format -YAML front matter with `url`, `title`, `description`, `author`, `published`, `captured_at` fields, followed by converted markdown content. +Each run saves two files side by side: + +- Markdown: YAML front matter with `url`, `title`, `description`, `author`, `published`, optional `coverImage`, and `captured_at`, followed by converted markdown content +- HTML snapshot: `*-captured.html`, containing the rendered page HTML captured from Chrome + +The HTML snapshot is saved before any markdown media localization, so it stays a faithful capture of the page DOM used for conversion. ## Output Directory Default: `url-to-markdown/<domain>/<slug>.md` With `--output-dir ./posts/`: `./posts/<domain>/<slug>.md` +HTML snapshot path uses the same basename: + +- `url-to-markdown/<domain>/<slug>-captured.html` +- `./posts/<domain>/<slug>-captured.html` + - `<slug>`: From page title or URL path (kebab-case, 2-6 words) - Conflict resolution: Append timestamp `<slug>-YYYYMMDD-HHMMSS.md` @@ -164,6 +177,19 @@ When `--download-media` is enabled: - Videos are saved to `videos/` next to the markdown file - Markdown media links are rewritten to local relative paths +## Conversion Fallback + +Conversion order: + +1. Try Defuddle first +2. 
If Defuddle throws, cannot load, returns obviously incomplete markdown, or captures lower-quality content than the legacy pipeline, automatically fall back to the pre-Defuddle extractor +3. The fallback path uses the older Readability/selector/Next.js-data based HTML-to-Markdown implementation recovered from git history + +CLI output will show: + +- `Converter: defuddle` when Defuddle succeeds +- `Converter: legacy:...` plus `Fallback used: ...` when fallback was needed + ## Media Download Workflow Based on `download_media` setting in EXTEND.md: @@ -193,7 +219,7 @@ Based on `download_media` setting in EXTEND.md: | `URL_DATA_DIR` | Custom data directory | | `URL_CHROME_PROFILE_DIR` | Custom Chrome profile directory | -**Troubleshooting**: Chrome not found → set `URL_CHROME_PATH`. Timeout → increase `--timeout`. Complex pages → try `--wait` mode. +**Troubleshooting**: Chrome not found → set `URL_CHROME_PATH`. Timeout → increase `--timeout`. Complex pages → try `--wait` mode. If markdown quality is poor, inspect the saved `-captured.html` and check whether the run logged a legacy fallback. 
## Extension Support diff --git a/skills/baoyu-url-to-markdown/scripts/bun.lock b/skills/baoyu-url-to-markdown/scripts/bun.lock new file mode 100644 index 0000000..c167bc3 --- /dev/null +++ b/skills/baoyu-url-to-markdown/scripts/bun.lock @@ -0,0 +1,191 @@ +{ + "lockfileVersion": 1, + "workspaces": { + "": { + "name": "baoyu-url-to-markdown-scripts", + "dependencies": { + "@mozilla/readability": "^0.6.0", + "defuddle": "^0.10.0", + "jsdom": "^24.1.3", + "linkedom": "^0.18.12", + "turndown": "^7.2.2", + "turndown-plugin-gfm": "^1.0.2", + }, + }, + }, + "packages": { + "@asamuzakjp/css-color": ["@asamuzakjp/css-color@3.2.0", "", { "dependencies": { "@csstools/css-calc": "^2.1.3", "@csstools/css-color-parser": "^3.0.9", "@csstools/css-parser-algorithms": "^3.0.4", "@csstools/css-tokenizer": "^3.0.3", "lru-cache": "^10.4.3" } }, "sha512-K1A6z8tS3XsmCMM86xoWdn7Fkdn9m6RSVtocUrJYIwZnFVkng/PvkEoWtOWmP+Scc6saYWHWZYbndEEXxl24jw=="], + + "@csstools/color-helpers": ["@csstools/color-helpers@5.1.0", "", {}, "sha512-S11EXWJyy0Mz5SYvRmY8nJYTFFd1LCNV+7cXyAgQtOOuzb4EsgfqDufL+9esx72/eLhsRdGZwaldu/h+E4t4BA=="], + + "@csstools/css-calc": ["@csstools/css-calc@2.1.4", "", { "peerDependencies": { "@csstools/css-parser-algorithms": "^3.0.5", "@csstools/css-tokenizer": "^3.0.4" } }, "sha512-3N8oaj+0juUw/1H3YwmDDJXCgTB1gKU6Hc/bB502u9zR0q2vd786XJH9QfrKIEgFlZmhZiq6epXl4rHqhzsIgQ=="], + + "@csstools/css-color-parser": ["@csstools/css-color-parser@3.1.0", "", { "dependencies": { "@csstools/color-helpers": "^5.1.0", "@csstools/css-calc": "^2.1.4" }, "peerDependencies": { "@csstools/css-parser-algorithms": "^3.0.5", "@csstools/css-tokenizer": "^3.0.4" } }, "sha512-nbtKwh3a6xNVIp/VRuXV64yTKnb1IjTAEEh3irzS+HkKjAOYLTGNb9pmVNntZ8iVBHcWDA2Dof0QtPgFI1BaTA=="], + + "@csstools/css-parser-algorithms": ["@csstools/css-parser-algorithms@3.0.5", "", { "peerDependencies": { "@csstools/css-tokenizer": "^3.0.4" } }, 
"sha512-DaDeUkXZKjdGhgYaHNJTV9pV7Y9B3b644jCLs9Upc3VeNGg6LWARAT6O+Q+/COo+2gg/bM5rhpMAtf70WqfBdQ=="], + + "@csstools/css-tokenizer": ["@csstools/css-tokenizer@3.0.4", "", {}, "sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw=="], + + "@mixmark-io/domino": ["@mixmark-io/domino@2.2.0", "", {}, "sha512-Y28PR25bHXUg88kCV7nivXrP2Nj2RueZ3/l/jdx6J9f8J4nsEGcgX0Qe6lt7Pa+J79+kPiJU3LguR6O/6zrLOw=="], + + "@mozilla/readability": ["@mozilla/readability@0.6.0", "", {}, "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ=="], + + "@xmldom/xmldom": ["@xmldom/xmldom@0.8.11", "", {}, "sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw=="], + + "agent-base": ["agent-base@7.1.4", "", {}, "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ=="], + + "asynckit": ["asynckit@0.4.0", "", {}, "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q=="], + + "boolbase": ["boolbase@1.0.0", "", {}, "sha512-JZOSA7Mo9sNGB8+UjSgzdLtokWAky1zbztM3WRLCbZ70/3cTANmQmOdR7y2g+J0e2WXywy1yS468tY+IruqEww=="], + + "call-bind-apply-helpers": ["call-bind-apply-helpers@1.0.2", "", { "dependencies": { "es-errors": "^1.3.0", "function-bind": "^1.1.2" } }, "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ=="], + + "combined-stream": ["combined-stream@1.0.8", "", { "dependencies": { "delayed-stream": "~1.0.0" } }, "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg=="], + + "commander": ["commander@12.1.0", "", {}, "sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA=="], + + "css-select": ["css-select@5.2.2", "", { "dependencies": { "boolbase": "^1.0.0", "css-what": "^6.1.0", "domhandler": "^5.0.2", "domutils": "^3.0.1", "nth-check": "^2.0.1" } }, 
"sha512-TizTzUddG/xYLA3NXodFM0fSbNizXjOKhqiQQwvhlspadZokn1KDy0NZFS0wuEubIYAV5/c1/lAr0TaaFXEXzw=="], + + "css-what": ["css-what@6.2.2", "", {}, "sha512-u/O3vwbptzhMs3L1fQE82ZSLHQQfto5gyZzwteVIEyeaY5Fc7R4dapF/BvRoSYFeqfBk4m0V1Vafq5Pjv25wvA=="], + + "cssom": ["cssom@0.5.0", "", {}, "sha512-iKuQcq+NdHqlAcwUY0o/HL69XQrUaQdMjmStJ8JFmUaiiQErlhrmuigkg/CU4E2J0IyUKUrMAgl36TvN67MqTw=="], + + "cssstyle": ["cssstyle@4.6.0", "", { "dependencies": { "@asamuzakjp/css-color": "^3.2.0", "rrweb-cssom": "^0.8.0" } }, "sha512-2z+rWdzbbSZv6/rhtvzvqeZQHrBaqgogqt85sqFNbabZOuFbCVFb8kPeEtZjiKkbrm395irpNKiYeFeLiQnFPg=="], + + "data-urls": ["data-urls@5.0.0", "", { "dependencies": { "whatwg-mimetype": "^4.0.0", "whatwg-url": "^14.0.0" } }, "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg=="], + + "debug": ["debug@4.4.3", "", { "dependencies": { "ms": "^2.1.3" } }, "sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA=="], + + "decimal.js": ["decimal.js@10.6.0", "", {}, "sha512-YpgQiITW3JXGntzdUmyUR1V812Hn8T1YVXhCu+wO3OpS4eU9l4YdD3qjyiKdV6mvV29zapkMeD390UVEf2lkUg=="], + + "defuddle": ["defuddle@0.10.0", "", { "dependencies": { "commander": "^12.1.0" }, "optionalDependencies": { "mathml-to-latex": "^1.5.0", "temml": "^0.13.1", "turndown": "^7.2.0" }, "peerDependencies": { "jsdom": "^24.0.0" }, "bin": { "defuddle": "dist/cli.js" } }, "sha512-a43juTtHv6Vs4+sxvahVLM5NxoyDsarO1Ag3UxLORI4Fo/nsNFwzDxuQBvosKVGTIRxCwN/mfnWAzNXmQfieqw=="], + + "delayed-stream": ["delayed-stream@1.0.0", "", {}, "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ=="], + + "dom-serializer": ["dom-serializer@2.0.0", "", { "dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.2", "entities": "^4.2.0" } }, "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg=="], + + "domelementtype": ["domelementtype@2.3.0", "", {}, 
"sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw=="], + + "domhandler": ["domhandler@5.0.3", "", { "dependencies": { "domelementtype": "^2.3.0" } }, "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w=="], + + "domutils": ["domutils@3.2.2", "", { "dependencies": { "dom-serializer": "^2.0.0", "domelementtype": "^2.3.0", "domhandler": "^5.0.3" } }, "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw=="], + + "dunder-proto": ["dunder-proto@1.0.1", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.1", "es-errors": "^1.3.0", "gopd": "^1.2.0" } }, "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A=="], + + "entities": ["entities@6.0.1", "", {}, "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g=="], + + "es-define-property": ["es-define-property@1.0.1", "", {}, "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g=="], + + "es-errors": ["es-errors@1.3.0", "", {}, "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw=="], + + "es-object-atoms": ["es-object-atoms@1.1.1", "", { "dependencies": { "es-errors": "^1.3.0" } }, "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA=="], + + "es-set-tostringtag": ["es-set-tostringtag@2.1.0", "", { "dependencies": { "es-errors": "^1.3.0", "get-intrinsic": "^1.2.6", "has-tostringtag": "^1.0.2", "hasown": "^2.0.2" } }, "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA=="], + + "form-data": ["form-data@4.0.5", "", { "dependencies": { "asynckit": "^0.4.0", "combined-stream": "^1.0.8", "es-set-tostringtag": "^2.1.0", "hasown": "^2.0.2", "mime-types": "^2.1.12" } }, 
"sha512-8RipRLol37bNs2bhoV67fiTEvdTrbMUYcFTiy3+wuuOnUog2QBHCZWXDRijWQfAkhBj2Uf5UnVaiWwA5vdd82w=="], + + "function-bind": ["function-bind@1.1.2", "", {}, "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA=="], + + "get-intrinsic": ["get-intrinsic@1.3.0", "", { "dependencies": { "call-bind-apply-helpers": "^1.0.2", "es-define-property": "^1.0.1", "es-errors": "^1.3.0", "es-object-atoms": "^1.1.1", "function-bind": "^1.1.2", "get-proto": "^1.0.1", "gopd": "^1.2.0", "has-symbols": "^1.1.0", "hasown": "^2.0.2", "math-intrinsics": "^1.1.0" } }, "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ=="], + + "get-proto": ["get-proto@1.0.1", "", { "dependencies": { "dunder-proto": "^1.0.1", "es-object-atoms": "^1.0.0" } }, "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g=="], + + "gopd": ["gopd@1.2.0", "", {}, "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg=="], + + "has-symbols": ["has-symbols@1.1.0", "", {}, "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ=="], + + "has-tostringtag": ["has-tostringtag@1.0.2", "", { "dependencies": { "has-symbols": "^1.0.3" } }, "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw=="], + + "hasown": ["hasown@2.0.2", "", { "dependencies": { "function-bind": "^1.1.2" } }, "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ=="], + + "html-encoding-sniffer": ["html-encoding-sniffer@4.0.0", "", { "dependencies": { "whatwg-encoding": "^3.1.1" } }, "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ=="], + + "html-escaper": ["html-escaper@3.0.3", "", {}, "sha512-RuMffC89BOWQoY0WKGpIhn5gX3iI54O6nRA0yC124NYVtzjmFWBIiFd8M0x+ZdX0P9R4lADg1mgP8C7PxGOWuQ=="], + + "htmlparser2": ["htmlparser2@10.1.0", "", { 
"dependencies": { "domelementtype": "^2.3.0", "domhandler": "^5.0.3", "domutils": "^3.2.2", "entities": "^7.0.1" } }, "sha512-VTZkM9GWRAtEpveh7MSF6SjjrpNVNNVJfFup7xTY3UpFtm67foy9HDVXneLtFVt4pMz5kZtgNcvCniNFb1hlEQ=="], + + "http-proxy-agent": ["http-proxy-agent@7.0.2", "", { "dependencies": { "agent-base": "^7.1.0", "debug": "^4.3.4" } }, "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig=="], + + "https-proxy-agent": ["https-proxy-agent@7.0.6", "", { "dependencies": { "agent-base": "^7.1.2", "debug": "4" } }, "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw=="], + + "iconv-lite": ["iconv-lite@0.6.3", "", { "dependencies": { "safer-buffer": ">= 2.1.2 < 3.0.0" } }, "sha512-4fCk79wshMdzMp2rH06qWrJE4iolqLhCUH+OiuIgU++RB0+94NlDL81atO7GX55uUKueo0txHNtvEyI6D7WdMw=="], + + "is-potential-custom-element-name": ["is-potential-custom-element-name@1.0.1", "", {}, "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ=="], + + "jsdom": ["jsdom@24.1.3", "", { "dependencies": { "cssstyle": "^4.0.1", "data-urls": "^5.0.0", "decimal.js": "^10.4.3", "form-data": "^4.0.0", "html-encoding-sniffer": "^4.0.0", "http-proxy-agent": "^7.0.2", "https-proxy-agent": "^7.0.5", "is-potential-custom-element-name": "^1.0.1", "nwsapi": "^2.2.12", "parse5": "^7.1.2", "rrweb-cssom": "^0.7.1", "saxes": "^6.0.0", "symbol-tree": "^3.2.4", "tough-cookie": "^4.1.4", "w3c-xmlserializer": "^5.0.0", "webidl-conversions": "^7.0.0", "whatwg-encoding": "^3.1.1", "whatwg-mimetype": "^4.0.0", "whatwg-url": "^14.0.0", "ws": "^8.18.0", "xml-name-validator": "^5.0.0" }, "peerDependencies": { "canvas": "^2.11.2" }, "optionalPeers": ["canvas"] }, "sha512-MyL55p3Ut3cXbeBEG7Hcv0mVM8pp8PBNWxRqchZnSfAiES1v1mRnMeFfaHWIPULpwsYfvO+ZmMZz5tGCnjzDUQ=="], + + "linkedom": ["linkedom@0.18.12", "", { "dependencies": { "css-select": "^5.1.0", "cssom": "^0.5.0", "html-escaper": "^3.0.3", "htmlparser2": 
"^10.0.0", "uhyphen": "^0.2.0" }, "peerDependencies": { "canvas": ">= 2" }, "optionalPeers": ["canvas"] }, "sha512-jalJsOwIKuQJSeTvsgzPe9iJzyfVaEJiEXl+25EkKevsULHvMJzpNqwvj1jOESWdmgKDiXObyjOYwlUqG7wo1Q=="], + + "lru-cache": ["lru-cache@10.4.3", "", {}, "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ=="], + + "math-intrinsics": ["math-intrinsics@1.1.0", "", {}, "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g=="], + + "mathml-to-latex": ["mathml-to-latex@1.5.0", "", { "dependencies": { "@xmldom/xmldom": "^0.8.10" } }, "sha512-rrWn0eEvcEcdMM4xfHcSGIy+i01DX9byOdXTLWg+w1iJ6O6ohP5UXY1dVzNUZLhzfl3EGcRekWLhY7JT5Omaew=="], + + "mime-db": ["mime-db@1.52.0", "", {}, "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg=="], + + "mime-types": ["mime-types@2.1.35", "", { "dependencies": { "mime-db": "1.52.0" } }, "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw=="], + + "ms": ["ms@2.1.3", "", {}, "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA=="], + + "nth-check": ["nth-check@2.1.1", "", { "dependencies": { "boolbase": "^1.0.0" } }, "sha512-lqjrjmaOoAnWfMmBPL+XNnynZh2+swxiX3WUE0s4yEHI6m+AwrK2UZOimIRl3X/4QctVqS8AiZjFqyOGrMXb/w=="], + + "nwsapi": ["nwsapi@2.2.23", "", {}, "sha512-7wfH4sLbt4M0gCDzGE6vzQBo0bfTKjU7Sfpqy/7gs1qBfYz2vEJH6vXcBKpO3+6Yu1telwd0t9HpyOoLEQQbIQ=="], + + "parse5": ["parse5@7.3.0", "", { "dependencies": { "entities": "^6.0.0" } }, "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw=="], + + "psl": ["psl@1.15.0", "", { "dependencies": { "punycode": "^2.3.1" } }, "sha512-JZd3gMVBAVQkSs6HdNZo9Sdo0LNcQeMNP3CozBJb3JYC/QUYZTnKxP+f8oWRX4rHP5EurWxqAHTSwUCjlNKa1w=="], + + "punycode": ["punycode@2.3.1", "", {}, 
"sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg=="], + + "querystringify": ["querystringify@2.2.0", "", {}, "sha512-FIqgj2EUvTa7R50u0rGsyTftzjYmv/a3hO345bZNrqabNqjtgiDMgmo4mkUjd+nzU5oF3dClKqFIPUKybUyqoQ=="], + + "requires-port": ["requires-port@1.0.0", "", {}, "sha512-KigOCHcocU3XODJxsu8i/j8T9tzT4adHiecwORRQ0ZZFcp7ahwXuRU1m+yuO90C5ZUyGeGfocHDI14M3L3yDAQ=="], + + "rrweb-cssom": ["rrweb-cssom@0.7.1", "", {}, "sha512-TrEMa7JGdVm0UThDJSx7ddw5nVm3UJS9o9CCIZ72B1vSyEZoziDqBYP3XIoi/12lKrJR8rE3jeFHMok2F/Mnsg=="], + + "safer-buffer": ["safer-buffer@2.1.2", "", {}, "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg=="], + + "saxes": ["saxes@6.0.0", "", { "dependencies": { "xmlchars": "^2.2.0" } }, "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA=="], + + "symbol-tree": ["symbol-tree@3.2.4", "", {}, "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw=="], + + "temml": ["temml@0.13.1", "", {}, "sha512-/fL1utq8QUD9YpcLeZHPRnp9Cbzbexq5hZl5uSBhf8mNYiKkcS4eYbLidDB+/nF8C+RHAcBQbKw2bKoS83mz1Q=="], + + "tough-cookie": ["tough-cookie@4.1.4", "", { "dependencies": { "psl": "^1.1.33", "punycode": "^2.1.1", "universalify": "^0.2.0", "url-parse": "^1.5.3" } }, "sha512-Loo5UUvLD9ScZ6jh8beX1T6sO1w2/MpCRpEP7V280GKMVUQ0Jzar2U3UJPsrdbziLEMMhu3Ujnq//rhiFuIeag=="], + + "tr46": ["tr46@5.1.1", "", { "dependencies": { "punycode": "^2.3.1" } }, "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw=="], + + "turndown": ["turndown@7.2.2", "", { "dependencies": { "@mixmark-io/domino": "^2.2.0" } }, "sha512-1F7db8BiExOKxjSMU2b7if62D/XOyQyZbPKq/nUwopfgnHlqXHqQ0lvfUTeUIr1lZJzOPFn43dODyMSIfvWRKQ=="], + + "turndown-plugin-gfm": ["turndown-plugin-gfm@1.0.2", "", {}, "sha512-vwz9tfvF7XN/jE0dGoBei3FXWuvll78ohzCZQuOb+ZjWrs3a0XhQVomJEb2Qh4VHTPNRO4GPZh0V7VRbiWwkRg=="], + + "uhyphen": 
["uhyphen@0.2.0", "", {}, "sha512-qz3o9CHXmJJPGBdqzab7qAYuW8kQGKNEuoHFYrBwV6hWIMcpAmxDLXojcHfFr9US1Pe6zUswEIJIbLI610fuqA=="], + + "universalify": ["universalify@0.2.0", "", {}, "sha512-CJ1QgKmNg3CwvAv/kOFmtnEN05f0D/cn9QntgNOQlQF9dgvVTHj3t+8JPdjqawCHk7V/KA+fbUqzZ9XWhcqPUg=="], + + "url-parse": ["url-parse@1.5.10", "", { "dependencies": { "querystringify": "^2.1.1", "requires-port": "^1.0.0" } }, "sha512-WypcfiRhfeUP9vvF0j6rw0J3hrWrw6iZv3+22h6iRMJ/8z1Tj6XfLP4DsUix5MhMPnXpiHDoKyoZ/bdCkwBCiQ=="], + + "w3c-xmlserializer": ["w3c-xmlserializer@5.0.0", "", { "dependencies": { "xml-name-validator": "^5.0.0" } }, "sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA=="], + + "webidl-conversions": ["webidl-conversions@7.0.0", "", {}, "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g=="], + + "whatwg-encoding": ["whatwg-encoding@3.1.1", "", { "dependencies": { "iconv-lite": "0.6.3" } }, "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ=="], + + "whatwg-mimetype": ["whatwg-mimetype@4.0.0", "", {}, "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg=="], + + "whatwg-url": ["whatwg-url@14.2.0", "", { "dependencies": { "tr46": "^5.1.0", "webidl-conversions": "^7.0.0" } }, "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw=="], + + "ws": ["ws@8.19.0", "", { "peerDependencies": { "bufferutil": "^4.0.1", "utf-8-validate": ">=5.0.2" }, "optionalPeers": ["bufferutil", "utf-8-validate"] }, "sha512-blAT2mjOEIi0ZzruJfIhb3nps74PRWTCz1IjglWEEpQl5XS/UNama6u2/rjFkDDouqr4L67ry+1aGIALViWjDg=="], + + "xml-name-validator": ["xml-name-validator@5.0.0", "", {}, "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg=="], + + "xmlchars": ["xmlchars@2.2.0", "", {}, 
"sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw=="], + + "cssstyle/rrweb-cssom": ["rrweb-cssom@0.8.0", "", {}, "sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw=="], + + "dom-serializer/entities": ["entities@4.5.0", "", {}, "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw=="], + + "htmlparser2/entities": ["entities@7.0.1", "", {}, "sha512-TWrgLOFUQTH994YUyl1yT4uyavY5nNB5muff+RtWaqNVCAK408b5ZnnbNAUEWLTCpum9w6arT70i1XdQ4UeOPA=="], + } +} diff --git a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts index ff2f6ed..f93af43 100644 --- a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts +++ b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts @@ -1,5 +1,7 @@ -import { JSDOM } from "jsdom"; -import { Defuddle } from "defuddle/node"; +import { parseHTML } from "linkedom"; +import { Readability } from "@mozilla/readability"; +import TurndownService from "turndown"; +import { gfm } from "turndown-plugin-gfm"; export interface PageMetadata { url: string; @@ -14,8 +16,103 @@ export interface PageMetadata { export interface ConversionResult { metadata: PageMetadata; markdown: string; + rawHtml: string; + conversionMethod: string; + fallbackReason?: string; } +interface ExtractionCandidate { + title: string | null; + byline: string | null; + excerpt: string | null; + published: string | null; + html: string | null; + textContent: string; + method: string; +} + +type AnyRecord = Record; + +const MIN_CONTENT_LENGTH = 120; +const GOOD_CONTENT_LENGTH = 900; + +const CONTENT_SELECTORS = [ + "article", + "main article", + "[role='main'] article", + "[itemprop='articleBody']", + ".article-content", + ".article-body", + ".post-content", + ".entry-content", + ".story-body", + "main", + "[role='main']", + "#content", + ".content", +]; + +const REMOVE_SELECTORS = [ + 
"script", + "style", + "noscript", + "template", + "iframe", + "svg", + "path", + "nav", + "aside", + "footer", + "header", + "form", + ".advertisement", + ".ads", + ".social-share", + ".related-articles", + ".comments", + ".newsletter", + ".cookie-banner", + ".cookie-consent", + "[role='navigation']", + "[aria-label*='cookie' i]", +]; + +const PUBLISHED_TIME_SELECTORS = [ + "meta[property='article:published_time']", + "meta[name='pubdate']", + "meta[name='publishdate']", + "meta[name='date']", + "time[datetime]", +]; + +const ARTICLE_TYPES = new Set([ + "Article", + "NewsArticle", + "BlogPosting", + "WebPage", + "ReportageNewsArticle", +]); + +const NEXT_DATA_CONTENT_PATHS = [ + "props.pageProps.content.body", + "props.pageProps.article.body", + "props.pageProps.article.content", + "props.pageProps.post.body", + "props.pageProps.post.content", + "props.pageProps.data.body", + "props.pageProps.story.body.content", +]; + +const LOW_QUALITY_MARKERS = [ + /Join The Conversation/i, + /One Community\. 
Many Voices/i, + /Read our community guidelines/i, + /Create a free account to share your thoughts/i, + /Become a Forbes Member/i, + /Subscribe to trusted journalism/i, + /\bComments\b/i, +]; + export const absolutizeUrlsScript = String.raw` (function() { const baseUrl = document.baseURI || location.href; @@ -53,21 +150,791 @@ export const absolutizeUrlsScript = String.raw` })() `; -export async function extractContent(html: string, url: string): Promise { - const dom = new JSDOM(html, { url }); - const result = await Defuddle(dom, url, { markdown: true }); +function pickString(...values: unknown[]): string | null { + for (const value of values) { + if (typeof value === "string") { + const trimmed = value.trim(); + if (trimmed) return trimmed; + } + } + return null; +} - const metadata: PageMetadata = { +function normalizeMarkdown(markdown: string): string { + return markdown + .replace(/\r\n/g, "\n") + .replace(/[ \t]+\n/g, "\n") + .replace(/\n{3,}/g, "\n\n") + .trim(); +} + +function parseDocument(html: string): Document { + const normalized = /<\s*html[\s>]/i.test(html) + ? html + : `${html}`; + return parseHTML(normalized).document as unknown as Document; +} + +function sanitizeHtml(html: string): string { + const { document } = parseHTML(`
${html}
`); + const root = document.querySelector("#__root"); + if (!root) return html; + + for (const selector of ["script", "style", "iframe", "noscript", "template", "svg", "path"]) { + for (const el of root.querySelectorAll(selector)) { + el.remove(); + } + } + + return root.innerHTML; +} + +function extractTextFromHtml(html: string): string { + const { document } = parseHTML(`${html}`); + for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) { + for (const el of document.querySelectorAll(selector)) { + el.remove(); + } + } + return document.body?.textContent?.replace(/\s+/g, " ").trim() ?? ""; +} + +function getMetaContent(document: Document, names: string[]): string | null { + for (const name of names) { + const element = + document.querySelector(`meta[name="${name}"]`) ?? + document.querySelector(`meta[property="${name}"]`); + const content = element?.getAttribute("content"); + if (content && content.trim()) return content.trim(); + } + return null; +} + +function flattenJsonLdItems(data: unknown): AnyRecord[] { + if (!data || typeof data !== "object") return []; + if (Array.isArray(data)) return data.flatMap(flattenJsonLdItems); + + const item = data as AnyRecord; + if (Array.isArray(item["@graph"])) { + return (item["@graph"] as unknown[]).flatMap(flattenJsonLdItems); + } + + return [item]; +} + +function parseJsonLdScripts(document: Document): AnyRecord[] { + const results: AnyRecord[] = []; + const scripts = document.querySelectorAll("script[type='application/ld+json']"); + + for (const script of scripts) { + try { + const data = JSON.parse(script.textContent ?? ""); + results.push(...flattenJsonLdItems(data)); + } catch { + // Ignore malformed blocks. + } + } + + return results; +} + +function isArticleType(item: AnyRecord): boolean { + const value = Array.isArray(item["@type"]) ? 
item["@type"][0] : item["@type"]; + return typeof value === "string" && ARTICLE_TYPES.has(value); +} + +function extractAuthorFromJsonLd(authorData: unknown): string | null { + if (typeof authorData === "string") return authorData; + if (!authorData || typeof authorData !== "object") return null; + + if (Array.isArray(authorData)) { + const names = authorData + .map((author) => extractAuthorFromJsonLd(author)) + .filter((name): name is string => Boolean(name)); + return names.length > 0 ? names.join(", ") : null; + } + + const author = authorData as AnyRecord; + return typeof author.name === "string" ? author.name : null; +} + +function extractPrimaryJsonLdMeta(document: Document): Partial { + for (const item of parseJsonLdScripts(document)) { + if (!isArticleType(item)) continue; + + return { + title: pickString(item.headline, item.name) ?? undefined, + description: pickString(item.description) ?? undefined, + author: extractAuthorFromJsonLd(item.author) ?? undefined, + published: pickString(item.datePublished, item.dateCreated) ?? undefined, + coverImage: + pickString( + item.image, + (item.image as AnyRecord | undefined)?.url, + (Array.isArray(item.image) ? item.image[0] : undefined) as unknown + ) ?? undefined, + }; + } + + return {}; +} + +function extractPublishedTime(document: Document): string | null { + for (const selector of PUBLISHED_TIME_SELECTORS) { + const el = document.querySelector(selector); + if (!el) continue; + const value = el.getAttribute("content") ?? 
el.getAttribute("datetime"); + if (value && value.trim()) return value.trim(); + } + return null; +} + +function extractTitle(document: Document): string | null { + const ogTitle = document.querySelector("meta[property='og:title']")?.getAttribute("content"); + if (ogTitle && ogTitle.trim()) return ogTitle.trim(); + + const twitterTitle = document.querySelector("meta[name='twitter:title']")?.getAttribute("content"); + if (twitterTitle && twitterTitle.trim()) return twitterTitle.trim(); + + const title = document.querySelector("title")?.textContent?.trim(); + if (title) { + const cleaned = title.split(/\s*[-|–—]\s*/)[0]?.trim(); + if (cleaned) return cleaned; + } + + const h1 = document.querySelector("h1")?.textContent?.trim(); + return h1 || null; +} + +function extractMetadataFromHtml(html: string, url: string, capturedAt: string): PageMetadata { + const document = parseDocument(html); + const jsonLd = extractPrimaryJsonLdMeta(document); + const timeEl = document.querySelector("time[datetime]"); + + return { url, - title: result.title || "", - description: result.description || undefined, - author: result.author || undefined, - published: result.published || undefined, - coverImage: result.image || undefined, - captured_at: new Date().toISOString(), + title: + pickString( + getMetaContent(document, ["og:title", "twitter:title"]), + jsonLd.title, + document.querySelector("h1")?.textContent, + document.title + ) ?? "", + description: + pickString( + getMetaContent(document, ["description", "og:description", "twitter:description"]), + jsonLd.description + ) ?? undefined, + author: + pickString( + getMetaContent(document, ["author", "article:author", "twitter:creator"]), + jsonLd.author + ) ?? undefined, + published: + pickString( + timeEl?.getAttribute("datetime"), + getMetaContent(document, ["article:published_time", "datePublished", "publishdate", "date"]), + jsonLd.published, + extractPublishedTime(document) + ) ?? 
undefined, + coverImage: + pickString( + getMetaContent(document, ["og:image", "twitter:image", "twitter:image:src"]), + jsonLd.coverImage + ) ?? undefined, + captured_at: capturedAt, }; +} - return { metadata, markdown: result.content || "" }; +function generateExcerpt(excerpt: string | null, textContent: string | null): string | null { + if (excerpt) return excerpt; + if (!textContent) return null; + const trimmed = textContent.trim(); + if (!trimmed) return null; + return trimmed.length > 200 ? `${trimmed.slice(0, 200)}...` : trimmed; +} + +function parseJsonLdItem(item: AnyRecord): ExtractionCandidate | null { + if (!isArticleType(item)) return null; + + const rawContent = + (typeof item.articleBody === "string" && item.articleBody) || + (typeof item.text === "string" && item.text) || + (typeof item.description === "string" && item.description) || + null; + + if (!rawContent) return null; + + const content = rawContent.trim(); + const htmlLike = /<\/?[a-z][\s\S]*>/i.test(content); + const textContent = htmlLike ? extractTextFromHtml(content) : content; + + if (textContent.length < MIN_CONTENT_LENGTH) return null; + + return { + title: pickString(item.headline, item.name), + byline: extractAuthorFromJsonLd(item.author), + excerpt: pickString(item.description), + published: pickString(item.datePublished, item.dateCreated), + html: htmlLike ? 
content : null, + textContent, + method: "json-ld", + }; +} + +function tryJsonLdExtraction(document: Document): ExtractionCandidate | null { + for (const item of parseJsonLdScripts(document)) { + const extracted = parseJsonLdItem(item); + if (extracted) return extracted; + } + return null; +} + +function getByPath(value: unknown, path: string): unknown { + let current = value; + for (const part of path.split(".")) { + if (!current || typeof current !== "object") return undefined; + current = (current as AnyRecord)[part]; + } + return current; +} + +function isContentBlockArray(value: unknown): value is AnyRecord[] { + if (!Array.isArray(value) || value.length === 0) return false; + return value.slice(0, 5).some((item) => { + if (!item || typeof item !== "object") return false; + const obj = item as AnyRecord; + return "type" in obj || "text" in obj || "textHtml" in obj || "content" in obj; + }); +} + +function extractTextFromContentBlocks(blocks: AnyRecord[]): string { + const parts: string[] = []; + + function pushParagraph(text: string): void { + const trimmed = text.trim(); + if (!trimmed) return; + parts.push(trimmed, "\n\n"); + } + + function walk(node: unknown): void { + if (!node || typeof node !== "object") return; + const block = node as AnyRecord; + + if (typeof block.text === "string") { + pushParagraph(block.text); + return; + } + + if (typeof block.textHtml === "string") { + pushParagraph(extractTextFromHtml(block.textHtml)); + return; + } + + if (Array.isArray(block.items)) { + for (const item of block.items) { + if (item && typeof item === "object") { + const text = pickString((item as AnyRecord).text); + if (text) parts.push(`- ${text}\n`); + } + } + parts.push("\n"); + } + + if (Array.isArray(block.components)) { + for (const component of block.components) { + walk(component); + } + } + + if (Array.isArray(block.content)) { + for (const child of block.content) { + walk(child); + } + } + } + + for (const block of blocks) { + walk(block); + } + + 
return parts.join("").replace(/\n{3,}/g, "\n\n").trim(); +} + +function tryStringBodyExtraction( + content: string, + meta: AnyRecord, + document: Document, + method: string +): ExtractionCandidate | null { + if (!content || content.length < MIN_CONTENT_LENGTH) return null; + + const isHtml = /<\/?[a-z][\s\S]*>/i.test(content); + const html = isHtml ? sanitizeHtml(content) : null; + const textContent = isHtml ? extractTextFromHtml(html) : content.trim(); + + if (textContent.length < MIN_CONTENT_LENGTH) return null; + + return { + title: pickString(meta.headline, meta.title, extractTitle(document)), + byline: pickString(meta.byline, meta.author), + excerpt: pickString(meta.description, meta.excerpt, generateExcerpt(null, textContent)), + published: pickString(meta.datePublished, meta.publishedAt, extractPublishedTime(document)), + html, + textContent, + method, + }; +} + +function tryNextDataExtraction(document: Document): ExtractionCandidate | null { + try { + const script = document.querySelector("script#__NEXT_DATA__"); + if (!script?.textContent) return null; + + const data = JSON.parse(script.textContent) as AnyRecord; + const pageProps = (getByPath(data, "props.pageProps") ?? {}) as AnyRecord; + + for (const path of NEXT_DATA_CONTENT_PATHS) { + const value = getByPath(data, path); + + if (typeof value === "string") { + const parentPath = path.split(".").slice(0, -1).join("."); + const parent = (getByPath(data, parentPath) ?? {}) as AnyRecord; + const meta = { + ...pageProps, + ...parent, + title: parent.title ?? 
(pageProps.title as string | undefined), + }; + + const candidate = tryStringBodyExtraction(value, meta, document, "next-data"); + if (candidate) return candidate; + } + + if (isContentBlockArray(value)) { + const textContent = extractTextFromContentBlocks(value); + if (textContent.length < MIN_CONTENT_LENGTH) continue; + + return { + title: pickString( + getByPath(data, "props.pageProps.content.headline"), + getByPath(data, "props.pageProps.article.headline"), + getByPath(data, "props.pageProps.article.title"), + getByPath(data, "props.pageProps.post.title"), + pageProps.title, + extractTitle(document) + ), + byline: pickString( + getByPath(data, "props.pageProps.author.name"), + getByPath(data, "props.pageProps.article.author.name") + ), + excerpt: pickString( + getByPath(data, "props.pageProps.content.description"), + getByPath(data, "props.pageProps.article.description"), + pageProps.description, + generateExcerpt(null, textContent) + ), + published: pickString( + getByPath(data, "props.pageProps.content.datePublished"), + getByPath(data, "props.pageProps.article.datePublished"), + getByPath(data, "props.pageProps.publishedAt"), + extractPublishedTime(document) + ), + html: null, + textContent, + method: "next-data", + }; + } + } + } catch { + return null; + } + + return null; +} + +function buildReadabilityCandidate( + article: ReturnType, + document: Document, + method: string +): ExtractionCandidate | null { + const textContent = article?.textContent?.trim() ?? ""; + if (textContent.length < MIN_CONTENT_LENGTH) return null; + + return { + title: pickString(article?.title, extractTitle(document)), + byline: pickString((article as { byline?: string } | null)?.byline), + excerpt: pickString(article?.excerpt, generateExcerpt(null, textContent)), + published: pickString((article as { publishedTime?: string } | null)?.publishedTime, extractPublishedTime(document)), + html: article?.content ? 
sanitizeHtml(article.content) : null, + textContent, + method, + }; +} + +function tryReadability(document: Document): ExtractionCandidate | null { + try { + const strictClone = document.cloneNode(true) as Document; + const strictResult = buildReadabilityCandidate( + new Readability(strictClone).parse(), + document, + "readability" + ); + if (strictResult) return strictResult; + + const relaxedClone = document.cloneNode(true) as Document; + return buildReadabilityCandidate( + new Readability(relaxedClone, { charThreshold: 120 }).parse(), + document, + "readability-relaxed" + ); + } catch { + return null; + } +} + +function trySelectorExtraction(document: Document): ExtractionCandidate | null { + for (const selector of CONTENT_SELECTORS) { + const element = document.querySelector(selector); + if (!element) continue; + + const clone = element.cloneNode(true) as Element; + for (const removeSelector of REMOVE_SELECTORS) { + for (const node of clone.querySelectorAll(removeSelector)) { + node.remove(); + } + } + + const html = sanitizeHtml(clone.innerHTML); + const textContent = extractTextFromHtml(html); + if (textContent.length < MIN_CONTENT_LENGTH) continue; + + return { + title: extractTitle(document), + byline: null, + excerpt: generateExcerpt(null, textContent), + published: extractPublishedTime(document), + html, + textContent, + method: `selector:${selector}`, + }; + } + + return null; +} + +function tryBodyExtraction(document: Document): ExtractionCandidate | null { + const body = document.body; + if (!body) return null; + + const clone = body.cloneNode(true) as Element; + for (const removeSelector of REMOVE_SELECTORS) { + for (const node of clone.querySelectorAll(removeSelector)) { + node.remove(); + } + } + + const html = sanitizeHtml(clone.innerHTML); + const textContent = extractTextFromHtml(html); + if (!textContent) return null; + + return { + title: extractTitle(document), + byline: null, + excerpt: generateExcerpt(null, textContent), + published: 
extractPublishedTime(document), + html, + textContent, + method: "body-fallback", + }; +} + +function pickBestCandidate(candidates: ExtractionCandidate[]): ExtractionCandidate | null { + if (candidates.length === 0) return null; + + const methodOrder = [ + "readability", + "readability-relaxed", + "next-data", + "json-ld", + "selector:", + "body-fallback", + ]; + + function methodRank(method: string): number { + const idx = methodOrder.findIndex((entry) => + entry.endsWith(":") ? method.startsWith(entry) : method === entry + ); + return idx === -1 ? methodOrder.length : idx; + } + + const ranked = [...candidates].sort((a, b) => { + const rankA = methodRank(a.method); + const rankB = methodRank(b.method); + if (rankA !== rankB) return rankA - rankB; + return (b.textContent.length ?? 0) - (a.textContent.length ?? 0); + }); + + for (const candidate of ranked) { + if (candidate.textContent.length >= GOOD_CONTENT_LENGTH) { + return candidate; + } + } + + for (const candidate of ranked) { + if (candidate.textContent.length >= MIN_CONTENT_LENGTH) { + return candidate; + } + } + + return ranked[0]; +} + +function extractFromHtml(html: string): ExtractionCandidate | null { + const document = parseDocument(html); + + const readabilityCandidate = tryReadability(document); + const nextDataCandidate = tryNextDataExtraction(document); + const jsonLdCandidate = tryJsonLdExtraction(document); + const selectorCandidate = trySelectorExtraction(document); + const bodyCandidate = tryBodyExtraction(document); + + const candidates = [ + readabilityCandidate, + nextDataCandidate, + jsonLdCandidate, + selectorCandidate, + bodyCandidate, + ].filter((candidate): candidate is ExtractionCandidate => Boolean(candidate)); + + const winner = pickBestCandidate(candidates); + if (!winner) return null; + + return { + ...winner, + title: winner.title ?? extractTitle(document), + published: winner.published ?? extractPublishedTime(document), + excerpt: winner.excerpt ?? 
generateExcerpt(null, winner.textContent), + }; +} + +const turndown = new TurndownService({ + headingStyle: "atx", + hr: "---", + bulletListMarker: "-", + codeBlockStyle: "fenced", + emDelimiter: "*", + strongDelimiter: "**", + linkStyle: "inlined", +}); + +turndown.use(gfm); +turndown.remove(["script", "style", "iframe", "noscript", "template", "svg", "path"]); + +turndown.addRule("collapseFigure", { + filter: "figure", + replacement(content) { + return `\n\n${content.trim()}\n\n`; + }, +}); + +turndown.addRule("dropInvisibleAnchors", { + filter(node) { + return node.nodeName === "A" && !(node as Element).textContent?.trim(); + }, + replacement() { + return ""; + }, +}); + +function convertHtmlToMarkdown(html: string): string { + if (!html || !html.trim()) return ""; + + try { + const sanitized = sanitizeHtml(html); + return turndown.turndown(sanitized); + } catch { + return ""; + } +} + +function fallbackPlainText(html: string): string { + const document = parseDocument(html); + for (const selector of ["script", "style", "noscript", "template", "iframe", "svg", "path"]) { + for (const el of document.querySelectorAll(selector)) { + el.remove(); + } + } + const text = document.body?.textContent ?? document.documentElement?.textContent ?? 
""; + return normalizeMarkdown(text.replace(/\s+/g, " ")); +} + +function countBylines(markdown: string): number { + return (markdown.match(/(^|\n)By\s+/g) || []).length; +} + +function countUsefulParagraphs(markdown: string): number { + const paragraphs = normalizeMarkdown(markdown).split(/\n{2,}/); + let count = 0; + + for (const paragraph of paragraphs) { + const trimmed = paragraph.trim(); + if (!trimmed) continue; + if (/^!?\[[^\]]*\]\([^)]+\)$/.test(trimmed)) continue; + if (/^#{1,6}\s+/.test(trimmed)) continue; + if ((trimmed.match(/\b[\p{L}\p{N}']+\b/gu) || []).length < 8) continue; + count++; + } + + return count; +} + +function countMarkerHits(markdown: string, markers: RegExp[]): number { + let hits = 0; + for (const marker of markers) { + if (marker.test(markdown)) hits++; + } + return hits; +} + +function scoreMarkdownQuality(markdown: string): number { + const normalized = normalizeMarkdown(markdown); + const wordCount = (normalized.match(/\b[\p{L}\p{N}']+\b/gu) || []).length; + const usefulParagraphs = countUsefulParagraphs(normalized); + const headingCount = (normalized.match(/^#{1,6}\s+/gm) || []).length; + const markerHits = countMarkerHits(normalized, LOW_QUALITY_MARKERS); + const bylineCount = countBylines(normalized); + const staffCount = (normalized.match(/\bForbes Staff\b/gi) || []).length; + + return ( + Math.min(wordCount, 4000) + + usefulParagraphs * 40 + + headingCount * 10 - + markerHits * 180 - + Math.max(0, bylineCount - 1) * 120 - + Math.max(0, staffCount - 1) * 80 + ); +} + +function shouldCompareWithLegacy(markdown: string): boolean { + const normalized = normalizeMarkdown(markdown); + return ( + countMarkerHits(normalized, LOW_QUALITY_MARKERS) > 0 || + countBylines(normalized) > 1 || + countUsefulParagraphs(normalized) < 6 + ); +} + +function isMarkdownUsable(markdown: string, html: string): boolean { + const normalized = normalizeMarkdown(markdown); + if (!normalized) return false; + + const htmlTextLength = 
extractTextFromHtml(html).length; + if (htmlTextLength < MIN_CONTENT_LENGTH) return true; + + if (normalized.length >= 80) return true; + return normalized.length >= Math.min(200, Math.floor(htmlTextLength * 0.2)); +} + +async function tryDefuddleConversion( + html: string, + url: string, + baseMetadata: PageMetadata +): Promise<{ ok: true; result: ConversionResult } | { ok: false; reason: string }> { + try { + const [{ JSDOM, VirtualConsole }, { Defuddle }] = await Promise.all([ + import("jsdom"), + import("defuddle/node"), + ]); + + const virtualConsole = new VirtualConsole(); + virtualConsole.on("jsdomError", (error: Error & { type?: string }) => { + if (error.type === "css parsing" || /Could not parse CSS stylesheet/i.test(error.message)) { + return; + } + console.warn(`[url-to-markdown] jsdom: ${error.message}`); + }); + + const dom = new JSDOM(html, { url, virtualConsole }); + const result = await Defuddle(dom, url, { markdown: true }); + const markdown = normalizeMarkdown(result.content || ""); + + if (!isMarkdownUsable(markdown, html)) { + return { ok: false, reason: "Defuddle returned empty or incomplete markdown" }; + } + + return { + ok: true, + result: { + metadata: { + ...baseMetadata, + title: pickString(result.title, baseMetadata.title) ?? "", + description: pickString(result.description, baseMetadata.description) ?? undefined, + author: pickString(result.author, baseMetadata.author) ?? undefined, + published: pickString(result.published, baseMetadata.published) ?? undefined, + coverImage: pickString(result.image, baseMetadata.coverImage) ?? undefined, + }, + markdown, + rawHtml: html, + conversionMethod: "defuddle", + }, + }; + } catch (error) { + return { + ok: false, + reason: error instanceof Error ? error.message : String(error), + }; + } +} + +function convertWithLegacyExtractor(html: string, baseMetadata: PageMetadata): ConversionResult { + const extracted = extractFromHtml(html); + + let markdown = extracted?.html ? 
convertHtmlToMarkdown(extracted.html) : ""; + if (!markdown.trim()) { + markdown = extracted?.textContent?.trim() || fallbackPlainText(html); + } + + return { + metadata: { + ...baseMetadata, + title: pickString(extracted?.title, baseMetadata.title) ?? "", + description: pickString(extracted?.excerpt, baseMetadata.description) ?? undefined, + author: pickString(extracted?.byline, baseMetadata.author) ?? undefined, + published: pickString(extracted?.published, baseMetadata.published) ?? undefined, + }, + markdown: normalizeMarkdown(markdown), + rawHtml: html, + conversionMethod: extracted ? `legacy:${extracted.method}` : "legacy:plain-text", + }; +} + +export async function extractContent(html: string, url: string): Promise { + const capturedAt = new Date().toISOString(); + const baseMetadata = extractMetadataFromHtml(html, url, capturedAt); + + const defuddleResult = await tryDefuddleConversion(html, url, baseMetadata); + if (defuddleResult.ok) { + if (shouldCompareWithLegacy(defuddleResult.result.markdown)) { + const legacyResult = convertWithLegacyExtractor(html, baseMetadata); + const legacyScore = scoreMarkdownQuality(legacyResult.markdown); + const defuddleScore = scoreMarkdownQuality(defuddleResult.result.markdown); + + if (legacyScore > defuddleScore + 120) { + return { + ...legacyResult, + fallbackReason: "Legacy extractor produced higher-quality markdown than Defuddle", + }; + } + } + + return defuddleResult.result; + } + + const fallbackResult = convertWithLegacyExtractor(html, baseMetadata); + return { + ...fallbackResult, + fallbackReason: defuddleResult.reason, + }; } function escapeYamlValue(value: string): string { diff --git a/skills/baoyu-url-to-markdown/scripts/main.ts b/skills/baoyu-url-to-markdown/scripts/main.ts index c48a6b9..e7b23b2 100644 --- a/skills/baoyu-url-to-markdown/scripts/main.ts +++ b/skills/baoyu-url-to-markdown/scripts/main.ts @@ -69,6 +69,12 @@ function formatTimestamp(): string { return `${now.getFullYear()}${pad(now.getMonth() 
+ 1)}${pad(now.getDate())}-${pad(now.getHours())}${pad(now.getMinutes())}${pad(now.getSeconds())}`; } +function deriveHtmlSnapshotPath(markdownPath: string): string { + const parsed = path.parse(markdownPath); + const basename = parsed.ext ? parsed.name : parsed.base; + return path.join(parsed.dir, `${basename}-captured.html`); +} + async function generateOutputPath(url: string, title: string, outputDir?: string): Promise { const domain = new URL(url).hostname.replace(/^www\./, ""); const slug = generateSlug(title, url); @@ -141,7 +147,7 @@ async function captureUrl(args: Args): Promise { async function main(): Promise { const args = parseArgs(process.argv); if (!args.url) { - console.error("Usage: bun main.ts [-o output.md] [--wait] [--timeout ms]"); + console.error("Usage: bun main.ts [-o output.md] [--output-dir dir] [--wait] [--timeout ms] [--download-media]"); process.exit(1); } @@ -166,7 +172,9 @@ async function main(): Promise { const result = await captureUrl(args); const outputPath = args.output || await generateOutputPath(args.url, result.metadata.title, args.outputDir); const outputDir = path.dirname(outputPath); + const htmlSnapshotPath = deriveHtmlSnapshotPath(outputPath); await mkdir(outputDir, { recursive: true }); + await writeFile(htmlSnapshotPath, result.rawHtml, "utf-8"); let document = createMarkdownDocument(result); @@ -189,7 +197,12 @@ async function main(): Promise { await writeFile(outputPath, document, "utf-8"); console.log(`Saved: ${outputPath}`); + console.log(`Saved HTML: ${htmlSnapshotPath}`); console.log(`Title: ${result.metadata.title || "(no title)"}`); + console.log(`Converter: ${result.conversionMethod}`); + if (result.fallbackReason) { + console.warn(`Fallback used: ${result.fallbackReason}`); + } } main().catch((err) => { diff --git a/skills/baoyu-url-to-markdown/scripts/package.json b/skills/baoyu-url-to-markdown/scripts/package.json new file mode 100644 index 0000000..90fc925 --- /dev/null +++ 
b/skills/baoyu-url-to-markdown/scripts/package.json @@ -0,0 +1,13 @@ +{ + "name": "baoyu-url-to-markdown-scripts", + "private": true, + "type": "module", + "dependencies": { + "@mozilla/readability": "^0.6.0", + "defuddle": "^0.10.0", + "jsdom": "^24.1.3", + "linkedom": "^0.18.12", + "turndown": "^7.2.2", + "turndown-plugin-gfm": "^1.0.2" + } +}