diff --git a/skills/baoyu-url-to-markdown/SKILL.md b/skills/baoyu-url-to-markdown/SKILL.md index 1664d33..0ede520 100644 --- a/skills/baoyu-url-to-markdown/SKILL.md +++ b/skills/baoyu-url-to-markdown/SKILL.md @@ -46,10 +46,45 @@ test -f "$HOME/.baoyu-skills/baoyu-url-to-markdown/EXTEND.md" && echo "user" ├───────────┼───────────────────────────────────────────────────────────────────────────┤ │ Found │ Read, parse, apply settings │ ├───────────┼───────────────────────────────────────────────────────────────────────────┤ -│ Not found │ Use defaults │ +│ Not found │ **MUST** run first-time setup (see below) — do NOT silently create defaults │ └───────────┴───────────────────────────────────────────────────────────────────────────┘ -**EXTEND.md Supports**: Default output directory | Default capture mode | Timeout settings +**EXTEND.md Supports**: Download media by default | Default output directory | Default capture mode | Timeout settings + +### First-Time Setup (BLOCKING) + +**CRITICAL**: When EXTEND.md is not found, you **MUST use `AskUserQuestion`** to ask the user for their preferences before creating EXTEND.md. **NEVER** create EXTEND.md with defaults without asking. This is a **BLOCKING** operation — do NOT proceed with any conversion until setup is complete. + +Use `AskUserQuestion` with ALL questions in ONE call: + +**Question 1** — header: "Media", question: "How to handle images and videos in pages?" +- "Ask each time (Recommended)" — After saving markdown, ask whether to download media +- "Always download" — Always download media to local imgs/ and videos/ directories +- "Never download" — Keep original remote URLs in markdown + +**Question 2** — header: "Output", question: "Default output directory?" +- "url-to-markdown (Recommended)" — Save to ./url-to-markdown/{domain}/{slug}.md +- (User may choose "Other" to type a custom path) + +**Question 3** — header: "Save", question: "Where to save preferences?" 
+- "User (Recommended)" — ~/.baoyu-skills/ (all projects) +- "Project" — .baoyu-skills/ (this project only) + +After user answers, create EXTEND.md at the chosen location, confirm "Preferences saved to [path]", then continue. + +Full reference: [references/config/first-time-setup.md](references/config/first-time-setup.md) + +### Supported Keys + +| Key | Default | Values | Description | +|-----|---------|--------|-------------| +| `download_media` | `ask` | `ask` / `1` / `0` | `ask` = prompt each time, `1` = always download, `0` = never | +| `default_output_dir` | empty | path or empty | Default output directory (empty = `./url-to-markdown/`) | + +**Value priority**: +1. CLI arguments (`--download-media`, `-o`) +2. EXTEND.md +3. Skill defaults ## Features @@ -57,6 +92,7 @@ test -f "$HOME/.baoyu-skills/baoyu-url-to-markdown/EXTEND.md" && echo "user" - Two capture modes: auto or wait-for-user - Clean markdown output with metadata - Handles login-required pages via wait mode +- Download images and videos to local directories ## Usage @@ -69,6 +105,9 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --wait # Save to specific file npx -y bun ${SKILL_DIR}/scripts/main.ts -o output.md + +# Download images and videos to local directories +npx -y bun ${SKILL_DIR}/scripts/main.ts --download-media ``` ## Options @@ -79,6 +118,7 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts -o output.md | `-o <path>` | Output file path (default: auto-generated) | | `--wait` | Wait for user signal before capturing | | `--timeout <ms>` | Page load timeout (default: 30000) | +| `--download-media` | Download image/video assets to local `imgs/` and `videos/`, and rewrite markdown links to local relative paths | ## Capture Modes @@ -105,6 +145,32 @@ url-to-markdown/<domain>/<slug>.md - `<slug>`: From page title or URL path (kebab-case, 2-6 words) - Conflict resolution: Append timestamp `-YYYYMMDD-HHMMSS.md` +When `--download-media` is enabled: +- Images are saved to `imgs/` next to the markdown file +- Videos are saved to `videos/` next 
to the markdown file +- Markdown media links are rewritten to local relative paths + +## Media Download Workflow + +Based on `download_media` setting in EXTEND.md: + +| Setting | Behavior | +|---------|----------| +| `1` (always) | Run script with `--download-media` flag | +| `0` (never) | Run script without `--download-media` flag | +| `ask` (default) | Follow the ask-each-time flow below | + +### Ask-Each-Time Flow + +1. Run script **without** `--download-media` → markdown saved +2. Check saved markdown for remote media URLs (`https://` in image/video links) +3. **If no remote media found** → done, no prompt needed +4. **If remote media found** → use `AskUserQuestion`: + - header: "Media", question: "Download N images/videos to local files?" + - "Yes" — Download to local directories + - "No" — Keep remote URLs +5. If user confirms → run script **again** with `--download-media` (overwrites markdown with localized links) + ## Environment Variables | Variable | Description | diff --git a/skills/baoyu-url-to-markdown/references/config/first-time-setup.md b/skills/baoyu-url-to-markdown/references/config/first-time-setup.md new file mode 100644 index 0000000..646bf8c --- /dev/null +++ b/skills/baoyu-url-to-markdown/references/config/first-time-setup.md @@ -0,0 +1,106 @@ +--- +name: first-time-setup +description: First-time setup flow for baoyu-url-to-markdown preferences +--- + +# First-Time Setup + +## Overview + +When no EXTEND.md is found, guide user through preference setup. + +**BLOCKING OPERATION**: This setup MUST complete before ANY other workflow steps. Do NOT: +- Start converting URLs +- Ask about URLs or output paths +- Proceed to any conversion + +ONLY ask the questions in this setup flow, save EXTEND.md, then continue. 
+ +## Setup Flow + +``` +No EXTEND.md found + | + v ++---------------------+ +| AskUserQuestion | +| (all questions) | ++---------------------+ + | + v ++---------------------+ +| Create EXTEND.md | ++---------------------+ + | + v + Continue conversion +``` + +## Questions + +**Language**: Use user's input language or saved language preference. + +Use AskUserQuestion with ALL questions in ONE call: + +### Question 1: Download Media + +```yaml +header: "Media" +question: "How to handle images and videos in pages?" +options: + - label: "Ask each time (Recommended)" + description: "After saving markdown, ask whether to download media" + - label: "Always download" + description: "Always download media to local imgs/ and videos/ directories" + - label: "Never download" + description: "Keep original remote URLs in markdown" +``` + +### Question 2: Default Output Directory + +```yaml +header: "Output" +question: "Default output directory?" +options: + - label: "url-to-markdown (Recommended)" + description: "Save to ./url-to-markdown/{domain}/{slug}.md" +``` + +Note: User will likely choose "Other" to type a custom path. + +### Question 3: Save Location + +```yaml +header: "Save" +question: "Where to save preferences?" +options: + - label: "User (Recommended)" + description: "~/.baoyu-skills/ (all projects)" + - label: "Project" + description: ".baoyu-skills/ (this project only)" +``` + +## Save Locations + +| Choice | Path | Scope | +|--------|------|-------| +| User | `~/.baoyu-skills/baoyu-url-to-markdown/EXTEND.md` | All projects | +| Project | `.baoyu-skills/baoyu-url-to-markdown/EXTEND.md` | Current project | + +## After Setup + +1. Create directory if needed +2. Write EXTEND.md +3. Confirm: "Preferences saved to [path]" +4. 
Continue with conversion using saved preferences + +## EXTEND.md Template + +```md +download_media: [ask/1/0] +default_output_dir: [path or empty] +``` + +## Modifying Preferences Later + +Users can edit EXTEND.md directly or delete it to trigger setup again. diff --git a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts index a684b23..ff2f6ed 100644 --- a/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts +++ b/skills/baoyu-url-to-markdown/scripts/html-to-markdown.ts @@ -7,6 +7,7 @@ export interface PageMetadata { description?: string; author?: string; published?: string; + coverImage?: string; captured_at: string; } @@ -39,6 +40,12 @@ export const absolutizeUrlsScript = String.raw` }).filter(Boolean).join(", ")); }); } + document.querySelectorAll("img[data-src], video[data-src], audio[data-src], source[data-src]").forEach(el => { + const ds = el.getAttribute("data-src"); + if (ds && (!el.getAttribute("src") || el.getAttribute("src") === "" || el.getAttribute("src")?.startsWith("data:"))) { + el.setAttribute("src", ds); + } + }); absAttr("a[href]", "href"); absAttr("img[src], video[src], audio[src], source[src]", "src"); absSrcset("img[srcset], source[srcset]"); @@ -56,6 +63,7 @@ export async function extractContent(html: string, url: string): Promise { const outputDir = path.dirname(outputPath); await mkdir(outputDir, { recursive: true }); - const document = createMarkdownDocument(result); + let document = createMarkdownDocument(result); + + if (args.downloadMedia) { + const mediaResult = await localizeMarkdownMedia(document, { + markdownPath: outputPath, + log: console.log, + }); + document = mediaResult.markdown; + if (mediaResult.downloadedImages > 0 || mediaResult.downloadedVideos > 0) { + console.log(`Downloaded: ${mediaResult.downloadedImages} images, ${mediaResult.downloadedVideos} videos`); + } + } else { + const { images, videos } = countRemoteMedia(document); + if (images > 0 || 
/**
 * Media localizer: finds remote image/video URLs in a markdown document,
 * downloads them next to the markdown file (`imgs/` and `videos/`), and
 * rewrites the markdown so it points at the local copies.
 */
import path from "node:path";
import { mkdir, writeFile } from "node:fs/promises";

/** Kind of asset that can be stored locally. */
type MediaKind = "image" | "video";

/** What the markdown syntax says about a link: image syntax => "image", otherwise "unknown". */
type MediaHint = "image" | "unknown";

type MarkdownLinkCandidate = {
  url: string;
  hint: MediaHint;
};

export type LocalizeMarkdownMediaOptions = {
  /** Path of the markdown file; media directories are created beside it. */
  markdownPath: string;
  /** Optional sink for progress/skip messages. Defaults to a no-op. */
  log?: (message: string) => void;
};

export type LocalizeMarkdownMediaResult = {
  /** Markdown with successfully downloaded URLs replaced by relative local paths. */
  markdown: string;
  downloadedImages: number;
  downloadedVideos: number;
  /** Directory holding downloaded images, or null when none were downloaded. */
  imageDir: string | null;
  /** Directory holding downloaded videos, or null when none were downloaded. */
  videoDir: string | null;
};

// Groups: 1 = label (with optional leading "!"), 2 = "<", 3 = http(s) URL, 4 = ">".
const MARKDOWN_LINK_RE = /(!?\[[^\]\n]*\])\((<)?(https?:\/\/[^)\s>]+)(>)?\)/g;
// Groups: 1 = `coverImage: "`, 2 = http(s) URL, 3 = closing quote.
const FRONTMATTER_COVER_RE = /^(coverImage:\s*")(https?:\/\/[^"]+)(")/m;
// Matches a captured label that ends in image syntax, e.g. the `[![alt]` captured
// for a linked image `[![alt](img)](href)`.
const IMAGE_LABEL_RE = /!\[[^\[\]\n]*\]$/;

const IMAGE_EXTENSIONS = new Set<string>([
  "jpg",
  "jpeg",
  "png",
  "webp",
  "gif",
  "bmp",
  "avif",
  "heic",
  "heif",
  "svg",
]);

const VIDEO_EXTENSIONS = new Set<string>(["mp4", "m4v", "mov", "webm", "mkv"]);

const MIME_EXTENSION_MAP: Record<string, string> = {
  "image/jpeg": "jpg",
  "image/jpg": "jpg",
  "image/png": "png",
  "image/webp": "webp",
  "image/gif": "gif",
  "image/bmp": "bmp",
  "image/avif": "avif",
  "image/heic": "heic",
  "image/heif": "heif",
  "image/svg+xml": "svg",
  "video/mp4": "mp4",
  "video/webm": "webm",
  "video/quicktime": "mov",
  "video/x-m4v": "m4v",
};

const DOWNLOAD_USER_AGENT =
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36";

/** Returns the lowercase media type without parameters (`"image/png; q=1"` -> `"image/png"`), or `""`. */
function normalizeContentType(raw: string | null): string {
  return raw?.split(";")[0]?.trim().toLowerCase() ?? "";
}

/**
 * Normalizes a file extension: drops one leading dot, trims, lowercases and
 * folds `jpeg` into `jpg`. Returns `undefined` for empty/missing input.
 */
function normalizeExtension(raw: string | undefined | null): string | undefined {
  if (!raw) return undefined;
  const trimmed = raw.replace(/^\./, "").trim().toLowerCase();
  if (!trimmed) return undefined;
  return trimmed === "jpeg" ? "jpg" : trimmed;
}

/**
 * Extracts an extension from the URL path, falling back to a `format` query
 * parameter. Returns `undefined` when neither exists or the URL is invalid.
 */
function resolveExtensionFromUrl(rawUrl: string): string | undefined {
  try {
    const parsed = new URL(rawUrl);
    return (
      normalizeExtension(path.posix.extname(parsed.pathname)) ??
      normalizeExtension(parsed.searchParams.get("format"))
    );
  } catch {
    return undefined;
  }
}

/** Maps an `image/*` or `video/*` content type to its media kind. */
function resolveKindFromContentType(contentType: string): MediaKind | undefined {
  if (!contentType) return undefined;
  if (contentType.startsWith("image/")) return "image";
  if (contentType.startsWith("video/")) return "video";
  return undefined;
}

/** Maps a normalized extension to its media kind using the known extension sets. */
function resolveKindFromExtension(ext: string | undefined): MediaKind | undefined {
  if (!ext) return undefined;
  if (IMAGE_EXTENSIONS.has(ext)) return "image";
  if (VIDEO_EXTENSIONS.has(ext)) return "video";
  return undefined;
}

/**
 * Decides whether a fetched resource is an image, a video, or neither.
 * Precedence: content type, then URL extension, then the markdown hint. The
 * hint is only trusted when the server sent no content type or
 * `application/octet-stream`.
 */
function resolveMediaKind(
  _rawUrl: string,
  contentType: string,
  extension: string | undefined,
  hint: MediaHint
): MediaKind | undefined {
  const kindFromType = resolveKindFromContentType(contentType);
  if (kindFromType) return kindFromType;

  const kindFromExtension = resolveKindFromExtension(extension);
  if (kindFromExtension) return kindFromExtension;

  // A concrete non-media content type (e.g. text/html) overrides the hint.
  if (contentType && contentType !== "application/octet-stream") {
    return undefined;
  }

  return hint === "image" ? "image" : undefined;
}

/** Picks the extension for the saved file: MIME map first, then URL extension, then a per-kind default. */
function resolveOutputExtension(
  contentType: string,
  extension: string | undefined,
  kind: MediaKind
): string {
  const extFromMime = normalizeExtension(MIME_EXTENSION_MAP[contentType]);
  if (extFromMime) return extFromMime;

  const normalizedExt = normalizeExtension(extension);
  if (normalizedExt) return normalizedExt;

  return kind === "video" ? "mp4" : "jpg";
}

/** `decodeURIComponent` that returns the input unchanged on malformed escapes. */
function safeDecodeURIComponent(value: string): string {
  try {
    return decodeURIComponent(value);
  } catch {
    return value;
  }
}

/** Reduces arbitrary text to at most 48 characters of `[a-zA-Z0-9_-]`. */
function sanitizeFileSegment(input: string): string {
  return input
    .replace(/[^a-zA-Z0-9_-]+/g, "-")
    .replace(/-+/g, "-")
    .replace(/^[-_]+|[-_]+$/g, "")
    .slice(0, 48);
}

/**
 * Derives a sanitized file stem from the last path segment of `rawUrl`.
 * A trailing extension is removed when it normalizes to the same value as
 * `extension`, so `photo.jpeg` with output extension `jpg` yields `photo`.
 * Returns `""` when the URL is invalid or has no path segment.
 */
function resolveFileStem(rawUrl: string, extension: string): string {
  try {
    const parsed = new URL(rawUrl);
    const base = path.posix.basename(parsed.pathname);
    if (!base) return "";
    const decodedBase = safeDecodeURIComponent(base);
    const normalizedExt = normalizeExtension(extension);
    const dotIndex = decodedBase.lastIndexOf(".");
    // Plain string comparison instead of a RegExp built from the extension:
    // no escaping pitfalls, and aliases (jpeg/jpg, case) are handled by normalization.
    const hasMatchingExt =
      normalizedExt !== undefined &&
      dotIndex >= 0 &&
      normalizeExtension(decodedBase.slice(dotIndex + 1)) === normalizedExt;
    const rawStem = hasMatchingExt ? decodedBase.slice(0, dotIndex) : decodedBase;
    return sanitizeFileSegment(rawStem);
  } catch {
    return "";
  }
}

/** Builds `img-001-<stem>.<ext>` / `video-001-<stem>.<ext>`; the stem part is omitted when empty. */
function buildFileName(kind: MediaKind, index: number, sourceUrl: string, extension: string): string {
  const stem = resolveFileStem(sourceUrl, extension);
  const prefix = kind === "image" ? "img" : "video";
  const serial = String(index).padStart(3, "0");
  const suffix = stem ? `-${stem}` : "";
  return `${prefix}-${serial}${suffix}.${extension}`;
}

/** Returns "image" when the captured label uses image syntax, including a linked image's `[![alt]`. */
function resolveLinkHint(label: string): MediaHint {
  return label.startsWith("![") || IMAGE_LABEL_RE.test(label) ? "image" : "unknown";
}

/**
 * Collects unique http(s) URLs from the front matter `coverImage` entry and
 * from every markdown link/image, in document order.
 */
function collectMarkdownLinkCandidates(markdown: string): MarkdownLinkCandidate[] {
  const candidates: MarkdownLinkCandidate[] = [];
  const seen = new Set<string>();

  const fmMatch = markdown.match(/^---\n([\s\S]*?)\n---/);
  if (fmMatch) {
    const coverMatch = fmMatch[1]?.match(FRONTMATTER_COVER_RE);
    if (coverMatch?.[2] && !seen.has(coverMatch[2])) {
      seen.add(coverMatch[2]);
      candidates.push({ url: coverMatch[2], hint: "image" });
    }
  }

  // The regex is global and module-level, so reset its cursor before scanning.
  MARKDOWN_LINK_RE.lastIndex = 0;
  let match: RegExpExecArray | null;
  while ((match = MARKDOWN_LINK_RE.exec(markdown))) {
    const label = match[1] ?? "";
    const rawUrl = match[3] ?? "";
    if (!rawUrl || seen.has(rawUrl)) continue;
    seen.add(rawUrl);
    candidates.push({ url: rawUrl, hint: resolveLinkHint(label) });
  }

  return candidates;
}

/**
 * Replaces every link target (and the front matter cover image) found in
 * `replacements` with its mapped local path. Other links are left untouched.
 */
function rewriteMarkdownMediaLinks(markdown: string, replacements: Map<string, string>): string {
  if (replacements.size === 0) return markdown;
  MARKDOWN_LINK_RE.lastIndex = 0;

  const withLinks = markdown.replace(
    MARKDOWN_LINK_RE,
    (full: string, label: string, _openAngle: string | undefined, rawUrl: string) => {
      const localPath = replacements.get(rawUrl);
      return localPath ? `${label}(${localPath})` : full;
    }
  );

  return withLinks.replace(
    FRONTMATTER_COVER_RE,
    (full: string, prefix: string, rawUrl: string, suffix: string) => {
      const localPath = replacements.get(rawUrl);
      return localPath ? `${prefix}${localPath}${suffix}` : full;
    }
  );
}

/** Best-effort cancellation of a response body that will not be read. */
async function discardBody(response: { body: { cancel(): Promise<void> } | null }): Promise<void> {
  try {
    await response.body?.cancel();
  } catch {
    // Nothing useful to do: the response is being skipped anyway.
  }
}

/**
 * Downloads the remote images/videos referenced by `markdown` into `imgs/`
 * and `videos/` beside `options.markdownPath` and returns the markdown with
 * those references rewritten to relative paths.
 *
 * Candidates are fetched one at a time. A failed or non-media candidate is
 * skipped (failures and non-2xx statuses are reported through `options.log`)
 * and its original URL stays in the markdown; this function does not reject
 * because of an individual download.
 */
export async function localizeMarkdownMedia(
  markdown: string,
  options: LocalizeMarkdownMediaOptions
): Promise<LocalizeMarkdownMediaResult> {
  const log = options.log ?? (() => {});
  const markdownDir = path.dirname(options.markdownPath);
  const candidates = collectMarkdownLinkCandidates(markdown);

  if (candidates.length === 0) {
    return {
      markdown,
      downloadedImages: 0,
      downloadedVideos: 0,
      imageDir: null,
      videoDir: null,
    };
  }

  const replacements = new Map<string, string>();
  let downloadedImages = 0;
  let downloadedVideos = 0;

  for (const candidate of candidates) {
    try {
      const response = await fetch(candidate.url, {
        method: "GET",
        redirect: "follow",
        headers: {
          "user-agent": DOWNLOAD_USER_AGENT,
        },
      });

      if (!response.ok) {
        log(`[url-to-markdown] Skip media (${response.status}): ${candidate.url}`);
        await discardBody(response);
        continue;
      }

      const sourceUrl = response.url || candidate.url;
      const contentType = normalizeContentType(response.headers.get("content-type"));
      const extension = resolveExtensionFromUrl(sourceUrl) ?? resolveExtensionFromUrl(candidate.url);
      const kind = resolveMediaKind(sourceUrl, contentType, extension, candidate.hint);
      if (!kind) {
        // Ordinary hyperlinks end up here; drop the body instead of leaving it pending.
        await discardBody(response);
        continue;
      }

      const outputExtension = resolveOutputExtension(contentType, extension, kind);
      const nextIndex = kind === "image" ? downloadedImages + 1 : downloadedVideos + 1;
      const dirName = kind === "image" ? "imgs" : "videos";
      const targetDir = path.join(markdownDir, dirName);
      await mkdir(targetDir, { recursive: true });

      const fileName = buildFileName(kind, nextIndex, sourceUrl, outputExtension);
      const absolutePath = path.join(targetDir, fileName);
      // Markdown paths always use forward slashes, regardless of platform.
      const relativePath = path.posix.join(dirName, fileName);
      const bytes = Buffer.from(await response.arrayBuffer());
      await writeFile(absolutePath, bytes);
      replacements.set(candidate.url, relativePath);

      // Counters advance only after the file is written, so serials have no gaps.
      if (kind === "image") {
        downloadedImages = nextIndex;
      } else {
        downloadedVideos = nextIndex;
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error ?? "");
      log(`[url-to-markdown] Failed to download media ${candidate.url}: ${message}`);
    }
  }

  return {
    markdown: rewriteMarkdownMediaLinks(markdown, replacements),
    downloadedImages,
    downloadedVideos,
    imageDir: downloadedImages > 0 ? path.join(markdownDir, "imgs") : null,
    videoDir: downloadedVideos > 0 ? path.join(markdownDir, "videos") : null,
  };
}

/**
 * Counts remote media references without any network access. A URL counts
 * as a video when its extension is a known video extension, and as an image
 * when its extension is a known image extension or the markdown uses image
 * syntax for it. `hasCoverImage` reports a remote `coverImage` in the front matter.
 */
export function countRemoteMedia(markdown: string): { images: number; videos: number; hasCoverImage: boolean } {
  const fmMatch = markdown.match(/^---\n([\s\S]*?)\n---/);
  const hasCoverImage = !!(fmMatch?.[1]?.match(FRONTMATTER_COVER_RE)?.[2]);
  let images = 0;
  let videos = 0;
  for (const candidate of collectMarkdownLinkCandidates(markdown)) {
    const kind = resolveKindFromExtension(resolveExtensionFromUrl(candidate.url));
    if (kind === "video") {
      videos++;
    } else if (kind === "image" || candidate.hint === "image") {
      images++;
    }
  }
  return { images, videos, hasCoverImage };
}