feat(baoyu-url-to-markdown): add media download and cover image extraction
- Add --download-media flag to download images/videos to local dirs and rewrite markdown links - Extract coverImage from page meta (og:image) into YAML front matter - Handle data-src lazy loading for WeChat and similar sites - Add EXTEND.md preferences with first-time setup for download_media setting - Add media-localizer module adapted from x-to-markdown
This commit is contained in:
parent
bd4de7b995
commit
210905ef66
|
|
@ -46,10 +46,45 @@ test -f "$HOME/.baoyu-skills/baoyu-url-to-markdown/EXTEND.md" && echo "user"
|
||||||
├───────────┼───────────────────────────────────────────────────────────────────────────┤
|
├───────────┼───────────────────────────────────────────────────────────────────────────┤
|
||||||
│ Found │ Read, parse, apply settings │
|
│ Found │ Read, parse, apply settings │
|
||||||
├───────────┼───────────────────────────────────────────────────────────────────────────┤
|
├───────────┼───────────────────────────────────────────────────────────────────────────┤
|
||||||
│ Not found │ Use defaults │
|
│ Not found │ **MUST** run first-time setup (see below) — do NOT silently create defaults │
|
||||||
└───────────┴───────────────────────────────────────────────────────────────────────────┘
|
└───────────┴───────────────────────────────────────────────────────────────────────────┘
|
||||||
|
|
||||||
**EXTEND.md Supports**: Default output directory | Default capture mode | Timeout settings
|
**EXTEND.md Supports**: Download media by default | Default output directory | Default capture mode | Timeout settings
|
||||||
|
|
||||||
|
### First-Time Setup (BLOCKING)
|
||||||
|
|
||||||
|
**CRITICAL**: When EXTEND.md is not found, you **MUST use `AskUserQuestion`** to ask the user for their preferences before creating EXTEND.md. **NEVER** create EXTEND.md with defaults without asking. This is a **BLOCKING** operation — do NOT proceed with any conversion until setup is complete.
|
||||||
|
|
||||||
|
Use `AskUserQuestion` with ALL questions in ONE call:
|
||||||
|
|
||||||
|
**Question 1** — header: "Media", question: "How to handle images and videos in pages?"
|
||||||
|
- "Ask each time (Recommended)" — After saving markdown, ask whether to download media
|
||||||
|
- "Always download" — Always download media to local imgs/ and videos/ directories
|
||||||
|
- "Never download" — Keep original remote URLs in markdown
|
||||||
|
|
||||||
|
**Question 2** — header: "Output", question: "Default output directory?"
|
||||||
|
- "url-to-markdown (Recommended)" — Save to ./url-to-markdown/{domain}/{slug}.md
|
||||||
|
- (User may choose "Other" to type a custom path)
|
||||||
|
|
||||||
|
**Question 3** — header: "Save", question: "Where to save preferences?"
|
||||||
|
- "User (Recommended)" — ~/.baoyu-skills/ (all projects)
|
||||||
|
- "Project" — .baoyu-skills/ (this project only)
|
||||||
|
|
||||||
|
After user answers, create EXTEND.md at the chosen location, confirm "Preferences saved to [path]", then continue.
|
||||||
|
|
||||||
|
Full reference: [references/config/first-time-setup.md](references/config/first-time-setup.md)
|
||||||
|
|
||||||
|
### Supported Keys
|
||||||
|
|
||||||
|
| Key | Default | Values | Description |
|
||||||
|
|-----|---------|--------|-------------|
|
||||||
|
| `download_media` | `ask` | `ask` / `1` / `0` | `ask` = prompt each time, `1` = always download, `0` = never |
|
||||||
|
| `default_output_dir` | empty | path or empty | Default output directory (empty = `./url-to-markdown/`) |
|
||||||
|
|
||||||
|
**Value priority**:
|
||||||
|
1. CLI arguments (`--download-media`, `-o`)
|
||||||
|
2. EXTEND.md
|
||||||
|
3. Skill defaults
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
|
|
@ -57,6 +92,7 @@ test -f "$HOME/.baoyu-skills/baoyu-url-to-markdown/EXTEND.md" && echo "user"
|
||||||
- Two capture modes: auto or wait-for-user
|
- Two capture modes: auto or wait-for-user
|
||||||
- Clean markdown output with metadata
|
- Clean markdown output with metadata
|
||||||
- Handles login-required pages via wait mode
|
- Handles login-required pages via wait mode
|
||||||
|
- Download images and videos to local directories
|
||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
|
|
@ -69,6 +105,9 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts <url> --wait
|
||||||
|
|
||||||
# Save to specific file
|
# Save to specific file
|
||||||
npx -y bun ${SKILL_DIR}/scripts/main.ts <url> -o output.md
|
npx -y bun ${SKILL_DIR}/scripts/main.ts <url> -o output.md
|
||||||
|
|
||||||
|
# Download images and videos to local directories
|
||||||
|
npx -y bun ${SKILL_DIR}/scripts/main.ts <url> --download-media
|
||||||
```
|
```
|
||||||
|
|
||||||
## Options
|
## Options
|
||||||
|
|
@ -79,6 +118,7 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts <url> -o output.md
|
||||||
| `-o <path>` | Output file path (default: auto-generated) |
|
| `-o <path>` | Output file path (default: auto-generated) |
|
||||||
| `--wait` | Wait for user signal before capturing |
|
| `--wait` | Wait for user signal before capturing |
|
||||||
| `--timeout <ms>` | Page load timeout (default: 30000) |
|
| `--timeout <ms>` | Page load timeout (default: 30000) |
|
||||||
|
| `--download-media` | Download image/video assets to local `imgs/` and `videos/`, and rewrite markdown links to local relative paths |
|
||||||
|
|
||||||
## Capture Modes
|
## Capture Modes
|
||||||
|
|
||||||
|
|
@ -105,6 +145,32 @@ url-to-markdown/<domain>/<slug>.md
|
||||||
- `<slug>`: From page title or URL path (kebab-case, 2-6 words)
|
- `<slug>`: From page title or URL path (kebab-case, 2-6 words)
|
||||||
- Conflict resolution: Append timestamp `<slug>-YYYYMMDD-HHMMSS.md`
|
- Conflict resolution: Append timestamp `<slug>-YYYYMMDD-HHMMSS.md`
|
||||||
|
|
||||||
|
When `--download-media` is enabled:
|
||||||
|
- Images are saved to `imgs/` next to the markdown file
|
||||||
|
- Videos are saved to `videos/` next to the markdown file
|
||||||
|
- Markdown media links are rewritten to local relative paths
|
||||||
|
|
||||||
|
## Media Download Workflow
|
||||||
|
|
||||||
|
Based on `download_media` setting in EXTEND.md:
|
||||||
|
|
||||||
|
| Setting | Behavior |
|
||||||
|
|---------|----------|
|
||||||
|
| `1` (always) | Run script with `--download-media` flag |
|
||||||
|
| `0` (never) | Run script without `--download-media` flag |
|
||||||
|
| `ask` (default) | Follow the ask-each-time flow below |
|
||||||
|
|
||||||
|
### Ask-Each-Time Flow
|
||||||
|
|
||||||
|
1. Run script **without** `--download-media` → markdown saved
|
||||||
|
2. Check saved markdown for remote media URLs (`http://` or `https://` targets in image/video links, plus a remote `coverImage` in the front matter)
|
||||||
|
3. **If no remote media found** → done, no prompt needed
|
||||||
|
4. **If remote media found** → use `AskUserQuestion`:
|
||||||
|
- header: "Media", question: "Download N images/videos to local files?"
|
||||||
|
- "Yes" — Download to local directories
|
||||||
|
- "No" — Keep remote URLs
|
||||||
|
5. If user confirms → run script **again** with `--download-media` (overwrites markdown with localized links)
|
||||||
|
|
||||||
## Environment Variables
|
## Environment Variables
|
||||||
|
|
||||||
| Variable | Description |
|
| Variable | Description |
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,106 @@
|
||||||
|
---
|
||||||
|
name: first-time-setup
|
||||||
|
description: First-time setup flow for baoyu-url-to-markdown preferences
|
||||||
|
---
|
||||||
|
|
||||||
|
# First-Time Setup
|
||||||
|
|
||||||
|
## Overview
|
||||||
|
|
||||||
|
When no EXTEND.md is found, guide user through preference setup.
|
||||||
|
|
||||||
|
**BLOCKING OPERATION**: This setup MUST complete before ANY other workflow steps. Do NOT:
|
||||||
|
- Start converting URLs
|
||||||
|
- Ask about URLs or output paths
|
||||||
|
- Proceed to any conversion
|
||||||
|
|
||||||
|
ONLY ask the questions in this setup flow, save EXTEND.md, then continue.
|
||||||
|
|
||||||
|
## Setup Flow
|
||||||
|
|
||||||
|
```
|
||||||
|
No EXTEND.md found
|
||||||
|
|
|
||||||
|
v
|
||||||
|
+---------------------+
|
||||||
|
| AskUserQuestion |
|
||||||
|
| (all questions) |
|
||||||
|
+---------------------+
|
||||||
|
|
|
||||||
|
v
|
||||||
|
+---------------------+
|
||||||
|
| Create EXTEND.md |
|
||||||
|
+---------------------+
|
||||||
|
|
|
||||||
|
v
|
||||||
|
Continue conversion
|
||||||
|
```
|
||||||
|
|
||||||
|
## Questions
|
||||||
|
|
||||||
|
**Language**: Use user's input language or saved language preference.
|
||||||
|
|
||||||
|
Use AskUserQuestion with ALL questions in ONE call:
|
||||||
|
|
||||||
|
### Question 1: Download Media
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
header: "Media"
|
||||||
|
question: "How to handle images and videos in pages?"
|
||||||
|
options:
|
||||||
|
- label: "Ask each time (Recommended)"
|
||||||
|
description: "After saving markdown, ask whether to download media"
|
||||||
|
- label: "Always download"
|
||||||
|
description: "Always download media to local imgs/ and videos/ directories"
|
||||||
|
- label: "Never download"
|
||||||
|
description: "Keep original remote URLs in markdown"
|
||||||
|
```
|
||||||
|
|
||||||
|
### Question 2: Default Output Directory
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
header: "Output"
|
||||||
|
question: "Default output directory?"
|
||||||
|
options:
|
||||||
|
- label: "url-to-markdown (Recommended)"
|
||||||
|
description: "Save to ./url-to-markdown/{domain}/{slug}.md"
|
||||||
|
```
|
||||||
|
|
||||||
|
Note: The user may choose "Other" to type a custom path instead of the recommended default.
|
||||||
|
|
||||||
|
### Question 3: Save Location
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
header: "Save"
|
||||||
|
question: "Where to save preferences?"
|
||||||
|
options:
|
||||||
|
- label: "User (Recommended)"
|
||||||
|
description: "~/.baoyu-skills/ (all projects)"
|
||||||
|
- label: "Project"
|
||||||
|
description: ".baoyu-skills/ (this project only)"
|
||||||
|
```
|
||||||
|
|
||||||
|
## Save Locations
|
||||||
|
|
||||||
|
| Choice | Path | Scope |
|
||||||
|
|--------|------|-------|
|
||||||
|
| User | `~/.baoyu-skills/baoyu-url-to-markdown/EXTEND.md` | All projects |
|
||||||
|
| Project | `.baoyu-skills/baoyu-url-to-markdown/EXTEND.md` | Current project |
|
||||||
|
|
||||||
|
## After Setup
|
||||||
|
|
||||||
|
1. Create directory if needed
|
||||||
|
2. Write EXTEND.md
|
||||||
|
3. Confirm: "Preferences saved to [path]"
|
||||||
|
4. Continue with conversion using saved preferences
|
||||||
|
|
||||||
|
## EXTEND.md Template
|
||||||
|
|
||||||
|
```md
|
||||||
|
download_media: [ask/1/0]
|
||||||
|
default_output_dir: [path or empty]
|
||||||
|
```
|
||||||
|
|
||||||
|
## Modifying Preferences Later
|
||||||
|
|
||||||
|
Users can edit EXTEND.md directly or delete it to trigger setup again.
|
||||||
|
|
@ -7,6 +7,7 @@ export interface PageMetadata {
|
||||||
description?: string;
|
description?: string;
|
||||||
author?: string;
|
author?: string;
|
||||||
published?: string;
|
published?: string;
|
||||||
|
coverImage?: string;
|
||||||
captured_at: string;
|
captured_at: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -39,6 +40,12 @@ export const absolutizeUrlsScript = String.raw`
|
||||||
}).filter(Boolean).join(", "));
|
}).filter(Boolean).join(", "));
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
document.querySelectorAll("img[data-src], video[data-src], audio[data-src], source[data-src]").forEach(el => {
|
||||||
|
const ds = el.getAttribute("data-src");
|
||||||
|
if (ds && (!el.getAttribute("src") || el.getAttribute("src") === "" || el.getAttribute("src")?.startsWith("data:"))) {
|
||||||
|
el.setAttribute("src", ds);
|
||||||
|
}
|
||||||
|
});
|
||||||
absAttr("a[href]", "href");
|
absAttr("a[href]", "href");
|
||||||
absAttr("img[src], video[src], audio[src], source[src]", "src");
|
absAttr("img[src], video[src], audio[src], source[src]", "src");
|
||||||
absSrcset("img[srcset], source[srcset]");
|
absSrcset("img[srcset], source[srcset]");
|
||||||
|
|
@ -56,6 +63,7 @@ export async function extractContent(html: string, url: string): Promise<Convers
|
||||||
description: result.description || undefined,
|
description: result.description || undefined,
|
||||||
author: result.author || undefined,
|
author: result.author || undefined,
|
||||||
published: result.published || undefined,
|
published: result.published || undefined,
|
||||||
|
coverImage: result.image || undefined,
|
||||||
captured_at: new Date().toISOString(),
|
captured_at: new Date().toISOString(),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
@ -73,6 +81,7 @@ export function formatMetadataYaml(meta: PageMetadata): string {
|
||||||
if (meta.description) lines.push(`description: "${escapeYamlValue(meta.description)}"`);
|
if (meta.description) lines.push(`description: "${escapeYamlValue(meta.description)}"`);
|
||||||
if (meta.author) lines.push(`author: "${escapeYamlValue(meta.author)}"`);
|
if (meta.author) lines.push(`author: "${escapeYamlValue(meta.author)}"`);
|
||||||
if (meta.published) lines.push(`published: "${escapeYamlValue(meta.published)}"`);
|
if (meta.published) lines.push(`published: "${escapeYamlValue(meta.published)}"`);
|
||||||
|
if (meta.coverImage) lines.push(`coverImage: "${escapeYamlValue(meta.coverImage)}"`);
|
||||||
lines.push(`captured_at: "${escapeYamlValue(meta.captured_at)}"`);
|
lines.push(`captured_at: "${escapeYamlValue(meta.captured_at)}"`);
|
||||||
lines.push("---");
|
lines.push("---");
|
||||||
return lines.join("\n");
|
return lines.join("\n");
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ import process from "node:process";
|
||||||
|
|
||||||
import { CdpConnection, getFreePort, launchChrome, waitForChromeDebugPort, waitForNetworkIdle, waitForPageLoad, autoScroll, evaluateScript, killChrome } from "./cdp.js";
|
import { CdpConnection, getFreePort, launchChrome, waitForChromeDebugPort, waitForNetworkIdle, waitForPageLoad, autoScroll, evaluateScript, killChrome } from "./cdp.js";
|
||||||
import { absolutizeUrlsScript, extractContent, createMarkdownDocument, type ConversionResult } from "./html-to-markdown.js";
|
import { absolutizeUrlsScript, extractContent, createMarkdownDocument, type ConversionResult } from "./html-to-markdown.js";
|
||||||
|
import { localizeMarkdownMedia, countRemoteMedia } from "./media-localizer.js";
|
||||||
import { resolveUrlToMarkdownDataDir } from "./paths.js";
|
import { resolveUrlToMarkdownDataDir } from "./paths.js";
|
||||||
import { DEFAULT_TIMEOUT_MS, CDP_CONNECT_TIMEOUT_MS, NETWORK_IDLE_TIMEOUT_MS, POST_LOAD_DELAY_MS, SCROLL_STEP_WAIT_MS, SCROLL_MAX_STEPS } from "./constants.js";
|
import { DEFAULT_TIMEOUT_MS, CDP_CONNECT_TIMEOUT_MS, NETWORK_IDLE_TIMEOUT_MS, POST_LOAD_DELAY_MS, SCROLL_STEP_WAIT_MS, SCROLL_MAX_STEPS } from "./constants.js";
|
||||||
|
|
||||||
|
|
@ -26,10 +27,11 @@ interface Args {
|
||||||
output?: string;
|
output?: string;
|
||||||
wait: boolean;
|
wait: boolean;
|
||||||
timeout: number;
|
timeout: number;
|
||||||
|
downloadMedia: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
function parseArgs(argv: string[]): Args {
|
function parseArgs(argv: string[]): Args {
|
||||||
const args: Args = { url: "", wait: false, timeout: DEFAULT_TIMEOUT_MS };
|
const args: Args = { url: "", wait: false, timeout: DEFAULT_TIMEOUT_MS, downloadMedia: false };
|
||||||
for (let i = 2; i < argv.length; i++) {
|
for (let i = 2; i < argv.length; i++) {
|
||||||
const arg = argv[i];
|
const arg = argv[i];
|
||||||
if (arg === "--wait" || arg === "-w") {
|
if (arg === "--wait" || arg === "-w") {
|
||||||
|
|
@ -38,6 +40,8 @@ function parseArgs(argv: string[]): Args {
|
||||||
args.output = argv[++i];
|
args.output = argv[++i];
|
||||||
} else if (arg === "--timeout" || arg === "-t") {
|
} else if (arg === "--timeout" || arg === "-t") {
|
||||||
args.timeout = parseInt(argv[++i], 10) || DEFAULT_TIMEOUT_MS;
|
args.timeout = parseInt(argv[++i], 10) || DEFAULT_TIMEOUT_MS;
|
||||||
|
} else if (arg === "--download-media") {
|
||||||
|
args.downloadMedia = true;
|
||||||
} else if (!arg.startsWith("-") && !args.url) {
|
} else if (!arg.startsWith("-") && !args.url) {
|
||||||
args.url = arg;
|
args.url = arg;
|
||||||
}
|
}
|
||||||
|
|
@ -153,7 +157,24 @@ async function main(): Promise<void> {
|
||||||
const outputDir = path.dirname(outputPath);
|
const outputDir = path.dirname(outputPath);
|
||||||
await mkdir(outputDir, { recursive: true });
|
await mkdir(outputDir, { recursive: true });
|
||||||
|
|
||||||
const document = createMarkdownDocument(result);
|
let document = createMarkdownDocument(result);
|
||||||
|
|
||||||
|
if (args.downloadMedia) {
|
||||||
|
const mediaResult = await localizeMarkdownMedia(document, {
|
||||||
|
markdownPath: outputPath,
|
||||||
|
log: console.log,
|
||||||
|
});
|
||||||
|
document = mediaResult.markdown;
|
||||||
|
if (mediaResult.downloadedImages > 0 || mediaResult.downloadedVideos > 0) {
|
||||||
|
console.log(`Downloaded: ${mediaResult.downloadedImages} images, ${mediaResult.downloadedVideos} videos`);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
const { images, videos } = countRemoteMedia(document);
|
||||||
|
if (images > 0 || videos > 0) {
|
||||||
|
console.log(`Remote media found: ${images} images, ${videos} videos`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
await writeFile(outputPath, document, "utf-8");
|
await writeFile(outputPath, document, "utf-8");
|
||||||
|
|
||||||
console.log(`Saved: ${outputPath}`);
|
console.log(`Saved: ${outputPath}`);
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,317 @@
|
||||||
|
import path from "node:path";
|
||||||
|
import { mkdir, writeFile } from "node:fs/promises";
|
||||||
|
|
||||||
|
type MediaKind = "image" | "video";
|
||||||
|
type MediaHint = "image" | "unknown";
|
||||||
|
|
||||||
|
type MarkdownLinkCandidate = {
|
||||||
|
url: string;
|
||||||
|
hint: MediaHint;
|
||||||
|
};
|
||||||
|
|
||||||
|
export type LocalizeMarkdownMediaOptions = {
|
||||||
|
markdownPath: string;
|
||||||
|
log?: (message: string) => void;
|
||||||
|
};
|
||||||
|
|
||||||
|
export type LocalizeMarkdownMediaResult = {
|
||||||
|
markdown: string;
|
||||||
|
downloadedImages: number;
|
||||||
|
downloadedVideos: number;
|
||||||
|
imageDir: string | null;
|
||||||
|
videoDir: string | null;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Matches inline markdown links and images — `[label](url)` / `![label](url)` —
// whose target is an absolute http(s) URL, optionally wrapped in `<...>`.
// Groups: 1 = bracketed label (including a leading `!` for images),
// 2 = opening `<` if present, 3 = the URL, 4 = closing `>` if present.
// The `g` flag makes the regex stateful (`lastIndex`), so users reset it before scanning.
const MARKDOWN_LINK_RE = /(!?\[[^\]\n]*\])\((<)?(https?:\/\/[^)\s>]+)(>)?\)/g;
|
||||||
|
// Matches a `coverImage: "<http(s) url>"` line (`m` flag: `^` anchors at any line start).
// Groups: 1 = `coverImage: "` prefix, 2 = the URL, 3 = closing quote.
const FRONTMATTER_COVER_RE = /^(coverImage:\s*")(https?:\/\/[^"]+)(")/m;
|
||||||
|
|
||||||
|
const IMAGE_EXTENSIONS = new Set([
|
||||||
|
"jpg",
|
||||||
|
"jpeg",
|
||||||
|
"png",
|
||||||
|
"webp",
|
||||||
|
"gif",
|
||||||
|
"bmp",
|
||||||
|
"avif",
|
||||||
|
"heic",
|
||||||
|
"heif",
|
||||||
|
"svg",
|
||||||
|
]);
|
||||||
|
|
||||||
|
const VIDEO_EXTENSIONS = new Set(["mp4", "m4v", "mov", "webm", "mkv"]);
|
||||||
|
|
||||||
|
const MIME_EXTENSION_MAP: Record<string, string> = {
|
||||||
|
"image/jpeg": "jpg",
|
||||||
|
"image/jpg": "jpg",
|
||||||
|
"image/png": "png",
|
||||||
|
"image/webp": "webp",
|
||||||
|
"image/gif": "gif",
|
||||||
|
"image/bmp": "bmp",
|
||||||
|
"image/avif": "avif",
|
||||||
|
"image/heic": "heic",
|
||||||
|
"image/heif": "heif",
|
||||||
|
"image/svg+xml": "svg",
|
||||||
|
"video/mp4": "mp4",
|
||||||
|
"video/webm": "webm",
|
||||||
|
"video/quicktime": "mov",
|
||||||
|
"video/x-m4v": "m4v",
|
||||||
|
};
|
||||||
|
|
||||||
|
// Desktop Chrome user-agent string sent as the `user-agent` header on media download requests.
const DOWNLOAD_USER_AGENT =
|
||||||
|
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36";
|
||||||
|
|
||||||
|
/**
 * Reduces a raw `Content-Type` header to its bare media type.
 *
 * Parameters after the first `;` (e.g. `charset=...`) are dropped, surrounding
 * whitespace is removed and the result is lower-cased.
 *
 * @param raw Header value, or `null` when the header is absent.
 * @returns The normalized media type, or `""` when no header was supplied.
 */
function normalizeContentType(raw: string | null): string {
  if (raw == null) {
    return "";
  }
  const [mediaType] = raw.split(";");
  return (mediaType ?? "").trim().toLowerCase();
}
|
||||||
|
|
||||||
|
/**
 * Canonicalizes a file extension: surrounding whitespace and one leading dot
 * are removed, the value is lower-cased, and `jpeg` is folded into `jpg`.
 *
 * Whitespace is trimmed *before* the dot is stripped so that inputs such as
 * `" .png"` normalize to `"png"` instead of keeping the dot.
 *
 * @param raw Extension with or without a leading dot; may be empty/nullish.
 * @returns The canonical extension, or `undefined` when nothing usable remains.
 */
function normalizeExtension(raw: string | undefined | null): string | undefined {
  if (!raw) return undefined;
  // Trim on both sides of the dot strip: handles " .png" as well as ". png".
  const cleaned = raw.trim().replace(/^\./, "").trim().toLowerCase();
  if (!cleaned) return undefined;
  return cleaned === "jpeg" ? "jpg" : cleaned;
}
|
||||||
|
|
||||||
|
/**
 * Derives a normalized file extension from a URL.
 *
 * The extension of the URL path is preferred; when the path has none, the
 * `format` query parameter is consulted instead.
 *
 * @param rawUrl Absolute URL to inspect.
 * @returns The normalized extension, or `undefined` if none is found or the
 *          URL cannot be parsed.
 */
function resolveExtensionFromUrl(rawUrl: string): string | undefined {
  try {
    const { pathname, searchParams } = new URL(rawUrl);
    return (
      normalizeExtension(path.posix.extname(pathname)) ||
      normalizeExtension(searchParams.get("format")) ||
      undefined
    );
  } catch {
    // Unparseable URL: treat as "no extension".
    return undefined;
  }
}
|
||||||
|
|
||||||
|
/**
 * Classifies a normalized media type as image or video by its top-level type.
 *
 * @param contentType Normalized (lower-case, parameter-free) media type.
 * @returns `"image"` for `image/*`, `"video"` for `video/*`, otherwise `undefined`.
 */
function resolveKindFromContentType(contentType: string): MediaKind | undefined {
  for (const kind of ["image", "video"] as const) {
    if (contentType.startsWith(`${kind}/`)) {
      return kind;
    }
  }
  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Classifies a normalized file extension using the module's image and video
 * extension sets.
 *
 * @param ext Normalized extension without a dot, or `undefined`.
 * @returns `"image"`, `"video"`, or `undefined` when the extension is missing
 *          or not recognised.
 */
function resolveKindFromExtension(ext: string | undefined): MediaKind | undefined {
  if (ext === undefined || ext === "") {
    return undefined;
  }
  return IMAGE_EXTENSIONS.has(ext) ? "image" : VIDEO_EXTENSIONS.has(ext) ? "video" : undefined;
}
|
||||||
|
|
||||||
|
/**
 * Decides whether a fetched resource is an image, a video, or neither.
 *
 * Precedence: the response media type, then the URL extension. If both are
 * inconclusive, the markdown hint is trusted only when the server sent no
 * media type or the generic `application/octet-stream`; any other explicit
 * media type (e.g. `text/html`) rules the resource out.
 *
 * @param rawUrl      URL of the resource (currently not consulted).
 * @param contentType Normalized response media type, `""` if absent.
 * @param extension   Normalized extension derived from the URL, if any.
 * @param hint        `"image"` when the link used image syntax, else `"unknown"`.
 * @returns The media kind, or `undefined` when the resource should be skipped.
 */
function resolveMediaKind(
  rawUrl: string,
  contentType: string,
  extension: string | undefined,
  hint: MediaHint
): MediaKind | undefined {
  const detected = resolveKindFromContentType(contentType) ?? resolveKindFromExtension(extension);
  if (detected) {
    return detected;
  }

  const typeIsInconclusive = contentType === "" || contentType === "application/octet-stream";
  if (typeIsInconclusive && hint === "image") {
    return "image";
  }
  return undefined;
}
|
||||||
|
|
||||||
|
/**
 * Chooses the extension for the file written to disk.
 *
 * Precedence: extension mapped from the response media type, then the
 * extension derived from the URL, then a per-kind default (`mp4` for video,
 * `jpg` for image).
 *
 * @param contentType Normalized response media type, `""` if absent.
 * @param extension   Extension derived from the URL, if any.
 * @param kind        Resolved media kind, used only for the final default.
 * @returns A non-empty extension without a leading dot.
 */
function resolveOutputExtension(
  contentType: string,
  extension: string | undefined,
  kind: MediaKind
): string {
  // `contentType` comes from a remote server. Restrict the lookup to own keys so
  // values such as "constructor" or "__proto__" cannot resolve to members of
  // Object.prototype (which are not strings and would make normalization throw).
  const mapped = Object.prototype.hasOwnProperty.call(MIME_EXTENSION_MAP, contentType)
    ? MIME_EXTENSION_MAP[contentType]
    : undefined;

  return (
    normalizeExtension(mapped) ??
    normalizeExtension(extension) ??
    (kind === "video" ? "mp4" : "jpg")
  );
}
|
||||||
|
|
||||||
|
/**
 * Percent-decodes a URI component without throwing.
 *
 * @param value Possibly percent-encoded text.
 * @returns The decoded text, or `value` unchanged when it is malformed.
 */
function safeDecodeURIComponent(value: string): string {
  let decoded = value;
  try {
    decoded = decodeURIComponent(value);
  } catch {
    // Malformed escape sequence: fall through with the raw input.
  }
  return decoded;
}
|
||||||
|
|
||||||
|
/**
 * Turns arbitrary text into a short, filesystem-safe file-name segment.
 *
 * Every run of characters outside `[a-zA-Z0-9_-]` becomes a single `-`,
 * repeated dashes are collapsed, leading/trailing `-`/`_` are removed and the
 * result is capped at 48 characters.
 *
 * @param input Text to sanitize (e.g. a decoded URL basename).
 * @returns The sanitized segment; may be empty when nothing usable remains.
 */
function sanitizeFileSegment(input: string): string {
  return (
    input
      .replace(/[^a-zA-Z0-9_-]+/g, "-")
      .replace(/-+/g, "-")
      .replace(/^[-_]+|[-_]+$/g, "")
      .slice(0, 48)
      // Truncation can cut right after a separator; trim again so the segment
      // never ends in "-" or "_" (which would yield names like "img-001-foo-.jpg").
      .replace(/[-_]+$/, "")
  );
}
|
||||||
|
|
||||||
|
/**
 * Builds a sanitized file-name stem from the last path segment of a URL.
 *
 * The segment is percent-decoded, a trailing `.{extension}` is removed
 * (case-insensitively) and the remainder is sanitized for use in a file name.
 *
 * The suffix is matched by plain string comparison rather than by building a
 * RegExp from `extension`, so extensions containing regex metacharacters are
 * handled literally. When the extension normalizes to `jpg`, a `.jpeg` suffix
 * is stripped as well, so `photo.jpeg` does not become `photo-jpeg`.
 *
 * @param rawUrl    Absolute URL of the media resource.
 * @param extension Extension the output file will use.
 * @returns The sanitized stem, or `""` when the URL has no usable basename or
 *          cannot be parsed.
 */
function resolveFileStem(rawUrl: string, extension: string): string {
  try {
    const parsed = new URL(rawUrl);
    const base = path.posix.basename(parsed.pathname);
    if (!base) return "";

    const decodedBase = safeDecodeURIComponent(base);
    const normalizedExt = normalizeExtension(extension);

    let rawStem = decodedBase;
    if (normalizedExt) {
      // "jpeg" is normalized to "jpg", so accept both spellings on the source name.
      const suffixes = normalizedExt === "jpg" ? [".jpg", ".jpeg"] : [`.${normalizedExt}`];
      const matched = suffixes.find(
        (suffix) =>
          decodedBase.length >= suffix.length &&
          decodedBase.slice(-suffix.length).toLowerCase() === suffix
      );
      if (matched) {
        rawStem = decodedBase.slice(0, decodedBase.length - matched.length);
      }
    }

    return sanitizeFileSegment(rawStem);
  } catch {
    // Unparseable URL: no stem available.
    return "";
  }
}
|
||||||
|
|
||||||
|
/**
 * Composes the local file name for a downloaded asset:
 * `{img|video}-{NNN}[-{stem}].{extension}`.
 *
 * @param kind      Media kind; selects the `img` or `video` prefix.
 * @param index     Serial number, zero-padded to at least three digits.
 * @param sourceUrl URL the asset was fetched from; supplies the optional stem.
 * @param extension Output extension without a leading dot.
 * @returns The file name (no directory component).
 */
function buildFileName(kind: MediaKind, index: number, sourceUrl: string, extension: string): string {
  const parts = [kind === "image" ? "img" : "video", String(index).padStart(3, "0")];

  const stem = resolveFileStem(sourceUrl, extension);
  if (stem) {
    parts.push(stem);
  }

  return `${parts.join("-")}.${extension}`;
}
|
||||||
|
|
||||||
|
/**
 * Collects every distinct remote URL in a markdown document that might refer
 * to downloadable media.
 *
 * The front-matter `coverImage` (if any) comes first, followed by the targets
 * of inline links/images in document order. Each URL is reported once, with
 * the hint of its first occurrence.
 *
 * @param markdown Full markdown document, optionally starting with front matter.
 * @returns Candidates in first-seen order.
 */
function collectMarkdownLinkCandidates(markdown: string): MarkdownLinkCandidate[] {
  // Map keeps insertion order, giving "first occurrence wins" de-duplication.
  const byUrl = new Map<string, MarkdownLinkCandidate>();
  const remember = (url: string, hint: MediaHint): void => {
    if (url && !byUrl.has(url)) {
      byUrl.set(url, { url, hint });
    }
  };

  const frontMatter = /^---\n([\s\S]*?)\n---/.exec(markdown);
  const coverUrl = frontMatter?.[1]?.match(FRONTMATTER_COVER_RE)?.[2];
  if (coverUrl) {
    remember(coverUrl, "image");
  }

  // The shared regex is global and therefore stateful; always scan from the start.
  MARKDOWN_LINK_RE.lastIndex = 0;
  for (const found of markdown.matchAll(MARKDOWN_LINK_RE)) {
    const label = found[1] ?? "";
    remember(found[3] ?? "", label.startsWith("![") ? "image" : "unknown");
  }

  return [...byUrl.values()];
}
|
||||||
|
|
||||||
|
/**
 * Replaces remote media URLs in a markdown document with local paths.
 *
 * Inline link/image targets and the `coverImage` line are rewritten when their
 * URL has an entry in `replacements`; everything else is left untouched. Angle
 * brackets around a rewritten link target are dropped.
 *
 * @param markdown     Document to rewrite.
 * @param replacements Map from original remote URL to local relative path.
 * @returns The rewritten document (the input itself when the map is empty).
 */
function rewriteMarkdownMediaLinks(markdown: string, replacements: Map<string, string>): string {
  if (replacements.size === 0) {
    return markdown;
  }

  // The shared regex is global and therefore stateful; always scan from the start.
  MARKDOWN_LINK_RE.lastIndex = 0;
  const withLocalLinks = markdown.replace(
    MARKDOWN_LINK_RE,
    (whole: string, label: string, _openAngle: string | undefined, remoteUrl: string) => {
      const local = replacements.get(remoteUrl);
      return local ? `${label}(${local})` : whole;
    }
  );

  return withLocalLinks.replace(
    FRONTMATTER_COVER_RE,
    (whole: string, head: string, remoteUrl: string, tail: string) => {
      const local = replacements.get(remoteUrl);
      return local ? `${head}${local}${tail}` : whole;
    }
  );
}
|
||||||
|
|
||||||
|
/**
 * Best-effort release of a response whose body will not be read, so the
 * underlying connection is not held open until garbage collection.
 */
async function discardResponseBody(response: Response): Promise<void> {
  try {
    await response.body?.cancel();
  } catch {
    // Releasing the connection is an optimisation; never fail the run over it.
  }
}

/**
 * Downloads the remote images and videos referenced by a markdown document and
 * rewrites the document to point at the local copies.
 *
 * Every distinct http(s) link target (and the front-matter `coverImage`) is
 * fetched sequentially. Responses identified as images are written to `imgs/`
 * and videos to `videos/`, both next to the markdown file; anything else is
 * skipped. A failure on one URL is logged and does not abort the others, and
 * links whose download failed or was skipped keep their remote URL.
 *
 * @param markdown Markdown document to process.
 * @param options  `markdownPath` locates the output directories; `log`
 *                 receives skip/failure messages (defaults to a no-op).
 * @returns The rewritten markdown, the number of images/videos saved, and the
 *          directories that received files (`null` when none were saved).
 */
export async function localizeMarkdownMedia(
  markdown: string,
  options: LocalizeMarkdownMediaOptions
): Promise<LocalizeMarkdownMediaResult> {
  const log = options.log ?? (() => {});
  const markdownDir = path.dirname(options.markdownPath);
  const candidates = collectMarkdownLinkCandidates(markdown);

  if (candidates.length === 0) {
    return {
      markdown,
      downloadedImages: 0,
      downloadedVideos: 0,
      imageDir: null,
      videoDir: null,
    };
  }

  const replacements = new Map<string, string>();
  let downloadedImages = 0;
  let downloadedVideos = 0;

  for (const candidate of candidates) {
    try {
      const response = await fetch(candidate.url, {
        method: "GET",
        redirect: "follow",
        headers: {
          "user-agent": DOWNLOAD_USER_AGENT,
        },
      });

      if (!response.ok) {
        log(`[url-to-markdown] Skip media (${response.status}): ${candidate.url}`);
        await discardResponseBody(response);
        continue;
      }

      // Prefer the post-redirect URL for naming/typing; fall back to the original.
      const sourceUrl = response.url || candidate.url;
      const contentType = normalizeContentType(response.headers.get("content-type"));
      const extension = resolveExtensionFromUrl(sourceUrl) ?? resolveExtensionFromUrl(candidate.url);
      const kind = resolveMediaKind(sourceUrl, contentType, extension, candidate.hint);
      if (!kind) {
        // Ordinary hyperlinks (HTML pages etc.) land here; drop the body unread.
        await discardResponseBody(response);
        continue;
      }

      const outputExtension = resolveOutputExtension(contentType, extension, kind);
      // Counters double as 1-based serial numbers; they advance only after a successful write.
      const nextIndex = kind === "image" ? downloadedImages + 1 : downloadedVideos + 1;
      const dirName = kind === "image" ? "imgs" : "videos";
      const targetDir = path.join(markdownDir, dirName);
      await mkdir(targetDir, { recursive: true });

      const fileName = buildFileName(kind, nextIndex, sourceUrl, outputExtension);
      const absolutePath = path.join(targetDir, fileName);
      // Markdown paths always use forward slashes, regardless of platform.
      const relativePath = path.posix.join(dirName, fileName);
      const bytes = Buffer.from(await response.arrayBuffer());
      await writeFile(absolutePath, bytes);
      replacements.set(candidate.url, relativePath);

      if (kind === "image") {
        downloadedImages = nextIndex;
      } else {
        downloadedVideos = nextIndex;
      }
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error ?? "");
      log(`[url-to-markdown] Failed to download media ${candidate.url}: ${message}`);
    }
  }

  return {
    markdown: rewriteMarkdownMediaLinks(markdown, replacements),
    downloadedImages,
    downloadedVideos,
    imageDir: downloadedImages > 0 ? path.join(markdownDir, "imgs") : null,
    videoDir: downloadedVideos > 0 ? path.join(markdownDir, "videos") : null,
  };
}
|
||||||
|
|
||||||
|
/**
 * Estimates, without any network access, how many remote images and videos a
 * markdown document references.
 *
 * Classification is by URL extension; a link with an unrecognised extension
 * still counts as an image when it used image syntax (or is the cover image).
 *
 * @param markdown Markdown document to inspect.
 * @returns Counts of distinct remote images and videos, and whether the front
 *          matter carries a remote `coverImage`.
 */
export function countRemoteMedia(markdown: string): { images: number; videos: number; hasCoverImage: boolean } {
  const frontMatter = /^---\n([\s\S]*?)\n---/.exec(markdown);
  const hasCoverImage = Boolean(frontMatter?.[1]?.match(FRONTMATTER_COVER_RE)?.[2]);

  const tally = { images: 0, videos: 0 };
  for (const { url, hint } of collectMarkdownLinkCandidates(markdown)) {
    const kind = resolveKindFromExtension(resolveExtensionFromUrl(url));
    if (kind === "video") {
      tally.videos += 1;
    } else if (kind === "image" || hint === "image") {
      tally.images += 1;
    }
  }

  return { images: tally.images, videos: tally.videos, hasCoverImage };
}
|
||||||
Loading…
Reference in New Issue