diff --git a/.claude-plugin/marketplace.json b/.claude-plugin/marketplace.json index 5a20d0f..39718a4 100644 --- a/.claude-plugin/marketplace.json +++ b/.claude-plugin/marketplace.json @@ -47,7 +47,8 @@ "./skills/baoyu-url-to-markdown", "./skills/baoyu-format-markdown", "./skills/baoyu-markdown-to-html", - "./skills/baoyu-translate" + "./skills/baoyu-translate", + "./skills/baoyu-youtube-transcript" ] } ] diff --git a/.gitignore b/.gitignore index c37c4df..eb59e2a 100644 --- a/.gitignore +++ b/.gitignore @@ -166,3 +166,4 @@ posts/ .clawdhub/ .release-artifacts/ .worktrees/ +youtube-transcript/ diff --git a/skills/baoyu-youtube-transcript/SKILL.md b/skills/baoyu-youtube-transcript/SKILL.md new file mode 100644 index 0000000..ee4c59a --- /dev/null +++ b/skills/baoyu-youtube-transcript/SKILL.md @@ -0,0 +1,172 @@ +--- +name: baoyu-youtube-transcript +description: Downloads YouTube video transcripts/subtitles and cover images by URL or video ID. Supports multiple languages, translation, chapters, and speaker identification. Caches raw data for fast re-formatting. Use when user asks to "get YouTube transcript", "download subtitles", "get captions", "YouTube字幕", "YouTube封面", "视频封面", "video thumbnail", "video cover image", or provides a YouTube URL and wants the transcript/subtitle text or cover image extracted. +version: 1.1.0 +metadata: + openclaw: + homepage: https://github.com/JimLiu/baoyu-skills#baoyu-youtube-transcript + requires: + anyBins: + - bun + - npx +--- + +# YouTube Transcript + +Downloads transcripts (subtitles/captions) from YouTube videos. Works with both manually created and auto-generated transcripts. No API key or browser required — uses YouTube's InnerTube API directly. + +Fetches video metadata and cover image on first run, caches raw data for fast re-formatting. + +## Script Directory + +Scripts in `scripts/` subdirectory. `{baseDir}` = this SKILL.md's directory path. 
Resolve `${BUN_X}` runtime: if `bun` installed → `bun`; if `npx` available → `npx -y bun`; else suggest installing bun. Replace `{baseDir}` and `${BUN_X}` with actual values. + +| Script | Purpose | +|--------|---------| +| `scripts/main.ts` | Transcript download CLI | + +## Usage + +```bash +# Default: markdown with timestamps (English) +${BUN_X} {baseDir}/scripts/main.ts + +# Specify languages (priority order) +${BUN_X} {baseDir}/scripts/main.ts --languages zh,en,ja + +# Without timestamps +${BUN_X} {baseDir}/scripts/main.ts --no-timestamps + +# With chapter segmentation +${BUN_X} {baseDir}/scripts/main.ts --chapters + +# With speaker identification (requires AI post-processing) +${BUN_X} {baseDir}/scripts/main.ts --speakers + +# SRT subtitle file +${BUN_X} {baseDir}/scripts/main.ts --format srt + +# Translate transcript +${BUN_X} {baseDir}/scripts/main.ts --translate zh-Hans + +# List available transcripts +${BUN_X} {baseDir}/scripts/main.ts --list + +# Force re-fetch (ignore cache) +${BUN_X} {baseDir}/scripts/main.ts --refresh +``` + +## Options + +| Option | Description | Default | +|--------|-------------|---------| +| `` | YouTube URL or video ID (multiple allowed) | Required | +| `--languages ` | Language codes, comma-separated, in priority order | `en` | +| `--format ` | Output format: `text`, `srt` | `text` | +| `--translate ` | Translate to specified language code | | +| `--list` | List available transcripts instead of fetching | | +| `--timestamps` | Include `[HH:MM:SS → HH:MM:SS]` timestamps per paragraph | on | +| `--no-timestamps` | Disable timestamps | | +| `--chapters` | Chapter segmentation from video description | | +| `--speakers` | Raw transcript with metadata for speaker identification | | +| `--exclude-generated` | Skip auto-generated transcripts | | +| `--exclude-manually-created` | Skip manually created transcripts | | +| `--refresh` | Force re-fetch, ignore cached data | | +| `-o, --output ` | Save to specific file path | auto-generated | 
+| `--output-dir ` | Base output directory | `youtube-transcript` | + +## Input Formats + +Accepts any of these as video input: +- Full URL: `https://www.youtube.com/watch?v=dQw4w9WgXcQ` +- Short URL: `https://youtu.be/dQw4w9WgXcQ` +- Embed URL: `https://www.youtube.com/embed/dQw4w9WgXcQ` +- Shorts URL: `https://www.youtube.com/shorts/dQw4w9WgXcQ` +- Video ID: `dQw4w9WgXcQ` + +## Output Formats + +| Format | Extension | Description | +|--------|-----------|-------------| +| `text` | `.md` | Markdown with frontmatter, natural paragraphs, optional timestamps/chapters/speakers | +| `srt` | `.srt` | SubRip subtitle format for video players | + +## Output Directory + +``` +youtube-transcript/ +├── .index.json # Video ID → directory path mapping (for cache lookup) +└── {channel-slug}/{title-full-slug}/ + ├── meta.json # Video metadata (title, channel, description, duration, chapters, etc.) + ├── transcript-raw.srt # Raw transcript in SRT format (cached, token-efficient for LLM) + ├── imgs/ + │ └── cover.jpg # Video thumbnail + ├── transcript.md # Markdown transcript + └── transcript.srt # SRT subtitle (if --format srt) +``` + +- `{channel-slug}`: Channel name in kebab-case +- `{title-full-slug}`: Full video title in kebab-case + +The `--list` mode outputs to stdout only (no file saved). + +## Caching + +On first fetch, the script saves: +- `meta.json` — video metadata, chapters, cover image path, language info +- `transcript-raw.srt` — raw transcript in SRT format (pre-computed timestamps, token-efficient for LLM processing) +- `imgs/cover.jpg` — video thumbnail + +Subsequent runs for the same video use cached data (no network calls). Use `--refresh` to force re-fetch. If a different language is requested, the cache is automatically refreshed. + +## Workflow + +When user provides a YouTube URL and wants the transcript: + +1. Run with `--list` first if the user hasn't specified a language, to show available options +2. 
Default: run with `--chapters --speakers` for the richest output (chapters + speaker identification) +3. The script auto-saves cached data + output file and prints the file path +4. For `--speakers` mode: after the script saves the raw file, follow the speaker identification workflow below to post-process with speaker labels + +When user only wants a cover image or metadata, running the script with any option will also cache `meta.json` and `imgs/cover.jpg`. + +When re-formatting the same video (e.g., first text then SRT), the cached data is reused — no re-fetch needed. + +## Chapter & Speaker Workflow + +### Chapters (`--chapters`) + +The script parses chapter timestamps from the video description (e.g., `0:00 Introduction`), segments the transcript by chapter boundaries, groups snippets into readable paragraphs, and saves as `.md` with a Table of Contents. No further processing needed. + +If no chapter timestamps exist in the description, the transcript is output as grouped paragraphs without chapter headings. + +### Speaker Identification (`--speakers`) + +Speaker identification requires AI processing. The script outputs a raw `.md` file containing: +- YAML frontmatter with video metadata (title, channel, date, cover, language) +- Video description (for speaker name extraction) +- Chapter list from description (if available) +- Raw transcript in SRT format (pre-computed start/end timestamps, token-efficient) + +After the script saves the raw file: + +1. Read the saved `.md` file +2. Read the prompt template at `{baseDir}/prompts/speaker-transcript.md` +3. 
Process the raw transcript following the prompt: + - Identify speakers using video metadata (title → guest, channel → host, description → names) + - Detect speaker turns from conversation flow, question-answer patterns, and contextual cues + - Segment into chapters (use description chapters if available, else create from topic shifts) + - Format with `**Speaker Name:**` labels, paragraph grouping (2-4 sentences), and `[HH:MM:SS → HH:MM:SS]` timestamps +4. Overwrite the `.md` file with the processed transcript (keep the YAML frontmatter) + +When `--speakers` is used, `--chapters` is implied — the processed output always includes chapter segmentation. + +## Error Cases + +| Error | Meaning | +|-------|---------| +| Transcripts disabled | Video has no captions at all | +| No transcript found | Requested language not available | +| Video unavailable | Video deleted, private, or region-locked | +| IP blocked | Too many requests, try again later | +| Age restricted | Video requires login for age verification | diff --git a/skills/baoyu-youtube-transcript/prompts/speaker-transcript.md b/skills/baoyu-youtube-transcript/prompts/speaker-transcript.md new file mode 100644 index 0000000..9e4808f --- /dev/null +++ b/skills/baoyu-youtube-transcript/prompts/speaker-transcript.md @@ -0,0 +1,108 @@ +# Speaker & Chapter Transcript Processing + +You are an expert transcript specialist. Process the raw transcript file (with YAML frontmatter metadata and SRT-formatted transcript) into a structured, verbatim transcript with speaker identification and chapter segmentation. + +## Output Structure + +Produce a single cohesive markdown file containing: +1. YAML frontmatter (keep the original frontmatter from the raw file) +2. Table of Contents +3. Full chapter-segmented transcript with speaker labels + +Use the same language as the transcription for the title and ToC. 
+ +## Rules + +### Transcription Fidelity +- Preserve every spoken word exactly, including filler words (`um`, `uh`, `like`) and stutters +- **NEVER translate.** If the audio mixes languages (e.g., "这个 feature 很酷"), replicate that mix exactly + +### Speaker Identification +- **Priority 1: Use metadata.** Analyze the video's title, channel name, and description to identify speakers +- **Priority 2: Use transcript content.** Look for introductions, how speakers address each other, contextual cues +- **Fallback:** Use consistent generic labels (`**Speaker 1:**`, `**Host:**`, etc.) +- **Consistency:** If a speaker's name is revealed later, update ALL previous labels for that speaker + +### Chapter Generation +- If the raw file contains a `# Chapters` section, use those as the primary basis for segmenting +- Otherwise, create chapters based on significant topic shifts in the conversation + +### Input Format +- The `# Transcript` section contains SRT-formatted subtitles with pre-computed start/end timestamps +- Each SRT block has: sequence number, `HH:MM:SS,mmm --> HH:MM:SS,mmm` timestamp line, and text +- Use the SRT timestamps directly — no need to calculate paragraph start/end times, just merge adjacent blocks + +### Formatting + +**Timestamps:** Use `[HH:MM:SS → HH:MM:SS]` format (start → end) at the end of each paragraph. No milliseconds. + +**Table of Contents:** +``` +## Table of Contents +* [HH:MM:SS] Chapter Title +``` + +**Chapters:** +``` +## [HH:MM:SS] Chapter Title +``` +Two blank lines between chapters. + +**Dialogue Paragraphs:** +- First paragraph of a speaker's turn starts with `**Speaker Name:** ` +- Split long monologues into 2-4 sentence paragraphs separated by blank lines +- Subsequent paragraphs from the SAME speaker do NOT repeat the speaker label +- Every paragraph ends with exactly ONE timestamp range `[HH:MM:SS → HH:MM:SS]` + +Correct example: +``` +**Jane Doe:** The study focuses on long-term effects of dietary changes. 
We tracked two groups over five years. [00:00:15 → 00:00:21] + +The first group followed the new regimen, while the second group maintained a traditional diet. [00:00:21 → 00:00:28] + +**Host:** Fascinating. And what did you find? [00:00:28 → 00:00:31] +``` + +Wrong (multiple timestamps in one paragraph): +``` +**Host:** Welcome back. [00:00:01] Today we have a guest. [00:00:02] +``` + +**Non-Speech Audio:** On its own line: `[Laughter] [HH:MM:SS]` + +## Example Output + +```markdown +--- +title: "Example Interview" +channel: "The Show" +date: 2024-04-15 +url: "https://www.youtube.com/watch?v=xxx" +cover: imgs/cover.jpg +language: en +--- + +## Table of Contents +* [00:00:00] Introduction and Welcome +* [00:00:12] Overview of the New Research + + +## [00:00:00] Introduction and Welcome + +**Host:** Welcome back to the show. Today, we have a, uh, very special guest, Jane Doe. [00:00:00 → 00:00:03] + +**Jane Doe:** Thank you for having me. I'm excited to be here and discuss the findings. [00:00:03 → 00:00:07] + + +## [00:00:12] Overview of the New Research + +**Host:** So, Jane, before we get into the nitty-gritty, could you, you know, give us a brief overview for our audience? [00:00:12 → 00:00:16] + +**Jane Doe:** Of course. The study focuses on the long-term effects of specific dietary changes. It's a bit complicated but essentially we tracked two large groups over a five-year period. [00:00:16 → 00:00:23] + +The first group followed the new regimen, while the second group, our control, maintained a traditional diet. This allowed us to isolate variables effectively. [00:00:23 → 00:00:30] + +[Laughter] [00:00:30] + +**Host:** Fascinating. And what did you find? 
[00:00:31 → 00:00:33] +``` diff --git a/skills/baoyu-youtube-transcript/scripts/main.ts b/skills/baoyu-youtube-transcript/scripts/main.ts new file mode 100644 index 0000000..2a01c8a --- /dev/null +++ b/skills/baoyu-youtube-transcript/scripts/main.ts @@ -0,0 +1,696 @@ +#!/usr/bin/env bun +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "fs"; +import { dirname, join, resolve } from "path"; + +type Format = "text" | "srt"; + +interface Options { + videoIds: string[]; + languages: string[]; + format: Format; + translate: string; + list: boolean; + excludeGenerated: boolean; + excludeManual: boolean; + output: string; + outputDir: string; + timestamps: boolean; + chapters: boolean; + speakers: boolean; + refresh: boolean; +} + +interface Snippet { + text: string; + start: number; + duration: number; +} + +interface TranscriptInfo { + language: string; + languageCode: string; + isGenerated: boolean; + isTranslatable: boolean; + baseUrl: string; + translationLanguages: { language: string; languageCode: string }[]; +} + +interface Chapter { + title: string; + start: number; +} + +interface VideoMeta { + videoId: string; + title: string; + channel: string; + channelId: string; + description: string; + duration: number; + publishDate: string; + url: string; + coverImage: string; + thumbnailUrl: string; + language: { code: string; name: string; isGenerated: boolean }; + chapters: Chapter[]; +} + +interface VideoResult { + videoId: string; + title?: string; + filePath?: string; + content?: string; + error?: string; +} + +const WATCH_URL = "https://www.youtube.com/watch?v="; +const INNERTUBE_URL = "https://www.youtube.com/youtubei/v1/player"; +const INNERTUBE_CTX = { client: { clientName: "ANDROID", clientVersion: "20.10.38" } }; + +function extractVideoId(input: string): string { + input = input.replace(/\\/g, "").trim(); + const patterns = [ + 
/(?:youtube\.com\/watch\?.*v=|youtu\.be\/|youtube\.com\/embed\/|youtube\.com\/v\/|youtube\.com\/shorts\/)([a-zA-Z0-9_-]{11})/,
+    /^([a-zA-Z0-9_-]{11})$/,
+  ];
+  for (const p of patterns) {
+    const m = input.match(p);
+    if (m) return m[1];
+  }
+  return input;
+}
+
+function slugify(s: string): string {
+  return s
+    .toLowerCase()
+    .replace(/[^\w\s-]/g, "")
+    .replace(/\s+/g, "-")
+    .replace(/-+/g, "-")
+    .replace(/^-|-$/g, "") || "untitled";
+}
+
+function htmlUnescape(s: string): string {
+  return s
+    .replace(/&amp;/g, "&")
+    .replace(/&lt;/g, "<")
+    .replace(/&gt;/g, ">")
+    .replace(/&quot;/g, '"')
+    .replace(/&#39;/g, "'")
+    .replace(/&apos;/g, "'")
+    .replace(/&#x2F;/g, "/")
+    .replace(/&#x27;/g, "'")
+    .replace(/&#(\d+);/g, (_, n) => String.fromCharCode(parseInt(n)))
+    .replace(/&#x([0-9a-fA-F]+);/g, (_, n) => String.fromCharCode(parseInt(n, 16)));
+}
+
+function stripTags(s: string): string {
+  return s.replace(/<[^>]*>/g, "");
+}
+
+function parseTranscriptXml(xml: string): Snippet[] {
+  const snippets: Snippet[] = [];
+  const re = /<text start="([\d.]+)" dur="([\d.]+)"[^>]*>([\s\S]*?)<\/text>/g;
+  let m: RegExpExecArray | null;
+  while ((m = re.exec(xml)) !== null) {
+    const raw = m[3];
+    if (!raw) continue;
+    snippets.push({
+      text: htmlUnescape(stripTags(raw)),
+      start: parseFloat(m[1]),
+      duration: parseFloat(m[2] || "0"),
+    });
+  }
+  return snippets;
+}
+
+// --- YouTube API ---
+
+async function fetchHtml(videoId: string): Promise<string> {
+  const r = await fetch(WATCH_URL + videoId, {
+    headers: { "Accept-Language": "en-US", "User-Agent": "Mozilla/5.0" },
+  });
+  if (!r.ok) throw new Error(`HTTP ${r.status} fetching video page`);
+  let html = await r.text();
+  if (html.includes('action="https://consent.youtube.com/s"')) {
+    const cv = html.match(/name="v" value="(.*?)"/);
+    if (!cv) throw new Error("Failed to create consent cookie");
+    const r2 = await fetch(WATCH_URL + videoId, {
+      headers: {
+        "Accept-Language": "en-US",
+        "User-Agent": "Mozilla/5.0",
+        Cookie: `CONSENT=YES+${cv[1]}`,
+      },
+    });
+    if (!r2.ok) throw new 
Error(`HTTP ${r2.status} fetching video page (consent)`);
+    html = await r2.text();
+  }
+  return html;
+}
+
+function extractApiKey(html: string, videoId: string): string {
+  const m = html.match(/"INNERTUBE_API_KEY":\s*"([a-zA-Z0-9_-]+)"/);
+  if (!m) {
+    if (html.includes('class="g-recaptcha"')) throw new Error(`IP blocked for ${videoId} (reCAPTCHA)`);
+    throw new Error(`Cannot extract API key for ${videoId}`);
+  }
+  return m[1];
+}
+
+async function fetchInnertubeData(videoId: string, apiKey: string): Promise<any> {
+  const r = await fetch(`${INNERTUBE_URL}?key=${apiKey}`, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify({ context: INNERTUBE_CTX, videoId }),
+  });
+  if (r.status === 429) throw new Error(`IP blocked for ${videoId} (429)`);
+  if (!r.ok) throw new Error(`HTTP ${r.status} from InnerTube API`);
+  return r.json();
+}
+
+function assertPlayability(data: any, videoId: string) {
+  const ps = data?.playabilityStatus;
+  if (!ps) return;
+  const status = ps.status;
+  if (status === "OK" || !status) return;
+  const reason = ps.reason || "";
+  if (status === "LOGIN_REQUIRED") {
+    if (reason.includes("bot")) throw new Error(`Request blocked for ${videoId}: bot detected`);
+    if (reason.includes("inappropriate")) throw new Error(`Age restricted: ${videoId}`);
+  }
+  if (status === "ERROR" && reason.includes("unavailable")) {
+    if (videoId.startsWith("http")) throw new Error(`Invalid video ID: pass the ID, not the URL`);
+    throw new Error(`Video unavailable: ${videoId}`);
+  }
+  const subreasons = ps.errorScreen?.playerErrorMessageRenderer?.subreason?.runs?.map((r: any) => r.text).join("") || "";
+  throw new Error(`Video unplayable (${videoId}): ${reason} ${subreasons}`.trim());
+}
+
+function extractCaptionsJson(data: any, videoId: string): any {
+  assertPlayability(data, videoId);
+  const cj = data?.captions?.playerCaptionsTracklistRenderer;
+  if (!cj || !cj.captionTracks) throw new Error(`Transcripts disabled for 
${videoId}`); + return cj; +} + +function buildTranscriptList(captionsJson: any): TranscriptInfo[] { + const tlLangs = (captionsJson.translationLanguages || []).map((tl: any) => ({ + language: tl.languageName?.runs?.[0]?.text || tl.languageName?.simpleText || "", + languageCode: tl.languageCode, + })); + return (captionsJson.captionTracks || []).map((t: any) => ({ + language: t.name?.runs?.[0]?.text || t.name?.simpleText || "", + languageCode: t.languageCode, + isGenerated: t.kind === "asr", + isTranslatable: !!t.isTranslatable, + baseUrl: (t.baseUrl || "").replace(/&fmt=srv3/g, ""), + translationLanguages: t.isTranslatable ? tlLangs : [], + })); +} + +function findTranscript( + transcripts: TranscriptInfo[], + languages: string[], + excludeGenerated: boolean, + excludeManual: boolean +): TranscriptInfo { + let filtered = transcripts; + if (excludeGenerated) filtered = filtered.filter((t) => !t.isGenerated); + if (excludeManual) filtered = filtered.filter((t) => t.isGenerated); + for (const lang of languages) { + const found = filtered.find((t) => t.languageCode === lang); + if (found) return found; + } + const available = filtered.map((t) => `${t.languageCode} ("${t.language}")`).join(", "); + throw new Error(`No transcript found for languages [${languages.join(", ")}]. 
Available: ${available || "none"}`); +} + +async function fetchTranscriptSnippets(info: TranscriptInfo, translateTo?: string): Promise<{ snippets: Snippet[]; language: string; languageCode: string }> { + let url = info.baseUrl; + let lang = info.language; + let langCode = info.languageCode; + if (translateTo) { + if (!info.isTranslatable) throw new Error(`Transcript ${info.languageCode} is not translatable`); + const tl = info.translationLanguages.find((t) => t.languageCode === translateTo); + if (!tl) throw new Error(`Translation language ${translateTo} not available`); + url += `&tlang=${translateTo}`; + lang = tl.language; + langCode = translateTo; + } + const r = await fetch(url, { headers: { "Accept-Language": "en-US" } }); + if (!r.ok) throw new Error(`HTTP ${r.status} fetching transcript`); + return { snippets: parseTranscriptXml(await r.text()), language: lang, languageCode: langCode }; +} + +// --- Metadata & chapters --- + +function parseChapters(description: string): Chapter[] { + const chapters: Chapter[] = []; + for (const line of description.split("\n")) { + const m = line.trim().match(/^(?:(\d{1,2}):)?(\d{1,2}):(\d{2})\s+(.+)$/); + if (m) { + const h = m[1] ? parseInt(m[1]) : 0; + chapters.push({ title: m[4].trim(), start: h * 3600 + parseInt(m[2]) * 60 + parseInt(m[3]) }); + } + } + return chapters.length >= 2 ? 
chapters : [];
+}
+
+function getBestThumbnailUrl(videoId: string, data: any): string {
+  const thumbnails = data?.videoDetails?.thumbnail?.thumbnails ||
+    data?.microformat?.playerMicroformatRenderer?.thumbnail?.thumbnails || [];
+  if (thumbnails.length) {
+    const sorted = [...thumbnails].sort((a: any, b: any) => (b.width || 0) - (a.width || 0));
+    return sorted[0].url;
+  }
+  return `https://i.ytimg.com/vi/${videoId}/maxresdefault.jpg`;
+}
+
+function buildVideoMeta(data: any, videoId: string, langInfo: { code: string; name: string; isGenerated: boolean }, chapters: Chapter[]): VideoMeta {
+  const vd = data?.videoDetails || {};
+  const mf = data?.microformat?.playerMicroformatRenderer || {};
+  return {
+    videoId,
+    title: vd.title || mf.title?.simpleText || "",
+    channel: vd.author || mf.ownerChannelName || "",
+    channelId: vd.channelId || mf.externalChannelId || "",
+    description: vd.shortDescription || mf.description?.simpleText || "",
+    duration: parseInt(vd.lengthSeconds || "0"),
+    publishDate: mf.publishDate || mf.uploadDate || "",
+    url: `https://www.youtube.com/watch?v=${videoId}`,
+    coverImage: "",
+    thumbnailUrl: getBestThumbnailUrl(videoId, data),
+    language: langInfo,
+    chapters,
+  };
+}
+
+async function downloadCoverImage(url: string, outputPath: string): Promise<boolean> {
+  const urls = [url];
+  if (url.includes("maxresdefault")) urls.push(url.replace("maxresdefault", "hqdefault"));
+  for (const u of urls) {
+    try {
+      const r = await fetch(u);
+      if (r.ok) {
+        writeFileSync(outputPath, Buffer.from(await r.arrayBuffer()));
+        return true;
+      }
+    } catch {}
+  }
+  return false;
+}
+
+function parseSrt(srt: string): Snippet[] {
+  const blocks = srt.trim().split(/\n\n+/);
+  const snippets: Snippet[] = [];
+  for (const block of blocks) {
+    const lines = block.split("\n");
+    if (lines.length < 3) continue;
+    const m = lines[1].match(/(\d{2}):(\d{2}):(\d{2}),(\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2}),(\d{3})/);
+    if (!m) continue;
+    const start = parseInt(m[1]) * 3600 + 
parseInt(m[2]) * 60 + parseInt(m[3]) + parseInt(m[4]) / 1000; + const end = parseInt(m[5]) * 3600 + parseInt(m[6]) * 60 + parseInt(m[7]) + parseInt(m[8]) / 1000; + snippets.push({ text: lines.slice(2).join(" "), start, duration: end - start }); + } + return snippets; +} + +// --- Timestamp formatting --- + +function ts(t: number): string { + const h = Math.floor(t / 3600); + const m = Math.floor((t % 3600) / 60); + const s = Math.floor(t % 60); + return `${String(h).padStart(2, "0")}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}`; +} + +function tsMs(t: number, sep: string): string { + const h = Math.floor(t / 3600); + const m = Math.floor((t % 3600) / 60); + const s = Math.floor(t % 60); + const ms = Math.round((t - Math.floor(t)) * 1000); + return `${String(h).padStart(2, "0")}:${String(m).padStart(2, "0")}:${String(s).padStart(2, "0")}${sep}${String(ms).padStart(3, "0")}`; +} + +// --- Paragraph grouping --- + +interface Paragraph { + text: string; + start: number; + end: number; +} + +function groupIntoParagraphs(snippets: Snippet[]): Paragraph[] { + if (!snippets.length) return []; + const paras: Paragraph[] = []; + let buf: Snippet[] = []; + for (let i = 0; i < snippets.length; i++) { + buf.push(snippets[i]); + const last = i === snippets.length - 1; + const gap = !last && snippets[i + 1].start - (snippets[i].start + snippets[i].duration) > 1.5; + if (last || gap || buf.length >= 8) { + const lastS = buf[buf.length - 1]; + paras.push({ text: buf.map(s => s.text).join(" "), start: buf[0].start, end: lastS.start + lastS.duration }); + buf = []; + } + } + return paras; +} + +// --- Format functions --- + +function formatSrt(snippets: Snippet[]): string { + return snippets + .map((s, i) => { + const end = i < snippets.length - 1 && snippets[i + 1].start < s.start + s.duration + ? 
snippets[i + 1].start + : s.start + s.duration; + return `${i + 1}\n${tsMs(s.start, ",")} --> ${tsMs(end, ",")}\n${s.text}`; + }) + .join("\n\n") + "\n"; +} + +function yamlEscape(s: string): string { + if (/[:"'{}\[\]#&*!|>%@`\n]/.test(s) || s.trim() !== s) return `"${s.replace(/\\/g, "\\\\").replace(/"/g, '\\"')}"`; + return s; +} + +function formatMarkdown(snippets: Snippet[], meta: VideoMeta, opts: { timestamps: boolean; chapters: boolean; speakers: boolean }, rawSrt?: string): string { + let md = "---\n"; + md += `title: ${yamlEscape(meta.title)}\n`; + md += `channel: ${yamlEscape(meta.channel)}\n`; + if (meta.publishDate) md += `date: ${meta.publishDate}\n`; + md += `url: ${yamlEscape(meta.url)}\n`; + if (meta.coverImage) md += `cover: ${meta.coverImage}\n`; + if (meta.language) md += `language: ${meta.language.code}\n`; + md += "---\n\n"; + + if (opts.speakers) { + if (meta.description) md += "# Description\n\n" + meta.description.trim() + "\n\n"; + if (meta.chapters.length) { + md += "# Chapters\n\n"; + for (const ch of meta.chapters) md += `* [${ts(ch.start)}] ${ch.title}\n`; + md += "\n"; + } + md += "# Transcript\n\n"; + md += rawSrt || formatSrt(snippets); + return md; + } + + const chapters = opts.chapters ? meta.chapters : []; + + if (chapters.length) { + md += "## Table of Contents\n\n"; + for (const ch of chapters) md += opts.timestamps ? `* [${ts(ch.start)}] ${ch.title}\n` : `* ${ch.title}\n`; + md += "\n\n"; + for (let i = 0; i < chapters.length; i++) { + const nextStart = i < chapters.length - 1 ? chapters[i + 1].start : Infinity; + const segs = snippets.filter(s => s.start >= chapters[i].start && s.start < nextStart); + const paras = groupIntoParagraphs(segs); + md += opts.timestamps + ? `## [${ts(chapters[i].start)}] ${chapters[i].title}\n\n` + : `## ${chapters[i].title}\n\n`; + for (const p of paras) md += opts.timestamps ? 
`${p.text} [${ts(p.start)} → ${ts(p.end)}]\n\n` : `${p.text}\n\n`;
+      md += "\n";
+    }
+  } else {
+    const paras = groupIntoParagraphs(snippets);
+    for (const p of paras) md += opts.timestamps ? `${p.text} [${ts(p.start)} → ${ts(p.end)}]\n\n` : `${p.text}\n\n`;
+  }
+
+  return md.trimEnd() + "\n";
+}
+
+function formatListOutput(videoId: string, title: string, transcripts: TranscriptInfo[]): string {
+  const manual = transcripts.filter((t) => !t.isGenerated);
+  const generated = transcripts.filter((t) => t.isGenerated);
+  const tlLangs = transcripts.find((t) => t.translationLanguages.length > 0)?.translationLanguages || [];
+  const fmtList = (list: TranscriptInfo[]) =>
+    list.length ? list.map((t) => `  - ${t.languageCode} ("${t.language}")${t.isTranslatable ? " [TRANSLATABLE]" : ""}`).join("\n") : "None";
+  const fmtTl = tlLangs.length
+    ? tlLangs.map((t) => `  - ${t.languageCode} ("${t.language}")`).join("\n")
+    : "None";
+  return `Transcripts for ${videoId}${title ? ` (${title})` : ""}:\n\n(MANUALLY CREATED)\n${fmtList(manual)}\n\n(GENERATED)\n${fmtList(generated)}\n\n(TRANSLATION LANGUAGES)\n${fmtTl}`;
+}
+
+// --- File helpers ---
+
+function ensureDir(p: string) {
+  const dir = dirname(p);
+  if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
+}
+
+function resolveBaseDir(outputDir: string): string {
+  return resolve(outputDir || "youtube-transcript");
+}
+
+function loadIndex(baseDir: string): Record<string, string> {
+  try { return JSON.parse(readFileSync(join(baseDir, ".index.json"), "utf-8")); } catch { return {}; }
+}
+
+function saveIndex(baseDir: string, index: Record<string, string>) {
+  const p = join(baseDir, ".index.json");
+  ensureDir(p);
+  writeFileSync(p, JSON.stringify(index, null, 2));
+}
+
+function lookupVideoDir(videoId: string, baseDir: string): string | null {
+  const rel = loadIndex(baseDir)[videoId];
+  if (rel) {
+    const dir = resolve(baseDir, rel);
+    if (existsSync(dir)) return dir;
+  }
+  return null;
+}
+
+function registerVideoDir(videoId: string, channelSlug: 
string, titleSlug: string, baseDir: string): string { + const rel = join(channelSlug, titleSlug); + const index = loadIndex(baseDir); + index[videoId] = rel; + saveIndex(baseDir, index); + return resolve(baseDir, rel); +} + +function hasCachedData(videoDir: string): boolean { + return existsSync(join(videoDir, "meta.json")) && existsSync(join(videoDir, "transcript-raw.srt")); +} + +function loadMeta(videoDir: string): VideoMeta { + return JSON.parse(readFileSync(join(videoDir, "meta.json"), "utf-8")); +} + +function loadSnippets(videoDir: string): Snippet[] { + return parseSrt(readFileSync(join(videoDir, "transcript-raw.srt"), "utf-8")); +} + +function loadRawSrt(videoDir: string): string { + return readFileSync(join(videoDir, "transcript-raw.srt"), "utf-8"); +} + +// --- Main processing --- + +async function fetchAndCache(videoId: string, baseDir: string, opts: Options): Promise<{ meta: VideoMeta; snippets: Snippet[]; videoDir: string }> { + const html = await fetchHtml(videoId); + const apiKey = extractApiKey(html, videoId); + const data = await fetchInnertubeData(videoId, apiKey); + const captionsJson = extractCaptionsJson(data, videoId); + const transcripts = buildTranscriptList(captionsJson); + const info = findTranscript(transcripts, opts.languages, opts.excludeGenerated, opts.excludeManual); + const result = await fetchTranscriptSnippets(info, opts.translate || undefined); + const description = data?.videoDetails?.shortDescription || ""; + const chapters = parseChapters(description); + const langInfo = { code: result.languageCode, name: result.language, isGenerated: info.isGenerated }; + const meta = buildVideoMeta(data, videoId, langInfo, chapters); + + // Compute directory: {baseDir}/{channel-slug}/{title-slug}/ + const videoDir = registerVideoDir(videoId, slugify(meta.channel), slugify(meta.title), baseDir); + + // Save raw data as SRT (pre-computed timestamps, token-efficient for LLM) + ensureDir(join(videoDir, "meta.json")); + 
writeFileSync(join(videoDir, "transcript-raw.srt"), formatSrt(result.snippets));
+
+  // Download cover image
+  const imgPath = join(videoDir, "imgs", "cover.jpg");
+  ensureDir(imgPath);
+  const downloaded = await downloadCoverImage(meta.thumbnailUrl, imgPath);
+  meta.coverImage = downloaded ? "imgs/cover.jpg" : "";
+
+  // Save meta (after cover image result is known)
+  writeFileSync(join(videoDir, "meta.json"), JSON.stringify(meta, null, 2));
+
+  return { meta, snippets: result.snippets, videoDir };
+}
+
+async function processVideo(videoId: string, opts: Options): Promise<VideoResult> {
+  const baseDir = resolveBaseDir(opts.outputDir);
+
+  // --list: always fetch fresh
+  if (opts.list) {
+    const html = await fetchHtml(videoId);
+    const apiKey = extractApiKey(html, videoId);
+    const data = await fetchInnertubeData(videoId, apiKey);
+    const title = data?.videoDetails?.title || "";
+    const captionsJson = extractCaptionsJson(data, videoId);
+    const transcripts = buildTranscriptList(captionsJson);
+    return { videoId, title, content: formatListOutput(videoId, title, transcripts) };
+  }
+
+  // Fetch phase: use cache via index lookup
+  let videoDir = lookupVideoDir(videoId, baseDir);
+  let meta: VideoMeta;
+  let snippets: Snippet[];
+  let rawSrt: string | undefined;
+  let needsFetch = opts.refresh || !videoDir || !hasCachedData(videoDir);
+
+  if (!needsFetch && videoDir) {
+    meta = loadMeta(videoDir);
+    snippets = loadSnippets(videoDir);
+    rawSrt = loadRawSrt(videoDir);
+    const wantLangs = opts.translate ? 
[opts.translate] : opts.languages; + if (!wantLangs.includes(meta.language.code)) needsFetch = true; + } + + if (needsFetch) { + const result = await fetchAndCache(videoId, baseDir, opts); + meta = result.meta; + snippets = result.snippets; + videoDir = result.videoDir; + rawSrt = loadRawSrt(videoDir); + } else { + meta = meta!; + snippets = snippets!; + } + + // Format phase + let content: string; + let ext: string; + + if (opts.format === "srt") { + content = rawSrt || formatSrt(snippets); + ext = "srt"; + } else { + content = formatMarkdown(snippets, meta, { + timestamps: opts.timestamps, + chapters: opts.chapters, + speakers: opts.speakers, + }, rawSrt); + ext = "md"; + } + + const filePath = opts.output ? resolve(opts.output) : join(videoDir!, `transcript.${ext}`); + ensureDir(filePath); + writeFileSync(filePath, content); + + return { videoId, title: meta.title, filePath }; +} + +// --- CLI --- + +function printHelp() { + console.log(`Usage: bun main.ts [options] + +Options: + --languages Language codes, comma-separated (default: en) + --format Output format: text, srt (default: text) + --translate Translate to language code + --list List available transcripts + --timestamps Include timestamps (default: on) + --no-timestamps Disable timestamps + --chapters Chapter segmentation from description + --speakers Raw transcript with metadata for speaker identification + --exclude-generated Skip auto-generated transcripts + --exclude-manually-created Skip manually created transcripts + --refresh Force re-fetch (ignore cache) + -o, --output Save to specific file path + --output-dir Base output directory (default: youtube-transcript) + -h, --help Show help`); +} + +function parseArgs(argv: string[]): Options | null { + const opts: Options = { + videoIds: [], + languages: ["en"], + format: "text", + translate: "", + list: false, + excludeGenerated: false, + excludeManual: false, + output: "", + outputDir: "", + timestamps: true, + chapters: false, + speakers: false, + 
refresh: false, + }; + + for (let i = 0; i < argv.length; i++) { + const arg = argv[i]; + if (arg === "-h" || arg === "--help") { + printHelp(); + process.exit(0); + } else if (arg === "--languages") { + const v = argv[++i]; + if (v) opts.languages = v.split(",").map((s) => s.trim()); + } else if (arg === "--format") { + const v = argv[++i]?.toLowerCase(); + if (v === "text" || v === "srt") opts.format = v; + else { + console.error(`Invalid format: ${v}. Use: text, srt`); + return null; + } + } else if (arg === "--translate") { + opts.translate = argv[++i] || ""; + } else if (arg === "--list" || arg === "--list-transcripts") { + opts.list = true; + } else if (arg === "--timestamps" || arg === "-t") { + opts.timestamps = true; + } else if (arg === "--no-timestamps") { + opts.timestamps = false; + } else if (arg === "--chapters") { + opts.chapters = true; + } else if (arg === "--speakers") { + opts.speakers = true; + } else if (arg === "--exclude-generated") { + opts.excludeGenerated = true; + } else if (arg === "--exclude-manually-created") { + opts.excludeManual = true; + } else if (arg === "--refresh") { + opts.refresh = true; + } else if (arg === "-o" || arg === "--output") { + opts.output = argv[++i] || ""; + } else if (arg === "--output-dir") { + opts.outputDir = argv[++i] || ""; + } else if (!arg.startsWith("-")) { + opts.videoIds.push(extractVideoId(arg)); + } + } + + if (opts.videoIds.length === 0) { + console.error("Error: At least one video URL or ID required"); + printHelp(); + return null; + } + return opts; +} + +async function main() { + const opts = parseArgs(process.argv.slice(2)); + if (!opts) process.exit(1); + + if (opts.excludeGenerated && opts.excludeManual) { + console.error("Error: Cannot exclude both generated and manually created transcripts"); + process.exit(1); + } + + for (const videoId of opts.videoIds) { + try { + const r = await processVideo(videoId, opts); + if (r.error) console.error(`Error (${r.videoId}): ${r.error}`); + else if 
(r.filePath) console.log(r.filePath); + else if (r.content) console.log(r.content); + } catch (e) { + console.error(`Error (${videoId}): ${(e as Error).message}`); + } + } +} + +main();