// Source: JimLiu-baoyu-skills/skills/baoyu-youtube-transcript/scripts/transcript.ts
// (350 lines, 13 KiB, TypeScript)

import { htmlUnescape, makeError, stripTags } from "./shared.ts";
import type { Sentence, Snippet, TranscriptInfo, VideoMeta } from "./types.ts";
/**
 * A run of consecutive sentences merged into one display paragraph,
 * as produced by `groupSentenceParas`.
 */
interface Paragraph {
/** Merged text of every sentence in the paragraph. */
text: string;
/** Start of the first sentence, in whole seconds (parsed back from its `HH:MM:SS` string). */
start: number;
/** End of the last sentence, in whole seconds (parsed back from its `HH:MM:SS` string). */
end: number;
}
/**
 * Matches one sentence-terminating character.
 *
 * Covered characters:
 *   `.` `?` `!`            ASCII terminators
 *   U+2026                 horizontal ellipsis
 *   U+3002                 ideographic full stop
 *   U+FF1F U+FF01 U+FF0E   full-width question mark, exclamation mark, full stop
 *   U+2048 U+2047 U+203C   question-exclamation, double question, double exclamation
 *   U+203D                 interrobang
 *
 * Non-ASCII members are written as escapes so they survive encoding
 * normalisation; the previous literal class had lost the full-width forms
 * (they had collapsed into duplicate ASCII `?`, `!`, `.`), so CJK sentences
 * ending in a full-width mark were never split.
 *
 * The pattern intentionally carries no `g`/`y` flag, so `.test()` keeps no
 * `lastIndex` state between calls.
 */
const SENTENCE_END_RE = /[.?!\u2026\u3002\uFF1F\uFF01\uFF0E\u2048\u2047\u203C\u203D]/;
/**
 * Extracts caption snippets from an XML transcript made of
 * `<text start="…" dur="…">…</text>` elements.
 *
 * Only elements whose first attribute is `start` are recognised; `dur` is
 * optional and treated as 0 when absent. Elements with an empty body are
 * skipped. Inner markup is removed with `stripTags` and entities decoded with
 * `htmlUnescape`.
 *
 * @param xml Raw XML payload.
 * @returns Snippets in document order, with `start`/`duration` parsed as floats.
 */
export function parseTranscriptXml(xml: string): Snippet[] {
  const textElement = /<text\s+start="([^"]*)"(?:\s+dur="([^"]*)")?[^>]*>([\s\S]*?)<\/text>/g;
  const collected: Snippet[] = [];

  for (const [, startAttr, durAttr, body] of xml.matchAll(textElement)) {
    if (!body) continue;
    const text = htmlUnescape(stripTags(body));
    const start = parseFloat(startAttr);
    // `dur` is an optional capture group, so it may be undefined here.
    const duration = parseFloat(durAttr || "0");
    collected.push({ text, start, duration });
  }

  return collected;
}
/**
 * Extracts caption snippets from a JSON transcript whose top-level `events`
 * array holds objects with `segs[].utf8`, `tStartMs` and `dDurationMs`.
 *
 * For each event the segment texts are cleaned (newline runs become one
 * space, trimmed, entity-decoded), empty segments are dropped and the rest
 * are joined with `mergeTexts`. Events that end up with no text are skipped.
 * Millisecond fields are converted to seconds; missing ones count as 0.
 *
 * @param text JSON document as a string.
 * @returns Snippets in event order.
 * @throws SyntaxError when `text` is not valid JSON (from `JSON.parse`).
 */
export function parseTranscriptJson3(text: string): Snippet[] {
  const parsed = JSON.parse(text);
  const collected: Snippet[] = [];
  if (!Array.isArray(parsed?.events)) return collected;

  for (const event of parsed.events) {
    const pieces: string[] = [];
    if (Array.isArray(event?.segs)) {
      for (const seg of event.segs) {
        const cleaned = htmlUnescape(String(seg?.utf8 || "").replace(/\n+/g, " ").trim());
        if (cleaned) pieces.push(cleaned);
      }
    }

    const line = mergeTexts(pieces).trim();
    if (!line) continue;

    const startMs = Number(event?.tStartMs || 0);
    const durationMs = Number(event?.dDurationMs || 0);
    collected.push({ text: line, start: startMs / 1000, duration: durationMs / 1000 });
  }

  return collected;
}
/**
 * Parses SubRip (SRT) subtitle text into snippets.
 *
 * Each blank-line-separated block must contain at least three lines: an
 * index line, a `HH:MM:SS,mmm --> HH:MM:SS,mmm` timing line, and one or more
 * text lines (joined with a single space). Blocks that are too short or
 * whose second line is not a timing line are skipped.
 *
 * Line endings are normalised to `\n` first. Without this, CRLF input (the
 * common case for SRT files) never matches the blank-line separator, so the
 * whole file collapsed into a single cue whose text swallowed every later
 * index and timing line.
 *
 * @param srt Raw SRT document.
 * @returns Snippets with `start` and `duration` in seconds.
 */
function parseSrt(srt: string): Snippet[] {
  const timingRe = /(\d{2}):(\d{2}):(\d{2}),(\d{3})\s*-->\s*(\d{2}):(\d{2}):(\d{2}),(\d{3})/;
  const toSeconds = (h: string, m: string, s: string, ms: string): number =>
    parseInt(h, 10) * 3600 + parseInt(m, 10) * 60 + parseInt(s, 10) + parseInt(ms, 10) / 1000;

  // Accept CRLF and bare-CR line endings as well as LF.
  const blocks = srt.replace(/\r\n?/g, "\n").trim().split(/\n\n+/);
  const snippets: Snippet[] = [];

  for (const block of blocks) {
    const lines = block.split("\n");
    if (lines.length < 3) continue;

    const match = lines[1].match(timingRe);
    if (!match) continue;

    const start = toSeconds(match[1], match[2], match[3], match[4]);
    const end = toSeconds(match[5], match[6], match[7], match[8]);
    snippets.push({ text: lines.slice(2).join(" "), start, duration: end - start });
  }

  return snippets;
}
/**
 * Parses WebVTT subtitle text into snippets.
 *
 * The `WEBVTT` signature is removed and the remainder is split into
 * blank-line-separated blocks. Within each block the first line containing
 * `-->` is the timing line (`[HH:]MM:SS.mmm --> [HH:]MM:SS.mmm`, hours
 * optional); every line after it is cue text, joined with spaces, whitespace
 * collapsed, tags stripped and entities decoded. Blocks without a timing
 * line, with an unparsable timing line, or with empty text are skipped.
 *
 * Line endings are normalised to `\n` first. Without this, CRLF input never
 * matches the blank-line separator, so only the first cue was found and its
 * text swallowed every later timing line.
 *
 * @param vtt Raw WebVTT document.
 * @returns Snippets with `start` and `duration` in seconds.
 */
export function parseWebVtt(vtt: string): Snippet[] {
  const timingRe =
    /(?:(\d{2}):)?(\d{2}):(\d{2})\.(\d{3})\s*-->\s*(?:(\d{2}):)?(\d{2}):(\d{2})\.(\d{3})/;
  // The hours group is optional in the pattern, so it may be undefined.
  const toSeconds = (h: string | undefined, m: string, s: string, ms: string): number =>
    (h ? parseInt(h, 10) : 0) * 3600 + parseInt(m, 10) * 60 + parseInt(s, 10) + parseInt(ms, 10) / 1000;

  const blocks = vtt
    .replace(/\r\n?/g, "\n") // accept CRLF and bare-CR line endings
    .replace(/^WEBVTT\s*/m, "")
    .trim()
    .split(/\n\n+/);

  const snippets: Snippet[] = [];
  for (const block of blocks) {
    const lines = block.split("\n").map((line) => line.trim()).filter(Boolean);
    const tsLine = lines.find((line) => line.includes("-->"));
    if (!tsLine) continue;

    const match = tsLine.match(timingRe);
    if (!match) continue;

    const start = toSeconds(match[1], match[2], match[3], match[4]);
    const end = toSeconds(match[5], match[6], match[7], match[8]);

    const rawText = lines.slice(lines.indexOf(tsLine) + 1).join(" ").replace(/\s+/g, " ").trim();
    const text = htmlUnescape(stripTags(rawText));
    if (!text) continue;

    snippets.push({ text, start, duration: end - start });
  }

  return snippets;
}
/**
 * Detects the format of a transcript payload and hands it to the matching parser.
 *
 * Detection order (first match wins):
 *  1. JSON: the URL contains `fmt=json3`, or the payload starts with `{`.
 *  2. WebVTT: the payload starts with `WEBVTT`.
 *  3. SRT: the payload starts with a numeric index line followed by an
 *     `HH:MM:SS,mmm -->` timing line.
 *  4. Anything else is parsed as XML.
 *
 * Leading whitespace is ignored for detection only; the parser receives the
 * untouched payload.
 *
 * @param payload Raw response body.
 * @param url URL the payload was fetched from.
 * @returns Parsed snippets.
 */
export function parseTranscriptPayload(payload: string, url: string): Snippet[] {
  const head = payload.trimStart();

  const isJson = url.includes("fmt=json3") || head.startsWith("{");
  if (isJson) {
    return parseTranscriptJson3(payload);
  }

  if (head.startsWith("WEBVTT")) {
    return parseWebVtt(payload);
  }

  const srtOpening = /^\d+\s*\n\d{2}:\d{2}:\d{2},\d{3}\s*-->/;
  if (srtOpening.test(head)) {
    return parseSrt(payload);
  }

  return parseTranscriptXml(payload);
}
/** Inclusive UTF-16 code-unit ranges that `isCJK` accepts. */
const CJK_CODE_UNIT_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x4e00, 0x9fff],
  [0x3040, 0x309f],
  [0x30a0, 0x30ff],
  [0xac00, 0xd7af],
  [0x3400, 0x4dbf],
  [0xf900, 0xfaff],
];

/**
 * Reports whether the first UTF-16 code unit of `ch` falls inside one of the
 * ranges in `CJK_CODE_UNIT_RANGES`.
 *
 * Only the first code unit is inspected, so an empty string yields `false`.
 *
 * @param ch String whose first code unit is tested.
 */
function isCJK(ch: string): boolean {
  const unit = ch.charCodeAt(0);
  return CJK_CODE_UNIT_RANGES.some(([low, high]) => unit >= low && unit <= high);
}
/**
 * Splits one snippet into pieces at sentence-ending punctuation inside its text.
 *
 * A run of consecutive terminators (e.g. `?!`) counts as one boundary placed
 * after the last character of the run. A terminator run that ends the text is
 * not a boundary, since there is nothing after it to split off. Each piece is
 * trimmed and empty pieces are dropped. Timings are interpolated linearly by
 * character offset across the snippet's duration.
 *
 * @param snippet Snippet to split.
 * @returns An empty array for empty text; a single untrimmed piece spanning
 *   the whole snippet when there is no interior boundary; otherwise the pieces
 *   in order.
 */
function splitSnippetAtPunctuation(snippet: Snippet): { text: string; start: number; end: number }[] {
  type Piece = { text: string; start: number; end: number };

  const { text, start, duration } = snippet;
  const length = text.length;
  if (length === 0) return [];

  const end = start + duration;
  // Time at a character offset, assuming a uniform rate over the snippet.
  const timeAt = (offset: number): number => start + (offset / length) * duration;

  // Index of the last character of every interior terminator run.
  const cuts: number[] = [];
  let cursor = 0;
  while (cursor < length) {
    if (!SENTENCE_END_RE.test(text[cursor])) {
      cursor++;
      continue;
    }
    let runEnd = cursor;
    while (runEnd + 1 < length && SENTENCE_END_RE.test(text[runEnd + 1])) runEnd++;
    if (runEnd < length - 1) cuts.push(runEnd);
    cursor = runEnd + 1;
  }

  if (cuts.length === 0) return [{ text, start, end }];

  const pieces: Piece[] = [];
  let from = 0;
  for (const cut of cuts) {
    const to = cut + 1;
    const chunk = text.slice(from, to).trim();
    if (chunk) pieces.push({ text: chunk, start: timeAt(from), end: timeAt(to) });
    from = to;
  }

  const tail = text.slice(from).trim();
  if (tail) pieces.push({ text: tail, start: timeAt(from), end });

  return pieces;
}
/**
 * Joins text fragments into one string.
 *
 * Adjacent fragments are concatenated directly when either side of the seam
 * is a CJK character (per `isCJK`); otherwise they are joined with exactly one
 * space, trimming whitespace on both sides of the seam. Runs of two or more
 * spaces in the final result are collapsed to one.
 *
 * Empty fragments are ignored wherever they appear. Previously an empty
 * first fragment followed by a non-empty one made the "last character"
 * lookup return `undefined`, and `isCJK(undefined)` threw a TypeError.
 *
 * @param texts Fragments in order.
 * @returns The merged text, or `""` when there is nothing to merge.
 */
function mergeTexts(texts: string[]): string {
  let result = "";

  for (const next of texts) {
    if (!next) continue;

    // First non-empty fragment seeds the result; there is no seam yet.
    if (!result) {
      result = next;
      continue;
    }

    const lastChar = result[result.length - 1];
    const firstChar = next[0];
    if (isCJK(lastChar) || isCJK(firstChar)) {
      result += next;
    } else {
      result = result.trimEnd() + " " + next.trimStart();
    }
  }

  return result.replace(/ {2,}/g, " ");
}
/**
 * Formats a time in seconds as `HH:MM:SS`, discarding any fractional second.
 *
 * Each field is zero-padded to at least two digits; hours are not wrapped,
 * so values of 100 hours or more produce a wider hours field.
 *
 * @param time Time in seconds.
 */
export function ts(time: number): string {
  const twoDigits = (value: number): string => String(value).padStart(2, "0");

  const hours = Math.floor(time / 3600);
  const minutes = Math.floor((time % 3600) / 60);
  const seconds = Math.floor(time % 60);

  return [hours, minutes, seconds].map((field) => twoDigits(field)).join(":");
}
/**
 * Formats a time in seconds as `HH:MM:SS<sep>mmm`.
 *
 * The time is rounded to the nearest millisecond once, and every field is
 * derived from that integer. Rounding the fractional part on its own (as the
 * previous version did) could yield 1000 ms without carrying into the seconds
 * field, e.g. 1.9996 s rendered as `00:00:01,1000` instead of `00:00:02,000`.
 *
 * @param time Time in seconds.
 * @param sep Separator between seconds and milliseconds (`,` for SRT).
 */
function tsMs(time: number, sep: string): string {
  const totalMs = Math.round(time * 1000);

  const h = Math.floor(totalMs / 3600000);
  const m = Math.floor((totalMs % 3600000) / 60000);
  const s = Math.floor((totalMs % 60000) / 1000);
  const ms = totalMs % 1000;

  const pad = (value: number, width: number): string => String(value).padStart(width, "0");
  return `${pad(h, 2)}:${pad(m, 2)}:${pad(s, 2)}${sep}${pad(ms, 3)}`;
}
/**
 * Converts an `HH:MM:SS` string to a number of seconds.
 *
 * The string is split on `:` and the first three fields are read as hours,
 * minutes and seconds. A string with fewer than three fields yields `NaN`.
 *
 * @param time Colon-separated timestamp, as produced by `ts`.
 */
function parseTs(time: string): number {
  const fields = time.split(":").map(Number);
  const hours = fields[0];
  const minutes = fields[1];
  const seconds = fields[2];
  return hours * 3600 + minutes * 60 + seconds;
}
/**
 * Regroups caption snippets into sentences.
 *
 * Every snippet is first split at interior sentence punctuation. The pieces
 * are then accumulated until one ends with a sentence terminator, at which
 * point the accumulated pieces are merged into one sentence. Pieces left over
 * at the end form a final sentence even without a terminator.
 *
 * @param snippets Snippets in playback order.
 * @returns Sentences whose `start`/`end` are `HH:MM:SS` strings taken from the
 *   first and last piece of each sentence.
 */
export function segmentIntoSentences(snippets: Snippet[]): Sentence[] {
  type Piece = { text: string; start: number; end: number };

  const pieces: Piece[] = snippets.flatMap((snippet) => splitSnippetAtPunctuation(snippet));
  const result: Sentence[] = [];
  let pending: Piece[] = [];

  // Turns the pending pieces into one sentence; does nothing when there are none.
  const flush = (): void => {
    if (pending.length === 0) return;
    const first = pending[0];
    const last = pending[pending.length - 1];
    result.push({
      text: mergeTexts(pending.map((piece) => piece.text)),
      start: ts(first.start),
      end: ts(last.end),
    });
    pending = [];
  };

  for (const piece of pieces) {
    pending.push(piece);
    const finalChar = piece.text[piece.text.length - 1];
    if (SENTENCE_END_RE.test(finalChar)) flush();
  }
  flush();

  return result;
}
/**
 * Groups sentences into paragraphs.
 *
 * A paragraph is closed after the current sentence when any of these holds:
 *  - it is the last sentence;
 *  - the next sentence starts more than 2 seconds after this one ends;
 *  - the paragraph already holds 5 sentences.
 *
 * Times are compared after parsing the sentences' `HH:MM:SS` strings, so
 * they have whole-second resolution.
 *
 * @param sentences Sentences in playback order.
 * @returns Paragraphs with merged text and start/end in seconds.
 */
function groupSentenceParas(sentences: Sentence[]): Paragraph[] {
  const result: Paragraph[] = [];
  let group: Sentence[] = [];

  sentences.forEach((sentence, index) => {
    group.push(sentence);

    const isFinal = index === sentences.length - 1;
    const pauseFollows =
      !isFinal && parseTs(sentences[index + 1].start) - parseTs(sentence.end) > 2;
    const groupIsFull = group.length >= 5;
    if (!(isFinal || pauseFollows || groupIsFull)) return;

    const head = group[0];
    const tail = group[group.length - 1];
    result.push({
      text: mergeTexts(group.map((entry) => entry.text)),
      start: parseTs(head.start),
      end: parseTs(tail.end),
    });
    group = [];
  });

  return result;
}
/**
 * Renders snippets as SubRip (SRT) text.
 *
 * Cues are numbered from 1. A cue normally ends at `start + duration`, but
 * when the following snippet starts before that, the cue is cut short at the
 * following snippet's start so cues do not overlap. Cues are separated by a
 * blank line and the output ends with a single newline.
 *
 * @param snippets Snippets in playback order.
 * @returns SRT document (just `"\n"` when there are no snippets).
 */
export function formatSrt(snippets: Snippet[]): string {
  const cues: string[] = [];

  for (let i = 0; i < snippets.length; i++) {
    const current = snippets[i];
    const ownEnd = current.start + current.duration;

    let cueEnd = ownEnd;
    if (i < snippets.length - 1 && snippets[i + 1].start < ownEnd) {
      cueEnd = snippets[i + 1].start;
    }

    const timing = `${tsMs(current.start, ",")} --> ${tsMs(cueEnd, ",")}`;
    cues.push(`${i + 1}\n${timing}\n${current.text}`);
  }

  return cues.join("\n\n") + "\n";
}
/**
 * Returns `value` in a form that is safe as a single-line YAML scalar.
 *
 * The value is wrapped in double quotes when it contains a character with
 * special meaning in YAML (`: " ' { } [ ] # & * ! | > % @` or a backtick), a
 * line break, or leading/trailing whitespace. Inside the quotes, backslashes
 * and double quotes are escaped, and line breaks are written as `\n` / `\r`.
 * Previously a newline triggered quoting but was emitted raw, which spread
 * the scalar over several physical lines of the front matter, and carriage
 * returns were not detected at all. Other values are returned unchanged.
 *
 * @param value Raw scalar text.
 */
function yamlEscape(value: string): string {
  const needsQuoting = /[:"'{}\[\]#&*!|>%@`\r\n]/.test(value) || value.trim() !== value;
  if (!needsQuoting) return value;

  const escaped = value
    .replace(/\\/g, "\\\\") // must run first so later escapes are not doubled
    .replace(/"/g, '\\"')
    .replace(/\r/g, "\\r")
    .replace(/\n/g, "\\n");
  return `"${escaped}"`;
}
/**
 * Derives a short summary from a video description.
 *
 * Takes the first paragraph (text up to the first blank line), drops lines
 * that are blank or start with a URL, `#`, `@`, or a `digits:digits`
 * timestamp, joins the remaining lines with spaces and truncates the result
 * to 300 characters.
 *
 * @param description Full description text; may be empty.
 * @returns The summary, or `""` when nothing usable remains.
 */
function extractSummary(description: string): string {
  if (!description) return "";

  const noisePrefix = /^\s*(https?:\/\/|#|@|\d+:\d+)/;
  const [leadParagraph] = description.split(/\n\s*\n/);

  const kept: string[] = [];
  for (const line of leadParagraph.trim().split("\n")) {
    if (noisePrefix.test(line)) continue;
    if (!line.trim()) continue;
    kept.push(line);
  }

  return kept.join(" ").slice(0, 300).trim();
}
/**
 * Renders a transcript as a Markdown document with YAML front matter.
 *
 * Front matter always carries `title`, `channel` and `url`; `date`, `cover`,
 * `description` (a summary derived from the video description) and
 * `language` are added when available.
 *
 * Body layout depends on `opts`:
 *  - `speakers`: title, summary, full description, chapter list, then the raw
 *    snippets rendered as SRT under `# Transcript`. `sentences` is not used
 *    and the document is returned without trailing-newline normalisation.
 *  - otherwise, with `chapters` enabled and chapters present: a table of
 *    contents, the optional cover image, then one section per chapter holding
 *    the paragraphs whose sentences start inside that chapter. Sentences that
 *    start before the first chapter are not emitted.
 *  - otherwise: all sentences grouped into paragraphs.
 *
 * With `timestamps`, chapter entries are prefixed with `[HH:MM:SS]` and each
 * paragraph is suffixed with `[HH:MM:SS → HH:MM:SS]`. The start and end used
 * to be concatenated with no separator (`[00:00:0100:00:05]`), which made the
 * range unreadable.
 *
 * @param sentences Sentences in playback order.
 * @param meta Video metadata.
 * @param opts Rendering switches.
 * @param snippets Raw snippets; only read in `speakers` mode.
 * @returns The Markdown document.
 */
export function formatMarkdown(
  sentences: Sentence[],
  meta: VideoMeta,
  opts: { timestamps: boolean; chapters: boolean; speakers: boolean },
  snippets?: Snippet[]
): string {
  const summary = extractSummary(meta.description);

  let md = "---\n";
  md += `title: ${yamlEscape(meta.title)}\n`;
  md += `channel: ${yamlEscape(meta.channel)}\n`;
  if (meta.publishDate) md += `date: ${meta.publishDate}\n`;
  md += `url: ${yamlEscape(meta.url)}\n`;
  if (meta.coverImage) md += `cover: ${meta.coverImage}\n`;
  if (summary) md += `description: ${yamlEscape(summary)}\n`;
  if (meta.language) md += `language: ${meta.language.code}\n`;
  md += "---\n\n";

  if (opts.speakers) {
    md += `# ${meta.title}\n\n`;
    if (summary) md += `${summary}\n\n`;
    if (meta.description) md += `# Description\n\n${meta.description.trim()}\n\n`;
    if (meta.chapters.length) {
      md += "# Chapters\n\n";
      for (const chapter of meta.chapters) md += `* [${ts(chapter.start)}] ${chapter.title}\n`;
      md += "\n";
    }
    md += "# Transcript\n\n";
    md += snippets ? formatSrt(snippets) : "";
    return md;
  }

  // One paragraph plus its optional time range; \u2192 is the rightwards arrow.
  const renderParagraph = (paragraph: Paragraph): string =>
    opts.timestamps
      ? `${paragraph.text} [${ts(paragraph.start)} \u2192 ${ts(paragraph.end)}]\n\n`
      : `${paragraph.text}\n\n`;

  md += `# ${meta.title}\n\n`;
  if (summary) md += `${summary}\n\n`;

  const chapters = opts.chapters ? meta.chapters : [];
  if (chapters.length) {
    md += "## Table of Contents\n\n";
    for (const chapter of chapters) {
      md += opts.timestamps ? `* [${ts(chapter.start)}] ${chapter.title}\n` : `* ${chapter.title}\n`;
    }
    md += "\n";
    if (meta.coverImage) md += `\n![cover](${meta.coverImage})\n`;
    md += "\n";

    for (let i = 0; i < chapters.length; i++) {
      const chapter = chapters[i];
      // The last chapter runs to the end of the video.
      const nextStart = i < chapters.length - 1 ? chapters[i + 1].start : Infinity;
      const chapterSentences = sentences.filter((sentence) => {
        const startsAt = parseTs(sentence.start);
        return startsAt >= chapter.start && startsAt < nextStart;
      });

      md += opts.timestamps ? `## [${ts(chapter.start)}] ${chapter.title}\n\n` : `## ${chapter.title}\n\n`;
      for (const paragraph of groupSentenceParas(chapterSentences)) md += renderParagraph(paragraph);
      md += "\n";
    }
  } else {
    for (const paragraph of groupSentenceParas(sentences)) md += renderParagraph(paragraph);
  }

  return md.trimEnd() + "\n";
}
/**
 * Builds the plain-text listing of the transcripts available for a video.
 *
 * The listing has a header line followed by three sections: manually created
 * transcripts, generated transcripts, and translation languages. The
 * translation languages come from the first transcript that has any. An
 * empty section prints `None`.
 *
 * @param videoId Video identifier shown in the header.
 * @param title Video title; omitted from the header when empty.
 * @param transcripts All transcripts known for the video.
 * @returns The listing, without a trailing newline.
 */
export function formatListOutput(videoId: string, title: string, transcripts: TranscriptInfo[]): string {
  const manual: TranscriptInfo[] = [];
  const generated: TranscriptInfo[] = [];
  for (const transcript of transcripts) {
    (transcript.isGenerated ? generated : manual).push(transcript);
  }

  const withTranslations = transcripts.find((transcript) => transcript.translationLanguages.length > 0);
  const translationLanguages = withTranslations?.translationLanguages || [];

  const orNone = (rows: string[]): string => (rows.length ? rows.join("\n") : "None");
  const describe = (transcript: TranscriptInfo): string => {
    const flag = transcript.isTranslatable ? " [TRANSLATABLE]" : "";
    return ` - ${transcript.languageCode} ("${transcript.language}")${flag}`;
  };

  const header = `Transcripts for ${videoId}${title ? ` (${title})` : ""}:`;
  const translationRows = translationLanguages.map(
    (language) => ` - ${language.languageCode} ("${language.language}")`
  );

  return [
    header,
    `(MANUALLY CREATED)\n${orNone(manual.map(describe))}`,
    `(GENERATED)\n${orNone(generated.map(describe))}`,
    `(TRANSLATION LANGUAGES)\n${orNone(translationRows)}`,
  ].join("\n\n");
}
/**
 * Picks the transcript to use, honouring language preference order.
 *
 * Transcripts are first narrowed by the two exclusion flags; then the
 * requested language codes are tried in order and the first transcript whose
 * `languageCode` matches exactly is returned.
 *
 * @param transcripts All transcripts known for the video.
 * @param languages Language codes in order of preference.
 * @param excludeGenerated Drop generated transcripts before searching.
 * @param excludeManual Drop manually created transcripts before searching.
 * @returns The first matching transcript.
 * @throws The error built by `makeError` with code `NO_TRANSCRIPT` when no
 *   candidate matches; its message lists the requested and available languages.
 */
export function findTranscript(
  transcripts: TranscriptInfo[],
  languages: string[],
  excludeGenerated: boolean,
  excludeManual: boolean
): TranscriptInfo {
  const candidates = transcripts.filter((transcript) => {
    if (excludeGenerated && transcript.isGenerated) return false;
    if (excludeManual && !transcript.isGenerated) return false;
    return true;
  });

  for (const wanted of languages) {
    const hit = candidates.find((transcript) => transcript.languageCode === wanted);
    if (hit) return hit;
  }

  const available = candidates
    .map((transcript) => `${transcript.languageCode} ("${transcript.language}")`)
    .join(", ");
  throw makeError(`No transcript found for languages [${languages.join(", ")}]. Available: ${available || "none"}`, "NO_TRANSCRIPT");
}