feat(baoyu-youtube-transcript): auto-retry with yt-dlp on empty InnerTube transcript
This commit is contained in:
parent
4874cd2dae
commit
204765a137
|
|
@ -2,7 +2,7 @@ import test from "node:test";
|
|||
import assert from "node:assert/strict";
|
||||
|
||||
import { findTranscript, parseTranscriptJson3, parseWebVtt } from "./transcript.ts";
|
||||
import { buildTranscriptListFromYtDlp, resolveVideoSource, selectYtDlpTrack } from "./youtube.ts";
|
||||
import { buildTranscriptListFromYtDlp, fetchTranscriptWithFallback, resolveVideoSource, selectYtDlpTrack } from "./youtube.ts";
|
||||
|
||||
test("selectYtDlpTrack prefers json3 over xml and vtt", () => {
|
||||
const track = selectYtDlpTrack([
|
||||
|
|
@ -123,3 +123,61 @@ test("resolveVideoSource falls back to yt-dlp only after fallback-eligible error
|
|||
assert.equal(fallbackCalled, true);
|
||||
assert.equal(source.transcripts[0].languageCode, "en");
|
||||
});
|
||||
|
||||
// Verifies that an empty snippet list from the InnerTube timedtext URL makes
// fetchTranscriptWithFallback request the fallback source and return its snippets.
test("fetchTranscriptWithFallback retries with yt-dlp when InnerTube transcript payload is empty", async () => {
  const warnings: string[] = [];
  let fallbackCalled = false;

  // Both sources expose the same manual English track; only the URL differs.
  const englishTrack = (baseUrl: string) => ({
    language: "English",
    languageCode: "en",
    isGenerated: false,
    isTranslatable: false,
    baseUrl,
    translationLanguages: [],
  });

  const result = await fetchTranscriptWithFallback(
    "video12345ab",
    {
      kind: "innertube",
      data: { videoDetails: { title: "Primary" } },
      transcripts: [englishTrack("https://www.youtube.com/api/timedtext?v=video12345ab&lang=en&fmt=json3")],
    },
    {
      languages: ["en"],
      translate: "",
      excludeGenerated: false,
      excludeManual: false,
    },
    // Snippet fetcher stub: the timedtext URL yields nothing, any other URL yields one snippet.
    async (info) => {
      const isTimedTextUrl = info.baseUrl.includes("youtube.com/api/timedtext");
      const snippets = isTimedTextUrl ? [] : [{ text: "Recovered subtitle", start: 0, duration: 2 }];
      return { snippets, language: info.language, languageCode: info.languageCode };
    },
    // Fallback source stub: records the call and returns a yt-dlp style source.
    async () => {
      fallbackCalled = true;
      return {
        kind: "yt-dlp",
        info: { title: "Fallback" },
        transcripts: [englishTrack("https://example.com/subtitles.en.json3")],
      };
    },
    (message) => warnings.push(message)
  );

  assert.equal(fallbackCalled, true);
  assert.equal(result.source.kind, "yt-dlp");
  assert.equal(result.snippets.length, 1);
  assert.equal(result.snippets[0].text, "Recovered subtitle");
  assert.match(warnings[0] || "", /Retrying with yt-dlp fallback/);
});
|
||||
|
|
|
|||
|
|
@ -13,13 +13,13 @@ import {
|
|||
registerVideoDir,
|
||||
resolveBaseDir,
|
||||
} from "./storage.ts";
|
||||
import { findTranscript, formatListOutput, formatMarkdown, formatSrt, segmentIntoSentences } from "./transcript.ts";
|
||||
import { formatListOutput, formatMarkdown, formatSrt, segmentIntoSentences } from "./transcript.ts";
|
||||
import type { Options, Sentence, Snippet, VideoMeta, VideoResult } from "./types.ts";
|
||||
import {
|
||||
buildVideoMeta,
|
||||
buildVideoMetaFromYtDlp,
|
||||
downloadCoverImage,
|
||||
fetchTranscriptSnippets,
|
||||
fetchTranscriptWithFallback,
|
||||
fetchVideoSource,
|
||||
getThumbnailUrls,
|
||||
getYtDlpThumbnailUrls,
|
||||
|
|
@ -31,10 +31,12 @@ async function fetchAndCache(
|
|||
baseDir: string,
|
||||
opts: Options
|
||||
): Promise<{ meta: VideoMeta; snippets: Snippet[]; sentences: Sentence[]; videoDir: string }> {
|
||||
const source = await fetchVideoSource(videoId);
|
||||
const requestedLanguages = source.kind === "yt-dlp" && opts.translate ? [opts.translate] : opts.languages;
|
||||
const transcript = findTranscript(source.transcripts, requestedLanguages, opts.excludeGenerated, opts.excludeManual);
|
||||
const result = await fetchTranscriptSnippets(transcript, source.kind === "yt-dlp" ? undefined : opts.translate || undefined);
|
||||
const initialSource = await fetchVideoSource(videoId);
|
||||
const { source, transcript, snippets, language, languageCode } = await fetchTranscriptWithFallback(
|
||||
videoId,
|
||||
initialSource,
|
||||
opts
|
||||
);
|
||||
const description = source.kind === "yt-dlp"
|
||||
? source.info.description || ""
|
||||
: source.data?.videoDetails?.shortDescription || "";
|
||||
|
|
@ -42,21 +44,21 @@ async function fetchAndCache(
|
|||
? Number(source.info.duration || 0)
|
||||
: parseInt(source.data?.videoDetails?.lengthSeconds || "0");
|
||||
const chapters = parseChapters(description, duration);
|
||||
const language = {
|
||||
code: result.languageCode,
|
||||
name: result.language,
|
||||
const languageMeta = {
|
||||
code: languageCode,
|
||||
name: language,
|
||||
isGenerated: transcript.isGenerated,
|
||||
};
|
||||
const meta = source.kind === "yt-dlp"
|
||||
? buildVideoMetaFromYtDlp(source.info, videoId, language, chapters)
|
||||
: buildVideoMeta(source.data, videoId, language, chapters);
|
||||
? buildVideoMetaFromYtDlp(source.info, videoId, languageMeta, chapters)
|
||||
: buildVideoMeta(source.data, videoId, languageMeta, chapters);
|
||||
|
||||
const videoDir = registerVideoDir(videoId, slugify(meta.channel), slugify(meta.title), baseDir);
|
||||
ensureDir(join(videoDir, "meta.json"));
|
||||
|
||||
writeFileSync(join(videoDir, "transcript-raw.json"), JSON.stringify(result.snippets, null, 2));
|
||||
writeFileSync(join(videoDir, "transcript-raw.json"), JSON.stringify(snippets, null, 2));
|
||||
|
||||
const sentences = segmentIntoSentences(result.snippets);
|
||||
const sentences = segmentIntoSentences(snippets);
|
||||
writeFileSync(join(videoDir, "transcript-sentences.json"), JSON.stringify(sentences, null, 2));
|
||||
|
||||
const imagePath = join(videoDir, "imgs", "cover.jpg");
|
||||
|
|
@ -69,7 +71,7 @@ async function fetchAndCache(
|
|||
|
||||
writeFileSync(join(videoDir, "meta.json"), JSON.stringify(meta, null, 2));
|
||||
|
||||
return { meta, snippets: result.snippets, sentences, videoDir };
|
||||
return { meta, snippets, sentences, videoDir };
|
||||
}
|
||||
|
||||
async function processVideo(videoId: string, opts: Options): Promise<VideoResult> {
|
||||
|
|
|
|||
|
|
@ -2,12 +2,13 @@ import { spawnSync } from "child_process";
|
|||
import { writeFileSync } from "fs";
|
||||
|
||||
import { makeError, normalizeError, normalizePublishDate, shouldTryAlternateClient, shouldTryYtDlpFallback } from "./shared.ts";
|
||||
import { parseTranscriptPayload } from "./transcript.ts";
|
||||
import { findTranscript, parseTranscriptPayload } from "./transcript.ts";
|
||||
import type {
|
||||
Chapter,
|
||||
InnerTubeClient,
|
||||
InnerTubeSession,
|
||||
LanguageMeta,
|
||||
Options,
|
||||
Snippet,
|
||||
TranscriptInfo,
|
||||
VideoMeta,
|
||||
|
|
@ -219,6 +220,68 @@ export async function fetchTranscriptSnippets(
|
|||
};
|
||||
}
|
||||
|
||||
/**
 * Wraps yt-dlp info into a `VideoSource` of kind "yt-dlp".
 *
 * @param videoId - Video identifier, used only in the error message.
 * @param info - yt-dlp info object passed to `buildTranscriptListFromYtDlp`.
 * @returns A source carrying the info and the transcript list built from it.
 * @throws The error produced by `makeError` with code "TRANSCRIPTS_DISABLED"
 *   when the built transcript list is empty.
 */
function buildYtDlpVideoSource(videoId: string, info: YtDlpInfo): VideoSource {
  const tracks = buildTranscriptListFromYtDlp(info);
  if (tracks.length) {
    return { kind: "yt-dlp", info, transcripts: tracks };
  }
  throw makeError(`Transcripts disabled for ${videoId}`, "TRANSCRIPTS_DISABLED");
}
|
||||
|
||||
/**
 * Picks the language list to search for in a source's transcripts.
 *
 * For a "yt-dlp" source with a non-empty `opts.translate`, the translation
 * target is the only requested language; in every other case the caller's
 * `opts.languages` list is returned as-is.
 */
function getRequestedLanguages(
  source: VideoSource,
  opts: Pick<Options, "languages" | "translate">
): string[] {
  if (source.kind === "yt-dlp" && opts.translate) {
    return [opts.translate];
  }
  return opts.languages;
}
|
||||
|
||||
/**
 * Fetches transcript snippets from `source`, retrying once with a fallback
 * source when the first fetch yields no snippets.
 *
 * Flow:
 * 1. Select a transcript from `source` and fetch its snippets.
 * 2. If snippets came back, return them together with the source and transcript used.
 * 3. If the source was already of kind "yt-dlp", throw an EMPTY_TRANSCRIPT error.
 * 4. Otherwise emit a warning, obtain the fallback source, and repeat the
 *    select-and-fetch step on it; throw EMPTY_TRANSCRIPT if that is empty too.
 *
 * @param videoId - Video identifier, used for messages and the fallback lookup.
 * @param source - Source to try first.
 * @param opts - Language, translation and generated/manual exclusion options.
 * @param fetchSnippets - Snippet fetcher; defaults to `fetchTranscriptSnippets`.
 * @param fetchFallbackSource - Supplies the fallback source; by default builds
 *   one from `fetchYtDlpInfo(videoId)`.
 * @param logWarning - Warning sink; defaults to `console.error`.
 * @returns The source and transcript that produced snippets, plus the fetch result fields.
 * @throws The error produced by `makeError` with code "EMPTY_TRANSCRIPT" when no
 *   snippets could be obtained.
 */
export async function fetchTranscriptWithFallback(
  videoId: string,
  source: VideoSource,
  opts: Pick<Options, "languages" | "translate" | "excludeGenerated" | "excludeManual">,
  fetchSnippets: (
    info: TranscriptInfo,
    translateTo?: string
  ) => Promise<{ snippets: Snippet[]; language: string; languageCode: string }> = fetchTranscriptSnippets,
  fetchFallbackSource: (videoId: string) => Promise<VideoSource> | VideoSource = (requestedVideoId) =>
    buildYtDlpVideoSource(requestedVideoId, fetchYtDlpInfo(requestedVideoId)),
  logWarning: (message: string) => void = (message) => console.error(message)
): Promise<{
  source: VideoSource;
  transcript: TranscriptInfo;
  snippets: Snippet[];
  language: string;
  languageCode: string;
}> {
  // One select-and-fetch pass against a given source; used for both the
  // primary attempt and the fallback attempt.
  const attempt = async (candidate: VideoSource) => {
    const picked = findTranscript(
      candidate.transcripts,
      getRequestedLanguages(candidate, opts),
      opts.excludeGenerated,
      opts.excludeManual
    );
    // No translation target is passed to the fetcher for "yt-dlp" sources.
    const translateTo = candidate.kind === "yt-dlp" ? undefined : opts.translate || undefined;
    const fetched = await fetchSnippets(picked, translateTo);
    return { source: candidate, transcript: picked, ...fetched };
  };

  const primary = await attempt(source);
  if (primary.snippets.length > 0) {
    return primary;
  }

  // The source is already yt-dlp, so there is nothing further to fall back to.
  if (source.kind === "yt-dlp") {
    throw makeError(`Transcript fetch returned empty snippets for ${videoId}`, "EMPTY_TRANSCRIPT");
  }

  logWarning(`Warning (${videoId}): Transcript fetch returned empty snippets. Retrying with yt-dlp fallback.`);

  const secondary = await attempt(await fetchFallbackSource(videoId));
  if (!secondary.snippets.length) {
    throw makeError(`Transcript fetch returned empty snippets for ${videoId} after yt-dlp fallback`, "EMPTY_TRANSCRIPT");
  }
  return secondary;
}
|
||||
|
||||
export function detectYtDlpCommand(): { command: string; args: string[]; label: string } | null {
|
||||
if (cachedYtDlpCommand !== undefined) return cachedYtDlpCommand;
|
||||
const candidates = [
|
||||
|
|
@ -366,10 +429,7 @@ export async function resolveVideoSource(
|
|||
const normalized = normalizeError(error);
|
||||
if (!shouldTryYtDlpFallback(normalized)) throw normalized;
|
||||
logWarning(`Warning (${videoId}): ${normalized.message}. Retrying with yt-dlp fallback.`);
|
||||
const info = fetchFallback(videoId);
|
||||
const transcripts = buildTranscriptListFromYtDlp(info);
|
||||
if (!transcripts.length) throw makeError(`Transcripts disabled for ${videoId}`, "TRANSCRIPTS_DISABLED");
|
||||
return { kind: "yt-dlp", info, transcripts };
|
||||
return buildYtDlpVideoSource(videoId, fetchFallback(videoId));
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue