chore(baoyu-url-to-markdown): sync vendor baoyu-fetch with session and lifecycle changes

This commit is contained in:
Jim Liu 宝玉 2026-03-31 18:24:25 -05:00
parent 5eeb1e6d8d
commit 9e3d72cf42
8 changed files with 370 additions and 37 deletions

View File

@ -1,5 +1,6 @@
{
"lockfileVersion": 1,
"configVersion": 0,
"workspaces": {
"": {
"name": "baoyu-url-to-markdown-scripts",

View File

@ -64,6 +64,8 @@ export interface Adapter {
name: string;
match(input: AdapterInput): boolean;
checkLogin?(context: AdapterContext): Promise<AdapterLoginInfo>;
exportCookies?(context: AdapterContext, profileDir?: string): Promise<boolean>;
restoreCookies?(context: AdapterContext, profileDir?: string): Promise<boolean>;
downloadMedia?(request: MediaDownloadRequest): Promise<MediaDownloadResult>;
process(context: AdapterContext): Promise<AdapterProcessResult>;
}

View File

@ -1,4 +1,5 @@
import type { Adapter, AdapterLoginInfo } from "../types";
import { exportCookies, restoreCookies, type CookieSidecarConfig } from "../../browser/cookie-sidecar";
import { detectInteractionGate } from "../../browser/interaction-gates";
import type { ExtractedDocument } from "../../extract/document";
import { collectMediaFromDocument } from "../../media/markdown-media";
@ -10,6 +11,16 @@ import { extractSingleTweetDocumentFromPayload } from "./single";
import { extractThreadDocumentFromPayloads } from "./thread";
import { loadFullXThread } from "./thread-loader";
/**
 * Cookie sidecar settings for the X adapter: the URLs whose cookies are read,
 * the sidecar file name, the cookies that must be present, and a filter that
 * keeps only cookies scoped to x.com / twitter.com.
 */
const cookieConfig: CookieSidecarConfig = {
  urls: ["https://x.com/", "https://twitter.com/"],
  filename: "x-session-cookies.json",
  requiredCookieNames: ["auth_token", "ct0"],
  filterCookie: (c) => {
    // Cookie domains may carry a leading dot (".x.com"). Strip it, then require
    // either the exact host or a real subdomain. A bare suffix test would also
    // accept unrelated hosts such as "netflix.com" or "nottwitter.com".
    const host = (c.domain ?? "").replace(/^\./, "");
    return ["x.com", "twitter.com"].some(
      (root) => host === root || host.endsWith(`.${root}`),
    );
  },
};
function extractDocumentFromPayloads(
payloads: unknown[],
statusId: string,
@ -49,6 +60,12 @@ export const xAdapter: Adapter = {
/** Reports the login state for X by delegating to `detectXLogin`. */
async checkLogin(context) {
return detectXLogin(context);
},
/**
 * Saves the X session cookies of the current target session to the cookie
 * sidecar, using the adapter's `cookieConfig`.
 *
 * @returns whatever the shared `exportCookies` helper reports.
 */
async exportCookies(context, profileDir) {
  const { targetSession } = context.browser;
  return exportCookies(targetSession, cookieConfig, profileDir);
},
/**
 * Loads X session cookies from the cookie sidecar into the current target
 * session, using the adapter's `cookieConfig`.
 *
 * @returns whatever the shared `restoreCookies` helper reports.
 */
async restoreCookies(context, profileDir) {
  const { targetSession } = context.browser;
  return restoreCookies(targetSession, cookieConfig, profileDir);
},
async process(context) {
const statusId = extractStatusId(context.input.url);
if (!statusId) {

View File

@ -0,0 +1,47 @@
import type { AdapterContext } from "../types";
/** URLs passed to `Network.getCookies` when reading the X session cookies. */
const X_SESSION_URLS = ["https://x.com/", "https://twitter.com/"] as const;
/** Cookie names that must all have a non-empty value for the X session to count as ready. */
const REQUIRED_X_SESSION_COOKIES = ["auth_token", "ct0"] as const;
/**
 * Minimal cookie shape used by this module: only the name and value are read.
 * Both are optional; entries missing either one are skipped when the cookie
 * map is built.
 */
interface CookieLike {
name?: string;
value?: string | null;
}
/**
 * Shape of the `Network.getCookies` response as consumed here. `cookies` is
 * optional; a missing list is treated as empty by the reader.
 */
interface NetworkGetCookiesResult {
cookies?: CookieLike[];
}
/**
 * Collapses a cookie list into a `name -> value` lookup.
 *
 * Names and values are trimmed; any cookie whose trimmed name or value is
 * empty (or missing) is ignored. When a name occurs more than once, the last
 * occurrence wins.
 *
 * @param cookies cookies to index
 * @returns plain object keyed by cookie name
 */
export function buildXSessionCookieMap(cookies: readonly CookieLike[]): Record<string, string> {
  const byName: Record<string, string> = {};
  cookies.forEach(({ name, value }) => {
    const key = name?.trim();
    const val = value?.trim();
    if (key && val) {
      byName[key] = val;
    }
  });
  return byName;
}
/**
 * Tells whether every cookie listed in `REQUIRED_X_SESSION_COOKIES` has a
 * truthy value in the given map.
 *
 * @param cookieMap `name -> value` lookup of the current cookies
 */
export function hasRequiredXSessionCookies(cookieMap: Record<string, string>): boolean {
  for (const required of REQUIRED_X_SESSION_COOKIES) {
    if (!cookieMap[required]) {
      return false;
    }
  }
  return true;
}
/**
 * Queries the browser's target session with `Network.getCookies` for the X
 * session URLs and returns the result as a `name -> value` map.
 *
 * @param context object exposing the `browser` whose target session is queried
 * @returns cookie map; empty when the response carries no cookie list
 */
export async function readXSessionCookieMap(
  context: Pick<AdapterContext, "browser">,
): Promise<Record<string, string>> {
  const { targetSession } = context.browser;
  const response = await targetSession.send<NetworkGetCookiesResult>(
    "Network.getCookies",
    { urls: Array.from(X_SESSION_URLS) },
  );
  return buildXSessionCookieMap(response.cookies ?? []);
}
/**
 * Resolves to `true` when the browser currently holds all required X session
 * cookies.
 *
 * @param context object exposing the `browser` whose cookies are inspected
 */
export async function isXSessionReady(
  context: Pick<AdapterContext, "browser">,
): Promise<boolean> {
  return hasRequiredXSessionCookies(await readXSessionCookieMap(context));
}

View File

@ -1,6 +1,16 @@
import { launch, type LaunchedChrome } from "chrome-launcher";
import WebSocket from "ws";
import type { Logger } from "../utils/logger";
import { ensureChromeProfileDir, findExistingChromeDebugPort, resolveChromeProfileDir } from "./profile";
import {
cleanChromeLockArtifacts,
ensureChromeProfileDir,
findChromeProcessUsingProfile,
findExistingChromeDebugPort,
hasChromeLockArtifacts,
listChromeProfileEntries,
resolveChromeProfileDir,
shouldRetryChromeLaunchRecovery,
} from "./profile";
interface ChromeVersionResponse {
webSocketDebuggerUrl: string;
@ -65,6 +75,80 @@ async function tryReuseChrome(profileDir: string, logger?: Logger): Promise<Chro
return null;
}
/**
 * Starts a new Chrome instance on the given profile directory and returns a
 * connection descriptor for its DevTools endpoint.
 *
 * After launch, `/json/version` is fetched from the debug port to obtain the
 * browser WebSocket URL. If anything fails after the process was started, the
 * process is killed before the error is rethrown.
 *
 * @param profileDir user data directory handed to Chrome
 * @param options browser binary path and headless switch
 * @returns connection whose `close()` shuts the launched Chrome down gracefully
 */
async function launchFreshChrome(
  profileDir: string,
  options: Pick<ChromeConnectOptions, "browserPath" | "headless">,
): Promise<ChromeConnection> {
  let launchedChrome: LaunchedChrome | null = null;
  try {
    const chromeFlags = [
      "--disable-background-networking",
      "--disable-default-apps",
      "--disable-popup-blocking",
      "--disable-sync",
      "--no-first-run",
      "--no-default-browser-check",
      "--remote-allow-origins=*",
      // Exactly one of the two mode flags is appended, depending on `headless`.
      options.headless ? "--headless=new" : "--no-startup-window",
    ];
    launchedChrome = await launch({
      chromePath: options.browserPath,
      userDataDir: profileDir,
      chromeFlags,
    });

    const chrome = launchedChrome;
    const { port } = chrome;
    const origin = `http://127.0.0.1:${port}`;
    const { webSocketDebuggerUrl } = await fetchJson<ChromeVersionResponse>(`${origin}/json/version`);

    return {
      browserWsUrl: webSocketDebuggerUrl,
      origin,
      port,
      profileDir,
      launched: true,
      async close() {
        await gracefulCloseChrome(chrome, origin);
      },
    };
  } catch (error) {
    // Do not leave a half-initialised browser process behind.
    launchedChrome?.kill();
    throw error;
  }
}
/**
 * Shuts a launched Chrome down gracefully and falls back to `chrome.kill()`.
 *
 * Steps: look up the browser WebSocket URL via `/json/version`, send the
 * DevTools `Browser.close` command, wait for the socket to close (max 5 s)
 * and then for the process to exit (max 3 s). Any failure, or a process that
 * is still running afterwards, ends in `chrome.kill()`.
 *
 * Every wait is bounded, including the HTTP lookup and the WebSocket
 * handshake, so an unresponsive browser cannot stall shutdown before the kill
 * fallback runs.
 *
 * @param chrome handle of the launched Chrome process
 * @param origin `http://host:port` of its DevTools HTTP endpoint
 */
async function gracefulCloseChrome(chrome: LaunchedChrome, origin: string): Promise<void> {
  try {
    // Bound the version lookup (headers and body) with an abort signal.
    const abort = new AbortController();
    const fetchTimer = setTimeout(() => abort.abort(), 3_000);
    let webSocketDebuggerUrl: string | undefined;
    try {
      const resp = await fetch(`${origin}/json/version`, { signal: abort.signal });
      const version = (await resp.json()) as ChromeVersionResponse;
      webSocketDebuggerUrl = version.webSocketDebuggerUrl;
    } finally {
      clearTimeout(fetchTimer);
    }

    if (webSocketDebuggerUrl) {
      const wsUrl = webSocketDebuggerUrl;
      const ws = await new Promise<WebSocket>((resolve, reject) => {
        const socket = new WebSocket(wsUrl);
        // Bound the handshake: give up and drop the socket if it never opens.
        const openTimer = setTimeout(() => {
          reject(new Error("Timed out connecting to the Chrome DevTools WebSocket"));
          socket.terminate();
        }, 3_000);
        socket.once("open", () => { clearTimeout(openTimer); resolve(socket); });
        socket.once("error", (error) => { clearTimeout(openTimer); reject(error); });
      });

      const id = 1;
      ws.send(JSON.stringify({ id, method: "Browser.close" }));

      // Wait for Chrome to drop the connection, but no longer than 5 s.
      await new Promise<void>((resolve) => {
        const timer = setTimeout(() => { ws.close(); resolve(); }, 5_000);
        ws.once("close", () => { clearTimeout(timer); resolve(); });
      });

      // Give the process up to 3 s to exit on its own.
      const exited = await new Promise<boolean>((resolve) => {
        if (chrome.pid && !isProcessAlive(chrome.pid)) { resolve(true); return; }
        const timer = setTimeout(() => resolve(false), 3_000);
        chrome.process.once("exit", () => { clearTimeout(timer); resolve(true); });
      });
      if (exited) return;
    }
  } catch {
    // Best effort only: any failure above falls through to the hard kill.
  }
  chrome.kill();
}
/**
 * Checks whether a process with the given pid currently exists.
 *
 * Uses signal `0`, which performs the existence/permission check without
 * delivering a signal. An `EPERM` failure means the process exists but may
 * not be signalled by this user, so it is reported as alive; any other
 * failure (e.g. `ESRCH`) is reported as not alive.
 *
 * @param pid process id to probe
 */
function isProcessAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch (error) {
    return (error as { code?: unknown } | null)?.code === "EPERM";
  }
}
export async function connectChrome(options: ChromeConnectOptions): Promise<ChromeConnection> {
if (options.cdpUrl) {
if (options.cdpUrl.startsWith("ws://") || options.cdpUrl.startsWith("wss://")) {
@ -84,34 +168,20 @@ export async function connectChrome(options: ChromeConnectOptions): Promise<Chro
}
options.logger?.warn(`No running Chrome debugger found for profile ${profileDir}. Launching Chrome with that profile.`);
try {
return await launchFreshChrome(profileDir, options);
} catch (error) {
const entries = await listChromeProfileEntries(profileDir);
const shouldRetry = shouldRetryChromeLaunchRecovery({
hasLockArtifacts: hasChromeLockArtifacts(entries),
hasLiveOwner: findChromeProcessUsingProfile(profileDir),
});
if (!shouldRetry) {
throw error;
}
const launchedChrome: LaunchedChrome = await launch({
chromePath: options.browserPath,
userDataDir: profileDir,
chromeFlags: [
"--disable-background-networking",
"--disable-default-apps",
"--disable-popup-blocking",
"--disable-sync",
"--no-first-run",
"--no-default-browser-check",
"--remote-allow-origins=*",
...(!options.headless ? ["--no-startup-window"] : []),
...(options.headless ? ["--headless=new"] : []),
],
});
const origin = `http://127.0.0.1:${launchedChrome.port}`;
const version = await fetchJson<ChromeVersionResponse>(`${origin}/json/version`);
return {
browserWsUrl: version.webSocketDebuggerUrl,
origin,
port: launchedChrome.port,
profileDir,
launched: true,
async close() {
launchedChrome.kill();
},
};
options.logger?.warn(`Chrome launch failed with stale profile locks. Cleaning ${profileDir} and retrying once.`);
cleanChromeLockArtifacts(profileDir);
return await launchFreshChrome(profileDir, options);
}
}

View File

@ -0,0 +1,100 @@
import { readFile, writeFile, mkdir } from "node:fs/promises";
import { dirname, join } from "node:path";
import { resolveChromeProfileDir } from "./profile";
import type { TargetSession } from "./cdp-client";
/**
 * Cookie record as returned by `Network.getCookies` and persisted in the
 * sidecar file. Only a subset of these fields is sent back through
 * `Network.setCookies` on restore.
 */
export interface CdpCookie {
name: string;
value: string;
domain: string;
path: string;
// Compared against `Date.now() / 1000` on restore, i.e. treated as seconds since the epoch.
expires: number;
size: number;
httpOnly: boolean;
secure: boolean;
// Session cookies are never considered expired when restoring.
session: boolean;
sameSite?: string;
priority?: string;
sameParty?: boolean;
sourceScheme?: string;
sourcePort?: number;
partitionKey?: string;
}
/** On-disk JSON layout of a cookie sidecar file. */
interface SidecarData {
// ISO-8601 timestamp of the export (`new Date().toISOString()`).
savedAt: string;
// Cookies that were written at export time.
cookies: CdpCookie[];
}
/** Per-site settings that drive cookie export to and restore from a sidecar file. */
export interface CookieSidecarConfig {
// URLs passed to `Network.getCookies` to select the cookies of interest.
urls: readonly string[];
// File name of the sidecar; it is joined onto the resolved Chrome profile directory.
filename: string;
// Names that must all be present with a non-empty value for a cookie set to count as a usable session.
requiredCookieNames: readonly string[];
// Optional predicate; when provided, only cookies it accepts are written to the sidecar.
filterCookie?: (cookie: CdpCookie) => boolean;
}
/**
 * Builds the path of a sidecar file inside the resolved Chrome profile
 * directory.
 *
 * @param filename sidecar file name
 * @param profileDir optional profile directory, resolved via `resolveChromeProfileDir`
 */
function sidecarPath(filename: string, profileDir?: string): string {
  const baseDir = resolveChromeProfileDir(profileDir);
  return join(baseDir, filename);
}
/**
 * Tells whether each required name appears in `cookies` with a non-empty
 * value. An empty `names` list is trivially satisfied.
 *
 * @param cookies cookies to inspect
 * @param names cookie names that must all be present
 */
function hasRequired(cookies: CdpCookie[], names: readonly string[]): boolean {
  for (const required of names) {
    const present = cookies.some(({ name, value }) => name === required && Boolean(value));
    if (!present) {
      return false;
    }
  }
  return true;
}
/**
 * Fetches the cookies that apply to the given URLs through the session's
 * `Network.getCookies` command.
 *
 * @param session target session used to issue the command
 * @param urls URLs whose cookies are requested
 * @returns the cookie list, or an empty array when the response has none
 */
async function getCookies(session: TargetSession, urls: readonly string[]): Promise<CdpCookie[]> {
  const response = await session.send<{ cookies: CdpCookie[] }>(
    "Network.getCookies",
    { urls: Array.from(urls) },
  );
  return response.cookies ?? [];
}
/**
 * Writes the session cookies selected by `config` to the sidecar file in the
 * Chrome profile directory.
 *
 * Nothing is written unless every required cookie is present (after the
 * optional `filterCookie` predicate has been applied).
 *
 * The sidecar holds live authentication cookies in plain text, so it is
 * created with owner-only permissions (0o600). The mode only takes effect when
 * the file is created; an already existing sidecar keeps its current mode.
 *
 * @param session target session the cookies are read from
 * @param config URLs, file name, required cookie names and optional filter
 * @param profileDir optional profile directory override
 * @returns `true` when the sidecar was written, `false` when required cookies were missing
 */
export async function exportCookies(
  session: TargetSession,
  config: CookieSidecarConfig,
  profileDir?: string,
): Promise<boolean> {
  const all = await getCookies(session, config.urls);
  const filtered = config.filterCookie ? all.filter(config.filterCookie) : all;
  if (!hasRequired(filtered, config.requiredCookieNames)) return false;

  const filePath = sidecarPath(config.filename, profileDir);
  await mkdir(dirname(filePath), { recursive: true });
  const data: SidecarData = { savedAt: new Date().toISOString(), cookies: filtered };
  await writeFile(filePath, JSON.stringify(data, null, 2), { mode: 0o600 });
  return true;
}
/**
 * Re-installs previously exported session cookies into the browser.
 *
 * Returns `false` without touching the browser when there is nothing useful
 * to do: the live session already has every required cookie, the sidecar file
 * does not exist or does not contain valid JSON / a cookie list, or the saved
 * cookies (minus expired or out-of-scope ones) no longer include every
 * required cookie. Read errors other than "file not found" are rethrown.
 *
 * @param session target session that receives the cookies
 * @param config URLs, file name, required cookie names and optional filter
 * @param profileDir optional profile directory override
 * @returns `true` when cookies were sent via `Network.setCookies`
 */
export async function restoreCookies(
  session: TargetSession,
  config: CookieSidecarConfig,
  profileDir?: string,
): Promise<boolean> {
  const live = await getCookies(session, config.urls);
  if (hasRequired(live, config.requiredCookieNames)) return false;

  const filePath = sidecarPath(config.filename, profileDir);
  let raw: string;
  try {
    raw = await readFile(filePath, "utf8");
  } catch (error) {
    // No sidecar yet (e.g. first run) simply means there is nothing to restore.
    if ((error as { code?: unknown } | null)?.code === "ENOENT") return false;
    throw error;
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch {
    // A truncated or hand-edited sidecar is treated like a missing one.
    return false;
  }
  const saved = (parsed as Partial<SidecarData> | null)?.cookies;
  if (!Array.isArray(saved) || saved.length === 0) return false;

  const now = Date.now() / 1000;
  const valid = saved.filter(
    (c) =>
      Boolean(c) &&
      // Keep session cookies and cookies that have not expired yet.
      (c.session || !c.expires || c.expires > now) &&
      // Apply the same scope filter as on export so only in-scope cookies are injected.
      (config.filterCookie ? config.filterCookie(c) : true),
  );
  if (!hasRequired(valid, config.requiredCookieNames)) return false;

  await session.send("Network.setCookies", {
    cookies: valid.map((c) => ({
      name: c.name,
      value: c.value,
      domain: c.domain,
      path: c.path,
      httpOnly: c.httpOnly,
      secure: c.secure,
      sameSite: c.sameSite,
      expires: c.expires,
    })),
  });
  return true;
}

View File

@ -19,6 +19,8 @@ interface ChromeVersionResponse {
webSocketDebuggerUrl?: string;
}
/**
 * File names inside a Chrome profile directory that are treated as lock
 * artifacts: their presence is what `hasChromeLockArtifacts` looks for, and
 * they are what `cleanChromeLockArtifacts` removes.
 */
const CHROME_LOCK_FILE_NAMES = ["SingletonLock", "SingletonSocket", "SingletonCookie", "chrome.pid"] as const;
function resolveDataBaseDir(): string {
if (process.platform === "darwin") {
return path.join(os.homedir(), "Library", "Application Support");
@ -61,6 +63,57 @@ export function ensureChromeProfileDir(profileDir: string): string {
return profileDir;
}
/**
 * Tells whether a profile directory listing contains at least one of the
 * known Chrome lock file names.
 *
 * @param entries entry names found in the profile directory
 */
export function hasChromeLockArtifacts(entries: readonly string[]): boolean {
  for (const lockName of CHROME_LOCK_FILE_NAMES) {
    if (entries.includes(lockName)) {
      return true;
    }
  }
  return false;
}
/**
 * Decides whether a failed Chrome launch should be retried after cleaning the
 * profile: only when lock artifacts exist and no live process owns the profile.
 *
 * @param options.hasLockArtifacts whether lock files were found in the profile
 * @param options.hasLiveOwner whether a running process is using the profile
 */
export function shouldRetryChromeLaunchRecovery(options: {
  hasLockArtifacts: boolean;
  hasLiveOwner: boolean;
}): boolean {
  const { hasLockArtifacts, hasLiveOwner } = options;
  if (!hasLockArtifacts) {
    return false;
  }
  return !hasLiveOwner;
}
/**
 * Looks for a running process whose command line contains
 * `--user-data-dir=<profileDir>` by scanning the output of `ps aux`.
 *
 * Always returns `false` on Windows, and also whenever `ps` fails, produces no
 * output, or throws.
 *
 * @param profileDir profile directory to search for
 * @returns `true` when a matching line is found in the process list
 */
export function findChromeProcessUsingProfile(profileDir: string): boolean {
  if (process.platform === "win32") {
    return false;
  }

  try {
    const { status, stdout } = spawnSync("ps", ["aux"], {
      encoding: "utf8",
      timeout: 5_000,
    });
    if (status !== 0 || !stdout) {
      return false;
    }

    const needle = `--user-data-dir=${profileDir}`;
    for (const line of stdout.split("\n")) {
      if (line.includes(needle)) {
        return true;
      }
    }
    return false;
  } catch {
    return false;
  }
}
/**
 * Deletes every known Chrome lock file from the profile directory.
 * Failures on individual files are ignored so the remaining ones are still
 * attempted.
 *
 * @param profileDir profile directory to clean
 */
export function cleanChromeLockArtifacts(profileDir: string): void {
  CHROME_LOCK_FILE_NAMES.forEach((lockName) => {
    try {
      fs.unlinkSync(path.join(profileDir, lockName));
    } catch {
      // Ignore missing files and continue cleaning the remaining artifacts.
    }
  });
}
/**
 * Lists the entry names of the profile directory.
 *
 * @param profileDir directory to read
 * @returns the entry names, or an empty array when the directory cannot be read
 */
export async function listChromeProfileEntries(profileDir: string): Promise<string[]> {
  const noEntries: string[] = [];
  return fs.promises.readdir(profileDir).catch(() => noEntries);
}
async function fetchWithTimeout(url: string, timeoutMs = 3_000): Promise<Response> {
const controller = new AbortController();
const timer = setTimeout(() => controller.abort(), timeoutMs);

View File

@ -7,6 +7,7 @@ import { detectInteractionGate } from "../browser/interaction-gates";
import { NetworkJournal } from "../browser/network-journal";
import { BrowserSession } from "../browser/session";
import { genericAdapter, resolveAdapter } from "../adapters";
import { isXSessionReady } from "../adapters/x/session";
import type { ExtractedDocument } from "../extract/document";
import { renderMarkdown } from "../extract/markdown-renderer";
import { downloadMediaAssets } from "../media/default-downloader";
@ -55,6 +56,7 @@ interface ForceWaitSnapshot {
url: string;
hasGate: boolean;
loginState: LoginState | "unavailable";
sessionReady: boolean;
}
interface SuccessfulConvertOutput {
@ -78,6 +80,17 @@ function sleep(ms: number): Promise<void> {
return new Promise((resolve) => setTimeout(resolve, ms));
}
/**
 * Reads the `sessionReady` flag of a force-wait snapshot.
 *
 * @param snapshot snapshot captured while waiting in force mode
 */
function isForceWaitSessionReady(snapshot: ForceWaitSnapshot): boolean {
  const { sessionReady } = snapshot;
  return sessionReady;
}
/**
 * Decides whether the browser should stay open after a user interaction:
 * only for a Chrome instance launched by this tool, and only when the
 * interaction was a login for the "x" provider.
 *
 * @param options.launched whether this tool launched the browser
 * @param options.interaction kind and provider of the interaction
 */
export function shouldKeepBrowserOpenAfterInteraction(options: {
  launched: boolean;
  interaction: Pick<WaitForInteractionRequest, "kind" | "provider">;
}): boolean {
  const { launched, interaction } = options;
  if (!launched) {
    return false;
  }
  return interaction.kind === "login" && interaction.provider === "x";
}
export function shouldAutoContinueForceWait(
initial: ForceWaitSnapshot,
current: ForceWaitSnapshot,
@ -86,15 +99,20 @@ export function shouldAutoContinueForceWait(
return true;
}
if (initial.loginState === "logged_out" && current.loginState !== "logged_out") {
if (initial.loginState === "logged_out" && current.loginState !== "logged_out" && isForceWaitSessionReady(current)) {
return true;
}
if (initial.loginState !== "logged_in" && current.loginState === "logged_in") {
if (initial.loginState !== "logged_in" && current.loginState === "logged_in" && isForceWaitSessionReady(current)) {
return true;
}
if (current.url !== initial.url && !current.hasGate && current.loginState !== "logged_out") {
if (
current.url !== initial.url &&
!current.hasGate &&
current.loginState !== "logged_out" &&
isForceWaitSessionReady(current)
) {
return true;
}
@ -173,6 +191,16 @@ async function closeRuntime(runtime: RuntimeResources | null | undefined): Promi
await runtime.chrome.close().catch(() => {});
}
/**
 * Tells whether the session behind an interaction is usable.
 *
 * Providers other than "x" are always considered ready. For "x" the required
 * session cookies are checked; a failure of that check counts as not ready.
 *
 * @param context adapter context whose browser is inspected
 * @param interaction interaction being waited on
 */
async function isInteractionSessionReady(
  context: AdapterContext,
  interaction: WaitForInteractionRequest,
): Promise<boolean> {
  const needsXSession = interaction.provider === "x";
  if (!needsXSession) {
    return true;
  }

  try {
    return await isXSessionReady(context);
  } catch {
    return false;
  }
}
async function reopenInteractiveRuntime(
runtime: RuntimeResources,
options: ConvertCommandOptions,
@ -203,6 +231,7 @@ async function captureForceWaitSnapshot(
url,
hasGate: Boolean(gate),
loginState: login?.state ?? "unavailable",
sessionReady: adapter.name === "x" ? await isXSessionReady(context).catch(() => false) : true,
};
}
@ -280,7 +309,7 @@ async function waitForInteraction(
while (Date.now() - startedAt < timeoutMs) {
if (interaction.kind === "login" && adapter.checkLogin) {
lastLogin = await adapter.checkLogin(context);
if (lastLogin.state === "logged_in") {
if (lastLogin.state === "logged_in" && await isInteractionSessionReady(context, interaction)) {
return lastLogin;
}
}
@ -303,7 +332,7 @@ async function waitForInteraction(
}
lastLogin = await adapter.checkLogin(context);
if (lastLogin.state !== "logged_out") {
if (lastLogin.state !== "logged_out" && await isInteractionSessionReady(context, interaction)) {
return lastLogin;
}
}
@ -347,10 +376,13 @@ export async function runConvertCommand(options: ConvertCommandOptions): Promise
const url = normalizeUrl(options.url);
let runtime = await openRuntime(options, options.waitMode !== "none", Boolean(options.debugDir));
const logger = createLogger(Boolean(options.debugDir));
let didLogin = false;
let adapter: Adapter | null = null;
let context: AdapterContext | null = null;
try {
const adapter = resolveAdapter({ url }, options.adapter);
let context: AdapterContext = {
adapter = resolveAdapter({ url }, options.adapter);
context = {
input: { url },
browser: runtime.browser,
network: runtime.network,
@ -362,6 +394,11 @@ export async function runConvertCommand(options: ConvertCommandOptions): Promise
downloadMedia: options.downloadMedia,
};
if (adapter.restoreCookies) {
const restored = await adapter.restoreCookies(context, runtime.chrome.profileDir).catch(() => false);
if (restored) logger.info(`Restored ${adapter.name} session cookies from sidecar.`);
}
if (options.waitMode === "force") {
await context.browser.goto(url.toString(), options.timeoutMs).catch(() => {});
await waitForForceResume(adapter, context, options);
@ -414,6 +451,9 @@ export async function runConvertCommand(options: ConvertCommandOptions): Promise
};
await context.browser.goto(url.toString(), options.timeoutMs).catch(() => {});
if (result.interaction.kind === "login") {
didLogin = true;
}
await waitForInteraction(adapter, context, result.interaction, options);
result = await adapter.process(context);
@ -516,6 +556,9 @@ export async function runConvertCommand(options: ConvertCommandOptions): Promise
printOutput(markdown);
} finally {
if (adapter?.exportCookies && context) {
await adapter.exportCookies(context, runtime.chrome.profileDir).catch(() => {});
}
await closeRuntime(runtime);
}
}