From 86a84739e886460e263fedd521f17dc8a7b6e319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20Liu=20=E5=AE=9D=E7=8E=89?= Date: Fri, 6 Feb 2026 16:06:58 -0600 Subject: [PATCH] feat(baoyu-image-gen): add OpenAI GPT Image edits support for reference images - Support --ref with OpenAI GPT Image models (gpt-image-1.5) - Auto-select Google or OpenAI when --ref provided - Change ref-related warnings to explicit errors with fix hints - Add reference image validation before generation - Improve retry logic to skip non-retryable errors --- skills/baoyu-image-gen/SKILL.md | 18 ++-- skills/baoyu-image-gen/scripts/main.ts | 47 +++++++++- .../scripts/providers/dashscope.ts | 4 +- .../scripts/providers/google.ts | 8 +- .../scripts/providers/openai.ts | 93 ++++++++++++++++--- 5 files changed, 146 insertions(+), 24 deletions(-) diff --git a/skills/baoyu-image-gen/SKILL.md b/skills/baoyu-image-gen/SKILL.md index dd04a24..1a900c4 100644 --- a/skills/baoyu-image-gen/SKILL.md +++ b/skills/baoyu-image-gen/SKILL.md @@ -60,9 +60,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --quali # From prompt files npx -y bun ${SKILL_DIR}/scripts/main.ts --promptfiles system.md content.md --image out.png -# With reference images (Google multimodal only) +# With reference images (Google multimodal or OpenAI edits) npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --ref source.png +# With reference images (explicit provider/model) +npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --provider google --model gemini-3-pro-image-preview --ref source.png + # Specific provider npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provider openai @@ -78,12 +81,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou | `--promptfiles ` | Read prompt from files (concatenated) | | `--image ` | Output image path (required) | | `--provider google\|openai\|dashscope` | Force provider (default: 
google) | -| `--model `, `-m` | Model ID | +| `--model `, `-m` | Model ID (`--ref` with OpenAI requires GPT Image model, e.g. `gpt-image-1.5`) | | `--ar ` | Aspect ratio (e.g., `16:9`, `1:1`, `4:3`) | | `--size ` | Size (e.g., `1024x1024`) | | `--quality normal\|2k` | Quality preset (default: 2k) | | `--imageSize 1K\|2K\|4K` | Image size for Google (default: from quality) | -| `--ref ` | Reference images (Google multimodal only) | +| `--ref ` | Reference images. Supported by Google multimodal and OpenAI edits (GPT Image models). If provider omitted: Google first, then OpenAI | | `--n ` | Number of images | | `--json` | JSON output | @@ -105,9 +108,10 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou ## Provider Selection -1. `--provider` specified → use it -2. Only one API key available → use that provider -3. Multiple available → default to Google +1. `--ref` provided + no `--provider` → auto-select Google first, then OpenAI +2. `--provider` specified → use it (if `--ref`, must be `google` or `openai`) +3. Only one API key available → use that provider +4. 
Multiple available → default to Google ## Quality Presets @@ -157,7 +161,7 @@ Supported: `1:1`, `16:9`, `9:16`, `4:3`, `3:4`, `2.35:1` - Missing API key → error with setup instructions - Generation failure → auto-retry once - Invalid aspect ratio → warning, proceed with default -- Reference images with non-multimodal model → warning, ignore refs +- Reference images with unsupported provider/model → error with fix hint (switch to Google multimodal or OpenAI GPT Image edits) ## Extension Support diff --git a/skills/baoyu-image-gen/scripts/main.ts b/skills/baoyu-image-gen/scripts/main.ts index d24469b..ce0620c 100644 --- a/skills/baoyu-image-gen/scripts/main.ts +++ b/skills/baoyu-image-gen/scripts/main.ts @@ -1,7 +1,7 @@ import path from "node:path"; import process from "node:process"; import { homedir } from "node:os"; -import { mkdir, readFile, writeFile } from "node:fs/promises"; +import { access, mkdir, readFile, writeFile } from "node:fs/promises"; import type { CliArgs, Provider, ExtendConfig } from "./types"; function printUsage(): void { @@ -20,7 +20,7 @@ Options: --size Size (e.g., 1024x1024) --quality normal|2k Quality preset (default: 2k) --imageSize 1K|2K|4K Image size for Google (default: from quality) - --ref Reference images (Google multimodal only) + --ref Reference images (Google multimodal or OpenAI edits) --n Number of images (default: 1) --json JSON output -h, --help Show help @@ -323,12 +323,26 @@ function normalizeOutputImagePath(p: string): string { } function detectProvider(args: CliArgs): Provider { + if (args.referenceImages.length > 0 && args.provider && args.provider !== "google" && args.provider !== "openai") { + throw new Error( + "Reference images require a ref-capable provider. Use --provider google (Gemini multimodal) or --provider openai (GPT Image edits)." 
+    );
+  }
+
   if (args.provider) return args.provider;
 
   const hasGoogle = !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
   const hasOpenai = !!process.env.OPENAI_API_KEY;
   const hasDashscope = !!process.env.DASHSCOPE_API_KEY;
 
+  if (args.referenceImages.length > 0) {
+    if (hasGoogle) return "google";
+    if (hasOpenai) return "openai";
+    throw new Error(
+      "Reference images require Google or OpenAI. Set GOOGLE_API_KEY/GEMINI_API_KEY or OPENAI_API_KEY, or remove --ref."
+    );
+  }
+
   const available = [hasGoogle && "google", hasOpenai && "openai", hasDashscope && "dashscope"].filter(Boolean) as Provider[];
   if (available.length === 1) return available[0]!;
@@ -340,11 +354,34 @@
   );
 }
 
+async function validateReferenceImages(referenceImages: string[]): Promise<void> {
+  for (const refPath of referenceImages) {
+    const fullPath = path.resolve(refPath);
+    try {
+      await access(fullPath);
+    } catch {
+      throw new Error(`Reference image not found: ${fullPath}`);
+    }
+  }
+}
+
 type ProviderModule = {
   getDefaultModel: () => string;
   generateImage: (prompt: string, model: string, args: CliArgs) => Promise<Buffer>;
 };
 
+function isRetryableGenerationError(error: unknown): boolean {
+  const msg = error instanceof Error ?
error.message : String(error);
+  const nonRetryableMarkers = [
+    "Reference image",
+    "not supported",
+    "only supported",
+    "No API key found",
+    "is required",
+  ];
+  return !nonRetryableMarkers.some((marker) => msg.includes(marker));
+}
+
 async function loadProviderModule(provider: Provider): Promise<ProviderModule> {
   if (provider === "google") {
     return (await import("./providers/google")) as ProviderModule;
@@ -387,6 +424,10 @@ async function main(): Promise<void> {
     return;
   }
 
+  if (mergedArgs.referenceImages.length > 0) {
+    await validateReferenceImages(mergedArgs.referenceImages);
+  }
+
   const provider = detectProvider(mergedArgs);
   const providerModule = await loadProviderModule(provider);
 
@@ -408,7 +449,7 @@
       imageData = await providerModule.generateImage(prompt, model, mergedArgs);
       break;
     } catch (e) {
-      if (!retried) {
+      if (!retried && isRetryableGenerationError(e)) {
         retried = true;
         console.error("Generation failed, retrying...");
         continue;
diff --git a/skills/baoyu-image-gen/scripts/providers/dashscope.ts b/skills/baoyu-image-gen/scripts/providers/dashscope.ts
index aff55b5..607a947 100644
--- a/skills/baoyu-image-gen/scripts/providers/dashscope.ts
+++ b/skills/baoyu-image-gen/scripts/providers/dashscope.ts
@@ -58,7 +58,9 @@ export async function generateImage(
   if (!apiKey) throw new Error("DASHSCOPE_API_KEY is required");
 
   if (args.referenceImages.length > 0) {
-    console.error("Warning: Reference images not yet supported with DashScope, ignoring.");
+    throw new Error(
+      "Reference images are not supported with DashScope provider in baoyu-image-gen. Use --provider google with a Gemini multimodal model."
+    );
   }
 
   const size = args.size ?
normalizeSize(args.size) : getSizeFromAspectRatio(args.aspectRatio, args.quality); diff --git a/skills/baoyu-image-gen/scripts/providers/google.ts b/skills/baoyu-image-gen/scripts/providers/google.ts index fa17bfc..e1a2729 100644 --- a/skills/baoyu-image-gen/scripts/providers/google.ts +++ b/skills/baoyu-image-gen/scripts/providers/google.ts @@ -216,13 +216,17 @@ export async function generateImage( ): Promise { if (isGoogleImagen(model)) { if (args.referenceImages.length > 0) { - console.error("Warning: Reference images not supported with Imagen models, ignoring."); + throw new Error( + "Reference images are not supported with Imagen models. Use gemini-3-pro-image-preview or gemini-3-flash-preview." + ); } return generateWithImagen(prompt, model, args); } if (!isGoogleMultimodal(model) && args.referenceImages.length > 0) { - console.error("Warning: Reference images are only supported with Gemini multimodal models."); + throw new Error( + "Reference images are only supported with Gemini multimodal models. Use gemini-3-pro-image-preview or gemini-3-flash-preview." 
+ ); } return generateWithGemini(prompt, model, args); diff --git a/skills/baoyu-image-gen/scripts/providers/openai.ts b/skills/baoyu-image-gen/scripts/providers/openai.ts index f42a762..a721318 100644 --- a/skills/baoyu-image-gen/scripts/providers/openai.ts +++ b/skills/baoyu-image-gen/scripts/providers/openai.ts @@ -1,9 +1,13 @@ +import path from "node:path"; +import { readFile } from "node:fs/promises"; import type { CliArgs } from "../types"; export function getDefaultModel(): string { return process.env.OPENAI_IMAGE_MODEL || "gpt-image-1.5"; } +type OpenAIImageResponse = { data: Array<{ url?: string; b64_json?: string }> }; + function parseAspectRatio(ar: string): { width: number; height: number } | null { const match = ar.match(/^(\d+(?:\.\d+)?):(\d+(?:\.\d+)?)$/); if (!match) return null; @@ -66,20 +70,32 @@ export async function generateImage( if (!apiKey) throw new Error("OPENAI_API_KEY is required"); - if (args.referenceImages.length > 0) { - console.error("Warning: Reference images not supported with OpenAI, ignoring."); - } - const size = args.size || getOpenAISize(model, args.aspectRatio, args.quality); - const body: Record = { - model, - prompt, - size, - }; + if (args.referenceImages.length > 0) { + if (model.includes("dall-e-2") || model.includes("dall-e-3")) { + throw new Error( + "Reference images with OpenAI in this skill require GPT Image models. Use --model gpt-image-1.5 (or another gpt-image model)." + ); + } + return generateWithOpenAIEdits(baseURL, apiKey, prompt, model, size, args.referenceImages, args.quality); + } + + return generateWithOpenAIGenerations(baseURL, apiKey, prompt, model, size, args.quality); +} + +async function generateWithOpenAIGenerations( + baseURL: string, + apiKey: string, + prompt: string, + model: string, + size: string, + quality: CliArgs["quality"] +): Promise { + const body: Record = { model, prompt, size }; if (model.includes("dall-e-3")) { - body.quality = args.quality === "2k" ? 
"hd" : "standard"; + body.quality = quality === "2k" ? "hd" : "standard"; } const res = await fetch(`${baseURL}/images/generations`, { @@ -96,7 +112,62 @@ export async function generateImage( throw new Error(`OpenAI API error: ${err}`); } - const result = (await res.json()) as { data: Array<{ url?: string; b64_json?: string }> }; + const result = (await res.json()) as OpenAIImageResponse; + return extractImageFromResponse(result); +} + +async function generateWithOpenAIEdits( + baseURL: string, + apiKey: string, + prompt: string, + model: string, + size: string, + referenceImages: string[], + quality: CliArgs["quality"] +): Promise { + const form = new FormData(); + form.append("model", model); + form.append("prompt", prompt); + form.append("size", size); + + if (model.includes("gpt-image")) { + form.append("quality", quality === "2k" ? "high" : "medium"); + } + + for (const refPath of referenceImages) { + const bytes = await readFile(refPath); + const filename = path.basename(refPath); + const mimeType = getMimeType(filename); + const blob = new Blob([bytes], { type: mimeType }); + form.append("image[]", blob, filename); + } + + const res = await fetch(`${baseURL}/images/edits`, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + }, + body: form, + }); + + if (!res.ok) { + const err = await res.text(); + throw new Error(`OpenAI edits API error: ${err}`); + } + + const result = (await res.json()) as OpenAIImageResponse; + return extractImageFromResponse(result); +} + +function getMimeType(filename: string): string { + const ext = path.extname(filename).toLowerCase(); + if (ext === ".jpg" || ext === ".jpeg") return "image/jpeg"; + if (ext === ".webp") return "image/webp"; + if (ext === ".gif") return "image/gif"; + return "image/png"; +} + +async function extractImageFromResponse(result: OpenAIImageResponse): Promise { const img = result.data[0]; if (img?.b64_json) {