From 86a84739e886460e263fedd521f17dc8a7b6e319 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jim=20Liu=20=E5=AE=9D=E7=8E=89?= Date: Fri, 6 Feb 2026 16:06:58 -0600 Subject: [PATCH] feat(baoyu-image-gen): add OpenAI GPT Image edits support for reference images - Support --ref with OpenAI GPT Image models (gpt-image-1.5) - Auto-select Google or OpenAI when --ref provided - Change ref-related warnings to explicit errors with fix hints - Add reference image validation before generation - Improve retry logic to skip non-retryable errors --- skills/baoyu-image-gen/SKILL.md | 18 ++-- skills/baoyu-image-gen/scripts/main.ts | 47 +++++++++- .../scripts/providers/dashscope.ts | 4 +- .../scripts/providers/google.ts | 8 +- .../scripts/providers/openai.ts | 93 ++++++++++++++++--- 5 files changed, 146 insertions(+), 24 deletions(-) diff --git a/skills/baoyu-image-gen/SKILL.md b/skills/baoyu-image-gen/SKILL.md index dd04a24..1a900c4 100644 --- a/skills/baoyu-image-gen/SKILL.md +++ b/skills/baoyu-image-gen/SKILL.md @@ -60,9 +60,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --quali # From prompt files npx -y bun ${SKILL_DIR}/scripts/main.ts --promptfiles system.md content.md --image out.png -# With reference images (Google multimodal only) +# With reference images (Google multimodal or OpenAI edits) npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --ref source.png +# With reference images (explicit provider/model) +npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --provider google --model gemini-3-pro-image-preview --ref source.png + # Specific provider npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provider openai @@ -78,12 +81,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou | `--promptfiles ` | Read prompt from files (concatenated) | | `--image ` | Output image path (required) | | `--provider google\|openai\|dashscope` | Force provider (default: 
google) | -| `--model `, `-m` | Model ID | +| `--model `, `-m` | Model ID (`--ref` with OpenAI requires GPT Image model, e.g. `gpt-image-1.5`) | | `--ar ` | Aspect ratio (e.g., `16:9`, `1:1`, `4:3`) | | `--size ` | Size (e.g., `1024x1024`) | | `--quality normal\|2k` | Quality preset (default: 2k) | | `--imageSize 1K\|2K\|4K` | Image size for Google (default: from quality) | -| `--ref ` | Reference images (Google multimodal only) | +| `--ref ` | Reference images. Supported by Google multimodal and OpenAI edits (GPT Image models). If provider omitted: Google first, then OpenAI | | `--n ` | Number of images | | `--json` | JSON output | @@ -105,9 +108,10 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou ## Provider Selection -1. `--provider` specified → use it -2. Only one API key available → use that provider -3. Multiple available → default to Google +1. `--ref` provided + no `--provider` → auto-select Google first, then OpenAI +2. `--provider` specified → use it (if `--ref`, must be `google` or `openai`) +3. Only one API key available → use that provider +4. 
Multiple available → default to Google ## Quality Presets @@ -157,7 +161,7 @@ Supported: `1:1`, `16:9`, `9:16`, `4:3`, `3:4`, `2.35:1` - Missing API key → error with setup instructions - Generation failure → auto-retry once - Invalid aspect ratio → warning, proceed with default -- Reference images with non-multimodal model → warning, ignore refs +- Reference images with unsupported provider/model → error with fix hint (switch to Google multimodal or OpenAI GPT Image edits) ## Extension Support diff --git a/skills/baoyu-image-gen/scripts/main.ts b/skills/baoyu-image-gen/scripts/main.ts index d24469b..ce0620c 100644 --- a/skills/baoyu-image-gen/scripts/main.ts +++ b/skills/baoyu-image-gen/scripts/main.ts @@ -1,7 +1,7 @@ import path from "node:path"; import process from "node:process"; import { homedir } from "node:os"; -import { mkdir, readFile, writeFile } from "node:fs/promises"; +import { access, mkdir, readFile, writeFile } from "node:fs/promises"; import type { CliArgs, Provider, ExtendConfig } from "./types"; function printUsage(): void { @@ -20,7 +20,7 @@ Options: --size Size (e.g., 1024x1024) --quality normal|2k Quality preset (default: 2k) --imageSize 1K|2K|4K Image size for Google (default: from quality) - --ref Reference images (Google multimodal only) + --ref Reference images (Google multimodal or OpenAI edits) --n Number of images (default: 1) --json JSON output -h, --help Show help @@ -323,12 +323,26 @@ function normalizeOutputImagePath(p: string): string { } function detectProvider(args: CliArgs): Provider { + if (args.referenceImages.length > 0 && args.provider && args.provider !== "google" && args.provider !== "openai") { + throw new Error( + "Reference images require a ref-capable provider. Use --provider google (Gemini multimodal) or --provider openai (GPT Image edits)." 
+    );
+  }
+
   if (args.provider) return args.provider;
 
   const hasGoogle = !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
   const hasOpenai = !!process.env.OPENAI_API_KEY;
   const hasDashscope = !!process.env.DASHSCOPE_API_KEY;
 
+  if (args.referenceImages.length > 0) {
+    if (hasGoogle) return "google";
+    if (hasOpenai) return "openai";
+    throw new Error(
+      "Reference images require Google or OpenAI. Set GOOGLE_API_KEY/GEMINI_API_KEY or OPENAI_API_KEY, or remove --ref."
+    );
+  }
+
   const available = [hasGoogle && "google", hasOpenai && "openai", hasDashscope && "dashscope"].filter(Boolean) as Provider[];
   if (available.length === 1) return available[0]!;
@@ -340,11 +354,34 @@
   );
 }
 
+async function validateReferenceImages(referenceImages: string[]): Promise<void> {
+  for (const refPath of referenceImages) {
+    const fullPath = path.resolve(refPath);
+    try {
+      await access(fullPath);
+    } catch {
+      throw new Error(`Reference image not found: ${fullPath}`);
+    }
+  }
+}
+
 type ProviderModule = {
   getDefaultModel: () => string;
   generateImage: (prompt: string, model: string, args: CliArgs) => Promise<Buffer>;
 };
 
+function isRetryableGenerationError(error: unknown): boolean {
+  const msg = error instanceof Error ?
error.message : String(error);
+  const nonRetryableMarkers = [
+    "Reference image",
+    "not supported",
+    "only supported",
+    "No API key found",
+    "is required",
+  ];
+  return !nonRetryableMarkers.some((marker) => msg.includes(marker));
+}
+
 async function loadProviderModule(provider: Provider): Promise<ProviderModule> {
   if (provider === "google") {
     return (await import("./providers/google")) as ProviderModule;
@@ -387,6 +424,10 @@ async function main(): Promise<void> {
     return;
   }
 
+  if (mergedArgs.referenceImages.length > 0) {
+    await validateReferenceImages(mergedArgs.referenceImages);
+  }
+
   const provider = detectProvider(mergedArgs);
   const providerModule = await loadProviderModule(provider);
 
@@ -408,7 +449,7 @@
       imageData = await providerModule.generateImage(prompt, model, mergedArgs);
       break;
     } catch (e) {
-      if (!retried) {
+      if (!retried && isRetryableGenerationError(e)) {
         retried = true;
         console.error("Generation failed, retrying...");
         continue;
diff --git a/skills/baoyu-image-gen/scripts/providers/dashscope.ts b/skills/baoyu-image-gen/scripts/providers/dashscope.ts
index aff55b5..607a947 100644
--- a/skills/baoyu-image-gen/scripts/providers/dashscope.ts
+++ b/skills/baoyu-image-gen/scripts/providers/dashscope.ts
@@ -58,7 +58,9 @@ export async function generateImage(
   if (!apiKey) throw new Error("DASHSCOPE_API_KEY is required");
 
   if (args.referenceImages.length > 0) {
-    console.error("Warning: Reference images not yet supported with DashScope, ignoring.");
+    throw new Error(
+      "Reference images are not supported with DashScope provider in baoyu-image-gen. Use --provider google with a Gemini multimodal model."
+    );
   }
 
   const size = args.size ?
normalizeSize(args.size) : getSizeFromAspectRatio(args.aspectRatio, args.quality); diff --git a/skills/baoyu-image-gen/scripts/providers/google.ts b/skills/baoyu-image-gen/scripts/providers/google.ts index fa17bfc..e1a2729 100644 --- a/skills/baoyu-image-gen/scripts/providers/google.ts +++ b/skills/baoyu-image-gen/scripts/providers/google.ts @@ -216,13 +216,17 @@ export async function generateImage( ): Promise { if (isGoogleImagen(model)) { if (args.referenceImages.length > 0) { - console.error("Warning: Reference images not supported with Imagen models, ignoring."); + throw new Error( + "Reference images are not supported with Imagen models. Use gemini-3-pro-image-preview or gemini-3-flash-preview." + ); } return generateWithImagen(prompt, model, args); } if (!isGoogleMultimodal(model) && args.referenceImages.length > 0) { - console.error("Warning: Reference images are only supported with Gemini multimodal models."); + throw new Error( + "Reference images are only supported with Gemini multimodal models. Use gemini-3-pro-image-preview or gemini-3-flash-preview." 
+ ); } return generateWithGemini(prompt, model, args); diff --git a/skills/baoyu-image-gen/scripts/providers/openai.ts b/skills/baoyu-image-gen/scripts/providers/openai.ts index f42a762..a721318 100644 --- a/skills/baoyu-image-gen/scripts/providers/openai.ts +++ b/skills/baoyu-image-gen/scripts/providers/openai.ts @@ -1,9 +1,13 @@ +import path from "node:path"; +import { readFile } from "node:fs/promises"; import type { CliArgs } from "../types"; export function getDefaultModel(): string { return process.env.OPENAI_IMAGE_MODEL || "gpt-image-1.5"; } +type OpenAIImageResponse = { data: Array<{ url?: string; b64_json?: string }> }; + function parseAspectRatio(ar: string): { width: number; height: number } | null { const match = ar.match(/^(\d+(?:\.\d+)?):(\d+(?:\.\d+)?)$/); if (!match) return null; @@ -66,20 +70,32 @@ export async function generateImage( if (!apiKey) throw new Error("OPENAI_API_KEY is required"); - if (args.referenceImages.length > 0) { - console.error("Warning: Reference images not supported with OpenAI, ignoring."); - } - const size = args.size || getOpenAISize(model, args.aspectRatio, args.quality); - const body: Record = { - model, - prompt, - size, - }; + if (args.referenceImages.length > 0) { + if (model.includes("dall-e-2") || model.includes("dall-e-3")) { + throw new Error( + "Reference images with OpenAI in this skill require GPT Image models. Use --model gpt-image-1.5 (or another gpt-image model)." + ); + } + return generateWithOpenAIEdits(baseURL, apiKey, prompt, model, size, args.referenceImages, args.quality); + } + + return generateWithOpenAIGenerations(baseURL, apiKey, prompt, model, size, args.quality); +} + +async function generateWithOpenAIGenerations( + baseURL: string, + apiKey: string, + prompt: string, + model: string, + size: string, + quality: CliArgs["quality"] +): Promise { + const body: Record = { model, prompt, size }; if (model.includes("dall-e-3")) { - body.quality = args.quality === "2k" ? 
"hd" : "standard"; + body.quality = quality === "2k" ? "hd" : "standard"; } const res = await fetch(`${baseURL}/images/generations`, { @@ -96,7 +112,62 @@ export async function generateImage( throw new Error(`OpenAI API error: ${err}`); } - const result = (await res.json()) as { data: Array<{ url?: string; b64_json?: string }> }; + const result = (await res.json()) as OpenAIImageResponse; + return extractImageFromResponse(result); +} + +async function generateWithOpenAIEdits( + baseURL: string, + apiKey: string, + prompt: string, + model: string, + size: string, + referenceImages: string[], + quality: CliArgs["quality"] +): Promise { + const form = new FormData(); + form.append("model", model); + form.append("prompt", prompt); + form.append("size", size); + + if (model.includes("gpt-image")) { + form.append("quality", quality === "2k" ? "high" : "medium"); + } + + for (const refPath of referenceImages) { + const bytes = await readFile(refPath); + const filename = path.basename(refPath); + const mimeType = getMimeType(filename); + const blob = new Blob([bytes], { type: mimeType }); + form.append("image[]", blob, filename); + } + + const res = await fetch(`${baseURL}/images/edits`, { + method: "POST", + headers: { + Authorization: `Bearer ${apiKey}`, + }, + body: form, + }); + + if (!res.ok) { + const err = await res.text(); + throw new Error(`OpenAI edits API error: ${err}`); + } + + const result = (await res.json()) as OpenAIImageResponse; + return extractImageFromResponse(result); +} + +function getMimeType(filename: string): string { + const ext = path.extname(filename).toLowerCase(); + if (ext === ".jpg" || ext === ".jpeg") return "image/jpeg"; + if (ext === ".webp") return "image/webp"; + if (ext === ".gif") return "image/gif"; + return "image/png"; +} + +async function extractImageFromResponse(result: OpenAIImageResponse): Promise { const img = result.data[0]; if (img?.b64_json) {