feat(baoyu-image-gen): add OpenAI GPT Image edits support for reference images

- Support --ref with OpenAI GPT Image models (gpt-image-1.5)
- Auto-select Google or OpenAI when --ref provided
- Change ref-related warnings to explicit errors with fix hints
- Add reference image validation before generation
- Improve retry logic to skip non-retryable errors
This commit is contained in:
Jim Liu 宝玉 2026-02-06 16:06:58 -06:00
parent 7f80100b3e
commit 86a84739e8
5 changed files with 146 additions and 24 deletions

View File

@ -60,9 +60,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --quali
# From prompt files
npx -y bun ${SKILL_DIR}/scripts/main.ts --promptfiles system.md content.md --image out.png
# With reference images (Google multimodal only)
# With reference images (Google multimodal or OpenAI edits)
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --ref source.png
# With reference images (explicit provider/model)
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --provider google --model gemini-3-pro-image-preview --ref source.png
# Specific provider
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provider openai
@ -78,12 +81,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou
| `--promptfiles <files...>` | Read prompt from files (concatenated) |
| `--image <path>` | Output image path (required) |
| `--provider google\|openai\|dashscope` | Force provider (default: google) |
| `--model <id>`, `-m` | Model ID |
| `--model <id>`, `-m` | Model ID (`--ref` with OpenAI requires GPT Image model, e.g. `gpt-image-1.5`) |
| `--ar <ratio>` | Aspect ratio (e.g., `16:9`, `1:1`, `4:3`) |
| `--size <WxH>` | Size (e.g., `1024x1024`) |
| `--quality normal\|2k` | Quality preset (default: 2k) |
| `--imageSize 1K\|2K\|4K` | Image size for Google (default: from quality) |
| `--ref <files...>` | Reference images (Google multimodal only) |
| `--ref <files...>` | Reference images. Supported by Google multimodal and OpenAI edits (GPT Image models). If provider omitted: Google first, then OpenAI |
| `--n <count>` | Number of images |
| `--json` | JSON output |
@ -105,9 +108,10 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou
## Provider Selection
1. `--provider` specified → use it
2. Only one API key available → use that provider
3. Multiple available → default to Google
1. `--ref` provided + no `--provider` → auto-select Google first, then OpenAI
2. `--provider` specified → use it (if `--ref`, must be `google` or `openai`)
3. Only one API key available → use that provider
4. Multiple available → default to Google
## Quality Presets
@ -157,7 +161,7 @@ Supported: `1:1`, `16:9`, `9:16`, `4:3`, `3:4`, `2.35:1`
- Missing API key → error with setup instructions
- Generation failure → auto-retry once
- Invalid aspect ratio → warning, proceed with default
- Reference images with non-multimodal model → warning, ignore refs
- Reference images with unsupported provider/model → error with fix hint (switch to Google multimodal or OpenAI GPT Image edits)
## Extension Support

View File

@ -1,7 +1,7 @@
import path from "node:path";
import process from "node:process";
import { homedir } from "node:os";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import { access, mkdir, readFile, writeFile } from "node:fs/promises";
import type { CliArgs, Provider, ExtendConfig } from "./types";
function printUsage(): void {
@ -20,7 +20,7 @@ Options:
--size <WxH> Size (e.g., 1024x1024)
--quality normal|2k Quality preset (default: 2k)
--imageSize 1K|2K|4K Image size for Google (default: from quality)
--ref <files...> Reference images (Google multimodal only)
--ref <files...> Reference images (Google multimodal or OpenAI edits)
--n <count> Number of images (default: 1)
--json JSON output
-h, --help Show help
@ -323,12 +323,26 @@ function normalizeOutputImagePath(p: string): string {
}
function detectProvider(args: CliArgs): Provider {
if (args.referenceImages.length > 0 && args.provider && args.provider !== "google" && args.provider !== "openai") {
throw new Error(
"Reference images require a ref-capable provider. Use --provider google (Gemini multimodal) or --provider openai (GPT Image edits)."
);
}
if (args.provider) return args.provider;
const hasGoogle = !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
const hasOpenai = !!process.env.OPENAI_API_KEY;
const hasDashscope = !!process.env.DASHSCOPE_API_KEY;
if (args.referenceImages.length > 0) {
if (hasGoogle) return "google";
if (hasOpenai) return "openai";
throw new Error(
"Reference images require Google or OpenAI. Set GOOGLE_API_KEY/GEMINI_API_KEY or OPENAI_API_KEY, or remove --ref."
);
}
const available = [hasGoogle && "google", hasOpenai && "openai", hasDashscope && "dashscope"].filter(Boolean) as Provider[];
if (available.length === 1) return available[0]!;
@ -340,11 +354,34 @@ function detectProvider(args: CliArgs): Provider {
);
}
/**
 * Verify that every reference image path exists on disk before any
 * generation work (or provider selection side effects) happens.
 *
 * Each path is resolved to an absolute path so the error message is
 * unambiguous regardless of the caller's working directory. The checks
 * are independent, so they run in parallel via Promise.all instead of
 * awaiting sequentially in a loop.
 *
 * @param referenceImages paths passed via --ref (may be relative)
 * @throws Error naming the absolute path of a missing file
 */
async function validateReferenceImages(referenceImages: string[]): Promise<void> {
  await Promise.all(
    referenceImages.map(async (refPath) => {
      const fullPath = path.resolve(refPath);
      try {
        await access(fullPath);
      } catch {
        throw new Error(`Reference image not found: ${fullPath}`);
      }
    })
  );
}
/**
 * Shape of a dynamically imported provider adapter module
 * (google / openai / dashscope): a default-model lookup plus the
 * image-generation entry point returning raw image bytes.
 */
type ProviderModule = {
  getDefaultModel: () => string;
  generateImage: (prompt: string, model: string, args: CliArgs) => Promise<Uint8Array>;
};
/**
 * Decide whether a failed generation attempt should be retried.
 *
 * Configuration problems — unsupported reference images, missing API
 * keys, model/provider mismatches — will fail identically on a second
 * attempt, so any error whose message contains one of the known
 * permanent-failure markers is treated as non-retryable. Everything
 * else (e.g. transient network or API errors) is retryable.
 */
function isRetryableGenerationError(error: unknown): boolean {
  const message = error instanceof Error ? error.message : String(error);
  const permanentMarkers = [
    "Reference image",
    "not supported",
    "only supported",
    "No API key found",
    "is required",
  ];
  for (const marker of permanentMarkers) {
    if (message.includes(marker)) return false;
  }
  return true;
}
async function loadProviderModule(provider: Provider): Promise<ProviderModule> {
if (provider === "google") {
return (await import("./providers/google")) as ProviderModule;
@ -387,6 +424,10 @@ async function main(): Promise<void> {
return;
}
if (mergedArgs.referenceImages.length > 0) {
await validateReferenceImages(mergedArgs.referenceImages);
}
const provider = detectProvider(mergedArgs);
const providerModule = await loadProviderModule(provider);
@ -408,7 +449,7 @@ async function main(): Promise<void> {
imageData = await providerModule.generateImage(prompt, model, mergedArgs);
break;
} catch (e) {
if (!retried) {
if (!retried && isRetryableGenerationError(e)) {
retried = true;
console.error("Generation failed, retrying...");
continue;

View File

@ -58,7 +58,9 @@ export async function generateImage(
if (!apiKey) throw new Error("DASHSCOPE_API_KEY is required");
if (args.referenceImages.length > 0) {
console.error("Warning: Reference images not yet supported with DashScope, ignoring.");
throw new Error(
"Reference images are not supported with DashScope provider in baoyu-image-gen. Use --provider google with a Gemini multimodal model."
);
}
const size = args.size ? normalizeSize(args.size) : getSizeFromAspectRatio(args.aspectRatio, args.quality);

View File

@ -216,13 +216,17 @@ export async function generateImage(
): Promise<Uint8Array> {
if (isGoogleImagen(model)) {
if (args.referenceImages.length > 0) {
console.error("Warning: Reference images not supported with Imagen models, ignoring.");
throw new Error(
"Reference images are not supported with Imagen models. Use gemini-3-pro-image-preview or gemini-3-flash-preview."
);
}
return generateWithImagen(prompt, model, args);
}
if (!isGoogleMultimodal(model) && args.referenceImages.length > 0) {
console.error("Warning: Reference images are only supported with Gemini multimodal models.");
throw new Error(
"Reference images are only supported with Gemini multimodal models. Use gemini-3-pro-image-preview or gemini-3-flash-preview."
);
}
return generateWithGemini(prompt, model, args);

View File

@ -1,9 +1,13 @@
import path from "node:path";
import { readFile } from "node:fs/promises";
import type { CliArgs } from "../types";
/**
 * Default OpenAI image model for this provider.
 * The OPENAI_IMAGE_MODEL environment variable overrides it; an unset
 * or empty value falls back to "gpt-image-1.5".
 */
export function getDefaultModel(): string {
  const override = process.env.OPENAI_IMAGE_MODEL;
  return override ? override : "gpt-image-1.5";
}
type OpenAIImageResponse = { data: Array<{ url?: string; b64_json?: string }> };
function parseAspectRatio(ar: string): { width: number; height: number } | null {
const match = ar.match(/^(\d+(?:\.\d+)?):(\d+(?:\.\d+)?)$/);
if (!match) return null;
@ -66,20 +70,32 @@ export async function generateImage(
if (!apiKey) throw new Error("OPENAI_API_KEY is required");
if (args.referenceImages.length > 0) {
console.error("Warning: Reference images not supported with OpenAI, ignoring.");
}
const size = args.size || getOpenAISize(model, args.aspectRatio, args.quality);
const body: Record<string, any> = {
model,
prompt,
size,
};
if (args.referenceImages.length > 0) {
if (model.includes("dall-e-2") || model.includes("dall-e-3")) {
throw new Error(
"Reference images with OpenAI in this skill require GPT Image models. Use --model gpt-image-1.5 (or another gpt-image model)."
);
}
return generateWithOpenAIEdits(baseURL, apiKey, prompt, model, size, args.referenceImages, args.quality);
}
return generateWithOpenAIGenerations(baseURL, apiKey, prompt, model, size, args.quality);
}
async function generateWithOpenAIGenerations(
baseURL: string,
apiKey: string,
prompt: string,
model: string,
size: string,
quality: CliArgs["quality"]
): Promise<Uint8Array> {
const body: Record<string, any> = { model, prompt, size };
if (model.includes("dall-e-3")) {
body.quality = args.quality === "2k" ? "hd" : "standard";
body.quality = quality === "2k" ? "hd" : "standard";
}
const res = await fetch(`${baseURL}/images/generations`, {
@ -96,7 +112,62 @@ export async function generateImage(
throw new Error(`OpenAI API error: ${err}`);
}
const result = (await res.json()) as { data: Array<{ url?: string; b64_json?: string }> };
const result = (await res.json()) as OpenAIImageResponse;
return extractImageFromResponse(result);
}
async function generateWithOpenAIEdits(
baseURL: string,
apiKey: string,
prompt: string,
model: string,
size: string,
referenceImages: string[],
quality: CliArgs["quality"]
): Promise<Uint8Array> {
const form = new FormData();
form.append("model", model);
form.append("prompt", prompt);
form.append("size", size);
if (model.includes("gpt-image")) {
form.append("quality", quality === "2k" ? "high" : "medium");
}
for (const refPath of referenceImages) {
const bytes = await readFile(refPath);
const filename = path.basename(refPath);
const mimeType = getMimeType(filename);
const blob = new Blob([bytes], { type: mimeType });
form.append("image[]", blob, filename);
}
const res = await fetch(`${baseURL}/images/edits`, {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
},
body: form,
});
if (!res.ok) {
const err = await res.text();
throw new Error(`OpenAI edits API error: ${err}`);
}
const result = (await res.json()) as OpenAIImageResponse;
return extractImageFromResponse(result);
}
/**
 * Map a filename's extension (case-insensitive) to an image MIME type.
 * Unrecognized or missing extensions default to "image/png".
 */
function getMimeType(filename: string): string {
  const mimeByExt: Record<string, string> = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
    ".gif": "image/gif",
  };
  return mimeByExt[path.extname(filename).toLowerCase()] ?? "image/png";
}
async function extractImageFromResponse(result: OpenAIImageResponse): Promise<Uint8Array> {
const img = result.data[0];
if (img?.b64_json) {