feat(baoyu-image-gen): add OpenAI GPT Image edits support for reference images

- Support --ref with OpenAI GPT Image models (gpt-image-1.5)
- Auto-select Google or OpenAI when --ref provided
- Change ref-related warnings to explicit errors with fix hints
- Add reference image validation before generation
- Improve retry logic to skip non-retryable errors
This commit is contained in:
Jim Liu 宝玉 2026-02-06 16:06:58 -06:00
parent 7f80100b3e
commit 86a84739e8
5 changed files with 146 additions and 24 deletions

View File

@ -60,9 +60,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --quali
# From prompt files
npx -y bun ${SKILL_DIR}/scripts/main.ts --promptfiles system.md content.md --image out.png
# With reference images (Google multimodal only)
# With reference images (Google multimodal or OpenAI edits)
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --ref source.png
# With reference images (explicit provider/model)
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --provider google --model gemini-3-pro-image-preview --ref source.png
# Specific provider
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provider openai
@ -78,12 +81,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou
| `--promptfiles <files...>` | Read prompt from files (concatenated) |
| `--image <path>` | Output image path (required) |
| `--provider google\|openai\|dashscope` | Force provider (default: google) |
| `--model <id>`, `-m` | Model ID |
| `--model <id>`, `-m` | Model ID (`--ref` with OpenAI requires GPT Image model, e.g. `gpt-image-1.5`) |
| `--ar <ratio>` | Aspect ratio (e.g., `16:9`, `1:1`, `4:3`) |
| `--size <WxH>` | Size (e.g., `1024x1024`) |
| `--quality normal\|2k` | Quality preset (default: 2k) |
| `--imageSize 1K\|2K\|4K` | Image size for Google (default: from quality) |
| `--ref <files...>` | Reference images (Google multimodal only) |
| `--ref <files...>` | Reference images. Supported by Google multimodal and OpenAI edits (GPT Image models). If provider omitted: Google first, then OpenAI |
| `--n <count>` | Number of images |
| `--json` | JSON output |
@ -105,9 +108,10 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou
## Provider Selection
1. `--provider` specified → use it
2. Only one API key available → use that provider
3. Multiple available → default to Google
1. `--ref` provided + no `--provider` → auto-select Google first, then OpenAI
2. `--provider` specified → use it (if `--ref`, must be `google` or `openai`)
3. Only one API key available → use that provider
4. Multiple available → default to Google
## Quality Presets
@ -157,7 +161,7 @@ Supported: `1:1`, `16:9`, `9:16`, `4:3`, `3:4`, `2.35:1`
- Missing API key → error with setup instructions
- Generation failure → auto-retry once
- Invalid aspect ratio → warning, proceed with default
- Reference images with non-multimodal model → warning, ignore refs
- Reference images with unsupported provider/model → error with fix hint (switch to Google multimodal or OpenAI GPT Image edits)
## Extension Support

View File

@ -1,7 +1,7 @@
import path from "node:path";
import process from "node:process";
import { homedir } from "node:os";
import { mkdir, readFile, writeFile } from "node:fs/promises";
import { access, mkdir, readFile, writeFile } from "node:fs/promises";
import type { CliArgs, Provider, ExtendConfig } from "./types";
function printUsage(): void {
@ -20,7 +20,7 @@ Options:
--size <WxH> Size (e.g., 1024x1024)
--quality normal|2k Quality preset (default: 2k)
--imageSize 1K|2K|4K Image size for Google (default: from quality)
--ref <files...> Reference images (Google multimodal only)
--ref <files...> Reference images (Google multimodal or OpenAI edits)
--n <count> Number of images (default: 1)
--json JSON output
-h, --help Show help
@ -323,12 +323,26 @@ function normalizeOutputImagePath(p: string): string {
}
function detectProvider(args: CliArgs): Provider {
if (args.referenceImages.length > 0 && args.provider && args.provider !== "google" && args.provider !== "openai") {
throw new Error(
"Reference images require a ref-capable provider. Use --provider google (Gemini multimodal) or --provider openai (GPT Image edits)."
);
}
if (args.provider) return args.provider;
const hasGoogle = !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
const hasOpenai = !!process.env.OPENAI_API_KEY;
const hasDashscope = !!process.env.DASHSCOPE_API_KEY;
if (args.referenceImages.length > 0) {
if (hasGoogle) return "google";
if (hasOpenai) return "openai";
throw new Error(
"Reference images require Google or OpenAI. Set GOOGLE_API_KEY/GEMINI_API_KEY or OPENAI_API_KEY, or remove --ref."
);
}
const available = [hasGoogle && "google", hasOpenai && "openai", hasDashscope && "dashscope"].filter(Boolean) as Provider[];
if (available.length === 1) return available[0]!;
@ -340,11 +354,34 @@ function detectProvider(args: CliArgs): Provider {
);
}
/**
 * Verify that every reference image path exists on disk before any
 * generation work (or provider selection side effects) happens.
 *
 * Each path is resolved to an absolute path so the error message is
 * unambiguous regardless of the caller's working directory. The checks
 * are independent, so they run in parallel via Promise.all instead of
 * awaiting sequentially in a loop.
 *
 * @param referenceImages paths passed via --ref (may be relative)
 * @throws Error naming the absolute path of a missing file
 */
async function validateReferenceImages(referenceImages: string[]): Promise<void> {
  await Promise.all(
    referenceImages.map(async (refPath) => {
      const fullPath = path.resolve(refPath);
      try {
        await access(fullPath);
      } catch {
        throw new Error(`Reference image not found: ${fullPath}`);
      }
    })
  );
}
/**
 * Shape of a dynamically imported provider adapter module
 * (google / openai / dashscope): a default-model lookup plus the
 * image-generation entry point returning raw image bytes.
 */
type ProviderModule = {
  getDefaultModel: () => string;
  generateImage: (prompt: string, model: string, args: CliArgs) => Promise<Uint8Array>;
};
/**
 * Decide whether a failed generation attempt should be retried.
 *
 * Configuration problems — unsupported reference images, missing API
 * keys, model/provider mismatches — will fail identically on a second
 * attempt, so any error whose message contains one of the known
 * permanent-failure markers is treated as non-retryable. Everything
 * else (e.g. transient network or API errors) is retryable.
 */
function isRetryableGenerationError(error: unknown): boolean {
  const message = error instanceof Error ? error.message : String(error);
  const permanentMarkers = [
    "Reference image",
    "not supported",
    "only supported",
    "No API key found",
    "is required",
  ];
  for (const marker of permanentMarkers) {
    if (message.includes(marker)) return false;
  }
  return true;
}
async function loadProviderModule(provider: Provider): Promise<ProviderModule> {
if (provider === "google") {
return (await import("./providers/google")) as ProviderModule;
@ -387,6 +424,10 @@ async function main(): Promise<void> {
return;
}
if (mergedArgs.referenceImages.length > 0) {
await validateReferenceImages(mergedArgs.referenceImages);
}
const provider = detectProvider(mergedArgs);
const providerModule = await loadProviderModule(provider);
@ -408,7 +449,7 @@ async function main(): Promise<void> {
imageData = await providerModule.generateImage(prompt, model, mergedArgs);
break;
} catch (e) {
if (!retried) {
if (!retried && isRetryableGenerationError(e)) {
retried = true;
console.error("Generation failed, retrying...");
continue;

View File

@ -58,7 +58,9 @@ export async function generateImage(
if (!apiKey) throw new Error("DASHSCOPE_API_KEY is required");
if (args.referenceImages.length > 0) {
console.error("Warning: Reference images not yet supported with DashScope, ignoring.");
throw new Error(
"Reference images are not supported with DashScope provider in baoyu-image-gen. Use --provider google with a Gemini multimodal model."
);
}
const size = args.size ? normalizeSize(args.size) : getSizeFromAspectRatio(args.aspectRatio, args.quality);

View File

@ -216,13 +216,17 @@ export async function generateImage(
): Promise<Uint8Array> {
if (isGoogleImagen(model)) {
if (args.referenceImages.length > 0) {
console.error("Warning: Reference images not supported with Imagen models, ignoring.");
throw new Error(
"Reference images are not supported with Imagen models. Use gemini-3-pro-image-preview or gemini-3-flash-preview."
);
}
return generateWithImagen(prompt, model, args);
}
if (!isGoogleMultimodal(model) && args.referenceImages.length > 0) {
console.error("Warning: Reference images are only supported with Gemini multimodal models.");
throw new Error(
"Reference images are only supported with Gemini multimodal models. Use gemini-3-pro-image-preview or gemini-3-flash-preview."
);
}
return generateWithGemini(prompt, model, args);

View File

@ -1,9 +1,13 @@
import path from "node:path";
import { readFile } from "node:fs/promises";
import type { CliArgs } from "../types";
/**
 * Default OpenAI image model for this provider.
 * The OPENAI_IMAGE_MODEL environment variable overrides it; an unset
 * or empty value falls back to "gpt-image-1.5".
 */
export function getDefaultModel(): string {
  const override = process.env.OPENAI_IMAGE_MODEL;
  return override ? override : "gpt-image-1.5";
}
type OpenAIImageResponse = { data: Array<{ url?: string; b64_json?: string }> };
function parseAspectRatio(ar: string): { width: number; height: number } | null {
const match = ar.match(/^(\d+(?:\.\d+)?):(\d+(?:\.\d+)?)$/);
if (!match) return null;
@ -66,20 +70,32 @@ export async function generateImage(
if (!apiKey) throw new Error("OPENAI_API_KEY is required");
if (args.referenceImages.length > 0) {
console.error("Warning: Reference images not supported with OpenAI, ignoring.");
}
const size = args.size || getOpenAISize(model, args.aspectRatio, args.quality);
const body: Record<string, any> = {
model,
prompt,
size,
};
if (args.referenceImages.length > 0) {
if (model.includes("dall-e-2") || model.includes("dall-e-3")) {
throw new Error(
"Reference images with OpenAI in this skill require GPT Image models. Use --model gpt-image-1.5 (or another gpt-image model)."
);
}
return generateWithOpenAIEdits(baseURL, apiKey, prompt, model, size, args.referenceImages, args.quality);
}
return generateWithOpenAIGenerations(baseURL, apiKey, prompt, model, size, args.quality);
}
async function generateWithOpenAIGenerations(
baseURL: string,
apiKey: string,
prompt: string,
model: string,
size: string,
quality: CliArgs["quality"]
): Promise<Uint8Array> {
const body: Record<string, any> = { model, prompt, size };
if (model.includes("dall-e-3")) {
body.quality = args.quality === "2k" ? "hd" : "standard";
body.quality = quality === "2k" ? "hd" : "standard";
}
const res = await fetch(`${baseURL}/images/generations`, {
@ -96,7 +112,62 @@ export async function generateImage(
throw new Error(`OpenAI API error: ${err}`);
}
const result = (await res.json()) as { data: Array<{ url?: string; b64_json?: string }> };
const result = (await res.json()) as OpenAIImageResponse;
return extractImageFromResponse(result);
}
async function generateWithOpenAIEdits(
baseURL: string,
apiKey: string,
prompt: string,
model: string,
size: string,
referenceImages: string[],
quality: CliArgs["quality"]
): Promise<Uint8Array> {
const form = new FormData();
form.append("model", model);
form.append("prompt", prompt);
form.append("size", size);
if (model.includes("gpt-image")) {
form.append("quality", quality === "2k" ? "high" : "medium");
}
for (const refPath of referenceImages) {
const bytes = await readFile(refPath);
const filename = path.basename(refPath);
const mimeType = getMimeType(filename);
const blob = new Blob([bytes], { type: mimeType });
form.append("image[]", blob, filename);
}
const res = await fetch(`${baseURL}/images/edits`, {
method: "POST",
headers: {
Authorization: `Bearer ${apiKey}`,
},
body: form,
});
if (!res.ok) {
const err = await res.text();
throw new Error(`OpenAI edits API error: ${err}`);
}
const result = (await res.json()) as OpenAIImageResponse;
return extractImageFromResponse(result);
}
/**
 * Map a filename's extension (case-insensitive) to an image MIME type.
 * Unrecognized or missing extensions default to "image/png".
 */
function getMimeType(filename: string): string {
  const mimeByExt: Record<string, string> = {
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".webp": "image/webp",
    ".gif": "image/gif",
  };
  return mimeByExt[path.extname(filename).toLowerCase()] ?? "image/png";
}
async function extractImageFromResponse(result: OpenAIImageResponse): Promise<Uint8Array> {
const img = result.data[0];
if (img?.b64_json) {