feat(baoyu-image-gen): add OpenAI GPT Image edits support for reference images
- Support --ref with OpenAI GPT Image models (gpt-image-1.5)
- Auto-select Google or OpenAI when --ref provided
- Change ref-related warnings to explicit errors with fix hints
- Add reference image validation before generation
- Improve retry logic to skip non-retryable errors
This commit is contained in:
parent
7f80100b3e
commit
86a84739e8
|
|
@ -60,9 +60,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --quali
|
|||
# From prompt files
|
||||
npx -y bun ${SKILL_DIR}/scripts/main.ts --promptfiles system.md content.md --image out.png
|
||||
|
||||
# With reference images (Google multimodal only)
|
||||
# With reference images (Google multimodal or OpenAI edits)
|
||||
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --ref source.png
|
||||
|
||||
# With reference images (explicit provider/model)
|
||||
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --provider google --model gemini-3-pro-image-preview --ref source.png
|
||||
|
||||
# Specific provider
|
||||
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provider openai
|
||||
|
||||
|
|
@ -78,12 +81,12 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou
|
|||
| `--promptfiles <files...>` | Read prompt from files (concatenated) |
|
||||
| `--image <path>` | Output image path (required) |
|
||||
| `--provider google\|openai\|dashscope` | Force provider (default: google) |
|
||||
| `--model <id>`, `-m` | Model ID |
|
||||
| `--model <id>`, `-m` | Model ID (`--ref` with OpenAI requires GPT Image model, e.g. `gpt-image-1.5`) |
|
||||
| `--ar <ratio>` | Aspect ratio (e.g., `16:9`, `1:1`, `4:3`) |
|
||||
| `--size <WxH>` | Size (e.g., `1024x1024`) |
|
||||
| `--quality normal\|2k` | Quality preset (default: 2k) |
|
||||
| `--imageSize 1K\|2K\|4K` | Image size for Google (default: from quality) |
|
||||
| `--ref <files...>` | Reference images (Google multimodal only) |
|
||||
| `--ref <files...>` | Reference images. Supported by Google multimodal and OpenAI edits (GPT Image models). If provider omitted: Google first, then OpenAI |
|
||||
| `--n <count>` | Number of images |
|
||||
| `--json` | JSON output |
|
||||
|
||||
|
|
@ -105,9 +108,10 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image ou
|
|||
|
||||
## Provider Selection
|
||||
|
||||
1. `--provider` specified → use it
|
||||
2. Only one API key available → use that provider
|
||||
3. Multiple available → default to Google
|
||||
1. `--ref` provided + no `--provider` → auto-select Google first, then OpenAI
|
||||
2. `--provider` specified → use it (if `--ref`, must be `google` or `openai`)
|
||||
3. Only one API key available → use that provider
|
||||
4. Multiple available → default to Google
|
||||
|
||||
## Quality Presets
|
||||
|
||||
|
|
@ -157,7 +161,7 @@ Supported: `1:1`, `16:9`, `9:16`, `4:3`, `3:4`, `2.35:1`
|
|||
- Missing API key → error with setup instructions
|
||||
- Generation failure → auto-retry once
|
||||
- Invalid aspect ratio → warning, proceed with default
|
||||
- Reference images with non-multimodal model → warning, ignore refs
|
||||
- Reference images with unsupported provider/model → error with fix hint (switch to Google multimodal or OpenAI GPT Image edits)
|
||||
|
||||
## Extension Support
|
||||
|
||||
|
|
|
|||
|
|
@ -1,7 +1,7 @@
|
|||
import path from "node:path";
|
||||
import process from "node:process";
|
||||
import { homedir } from "node:os";
|
||||
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
||||
import { access, mkdir, readFile, writeFile } from "node:fs/promises";
|
||||
import type { CliArgs, Provider, ExtendConfig } from "./types";
|
||||
|
||||
function printUsage(): void {
|
||||
|
|
@ -20,7 +20,7 @@ Options:
|
|||
--size <WxH> Size (e.g., 1024x1024)
|
||||
--quality normal|2k Quality preset (default: 2k)
|
||||
--imageSize 1K|2K|4K Image size for Google (default: from quality)
|
||||
--ref <files...> Reference images (Google multimodal only)
|
||||
--ref <files...> Reference images (Google multimodal or OpenAI edits)
|
||||
--n <count> Number of images (default: 1)
|
||||
--json JSON output
|
||||
-h, --help Show help
|
||||
|
|
@ -323,12 +323,26 @@ function normalizeOutputImagePath(p: string): string {
|
|||
}
|
||||
|
||||
function detectProvider(args: CliArgs): Provider {
|
||||
if (args.referenceImages.length > 0 && args.provider && args.provider !== "google" && args.provider !== "openai") {
|
||||
throw new Error(
|
||||
"Reference images require a ref-capable provider. Use --provider google (Gemini multimodal) or --provider openai (GPT Image edits)."
|
||||
);
|
||||
}
|
||||
|
||||
if (args.provider) return args.provider;
|
||||
|
||||
const hasGoogle = !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
|
||||
const hasOpenai = !!process.env.OPENAI_API_KEY;
|
||||
const hasDashscope = !!process.env.DASHSCOPE_API_KEY;
|
||||
|
||||
if (args.referenceImages.length > 0) {
|
||||
if (hasGoogle) return "google";
|
||||
if (hasOpenai) return "openai";
|
||||
throw new Error(
|
||||
"Reference images require Google or OpenAI. Set GOOGLE_API_KEY/GEMINI_API_KEY or OPENAI_API_KEY, or remove --ref."
|
||||
);
|
||||
}
|
||||
|
||||
const available = [hasGoogle && "google", hasOpenai && "openai", hasDashscope && "dashscope"].filter(Boolean) as Provider[];
|
||||
|
||||
if (available.length === 1) return available[0]!;
|
||||
|
|
@ -340,11 +354,34 @@ function detectProvider(args: CliArgs): Provider {
|
|||
);
|
||||
}
|
||||
|
||||
async function validateReferenceImages(referenceImages: string[]): Promise<void> {
|
||||
for (const refPath of referenceImages) {
|
||||
const fullPath = path.resolve(refPath);
|
||||
try {
|
||||
await access(fullPath);
|
||||
} catch {
|
||||
throw new Error(`Reference image not found: ${fullPath}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
type ProviderModule = {
|
||||
getDefaultModel: () => string;
|
||||
generateImage: (prompt: string, model: string, args: CliArgs) => Promise<Uint8Array>;
|
||||
};
|
||||
|
||||
function isRetryableGenerationError(error: unknown): boolean {
|
||||
const msg = error instanceof Error ? error.message : String(error);
|
||||
const nonRetryableMarkers = [
|
||||
"Reference image",
|
||||
"not supported",
|
||||
"only supported",
|
||||
"No API key found",
|
||||
"is required",
|
||||
];
|
||||
return !nonRetryableMarkers.some((marker) => msg.includes(marker));
|
||||
}
|
||||
|
||||
async function loadProviderModule(provider: Provider): Promise<ProviderModule> {
|
||||
if (provider === "google") {
|
||||
return (await import("./providers/google")) as ProviderModule;
|
||||
|
|
@ -387,6 +424,10 @@ async function main(): Promise<void> {
|
|||
return;
|
||||
}
|
||||
|
||||
if (mergedArgs.referenceImages.length > 0) {
|
||||
await validateReferenceImages(mergedArgs.referenceImages);
|
||||
}
|
||||
|
||||
const provider = detectProvider(mergedArgs);
|
||||
const providerModule = await loadProviderModule(provider);
|
||||
|
||||
|
|
@ -408,7 +449,7 @@ async function main(): Promise<void> {
|
|||
imageData = await providerModule.generateImage(prompt, model, mergedArgs);
|
||||
break;
|
||||
} catch (e) {
|
||||
if (!retried) {
|
||||
if (!retried && isRetryableGenerationError(e)) {
|
||||
retried = true;
|
||||
console.error("Generation failed, retrying...");
|
||||
continue;
|
||||
|
|
|
|||
|
|
@ -58,7 +58,9 @@ export async function generateImage(
|
|||
if (!apiKey) throw new Error("DASHSCOPE_API_KEY is required");
|
||||
|
||||
if (args.referenceImages.length > 0) {
|
||||
console.error("Warning: Reference images not yet supported with DashScope, ignoring.");
|
||||
throw new Error(
|
||||
"Reference images are not supported with DashScope provider in baoyu-image-gen. Use --provider google with a Gemini multimodal model."
|
||||
);
|
||||
}
|
||||
|
||||
const size = args.size ? normalizeSize(args.size) : getSizeFromAspectRatio(args.aspectRatio, args.quality);
|
||||
|
|
|
|||
|
|
@ -216,13 +216,17 @@ export async function generateImage(
|
|||
): Promise<Uint8Array> {
|
||||
if (isGoogleImagen(model)) {
|
||||
if (args.referenceImages.length > 0) {
|
||||
console.error("Warning: Reference images not supported with Imagen models, ignoring.");
|
||||
throw new Error(
|
||||
"Reference images are not supported with Imagen models. Use gemini-3-pro-image-preview or gemini-3-flash-preview."
|
||||
);
|
||||
}
|
||||
return generateWithImagen(prompt, model, args);
|
||||
}
|
||||
|
||||
if (!isGoogleMultimodal(model) && args.referenceImages.length > 0) {
|
||||
console.error("Warning: Reference images are only supported with Gemini multimodal models.");
|
||||
throw new Error(
|
||||
"Reference images are only supported with Gemini multimodal models. Use gemini-3-pro-image-preview or gemini-3-flash-preview."
|
||||
);
|
||||
}
|
||||
|
||||
return generateWithGemini(prompt, model, args);
|
||||
|
|
|
|||
|
|
@ -1,9 +1,13 @@
|
|||
import path from "node:path";
|
||||
import { readFile } from "node:fs/promises";
|
||||
import type { CliArgs } from "../types";
|
||||
|
||||
export function getDefaultModel(): string {
|
||||
return process.env.OPENAI_IMAGE_MODEL || "gpt-image-1.5";
|
||||
}
|
||||
|
||||
type OpenAIImageResponse = { data: Array<{ url?: string; b64_json?: string }> };
|
||||
|
||||
function parseAspectRatio(ar: string): { width: number; height: number } | null {
|
||||
const match = ar.match(/^(\d+(?:\.\d+)?):(\d+(?:\.\d+)?)$/);
|
||||
if (!match) return null;
|
||||
|
|
@ -66,20 +70,32 @@ export async function generateImage(
|
|||
|
||||
if (!apiKey) throw new Error("OPENAI_API_KEY is required");
|
||||
|
||||
if (args.referenceImages.length > 0) {
|
||||
console.error("Warning: Reference images not supported with OpenAI, ignoring.");
|
||||
}
|
||||
|
||||
const size = args.size || getOpenAISize(model, args.aspectRatio, args.quality);
|
||||
|
||||
const body: Record<string, any> = {
|
||||
model,
|
||||
prompt,
|
||||
size,
|
||||
};
|
||||
if (args.referenceImages.length > 0) {
|
||||
if (model.includes("dall-e-2") || model.includes("dall-e-3")) {
|
||||
throw new Error(
|
||||
"Reference images with OpenAI in this skill require GPT Image models. Use --model gpt-image-1.5 (or another gpt-image model)."
|
||||
);
|
||||
}
|
||||
return generateWithOpenAIEdits(baseURL, apiKey, prompt, model, size, args.referenceImages, args.quality);
|
||||
}
|
||||
|
||||
return generateWithOpenAIGenerations(baseURL, apiKey, prompt, model, size, args.quality);
|
||||
}
|
||||
|
||||
async function generateWithOpenAIGenerations(
|
||||
baseURL: string,
|
||||
apiKey: string,
|
||||
prompt: string,
|
||||
model: string,
|
||||
size: string,
|
||||
quality: CliArgs["quality"]
|
||||
): Promise<Uint8Array> {
|
||||
const body: Record<string, any> = { model, prompt, size };
|
||||
|
||||
if (model.includes("dall-e-3")) {
|
||||
body.quality = args.quality === "2k" ? "hd" : "standard";
|
||||
body.quality = quality === "2k" ? "hd" : "standard";
|
||||
}
|
||||
|
||||
const res = await fetch(`${baseURL}/images/generations`, {
|
||||
|
|
@ -96,7 +112,62 @@ export async function generateImage(
|
|||
throw new Error(`OpenAI API error: ${err}`);
|
||||
}
|
||||
|
||||
const result = (await res.json()) as { data: Array<{ url?: string; b64_json?: string }> };
|
||||
const result = (await res.json()) as OpenAIImageResponse;
|
||||
return extractImageFromResponse(result);
|
||||
}
|
||||
|
||||
async function generateWithOpenAIEdits(
|
||||
baseURL: string,
|
||||
apiKey: string,
|
||||
prompt: string,
|
||||
model: string,
|
||||
size: string,
|
||||
referenceImages: string[],
|
||||
quality: CliArgs["quality"]
|
||||
): Promise<Uint8Array> {
|
||||
const form = new FormData();
|
||||
form.append("model", model);
|
||||
form.append("prompt", prompt);
|
||||
form.append("size", size);
|
||||
|
||||
if (model.includes("gpt-image")) {
|
||||
form.append("quality", quality === "2k" ? "high" : "medium");
|
||||
}
|
||||
|
||||
for (const refPath of referenceImages) {
|
||||
const bytes = await readFile(refPath);
|
||||
const filename = path.basename(refPath);
|
||||
const mimeType = getMimeType(filename);
|
||||
const blob = new Blob([bytes], { type: mimeType });
|
||||
form.append("image[]", blob, filename);
|
||||
}
|
||||
|
||||
const res = await fetch(`${baseURL}/images/edits`, {
|
||||
method: "POST",
|
||||
headers: {
|
||||
Authorization: `Bearer ${apiKey}`,
|
||||
},
|
||||
body: form,
|
||||
});
|
||||
|
||||
if (!res.ok) {
|
||||
const err = await res.text();
|
||||
throw new Error(`OpenAI edits API error: ${err}`);
|
||||
}
|
||||
|
||||
const result = (await res.json()) as OpenAIImageResponse;
|
||||
return extractImageFromResponse(result);
|
||||
}
|
||||
|
||||
function getMimeType(filename: string): string {
|
||||
const ext = path.extname(filename).toLowerCase();
|
||||
if (ext === ".jpg" || ext === ".jpeg") return "image/jpeg";
|
||||
if (ext === ".webp") return "image/webp";
|
||||
if (ext === ".gif") return "image/gif";
|
||||
return "image/png";
|
||||
}
|
||||
|
||||
async function extractImageFromResponse(result: OpenAIImageResponse): Promise<Uint8Array> {
|
||||
const img = result.data[0];
|
||||
|
||||
if (img?.b64_json) {
|
||||
|
|
|
|||
Loading…
Reference in New Issue