Adapting baoyu-image-gen to the Tongyi Text-to-Image Model
This commit is contained in:
parent
64945f3341
commit
907c8ab852
|
|
@ -152,3 +152,8 @@ slide-deck/
|
||||||
infographic/
|
infographic/
|
||||||
illustrations/
|
illustrations/
|
||||||
comic/
|
comic/
|
||||||
|
### IntelliJ IDEA ###
|
||||||
|
.idea
|
||||||
|
*.iws
|
||||||
|
*.iml
|
||||||
|
*.ipr
|
||||||
|
|
@ -1,11 +1,11 @@
|
||||||
---
|
---
|
||||||
name: baoyu-image-gen
|
name: baoyu-image-gen
|
||||||
description: AI image generation with OpenAI and Google APIs. Supports text-to-image, reference images, aspect ratios, and parallel generation (recommended 4 concurrent subagents). Use when user asks to generate, create, or draw images.
|
description: AI image generation with OpenAI, Google and DashScope APIs. Supports text-to-image, reference images, aspect ratios, and parallel generation (recommended 4 concurrent subagents). Use when user asks to generate, create, or draw images.
|
||||||
---
|
---
|
||||||
|
|
||||||
# Image Generation (AI SDK)
|
# Image Generation (AI SDK)
|
||||||
|
|
||||||
Official API-based image generation. Supports OpenAI and Google providers.
|
Official API-based image generation. Supports OpenAI, Google and DashScope (阿里通义万象) providers.
|
||||||
|
|
||||||
## Script Directory
|
## Script Directory
|
||||||
|
|
||||||
|
|
@ -63,6 +63,9 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --r
|
||||||
|
|
||||||
# Specific provider
|
# Specific provider
|
||||||
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provider openai
|
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provider openai
|
||||||
|
|
||||||
|
# DashScope (阿里通义万象)
|
||||||
|
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image out.png --provider dashscope
|
||||||
```
|
```
|
||||||
|
|
||||||
## Options
|
## Options
|
||||||
|
|
@ -72,7 +75,7 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provi
|
||||||
| `--prompt <text>`, `-p` | Prompt text |
|
| `--prompt <text>`, `-p` | Prompt text |
|
||||||
| `--promptfiles <files...>` | Read prompt from files (concatenated) |
|
| `--promptfiles <files...>` | Read prompt from files (concatenated) |
|
||||||
| `--image <path>` | Output image path (required) |
|
| `--image <path>` | Output image path (required) |
|
||||||
| `--provider google\|openai` | Force provider (default: google) |
|
| `--provider google\|openai\|dashscope` | Force provider (default: auto-detect from available API keys, preferring Google) |
|
||||||
| `--model <id>`, `-m` | Model ID |
|
| `--model <id>`, `-m` | Model ID |
|
||||||
| `--ar <ratio>` | Aspect ratio (e.g., `16:9`, `1:1`, `4:3`) |
|
| `--ar <ratio>` | Aspect ratio (e.g., `16:9`, `1:1`, `4:3`) |
|
||||||
| `--size <WxH>` | Size (e.g., `1024x1024`) |
|
| `--size <WxH>` | Size (e.g., `1024x1024`) |
|
||||||
|
|
@ -88,10 +91,13 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provi
|
||||||
|----------|-------------|
|
|----------|-------------|
|
||||||
| `OPENAI_API_KEY` | OpenAI API key |
|
| `OPENAI_API_KEY` | OpenAI API key |
|
||||||
| `GOOGLE_API_KEY` | Google API key |
|
| `GOOGLE_API_KEY` | Google API key |
|
||||||
|
| `DASHSCOPE_API_KEY` | DashScope API key (阿里云) |
|
||||||
| `OPENAI_IMAGE_MODEL` | OpenAI model override |
|
| `OPENAI_IMAGE_MODEL` | OpenAI model override |
|
||||||
| `GOOGLE_IMAGE_MODEL` | Google model override |
|
| `GOOGLE_IMAGE_MODEL` | Google model override |
|
||||||
|
| `DASHSCOPE_IMAGE_MODEL` | DashScope model override (default: z-image-turbo) |
|
||||||
| `OPENAI_BASE_URL` | Custom OpenAI endpoint |
|
| `OPENAI_BASE_URL` | Custom OpenAI endpoint |
|
||||||
| `GOOGLE_BASE_URL` | Custom Google endpoint |
|
| `GOOGLE_BASE_URL` | Custom Google endpoint |
|
||||||
|
| `DASHSCOPE_BASE_URL` | Custom DashScope endpoint |
|
||||||
|
|
||||||
**Load Priority**: CLI args > env vars > `<cwd>/.baoyu-skills/.env` > `~/.baoyu-skills/.env`
|
**Load Priority**: CLI args > env vars > `<cwd>/.baoyu-skills/.env` > `~/.baoyu-skills/.env`
|
||||||
|
|
||||||
|
|
@ -99,7 +105,7 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provi
|
||||||
|
|
||||||
1. `--provider` specified → use it
|
1. `--provider` specified → use it
|
||||||
2. Only one API key available → use that provider
|
2. Only one API key available → use that provider
|
||||||
3. Both available → default to Google
|
3. Multiple available → use the first available in this order: Google, OpenAI, DashScope
|
||||||
|
|
||||||
## Quality Presets
|
## Quality Presets
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ Options:
|
||||||
-p, --prompt <text> Prompt text
|
-p, --prompt <text> Prompt text
|
||||||
--promptfiles <files...> Read prompt from files (concatenated)
|
--promptfiles <files...> Read prompt from files (concatenated)
|
||||||
--image <path> Output image path (required)
|
--image <path> Output image path (required)
|
||||||
--provider google|openai Force provider (auto-detect by default)
|
--provider google|openai|dashscope Force provider (auto-detect by default)
|
||||||
-m, --model <id> Model ID
|
-m, --model <id> Model ID
|
||||||
--ar <ratio> Aspect ratio (e.g., 16:9, 1:1, 4:3)
|
--ar <ratio> Aspect ratio (e.g., 16:9, 1:1, 4:3)
|
||||||
--size <WxH> Size (e.g., 1024x1024)
|
--size <WxH> Size (e.g., 1024x1024)
|
||||||
|
|
@ -29,10 +29,13 @@ Environment variables:
|
||||||
OPENAI_API_KEY OpenAI API key
|
OPENAI_API_KEY OpenAI API key
|
||||||
GOOGLE_API_KEY Google API key
|
GOOGLE_API_KEY Google API key
|
||||||
GEMINI_API_KEY Gemini API key (alias for GOOGLE_API_KEY)
|
GEMINI_API_KEY Gemini API key (alias for GOOGLE_API_KEY)
|
||||||
|
DASHSCOPE_API_KEY DashScope API key (阿里云通义万象)
|
||||||
OPENAI_IMAGE_MODEL Default OpenAI model (gpt-image-1.5)
|
OPENAI_IMAGE_MODEL Default OpenAI model (gpt-image-1.5)
|
||||||
GOOGLE_IMAGE_MODEL Default Google model (gemini-3-pro-image-preview)
|
GOOGLE_IMAGE_MODEL Default Google model (gemini-3-pro-image-preview)
|
||||||
|
DASHSCOPE_IMAGE_MODEL Default DashScope model (z-image-turbo)
|
||||||
OPENAI_BASE_URL Custom OpenAI endpoint
|
OPENAI_BASE_URL Custom OpenAI endpoint
|
||||||
GOOGLE_BASE_URL Custom Google endpoint
|
GOOGLE_BASE_URL Custom Google endpoint
|
||||||
|
DASHSCOPE_BASE_URL Custom DashScope endpoint
|
||||||
|
|
||||||
Env file load order: CLI args > process.env > <cwd>/.baoyu-skills/.env > ~/.baoyu-skills/.env`);
|
Env file load order: CLI args > process.env > <cwd>/.baoyu-skills/.env > ~/.baoyu-skills/.env`);
|
||||||
}
|
}
|
||||||
|
|
@ -105,7 +108,7 @@ function parseArgs(argv: string[]): CliArgs {
|
||||||
|
|
||||||
if (a === "--provider") {
|
if (a === "--provider") {
|
||||||
const v = argv[++i];
|
const v = argv[++i];
|
||||||
if (v !== "google" && v !== "openai") throw new Error(`Invalid provider: ${v}`);
|
if (v !== "google" && v !== "openai" && v !== "dashscope") throw new Error(`Invalid provider: ${v}`);
|
||||||
out.provider = v;
|
out.provider = v;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
@ -243,13 +246,15 @@ function detectProvider(args: CliArgs): Provider {
|
||||||
|
|
||||||
const hasGoogle = !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
|
const hasGoogle = !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
|
||||||
const hasOpenai = !!process.env.OPENAI_API_KEY;
|
const hasOpenai = !!process.env.OPENAI_API_KEY;
|
||||||
|
const hasDashscope = !!process.env.DASHSCOPE_API_KEY;
|
||||||
|
|
||||||
if (hasGoogle && !hasOpenai) return "google";
|
const available = [hasGoogle && "google", hasOpenai && "openai", hasDashscope && "dashscope"].filter(Boolean) as Provider[];
|
||||||
if (hasOpenai && !hasGoogle) return "openai";
|
|
||||||
if (hasGoogle && hasOpenai) return "google";
|
if (available.length === 1) return available[0]!;
|
||||||
|
if (available.length > 1) return available[0]!;
|
||||||
|
|
||||||
throw new Error(
|
throw new Error(
|
||||||
"No API key found. Set GOOGLE_API_KEY, GEMINI_API_KEY, or OPENAI_API_KEY.\n" +
|
"No API key found. Set GOOGLE_API_KEY, GEMINI_API_KEY, OPENAI_API_KEY, or DASHSCOPE_API_KEY.\n" +
|
||||||
"Create ~/.baoyu-skills/.env or <cwd>/.baoyu-skills/.env with your keys."
|
"Create ~/.baoyu-skills/.env or <cwd>/.baoyu-skills/.env with your keys."
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
@ -263,6 +268,9 @@ async function loadProviderModule(provider: Provider): Promise<ProviderModule> {
|
||||||
if (provider === "google") {
|
if (provider === "google") {
|
||||||
return (await import("./providers/google")) as ProviderModule;
|
return (await import("./providers/google")) as ProviderModule;
|
||||||
}
|
}
|
||||||
|
if (provider === "dashscope") {
|
||||||
|
return (await import("./providers/dashscope")) as ProviderModule;
|
||||||
|
}
|
||||||
return (await import("./providers/openai")) as ProviderModule;
|
return (await import("./providers/openai")) as ProviderModule;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,137 @@
|
||||||
|
import type { CliArgs } from "../types";
|
||||||
|
|
||||||
|
/**
 * Resolves the DashScope model ID to use when the caller does not supply one.
 *
 * @returns The value of the `DASHSCOPE_IMAGE_MODEL` environment variable when
 *   it is set to a non-empty string; otherwise `"z-image-turbo"`.
 */
export function getDefaultModel(): string {
  const override = process.env.DASHSCOPE_IMAGE_MODEL;
  // An unset or empty variable both fall back to the built-in default.
  if (override) return override;
  return "z-image-turbo";
}
|
||||||
|
|
||||||
|
/**
 * Reads the DashScope API key from the environment.
 *
 * @returns The value of `DASHSCOPE_API_KEY`, or `null` when the variable is
 *   unset or empty.
 */
function getApiKey(): string | null {
  const key = process.env.DASHSCOPE_API_KEY;
  return key ? key : null;
}
|
||||||
|
|
||||||
|
/**
 * Resolves the DashScope endpoint root.
 *
 * @returns The value of `DASHSCOPE_BASE_URL` when it is set to a non-empty
 *   string, otherwise `"https://dashscope.aliyuncs.com"`, with any trailing
 *   slashes removed so a path can be appended directly.
 */
function getBaseUrl(): string {
  const configured = process.env.DASHSCOPE_BASE_URL;
  const root = configured ? configured : "https://dashscope.aliyuncs.com";
  return root.replace(/\/+$/g, "");
}
|
||||||
|
|
||||||
|
/**
 * Parses an aspect-ratio string of the form `W:H`.
 *
 * Both sides must be unsigned decimal numbers (an optional fractional part is
 * allowed, e.g. `1.5:1`); nothing else may surround them.
 *
 * @param ar - Aspect ratio text such as `16:9`.
 * @returns The two numeric components, or `null` when the text does not match
 *   the `W:H` shape or either component is zero.
 */
function parseAspectRatio(ar: string): { width: number; height: number } | null {
  const parts = /^(\d+(?:\.\d+)?):(\d+(?:\.\d+)?)$/.exec(ar);
  if (parts === null) return null;

  const width = parseFloat(parts[1]!);
  const height = parseFloat(parts[2]!);

  // The pattern admits "0", which cannot form a usable ratio.
  if (width <= 0 || height <= 0) {
    return null;
  }
  return { width, height };
}
|
||||||
|
|
||||||
|
/**
 * Derives a `W*H` size string from an aspect ratio and a quality preset.
 *
 * The shorter edge is fixed at 1440 for the `"2k"` preset and 1024 otherwise;
 * the longer edge is that value scaled by the ratio and rounded.
 *
 * @param ar - Aspect ratio text such as `16:9`, or `null` when none was given.
 * @param quality - Quality preset; only `"2k"` changes the base edge length.
 * @returns A square size when `ar` is missing, unparseable, or within 0.1 of
 *   1:1; otherwise a landscape (`ratio > 1`) or portrait size.
 */
function getSizeFromAspectRatio(ar: string | null, quality: CliArgs["quality"]): string {
  const edge = quality === "2k" ? 1440 : 1024;
  const square = `${edge}*${edge}`;

  const dims = ar ? parseAspectRatio(ar) : null;
  if (!dims) return square;

  const ratio = dims.width / dims.height;

  // Near-square ratios are snapped to an exact square.
  if (Math.abs(ratio - 1) < 0.1) return square;

  return ratio > 1
    ? `${Math.round(edge * ratio)}*${edge}`
    : `${edge}*${Math.round(edge / ratio)}`;
}
|
||||||
|
|
||||||
|
/**
 * Converts a user-supplied `WxH` size into the `W*H` form this module sends
 * as the request's `size` parameter.
 *
 * The separator is matched case-insensitively and surrounding whitespace is
 * dropped, so `1024X768` and ` 1024 x 768 ` are both accepted. Input that has
 * no `x` separator (for example an already-normalized `1024*768`) is returned
 * trimmed but otherwise unchanged.
 *
 * @param size - Size text such as `1024x1024`.
 * @returns The size with its first `x`/`X` separator replaced by `*`.
 */
function normalizeSize(size: string): string {
  return size.trim().replace(/\s*x\s*/i, "*");
}
|
||||||
|
|
||||||
|
/**
 * Generates one image through the DashScope multimodal-generation endpoint.
 *
 * The prompt is sent as a single user message with `prompt_extend` disabled.
 * The size comes from `args.size` when present, otherwise it is derived from
 * `args.aspectRatio` and `args.quality`. Reference images are not sent; a
 * warning is printed when any are supplied.
 *
 * @param prompt - Prompt text for the image.
 * @param model - DashScope model ID placed in the request body.
 * @param args - Parsed CLI arguments (size, aspect ratio, quality, reference images).
 * @returns The image bytes: downloaded when the response carries an
 *   `http(s)` URL, otherwise decoded from the response value as base64.
 * @throws Error when `DASHSCOPE_API_KEY` is missing, the API responds with a
 *   non-OK status, the response contains no image, or the image download fails.
 */
export async function generateImage(
  prompt: string,
  model: string,
  args: CliArgs
): Promise<Uint8Array> {
  type DashScopeResponse = {
    output?: {
      result_image?: string;
      choices?: Array<{
        message?: {
          content?: Array<{ image?: string }>;
        };
      }>;
    };
  };

  const apiKey = getApiKey();
  if (!apiKey) throw new Error("DASHSCOPE_API_KEY is required");

  if (args.referenceImages.length > 0) {
    console.error("Warning: Reference images not yet supported with DashScope, ignoring.");
  }

  // An explicit --size wins over a size derived from aspect ratio and quality.
  const size = args.size
    ? normalizeSize(args.size)
    : getSizeFromAspectRatio(args.aspectRatio, args.quality);
  const url = `${getBaseUrl()}/api/v1/services/aigc/multimodal-generation/generation`;

  const userMessage = { role: "user", content: [{ text: prompt }] };
  const body = {
    model,
    input: { messages: [userMessage] },
    parameters: { prompt_extend: false, size },
  };

  console.log(`Generating image with DashScope (${model})...`, { size });

  const res = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify(body),
  });

  if (!res.ok) {
    const err = await res.text();
    throw new Error(`DashScope API error (${res.status}): ${err}`);
  }

  const result = (await res.json()) as DashScopeResponse;

  // Two response shapes are handled: a top-level `result_image`, or the first
  // `image` entry in the first choice's message content.
  let payload: string | null = null;
  const output = result.output;
  if (output?.result_image) {
    payload = output.result_image;
  } else {
    const parts = output?.choices?.[0]?.message?.content;
    if (parts) {
      for (const part of parts) {
        if (part.image) {
          payload = part.image;
          break;
        }
      }
    }
  }

  if (!payload) {
    // Dump the full response to help diagnose an unexpected shape.
    console.error("Response:", JSON.stringify(result, null, 2));
    throw new Error("No image in response");
  }

  const isRemote = payload.startsWith("http://") || payload.startsWith("https://");
  if (!isRemote) {
    // Anything that is not a URL is treated as base64-encoded image data.
    return Uint8Array.from(Buffer.from(payload, "base64"));
  }

  const imgRes = await fetch(payload);
  if (!imgRes.ok) throw new Error("Failed to download image");
  return new Uint8Array(await imgRes.arrayBuffer());
}
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
export type Provider = "google" | "openai";
|
export type Provider = "google" | "openai" | "dashscope";
|
||||||
export type Quality = "normal" | "2k";
|
export type Quality = "normal" | "2k";
|
||||||
|
|
||||||
export type CliArgs = {
|
export type CliArgs = {
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue