Adapting baoyu-image-gen to the Tongyi Text-to-Image Model

This commit is contained in:
jianzhang50 2026-01-28 09:04:57 +08:00
parent 64945f3341
commit 907c8ab852
5 changed files with 167 additions and 11 deletions

5
.gitignore vendored
View File

@ -152,3 +152,8 @@ slide-deck/
infographic/
illustrations/
comic/
### IntelliJ IDEA ###
.idea
*.iws
*.iml
*.ipr

View File

@ -1,11 +1,11 @@
---
name: baoyu-image-gen
description: AI image generation with OpenAI and Google APIs. Supports text-to-image, reference images, aspect ratios, and parallel generation (recommended 4 concurrent subagents). Use when user asks to generate, create, or draw images.
description: AI image generation with OpenAI, Google and DashScope APIs. Supports text-to-image, reference images, aspect ratios, and parallel generation (recommended 4 concurrent subagents). Use when user asks to generate, create, or draw images.
---
# Image Generation (AI SDK)
Official API-based image generation. Supports OpenAI and Google providers.
Official API-based image generation. Supports OpenAI, Google and DashScope (阿里通义万象) providers.
## Script Directory
@ -63,6 +63,9 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "Make blue" --image out.png --r
# Specific provider
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provider openai
# DashScope (阿里通义万象)
npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "一只可爱的猫" --image out.png --provider dashscope
```
## Options
@ -72,7 +75,7 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provi
| `--prompt <text>`, `-p` | Prompt text |
| `--promptfiles <files...>` | Read prompt from files (concatenated) |
| `--image <path>` | Output image path (required) |
| `--provider google\|openai` | Force provider (default: google) |
| `--provider google\|openai\|dashscope` | Force provider (default: google) |
| `--model <id>`, `-m` | Model ID |
| `--ar <ratio>` | Aspect ratio (e.g., `16:9`, `1:1`, `4:3`) |
| `--size <WxH>` | Size (e.g., `1024x1024`) |
@ -88,10 +91,13 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provi
|----------|-------------|
| `OPENAI_API_KEY` | OpenAI API key |
| `GOOGLE_API_KEY` | Google API key |
| `DASHSCOPE_API_KEY` | DashScope API key (阿里云) |
| `OPENAI_IMAGE_MODEL` | OpenAI model override |
| `GOOGLE_IMAGE_MODEL` | Google model override |
| `DASHSCOPE_IMAGE_MODEL` | DashScope model override (default: z-image-turbo) |
| `OPENAI_BASE_URL` | Custom OpenAI endpoint |
| `GOOGLE_BASE_URL` | Custom Google endpoint |
| `DASHSCOPE_BASE_URL` | Custom DashScope endpoint |
**Load Priority**: CLI args > env vars > `<cwd>/.baoyu-skills/.env` > `~/.baoyu-skills/.env`
@ -99,7 +105,7 @@ npx -y bun ${SKILL_DIR}/scripts/main.ts --prompt "A cat" --image out.png --provi
1. `--provider` specified → use it
2. Only one API key available → use that provider
3. Both available → default to Google
3. Multiple available → first available in priority order: Google, then OpenAI, then DashScope
## Quality Presets

View File

@ -14,7 +14,7 @@ Options:
-p, --prompt <text> Prompt text
--promptfiles <files...> Read prompt from files (concatenated)
--image <path> Output image path (required)
--provider google|openai Force provider (auto-detect by default)
--provider google|openai|dashscope Force provider (auto-detect by default)
-m, --model <id> Model ID
--ar <ratio> Aspect ratio (e.g., 16:9, 1:1, 4:3)
--size <WxH> Size (e.g., 1024x1024)
@ -29,10 +29,13 @@ Environment variables:
OPENAI_API_KEY OpenAI API key
GOOGLE_API_KEY Google API key
GEMINI_API_KEY Gemini API key (alias for GOOGLE_API_KEY)
DASHSCOPE_API_KEY DashScope API key (阿里云)
OPENAI_IMAGE_MODEL Default OpenAI model (gpt-image-1.5)
GOOGLE_IMAGE_MODEL Default Google model (gemini-3-pro-image-preview)
DASHSCOPE_IMAGE_MODEL Default DashScope model (z-image-turbo)
OPENAI_BASE_URL Custom OpenAI endpoint
GOOGLE_BASE_URL Custom Google endpoint
DASHSCOPE_BASE_URL Custom DashScope endpoint
Env file load order: CLI args > process.env > <cwd>/.baoyu-skills/.env > ~/.baoyu-skills/.env`);
}
@ -105,7 +108,7 @@ function parseArgs(argv: string[]): CliArgs {
if (a === "--provider") {
const v = argv[++i];
if (v !== "google" && v !== "openai") throw new Error(`Invalid provider: ${v}`);
if (v !== "google" && v !== "openai" && v !== "dashscope") throw new Error(`Invalid provider: ${v}`);
out.provider = v;
continue;
}
@ -243,13 +246,15 @@ function detectProvider(args: CliArgs): Provider {
const hasGoogle = !!(process.env.GOOGLE_API_KEY || process.env.GEMINI_API_KEY);
const hasOpenai = !!process.env.OPENAI_API_KEY;
const hasDashscope = !!process.env.DASHSCOPE_API_KEY;
if (hasGoogle && !hasOpenai) return "google";
if (hasOpenai && !hasGoogle) return "openai";
if (hasGoogle && hasOpenai) return "google";
const available = [hasGoogle && "google", hasOpenai && "openai", hasDashscope && "dashscope"].filter(Boolean) as Provider[];
if (available.length === 1) return available[0]!;
if (available.length > 1) return available[0]!;
throw new Error(
"No API key found. Set GOOGLE_API_KEY, GEMINI_API_KEY, or OPENAI_API_KEY.\n" +
"No API key found. Set GOOGLE_API_KEY, GEMINI_API_KEY, OPENAI_API_KEY, or DASHSCOPE_API_KEY.\n" +
"Create ~/.baoyu-skills/.env or <cwd>/.baoyu-skills/.env with your keys."
);
}
@ -263,6 +268,9 @@ async function loadProviderModule(provider: Provider): Promise<ProviderModule> {
if (provider === "google") {
return (await import("./providers/google")) as ProviderModule;
}
if (provider === "dashscope") {
return (await import("./providers/dashscope")) as ProviderModule;
}
return (await import("./providers/openai")) as ProviderModule;
}

View File

@ -0,0 +1,137 @@
import type { CliArgs } from "../types";
/**
 * Picks the model ID used when the caller did not request one.
 *
 * @returns The value of `DASHSCOPE_IMAGE_MODEL` when it is set to a non-empty
 *   string, otherwise `"z-image-turbo"`.
 */
export function getDefaultModel(): string {
  const override = process.env.DASHSCOPE_IMAGE_MODEL;
  if (override) {
    return override;
  }
  return "z-image-turbo";
}
/**
 * Reads the DashScope credential from the environment.
 *
 * @returns The value of `DASHSCOPE_API_KEY`, or `null` when it is unset or empty.
 */
function getApiKey(): string | null {
  const key = process.env.DASHSCOPE_API_KEY;
  return key ? key : null;
}
/**
 * Resolves the root URL of the DashScope endpoint.
 *
 * Uses `DASHSCOPE_BASE_URL` when it is set to a non-empty string and falls back
 * to the public endpoint otherwise. Trailing slashes are removed so that request
 * paths can be appended with a single leading "/".
 *
 * @returns The endpoint root without any trailing "/".
 */
function getBaseUrl(): string {
  const configured = process.env.DASHSCOPE_BASE_URL;
  const root = configured ? configured : "https://dashscope.aliyuncs.com";
  const trailingSlashes = /\/+$/g;
  return root.replace(trailingSlashes, "");
}
/**
 * Parses a `W:H` aspect-ratio string (integers or decimals, e.g. `16:9`, `1.5:1`).
 *
 * @param ar - Aspect ratio text to parse.
 * @returns The two numeric components, or `null` when the text is not of the
 *   form `W:H` or either component is zero.
 */
function parseAspectRatio(ar: string): { width: number; height: number } | null {
  const parts = /^(\d+(?:\.\d+)?):(\d+(?:\.\d+)?)$/.exec(ar);
  if (parts === null) {
    return null;
  }

  const width = Number.parseFloat(parts[1]!);
  const height = Number.parseFloat(parts[2]!);

  // The pattern admits "0" and "0.0", which cannot form a usable ratio.
  if (width > 0 && height > 0) {
    return { width, height };
  }
  return null;
}
/**
 * Derives a `W*H` size string from an aspect ratio and quality preset.
 *
 * The shorter side is pinned to the preset's base edge (1440 for `"2k"`, 1024
 * otherwise) and the longer side is scaled by the ratio and rounded. Ratios
 * within 0.1 of 1 are treated as square.
 *
 * @param ar - Aspect ratio such as `16:9`; `null`, empty or unparseable values
 *   yield a square size.
 * @param quality - Quality preset that selects the base edge length.
 * @returns The size formatted as `<width>*<height>`.
 */
function getSizeFromAspectRatio(ar: string | null, quality: CliArgs["quality"]): string {
  const edge = quality === "2k" ? 1440 : 1024;
  const square = `${edge}*${edge}`;

  const dims = ar ? parseAspectRatio(ar) : null;
  if (dims === null) {
    return square;
  }

  const ratio = dims.width / dims.height;
  if (Math.abs(ratio - 1) < 0.1) {
    return square;
  }

  // Landscape grows the width; portrait grows the height.
  return ratio > 1
    ? `${Math.round(edge * ratio)}*${edge}`
    : `${edge}*${Math.round(edge / ratio)}`;
}
/**
 * Converts a user-supplied size such as `1024x1024` into the `W*H` form that
 * this provider sends as `parameters.size`.
 *
 * Accepts `x`, `X`, `×` or `*` as the separator and tolerates surrounding or
 * inner whitespace, so `1024X768` and ` 1024 x 768 ` both become `1024*768`.
 * Input that is not two integers around a separator is handled as before: the
 * first lowercase `x` (if any) is replaced by `*` and the rest is left untouched.
 *
 * @param size - Size text, typically taken from the `--size` CLI option.
 * @returns The size with `*` as the separator.
 */
function normalizeSize(size: string): string {
  const dims = /^\s*(\d+)\s*[xX×*]\s*(\d+)\s*$/.exec(size);
  if (dims) {
    return `${dims[1]}*${dims[2]}`;
  }
  // Unrecognized shape: keep the legacy substitution rather than rejecting the input here.
  return size.replace("x", "*");
}
/** Subset of the DashScope response body that this provider reads. */
type DashScopeImageResponse = {
  output?: {
    result_image?: string;
    choices?: Array<{
      message?: {
        content?: Array<{ image?: string }>;
      };
    }>;
  };
};

/** Matches the `data:<mime>;base64,` header of a base64 data URL. */
const DATA_URL_BASE64_HEADER = /^data:[^,]*;base64,/i;

/**
 * Finds the image reference in a response: `output.result_image` when present,
 * otherwise the first `image` entry in the first choice's message content.
 */
function extractImageData(response: DashScopeImageResponse): string | null {
  const direct = response.output?.result_image;
  if (direct) return direct;

  const content = response.output?.choices?.[0]?.message?.content;
  // The body is only type-asserted, so guard against a non-array value here.
  if (!Array.isArray(content)) return null;

  for (const item of content) {
    if (item?.image) return item.image;
  }
  return null;
}

/**
 * Turns an image reference into raw bytes: downloads it when it is an
 * http(s) URL, otherwise decodes it as base64 (with or without a data-URL header).
 */
async function loadImageBytes(imageData: string): Promise<Uint8Array> {
  if (imageData.startsWith("http://") || imageData.startsWith("https://")) {
    const imgRes = await fetch(imageData);
    if (!imgRes.ok) throw new Error(`Failed to download image (${imgRes.status})`);
    return new Uint8Array(await imgRes.arrayBuffer());
  }

  // Strip a data-URL header so it is not decoded as part of the payload.
  const base64 = imageData.replace(DATA_URL_BASE64_HEADER, "");
  return Uint8Array.from(Buffer.from(base64, "base64"));
}

/**
 * Generates one image through the DashScope multimodal-generation endpoint.
 *
 * The size comes from `args.size` when given, otherwise it is derived from
 * `args.aspectRatio` and `args.quality`. Reference images are not sent; a
 * warning is printed to stderr when any are supplied.
 *
 * @param prompt - Text prompt sent as the single user message.
 * @param model - DashScope model ID.
 * @param args - Parsed CLI options (size, aspect ratio, quality, reference images).
 * @returns The image bytes, downloaded from the returned URL or decoded from base64.
 * @throws Error when `DASHSCOPE_API_KEY` is missing, the API responds with a
 *   non-2xx status, the response contains no image, or the image download fails.
 */
export async function generateImage(
  prompt: string,
  model: string,
  args: CliArgs
): Promise<Uint8Array> {
  const apiKey = getApiKey();
  if (!apiKey) throw new Error("DASHSCOPE_API_KEY is required");

  if (args.referenceImages.length > 0) {
    console.error("Warning: Reference images not yet supported with DashScope, ignoring.");
  }

  const size = args.size ? normalizeSize(args.size) : getSizeFromAspectRatio(args.aspectRatio, args.quality);
  const url = `${getBaseUrl()}/api/v1/services/aigc/multimodal-generation/generation`;

  const body = {
    model,
    input: {
      messages: [
        {
          role: "user",
          content: [{ text: prompt }],
        },
      ],
    },
    parameters: {
      prompt_extend: false,
      size,
    },
  };

  console.log(`Generating image with DashScope (${model})...`, { size });

  const res = await fetch(url, {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      Authorization: `Bearer ${apiKey}`,
    },
    body: JSON.stringify(body),
  });

  if (!res.ok) {
    const err = await res.text();
    throw new Error(`DashScope API error (${res.status}): ${err}`);
  }

  // The body shape is asserted, not validated; extractImageData reads it defensively.
  const result = (await res.json()) as DashScopeImageResponse;

  const imageData = extractImageData(result);
  if (!imageData) {
    // Dump the full response so an unexpected shape can be diagnosed.
    console.error("Response:", JSON.stringify(result, null, 2));
    throw new Error("No image in response");
  }

  return loadImageBytes(imageData);
}

View File

@ -1,4 +1,4 @@
export type Provider = "google" | "openai";
export type Provider = "google" | "openai" | "dashscope";
export type Quality = "normal" | "2k";
export type CliArgs = {