Files
Memoh/internal/agent/tools/image_gen.go
T
Acbox ddda00f980 feat(models): add image model type support
Add a dedicated image model type so bots can use image API models without overloading chat model capabilities, while keeping existing chat-based image generation selectable.
2026-04-16 16:00:22 +08:00

309 lines
9.0 KiB
Go

package tools
import (
"context"
"encoding/base64"
"errors"
"fmt"
"log/slog"
"strings"
"time"
sdk "github.com/memohai/twilight-ai/sdk"
"github.com/memohai/memoh/internal/db/sqlc"
"github.com/memohai/memoh/internal/models"
"github.com/memohai/memoh/internal/providers"
"github.com/memohai/memoh/internal/settings"
"github.com/memohai/memoh/internal/workspace/bridge"
)
// imageGenDir is the directory in which execGenerateImage saves generated
// images, using the client obtained from the container bridge.
const imageGenDir = "/data/generated-images"
// ImageGenProvider supplies the generate_image agent tool for bots whose
// settings reference a model that supports image generation.
type ImageGenProvider struct {
// logger is tagged with tool=image_gen by NewImageGenProvider.
logger *slog.Logger
// settings loads the per-bot settings that name the image model.
settings *settings.Service
// models resolves the configured image model by ID.
models *models.Service
// queries is passed to the provider lookup and the credential resolver.
queries *sqlc.Queries
// containers yields the per-bot client used to save generated images.
containers bridge.Provider
// dataMount is stored by the constructor.
// NOTE(review): not referenced by the code visible in this file section.
dataMount string
}
// generatedImageFile is an image as returned by a model, before decoding.
type generatedImageFile struct {
// Data is the image payload as standard base64 text (decoded by decodeGeneratedImage).
Data string
// MediaType is the MIME type of the image, e.g. "image/png".
MediaType string
}
// NewImageGenProvider assembles an ImageGenProvider from its dependencies.
//
// When log is nil, slog.Default() is used instead. The logger kept on the
// provider always carries the attribute tool=image_gen.
func NewImageGenProvider(
	log *slog.Logger,
	settingsSvc *settings.Service,
	modelsSvc *models.Service,
	queries *sqlc.Queries,
	containers bridge.Provider,
	dataMount string,
) *ImageGenProvider {
	base := log
	if base == nil {
		base = slog.Default()
	}

	provider := &ImageGenProvider{
		settings:   settingsSvc,
		models:     modelsSvc,
		queries:    queries,
		containers: containers,
		dataMount:  dataMount,
	}
	provider.logger = base.With(slog.String("tool", "image_gen"))
	return provider
}
// Tools returns the generate_image tool for the given session, or no tools at
// all when image generation is not available.
//
// No tool is offered (nil, nil) when the session is a subagent, when a required
// service is missing, when the bot ID is blank, when the bot settings or model
// cannot be loaded, or when the configured model does not support image
// generation. Lookup failures are deliberately not reported as errors.
func (p *ImageGenProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
	if session.IsSubagent {
		return nil, nil
	}
	if p.settings == nil || p.models == nil || p.queries == nil {
		return nil, nil
	}

	botID := strings.TrimSpace(session.BotID)
	if botID == "" {
		return nil, nil
	}

	botSettings, err := p.settings.GetBot(ctx, botID)
	if err != nil {
		return nil, nil
	}
	if strings.TrimSpace(botSettings.ImageModelID) == "" {
		return nil, nil
	}

	modelResp, err := p.models.GetByID(ctx, botSettings.ImageModelID)
	if err != nil {
		return nil, nil
	}
	if !supportsImageGeneration(modelResp) {
		return nil, nil
	}

	// JSON schema describing the arguments accepted by generate_image.
	properties := map[string]any{
		"prompt": map[string]any{"type": "string", "description": "Detailed description of the image to generate"},
		"size":   map[string]any{"type": "string", "description": "Image size, e.g. 1024x1024, 1792x1024, 1024x1792. Defaults to 1024x1024."},
	}
	schema := map[string]any{
		"type":       "object",
		"properties": properties,
		"required":   []string{"prompt"},
	}

	tool := sdk.Tool{
		Name:        "generate_image",
		Description: "Generate an image from a text description using the configured image generation model. Returns the file path of the generated image in the workspace.",
		Parameters:  schema,
		// session is a by-value parameter, so the closure keeps its own copy.
		Execute: func(execCtx *sdk.ToolExecContext, input any) (any, error) {
			return p.execGenerateImage(execCtx.Context, session, inputAsMap(input))
		},
	}
	return []sdk.Tool{tool}, nil
}
// execGenerateImage runs the generate_image tool for one call.
//
// It reads "prompt" (required) and "size" (optional, defaults to 1024x1024)
// from args, loads the bot's configured image model, provider and credentials,
// generates the image, and tries to save it under imageGenDir via the bot's
// container client.
//
// On success it returns a map with "path", "media_type" and "size_bytes".
// If the container client cannot be obtained or the file cannot be written,
// the call still succeeds and returns the image inline (base64) together with
// a text note, so the generated image is not lost.
//
// An error is returned when the bot ID or prompt is blank, when settings,
// model, provider or credentials cannot be loaded, when the model does not
// support image generation, or when generation itself fails.
func (p *ImageGenProvider) execGenerateImage(ctx context.Context, session SessionContext, args map[string]any) (any, error) {
	botID := strings.TrimSpace(session.BotID)
	if botID == "" {
		return nil, errors.New("bot_id is required")
	}
	prompt := strings.TrimSpace(StringArg(args, "prompt"))
	if prompt == "" {
		return nil, errors.New("prompt is required")
	}
	size := strings.TrimSpace(StringArg(args, "size"))
	if size == "" {
		size = "1024x1024"
	}

	botSettings, err := p.settings.GetBot(ctx, botID)
	if err != nil {
		// Keep the cause, like every other lookup failure below.
		return nil, fmt.Errorf("failed to load bot settings: %w", err)
	}
	imageModelID := strings.TrimSpace(botSettings.ImageModelID)
	if imageModelID == "" {
		return nil, errors.New("no image generation model configured")
	}
	modelResp, err := p.models.GetByID(ctx, imageModelID)
	if err != nil {
		return nil, fmt.Errorf("failed to load image model: %w", err)
	}
	if !supportsImageGeneration(modelResp) {
		return nil, errors.New("configured model does not support image generation")
	}
	provider, err := models.FetchProviderByID(ctx, p.queries, modelResp.ProviderID)
	if err != nil {
		return nil, fmt.Errorf("failed to load model provider: %w", err)
	}
	authResolver := providers.NewService(nil, p.queries, "")
	creds, err := authResolver.ResolveModelCredentials(ctx, provider)
	if err != nil {
		return nil, fmt.Errorf("failed to resolve provider credentials: %w", err)
	}

	file, imgBytes, ext, err := generateImage(ctx, modelResp, provider, creds, prompt, size)
	if err != nil {
		return nil, err
	}

	// inlineResult is the fallback payload used when the image cannot be
	// persisted: a text note followed by the image itself as base64.
	inlineResult := func(note string) map[string]any {
		return map[string]any{
			"content": []map[string]any{
				{"type": "text", "text": note},
				{"type": "image", "data": file.Data, "mimeType": file.MediaType},
			},
		}
	}

	// The file name is the current Unix time in milliseconds.
	containerPath := fmt.Sprintf("%s/%d.%s", imageGenDir, time.Now().UnixMilli(), ext)
	client, clientErr := p.containers.MCPClient(ctx, botID)
	if clientErr != nil {
		// The tool result hides the cause, so record it for operators.
		logger := p.logger
		if logger == nil {
			logger = slog.Default()
		}
		logger.Warn("generated image not saved: container not reachable",
			slog.String("bot_id", botID),
			slog.Any("error", clientErr),
		)
		return inlineResult("Image generated (container not reachable, not saved to disk)"), nil
	}

	// Best effort: the result is ignored on purpose because a missing
	// directory surfaces as a WriteFile error immediately below.
	mkdirCmd := fmt.Sprintf("mkdir -p %s", imageGenDir)
	_, _ = client.Exec(ctx, mkdirCmd, "/", 5)

	if writeErr := client.WriteFile(ctx, containerPath, imgBytes); writeErr != nil {
		return inlineResult(fmt.Sprintf("Image generated (failed to save: %s)", writeErr.Error())), nil
	}
	return map[string]any{
		"path":       containerPath,
		"media_type": file.MediaType,
		"size_bytes": len(imgBytes),
	}, nil
}
// supportsImageGeneration reports whether model can be used by the
// generate_image tool.
//
// A chat model qualifies when it has the image-output compatibility; an image
// model qualifies when it has the generate compatibility. Every other model
// type is rejected.
func supportsImageGeneration(model models.GetResponse) bool {
	if model.Type == models.ModelTypeChat {
		return model.HasCompatibility(models.CompatImageOutput)
	}
	if model.Type == models.ModelTypeImage {
		return model.HasCompatibility(models.CompatGenerate)
	}
	return false
}
// generateImage dispatches image generation according to the model type.
//
// Chat models go through generateImageFromChatModel, image models through
// generateImageFromImageModel. It returns the model's file, the decoded image
// bytes and the file extension; any other model type yields an error.
func generateImage(
	ctx context.Context,
	modelResp models.GetResponse,
	provider sqlc.Provider,
	creds providers.ModelCredentials,
	prompt string,
	size string,
) (generatedImageFile, []byte, string, error) {
	if modelResp.Type == models.ModelTypeChat {
		return generateImageFromChatModel(ctx, modelResp, provider, creds, prompt, size)
	}
	if modelResp.Type == models.ModelTypeImage {
		return generateImageFromImageModel(ctx, modelResp, provider, creds, prompt, size)
	}
	unsupported := fmt.Errorf("unsupported image model type: %s", modelResp.Type)
	return generatedImageFile{}, nil, "", unsupported
}
// generateImageFromChatModel asks a chat model to produce an image.
//
// The prompt and requested size are sent as a single user message. The first
// file in the model's result is taken as the image. It returns that file, the
// decoded image bytes and the file extension.
//
// An error is returned when the request fails, when the model returns no file
// (the model's text reply, if any, is included in the error), when the first
// file carries no inline data, or when the data cannot be decoded.
func generateImageFromChatModel(
	ctx context.Context,
	modelResp models.GetResponse,
	provider sqlc.Provider,
	creds providers.ModelCredentials,
	prompt string,
	size string,
) (generatedImageFile, []byte, string, error) {
	sdkModel := models.NewSDKChatModel(models.SDKModelConfig{
		ModelID:    modelResp.ModelID,
		ClientType: provider.ClientType,
		APIKey:     creds.APIKey,
		BaseURL:    providers.ProviderConfigString(provider, "base_url"),
	})
	userMsg := fmt.Sprintf("Generate an image with the following description. Size: %s\n\n%s", size, prompt)
	result, err := sdk.GenerateTextResult(ctx,
		sdk.WithModel(sdkModel),
		sdk.WithMessages([]sdk.Message{
			{Role: sdk.MessageRoleUser, Content: []sdk.MessagePart{sdk.TextPart{Text: userMsg}}},
		}),
	)
	if err != nil {
		return generatedImageFile{}, nil, "", fmt.Errorf("image generation failed: %w", err)
	}
	if len(result.Files) == 0 {
		if result.Text != "" {
			return generatedImageFile{}, nil, "", fmt.Errorf("no image generated: %s", result.Text)
		}
		return generatedImageFile{}, nil, "", errors.New("no image was generated by the model")
	}
	file := generatedImageFile{
		Data:      result.Files[0].Data,
		MediaType: result.Files[0].MediaType,
	}
	// Blank data decodes to zero bytes without error; reject it here so an
	// empty file is never saved and reported as a generated image. This
	// mirrors the check in generateImageFromImageModel.
	if strings.TrimSpace(file.Data) == "" {
		return generatedImageFile{}, nil, "", errors.New("chat model did not return inline image data")
	}
	imgBytes, ext, err := decodeGeneratedImage(file)
	if err != nil {
		return generatedImageFile{}, nil, "", err
	}
	return file, imgBytes, ext, nil
}
// generateImageFromImageModel produces an image through a dedicated image
// generation model.
//
// The request asks for base64 ("b64_json") output in PNG format, so the
// returned file always has media type "image/png". It returns the file, the
// decoded image bytes and the file extension.
//
// An error is returned when no image model can be built for the provider, when
// the request fails, when the result is empty or lacks inline data, or when
// the data cannot be decoded.
func generateImageFromImageModel(
	ctx context.Context,
	modelResp models.GetResponse,
	provider sqlc.Provider,
	creds providers.ModelCredentials,
	prompt string,
	size string,
) (generatedImageFile, []byte, string, error) {
	// fail builds the zero-valued result that accompanies every error.
	fail := func(cause error) (generatedImageFile, []byte, string, error) {
		return generatedImageFile{}, nil, "", cause
	}

	cfg := models.SDKModelConfig{
		ModelID:    modelResp.ModelID,
		ClientType: provider.ClientType,
		APIKey:     creds.APIKey,
		BaseURL:    providers.ProviderConfigString(provider, "base_url"),
	}
	imageModel := models.NewSDKImageGenerationModel(cfg)
	if imageModel == nil {
		return fail(errors.New("configured provider does not support image generation API"))
	}

	result, err := sdk.GenerateImage(ctx,
		sdk.WithImageGenerationModel(imageModel),
		sdk.WithImagePrompt(prompt),
		sdk.WithImageSize(size),
		sdk.WithImageResponseFormat("b64_json"),
		sdk.WithImageOutputFormat("png"),
	)
	if err != nil {
		return fail(fmt.Errorf("image generation failed: %w", err))
	}

	switch {
	case len(result.Data) == 0:
		return fail(errors.New("no image was generated by the model"))
	case strings.TrimSpace(result.Data[0].B64JSON) == "":
		return fail(errors.New("image model did not return inline image data"))
	}

	file := generatedImageFile{Data: result.Data[0].B64JSON, MediaType: "image/png"}
	imgBytes, ext, err := decodeGeneratedImage(file)
	if err != nil {
		return fail(err)
	}
	return file, imgBytes, ext, nil
}
// decodeGeneratedImage decodes the standard-base64 payload of file and picks a
// file extension from its media type.
//
// The extension is "jpg" for JPEG, "webp" for WebP, "gif" for GIF and "png"
// for anything else, including an empty media type. It returns the decoded
// bytes and the extension, or an error when the payload is not valid base64.
func decodeGeneratedImage(file generatedImageFile) ([]byte, string, error) {
	imgBytes, err := base64.StdEncoding.DecodeString(file.Data)
	if err != nil {
		return nil, "", fmt.Errorf("failed to decode generated image: %w", err)
	}

	// Media types are case-insensitive, so normalise before matching.
	mediaType := strings.ToLower(file.MediaType)
	ext := "png"
	switch {
	case strings.Contains(mediaType, "jpeg"), strings.Contains(mediaType, "jpg"):
		ext = "jpg"
	case strings.Contains(mediaType, "webp"):
		ext = "webp"
	case strings.Contains(mediaType, "gif"):
		ext = "gif"
	}
	return imgBytes, ext, nil
}