From 66c529e4b17031ce3c311bb45d020d6e4f763b56 Mon Sep 17 00:00:00 2001
From: aki <arisu@ieee.org>
Date: Mon, 20 Apr 2026 17:57:27 +0900
Subject: [PATCH] feat: Ear and Mouth

---
 apps/web/src/constants/client-types.ts        |   5 +
 apps/web/src/i18n/locales/en.json             |  16 +
 apps/web/src/i18n/locales/zh.json             |  16 +
 .../pages/bots/components/bot-settings.vue    |  27 ++
 .../speech/components/model-config-editor.vue |  87 ++++-
 .../speech/components/provider-setting.vue    | 214 ++++++++++-
 cmd/agent/app.go                              |  32 ++
 db/migrations/0001_init.up.sql                |   6 +-
 ...cription_models_and_google_speech.down.sql |  33 ++
 ...nscription_models_and_google_speech.up.sql |  31 ++
 .../0070_add_bot_transcription_model.down.sql |   8 +
 .../0070_add_bot_transcription_model.up.sql   |   5 +
 db/queries/models.sql                         |  51 ++-
 db/queries/settings.sql                       |   8 +-
 internal/agent/retry.go                       |   2 +-
 internal/agent/tools/container.go             |   6 +-
 internal/agent/tools/transcribe.go            | 232 ++++++++++++
 .../identities/service_integration_test.go    |   1 -
 internal/channel/inbound/channel.go           | 140 +++++++
 internal/db/sqlc/conversations.sql.go         |   2 +-
 internal/db/sqlc/models.go                    |   1 +
 internal/db/sqlc/models.sql.go                | 166 +++++++-
 internal/db/sqlc/settings.sql.go              |  20 +-
 internal/handlers/tts_providers.go            | 185 +++++++++
 internal/models/models.go                     |  13 +-
 internal/models/types.go                      |  10 +-
 internal/settings/service.go                  |  15 +
 internal/settings/types.go                    |   2 +
 internal/tts/bootstrap.go                     |  34 +-
 internal/tts/registry.go                      | 216 +++++++++--
 internal/tts/service.go                       | 358 ++++++++++++++++--
 internal/tts/types.go                         |  52 ++-
 internal/workspace/image_preference.go        |   2 +
 packages/sdk/src/types.gen.ts                 |   2 +
 34 files changed, 1863 insertions(+), 135 deletions(-)
 create mode 100644 db/migrations/0069_add_transcription_models_and_google_speech.down.sql
 create mode 100644 db/migrations/0069_add_transcription_models_and_google_speech.up.sql
 create mode 100644 db/migrations/0070_add_bot_transcription_model.down.sql
 create mode 100644 db/migrations/0070_add_bot_transcription_model.up.sql
 create mode 100644 internal/agent/tools/transcribe.go

diff --git a/apps/web/src/constants/client-types.ts b/apps/web/src/constants/client-types.ts
index 812786b2..27e634df 100644
--- a/apps/web/src/constants/client-types.ts
+++ b/apps/web/src/constants/client-types.ts
@@ -80,6 +80,11 @@ export const CLIENT_TYPE_META: Record<string, ClientTypeMeta> = {
     label: 'Microsoft Speech',
     hint: 'Azure Cognitive Services TTS',
   },
+  'google-speech': {
+    value: 'google-speech',
+    label: 'Google Speech',
+    hint: 'Gemini speech transcription',
+  },
 }
 
 export const CLIENT_TYPE_LIST: ClientTypeMeta[] = Object.values(CLIENT_TYPE_META)
diff --git a/apps/web/src/i18n/locales/en.json b/apps/web/src/i18n/locales/en.json
index 59f83d6c..65eaea5c 100644
--- a/apps/web/src/i18n/locales/en.json
+++ b/apps/web/src/i18n/locales/en.json
@@ -425,6 +425,20 @@
     "noModels": "No models found. Click \"Import Models\" to discover available models or \"Add Model\" to create one manually.",
     "noCapabilities": "No capabilities available for this model.",
     "saveSuccess": "Speech configuration saved",
+    "synthesis": {
+      "models": "Synthesis Models"
+    },
+    "transcription": {
+      "models": "Transcription Models",
+      "noModels": "No transcription models found. Import available models or keep the default template model.",
+      "importModels": "Import Transcription Models",
+      "importSuccess": "Transcription models imported: {created} created, {skipped} skipped",
+      "importFailed": "Failed to import transcription models",
+      "test": {
+        "title": "Test Transcription",
+        "run": "Transcribe"
+      }
+    },
     "advanced": {
       "title": "Advanced Settings",
       "description": "These fields usually map to underlying vendor implementation details. Most users can keep the defaults."
@@ -920,6 +934,8 @@
       "memoryHealthUnavailable": "Unavailable",
       "ttsModel": "TTS Model",
       "ttsModelPlaceholder": "Select TTS model",
+      "transcriptionModel": "Transcription Model",
+      "transcriptionModelPlaceholder": "Select transcription model",
       "imageModel": "Image Generation Model",
       "imageModelDescription": "Model used for the generate_image tool. Must support image-output compatibility.",
       "imageModelPlaceholder": "Select image model (optional)",
diff --git a/apps/web/src/i18n/locales/zh.json b/apps/web/src/i18n/locales/zh.json
index 8891cad6..9d3b9120 100644
--- a/apps/web/src/i18n/locales/zh.json
+++ b/apps/web/src/i18n/locales/zh.json
@@ -421,6 +421,20 @@
     "noModels": "暂无模型，点击\"导入模型\"发现可用模型，或点击\"新建模型\"手动创建。",
     "noCapabilities": "该模型暂无可用能力信息。",
     "saveSuccess": "语音配置已保存",
+    "synthesis": {
+      "models": "语音合成模型"
+    },
+    "transcription": {
+      "models": "语音识别模型",
+      "noModels": "暂无语音识别模型，可导入可用模型，或保留默认模板模型。",
+      "importModels": "导入识别模型",
+      "importSuccess": "识别模型导入成功：新增 {created} 个，跳过 {skipped} 个",
+      "importFailed": "识别模型导入失败",
+      "test": {
+        "title": "测试识别",
+        "run": "开始识别"
+      }
+    },
     "advanced": {
       "title": "高级设置",
       "description": "这些字段通常对应底层服务商实现细节。大多数情况下保留默认值即可。"
@@ -916,6 +930,8 @@
       "memoryHealthUnavailable": "暂不可用",
       "ttsModel": "语音合成模型",
       "ttsModelPlaceholder": "选择语音合成模型",
+      "transcriptionModel": "转写模型",
+      "transcriptionModelPlaceholder": "选择语音转写模型",
       "imageModel": "图片生成模型",
       "imageModelDescription": "用于 generate_image 工具的模型，必须支持 image-output 兼容性。",
       "imageModelPlaceholder": "选择图片模型（可选）",
diff --git a/apps/web/src/pages/bots/components/bot-settings.vue b/apps/web/src/pages/bots/components/bot-settings.vue
index 7fd4632b..8087de75 100644
--- a/apps/web/src/pages/bots/components/bot-settings.vue
+++ b/apps/web/src/pages/bots/components/bot-settings.vue
@@ -187,6 +187,17 @@
       />
     </div>
 
+    <!-- Transcription Model -->
+    <div class="space-y-2">
+      <Label>{{ $t('bots.settings.transcriptionModel') }}</Label>
+      <TtsModelSelect
+        v-model="form.transcription_model_id"
+        :models="transcriptionModels"
+        :providers="ttsProviders"
+        :placeholder="$t('bots.settings.transcriptionModelPlaceholder')"
+      />
+    </div>
+
     <!-- Image Generation Model -->
     <div class="space-y-2">
       <Label>{{ $t('bots.settings.imageModel') }}</Label>
@@ -357,6 +368,7 @@ import TtsModelSelect from './tts-model-select.vue'
 import BrowserContextSelect from './browser-context-select.vue'
 import { useQuery, useMutation, useQueryCache } from '@pinia/colada'
 import { getBotsById, putBotsById, getBotsByBotIdSettings, putBotsByBotIdSettings, deleteBotsById, getModels, getProviders, getSearchProviders, getMemoryProviders, getSpeechProviders, getSpeechModels, getBrowserContexts, getBotsByBotIdMemoryStatus, postBotsByBotIdMemoryRebuild } from '@memohai/sdk'
+import { client } from '@memohai/sdk/client'
 import type { SettingsSettings } from '@memohai/sdk'
 import type { Ref } from 'vue'
 import { resolveApiErrorMessage } from '@/utils/api-error'
@@ -440,6 +452,17 @@ const { data: ttsModelData } = useQuery({
   },
 })
 
+const { data: transcriptionModelData } = useQuery({
+  key: ['transcription-models'],
+  query: async () => {
+    const resp = await client.get({
+      url: '/transcription-models',
+      throwOnError: true,
+    })
+    return resp.data
+  },
+})
+
 const { data: browserContextData } = useQuery({
   key: ['all-browser-contexts'],
   query: async () => {
@@ -495,6 +518,7 @@ const memoryProviders = computed(() => memoryProviderData.value ?? [])
 const ttsProviders = computed(() => (ttsProviderData.value ?? []).filter((p) => p.enable !== false))
 const enabledTtsProviderIds = computed(() => new Set(ttsProviders.value.map((p) => p.id)))
 const ttsModels = computed(() => (ttsModelData.value ?? []).filter((m: Record<string, unknown>) => enabledTtsProviderIds.value.has(m.provider_id as string)))
+const transcriptionModels = computed(() => (transcriptionModelData.value ?? []).filter((m: Record<string, unknown>) => enabledTtsProviderIds.value.has(m.provider_id as string)))
 const browserContexts = computed(() => browserContextData.value ?? [])
 
 // ---- Form ----
@@ -505,6 +529,7 @@ const form = reactive({
   search_provider_id: '',
   memory_provider_id: '',
   tts_model_id: '',
+  transcription_model_id: '',
   browser_context_id: '',
   timezone: '',
   language: '',
@@ -644,6 +669,7 @@ watch(settings, (val) => {
     form.search_provider_id = val.search_provider_id ?? ''
     form.memory_provider_id = val.memory_provider_id ?? ''
     form.tts_model_id = val.tts_model_id ?? ''
+    form.transcription_model_id = val.transcription_model_id ?? ''
     form.browser_context_id = val.browser_context_id ?? ''
     form.language = val.language ?? ''
     form.timezone = val.timezone ?? ''
@@ -666,6 +692,7 @@ const hasSettingsChanges = computed(() => {
     || form.search_provider_id !== (s.search_provider_id ?? '')
     || form.memory_provider_id !== (s.memory_provider_id ?? '')
     || form.tts_model_id !== (s.tts_model_id ?? '')
+    || form.transcription_model_id !== (s.transcription_model_id ?? '')
     || form.browser_context_id !== (s.browser_context_id ?? '')
     || form.language !== (s.language ?? '')
     || form.timezone !== (s.timezone ?? '')
diff --git a/apps/web/src/pages/speech/components/model-config-editor.vue b/apps/web/src/pages/speech/components/model-config-editor.vue
index 334d8f0d..62e24821 100644
--- a/apps/web/src/pages/speech/components/model-config-editor.vue
+++ b/apps/web/src/pages/speech/components/model-config-editor.vue
@@ -195,9 +195,12 @@
 
     <div class="space-y-3">
       <h4 class="text-xs font-medium">
-        {{ $t('speech.test.title') }}
+        {{ mode === 'transcription' ? $t('speech.transcription.test.title') : $t('speech.test.title') }}
       </h4>
-      <div class="relative">
+      <div
+        v-if="mode === 'synthesis'"
+        class="relative"
+      >
         <Textarea
           v-model="testText"
           :placeholder="$t('speech.test.placeholder')"
@@ -209,17 +212,36 @@
           {{ testText.length }}/{{ maxTestTextLen }}
         </span>
       </div>
+      <div
+        v-else
+        class="space-y-2"
+      >
+        <Input
+          type="file"
+          accept="audio/*"
+          @change="handleFileChange"
+        />
+        <p
+          v-if="selectedFileName"
+          class="text-xs text-muted-foreground"
+        >
+          {{ selectedFileName }}
+        </p>
+      </div>
       <div class="flex items-center gap-3">
         <LoadingButton
           type="button"
           variant="outline"
           size="sm"
           :loading="testLoading"
-          :disabled="!testText.trim() || testText.length > maxTestTextLen"
+          :disabled="mode === 'synthesis' ? (!testText.trim() || testText.length > maxTestTextLen) : !selectedFile"
           @click="handleTest"
         >
-          <Play class="mr-1.5" />
-          {{ $t('speech.test.generate') }}
+          <Play
+            v-if="mode === 'synthesis'"
+            class="mr-1.5"
+          />
+          {{ mode === 'transcription' ? $t('speech.transcription.test.run') : $t('speech.test.generate') }}
         </LoadingButton>
         <span
           v-if="testError"
@@ -229,7 +251,7 @@
         </span>
       </div>
       <div
-        v-if="audioUrl"
+        v-if="mode === 'synthesis' && audioUrl"
         class="rounded-md border border-border bg-muted/30 p-3"
       >
         <audio
@@ -239,6 +261,20 @@
           class="w-full"
         />
       </div>
+      <div
+        v-if="mode === 'transcription' && transcriptionText"
+        class="rounded-md border border-border bg-muted/30 p-3 space-y-2"
+      >
+        <p class="text-sm whitespace-pre-wrap break-words">
+          {{ transcriptionText }}
+        </p>
+        <p
+          v-if="transcriptionLanguage"
+          class="text-xs text-muted-foreground"
+        >
+          {{ transcriptionLanguage }}
+        </p>
+      </div>
     </div>
 
     <Separator class="my-3" />
@@ -296,7 +332,8 @@ const props = defineProps<{
   modelName: string
   config: Record<string, unknown>
   schema: SpeechConfigSchema | null
-  onTest: (text: string, config: Record<string, unknown>) => Promise<Blob>
+  mode?: 'synthesis' | 'transcription'
+  onTest: (payload: string | File, config: Record<string, unknown>) => Promise<Blob | { text?: string, language?: string }>
 }>()
 
 const emit = defineEmits<{
@@ -309,11 +346,16 @@ const visibleSecrets = reactive<Record<string, boolean>>({})
 const saving = ref(false)
 const showAdvanced = ref(false)
 const testText = ref('')
+const selectedFile = ref<File | null>(null)
+const selectedFileName = ref('')
 const testLoading = ref(false)
 const testError = ref('')
 const audioUrl = ref('')
+const transcriptionText = ref('')
+const transcriptionLanguage = ref('')
 const audioEl = ref<HTMLAudioElement>()
 const maxTestTextLen = 500
+const mode = computed(() => props.mode ?? 'synthesis')
 
 const orderedFields = computed(() => {
   const fields = props.schema?.fields ?? []
@@ -348,6 +390,12 @@ function revokeAudio() {
   }
 }
 
+// Clear the transcription text and language shown from an earlier test.
+function resetTranscription() {
+  transcriptionText.value = ''
+  transcriptionLanguage.value = ''
+}
+
 onBeforeUnmount(revokeAudio)
 
 async function handleSaveConfig() {
@@ -360,17 +407,26 @@ async function handleSaveConfig() {
 }
 
 async function handleTest() {
-  if (!testText.value.trim()) return
+  if (mode.value === 'synthesis' && !testText.value.trim()) return
+  if (mode.value === 'transcription' && !selectedFile.value) return
   testLoading.value = true
   testError.value = ''
   revokeAudio()
+  resetTranscription()
 
   try {
-    const blob = await props.onTest(testText.value, buildConfig())
+    const result = await props.onTest(mode.value === 'synthesis' ? testText.value : selectedFile.value as File, buildConfig())
 
-    audioUrl.value = URL.createObjectURL(blob)
-    await new Promise<void>((resolve) => setTimeout(resolve, 50))
-    audioEl.value?.play()
+    if (mode.value === 'synthesis') {
+      const blob = result as Blob
+      audioUrl.value = URL.createObjectURL(blob)
+      await new Promise<void>((resolve) => setTimeout(resolve, 50))
+      audioEl.value?.play()
+    } else {
+      const payload = result as { text?: string, language?: string }
+      transcriptionText.value = payload.text ?? ''
+      transcriptionLanguage.value = payload.language ?? ''
+    }
   } catch (error: unknown) {
     const msg = error instanceof Error ? error.message : t('speech.test.failed')
     testError.value = msg
@@ -379,4 +435,12 @@ async function handleTest() {
     testLoading.value = false
   }
 }
+
+// Store the first file chosen in the file input (or null when none) together with its name.
+function handleFileChange(event: Event) {
+  const input = event.target as HTMLInputElement
+  const file = input.files?.[0] ?? null
+  selectedFile.value = file
+  selectedFileName.value = file?.name ?? ''
+}
 </script>
diff --git a/apps/web/src/pages/speech/components/provider-setting.vue b/apps/web/src/pages/speech/components/provider-setting.vue
index ba159708..feb4ff6d 100644
--- a/apps/web/src/pages/speech/components/provider-setting.vue
+++ b/apps/web/src/pages/speech/components/provider-setting.vue
@@ -138,7 +138,7 @@
     <section>
       <div class="flex justify-between items-center mb-4">
         <h3 class="text-xs font-medium">
-          {{ $t('speech.models') }}
+          {{ $t('speech.synthesis.models') }}
         </h3>
         <LoadingButton
           v-if="curProviderId"
@@ -191,12 +191,78 @@
             :model-name="model.model_id ?? ''"
             :config="model.config || {}"
             :schema="getModelSchema(model.model_id ?? '')"
-            :on-test="(text, cfg) => handleTestModel(model.id ?? '', text, cfg)"
+            :on-test="(text, cfg) => handleTestModel(model.id ?? '', text as string, cfg)"
             @save="(cfg) => handleSaveModel(model.id ?? '', cfg)"
           />
         </div>
       </div>
     </section>
+
+    <Separator class="mt-6 mb-6" />
+
+    <section>
+      <div class="flex justify-between items-center mb-4">
+        <h3 class="text-xs font-medium">
+          {{ $t('speech.transcription.models') }}
+        </h3>
+        <LoadingButton
+          v-if="curProviderId"
+          type="button"
+          variant="outline"
+          size="sm"
+          :loading="importTranscriptionLoading"
+          @click="handleImportTranscriptionModels"
+        >
+          {{ $t('speech.transcription.importModels') }}
+        </LoadingButton>
+      </div>
+
+      <div
+        v-if="providerTranscriptionModels.length === 0"
+        class="text-xs text-muted-foreground py-4 text-center"
+      >
+        {{ $t('speech.transcription.noModels') }}
+      </div>
+
+      <div
+        v-for="model in providerTranscriptionModels"
+        :key="model.id"
+        class="border border-border rounded-lg mb-4"
+      >
+        <button
+          type="button"
+          class="w-full flex items-center justify-between p-3 text-left hover:bg-accent/50 rounded-t-lg transition-colors"
+          @click="toggleTranscriptionModel(model.id ?? '')"
+        >
+          <div>
+            <span class="text-xs font-medium">{{ model.name || model.model_id }}</span>
+            <span
+              v-if="model.name"
+              class="text-xs text-muted-foreground ml-2"
+            >{{ model.model_id }}</span>
+          </div>
+          <component
+            :is="expandedTranscriptionModelId === model.id ? ChevronUp : ChevronDown"
+            class="size-3 text-muted-foreground"
+          />
+        </button>
+
+        <div
+          v-if="expandedTranscriptionModelId === model.id"
+          class="px-3 pb-3 space-y-4 border-t border-border pt-3"
+        >
+          <ModelConfigEditor
+            :model-id="model.id ?? ''"
+            :model-name="model.model_id ?? ''"
+            :config="model.config || {}"
+            :schema="getTranscriptionModelSchema(model.model_id ?? '')"
+            mode="transcription"
+            :on-test="(file, cfg) => handleTestTranscriptionModel(model.id ?? '', file as File, cfg)"
+            @save="(cfg) => handleSaveTranscriptionModel(model.id ?? '', cfg)"
+          />
+        </div>
+      </div>
+    </section>
   </div>
 </template>
 
@@ -218,7 +284,7 @@ import { computed, inject, reactive, ref, watch } from 'vue'
 import { toast } from 'vue-sonner'
 import { useI18n } from 'vue-i18n'
 import { useQuery, useQueryCache } from '@pinia/colada'
-import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putModelsById, putProvidersById } from '@memohai/sdk'
+import { getSpeechProvidersById, getSpeechProvidersByIdModels, getSpeechProvidersMeta, postSpeechProvidersByIdImportModels, putProvidersById } from '@memohai/sdk'
 import type { TtsSpeechModelResponse, TtsSpeechProviderResponse } from '@memohai/sdk'
 import LoadingButton from '@/components/loading-button/index.vue'
 import ProviderIcon from '@/components/provider-icon/index.vue'
@@ -256,6 +322,10 @@ interface SpeechProviderMeta {
   config_schema?: SpeechConfigSchema
   default_model?: string
   models?: SpeechModelMeta[]
+  default_synthesis_model?: string
+  synthesis_models?: SpeechModelMeta[]
+  default_transcription_model?: string
+  transcription_models?: SpeechModelMeta[]
 }
 
 function getInitials(name: string | undefined) {
@@ -270,9 +340,11 @@ const providerName = ref('')
 const providerConfig = reactive<Record<string, unknown>>({})
 const visibleSecrets = reactive<Record<string, boolean>>({})
 const expandedModelId = ref('')
+const expandedTranscriptionModelId = ref('')
 const enableLoading = ref(false)
 const saveLoading = ref(false)
 const importLoading = ref(false)
+const importTranscriptionLoading = ref(false)
 const queryCache = useQueryCache()
 
 const { data: providerDetail } = useQuery({
@@ -297,7 +369,7 @@ const { data: metaList } = useQuery({
 
 const currentMeta = computed(() => {
   if (!metaList.value || !curProvider.value?.client_type) return null
-  return (metaList.value as SpeechProviderMeta[]).find((m) => m.provider === curProvider.value?.client_type) ?? null
+  return (metaList.value as SpeechProviderMeta[]).find(m => m.provider === curProvider.value?.client_type) ?? null
 })
 
 const orderedProviderFields = computed(() => {
@@ -317,10 +389,23 @@ const { data: providerSpeechModels } = useQuery({
   },
 })
 
-const providerModels = computed(() => {
-  return (providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []
+const { data: providerTranscriptionModelsData } = useQuery({
+  key: () => ['speech-provider-transcription-models', curProviderId.value],
+  query: async () => {
+    if (!curProviderId.value) return []
+    const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
+    const token = localStorage.getItem('token')
+    const resp = await fetch(`${apiBase}/speech-providers/${curProviderId.value}/transcription-models`, {
+      headers: token ? { Authorization: `Bearer ${token}` } : undefined,
+    })
+    if (!resp.ok) throw new Error(await resp.text())
+    return await resp.json()
+  },
 })
 
+const providerModels = computed(() => ((providerSpeechModels.value as TtsSpeechModelResponse[] | undefined) ?? []))
+const providerTranscriptionModels = computed(() => ((providerTranscriptionModelsData.value as TtsSpeechModelResponse[] | undefined) ?? []))
+
 watch(() => providerDetail.value, (provider) => {
   providerName.value = provider?.name ?? curProvider.value?.name ?? ''
   Object.keys(providerConfig).forEach((key) => delete providerConfig[key])
@@ -328,12 +413,11 @@ watch(() => providerDetail.value, (provider) => {
 }, { immediate: true, deep: true })
 
 function getModelMeta(modelID: string): SpeechModelMeta | null {
-  const models = currentMeta.value?.models ?? []
+  const models = currentMeta.value?.synthesis_models ?? currentMeta.value?.models ?? []
   const exact = models.find(m => m.id === modelID)
   if (exact) return exact
-  if (currentMeta.value?.default_model) {
-    return models.find(m => m.id === currentMeta.value?.default_model) ?? null
-  }
+  const defaultModel = currentMeta.value?.default_synthesis_model ?? currentMeta.value?.default_model
+  if (defaultModel) return models.find(m => m.id === defaultModel) ?? null
   return models[0] ?? null
 }
 
@@ -342,10 +426,34 @@ function getModelSchema(modelID: string): SpeechConfigSchema | null {
   return meta?.config_schema ?? meta?.capabilities?.config_schema ?? null
 }
 
+// Find the transcription model metadata for modelID in the current provider meta.
+// Without an exact match: if the meta declares a default transcription model, return
+// that entry (or null when it is not listed); otherwise return the first listed model.
+function getTranscriptionModelMeta(modelID: string): SpeechModelMeta | null {
+  const models = currentMeta.value?.transcription_models ?? []
+  const exact = models.find(m => m.id === modelID)
+  if (exact) return exact
+  if (currentMeta.value?.default_transcription_model) {
+    return models.find(m => m.id === currentMeta.value?.default_transcription_model) ?? null
+  }
+  return models[0] ?? null
+}
+
+// Return the config schema for a transcription model: the model-level schema first, then the one under capabilities.
+function getTranscriptionModelSchema(modelID: string): SpeechConfigSchema | null {
+  const meta = getTranscriptionModelMeta(modelID)
+  return meta?.config_schema ?? meta?.capabilities?.config_schema ?? null
+}
+
 function toggleModel(id: string) {
   expandedModelId.value = expandedModelId.value === id ? '' : id
 }
 
+// Expand the transcription model with the given id, or collapse it when it is already expanded.
+function toggleTranscriptionModel(id: string) {
+  expandedTranscriptionModelId.value = expandedTranscriptionModelId.value === id ? '' : id
+}
+
 async function handleToggleEnable(value: boolean) {
   if (!curProviderId.value || !curProvider.value) return
   const prev = curProvider.value.enable ?? false
@@ -398,20 +501,23 @@ async function handleSaveProvider() {
 }
 
 async function handleSaveModel(modelId: string, config: Record<string, unknown>) {
-  const model = providerModels.value.find((item) => item.id === modelId)
+  const model = providerModels.value.find(item => item.id === modelId)
   if (!model) return
   try {
-    await putModelsById({
-      path: { id: modelId },
-      body: {
-        model_id: model.model_id,
-        name: model.name ?? model.model_id,
-        provider_id: model.provider_id,
-        type: 'speech',
-        config,
+    const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
+    const token = localStorage.getItem('token')
+    const resp = await fetch(`${apiBase}/speech-models/${modelId}`, {
+      method: 'PUT',
+      headers: {
+        'Content-Type': 'application/json',
+        ...(token ? { Authorization: `Bearer ${token}` } : {}),
       },
-      throwOnError: true,
+      body: JSON.stringify({
+        name: model.name ?? model.model_id,
+        config,
+      }),
     })
+    if (!resp.ok) throw new Error(await resp.text())
     toast.success(t('speech.saveSuccess'))
     queryCache.invalidateQueries({ key: ['speech-provider-models', curProviderId.value] })
     queryCache.invalidateQueries({ key: ['speech-models'] })
@@ -420,6 +526,33 @@ async function handleSaveModel(modelId: string, config: Record<string, unknown>)
   }
 }
 
+// Save a transcription model's name and config with PUT /transcription-models/:id, then
+// invalidate the provider's transcription model query. Success and failure are reported via toast.
+async function handleSaveTranscriptionModel(modelId: string, config: Record<string, unknown>) {
+  const model = providerTranscriptionModels.value.find(item => item.id === modelId)
+  if (!model) return
+  try {
+    const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
+    const token = localStorage.getItem('token')
+    const resp = await fetch(`${apiBase}/transcription-models/${modelId}`, {
+      method: 'PUT',
+      headers: {
+        'Content-Type': 'application/json',
+        ...(token ? { Authorization: `Bearer ${token}` } : {}),
+      },
+      body: JSON.stringify({
+        name: model.name ?? model.model_id,
+        config,
+      }),
+    })
+    if (!resp.ok) throw new Error(await resp.text())
+    toast.success(t('speech.saveSuccess'))
+    queryCache.invalidateQueries({ key: ['speech-provider-transcription-models', curProviderId.value] })
+  } catch {
+    toast.error(t('common.saveFailed'))
+  }
+}
+
 async function handleImportModels() {
   if (!curProviderId.value) return
   importLoading.value = true
@@ -442,6 +573,33 @@ async function handleImportModels() {
   }
 }
 
+// POST to the current provider's import-transcription-models endpoint, then invalidate the
+// transcription model list and provider meta queries. The outcome is reported via toast.
+async function handleImportTranscriptionModels() {
+  if (!curProviderId.value) return
+  importTranscriptionLoading.value = true
+  try {
+    const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
+    const token = localStorage.getItem('token')
+    const resp = await fetch(`${apiBase}/speech-providers/${curProviderId.value}/import-transcription-models`, {
+      method: 'POST',
+      headers: token ? { Authorization: `Bearer ${token}` } : undefined,
+    })
+    if (!resp.ok) throw new Error(await resp.text())
+    const data = await resp.json()
+    toast.success(t('speech.transcription.importSuccess', {
+      created: data?.created ?? 0,
+      skipped: data?.skipped ?? 0,
+    }))
+    queryCache.invalidateQueries({ key: ['speech-provider-transcription-models', curProviderId.value] })
+    queryCache.invalidateQueries({ key: ['speech-providers-meta'] })
+  } catch {
+    toast.error(t('speech.transcription.importFailed'))
+  } finally {
+    importTranscriptionLoading.value = false
+  }
+}
+
 async function handleTestModel(modelId: string, text: string, config: Record<string, unknown>) {
   const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
   const token = localStorage.getItem('token')
@@ -466,6 +622,26 @@ async function handleTestModel(modelId: string, text: string, config: Record<str
   return resp.blob()
 }
 
+// Send the audio file and the JSON-encoded config as multipart form data to the model's test
+// endpoint and return the parsed JSON body. Throws an Error carrying the response text when not ok.
+async function handleTestTranscriptionModel(modelId: string, file: File, config: Record<string, unknown>) {
+  const apiBase = import.meta.env.VITE_API_URL?.trim() || '/api'
+  const token = localStorage.getItem('token')
+  const form = new FormData()
+  form.append('file', file)
+  form.append('config', JSON.stringify(config))
+  const resp = await fetch(`${apiBase}/transcription-models/${modelId}/test`, {
+    method: 'POST',
+    headers: token ? { Authorization: `Bearer ${token}` } : undefined,
+    body: form,
+  })
+  if (!resp.ok) {
+    const errBody = await resp.text()
+    throw new Error(errBody)
+  }
+  return await resp.json()
+}
+
 function sanitizeConfig(input: Record<string, unknown>) {
   const result: Record<string, unknown> = {}
   for (const [key, value] of Object.entries(input)) {
diff --git a/cmd/agent/app.go b/cmd/agent/app.go
index 3f959fbc..b4e548c9 100644
--- a/cmd/agent/app.go
+++ b/cmd/agent/app.go
@@ -373,6 +373,7 @@ func provideChannelRouter(
 	processor.SetStreamObserver(local.NewRouteHubBroadcaster(hub))
 	processor.SetDispatcher(inbound.NewRouteDispatcher(log))
 	processor.SetTtsService(ttsService, &settingsTtsModelResolver{settings: settingsService})
+	processor.SetTranscriptionService(&settingsTranscriptionAdapter{tts: ttsService}, &settingsTranscriptionModelResolver{settings: settingsService})
 	cmdHandler := command.NewHandler(
 		log,
 		&command.BotMemberRoleAdapter{BotService: botService},
@@ -468,6 +469,7 @@ func provideToolProviders(log *slog.Logger, cfg config.Config, channelManager *c
 		agenttools.NewSkillProvider(log),
 		agenttools.NewBrowserProvider(log, settingsService, browserContextService, manager, cfg.BrowserGateway),
 		agenttools.NewTTSProvider(log, settingsService, ttsService, channelManager, registry),
+		agenttools.NewTranscriptionProvider(log, settingsService, ttsService, mediaService),
 		agenttools.NewImageGenProvider(log, settingsService, modelsService, queries, manager, config.DefaultDataMount),
 		agenttools.NewFederationProvider(log, fedSource),
 		agenttools.NewHistoryProvider(log, sessionService, queries),
@@ -595,6 +597,47 @@ func (r *settingsTtsModelResolver) ResolveTtsModelID(ctx context.Context, botID
 	return s.TtsModelID, nil
 }
 
+// settingsTranscriptionModelResolver resolves a bot's transcription model ID
+// from the bot's settings.
+type settingsTranscriptionModelResolver struct {
+	settings *settings.Service
+}
+
+// ResolveTranscriptionModelID returns the TranscriptionModelID stored in the
+// settings of the bot identified by botID, or the error from loading them.
+func (r *settingsTranscriptionModelResolver) ResolveTranscriptionModelID(ctx context.Context, botID string) (string, error) {
+	s, err := r.settings.GetBot(ctx, botID)
+	if err != nil {
+		return "", err
+	}
+	return s.TranscriptionModelID, nil
+}
+
+// settingsTranscriptionAdapter wraps the speech service so that its Transcribe
+// call returns an inbound.TranscriptionResult.
+type settingsTranscriptionAdapter struct {
+	tts *ttspkg.Service
+}
+
+// inboundTranscriptionResult holds the text of a finished transcription and is
+// the value handed back as an inbound.TranscriptionResult.
+type inboundTranscriptionResult struct {
+	text string
+}
+
+// GetText returns the transcribed text.
+func (r inboundTranscriptionResult) GetText() string { return r.text }
+
+// Transcribe forwards all arguments to the wrapped service and keeps only the
+// Text of its result; on error it returns a nil result and the error unchanged.
+func (a *settingsTranscriptionAdapter) Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (inbound.TranscriptionResult, error) {
+	result, err := a.tts.Transcribe(ctx, modelID, audio, filename, contentType, overrideCfg)
+	if err != nil {
+		return nil, err
+	}
+	return inboundTranscriptionResult{text: result.Text}, nil
+}
+
 func provideEmailRegistry(log *slog.Logger, tokenStore *emailpkg.DBOAuthTokenStore) *emailpkg.Registry {
 	reg := emailpkg.NewRegistry()
 	reg.Register(emailgeneric.New(log))
diff --git a/db/migrations/0001_init.up.sql b/db/migrations/0001_init.up.sql
index 56fc9fbb..45db41ea 100644
--- a/db/migrations/0001_init.up.sql
+++ b/db/migrations/0001_init.up.sql
@@ -83,7 +83,8 @@ CREATE TABLE IF NOT EXISTS providers (
     'minimax-speech',
     'volcengine-speech',
     'alibabacloud-speech',
-    'microsoft-speech'
+    'microsoft-speech',
+    'google-speech'
   ))
 );
 
@@ -108,7 +109,7 @@ CREATE TABLE IF NOT EXISTS models (
   created_at TIMESTAMPTZ NOT NULL DEFAULT now(),
   updated_at TIMESTAMPTZ NOT NULL DEFAULT now(),
   CONSTRAINT models_provider_id_model_id_unique UNIQUE (provider_id, model_id),
-  CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech'))
+  CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech', 'transcription'))
 );
 
 CREATE TABLE IF NOT EXISTS model_variants (
@@ -170,6 +171,7 @@ CREATE TABLE IF NOT EXISTS bots (
   image_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
   discuss_probe_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
   tts_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
+  transcription_model_id UUID REFERENCES models(id) ON DELETE SET NULL,
   browser_context_id UUID REFERENCES browser_contexts(id) ON DELETE SET NULL,
   persist_full_tool_results BOOLEAN NOT NULL DEFAULT false,
   metadata JSONB NOT NULL DEFAULT '{}'::jsonb,
diff --git a/db/migrations/0069_add_transcription_models_and_google_speech.down.sql b/db/migrations/0069_add_transcription_models_and_google_speech.down.sql
new file mode 100644
index 00000000..10135402
--- /dev/null
+++ b/db/migrations/0069_add_transcription_models_and_google_speech.down.sql
@@ -0,0 +1,33 @@
+-- 0069_add_transcription_models_and_google_speech
+-- Revert transcription model type and Google speech provider support.
+-- Destructive: every transcription model and every google-speech provider row is deleted.
+DELETE FROM models WHERE type = 'transcription';
+DELETE FROM providers WHERE client_type = 'google-speech';
+-- The rows above must be gone before the narrower CHECK constraints below are re-added.
+ALTER TABLE models
+  DROP CONSTRAINT IF EXISTS models_type_check;
+
+ALTER TABLE models
+  ADD CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech'));
+
+ALTER TABLE providers
+  DROP CONSTRAINT IF EXISTS providers_client_type_check;
+
+ALTER TABLE providers
+  ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
+    'openai-responses',
+    'openai-completions',
+    'anthropic-messages',
+    'google-generative-ai',
+    'openai-codex',
+    'github-copilot',
+    'edge-speech',
+    'openai-speech',
+    'openrouter-speech',
+    'elevenlabs-speech',
+    'deepgram-speech',
+    'minimax-speech',
+    'volcengine-speech',
+    'alibabacloud-speech',
+    'microsoft-speech'
+  ));
diff --git a/db/migrations/0069_add_transcription_models_and_google_speech.up.sql b/db/migrations/0069_add_transcription_models_and_google_speech.up.sql
new file mode 100644
index 00000000..e1f24cc3
--- /dev/null
+++ b/db/migrations/0069_add_transcription_models_and_google_speech.up.sql
@@ -0,0 +1,31 @@
+-- 0069_add_transcription_models_and_google_speech
+-- Allow the 'google-speech' provider client type and the 'transcription' model type.
+-- Both CHECK constraints are dropped and recreated with the extended value lists.
+ALTER TABLE providers
+  DROP CONSTRAINT IF EXISTS providers_client_type_check;
+
+ALTER TABLE providers
+  ADD CONSTRAINT providers_client_type_check CHECK (client_type IN (
+    'openai-responses',
+    'openai-completions',
+    'anthropic-messages',
+    'google-generative-ai',
+    'openai-codex',
+    'github-copilot',
+    'edge-speech',
+    'openai-speech',
+    'openrouter-speech',
+    'elevenlabs-speech',
+    'deepgram-speech',
+    'minimax-speech',
+    'volcengine-speech',
+    'alibabacloud-speech',
+    'microsoft-speech',
+    'google-speech'
+  ));
+
+ALTER TABLE models
+  DROP CONSTRAINT IF EXISTS models_type_check;
+
+ALTER TABLE models
+  ADD CONSTRAINT models_type_check CHECK (type IN ('chat', 'embedding', 'speech', 'transcription'));
diff --git a/db/migrations/0070_add_bot_transcription_model.down.sql b/db/migrations/0070_add_bot_transcription_model.down.sql
new file mode 100644
index 00000000..1b8c2475
--- /dev/null
+++ b/db/migrations/0070_add_bot_transcription_model.down.sql
@@ -0,0 +1,8 @@
+-- 0070_add_bot_transcription_model
+-- Remove bots.transcription_model_id together with its foreign key to models.
+
+ALTER TABLE bots
+  DROP CONSTRAINT IF EXISTS bots_transcription_model_id_fkey;
+-- NOTE(review): in PostgreSQL, DROP COLUMN alone would also remove the constraint dropped above.
+ALTER TABLE bots
+  DROP COLUMN IF EXISTS transcription_model_id;
diff --git a/db/migrations/0070_add_bot_transcription_model.up.sql b/db/migrations/0070_add_bot_transcription_model.up.sql
new file mode 100644
index 00000000..acf5a355
--- /dev/null
+++ b/db/migrations/0070_add_bot_transcription_model.up.sql
@@ -0,0 +1,5 @@
+-- 0070_add_bot_transcription_model
+-- Add bots.transcription_model_id for bot-level speech-to-text defaults.
+-- ON DELETE SET NULL: deleting the referenced model clears the bot's setting instead of blocking the delete.
+ALTER TABLE bots
+  ADD COLUMN IF NOT EXISTS transcription_model_id UUID REFERENCES models(id) ON DELETE SET NULL;
diff --git a/db/queries/models.sql b/db/queries/models.sql
index a90e33f8..fd2e0703 100644
--- a/db/queries/models.sql
+++ b/db/queries/models.sql
@@ -27,7 +27,8 @@ WHERE client_type NOT IN (
   'minimax-speech',
   'volcengine-speech',
   'alibabacloud-speech',
-  'microsoft-speech'
+  'microsoft-speech',
+  'google-speech'
 )
 ORDER BY created_at DESC;
 
@@ -59,7 +60,8 @@ WHERE client_type NOT IN (
   'minimax-speech',
   'volcengine-speech',
   'alibabacloud-speech',
-  'microsoft-speech'
+  'microsoft-speech',
+  'google-speech'
 );
 
 -- name: CreateModel :one
@@ -86,7 +88,7 @@ ORDER BY created_at DESC;
 
 -- name: ListModels :many
 SELECT * FROM models
-WHERE type != 'speech'
+WHERE type NOT IN ('speech', 'transcription')
 ORDER BY created_at DESC;
 
 -- name: ListModelsByType :many
@@ -97,7 +99,7 @@ ORDER BY created_at DESC;
 -- name: ListModelsByProviderID :many
 SELECT * FROM models
 WHERE provider_id = sqlc.arg(provider_id)
-  AND type != 'speech'
+  AND type NOT IN ('speech', 'transcription')
 ORDER BY created_at DESC;
 
 -- name: ListModelsByProviderIDAndType :many
@@ -136,9 +138,15 @@ DELETE FROM models
 WHERE provider_id = sqlc.arg(provider_id)
   AND model_id = sqlc.arg(model_id);
 
+-- name: DeleteModelByProviderAndType :exec
+DELETE FROM models
+WHERE provider_id = sqlc.arg(provider_id)
+  AND model_id = sqlc.arg(model_id)
+  AND type = sqlc.arg(type);
+
 -- name: CountModels :one
 SELECT COUNT(*) FROM models
-WHERE type != 'speech';
+WHERE type NOT IN ('speech', 'transcription');
 
 -- name: CountModelsByType :one
 SELECT COUNT(*) FROM models WHERE type = sqlc.arg(type);
@@ -150,11 +158,6 @@ VALUES (sqlc.arg(name), sqlc.arg(client_type), sqlc.arg(icon), false, sqlc.arg(c
 ON CONFLICT (name) DO UPDATE SET
   icon = EXCLUDED.icon,
   client_type = EXCLUDED.client_type,
-  config = CASE
-    WHEN providers.config->>'api_key' IS NOT NULL AND providers.config->>'api_key' != ''
-    THEN jsonb_set(EXCLUDED.config, '{api_key}', providers.config->'api_key')
-    ELSE EXCLUDED.config
-  END,
   updated_at = now()
 RETURNING *;
 
@@ -173,7 +176,7 @@ SELECT m.*
 FROM models m
 JOIN providers p ON m.provider_id = p.id
 WHERE p.enable = true
-  AND m.type != 'speech'
+  AND m.type NOT IN ('speech', 'transcription')
 ORDER BY m.created_at DESC;
 
 -- name: ListEnabledModelsByType :many
@@ -227,7 +230,8 @@ WHERE client_type IN (
   'minimax-speech',
   'volcengine-speech',
   'alibabacloud-speech',
-  'microsoft-speech'
+  'microsoft-speech',
+  'google-speech'
 )
 ORDER BY created_at DESC;
 
@@ -250,3 +254,26 @@ SELECT * FROM models
 WHERE provider_id = sqlc.arg(provider_id)
   AND model_id = sqlc.arg(model_id)
 LIMIT 1;
+
+-- name: GetTranscriptionModelWithProvider :one
+SELECT
+  m.*,
+  p.client_type AS provider_type
+FROM models m
+JOIN providers p ON p.id = m.provider_id
+WHERE m.id = sqlc.arg(id)
+  AND m.type = 'transcription';
+
+-- name: ListTranscriptionModels :many
+SELECT m.*,
+  p.client_type AS provider_type
+FROM models m
+JOIN providers p ON p.id = m.provider_id
+WHERE m.type = 'transcription'
+ORDER BY m.created_at DESC;
+
+-- name: ListTranscriptionModelsByProviderID :many
+SELECT * FROM models
+WHERE provider_id = sqlc.arg(provider_id)
+  AND type = 'transcription'
+ORDER BY created_at DESC;
diff --git a/db/queries/settings.sql b/db/queries/settings.sql
index 2d74994e..53ca739d 100644
--- a/db/queries/settings.sql
+++ b/db/queries/settings.sql
@@ -19,6 +19,7 @@ SELECT
   memory_providers.id AS memory_provider_id,
   image_models.id AS image_model_id,
   tts_models.id AS tts_model_id,
+  transcription_models.id AS transcription_model_id,
   browser_contexts.id AS browser_context_id,
   bots.persist_full_tool_results
 FROM bots
@@ -30,6 +31,7 @@ LEFT JOIN models AS image_models ON image_models.id = bots.image_model_id
 LEFT JOIN search_providers ON search_providers.id = bots.search_provider_id
 LEFT JOIN memory_providers ON memory_providers.id = bots.memory_provider_id
 LEFT JOIN models AS tts_models ON tts_models.id = bots.tts_model_id
+LEFT JOIN models AS transcription_models ON transcription_models.id = bots.transcription_model_id
 LEFT JOIN browser_contexts ON browser_contexts.id = bots.browser_context_id
 WHERE bots.id = $1;
 
@@ -54,11 +56,12 @@ WITH updated AS (
       memory_provider_id = COALESCE(sqlc.narg(memory_provider_id)::uuid, bots.memory_provider_id),
       image_model_id = COALESCE(sqlc.narg(image_model_id)::uuid, bots.image_model_id),
       tts_model_id = COALESCE(sqlc.narg(tts_model_id)::uuid, bots.tts_model_id),
+      transcription_model_id = COALESCE(sqlc.narg(transcription_model_id)::uuid, bots.transcription_model_id),
       browser_context_id = COALESCE(sqlc.narg(browser_context_id)::uuid, bots.browser_context_id),
       persist_full_tool_results = sqlc.arg(persist_full_tool_results),
       updated_at = now()
   WHERE bots.id = sqlc.arg(id)
-  RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.browser_context_id, bots.persist_full_tool_results
+  RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.transcription_model_id, bots.browser_context_id, bots.persist_full_tool_results
 )
 SELECT
   updated.id AS bot_id,
@@ -80,6 +83,7 @@ SELECT
   memory_providers.id AS memory_provider_id,
   image_models.id AS image_model_id,
   tts_models.id AS tts_model_id,
+  transcription_models.id AS transcription_model_id,
   browser_contexts.id AS browser_context_id,
   updated.persist_full_tool_results
 FROM updated
@@ -91,6 +95,7 @@ LEFT JOIN models AS image_models ON image_models.id = updated.image_model_id
 LEFT JOIN search_providers ON search_providers.id = updated.search_provider_id
 LEFT JOIN memory_providers ON memory_providers.id = updated.memory_provider_id
 LEFT JOIN models AS tts_models ON tts_models.id = updated.tts_model_id
+LEFT JOIN models AS transcription_models ON transcription_models.id = updated.transcription_model_id
 LEFT JOIN browser_contexts ON browser_contexts.id = updated.browser_context_id;
 
 -- name: DeleteSettingsByBotID :exec
@@ -112,6 +117,7 @@ SET language = 'auto',
     search_provider_id = NULL,
     memory_provider_id = NULL,
     tts_model_id = NULL,
+    transcription_model_id = NULL,
     browser_context_id = NULL,
     persist_full_tool_results = false,
     updated_at = now()
diff --git a/internal/agent/retry.go b/internal/agent/retry.go
index e63bac41..c84a9f21 100644
--- a/internal/agent/retry.go
+++ b/internal/agent/retry.go
@@ -84,7 +84,7 @@ func retryDelay(attempt int, cfg RetryConfig) time.Duration {
 	if backoffIdx > 20 {
 		backoffIdx = 20
 	}
-	delay := cfg.BaseDelay * time.Duration(1<<uint(backoffIdx))
+	delay := cfg.BaseDelay * time.Duration(1<<backoffIdx)
 	delay = min(delay, cfg.MaxDelay)
 	// Add jitter: random value in [0, delay/2), so final delay is in [delay/2, delay).
 	// math/rand is intentional here — cryptographic randomness is not needed for backoff jitter.
diff --git a/internal/agent/tools/container.go b/internal/agent/tools/container.go
index a048a00e..b0a82df6 100644
--- a/internal/agent/tools/container.go
+++ b/internal/agent/tools/container.go
@@ -295,7 +295,7 @@ func (p *ContainerProvider) execRead(ctx context.Context, session SessionContext
 		content += "\n"
 	}
 
-	content = addLineNumbers(content, int32(lineOffset))
+	content = addLineNumbers(content, lineOffset)
 	return map[string]any{"content": content, "total_lines": totalLines}, nil
 }
 
@@ -757,7 +757,7 @@ func truncateStr(s string, n int) string {
 	return s[:n] + "..."
 }
 
-func addLineNumbers(content string, startLine int32) string {
+func addLineNumbers(content string, startLine int) string {
 	if content == "" {
 		return content
 	}
@@ -765,7 +765,7 @@ func addLineNumbers(content string, startLine int32) string {
 	var out strings.Builder
 	out.Grow(len(content) + len(lines)*8)
 	for i, line := range lines {
-		fmt.Fprintf(&out, "%6d\t%s\n", int(startLine)+i, line)
+		fmt.Fprintf(&out, "%6d\t%s\n", startLine+i, line)
 	}
 	return out.String()
 }
diff --git a/internal/agent/tools/transcribe.go b/internal/agent/tools/transcribe.go
new file mode 100644
index 00000000..ee5f0999
--- /dev/null
+++ b/internal/agent/tools/transcribe.go
@@ -0,0 +1,232 @@
+//nolint:gosec
+package tools
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"log/slog"
+	"net"
+	"net/http"
+	"net/url"
+	"path/filepath"
+	"strings"
+	"time"
+
+	sdk "github.com/memohai/twilight-ai/sdk"
+
+	"github.com/memohai/memoh/internal/media"
+	"github.com/memohai/memoh/internal/settings"
+	ttspkg "github.com/memohai/memoh/internal/tts"
+)
+
+const mediaDataPrefix = "/data/media/" // bot media paths start with this; the remainder is the storage key
+
+type TranscriptionProvider struct { // TranscriptionProvider serves the transcribe_audio agent tool
+	logger   *slog.Logger      // tagged with tool=transcribe_audio by the constructor
+	settings *settings.Service // source of the bot's TranscriptionModelID
+	tts      *ttspkg.Service   // performs the actual transcription
+	media    *media.Service    // opens audio stored under mediaDataPrefix
+	http     *http.Client      // downloads audio referenced by URL
+}
+
+// NewTranscriptionProvider builds the provider behind the transcribe_audio tool. A nil logger falls back to
+// slog.Default(); the HTTP client times out after 30s and re-validates every redirect target with validateURL.
+func NewTranscriptionProvider(log *slog.Logger, settingsSvc *settings.Service, ttsSvc *ttspkg.Service, mediaSvc *media.Service) *TranscriptionProvider {
+	if log == nil {
+		log = slog.Default()
+	}
+	checkRedirect := func(req *http.Request, via []*http.Request) error {
+		if len(via) >= 10 {
+			return errors.New("stopped after 10 redirects")
+		}
+		if _, err := validateURL(req.Context(), req.URL.String()); err != nil {
+			return fmt.Errorf("redirect to non-public address is not allowed: %w", err)
+		}
+		return nil
+	}
+	return &TranscriptionProvider{
+		logger:   log.With(slog.String("tool", "transcribe_audio")),
+		settings: settingsSvc,
+		tts:      ttsSvc,
+		media:    mediaSvc,
+		http:     &http.Client{Timeout: 30 * time.Second, CheckRedirect: checkRedirect},
+	}
+}
+
+// Tools returns the transcribe_audio tool for sessions whose bot has a transcription model configured.
+// Subagent sessions, an unwired provider, a blank bot ID, a failed settings lookup and a missing
+// model all yield no tools and no error.
+func (p *TranscriptionProvider) Tools(ctx context.Context, session SessionContext) ([]sdk.Tool, error) {
+	botID := strings.TrimSpace(session.BotID)
+	if session.IsSubagent || p.settings == nil || p.tts == nil || p.media == nil || botID == "" {
+		return nil, nil
+	}
+	botSettings, err := p.settings.GetBot(ctx, botID)
+	if err != nil || strings.TrimSpace(botSettings.TranscriptionModelID) == "" {
+		return nil, nil
+	}
+	strProp := func(description string) map[string]any { return map[string]any{"type": "string", "description": description} }
+	return []sdk.Tool{{
+		Name:        "transcribe_audio",
+		Description: "Transcribe an audio or voice message into text. Use this when the user sent a voice message and you need to understand its contents. Accepts a bot media path such as /data/media/... or a direct URL.",
+		Parameters: map[string]any{
+			"type": "object",
+			"properties": map[string]any{
+				"path":        strProp("Audio file path from the message context, usually under /data/media/..."),
+				"url":         strProp("Direct audio URL when a path is unavailable"),
+				"language":    strProp("Optional language hint"),
+				"prompt":      strProp("Optional transcription prompt"),
+				"contentType": strProp("Optional MIME type override"),
+			},
+			"required": []string{},
+		},
+		Execute: func(execCtx *sdk.ToolExecContext, input any) (any, error) {
+			return p.execTranscribe(execCtx.Context, session, inputAsMap(input))
+		},
+	}}, nil
+}
+
+// execTranscribe implements a transcribe_audio tool call: it loads the audio referenced by the
+// path or url argument and transcribes it with the bot's configured transcription model.
+func (p *TranscriptionProvider) execTranscribe(ctx context.Context, session SessionContext, args map[string]any) (any, error) {
+	botID := strings.TrimSpace(session.BotID)
+	if botID == "" {
+		return nil, errors.New("bot_id is required")
+	}
+	botSettings, err := p.settings.GetBot(ctx, botID)
+	if err != nil {
+		return nil, fmt.Errorf("failed to load bot settings: %w", err)
+	}
+	modelID := strings.TrimSpace(botSettings.TranscriptionModelID)
+	if modelID == "" {
+		return nil, errors.New("bot has no transcription model configured")
+	}
+
+	path := FirstStringArg(args, "path", "audio_path", "file_path")
+	rawURL := FirstStringArg(args, "url", "audio_url")
+	if path == "" && rawURL == "" {
+		return nil, errors.New("path or url is required")
+	}
+	audio, filename, contentType, err := p.loadAudio(ctx, botID, path, rawURL, FirstStringArg(args, "contentType", "content_type"))
+	if err != nil {
+		return nil, err
+	}
+	override := map[string]any{}
+	if language := FirstStringArg(args, "language"); language != "" {
+		override["language"] = language
+	}
+	if prompt := FirstStringArg(args, "prompt"); prompt != "" {
+		override["prompt"] = prompt
+	}
+	result, err := p.tts.Transcribe(ctx, modelID, audio, filename, contentType, override)
+	if err != nil {
+		return nil, err
+	}
+	return map[string]any{
+		"ok":               true,
+		"text":             result.Text,
+		"language":         result.Language,
+		"duration_seconds": result.DurationSeconds,
+	}, nil
+}
+
+// loadAudio returns audio bytes, a filename and a content type from a bot media path or, when the path is empty, a validated URL.
+func (p *TranscriptionProvider) loadAudio(ctx context.Context, botID, pathValue, rawURL, contentTypeOverride string) ([]byte, string, string, error) {
+	if pathValue != "" {
+		return p.loadAudioFromPath(ctx, botID, pathValue, contentTypeOverride)
+	}
+	const maxAudioDownloadBytes = 64 << 20 // NOTE(review): defensive cap on remote downloads; tune to provider limits
+	u, err := validateURL(ctx, rawURL)
+	if err != nil {
+		return nil, "", "", err
+	}
+	req, err := http.NewRequestWithContext(ctx, http.MethodGet, u.String(), nil)
+	if err != nil {
+		return nil, "", "", err
+	}
+	resp, err := p.http.Do(req)
+	if err != nil {
+		return nil, "", "", err
+	}
+	defer func() { _ = resp.Body.Close() }() // a close error on a response body is not actionable here
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		return nil, "", "", fmt.Errorf("download audio: unexpected status %d", resp.StatusCode)
+	}
+	audio, err := io.ReadAll(io.LimitReader(resp.Body, maxAudioDownloadBytes+1)) // +1 so oversize is detected, not truncated
+	if err != nil {
+		return nil, "", "", err
+	}
+	if len(audio) > maxAudioDownloadBytes {
+		return nil, "", "", fmt.Errorf("download audio: body exceeds %d bytes", maxAudioDownloadBytes)
+	}
+	contentType := strings.TrimSpace(contentTypeOverride)
+	if contentType == "" {
+		contentType = strings.TrimSpace(resp.Header.Get("Content-Type"))
+	}
+	return audio, filepath.Base(strings.TrimSpace(req.URL.Path)), contentType, nil
+}
+
+func (p *TranscriptionProvider) loadAudioFromPath(ctx context.Context, botID, pathValue, contentTypeOverride string) ([]byte, string, string, error) {
+	key, underMedia := strings.CutPrefix(strings.TrimSpace(pathValue), mediaDataPrefix)
+	if key = strings.TrimSpace(key); !underMedia || key == "" { // only non-empty keys under mediaDataPrefix are accepted
+		return nil, "", "", fmt.Errorf("unsupported media path: %s", pathValue)
+	}
+	asset, err := p.media.GetByStorageKey(ctx, botID, key)
+	if err != nil {
+		return nil, "", "", err
+	}
+	stream, _, err := p.media.Open(ctx, botID, asset.ContentHash)
+	if err != nil {
+		return nil, "", "", err
+	}
+	defer func() {
+		if closeErr := stream.Close(); closeErr != nil {
+			p.logger.Warn("failed to close media reader", slog.Any("error", closeErr))
+		}
+	}()
+	audio, err := io.ReadAll(stream)
+	if err != nil {
+		return nil, "", "", err
+	}
+	mime := strings.TrimSpace(contentTypeOverride) // an explicit override wins over the stored MIME type
+	if mime == "" {
+		mime = strings.TrimSpace(asset.Mime)
+	}
+	return audio, filepath.Base(key), mime, nil
+}
+
+// validateURL parses rawURL and accepts it only when it is an http(s) URL whose
+// hostname resolves exclusively to addresses that are not unspecified, loopback,
+// private or link-local. It returns the parsed URL on success.
+// NOTE(review): DNS is resolved here separately from the later dial, so this
+// check alone does not rule out DNS rebinding.
+func validateURL(ctx context.Context, rawURL string) (*url.URL, error) {
+	u, err := url.Parse(rawURL)
+	if err != nil {
+		return nil, fmt.Errorf("invalid url: %w", err)
+	}
+	if u.Scheme != "http" && u.Scheme != "https" {
+		return nil, fmt.Errorf("unsupported scheme: %s", u.Scheme)
+	}
+	hostname := u.Hostname()
+	if hostname == "" {
+		return nil, errors.New("missing hostname in url")
+	}
+	resolver := net.Resolver{}
+	ips, err := resolver.LookupIPAddr(ctx, hostname)
+	if err != nil {
+		return nil, fmt.Errorf("dns lookup failed for %s: %w", hostname, err)
+	}
+	if len(ips) == 0 {
+		return nil, fmt.Errorf("no ip addresses found for %s", hostname)
+	}
+	for _, ip := range ips {
+		// The unspecified address (0.0.0.0 / ::) is commonly routed to the local host when dialed.
+		if ip.IP.IsUnspecified() || ip.IP.IsLoopback() || ip.IP.IsPrivate() || ip.IP.IsLinkLocalUnicast() || ip.IP.IsLinkLocalMulticast() {
+			return nil, fmt.Errorf("url resolves to a non-public ip address: %s", ip.IP.String())
+		}
+	}
+	return u, nil
+}
diff --git a/internal/channel/identities/service_integration_test.go b/internal/channel/identities/service_integration_test.go
index a5b89d71..08111ced 100644
--- a/internal/channel/identities/service_integration_test.go
+++ b/internal/channel/identities/service_integration_test.go
@@ -1,5 +1,4 @@
 //go:build ignore
-// +build ignore
 
 package identities_test
 
diff --git a/internal/channel/inbound/channel.go b/internal/channel/inbound/channel.go
index dc3cc66a..0fe47100 100644
--- a/internal/channel/inbound/channel.go
+++ b/internal/channel/inbound/channel.go
@@ -68,6 +68,21 @@ type ttsModelResolver interface {
 	ResolveTtsModelID(ctx context.Context, botID string) (string, error)
 }
 
+// TranscriptionResult is the minimal speech-to-text result needed by inbound routing: only the recognized text.
+type TranscriptionResult interface {
+	GetText() string
+}
+
+// transcriptionRecognizer converts inbound audio bytes to text using the model identified by modelID.
+type transcriptionRecognizer interface {
+	Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (TranscriptionResult, error)
+}
+
+// transcriptionModelResolver looks up the transcription model ID configured for a bot.
+type transcriptionModelResolver interface {
+	ResolveTranscriptionModelID(ctx context.Context, botID string) (string, error)
+}
+
 // SessionEnsurer resolves or creates an active session for a route.
 type SessionEnsurer interface {
 	EnsureActiveSession(ctx context.Context, botID, routeID, channelType string) (SessionResult, error)
@@ -103,6 +118,8 @@ type ChannelInboundProcessor struct {
 	observer         channel.StreamObserver
 	ttsService       ttsSynthesizer
 	ttsModelResolver ttsModelResolver
+	transcriber      transcriptionRecognizer
+	sttModelResolver transcriptionModelResolver
 	sessionEnsurer   SessionEnsurer
 	pipeline         *pipelinepkg.Pipeline
 	eventStore       *pipelinepkg.EventStore
@@ -198,6 +215,15 @@ func (p *ChannelInboundProcessor) SetTtsService(synth ttsSynthesizer, modelResol
 	p.ttsModelResolver = modelResolver
 }
 
+// SetTranscriptionService installs the recognizer and the model resolver used for speech-to-text on
+// inbound audio attachments.
+// Calling it on a nil processor is a no-op.
+func (p *ChannelInboundProcessor) SetTranscriptionService(recognizer transcriptionRecognizer, modelResolver transcriptionModelResolver) {
+	if p != nil {
+		p.transcriber, p.sttModelResolver = recognizer, modelResolver
+	}
+}
+
 // SetSessionEnsurer configures the session ensurer for auto-creating sessions on routes.
 func (p *ChannelInboundProcessor) SetSessionEnsurer(ensurer SessionEnsurer) {
 	if p == nil {
@@ -326,6 +352,8 @@ func (p *ChannelInboundProcessor) HandleInbound(ctx context.Context, cfg channel
 	}
 
 	resolvedAttachments := p.ingestInboundAttachments(ctx, cfg, msg, strings.TrimSpace(identity.BotID), msg.Message.Attachments)
+	msg.Message.Attachments = resolvedAttachments
+	hadVoiceAttachment := containsVoiceAttachment(resolvedAttachments)
 	attachments := mapChannelToChatAttachments(resolvedAttachments)
 	text = strings.TrimSpace(msg.Message.PlainText())
 
@@ -466,6 +494,24 @@ func (p *ChannelInboundProcessor) HandleInbound(ctx context.Context, cfg channel
 	}
 	shouldTrigger := shouldTriggerAssistantResponse(msg) || identity.ForceReply
 
+	if sessionType == sessionpkg.TypeDiscuss || shouldTrigger {
+		if transcript := p.transcribeInboundAttachments(ctx, strings.TrimSpace(identity.BotID), resolvedAttachments); transcript != "" {
+			labeledTranscript := formatInboundTranscript(transcript)
+			if msg.Message.Metadata == nil {
+				msg.Message.Metadata = make(map[string]any)
+			}
+			msg.Message.Metadata["transcript"] = transcript
+			if plain := strings.TrimSpace(msg.Message.PlainText()); plain == "" {
+				msg.Message.Text = labeledTranscript
+			} else if !strings.Contains(plain, transcript) {
+				msg.Message.Text = plain + "\n\n" + labeledTranscript
+			}
+		} else if hadVoiceAttachment && strings.TrimSpace(msg.Message.PlainText()) == "" {
+			msg.Message.Text = formatVoiceTranscriptionUnavailableNotice(resolvedAttachments)
+		}
+		text = strings.TrimSpace(msg.Message.PlainText())
+	}
+
 	if !shouldTrigger {
 		p.persistPassiveMessage(ctx, identity, msg, text, attachments, resolved.RouteID, sessionID, eventID)
 		if p.logger != nil {
@@ -1900,6 +1946,97 @@ func (p *ChannelInboundProcessor) loadInboundAttachmentPayload(
 	}, nil
 }
 
+// transcribeInboundAttachments transcribes every stored audio/voice attachment with the bot's
+// configured transcription model and returns the non-empty transcripts joined by blank lines.
+// It returns "" when transcription is not configured or nothing could be transcribed; failures
+// on individual attachments are logged and skipped.
+func (p *ChannelInboundProcessor) transcribeInboundAttachments(ctx context.Context, botID string, attachments []channel.Attachment) string {
+	if p == nil || p.transcriber == nil || p.sttModelResolver == nil || p.mediaService == nil || strings.TrimSpace(botID) == "" {
+		return ""
+	}
+	modelID, err := p.sttModelResolver.ResolveTranscriptionModelID(ctx, botID)
+	if err != nil || strings.TrimSpace(modelID) == "" {
+		return ""
+	}
+	transcripts := make([]string, 0, len(attachments))
+	for _, att := range attachments {
+		// Audio is read back from media storage by content hash, so attachments without one are skipped.
+		isAudio := att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice
+		if !isAudio || strings.TrimSpace(att.ContentHash) == "" {
+			continue
+		}
+		reader, asset, err := p.mediaService.Open(ctx, botID, strings.TrimSpace(att.ContentHash))
+		if err != nil {
+			if p.logger != nil {
+				p.logger.Warn("open inbound audio for transcription failed", slog.Any("error", err), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
+			}
+			continue
+		}
+		audio, readErr := io.ReadAll(reader)
+		_ = reader.Close()
+		if readErr != nil || len(audio) == 0 {
+			if p.logger != nil {
+				p.logger.Warn("read inbound audio for transcription failed", slog.Any("error", readErr), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
+			}
+			continue
+		}
+		filename := strings.TrimSpace(att.Name)
+		if filename == "" {
+			filename = "audio" + filepath.Ext(asset.StorageKey)
+		}
+		contentType := strings.TrimSpace(att.Mime)
+		if contentType == "" {
+			contentType = strings.TrimSpace(asset.Mime)
+		}
+		result, txErr := p.transcriber.Transcribe(ctx, modelID, audio, filename, contentType, nil)
+		// A nil result would panic on GetText below, so it is handled like a failure.
+		if txErr != nil || result == nil {
+			if p.logger != nil {
+				p.logger.Warn("inbound attachment transcription failed", slog.Any("error", txErr), slog.String("bot_id", botID), slog.String("content_hash", att.ContentHash))
+			}
+			continue
+		}
+		if text := strings.TrimSpace(result.GetText()); text != "" {
+			transcripts = append(transcripts, text)
+		}
+	}
+	// strings.Join yields "" for an empty slice, which is the "nothing transcribed" result.
+	return strings.Join(transcripts, "\n\n")
+}
+
+// formatInboundTranscript prefixes a trimmed transcript with the voice-message label; blank input yields "".
+func formatInboundTranscript(transcript string) string {
+	if body := strings.TrimSpace(transcript); body != "" {
+		return "[Voice message transcription]\n" + body
+	}
+	return ""
+}
+
+func containsVoiceAttachment(attachments []channel.Attachment) bool { // reports whether any attachment is audio or voice
+	for i := range attachments {
+		if t := attachments[i].Type; t == channel.AttachmentAudio || t == channel.AttachmentVoice {
+			return true
+		}
+	}
+	return false
+}
+
+// formatVoiceTranscriptionUnavailableNotice builds the placeholder text for a voice message that was not
+// transcribed, listing the non-empty URLs of its audio/voice attachments when there are any.
+func formatVoiceTranscriptionUnavailableNotice(attachments []channel.Attachment) string {
+	refs := make([]string, 0, len(attachments))
+	for _, att := range attachments {
+		isAudio := att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice
+		if ref := strings.TrimSpace(att.URL); isAudio && ref != "" {
+			refs = append(refs, ref)
+		}
+	}
+	if len(refs) == 0 {
+		return "[User sent a voice message, but transcription is unavailable.]"
+	}
+	return "[User sent a voice message, but transcription is unavailable. Use transcribe_audio with one of these paths if needed: " + strings.Join(refs, ", ") + "]"
+}
+
 func openInboundAttachmentURL(ctx context.Context, rawURL string) (inboundAttachmentPayload, error) {
 	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawURL, nil)
 	if err != nil {
@@ -2090,6 +2227,9 @@ func mapChannelToChatAttachments(attachments []channel.Attachment) []conversatio
 	}
 	result := make([]conversation.ChatAttachment, 0, len(attachments))
 	for _, att := range attachments {
+		if att.Type == channel.AttachmentAudio || att.Type == channel.AttachmentVoice {
+			continue
+		}
 		ca := conversation.ChatAttachment{
 			Type:        string(att.Type),
 			PlatformKey: att.PlatformKey,
diff --git a/internal/db/sqlc/conversations.sql.go b/internal/db/sqlc/conversations.sql.go
index 838b7a39..518d1c4c 100644
--- a/internal/db/sqlc/conversations.sql.go
+++ b/internal/db/sqlc/conversations.sql.go
@@ -511,7 +511,7 @@ WITH updated AS (
   SET display_name = $1,
       updated_at = now()
   WHERE bots.id = $2
-  RETURNING id, owner_user_id, display_name, avatar_url, timezone, is_active, status, language, reasoning_enabled, reasoning_effort, chat_model_id, search_provider_id, memory_provider_id, heartbeat_enabled, heartbeat_interval, heartbeat_prompt, heartbeat_model_id, compaction_enabled, compaction_threshold, compaction_ratio, compaction_model_id, title_model_id, image_model_id, discuss_probe_model_id, tts_model_id, browser_context_id, persist_full_tool_results, metadata, created_at, updated_at, acl_default_effect
+  RETURNING id, owner_user_id, display_name, avatar_url, timezone, is_active, status, language, reasoning_enabled, reasoning_effort, chat_model_id, search_provider_id, memory_provider_id, heartbeat_enabled, heartbeat_interval, heartbeat_prompt, heartbeat_model_id, compaction_enabled, compaction_threshold, compaction_ratio, compaction_model_id, title_model_id, image_model_id, discuss_probe_model_id, tts_model_id, transcription_model_id, browser_context_id, persist_full_tool_results, metadata, created_at, updated_at, acl_default_effect
 )
 SELECT
   updated.id AS id,
diff --git a/internal/db/sqlc/models.go b/internal/db/sqlc/models.go
index b1f14174..09750f1a 100644
--- a/internal/db/sqlc/models.go
+++ b/internal/db/sqlc/models.go
@@ -34,6 +34,7 @@ type Bot struct {
 	ImageModelID           pgtype.UUID        `json:"image_model_id"`
 	DiscussProbeModelID    pgtype.UUID        `json:"discuss_probe_model_id"`
 	TtsModelID             pgtype.UUID        `json:"tts_model_id"`
+	TranscriptionModelID   pgtype.UUID        `json:"transcription_model_id"`
 	BrowserContextID       pgtype.UUID        `json:"browser_context_id"`
 	PersistFullToolResults bool               `json:"persist_full_tool_results"`
 	Metadata               []byte             `json:"metadata"`
diff --git a/internal/db/sqlc/models.sql.go b/internal/db/sqlc/models.sql.go
index ef48d0e1..a94ca59b 100644
--- a/internal/db/sqlc/models.sql.go
+++ b/internal/db/sqlc/models.sql.go
@@ -13,7 +13,7 @@ import (
 
 const countModels = `-- name: CountModels :one
 SELECT COUNT(*) FROM models
-WHERE type != 'speech'
+WHERE type NOT IN ('speech', 'transcription')
 `
 
 func (q *Queries) CountModels(ctx context.Context) (int64, error) {
@@ -46,7 +46,8 @@ WHERE client_type NOT IN (
   'minimax-speech',
   'volcengine-speech',
   'alibabacloud-speech',
-  'microsoft-speech'
+  'microsoft-speech',
+  'google-speech'
 )
 `
 
@@ -201,6 +202,24 @@ func (q *Queries) DeleteModelByModelID(ctx context.Context, modelID string) erro
 	return err
 }
 
+const deleteModelByProviderAndType = `-- name: DeleteModelByProviderAndType :exec
+DELETE FROM models
+WHERE provider_id = $1
+  AND model_id = $2
+  AND type = $3
+`
+
+type DeleteModelByProviderAndTypeParams struct {
+	ProviderID pgtype.UUID `json:"provider_id"`
+	ModelID    string      `json:"model_id"`
+	Type       string      `json:"type"`
+}
+
+func (q *Queries) DeleteModelByProviderAndType(ctx context.Context, arg DeleteModelByProviderAndTypeParams) error {
+	_, err := q.db.Exec(ctx, deleteModelByProviderAndType, arg.ProviderID, arg.ModelID, arg.Type)
+	return err
+}
+
 const deleteModelByProviderIDAndModelID = `-- name: DeleteModelByProviderIDAndModelID :exec
 DELETE FROM models
 WHERE provider_id = $1
@@ -375,12 +394,51 @@ func (q *Queries) GetSpeechModelWithProvider(ctx context.Context, id pgtype.UUID
 	return i, err
 }
 
+const getTranscriptionModelWithProvider = `-- name: GetTranscriptionModelWithProvider :one
+SELECT
+  m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at,
+  p.client_type AS provider_type
+FROM models m
+JOIN providers p ON p.id = m.provider_id
+WHERE m.id = $1
+  AND m.type = 'transcription'
+`
+
+type GetTranscriptionModelWithProviderRow struct {
+	ID           pgtype.UUID        `json:"id"`
+	ModelID      string             `json:"model_id"`
+	Name         pgtype.Text        `json:"name"`
+	ProviderID   pgtype.UUID        `json:"provider_id"`
+	Type         string             `json:"type"`
+	Config       []byte             `json:"config"`
+	CreatedAt    pgtype.Timestamptz `json:"created_at"`
+	UpdatedAt    pgtype.Timestamptz `json:"updated_at"`
+	ProviderType string             `json:"provider_type"`
+}
+
+func (q *Queries) GetTranscriptionModelWithProvider(ctx context.Context, id pgtype.UUID) (GetTranscriptionModelWithProviderRow, error) {
+	row := q.db.QueryRow(ctx, getTranscriptionModelWithProvider, id)
+	var i GetTranscriptionModelWithProviderRow
+	err := row.Scan(
+		&i.ID,
+		&i.ModelID,
+		&i.Name,
+		&i.ProviderID,
+		&i.Type,
+		&i.Config,
+		&i.CreatedAt,
+		&i.UpdatedAt,
+		&i.ProviderType,
+	)
+	return i, err
+}
+
 const listEnabledModels = `-- name: ListEnabledModels :many
 SELECT m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at
 FROM models m
 JOIN providers p ON m.provider_id = p.id
 WHERE p.enable = true
-  AND m.type != 'speech'
+  AND m.type NOT IN ('speech', 'transcription')
 ORDER BY m.created_at DESC
 `
 
@@ -525,7 +583,7 @@ func (q *Queries) ListModelVariantsByModelUUID(ctx context.Context, modelUuid pg
 
 const listModels = `-- name: ListModels :many
 SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
-WHERE type != 'speech'
+WHERE type NOT IN ('speech', 'transcription')
 ORDER BY created_at DESC
 `
 
@@ -633,7 +691,7 @@ func (q *Queries) ListModelsByProviderClientType(ctx context.Context, clientType
 const listModelsByProviderID = `-- name: ListModelsByProviderID :many
 SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
 WHERE provider_id = $1
-  AND type != 'speech'
+  AND type NOT IN ('speech', 'transcription')
 ORDER BY created_at DESC
 `
 
@@ -753,7 +811,8 @@ WHERE client_type NOT IN (
   'minimax-speech',
   'volcengine-speech',
   'alibabacloud-speech',
-  'microsoft-speech'
+  'microsoft-speech',
+  'google-speech'
 )
 ORDER BY created_at DESC
 `
@@ -886,7 +945,8 @@ WHERE client_type IN (
   'minimax-speech',
   'volcengine-speech',
   'alibabacloud-speech',
-  'microsoft-speech'
+  'microsoft-speech',
+  'google-speech'
 )
 ORDER BY created_at DESC
 `
@@ -921,6 +981,93 @@ func (q *Queries) ListSpeechProviders(ctx context.Context) ([]Provider, error) {
 	return items, nil
 }
 
+const listTranscriptionModels = `-- name: ListTranscriptionModels :many
+SELECT m.id, m.model_id, m.name, m.provider_id, m.type, m.config, m.created_at, m.updated_at,
+  p.client_type AS provider_type
+FROM models m
+JOIN providers p ON p.id = m.provider_id
+WHERE m.type = 'transcription'
+ORDER BY m.created_at DESC
+`
+
+type ListTranscriptionModelsRow struct {
+	ID           pgtype.UUID        `json:"id"`
+	ModelID      string             `json:"model_id"`
+	Name         pgtype.Text        `json:"name"`
+	ProviderID   pgtype.UUID        `json:"provider_id"`
+	Type         string             `json:"type"`
+	Config       []byte             `json:"config"`
+	CreatedAt    pgtype.Timestamptz `json:"created_at"`
+	UpdatedAt    pgtype.Timestamptz `json:"updated_at"`
+	ProviderType string             `json:"provider_type"`
+}
+
+func (q *Queries) ListTranscriptionModels(ctx context.Context) ([]ListTranscriptionModelsRow, error) {
+	rows, err := q.db.Query(ctx, listTranscriptionModels)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var items []ListTranscriptionModelsRow
+	for rows.Next() {
+		var i ListTranscriptionModelsRow
+		if err := rows.Scan(
+			&i.ID,
+			&i.ModelID,
+			&i.Name,
+			&i.ProviderID,
+			&i.Type,
+			&i.Config,
+			&i.CreatedAt,
+			&i.UpdatedAt,
+			&i.ProviderType,
+		); err != nil {
+			return nil, err
+		}
+		items = append(items, i)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return items, nil
+}
+
+const listTranscriptionModelsByProviderID = `-- name: ListTranscriptionModelsByProviderID :many
+SELECT id, model_id, name, provider_id, type, config, created_at, updated_at FROM models
+WHERE provider_id = $1
+  AND type = 'transcription'
+ORDER BY created_at DESC
+`
+
+func (q *Queries) ListTranscriptionModelsByProviderID(ctx context.Context, providerID pgtype.UUID) ([]Model, error) {
+	rows, err := q.db.Query(ctx, listTranscriptionModelsByProviderID, providerID)
+	if err != nil {
+		return nil, err
+	}
+	defer rows.Close()
+	var items []Model
+	for rows.Next() {
+		var i Model
+		if err := rows.Scan(
+			&i.ID,
+			&i.ModelID,
+			&i.Name,
+			&i.ProviderID,
+			&i.Type,
+			&i.Config,
+			&i.CreatedAt,
+			&i.UpdatedAt,
+		); err != nil {
+			return nil, err
+		}
+		items = append(items, i)
+	}
+	if err := rows.Err(); err != nil {
+		return nil, err
+	}
+	return items, nil
+}
+
 const updateModel = `-- name: UpdateModel :one
 UPDATE models
 SET
@@ -1062,11 +1209,6 @@ VALUES ($1, $2, $3, false, $4, '{}')
 ON CONFLICT (name) DO UPDATE SET
   icon = EXCLUDED.icon,
   client_type = EXCLUDED.client_type,
-  config = CASE
-    WHEN providers.config->>'api_key' IS NOT NULL AND providers.config->>'api_key' != ''
-    THEN jsonb_set(EXCLUDED.config, '{api_key}', providers.config->'api_key')
-    ELSE EXCLUDED.config
-  END,
   updated_at = now()
 RETURNING id, name, client_type, icon, enable, config, metadata, created_at, updated_at
 `
diff --git a/internal/db/sqlc/settings.sql.go b/internal/db/sqlc/settings.sql.go
index ccf44284..8e93176f 100644
--- a/internal/db/sqlc/settings.sql.go
+++ b/internal/db/sqlc/settings.sql.go
@@ -30,6 +30,7 @@ SET language = 'auto',
     search_provider_id = NULL,
     memory_provider_id = NULL,
     tts_model_id = NULL,
+    transcription_model_id = NULL,
     browser_context_id = NULL,
     persist_full_tool_results = false,
     updated_at = now()
@@ -62,6 +63,7 @@ SELECT
   memory_providers.id AS memory_provider_id,
   image_models.id AS image_model_id,
   tts_models.id AS tts_model_id,
+  transcription_models.id AS transcription_model_id,
   browser_contexts.id AS browser_context_id,
   bots.persist_full_tool_results
 FROM bots
@@ -73,6 +75,7 @@ LEFT JOIN models AS image_models ON image_models.id = bots.image_model_id
 LEFT JOIN search_providers ON search_providers.id = bots.search_provider_id
 LEFT JOIN memory_providers ON memory_providers.id = bots.memory_provider_id
 LEFT JOIN models AS tts_models ON tts_models.id = bots.tts_model_id
+LEFT JOIN models AS transcription_models ON transcription_models.id = bots.transcription_model_id
 LEFT JOIN browser_contexts ON browser_contexts.id = bots.browser_context_id
 WHERE bots.id = $1
 `
@@ -97,6 +100,7 @@ type GetSettingsByBotIDRow struct {
 	MemoryProviderID       pgtype.UUID `json:"memory_provider_id"`
 	ImageModelID           pgtype.UUID `json:"image_model_id"`
 	TtsModelID             pgtype.UUID `json:"tts_model_id"`
+	TranscriptionModelID   pgtype.UUID `json:"transcription_model_id"`
 	BrowserContextID       pgtype.UUID `json:"browser_context_id"`
 	PersistFullToolResults bool        `json:"persist_full_tool_results"`
 }
@@ -124,6 +128,7 @@ func (q *Queries) GetSettingsByBotID(ctx context.Context, id pgtype.UUID) (GetSe
 		&i.MemoryProviderID,
 		&i.ImageModelID,
 		&i.TtsModelID,
+		&i.TranscriptionModelID,
 		&i.BrowserContextID,
 		&i.PersistFullToolResults,
 	)
@@ -151,11 +156,12 @@ WITH updated AS (
       memory_provider_id = COALESCE($16::uuid, bots.memory_provider_id),
       image_model_id = COALESCE($17::uuid, bots.image_model_id),
       tts_model_id = COALESCE($18::uuid, bots.tts_model_id),
-      browser_context_id = COALESCE($19::uuid, bots.browser_context_id),
-      persist_full_tool_results = $20,
+      transcription_model_id = COALESCE($19::uuid, bots.transcription_model_id),
+      browser_context_id = COALESCE($20::uuid, bots.browser_context_id),
+      persist_full_tool_results = $21,
       updated_at = now()
-  WHERE bots.id = $21
-  RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.browser_context_id, bots.persist_full_tool_results
+  WHERE bots.id = $22
+  RETURNING bots.id, bots.language, bots.reasoning_enabled, bots.reasoning_effort, bots.heartbeat_enabled, bots.heartbeat_interval, bots.heartbeat_prompt, bots.compaction_enabled, bots.compaction_threshold, bots.compaction_ratio, bots.timezone, bots.chat_model_id, bots.heartbeat_model_id, bots.compaction_model_id, bots.title_model_id, bots.image_model_id, bots.search_provider_id, bots.memory_provider_id, bots.tts_model_id, bots.transcription_model_id, bots.browser_context_id, bots.persist_full_tool_results
 )
 SELECT
   updated.id AS bot_id,
@@ -177,6 +183,7 @@ SELECT
   memory_providers.id AS memory_provider_id,
   image_models.id AS image_model_id,
   tts_models.id AS tts_model_id,
+  transcription_models.id AS transcription_model_id,
   browser_contexts.id AS browser_context_id,
   updated.persist_full_tool_results
 FROM updated
@@ -188,6 +195,7 @@ LEFT JOIN models AS image_models ON image_models.id = updated.image_model_id
 LEFT JOIN search_providers ON search_providers.id = updated.search_provider_id
 LEFT JOIN memory_providers ON memory_providers.id = updated.memory_provider_id
 LEFT JOIN models AS tts_models ON tts_models.id = updated.tts_model_id
+LEFT JOIN models AS transcription_models ON transcription_models.id = updated.transcription_model_id
 LEFT JOIN browser_contexts ON browser_contexts.id = updated.browser_context_id
 `
 
@@ -210,6 +218,7 @@ type UpsertBotSettingsParams struct {
 	MemoryProviderID       pgtype.UUID `json:"memory_provider_id"`
 	ImageModelID           pgtype.UUID `json:"image_model_id"`
 	TtsModelID             pgtype.UUID `json:"tts_model_id"`
+	TranscriptionModelID   pgtype.UUID `json:"transcription_model_id"`
 	BrowserContextID       pgtype.UUID `json:"browser_context_id"`
 	PersistFullToolResults bool        `json:"persist_full_tool_results"`
 	ID                     pgtype.UUID `json:"id"`
@@ -235,6 +244,7 @@ type UpsertBotSettingsRow struct {
 	MemoryProviderID       pgtype.UUID `json:"memory_provider_id"`
 	ImageModelID           pgtype.UUID `json:"image_model_id"`
 	TtsModelID             pgtype.UUID `json:"tts_model_id"`
+	TranscriptionModelID   pgtype.UUID `json:"transcription_model_id"`
 	BrowserContextID       pgtype.UUID `json:"browser_context_id"`
 	PersistFullToolResults bool        `json:"persist_full_tool_results"`
 }
@@ -259,6 +269,7 @@ func (q *Queries) UpsertBotSettings(ctx context.Context, arg UpsertBotSettingsPa
 		arg.MemoryProviderID,
 		arg.ImageModelID,
 		arg.TtsModelID,
+		arg.TranscriptionModelID,
 		arg.BrowserContextID,
 		arg.PersistFullToolResults,
 		arg.ID,
@@ -284,6 +295,7 @@ func (q *Queries) UpsertBotSettings(ctx context.Context, arg UpsertBotSettingsPa
 		&i.MemoryProviderID,
 		&i.ImageModelID,
 		&i.TtsModelID,
+		&i.TranscriptionModelID,
 		&i.BrowserContextID,
 		&i.PersistFullToolResults,
 	)
diff --git a/internal/handlers/tts_providers.go b/internal/handlers/tts_providers.go
index 52814382..b8440bae 100644
--- a/internal/handlers/tts_providers.go
+++ b/internal/handlers/tts_providers.go
@@ -1,9 +1,12 @@
 package handlers
 
 import (
+	"encoding/json"
 	"errors"
 	"fmt"
+	"io"
 	"log/slog"
+	"mime/multipart"
 	"net/http"
 	"strings"
 
@@ -34,12 +37,22 @@ func (h *SpeechHandler) Register(e *echo.Echo) {
 	pg.GET("/meta", h.ListMeta)
 	pg.GET("/:id/models", h.ListModelsByProvider)
 	pg.POST("/:id/import-models", h.ImportModels)
+	pg.GET("/:id/transcription-models", h.ListTranscriptionModelsByProvider)
+	pg.POST("/:id/import-transcription-models", h.ImportTranscriptionModels)
 
 	mg := e.Group("/speech-models")
 	mg.GET("", h.ListModels)
 	mg.GET("/:id", h.GetModel)
+	mg.PUT("/:id", h.UpdateModel)
 	mg.GET("/:id/capabilities", h.GetModelCapabilities)
 	mg.POST("/:id/test", h.TestModel)
+
+	tg := e.Group("/transcription-models")
+	tg.GET("", h.ListTranscriptionModels)
+	tg.GET("/:id", h.GetTranscriptionModel)
+	tg.PUT("/:id", h.UpdateTranscriptionModel)
+	tg.GET("/:id/capabilities", h.GetTranscriptionModelCapabilities)
+	tg.POST("/:id/test", h.TestTranscriptionModel)
 }
 
 // ListMeta godoc
@@ -167,6 +180,70 @@ func (h *SpeechHandler) ImportModels(c echo.Context) error {
 	return c.JSON(http.StatusOK, resp)
 }
 
+// ListTranscriptionModelsByProvider returns, as JSON, the transcription
+// models the service reports for the provider named by the ":id" path
+// parameter. A blank id yields 400; any service error yields 500.
+func (h *SpeechHandler) ListTranscriptionModelsByProvider(c echo.Context) error {
+	id := strings.TrimSpace(c.Param("id"))
+	if id == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	items, err := h.service.ListTranscriptionModelsByProvider(c.Request().Context(), id)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
+	}
+	return c.JSON(http.StatusOK, items)
+}
+
+// ImportTranscriptionModels fetches the remote transcription model list for
+// the provider named by the ":id" path parameter and creates each entry as a
+// model of type transcription. A model with a blank name is stored under its
+// ID. Entries rejected with models.ErrModelIDAlreadyExists are counted as
+// skipped; any other create error is logged and the entry is left out of the
+// response. A blank id yields 400 and a failed remote fetch yields 500.
+func (h *SpeechHandler) ImportTranscriptionModels(c echo.Context) error {
+	id := strings.TrimSpace(c.Param("id"))
+	if id == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+
+	remoteModels, err := h.service.FetchRemoteTranscriptionModels(c.Request().Context(), id)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusInternalServerError, fmt.Sprintf("fetch remote transcription models: %v", err))
+	}
+
+	resp := tts.ImportModelsResponse{
+		Models: make([]string, 0, len(remoteModels)),
+	}
+
+	for _, model := range remoteModels {
+		name := strings.TrimSpace(model.Name)
+		if name == "" {
+			name = model.ID
+		}
+
+		_, err := h.modelsService.Create(c.Request().Context(), models.AddRequest{
+			ModelID:    model.ID,
+			Name:       name,
+			ProviderID: id,
+			Type:       models.ModelTypeTranscription,
+			Config:     models.ModelConfig{},
+		})
+		if err != nil {
+			if errors.Is(err, models.ErrModelIDAlreadyExists) {
+				resp.Skipped++
+				continue
+			}
+			h.logger.Warn("failed to import transcription model", slog.String("model_id", model.ID), slog.Any("error", err))
+			continue
+		}
+		resp.Created++
+		resp.Models = append(resp.Models, model.ID)
+	}
+
+	return c.JSON(http.StatusOK, resp)
+}
+
 // ListModels godoc
 // @Summary List all speech models
 // @Description List all models of type 'speech' (filtered view of unified models table)
@@ -183,6 +251,16 @@ func (h *SpeechHandler) ListModels(c echo.Context) error {
 	return c.JSON(http.StatusOK, items)
 }
 
+// ListTranscriptionModels returns, as JSON, every transcription model the
+// service reports. Any service error yields 500.
+func (h *SpeechHandler) ListTranscriptionModels(c echo.Context) error {
+	items, err := h.service.ListTranscriptionModels(c.Request().Context())
+	if err != nil {
+		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
+	}
+	return c.JSON(http.StatusOK, items)
+}
+
 // GetModel godoc
 // @Summary Get a speech model
 // @Tags speech-models
@@ -203,6 +279,62 @@ func (h *SpeechHandler) GetModel(c echo.Context) error {
 	return c.JSON(http.StatusOK, resp)
 }
 
+// UpdateModel binds the request body to tts.UpdateSpeechModelRequest and
+// passes it to the service's UpdateSpeechModel for the model named by the
+// ":id" path parameter, returning the service's response as JSON. A blank id
+// or an unbindable body yields 400; any service error yields 500.
+func (h *SpeechHandler) UpdateModel(c echo.Context) error {
+	id := strings.TrimSpace(c.Param("id"))
+	if id == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	var req tts.UpdateSpeechModelRequest
+	if err := c.Bind(&req); err != nil {
+		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
+	}
+	resp, err := h.service.UpdateSpeechModel(c.Request().Context(), id, req)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
+	}
+	return c.JSON(http.StatusOK, resp)
+}
+
+// GetTranscriptionModel returns, as JSON, the transcription model named by
+// the ":id" path parameter. A blank id yields 400; any service error is
+// reported as 404.
+func (h *SpeechHandler) GetTranscriptionModel(c echo.Context) error {
+	id := strings.TrimSpace(c.Param("id"))
+	if id == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	resp, err := h.service.GetTranscriptionModel(c.Request().Context(), id)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusNotFound, err.Error())
+	}
+	return c.JSON(http.StatusOK, resp)
+}
+
+// UpdateTranscriptionModel binds the request body to
+// tts.UpdateSpeechModelRequest (the same request type UpdateModel uses) and
+// passes it to the service's UpdateTranscriptionModel for the model named by
+// the ":id" path parameter, returning the service's response as JSON. A blank
+// id or an unbindable body yields 400; any service error yields 500.
+func (h *SpeechHandler) UpdateTranscriptionModel(c echo.Context) error {
+	id := strings.TrimSpace(c.Param("id"))
+	if id == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	var req tts.UpdateSpeechModelRequest
+	if err := c.Bind(&req); err != nil {
+		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
+	}
+	resp, err := h.service.UpdateTranscriptionModel(c.Request().Context(), id, req)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
+	}
+	return c.JSON(http.StatusOK, resp)
+}
+
 // GetModelCapabilities godoc
 // @Summary Get speech model capabilities
 // @Tags speech-models
@@ -223,6 +343,21 @@ func (h *SpeechHandler) GetModelCapabilities(c echo.Context) error {
 	return c.JSON(http.StatusOK, caps)
 }
 
+// GetTranscriptionModelCapabilities returns, as JSON, the capabilities the
+// service reports for the transcription model named by the ":id" path
+// parameter. A blank id yields 400; any service error is reported as 404.
+func (h *SpeechHandler) GetTranscriptionModelCapabilities(c echo.Context) error {
+	id := strings.TrimSpace(c.Param("id"))
+	if id == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	caps, err := h.service.GetTranscriptionModelCapabilities(c.Request().Context(), id)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusNotFound, err.Error())
+	}
+	return c.JSON(http.StatusOK, caps)
+}
+
 // TestModel godoc
 // @Summary Test speech model synthesis
 // @Description Synthesize text using a specific model's config and return audio
@@ -258,3 +390,64 @@ func (h *SpeechHandler) TestModel(c echo.Context) error {
 	}
 	return c.Blob(http.StatusOK, contentType, audio)
 }
+
+// TestTranscriptionModel transcribes an uploaded audio file with the
+// transcription model named by the ":id" path parameter. The audio is taken
+// from the multipart form field "file" and read fully into memory; the
+// optional form field "config" is decoded as JSON into a map and passed to
+// the service alongside the file name and its Content-Type header. The JSON
+// response carries the text, language, duration, provider metadata and, when
+// the result has any, per-word entries. A blank id, a missing or unreadable
+// file, or undecodable config yields 400; a service error yields 500.
+func (h *SpeechHandler) TestTranscriptionModel(c echo.Context) error {
+	id := strings.TrimSpace(c.Param("id"))
+	if id == "" {
+		return echo.NewHTTPError(http.StatusBadRequest, "id is required")
+	}
+	file, err := c.FormFile("file")
+	if err != nil {
+		return echo.NewHTTPError(http.StatusBadRequest, "file is required")
+	}
+	src, err := file.Open()
+	if err != nil {
+		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
+	}
+	defer func(src multipart.File) {
+		err := src.Close()
+		if err != nil {
+			h.logger.Warn("failed to close uploaded file", slog.Any("error", err))
+		}
+	}(src)
+	audio, err := io.ReadAll(src)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusBadRequest, err.Error())
+	}
+	var cfg map[string]any
+	if raw := strings.TrimSpace(c.FormValue("config")); raw != "" {
+		if err := json.Unmarshal([]byte(raw), &cfg); err != nil {
+			return echo.NewHTTPError(http.StatusBadRequest, "invalid config")
+		}
+	}
+	result, err := h.service.Transcribe(c.Request().Context(), id, audio, file.Filename, file.Header.Get("Content-Type"), cfg)
+	if err != nil {
+		return echo.NewHTTPError(http.StatusInternalServerError, err.Error())
+	}
+	resp := tts.TestTranscriptionResponse{
+		Text:            result.Text,
+		Language:        result.Language,
+		DurationSeconds: result.DurationSeconds,
+		Metadata:        result.ProviderMetadata,
+	}
+	if len(result.Words) > 0 {
+		resp.Words = make([]tts.TranscriptionWord, 0, len(result.Words))
+		for _, word := range result.Words {
+			resp.Words = append(resp.Words, tts.TranscriptionWord{
+				Text:      word.Text,
+				Start:     word.Start,
+				End:       word.End,
+				SpeakerID: word.SpeakerID,
+			})
+		}
+	}
+	return c.JSON(http.StatusOK, resp)
+}
diff --git a/internal/models/models.go b/internal/models/models.go
index 9ed7784d..3c2f04d0 100644
--- a/internal/models/models.go
+++ b/internal/models/models.go
@@ -126,9 +126,9 @@ func (s *Service) List(ctx context.Context) ([]GetResponse, error) {
 	return s.convertToGetResponseList(dbModels), nil
 }
 
-// ListByType returns models filtered by type (chat, embedding, or speech).
+// ListByType returns models filtered by type.
 func (s *Service) ListByType(ctx context.Context, modelType ModelType) ([]GetResponse, error) {
-	if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
+	if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
 		return nil, fmt.Errorf("invalid model type: %s", modelType)
 	}
 
@@ -165,7 +165,7 @@ func (s *Service) ListEnabled(ctx context.Context) ([]GetResponse, error) {
 
 // ListEnabledByType returns models from enabled providers filtered by type.
 func (s *Service) ListEnabledByType(ctx context.Context, modelType ModelType) ([]GetResponse, error) {
-	if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
+	if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
 		return nil, fmt.Errorf("invalid model type: %s", modelType)
 	}
 	dbModels, err := s.queries.ListEnabledModelsByType(ctx, string(modelType))
@@ -206,7 +206,7 @@ func (s *Service) ListByProviderID(ctx context.Context, providerID string) ([]Ge
 
 // ListByProviderIDAndType returns models filtered by provider ID and type.
 func (s *Service) ListByProviderIDAndType(ctx context.Context, providerID string, modelType ModelType) ([]GetResponse, error) {
-	if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
+	if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
 		return nil, fmt.Errorf("invalid model type: %s", modelType)
 	}
 	if strings.TrimSpace(providerID) == "" {
@@ -361,7 +361,7 @@ func (s *Service) Count(ctx context.Context) (int64, error) {
 
 // CountByType returns the number of models of a specific type.
 func (s *Service) CountByType(ctx context.Context, modelType ModelType) (int64, error) {
-	if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech {
+	if modelType != ModelTypeChat && modelType != ModelTypeEmbedding && modelType != ModelTypeSpeech && modelType != ModelTypeTranscription {
 		return 0, fmt.Errorf("invalid model type: %s", modelType)
 	}
 
@@ -438,7 +438,8 @@ func IsValidClientType(clientType ClientType) bool {
 		ClientTypeMiniMaxSpeech,
 		ClientTypeVolcengineSpeech,
 		ClientTypeAlibabaSpeech,
-		ClientTypeMicrosoftSpeech:
+		ClientTypeMicrosoftSpeech,
+		ClientTypeGoogleSpeech:
 		return true
 	default:
 		return false
diff --git a/internal/models/types.go b/internal/models/types.go
index f9185f85..a4ef8e1b 100644
--- a/internal/models/types.go
+++ b/internal/models/types.go
@@ -9,9 +9,10 @@ import (
 type ModelType string
 
 const (
-	ModelTypeChat      ModelType = "chat"
-	ModelTypeEmbedding ModelType = "embedding"
-	ModelTypeSpeech    ModelType = "speech"
+	ModelTypeChat          ModelType = "chat"
+	ModelTypeEmbedding     ModelType = "embedding"
+	ModelTypeSpeech        ModelType = "speech"
+	ModelTypeTranscription ModelType = "transcription"
 )
 
 type ClientType string
@@ -32,6 +33,7 @@ const (
 	ClientTypeVolcengineSpeech   ClientType = "volcengine-speech"
 	ClientTypeAlibabaSpeech      ClientType = "alibabacloud-speech"
 	ClientTypeMicrosoftSpeech    ClientType = "microsoft-speech"
+	ClientTypeGoogleSpeech       ClientType = "google-speech"
 )
 
 const (
@@ -88,7 +90,7 @@ func (m *Model) Validate() error {
 	if _, err := uuid.Parse(m.ProviderID); err != nil {
 		return errors.New("provider ID must be a valid UUID")
 	}
-	if m.Type != ModelTypeChat && m.Type != ModelTypeEmbedding && m.Type != ModelTypeSpeech {
+	if m.Type != ModelTypeChat && m.Type != ModelTypeEmbedding && m.Type != ModelTypeSpeech && m.Type != ModelTypeTranscription {
 		return errors.New("invalid model type")
 	}
 	if m.Type == ModelTypeEmbedding {
diff --git a/internal/settings/service.go b/internal/settings/service.go
index 189a80bd..7af9c4ff 100644
--- a/internal/settings/service.go
+++ b/internal/settings/service.go
@@ -175,6 +175,14 @@ func (s *Service) UpsertBot(ctx context.Context, botID string, req UpsertRequest
 		}
 		ttsModelUUID = modelID
 	}
+	transcriptionModelUUID := pgtype.UUID{}
+	if value := strings.TrimSpace(req.TranscriptionModelID); value != "" {
+		modelID, err := db.ParseUUID(value)
+		if err != nil {
+			return Settings{}, err
+		}
+		transcriptionModelUUID = modelID
+	}
 	browserContextUUID := pgtype.UUID{}
 	if value := strings.TrimSpace(req.BrowserContextID); value != "" {
 		ctxID, err := db.ParseUUID(value)
@@ -204,6 +212,7 @@ func (s *Service) UpsertBot(ctx context.Context, botID string, req UpsertRequest
 		SearchProviderID:       searchProviderUUID,
 		MemoryProviderID:       memoryProviderUUID,
 		TtsModelID:             ttsModelUUID,
+		TranscriptionModelID:   transcriptionModelUUID,
 		BrowserContextID:       browserContextUUID,
 		PersistFullToolResults: current.PersistFullToolResults,
 	})
@@ -298,6 +307,7 @@ func normalizeBotSettingsReadRow(row sqlc.GetSettingsByBotIDRow) Settings {
 		row.SearchProviderID,
 		row.MemoryProviderID,
 		row.TtsModelID,
+		row.TranscriptionModelID,
 		row.BrowserContextID,
 		row.PersistFullToolResults,
 	)
@@ -322,6 +332,7 @@ func normalizeBotSettingsWriteRow(row sqlc.UpsertBotSettingsRow) Settings {
 		row.SearchProviderID,
 		row.MemoryProviderID,
 		row.TtsModelID,
+		row.TranscriptionModelID,
 		row.BrowserContextID,
 		row.PersistFullToolResults,
 	)
@@ -345,6 +356,7 @@ func normalizeBotSettingsFields(
 	searchProviderID pgtype.UUID,
 	memoryProviderID pgtype.UUID,
 	ttsModelID pgtype.UUID,
+	transcriptionModelID pgtype.UUID,
 	browserContextID pgtype.UUID,
 	persistFullToolResults bool,
 ) Settings {
@@ -376,6 +388,9 @@ func normalizeBotSettingsFields(
 	if ttsModelID.Valid {
 		settings.TtsModelID = uuid.UUID(ttsModelID.Bytes).String()
 	}
+	if transcriptionModelID.Valid {
+		settings.TranscriptionModelID = uuid.UUID(transcriptionModelID.Bytes).String()
+	}
 	if browserContextID.Valid {
 		settings.BrowserContextID = uuid.UUID(browserContextID.Bytes).String()
 	}
diff --git a/internal/settings/types.go b/internal/settings/types.go
index 45c82065..df802c3e 100644
--- a/internal/settings/types.go
+++ b/internal/settings/types.go
@@ -12,6 +12,7 @@ type Settings struct {
 	SearchProviderID       string `json:"search_provider_id"`
 	MemoryProviderID       string `json:"memory_provider_id"`
 	TtsModelID             string `json:"tts_model_id"`
+	TranscriptionModelID   string `json:"transcription_model_id"`
 	BrowserContextID       string `json:"browser_context_id"`
 	Language               string `json:"language"`
 	AclDefaultEffect       string `json:"acl_default_effect"`
@@ -36,6 +37,7 @@ type UpsertRequest struct {
 	SearchProviderID       string  `json:"search_provider_id,omitempty"`
 	MemoryProviderID       string  `json:"memory_provider_id,omitempty"`
 	TtsModelID             string  `json:"tts_model_id,omitempty"`
+	TranscriptionModelID   string  `json:"transcription_model_id,omitempty"`
 	BrowserContextID       string  `json:"browser_context_id,omitempty"`
 	Language               string  `json:"language,omitempty"`
 	AclDefaultEffect       string  `json:"acl_default_effect,omitempty"`
diff --git a/internal/tts/bootstrap.go b/internal/tts/bootstrap.go
index 70d52b29..91e6a0d7 100644
--- a/internal/tts/bootstrap.go
+++ b/internal/tts/bootstrap.go
@@ -9,6 +9,7 @@ import (
 	"github.com/jackc/pgx/v5/pgtype"
 
 	"github.com/memohai/memoh/internal/db/sqlc"
+	"github.com/memohai/memoh/internal/models"
 )
 
 func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Queries, registry *Registry) error {
@@ -34,10 +35,11 @@ func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Querie
 
 		synced := 0
 		for _, model := range def.Models {
-			if shouldHideTemplateModel(def, model.ID) {
-				if err := queries.DeleteModelByProviderIDAndModelID(ctx, sqlc.DeleteModelByProviderIDAndModelIDParams{
+			if shouldHideTemplateModel(def, models.ModelTypeSpeech, model.ID) {
+				if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
 					ProviderID: provider.ID,
 					ModelID:    model.ID,
+					Type:       string(models.ModelTypeSpeech),
 				}); err != nil {
 					return fmt.Errorf("delete hidden speech template model %s: %w", model.ID, err)
 				}
@@ -52,13 +54,42 @@ func SyncRegistry(ctx context.Context, logger *slog.Logger, queries *sqlc.Querie
 				ModelID:    model.ID,
 				Name:       name,
 				ProviderID: provider.ID,
-				Type:       "speech",
+				Type:       string(models.ModelTypeSpeech),
 				Config:     modelConfigJSON,
 			}); err != nil {
 				return fmt.Errorf("upsert speech model %s: %w", model.ID, err)
 			}
 			synced++
 		}
+		for _, model := range def.TranscriptionModels {
+			if shouldHideTemplateModel(def, models.ModelTypeTranscription, model.ID) {
+				if err := queries.DeleteModelByProviderAndType(ctx, sqlc.DeleteModelByProviderAndTypeParams{
+					ProviderID: provider.ID,
+					ModelID:    model.ID,
+					Type:       string(models.ModelTypeTranscription),
+				}); err != nil {
+					return fmt.Errorf("delete hidden transcription template model %s: %w", model.ID, err)
+				}
+				continue
+			}
+			modelConfigJSON, err := json.Marshal(map[string]any{})
+			if err != nil {
+				return fmt.Errorf("marshal transcription model config: %w", err)
+			}
+			name := pgtype.Text{String: model.Name, Valid: model.Name != ""}
+			if _, err := queries.UpsertRegistryModel(ctx, sqlc.UpsertRegistryModelParams{
+				ModelID:    model.ID,
+				Name:       name,
+				ProviderID: provider.ID,
+				Type:       string(models.ModelTypeTranscription),
+				Config:     modelConfigJSON,
+			}); err != nil {
+				return fmt.Errorf("upsert transcription model %s: %w", model.ID, err)
+			}
+			// Count transcription models too so the "models" figure logged below
+			// covers everything this pass upserted, not only speech models.
+			synced++
+		}
 
 		if logger != nil {
 			logger.Info("speech registry synced", slog.String("provider", string(def.ClientType)), slog.Int("models", synced))
diff --git a/internal/tts/registry.go b/internal/tts/registry.go
index 9342288a..8a2d9ed1 100644
--- a/internal/tts/registry.go
+++ b/internal/tts/registry.go
@@ -8,31 +8,48 @@ import (
 
 	alibabaspeech "github.com/memohai/twilight-ai/provider/alibabacloud/speech"
 	deepgramspeech "github.com/memohai/twilight-ai/provider/deepgram/speech"
+	deepgramtranscription "github.com/memohai/twilight-ai/provider/deepgram/transcription"
 	edgespeech "github.com/memohai/twilight-ai/provider/edge/speech"
 	elevenlabsspeech "github.com/memohai/twilight-ai/provider/elevenlabs/speech"
+	elevenlabstranscription "github.com/memohai/twilight-ai/provider/elevenlabs/transcription"
+	googletranscription "github.com/memohai/twilight-ai/provider/google/transcription"
 	microsoftspeech "github.com/memohai/twilight-ai/provider/microsoft/speech"
 	minimaxspeech "github.com/memohai/twilight-ai/provider/minimax/speech"
 	openaispeech "github.com/memohai/twilight-ai/provider/openai/speech"
+	openaitranscription "github.com/memohai/twilight-ai/provider/openai/transcription"
 	openrouterspeech "github.com/memohai/twilight-ai/provider/openrouter/speech"
+	openroutertranscription "github.com/memohai/twilight-ai/provider/openrouter/transcription"
 	volcenginespeech "github.com/memohai/twilight-ai/provider/volcengine/speech"
 	sdk "github.com/memohai/twilight-ai/sdk"
 
 	"github.com/memohai/memoh/internal/models"
 )
 
-type ProviderFactory func(config map[string]any) (sdk.SpeechProvider, error)
+// ProviderFactory and TranscriptionProviderFactory build the SDK provider for
+// speech synthesis and for transcription, respectively, from a provider config map.
+type (
+	ProviderFactory              func(config map[string]any) (sdk.SpeechProvider, error)
+	TranscriptionProviderFactory func(config map[string]any) (sdk.TranscriptionProvider, error)
+)
 
+// ProviderDefinition is the registry entry for one provider client type: its
+// display metadata and config schema, plus the default model, template models
+// and factory for speech synthesis and, separately, for transcription.
 type ProviderDefinition struct {
-	ClientType   models.ClientType
-	DisplayName  string
-	Icon         string
-	Description  string
-	ConfigSchema ConfigSchema
-	DefaultModel string
-	SupportsList bool
-	Models       []ModelInfo
-	Factory      ProviderFactory
-	Order        int
+	ClientType                models.ClientType
+	DisplayName               string
+	Icon                      string
+	Description               string
+	ConfigSchema              ConfigSchema
+	DefaultModel              string
+	SupportsList              bool
+	Models                    []ModelInfo
+	Factory                   ProviderFactory
+	DefaultTranscriptionModel string
+	SupportsTranscriptionList bool
+	TranscriptionModels       []ModelInfo
+	TranscriptionFactory      TranscriptionProviderFactory
+	Order                     int
 }
 
 type Registry struct {
@@ -94,12 +106,18 @@ func (r *Registry) ListMeta() []ProviderMetaResponse {
 	metas := make([]ProviderMetaResponse, 0, len(defs))
 	for _, def := range defs {
 		metas = append(metas, ProviderMetaResponse{
-			Provider:     string(def.ClientType),
-			DisplayName:  def.DisplayName,
-			Description:  def.Description,
-			ConfigSchema: def.ConfigSchema,
-			DefaultModel: def.DefaultModel,
-			Models:       def.Models,
+			Provider:                  string(def.ClientType),
+			DisplayName:               def.DisplayName,
+			Description:               def.Description,
+			ConfigSchema:              def.ConfigSchema,
+			DefaultModel:              def.DefaultModel,
+			Models:                    def.Models,
+			DefaultSynthesisModel:     def.DefaultModel,
+			SynthesisModels:           def.Models,
+			SupportsSynthesisList:     def.SupportsList,
+			DefaultTranscriptionModel: def.DefaultTranscriptionModel,
+			TranscriptionModels:       def.TranscriptionModels,
+			SupportsTranscriptionList: def.SupportsTranscriptionList,
 		})
 	}
 	return metas
@@ -173,8 +191,10 @@ func defaultProviderDefinitions() []ProviderDefinition {
 				secretField("api_key", "API Key", "Bearer API key", true, 10),
 				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.openai.com/v1", 20),
 			}},
-			DefaultModel: "gpt-4o-mini-tts",
-			SupportsList: true,
+			DefaultModel:              "gpt-4o-mini-tts",
+			SupportsList:              true,
+			DefaultTranscriptionModel: "gpt-4o-mini-transcribe",
+			SupportsTranscriptionList: true,
 			Models: []ModelInfo{{
 				ID:          "gpt-4o-mini-tts",
 				Name:        "gpt-4o-mini-tts",
@@ -195,6 +215,23 @@ func defaultProviderDefinitions() []ProviderDefinition {
 					Formats: []string{"mp3", "opus", "pcm", "wav"},
 				},
 			}},
+			TranscriptionModels: []ModelInfo{{
+				ID:          "gpt-4o-mini-transcribe",
+				Name:        "gpt-4o-mini-transcribe",
+				Description: "Default OpenAI transcription model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("language", "Language", "Optional ISO language hint", false, "", 10),
+					stringField("prompt", "Prompt", "Optional prompt to guide transcription", false, "", 20),
+					numberField("temperature", "Temperature", "Sampling temperature", false, 0, 30),
+					enumField("response_format", "Response Format", "Transcription response format", false, []string{"json", "verbose_json", "text", "srt", "vtt"}, 40),
+				}},
+				Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("language", "Language", "Optional ISO language hint", false, "", 10),
+					stringField("prompt", "Prompt", "Optional prompt to guide transcription", false, "", 20),
+					numberField("temperature", "Temperature", "Sampling temperature", false, 0, 30),
+					enumField("response_format", "Response Format", "Transcription response format", false, []string{"json", "verbose_json", "text", "srt", "vtt"}, 40),
+				}}},
+			}},
 			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
 				opts := []openaispeech.Option{}
 				if v := configString(config, "api_key"); v != "" {
@@ -205,6 +242,16 @@ func defaultProviderDefinitions() []ProviderDefinition {
 				}
 				return openaispeech.New(opts...), nil
 			},
+			TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
+				opts := []openaitranscription.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, openaitranscription.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, openaitranscription.WithBaseURL(v))
+				}
+				return openaitranscription.New(opts...), nil
+			},
 			Order: 20,
 		},
 		{
@@ -216,8 +263,10 @@ func defaultProviderDefinitions() []ProviderDefinition {
 				secretField("api_key", "API Key", "OpenRouter API key", true, 10),
 				stringField("base_url", "Base URL", "Override the API base URL", false, "https://openrouter.ai/api/v1", 20),
 			}},
-			DefaultModel: "openrouter-tts",
-			SupportsList: true,
+			DefaultModel:              "openrouter-tts",
+			SupportsList:              true,
+			DefaultTranscriptionModel: "openai/gpt-4o-mini-transcribe",
+			SupportsTranscriptionList: true,
 			Models: []ModelInfo{{
 				ID:           "openrouter-tts",
 				Name:         "openrouter-tts",
@@ -234,6 +283,17 @@ func defaultProviderDefinitions() []ProviderDefinition {
 					numberField("speed", "Speed", "Speech rate", false, 1.0, 30),
 				}}},
 			}},
+			TranscriptionModels: []ModelInfo{{
+				ID:          "openai/gpt-4o-mini-transcribe",
+				Name:        "openai/gpt-4o-mini-transcribe",
+				Description: "Default OpenRouter transcription model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					advancedStringField("prompt", "Prompt", "Prompt passed to the model before audio input", false, "", 10),
+				}},
+				Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					advancedStringField("prompt", "Prompt", "Prompt passed to the model before audio input", false, "", 10),
+				}}},
+			}},
 			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
 				opts := []openrouterspeech.Option{}
 				if v := configString(config, "api_key"); v != "" {
@@ -244,6 +304,16 @@ func defaultProviderDefinitions() []ProviderDefinition {
 				}
 				return openrouterspeech.New(opts...), nil
 			},
+			TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
+				opts := []openroutertranscription.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, openroutertranscription.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, openroutertranscription.WithBaseURL(v))
+				}
+				return openroutertranscription.New(opts...), nil
+			},
 			Order: 30,
 		},
 		{
@@ -255,8 +325,10 @@ func defaultProviderDefinitions() []ProviderDefinition {
 				secretField("api_key", "API Key", "ElevenLabs API key", true, 10),
 				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.elevenlabs.io", 20),
 			}},
-			DefaultModel: "elevenlabs-tts",
-			SupportsList: true,
+			DefaultModel:              "elevenlabs-tts",
+			SupportsList:              true,
+			DefaultTranscriptionModel: "scribe_v2",
+			SupportsTranscriptionList: true,
 			Models: []ModelInfo{{
 				ID:           "elevenlabs-tts",
 				Name:         "elevenlabs-tts",
@@ -289,6 +361,25 @@ func defaultProviderDefinitions() []ProviderDefinition {
 					stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "en-US", 110),
 				}}},
 			}},
+			TranscriptionModels: []ModelInfo{{
+				ID:          "scribe_v2",
+				Name:        "scribe_v2",
+				Description: "Default ElevenLabs transcription model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "", 10),
+					boolField("tag_audio_events", "Tag Audio Events", "Include non-speech events in timestamps", false, 20),
+					boolField("diarize", "Diarize", "Enable speaker diarization", false, 30),
+					numberField("num_speakers", "Number of Speakers", "Optional expected speaker count", false, 0, 40),
+					enumField("timestamps_granularity", "Timestamps Granularity", "Timestamps granularity", false, []string{"word", "character"}, 50),
+				}},
+				Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("language_code", "Language Code", "Optional BCP-47 language code", false, "", 10),
+					boolField("tag_audio_events", "Tag Audio Events", "Include non-speech events in timestamps", false, 20),
+					boolField("diarize", "Diarize", "Enable speaker diarization", false, 30),
+					numberField("num_speakers", "Number of Speakers", "Optional expected speaker count", false, 0, 40),
+					enumField("timestamps_granularity", "Timestamps Granularity", "Timestamps granularity", false, []string{"word", "character"}, 50),
+				}}},
+			}},
 			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
 				opts := []elevenlabsspeech.Option{}
 				if v := configString(config, "api_key"); v != "" {
@@ -299,8 +390,52 @@ func defaultProviderDefinitions() []ProviderDefinition {
 				}
 				return elevenlabsspeech.New(opts...), nil
 			},
+			TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
+				opts := []elevenlabstranscription.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, elevenlabstranscription.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, elevenlabstranscription.WithBaseURL(v))
+				}
+				return elevenlabstranscription.New(opts...), nil
+			},
 			Order: 40,
 		},
+		{
+			ClientType:  models.ClientTypeGoogleSpeech,
+			DisplayName: "Google Speech",
+			Icon:        "google-color",
+			Description: "Google Gemini speech transcription",
+			ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+				secretField("api_key", "API Key", "Google API key", true, 10),
+				stringField("base_url", "Base URL", "Override the API base URL", false, "https://generativelanguage.googleapis.com/v1beta", 20),
+			}},
+			DefaultTranscriptionModel: "gemini-2.5-flash",
+			SupportsTranscriptionList: true,
+			TranscriptionModels: []ModelInfo{{
+				ID:          "gemini-2.5-flash",
+				Name:        "gemini-2.5-flash",
+				Description: "Default Google transcription model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					advancedStringField("prompt", "Prompt", "Prompt passed alongside audio", false, "", 10),
+				}},
+				Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					advancedStringField("prompt", "Prompt", "Prompt passed alongside audio", false, "", 10),
+				}}},
+			}},
+			TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
+				opts := []googletranscription.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, googletranscription.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, googletranscription.WithBaseURL(v))
+				}
+				return googletranscription.New(opts...), nil
+			},
+			Order: 45,
+		},
 		{
 			ClientType:  models.ClientTypeDeepgramSpeech,
 			DisplayName: "Deepgram Speech",
@@ -310,8 +445,10 @@ func defaultProviderDefinitions() []ProviderDefinition {
 				secretField("api_key", "API Key", "Deepgram API key", true, 10),
 				stringField("base_url", "Base URL", "Override the API base URL", false, "https://api.deepgram.com", 20),
 			}},
-			DefaultModel: "deepgram-tts",
-			SupportsList: false,
+			DefaultModel:              "deepgram-tts",
+			SupportsList:              false,
+			DefaultTranscriptionModel: "nova-3",
+			SupportsTranscriptionList: false,
 			Models: []ModelInfo{{
 				ID:          "deepgram-tts",
 				Name:        "deepgram-tts",
@@ -332,6 +469,25 @@ func defaultProviderDefinitions() []ProviderDefinition {
 					Formats: []string{"wav", "none"},
 				},
 			}},
+			TranscriptionModels: []ModelInfo{{
+				ID:          "nova-3",
+				Name:        "nova-3",
+				Description: "Default Deepgram transcription model",
+				ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("language", "Language", "Optional language hint", false, "", 10),
+					boolField("smart_format", "Smart Format", "Enable smart formatting", false, 20),
+					boolField("detect_language", "Detect Language", "Enable automatic language detection", false, 30),
+					boolField("diarize", "Diarize", "Enable speaker diarization", false, 40),
+					boolField("punctuate", "Punctuate", "Enable punctuation", false, 50),
+				}},
+				Capabilities: ModelCapabilities{ConfigSchema: ConfigSchema{Fields: []FieldSchema{
+					stringField("language", "Language", "Optional language hint", false, "", 10),
+					boolField("smart_format", "Smart Format", "Enable smart formatting", false, 20),
+					boolField("detect_language", "Detect Language", "Enable automatic language detection", false, 30),
+					boolField("diarize", "Diarize", "Enable speaker diarization", false, 40),
+					boolField("punctuate", "Punctuate", "Enable punctuation", false, 50),
+				}}},
+			}},
 			Factory: func(config map[string]any) (sdk.SpeechProvider, error) {
 				opts := []deepgramspeech.Option{}
 				if v := configString(config, "api_key"); v != "" {
@@ -342,6 +498,16 @@ func defaultProviderDefinitions() []ProviderDefinition {
 				}
 				return deepgramspeech.New(opts...), nil
 			},
+			TranscriptionFactory: func(config map[string]any) (sdk.TranscriptionProvider, error) {
+				opts := []deepgramtranscription.Option{}
+				if v := configString(config, "api_key"); v != "" {
+					opts = append(opts, deepgramtranscription.WithAPIKey(v))
+				}
+				if v := configString(config, "base_url"); v != "" {
+					opts = append(opts, deepgramtranscription.WithBaseURL(v))
+				}
+				return deepgramtranscription.New(opts...), nil
+			},
 			Order: 50,
 		},
 		{
diff --git a/internal/tts/service.go b/internal/tts/service.go
index 5c920bf0..eb4da940 100644
--- a/internal/tts/service.go
+++ b/internal/tts/service.go
@@ -7,6 +7,7 @@ import (
 	"io"
 	"log/slog"
 
+	"github.com/jackc/pgx/v5/pgtype"
 	sdk "github.com/memohai/twilight-ai/sdk"
 
 	"github.com/memohai/memoh/internal/db"
@@ -65,7 +66,7 @@ func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse,
 	}
 	items := make([]SpeechModelResponse, 0, len(rows))
 	for _, row := range rows {
-		if s.shouldHideModel(row.ProviderType, row.ModelID) {
+		if s.shouldHideModel(row.ProviderType, models.ModelTypeSpeech, row.ModelID) {
 			continue
 		}
 		items = append(items, toSpeechModelFromListRow(row))
@@ -73,6 +74,23 @@ func (s *Service) ListSpeechModels(ctx context.Context) ([]SpeechModelResponse,
 	return items, nil
 }
 
+// ListTranscriptionModels returns all transcription models, leaving out the
+// ones shouldHideModel reports as hidden for their provider type.
+func (s *Service) ListTranscriptionModels(ctx context.Context) ([]TranscriptionModelResponse, error) {
+	rows, err := s.queries.ListTranscriptionModels(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("list transcription models: %w", err)
+	}
+	items := make([]TranscriptionModelResponse, 0, len(rows))
+	for _, row := range rows {
+		if s.shouldHideModel(row.ProviderType, models.ModelTypeTranscription, row.ModelID) {
+			continue
+		}
+		items = append(items, toTranscriptionModelFromListRow(row))
+	}
+	return items, nil
+}
+
 func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID string) ([]SpeechModelResponse, error) {
 	pgID, err := db.ParseUUID(providerID)
 	if err != nil {
@@ -92,7 +108,7 @@ func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID str
 	}
 	items := make([]SpeechModelResponse, 0, len(rows))
 	for _, row := range rows {
-		if shouldHideTemplateModel(def, row.ModelID) {
+		if shouldHideTemplateModel(def, models.ModelTypeSpeech, row.ModelID) {
 			continue
 		}
 		items = append(items, toSpeechModelFromModel(row, ""))
@@ -100,6 +116,36 @@ func (s *Service) ListSpeechModelsByProvider(ctx context.Context, providerID str
 	return items, nil
 }
 
+// ListTranscriptionModelsByProvider returns the transcription models of the
+// provider with the given ID, leaving out the ones shouldHideTemplateModel
+// reports as hidden for the provider's registry definition.
+func (s *Service) ListTranscriptionModelsByProvider(ctx context.Context, providerID string) ([]TranscriptionModelResponse, error) {
+	pgID, err := db.ParseUUID(providerID)
+	if err != nil {
+		return nil, err
+	}
+	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
+	if err != nil {
+		return nil, fmt.Errorf("get speech provider: %w", err)
+	}
+	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
+	if err != nil {
+		return nil, err
+	}
+	rows, err := s.queries.ListTranscriptionModelsByProviderID(ctx, pgID)
+	if err != nil {
+		return nil, fmt.Errorf("list transcription models by provider: %w", err)
+	}
+	items := make([]TranscriptionModelResponse, 0, len(rows))
+	for _, row := range rows {
+		if shouldHideTemplateModel(def, models.ModelTypeTranscription, row.ModelID) {
+			continue
+		}
+		items = append(items, toTranscriptionModelFromModel(row, ""))
+	}
+	return items, nil
+}
+
 func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelResponse, error) {
 	pgID, err := db.ParseUUID(id)
 	if err != nil {
@@ -112,6 +155,98 @@ func (s *Service) GetSpeechModel(ctx context.Context, id string) (SpeechModelRes
 	return toSpeechModelWithProviderResponse(row), nil
 }
 
+// GetTranscriptionModel returns the transcription model with the given ID,
+// including its provider type.
+func (s *Service) GetTranscriptionModel(ctx context.Context, id string) (TranscriptionModelResponse, error) {
+	pgID, err := db.ParseUUID(id)
+	if err != nil {
+		return TranscriptionModelResponse{}, err
+	}
+	row, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
+	if err != nil {
+		return TranscriptionModelResponse{}, fmt.Errorf("get transcription model: %w", err)
+	}
+	return toTranscriptionModelWithProviderResponse(row), nil
+}
+
+// UpdateSpeechModel updates the name and config of the speech model with the
+// given ID; its model ID and provider are written back unchanged.
+//
+// A nil req.Name keeps the stored name. A nil req.Config keeps the stored
+// config instead of replacing it with JSON null.
+func (s *Service) UpdateSpeechModel(ctx context.Context, id string, req UpdateSpeechModelRequest) (SpeechModelResponse, error) {
+	pgID, err := db.ParseUUID(id)
+	if err != nil {
+		return SpeechModelResponse{}, err
+	}
+	row, err := s.queries.GetSpeechModelWithProvider(ctx, pgID)
+	if err != nil {
+		return SpeechModelResponse{}, fmt.Errorf("get speech model: %w", err)
+	}
+	configJSON := row.Config
+	if req.Config != nil {
+		configJSON, err = json.Marshal(req.Config)
+		if err != nil {
+			return SpeechModelResponse{}, fmt.Errorf("marshal speech config: %w", err)
+		}
+	}
+	name := row.Name
+	if req.Name != nil {
+		name = pgtype.Text{String: *req.Name, Valid: *req.Name != ""}
+	}
+	updated, err := s.queries.UpdateModel(ctx, sqlc.UpdateModelParams{
+		ID:         pgID,
+		ModelID:    row.ModelID,
+		Name:       name,
+		ProviderID: row.ProviderID,
+		Type:       string(models.ModelTypeSpeech),
+		Config:     configJSON,
+	})
+	if err != nil {
+		return SpeechModelResponse{}, fmt.Errorf("update speech model: %w", err)
+	}
+	return toSpeechModelFromModel(updated, row.ProviderType), nil
+}
+
+// UpdateTranscriptionModel updates the name and config of the transcription
+// model with the given ID; its model ID and provider are written back unchanged.
+//
+// A nil req.Name keeps the stored name. A nil req.Config keeps the stored
+// config instead of replacing it with JSON null.
+func (s *Service) UpdateTranscriptionModel(ctx context.Context, id string, req UpdateSpeechModelRequest) (TranscriptionModelResponse, error) {
+	pgID, err := db.ParseUUID(id)
+	if err != nil {
+		return TranscriptionModelResponse{}, err
+	}
+	row, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
+	if err != nil {
+		return TranscriptionModelResponse{}, fmt.Errorf("get transcription model: %w", err)
+	}
+	configJSON := row.Config
+	if req.Config != nil {
+		configJSON, err = json.Marshal(req.Config)
+		if err != nil {
+			return TranscriptionModelResponse{}, fmt.Errorf("marshal transcription config: %w", err)
+		}
+	}
+	name := row.Name
+	if req.Name != nil {
+		name = pgtype.Text{String: *req.Name, Valid: *req.Name != ""}
+	}
+	updated, err := s.queries.UpdateModel(ctx, sqlc.UpdateModelParams{
+		ID:         pgID,
+		ModelID:    row.ModelID,
+		Name:       name,
+		ProviderID: row.ProviderID,
+		Type:       string(models.ModelTypeTranscription),
+		Config:     configJSON,
+	})
+	if err != nil {
+		return TranscriptionModelResponse{}, fmt.Errorf("update transcription model: %w", err)
+	}
+	return toTranscriptionModelFromModel(updated, row.ProviderType), nil
+}
+
 func (s *Service) Synthesize(ctx context.Context, modelID string, text string, overrideCfg map[string]any) ([]byte, string, error) {
 	params, err := s.resolveSpeechParams(ctx, modelID, text, overrideCfg)
 	if err != nil {
@@ -164,7 +281,7 @@ func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*Mo
 	if err != nil {
 		return nil, err
 	}
-	template := findModelTemplate(def, modelRow.ModelID)
+	template := findModelTemplate(def.Models, def.DefaultModel, modelRow.ModelID)
 	if template == nil {
 		return nil, fmt.Errorf("speech model capabilities not found: %s", modelRow.ModelID)
 	}
@@ -175,6 +292,40 @@ func (s *Service) GetModelCapabilities(ctx context.Context, modelID string) (*Mo
 	return &caps, nil
 }
 
+// GetSpeechModelCapabilities returns the result of GetModelCapabilities for
+// modelID; it adds no behavior of its own.
+func (s *Service) GetSpeechModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
+	return s.GetModelCapabilities(ctx, modelID)
+}
+
+// GetTranscriptionModelCapabilities returns the capabilities of the registry
+// template that findModelTemplate selects for the stored transcription model.
+// When those capabilities have no config schema fields, the template's own
+// ConfigSchema is used instead.
+func (s *Service) GetTranscriptionModelCapabilities(ctx context.Context, modelID string) (*ModelCapabilities, error) {
+	pgID, err := db.ParseUUID(modelID)
+	if err != nil {
+		return nil, err
+	}
+	modelRow, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
+	if err != nil {
+		return nil, fmt.Errorf("get transcription model: %w", err)
+	}
+	def, err := s.registry.Get(models.ClientType(modelRow.ProviderType))
+	if err != nil {
+		return nil, err
+	}
+	template := findModelTemplate(def.TranscriptionModels, def.DefaultTranscriptionModel, modelRow.ModelID)
+	if template == nil {
+		return nil, fmt.Errorf("transcription model capabilities not found: %s", modelRow.ModelID)
+	}
+	caps := template.Capabilities
+	if len(caps.ConfigSchema.Fields) == 0 {
+		caps.ConfigSchema = template.ConfigSchema
+	}
+	return &caps, nil
+}
+
 func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
 	pgID, err := db.ParseUUID(providerID)
 	if err != nil {
@@ -214,11 +359,80 @@ func (s *Service) FetchRemoteModels(ctx context.Context, providerID string) ([]M
 	return discovered, nil
 }
 
+// FetchRemoteTranscriptionModels calls ListModels on the provider's
+// transcription client and maps each returned ID through mergeRemoteModelInfo
+// with the registry's transcription templates. Nil entries and empty IDs are
+// skipped; providers without transcription listing support yield an error.
+func (s *Service) FetchRemoteTranscriptionModels(ctx context.Context, providerID string) ([]ModelInfo, error) {
+	pgID, err := db.ParseUUID(providerID)
+	if err != nil {
+		return nil, err
+	}
+
+	providerRow, err := s.queries.GetProviderByID(ctx, pgID)
+	if err != nil {
+		return nil, fmt.Errorf("get speech provider: %w", err)
+	}
+
+	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
+	if err != nil {
+		return nil, err
+	}
+	if !def.SupportsTranscriptionList || def.TranscriptionFactory == nil {
+		return nil, fmt.Errorf("speech provider does not support transcription model discovery: %s", providerRow.ClientType)
+	}
+
+	provider, err := def.TranscriptionFactory(parseConfig(providerRow.Config))
+	if err != nil {
+		return nil, fmt.Errorf("build transcription provider: %w", err)
+	}
+
+	remoteModels, err := provider.ListModels(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("list transcription models: %w", err)
+	}
+
+	discovered := make([]ModelInfo, 0, len(remoteModels))
+	for _, remoteModel := range remoteModels {
+		if remoteModel == nil || remoteModel.ID == "" {
+			continue
+		}
+		discovered = append(discovered, mergeRemoteModelInfo(remoteModel.ID, def.TranscriptionModels))
+	}
+	return discovered, nil
+}
+
+// Transcribe transcribes audio with the transcription model identified by
+// modelID. The call config comes from resolveTranscriptionParams, which
+// receives overrideCfg.
+func (s *Service) Transcribe(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (*sdk.TranscriptionResult, error) {
+	params, err := s.resolveTranscriptionParams(ctx, modelID, audio, filename, contentType, overrideCfg)
+	if err != nil {
+		return nil, err
+	}
+	result, err := sdk.Transcribe(ctx,
+		sdk.WithTranscriptionModel(params.model),
+		sdk.WithAudio(audio, filename, contentType),
+		sdk.WithTranscriptionConfig(params.config),
+	)
+	if err != nil {
+		return nil, fmt.Errorf("transcribe: %w", err)
+	}
+	return result, nil
+}
+
 type resolvedSpeechParams struct {
 	model  *sdk.SpeechModel
 	config map[string]any
 }
 
+// resolvedTranscriptionParams holds the SDK transcription model and the
+// config map that Transcribe passes to sdk.Transcribe.
+type resolvedTranscriptionParams struct {
+	model  *sdk.TranscriptionModel
+	config map[string]any
+}
+
 func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text string, overrideCfg map[string]any) (*resolvedSpeechParams, error) {
 	_ = text
 	pgID, err := db.ParseUUID(modelID)
@@ -251,6 +456,50 @@ func (s *Service) resolveSpeechParams(ctx context.Context, modelID string, text
 	}, nil
 }
 
+// resolveTranscriptionParams loads the transcription model and its provider,
+// builds the provider's transcription client and combines the provider
+// config, model config and overrideCfg via mergeConfig into the call config.
+//
+// audio, filename and contentType are accepted but not used here. An error is
+// returned when the provider's registry definition has no TranscriptionFactory,
+// rather than panicking on a nil function call.
+func (s *Service) resolveTranscriptionParams(ctx context.Context, modelID string, audio []byte, filename string, contentType string, overrideCfg map[string]any) (*resolvedTranscriptionParams, error) {
+	_ = audio
+	_ = filename
+	_ = contentType
+	pgID, err := db.ParseUUID(modelID)
+	if err != nil {
+		return nil, err
+	}
+
+	modelRow, err := s.queries.GetTranscriptionModelWithProvider(ctx, pgID)
+	if err != nil {
+		return nil, fmt.Errorf("get transcription model: %w", err)
+	}
+	providerRow, err := s.queries.GetProviderByID(ctx, modelRow.ProviderID)
+	if err != nil {
+		return nil, fmt.Errorf("get speech provider: %w", err)
+	}
+
+	def, err := s.registry.Get(models.ClientType(providerRow.ClientType))
+	if err != nil {
+		return nil, err
+	}
+	if def.TranscriptionFactory == nil {
+		return nil, fmt.Errorf("speech provider does not support transcription: %s", providerRow.ClientType)
+	}
+	provider, err := def.TranscriptionFactory(parseConfig(providerRow.Config))
+	if err != nil {
+		return nil, fmt.Errorf("build transcription provider: %w", err)
+	}
+
+	cfg := mergeConfig(parseConfig(providerRow.Config), parseConfig(modelRow.Config), overrideCfg)
+	return &resolvedTranscriptionParams{
+		model:  &sdk.TranscriptionModel{ID: modelRow.ModelID, Provider: provider},
+		config: cfg,
+	}, nil
+}
+
 func parseConfig(raw []byte) map[string]any {
 	if len(raw) == 0 {
 		return map[string]any{}
@@ -284,41 +523,63 @@ func mergeRemoteModelInfo(modelID string, defaults []ModelInfo) ModelInfo {
 	}
 }
 
-func (s *Service) shouldHideModel(clientType string, modelID string) bool {
+// shouldHideModel reports whether shouldHideTemplateModel hides modelID for
+// the registry definition of clientType. It returns false when the registry
+// lookup for clientType fails.
+func (s *Service) shouldHideModel(clientType string, modelType models.ModelType, modelID string) bool {
 	def, err := s.registry.Get(models.ClientType(clientType))
 	if err != nil {
 		return false
 	}
-	return shouldHideTemplateModel(def, modelID)
+	return shouldHideTemplateModel(def, modelType, modelID)
 }
 
-func shouldHideTemplateModel(def ProviderDefinition, modelID string) bool {
-	if !def.SupportsList {
-		return false
-	}
-	for _, model := range def.Models {
-		if model.ID == modelID {
-			return model.TemplateOnly
+// shouldHideTemplateModel reports whether modelID is marked TemplateOnly in
+// def's speech or transcription model list, chosen by modelType. It returns
+// false when def does not support listing models of that type, when modelID
+// is not in the list, or when modelType is neither speech nor transcription.
+func shouldHideTemplateModel(def ProviderDefinition, modelType models.ModelType, modelID string) bool {
+	switch modelType {
+	case models.ModelTypeSpeech:
+		if !def.SupportsList {
+			return false
+		}
+		for _, model := range def.Models {
+			if model.ID == modelID {
+				return model.TemplateOnly
+			}
+		}
+	case models.ModelTypeTranscription:
+		if !def.SupportsTranscriptionList {
+			return false
+		}
+		for _, model := range def.TranscriptionModels {
+			if model.ID == modelID {
+				return model.TemplateOnly
+			}
 		}
 	}
 	return false
 }
 
-func findModelTemplate(def ProviderDefinition, modelID string) *ModelInfo {
-	for i := range def.Models {
-		if def.Models[i].ID == modelID {
-			return &def.Models[i]
+// findModelTemplate returns a pointer to the entry of modelsList whose ID is
+// modelID, falling back to the entry whose ID is defaultModel and then to the
+// first entry. It returns nil when modelsList is empty.
+func findModelTemplate(modelsList []ModelInfo, defaultModel string, modelID string) *ModelInfo {
+	for i := range modelsList {
+		if modelsList[i].ID == modelID {
+			return &modelsList[i]
 		}
 	}
-	if def.DefaultModel != "" {
-		for i := range def.Models {
-			if def.Models[i].ID == def.DefaultModel {
-				return &def.Models[i]
+	if defaultModel != "" {
+		for i := range modelsList {
+			if modelsList[i].ID == defaultModel {
+				return &modelsList[i]
 			}
 		}
 	}
-	if len(def.Models) > 0 {
-		return &def.Models[0]
+	if len(modelsList) > 0 {
+		return &modelsList[0]
 	}
 	return nil
 }
@@ -433,3 +684,72 @@ func toSpeechModelWithProviderResponse(row sqlc.GetSpeechModelWithProviderRow) S
 		UpdatedAt:    row.UpdatedAt.Time,
 	}
 }
+
+// toTranscriptionModelFromListRow converts a ListTranscriptionModels row into
+// its API response; errors decoding the stored config JSON are ignored.
+func toTranscriptionModelFromListRow(row sqlc.ListTranscriptionModelsRow) TranscriptionModelResponse {
+	var cfg map[string]any
+	if len(row.Config) > 0 {
+		_ = json.Unmarshal(row.Config, &cfg)
+	}
+	name := ""
+	if row.Name.Valid {
+		name = row.Name.String
+	}
+	return TranscriptionModelResponse{
+		ID:           row.ID.String(),
+		ModelID:      row.ModelID,
+		Name:         name,
+		ProviderID:   row.ProviderID.String(),
+		ProviderType: row.ProviderType,
+		Config:       cfg,
+		CreatedAt:    row.CreatedAt.Time,
+		UpdatedAt:    row.UpdatedAt.Time,
+	}
+}
+
+// toTranscriptionModelFromModel converts a model row into its API response,
+// using providerType as given; errors decoding the config JSON are ignored.
+func toTranscriptionModelFromModel(row sqlc.Model, providerType string) TranscriptionModelResponse {
+	var cfg map[string]any
+	if len(row.Config) > 0 {
+		_ = json.Unmarshal(row.Config, &cfg)
+	}
+	name := ""
+	if row.Name.Valid {
+		name = row.Name.String
+	}
+	return TranscriptionModelResponse{
+		ID:           row.ID.String(),
+		ModelID:      row.ModelID,
+		Name:         name,
+		ProviderID:   row.ProviderID.String(),
+		ProviderType: providerType,
+		Config:       cfg,
+		CreatedAt:    row.CreatedAt.Time,
+		UpdatedAt:    row.UpdatedAt.Time,
+	}
+}
+
+// toTranscriptionModelWithProviderResponse converts a model-with-provider row
+// into its API response; errors decoding the config JSON are ignored.
+func toTranscriptionModelWithProviderResponse(row sqlc.GetTranscriptionModelWithProviderRow) TranscriptionModelResponse {
+	var cfg map[string]any
+	if len(row.Config) > 0 {
+		_ = json.Unmarshal(row.Config, &cfg)
+	}
+	name := ""
+	if row.Name.Valid {
+		name = row.Name.String
+	}
+	return TranscriptionModelResponse{
+		ID:           row.ID.String(),
+		ModelID:      row.ModelID,
+		Name:         name,
+		ProviderID:   row.ProviderID.String(),
+		ProviderType: row.ProviderType,
+		Config:       cfg,
+		CreatedAt:    row.CreatedAt.Time,
+		UpdatedAt:    row.UpdatedAt.Time,
+	}
+}
diff --git a/internal/tts/types.go b/internal/tts/types.go
index 5e122bb8..c2129905 100644
--- a/internal/tts/types.go
+++ b/internal/tts/types.go
@@ -4,12 +4,21 @@ import "time"
 
 // ProviderMetaResponse exposes adapter metadata (from the registry, not DB).
+//
+// DefaultModel and Models are filled with the same values as
+// DefaultSynthesisModel and SynthesisModels by Registry.ListMeta.
 type ProviderMetaResponse struct {
-	Provider     string       `json:"provider"`
-	DisplayName  string       `json:"display_name"`
-	Description  string       `json:"description"`
-	ConfigSchema ConfigSchema `json:"config_schema,omitempty"`
-	DefaultModel string       `json:"default_model"`
-	Models       []ModelInfo  `json:"models"`
+	Provider                  string       `json:"provider"`
+	DisplayName               string       `json:"display_name"`
+	Description               string       `json:"description"`
+	ConfigSchema              ConfigSchema `json:"config_schema,omitempty"`
+	DefaultModel              string       `json:"default_model,omitempty"`
+	Models                    []ModelInfo  `json:"models,omitempty"`
+	DefaultSynthesisModel     string       `json:"default_synthesis_model,omitempty"`
+	SynthesisModels           []ModelInfo  `json:"synthesis_models,omitempty"`
+	SupportsSynthesisList     bool         `json:"supports_synthesis_list,omitempty"`
+	DefaultTranscriptionModel string       `json:"default_transcription_model,omitempty"`
+	TranscriptionModels       []ModelInfo  `json:"transcription_models,omitempty"`
+	SupportsTranscriptionList bool         `json:"supports_transcription_list,omitempty"`
 }
 
 // SpeechProviderResponse represents a speech-capable provider from the unified providers table.
@@ -36,6 +42,18 @@ type SpeechModelResponse struct {
 	UpdatedAt    time.Time      `json:"updated_at"`
 }
 
+// TranscriptionModelResponse is the JSON representation of a transcription model from the unified models table.
+type TranscriptionModelResponse struct {
+	ID           string         `json:"id"`
+	ModelID      string         `json:"model_id"`
+	Name         string         `json:"name"` // Left as "" by the row-to-response mapper when the row's name is not valid (NULL).
+	ProviderID   string         `json:"provider_id"`
+	ProviderType string         `json:"provider_type,omitempty"`
+	Config       map[string]any `json:"config,omitempty"` // Omitted from the JSON when the map is nil or empty.
+	CreatedAt    time.Time      `json:"created_at"`
+	UpdatedAt    time.Time      `json:"updated_at"`
+}
+
 // UpdateSpeechProviderRequest is used for updating a speech provider.
 type UpdateSpeechProviderRequest struct {
 	Name   *string `json:"name,omitempty"`
@@ -54,6 +72,28 @@ type TestSynthesizeRequest struct {
 	Config map[string]any `json:"config,omitempty"`
 }
 
+// TestTranscriptionRequest represents the JSON portion of an audio-to-text test request.
+type TestTranscriptionRequest struct {
+	Config map[string]any `json:"config,omitempty"` // NOTE(review): only config is carried here; the audio input presumably arrives separately. Confirm against the handler.
+}
+
+// TestTranscriptionResponse represents the result of a transcription test; only "text" is always serialized, every other key is omitted when its value is empty or zero.
+type TestTranscriptionResponse struct {
+	Text            string              `json:"text"`
+	Language        string              `json:"language,omitempty"`
+	DurationSeconds float64             `json:"duration_seconds,omitempty"`
+	Words           []TranscriptionWord `json:"words,omitempty"` // Per-word alignments; absent from the JSON when the slice is empty.
+	Metadata        map[string]any      `json:"metadata,omitempty"`
+}
+
+// TranscriptionWord represents a single word alignment from a transcription result; only "text" is always serialized.
+type TranscriptionWord struct {
+	Text      string  `json:"text"`
+	Start     float64 `json:"start,omitempty"` // NOTE(review): omitempty drops a 0 value, so a word starting at offset 0 serializes without "start"; clients must treat a missing key as 0, or this should become *float64.
+	End       float64 `json:"end,omitempty"`
+	SpeakerID string  `json:"speaker_id,omitempty"`
+}
+
 // ImportModelsResponse represents the response for importing speech models.
 type ImportModelsResponse struct {
 	Created int      `json:"created"`
diff --git a/internal/workspace/image_preference.go b/internal/workspace/image_preference.go
index 6abc7e05..c003baf0 100644
--- a/internal/workspace/image_preference.go
+++ b/internal/workspace/image_preference.go
@@ -175,6 +175,7 @@ func withWorkspaceGPUPreference(metadata map[string]any, gpu WorkspaceGPUConfig)
 	return next
 }
 
+//nolint:unused // Kept for tests and upcoming metadata plumbing.
 func withWorkspaceSkillDiscoveryRoots(metadata map[string]any, roots []string) map[string]any {
 	next := cloneAnyMap(metadata)
 	section := workspaceSection(next)
@@ -199,6 +200,7 @@ func withoutWorkspaceGPUPreference(metadata map[string]any) map[string]any {
 	return next
 }
 
+//nolint:unused // Kept for tests and upcoming metadata plumbing.
 func withoutWorkspaceSkillDiscoveryRoots(metadata map[string]any) map[string]any {
 	next := cloneAnyMap(metadata)
 	section := workspaceSection(next)
diff --git a/packages/sdk/src/types.gen.ts b/packages/sdk/src/types.gen.ts
index 2b61fba9..ab32fbf7 100644
--- a/packages/sdk/src/types.gen.ts
+++ b/packages/sdk/src/types.gen.ts
@@ -1615,6 +1615,7 @@ export type SettingsSettings = {
     search_provider_id?: string;
     timezone?: string;
     title_model_id?: string;
+    transcription_model_id?: string;
     tts_model_id?: string;
 };
 
@@ -1639,6 +1640,7 @@ export type SettingsUpsertRequest = {
     search_provider_id?: string;
     timezone?: string;
     title_model_id?: string;
+    transcription_model_id?: string;
     tts_model_id?: string;
 };