Skip to content

Speech-to-Text (Transcription)

Basic Transcription

import (
    "context"
    "fmt"
    "log"
    "os"

    "github.com/joakimcarlsson/ai/model"
    "github.com/joakimcarlsson/ai/transcription"
)

// Build a speech-to-text client for the OpenAI provider using the Whisper-1
// transcription model.
client, err := transcription.NewSpeechToText(
    model.ProviderOpenAI,
    transcription.WithAPIKey("your-api-key"),
    transcription.WithModel(model.OpenAITranscriptionModels[model.Whisper1]),
)
if err != nil {
    log.Fatal(err)
}

// Read the whole audio file into memory; Transcribe is handed the raw bytes.
audioData, err := os.ReadFile("audio.mp3")
if err != nil {
    log.Fatal(err)
}

// ctx is declared once here because the later examples on this page pass
// a variable named ctx to Transcribe and Translate.
ctx := context.Background()

response, err := client.Transcribe(ctx, audioData)
if err != nil {
    log.Fatal(err)
}

// Print the transcript text from the response.
fmt.Println(response.Text)

Transcription with Options

// Per-call options follow the audio data. The OpenAI API only honours
// timestamp granularities when the response format is "verbose_json",
// so the two options are set together.
response, err := client.Transcribe(ctx, audioData,
    transcription.WithLanguage("en"),
    transcription.WithResponseFormat("verbose_json"),
    transcription.WithTimestampGranularities("word", "segment"),
    transcription.WithTemperature(0.2),
)
// Check the error before touching response; it is not meaningful on failure.
if err != nil {
    log.Fatal(err)
}

// Print each segment with its start and end values formatted as seconds.
for _, segment := range response.Segments {
    fmt.Printf("[%.2fs - %.2fs] %s\n", segment.Start, segment.End, segment.Text)
}

// Print each word followed by its start value formatted as seconds.
for _, word := range response.Words {
    fmt.Printf("%s (%.2fs) ", word.Word, word.Start)
}

Translation (to English)

// Translate the audio to English. The prompt is an optional hint passed
// along with the request.
response, err := client.Translate(ctx, audioData,
    transcription.WithPrompt("Translate this Swedish audio to English"),
)
// Check the error before reading response.Text.
if err != nil {
    log.Fatal(err)
}

fmt.Println(response.Text)

Client Options

// Client-level options are supplied once at construction time.
// time.Second comes from the standard library "time" package.
client, err := transcription.NewSpeechToText(
    model.ProviderOpenAI,
    transcription.WithAPIKey("your-key"),
    transcription.WithModel(model.OpenAITranscriptionModels[model.GPT4oTranscribe]),
    // NOTE(review): presumably the timeout applied to each request — confirm.
    transcription.WithTimeout(30*time.Second),
)
// Do not use client if construction failed.
if err != nil {
    log.Fatal(err)
}