tts/main.go
2024-08-03 22:13:29 -06:00

179 lines
4.6 KiB
Go

package main
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"fmt"
"io"
"log"
"os"
"path/filepath"
"strings"
texttospeech "cloud.google.com/go/texttospeech/apiv1"
"cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
)
// Job describes a single text-to-speech request as decoded from a JSON job
// file.
type Job struct {
	// Name is the prefix of the MP3 file name generated for this job.
	Name string `json:"name"`
	// Voice is sent to the TTS API as the voice name.
	Voice string `json:"voice"`
	// Language is sent to the TTS API as the language code.
	Language string `json:"language"`
	// Message is the text to be spoken; its SHA-256 digest is also part of
	// the generated file name.
	Message string `json:"message"`
}
// loadJobs walks directory recursively and decodes every non-directory entry
// whose path ends in ".json" into a Job.
//
// The jobs are returned in the order filepath.Walk visits the files. The walk
// stops at the first failure (an entry that cannot be accessed, opened, read
// or decoded) and that error is returned wrapped; the slice is nil in that
// case.
func loadJobs(directory string) ([]Job, error) {
	jobs := make([]Job, 0)
	err := filepath.Walk(directory, func(path string, info os.FileInfo, walkErr error) error {
		// Walk reports stat/readdir failures through walkErr, and info may be
		// nil then (e.g. when directory itself does not exist), so the error
		// has to be checked before info is used.
		if walkErr != nil {
			return fmt.Errorf("failed to access path '%s': %w", path, walkErr)
		}
		if info.IsDir() || !strings.HasSuffix(path, ".json") {
			return nil
		}
		job, err := readJob(path)
		if err != nil {
			return err
		}
		jobs = append(jobs, job)
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("failed to walk directory '%s' for json files: %w", directory, err)
	}
	return jobs, nil
}

// readJob reads the file at path and decodes its JSON content into a Job.
func readJob(path string) (Job, error) {
	f, err := os.Open(path)
	if err != nil {
		return Job{}, fmt.Errorf("failed to open file '%s': %w", path, err)
	}
	// Deferred only after a successful open, so Close is never called on a
	// nil file.
	defer f.Close()

	b, err := io.ReadAll(f)
	if err != nil {
		return Job{}, fmt.Errorf("failed to read file '%s': %w", path, err)
	}

	var job Job
	if err := json.Unmarshal(b, &job); err != nil {
		return Job{}, fmt.Errorf("failed to unmarshal file '%s' into Job struct: %w", path, err)
	}
	return job, nil
}
// main is the program entry point.
//
// When the first command-line argument is "list" it prints the available
// voices and exits. Otherwise it loads all jobs from the "jobs" directory
// and, for every job whose MP3 file does not exist yet, synthesizes the
// message and writes the audio to "<name>-<sha256 of message>.mp3" in the
// working directory. A failure on one job is logged and the remaining jobs
// are still processed.
func main() {
	if len(os.Args) > 1 && os.Args[1] == "list" {
		if err := ListVoices(); err != nil {
			log.Fatalf("failed to list voices: %v", err)
		}
		return
	}

	jobs, err := loadJobs("jobs")
	if err != nil {
		log.Fatalf("failed to load jobs: %v", err)
	}

	for _, job := range jobs {
		// The file name carries the hex-encoded SHA-256 digest of the spoken
		// text, so a job with an unchanged message maps to the same file and
		// is not synthesized again.
		sum := sha256.Sum256([]byte(job.Message))
		filename := fmt.Sprintf("%s-%s.mp3", job.Name, hex.EncodeToString(sum[:]))

		_, err := os.Stat(filename)
		if err == nil {
			// Audio for this message already exists.
			continue
		}
		if !errors.Is(err, os.ErrNotExist) {
			// Any other stat failure (e.g. permissions) means the job cannot
			// be checked; report it instead of skipping it silently.
			log.Printf("failed to stat mp3 file '%s': %v", filename, err)
			continue
		}

		mp3, err := getTTS(job.Voice, job.Message, job.Language)
		if err != nil {
			log.Printf("failed to get TTS: %v", err)
			continue
		}
		if err := os.WriteFile(filename, mp3, 0644); err != nil {
			log.Printf("failed to write mp3 file: %v", err)
			continue
		}
		log.Printf("Audio content written to file: %v\n", filename)
	}
}
// getTTS synthesizes message as MP3 audio through the Cloud Text-to-Speech
// client.
//
// voice is sent as the voice name and language as the language code of the
// request. The audio is requested as MP3 with the "headphone-class-device"
// effects profile and a speaking rate of 0.8. A new client is created for
// each call and closed before returning.
//
// It returns the audio content of the response, or a wrapped error if the
// client cannot be created or the synthesis request fails.
func getTTS(voice, message, language string) ([]byte, error) {
	ctx := context.Background()
	client, err := texttospeech.NewClient(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to create TTS client: %w", err)
	}
	defer client.Close()

	req := texttospeechpb.SynthesizeSpeechRequest{
		// The plain text to be synthesized.
		Input: &texttospeechpb.SynthesisInput{
			InputSource: &texttospeechpb.SynthesisInput_Text{Text: message},
		},
		// Voice selection comes entirely from the caller. Example voice
		// names: "en-US-Journey-F", "tr-TR-Wavenet-E".
		Voice: &texttospeechpb.VoiceSelectionParams{
			LanguageCode: language,
			Name:         voice,
		},
		// Format and post-processing of the returned audio.
		AudioConfig: &texttospeechpb.AudioConfig{
			AudioEncoding:    texttospeechpb.AudioEncoding_MP3,
			EffectsProfileId: []string{"headphone-class-device"},
			SpeakingRate:     0.8,
		},
	}

	resp, err := client.SynthesizeSpeech(ctx, &req)
	if err != nil {
		return nil, fmt.Errorf("failed to synthesize speech: %w", err)
	}
	return resp.AudioContent, nil
}
// ListVoices prints every voice returned by the text-to-speech service to
// standard output: its name, each supported language code, its SSML gender
// and its natural sample rate in hertz.
//
// It returns the error from creating the client or from the list request,
// unchanged, and nil once all voices have been printed.
func ListVoices() error {
	ctx := context.Background()

	client, err := texttospeech.NewClient(ctx)
	if err != nil {
		return err
	}
	defer client.Close()

	// An empty request asks for the full voice list.
	listing, err := client.ListVoices(ctx, &texttospeechpb.ListVoicesRequest{})
	if err != nil {
		return err
	}

	for _, v := range listing.Voices {
		fmt.Printf("Name: %v\n", v.Name)
		for _, code := range v.LanguageCodes {
			fmt.Printf(" Supported language: %v\n", code)
		}
		fmt.Printf(" SSML Voice Gender: %v\n", v.SsmlGender.String())
		fmt.Printf(" Natural Sample Rate Hertz: %v\n", v.NaturalSampleRateHertz)
	}
	return nil
}