// tts/main.go

package main

import (
	"context"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"os"
	"path/filepath"
	"strings"

	texttospeech "cloud.google.com/go/texttospeech/apiv1"
	"cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
)

// Job describes one text-to-speech task. Each job is decoded from a single
// JSON file whose keys are given by the struct tags below.
type Job struct {
	// Name is used as the prefix of the generated MP3 file name.
	Name     string `json:"name"`
	// Voice is passed to the synthesis request as the voice name.
	Voice    string `json:"voice"`
	// Language is passed to the synthesis request as the language code.
	Language string `json:"language"`
	// Message is the text to be spoken; its SHA-256 sum also forms part of
	// the generated MP3 file name.
	Message  string `json:"message"`
}

// loadJobs walks directory recursively and decodes every non-directory entry
// whose path ends in ".json" into a Job.
//
// Jobs are returned in the order filepath.Walk visits the files. When no
// matching file is found the result is an empty, non-nil slice.
//
// The first problem encountered aborts the walk: an error reported by
// filepath.Walk itself (for example a missing or unreadable directory), or a
// failure to open, read or decode a file. The error is returned wrapped with
// %w so callers can inspect it with errors.Is / errors.As, and the returned
// slice is nil.
func loadJobs(directory string) ([]Job, error) {
	jobs := make([]Job, 0)
	err := filepath.Walk(directory, func(path string, info os.FileInfo, err error) error {
		// filepath.Walk reports stat/readdir failures through err, and info
		// may be nil in that case, so it must be checked before info is used.
		if err != nil {
			return fmt.Errorf("failed to access '%s': %w", path, err)
		}
		if info.IsDir() || !strings.HasSuffix(path, ".json") {
			return nil
		}

		job, err := loadJob(path)
		if err != nil {
			return err
		}
		jobs = append(jobs, job)
		return nil
	})
	if err != nil {
		return nil, fmt.Errorf("failed to walk directory '%s' for json files: %w", directory, err)
	}
	return jobs, nil
}

// loadJob reads the file at path and decodes its JSON content into a Job.
func loadJob(path string) (Job, error) {
	f, err := os.Open(path)
	if err != nil {
		return Job{}, fmt.Errorf("failed to open file '%s': %w", path, err)
	}
	// Registered only after a successful open so Close is never called on a
	// nil file.
	defer f.Close()

	b, err := io.ReadAll(f)
	if err != nil {
		return Job{}, fmt.Errorf("failed to read file '%s': %w", path, err)
	}

	var job Job
	if err := json.Unmarshal(b, &job); err != nil {
		return Job{}, fmt.Errorf("failed to unmarshal file '%s' into Job struct: %w", path, err)
	}
	return job, nil
}

// main is the program entry point.
//
// When the first command-line argument is "list" it prints the available
// voices via ListVoices and exits. Otherwise it loads all jobs from the "jobs"
// directory and, for every job whose output file does not exist yet,
// synthesizes the message with getTTS and writes the resulting MP3 into the
// current working directory.
//
// Output files are named "<job name>-<hex SHA-256 of the message>.mp3". Only
// the job name and message take part in the file name, so changing a job's
// voice or language alone does not trigger a new synthesis.
//
// A failure to list voices or to load the jobs terminates the program; a
// failure while processing a single job is logged and the job is skipped.
func main() {
	if len(os.Args) > 1 && os.Args[1] == "list" {
		if err := ListVoices(); err != nil {
			log.Fatalf("failed to list voices: %v", err)
		}
		return
	}

	jobs, err := loadJobs("jobs")
	if err != nil {
		log.Fatalf("failed to load jobs: %v", err)
	}

	for _, job := range jobs {
		// The raw 32-byte digest of the message, hex-encoded below.
		digest := sha256.Sum256([]byte(job.Message))
		filename := fmt.Sprintf("%s-%s.mp3", job.Name, hex.EncodeToString(digest[:]))

		_, statErr := os.Stat(filename)
		if statErr == nil {
			// Already synthesized on a previous run.
			continue
		}
		if !errors.Is(statErr, os.ErrNotExist) {
			// Anything other than "not found" (e.g. a permission problem)
			// is reported instead of being skipped silently.
			log.Printf("failed to stat %q: %v", filename, statErr)
			continue
		}

		mp3, err := getTTS(job.Voice, job.Message, job.Language)
		if err != nil {
			log.Printf("failed to get TTS: %v", err)
			continue
		}

		if err := os.WriteFile(filename, mp3, 0644); err != nil {
			log.Printf("failed to write mp3 file: %v", err)
			continue
		}
		log.Printf("Audio content written to file: %v\n", filename)
	}
}

// getTTS synthesizes message as MP3 audio and returns the encoded bytes.
//
// voice is sent as the voice name and language as the language code of the
// synthesis request. The audio is requested as MP3 with the
// "headphone-class-device" effects profile and a speaking rate of 0.8.
//
// A new text-to-speech client is created for each call and closed before the
// function returns. An error is returned if the client cannot be created or
// the synthesis request fails.
func getTTS(voice, message, language string) ([]byte, error) {
	ctx := context.Background()

	c, err := texttospeech.NewClient(ctx)
	if err != nil {
		return nil, fmt.Errorf("failed to create TTS client: %v", err)
	}
	defer c.Close()

	// The plain text to be spoken.
	input := &texttospeechpb.SynthesisInput{
		InputSource: &texttospeechpb.SynthesisInput_Text{Text: message},
	}

	// Which voice to use, as chosen by the caller.
	selection := &texttospeechpb.VoiceSelectionParams{
		LanguageCode: language,
		Name:         voice,
	}

	// Format and post-processing of the returned audio.
	audio := &texttospeechpb.AudioConfig{
		AudioEncoding:    texttospeechpb.AudioEncoding_MP3,
		EffectsProfileId: []string{"headphone-class-device"},
		SpeakingRate:     0.8,
	}

	out, err := c.SynthesizeSpeech(ctx, &texttospeechpb.SynthesizeSpeechRequest{
		Input:       input,
		Voice:       selection,
		AudioConfig: audio,
	})
	if err != nil {
		return nil, fmt.Errorf("failed to synthesize speech: %v", err)
	}

	return out.AudioContent, nil
}

// ListVoices prints every voice returned by the text-to-speech service to
// standard output. For each voice it prints the name, each supported language
// code, the SSML voice gender and the natural sample rate in hertz.
//
// A new client is created for the call and closed before returning. The error
// from creating the client or from the list request is returned unchanged.
func ListVoices() error {
	ctx := context.Background()

	c, err := texttospeech.NewClient(ctx)
	if err != nil {
		return err
	}
	defer c.Close()

	listing, err := c.ListVoices(ctx, &texttospeechpb.ListVoicesRequest{})
	if err != nil {
		return err
	}

	for _, v := range listing.Voices {
		fmt.Printf("Name: %v\n", v.Name)

		for _, code := range v.LanguageCodes {
			fmt.Printf("  Supported language: %v\n", code)
		}

		fmt.Printf("  SSML Voice Gender: %v\n", v.SsmlGender.String())
		fmt.Printf("  Natural Sample Rate Hertz: %v\n", v.NaturalSampleRateHertz)
	}

	return nil
}