split jobs into several batch messages to stay within TTS API limitations for max tokens

This commit is contained in:
Steven Polley 2024-08-06 18:01:23 -06:00
parent b18c04d491
commit fa08c95ed2
3 changed files with 154 additions and 22 deletions

6
go.mod
View File

@ -2,7 +2,10 @@ module deadbeef.codes/steven/tts
go 1.22.0
require cloud.google.com/go/texttospeech v1.7.11
require (
cloud.google.com/go/texttospeech v1.7.11
github.com/hyacinthus/mp3join v0.0.0-20190710105654-d46eaeeb9552
)
require (
cloud.google.com/go v0.115.0 // indirect
@ -10,6 +13,7 @@ require (
cloud.google.com/go/auth/oauth2adapt v0.2.3 // indirect
cloud.google.com/go/compute/metadata v0.5.0 // indirect
cloud.google.com/go/longrunning v0.5.11 // indirect
github.com/dmulholland/mp3lib v0.0.0-20190407131416-50ad4bfbe332 // indirect
github.com/felixge/httpsnoop v1.0.4 // indirect
github.com/go-logr/logr v1.4.2 // indirect
github.com/go-logr/stdr v1.2.2 // indirect

4
go.sum
View File

@ -18,6 +18,8 @@ github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGX
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dmulholland/mp3lib v0.0.0-20190407131416-50ad4bfbe332 h1:zh+x3xrRRobJ+O6Jy+u+8+TSj7qzuW4EL8Hkf5cbAck=
github.com/dmulholland/mp3lib v0.0.0-20190407131416-50ad4bfbe332/go.mod h1:U3TgSK0lA/gbTgENpBTSNn/OmowG1hr07mKQqqvbLxE=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
@ -60,6 +62,8 @@ github.com/googleapis/enterprise-certificate-proxy v0.3.2 h1:Vie5ybvEvT75RniqhfF
github.com/googleapis/enterprise-certificate-proxy v0.3.2/go.mod h1:VLSiSSBs/ksPL8kq3OBOQ6WRI2QnaFynd1DCjZ62+V0=
github.com/googleapis/gax-go/v2 v2.13.0 h1:yitjD5f7jQHhyDsnhKEBU52NdvvdSeGzlAnDPT0hH1s=
github.com/googleapis/gax-go/v2 v2.13.0/go.mod h1:Z/fvTZXF8/uw7Xu5GuslPw+bplx6SS338j1Is2S+B7A=
github.com/hyacinthus/mp3join v0.0.0-20190710105654-d46eaeeb9552 h1:cjR5hraUrLrNBQ6lXsjd/VDtJf7+3TOow++DaTAj8r8=
github.com/hyacinthus/mp3join v0.0.0-20190710105654-d46eaeeb9552/go.mod h1:eQzsT6lJmJ/wcTqoHaHfuadmI1lzaHjrdDLO4qKiqcI=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=

146
main.go
View File

@ -1,6 +1,7 @@
package main
import (
"bytes"
"context"
"crypto/sha256"
"encoding/hex"
@ -15,6 +16,7 @@ import (
texttospeech "cloud.google.com/go/texttospeech/apiv1"
"cloud.google.com/go/texttospeech/apiv1/texttospeechpb"
"github.com/hyacinthus/mp3join"
)
type Job struct {
@ -28,11 +30,11 @@ func loadJobs(directory string) ([]Job, error) {
jobs := make([]Job, 0)
err := filepath.Walk(directory, func(path string, info os.FileInfo, err error) error {
if !info.IsDir() && strings.HasSuffix(path, ".json") {
f, err := os.Open(fmt.Sprintf("%s", path))
defer f.Close()
f, err := os.Open(path)
if err != nil {
return fmt.Errorf("failed to open file '%s': %v", path, err)
}
defer f.Close()
b, err := io.ReadAll(f)
if err != nil {
@ -75,27 +77,109 @@ func main() {
// For each job, check whether its MP3 file already exists; if it does not, synthesize the audio and save it to a file.
// File names contain the SHA256 sum of the spoken text.
for _, job := range jobs {
err = processJob(job)
if err != nil {
log.Printf("job '%s' failed to process: %v", job.Name, err)
}
}
err = cleanCache(jobs)
if err != nil {
log.Printf("failed to clean cache: %v", err)
}
}
// Splits a job
func processJob(job Job) error {
messages, err := splitJob(job.Message, 12)
if err != nil {
return fmt.Errorf("failed to split job '%s': %v", job.Name, err)
}
// Process
var mp3Files [][]byte
for i, message := range messages {
h := sha256.New()
h.Write([]byte(job.Message))
h.Write([]byte(message))
sha := h.Sum(nil) // "sha" is uint8 type, encoded in base16
filename := fmt.Sprintf("%s-%s.mp3", job.Name, hex.EncodeToString(sha))
filename := fmt.Sprintf("jobs/cache/%s-%d-%s.mp3", job.Name, i, hex.EncodeToString(sha))
var mp3 []byte
if _, err := os.Stat(fmt.Sprintf("%s", filename)); errors.Is(err, os.ErrNotExist) {
mp3, err := getTTS(job.Voice, job.Message, job.Language)
if _, err := os.Stat(filename); errors.Is(err, os.ErrNotExist) {
mp3, err = getTTS(job.Voice, message, job.Language)
if err != nil {
log.Printf("failed to get TTS: %v", err)
continue
return fmt.Errorf("failed to get TTS: %v", err)
}
err = os.WriteFile(fmt.Sprintf("%s", filename), mp3, 0644)
err = os.WriteFile(filename, mp3, 0644)
if err != nil {
log.Printf("failed to write mp3 file: %v", err)
continue
return fmt.Errorf("failed to write mp3 file '%s': %v", filename, err)
}
log.Printf("Audio content written to file: %v\n", filename)
} else {
mp3, err = os.ReadFile(filename)
if err != nil {
return fmt.Errorf("failed to read mp3 file '%s': %v", filename, err)
}
}
mp3Files = append(mp3Files, mp3)
}
mp3Joiner := mp3join.New()
for i, mp3 := range mp3Files {
r := bytes.NewReader(mp3)
err := mp3Joiner.Append(r)
if err != nil {
return fmt.Errorf("failed to join mp3 file '%d': %v", i, err)
}
}
dest := mp3Joiner.Reader()
combinedBytes, err := io.ReadAll(dest)
if err != nil {
return fmt.Errorf("failed to read combined bytes: %v", err)
}
f, err := os.Create(fmt.Sprintf("jobs/%s.mp3", job.Name))
if err != nil {
return fmt.Errorf("failed to create joined mp3 file: %v", err)
}
defer f.Close()
_, err = f.Write(combinedBytes)
if err != nil {
return fmt.Errorf("failed to write combined bytes to file: %v", err)
}
return nil
}
// splitJob breaks jobMessage into chunks of at most maxSentences sentences, to
// get around API limitations on max tokens per request.
//
// Sentences are delimited by ". ". Each returned chunk contains its sentences
// joined by ". " and ends with a period; a period is only appended to the
// final sentence when the message does not already end with one. Leading and
// trailing whitespace of jobMessage is ignored, and an empty (or
// whitespace-only) message yields an empty, non-nil slice.
//
// An error is returned when maxSentences is not positive.
func splitJob(jobMessage string, maxSentences int) ([]string, error) {
	if maxSentences <= 0 {
		return nil, fmt.Errorf("maxSentences must be greater than zero, got %d", maxSentences)
	}

	messages := make([]string, 0)
	trimmed := strings.TrimSpace(jobMessage)
	if trimmed == "" {
		return messages, nil
	}

	sentences := strings.Split(trimmed, ". ")

	// Splitting consumed the period of every sentence except the last one,
	// which keeps whatever the author wrote. Terminate it only if needed so
	// the output never ends in "..".
	last := len(sentences) - 1
	if !strings.HasSuffix(sentences[last], ".") {
		sentences[last] += "."
	}

	for start := 0; start < len(sentences); start += maxSentences {
		end := start + maxSentences
		if end > len(sentences) {
			end = len(sentences)
		}
		chunk := strings.Join(sentences[start:end], ". ")
		if end <= last {
			// The chunk's closing sentence lost its period to the split.
			chunk += "."
		}
		messages = append(messages, chunk)
	}

	return messages, nil
}
// Accepts a string and returns a byteslice of the message in mp3 format, and an error
@ -176,3 +260,43 @@ func ListVoices() error {
return nil
}
// cleanCache deletes cached MP3 chunks in jobs/cache that do not belong to any
// of the given jobs.
//
// Only regular entries ending in ".mp3" whose names follow the cache naming
// scheme "<job name>-<chunk index>-<hash>.mp3" are considered; everything else
// is left untouched. It returns an error if the cache directory cannot be read
// or a stale file cannot be removed, and nil otherwise.
func cleanCache(jobs []Job) error {
	const cacheDir = "jobs/cache"

	cacheFiles, err := os.ReadDir(cacheDir)
	if err != nil {
		return fmt.Errorf("failed to read jobs/cache directory: %v", err)
	}

	// Index the active job names once instead of rescanning jobs per file.
	activeJobs := make(map[string]struct{}, len(jobs))
	for _, job := range jobs {
		activeJobs[job.Name] = struct{}{}
	}

	for _, file := range cacheFiles {
		if file.IsDir() || !strings.HasSuffix(file.Name(), ".mp3") {
			continue
		}

		jobName, ok := cacheFileJobName(file.Name())
		if !ok {
			// Not a file this program's naming scheme produces; do not delete it.
			continue
		}

		// First, check if this file even has an active job referencing it, if not then delete
		if _, active := activeJobs[jobName]; !active {
			if err := os.Remove(filepath.Join(cacheDir, file.Name())); err != nil {
				return fmt.Errorf("failed to remove file '%s': %v", file.Name(), err)
			}
			continue
		}

		// Second, check if this version of a split job message is the active one, if not then delete
		// TBD
	}

	return nil
}

// cacheFileJobName extracts the job name from a cache file name of the form
// "<job name>-<chunk index>-<hash>.mp3". The name is parsed from the right so
// that job names containing "-" are recovered intact. ok is false when the
// file name does not contain the two separators.
func cacheFileJobName(fileName string) (jobName string, ok bool) {
	base := strings.TrimSuffix(fileName, ".mp3")

	hashSep := strings.LastIndex(base, "-")
	if hashSep < 0 {
		return "", false
	}
	indexSep := strings.LastIndex(base[:hashSep], "-")
	if indexSep < 0 {
		return "", false
	}

	return base[:indexSep], true
}