Upload 4 files

Browse files

Files changed (5) hide show

.gitattributes +1 -0
RunJets.cs +204 -0
jets-text-to-speech.onnx +3 -0
jets-text-to-speech.sentis +3 -0
phoneme_dict.txt +0 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+jets-text-to-speech.sentis filter=lfs diff=lfs merge=lfs -text

RunJets.cs ADDED Viewed

	@@ -0,0 +1,204 @@

+using System.Collections.Generic;
+using UnityEngine;
+using Unity.Sentis;
+using System.IO;
+//                      Jets Text-To-Speech Inference
+//                      =============================
+//
+// This file implements the Jets Text-to-speech model in Unity Sentis.
+// The model uses phonemes instead of raw text, so the text has to be converted to phonemes first.
+// Place this file on the Main Camera
+// Add an audio source
+// Change the inputText
+// When running you can press space bar to play it again
+// Converts inputText to phonemes, runs the Jets model through a Sentis worker and plays the
+// generated waveform on the AudioSource attached to the same GameObject.
+// Inference runs once in Start and again each time the space bar is pressed.
+public class RunJets : MonoBehaviour
+{
+    public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
+    //string inputText = "The quick brown fox jumped over the lazy dog";
+    //string inputText = "Hello, my name is Ginger the Giraffe!";
+    //string inputText = "There are many uses of the things she uses!";
+
+    //Set to true if we have put the phoneme_dict.txt in the Assets/StreamingAssets folder.
+    //ReadDictionary switches this off when the file cannot be found.
+    bool hasPhonemeDictionary = true;
+
+    //Token table: the position of a symbol in this array is the integer id sent to the model.
+    readonly string[] phonemes = new string[] {
+        "<blank>", "<unk>", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
+        "IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", "','", "AA1", "B",
+        "HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
+        "ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
+        "AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
+        "AH2", "AY0", "IY2", "AW2", "AA0", "''''", "ER2", "UH2", "'?'", "OY2", "'!'", "AW0",
+        "UH0", "OY0", "..", "<sos/eos>" };
+
+    //26 entries, apparently one phoneme per letter A-Z (presumably a per-letter fallback).
+    //Not referenced by any method in this class at present.
+    readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');
+
+    //Spoken form of the digits 0-9, indexed by digit value.
+    static readonly string[] digitWords = {
+        "ZERO", "ONE", "TWO", "THREE", "FOUR", "FIVE", "SIX", "SEVEN", "EIGHT", "NINE" };
+
+    //Sample rate of the AudioClip built from the model output.
+    //Can change pitch and speed with this for a slightly different voice:
+    const int samplerate = 22050;
+
+    //Maps an upper-case word (or punctuation mark) to its space-separated phoneme string.
+    readonly Dictionary<string, string> dict = new ();
+    IWorker engine;
+    AudioClip clip;
+
+    void Start()
+    {
+        LoadModel();
+        ReadDictionary();
+        TextToSpeech();
+    }
+
+    //Loads the .sentis model from StreamingAssets and creates the worker that runs it.
+    void LoadModel()
+    {
+        var model = ModelLoader.Load(Application.streamingAssetsPath + "/jets-text-to-speech.sentis");
+        engine = WorkerFactory.CreateWorker(BackendType.GPUCompute, model);
+    }
+
+    //Builds the phoneme string (from inputText, or a fixed example when there is no
+    //dictionary) and runs inference on it.
+    void TextToSpeech()
+    {
+        string ptext;
+        if (hasPhonemeDictionary)
+        {
+            ptext = TextToPhonemes(inputText);
+            Debug.Log(ptext);
+        }
+        else
+        {
+            //If we have no phoneme dictionary we can use one of these examples:
+            ptext = "DH AH0 K W IH1 K B R AW1 N F AA1 K S JH AH1 M P S OW1 V ER0 DH AH0 L EY1 Z IY0 D AO1 G .";
+            //ptext = "W AH1 N S AH0 P AA1 N AH0 T AY1 M , AH0 F R AA1 G M EH1 T AH0 P R IH1 N S EH0 S . DH AH0 F R AA1 G K IH1 S T DH AH0 P R IH1 N S EH0 S AH0 N D B IH0 K EY1 M AH0 P R IH1 N S .";
+            //ptext = "D UW1 P L AH0 K EY2 T";
+        }
+        DoInference(ptext);
+    }
+
+    //Fills dict from StreamingAssets/phoneme_dict.txt. Each usable line is a word followed by
+    //its phonemes, separated by spaces; lines starting with ";;;" are comments.
+    void ReadDictionary()
+    {
+        if (!hasPhonemeDictionary)
+        {
+            return;
+        }
+
+        string path = Application.streamingAssetsPath + "/phoneme_dict.txt";
+        if (!File.Exists(path))
+        {
+            //Fall back to the built-in example instead of throwing out of Start.
+            Debug.LogWarning($"Phoneme dictionary not found at {path}; using the built-in example phonemes instead.");
+            hasPhonemeDictionary = false;
+            return;
+        }
+
+        foreach (string line in File.ReadAllLines(path))
+        {
+            string[] parts = line.Split(' ', System.StringSplitOptions.RemoveEmptyEntries);
+            //Skip blank lines, lines with a word but no phonemes, and comment lines.
+            if (parts.Length < 2 || parts[0].StartsWith(";;;", System.StringComparison.Ordinal))
+            {
+                continue;
+            }
+
+            string key = parts[0];
+            //Keep the first entry for a word; a repeated key must not abort loading.
+            if (!dict.ContainsKey(key))
+            {
+                //Re-join the phonemes so the result does not depend on the separator width.
+                dict[key] = string.Join(" ", parts, 1, parts.Length - 1);
+            }
+        }
+
+        // Add codes for punctuation to the dictionary (indexer: safe if already present)
+        dict[","] = "','";
+        dict["."] = ".";
+        dict["!"] = "'!'";
+        dict["?"] = "'?'";
+        dict["\""] = "''''";
+    }
+
+    //Replaces every ASCII digit with its English word surrounded by spaces,
+    //e.g. "7" becomes " SEVEN ". Returns "" for null or empty input.
+    public string ExpandNumbers(string text)
+    {
+        if (string.IsNullOrEmpty(text))
+        {
+            return "";
+        }
+
+        var expanded = new System.Text.StringBuilder(text.Length);
+        foreach (char c in text)
+        {
+            if (c >= '0' && c <= '9')
+            {
+                expanded.Append(' ').Append(digitWords[c - '0']).Append(' ');
+            }
+            else
+            {
+                expanded.Append(c);
+            }
+        }
+        return expanded.ToString();
+    }
+
+    //Converts free text to a phoneme string: digits are spelled out, the text is upper-cased,
+    //split on whitespace and each word is decoded with DecodeWord.
+    //Every decoded entry is followed by a single space. Returns "" for null or empty input.
+    public string TextToPhonemes(string text)
+    {
+        if (string.IsNullOrEmpty(text))
+        {
+            return "";
+        }
+
+        //Invariant upper-casing so dictionary lookups do not depend on the device culture.
+        string[] words = ExpandNumbers(text).ToUpperInvariant().Split();
+        var output = new System.Text.StringBuilder();
+        foreach (string word in words)
+        {
+            output.Append(DecodeWord(word));
+        }
+        return output.ToString();
+    }
+
+    //Decode the word into phonemes by looking for the longest dictionary entry that matches
+    //the start of the word, then the longest entry matching what is left, and so on.
+    //A character that starts no dictionary entry is skipped so decoding can continue.
+    //This works fairly well but could be improved. The original paper had a model that
+    //dealt with guessing the phonemes of words
+    public string DecodeWord(string word)
+    {
+        if (string.IsNullOrEmpty(word))
+        {
+            return "";
+        }
+
+        var output = new System.Text.StringBuilder();
+        int start = 0;
+        while (start < word.Length)
+        {
+            bool matched = false;
+            //Try the longest remaining substring first and shrink it one character at a time.
+            for (int end = word.Length; end > start; end--)
+            {
+                if (dict.TryGetValue(word.Substring(start, end - start), out string value))
+                {
+                    output.Append(value).Append(' ');
+                    start = end;
+                    matched = true;
+                    break;
+                }
+            }
+            if (!matched)
+            {
+                //Nothing in the dictionary starts here: drop this character and carry on.
+                start++;
+            }
+        }
+        return output.ToString();
+    }
+
+    //Maps a whitespace-separated phoneme string to token ids (indices into phonemes).
+    //Anything not found in the table, including the empty entry produced by a trailing
+    //space, maps to index 0.
+    int[] GetTokens(string ptext)
+    {
+        string[] p = ptext.Split();
+        var tokens = new int[p.Length];
+        for (int i = 0; i < tokens.Length; i++)
+        {
+            tokens[i] = Mathf.Max(0, System.Array.IndexOf(phonemes, p[i]));
+        }
+        return tokens;
+    }
+
+    //Runs the model on the given phoneme string, copies the "wav" output into a new
+    //mono AudioClip at samplerate and plays it.
+    public void DoInference(string ptext)
+    {
+        int[] tokens = GetTokens(ptext);
+        using var input = new TensorInt(new TensorShape(tokens.Length), tokens);
+        var result = engine.Execute(input);
+        var output = result.PeekOutput("wav") as TensorFloat;
+        if (output == null)
+        {
+            Debug.LogError("The model did not produce a float tensor named \"wav\"");
+            return;
+        }
+        output.MakeReadable();
+        var samples = output.ToReadOnlyArray();
+        if (samples.Length == 0)
+        {
+            Debug.LogWarning("The model produced no audio samples");
+            return;
+        }
+        //Floating-point division so that short clips are not reported as 0 seconds.
+        Debug.Log($"Audio size = {(float)samples.Length / samplerate:F2} seconds");
+
+        //Release the clip from the previous run; otherwise every run leaks one AudioClip.
+        if (clip != null)
+        {
+            Destroy(clip);
+        }
+        clip = AudioClip.Create("voice audio", samples.Length, 1, samplerate, false);
+        clip.SetData(samples, 0);
+        Speak();
+    }
+
+    //Plays the current clip on this GameObject's AudioSource, if there is one.
+    private void Speak()
+    {
+        AudioSource audioSource = GetComponent<AudioSource>();
+        if (audioSource != null)
+        {
+            audioSource.clip = clip;
+            audioSource.Play();
+        }
+        else
+        {
+            Debug.Log("There is no audio source");
+        }
+    }
+
+    void Update()
+    {
+        if (Input.GetKeyDown(KeyCode.Space))
+        {
+            TextToSpeech();
+        }
+    }
+
+    //Releases the worker and the last generated clip.
+    private void OnDestroy()
+    {
+        engine?.Dispose();
+        if (clip != null)
+        {
+            Destroy(clip);
+        }
+    }
+}

jets-text-to-speech.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3f71523a869ae567d3f0b6db61a6a84a27288d8f794b564778f1c6cff79eef82
+size 132619847

jets-text-to-speech.sentis ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1541f940099ef2d851adfd23b43b9d0226208ba0b062da7ff7038a0315295bd
+size 138538708

phoneme_dict.txt ADDED Viewed

The diff for this file is too large to render. See raw diff