updated to inference engine

Browse files

Files changed (6) hide show

README.md +17 -14
RunJets.cs +29 -32
phoneme_dict.txt → data/phoneme_dict.txt +0 -0
info.json +3 -4
jets-text-to-speech.sentis +0 -3
jets-text-to-speech.onnx → models/jets-text-to-speech.onnx +0 -0

README.md CHANGED Viewed

@@ -1,25 +1,28 @@
 ---
 license: cc-by-4.0
 library_name: unity-sentis
 ---
-# Jets Text-to-Speech Model validated for Sentis 2.1.2 in Unity 6
-This is a text to speech model called [Jets](https://huggingface.co/imdanboy/jets). It takes in a text string which you convert to phonemes using a dictionary and then outputs a wav to play the voice.
 ## How to Use
-* Create a new scene in Unity 6
-* Install `com.unity.sentis` version `2.1.2` package
-* Put the c# script on the Main Camera
-* Put the `jets-text-to-speech.sentis` file and the `phoneme_dict.txt` file in the `Assets/StreamingAssets` folder
-* Add an AudioSource component on the Main Camera
-* Set the `inputText` string for what you want it to say
-* Press play
-## Information
-This version uses a phoneme dictionary to convert the text into a string of phonemes. There are other ways to do this, for example using another model, or heuristics.
-Since we are using a simple dictionary it has no way of distinguishing heteronyms (two words with the same spelling but different pronounciation).
 ## License
 Attribution for the original creators is required. See [Jets](https://huggingface.co/imdanboy/jets) for more details.

 ---
 license: cc-by-4.0
 library_name: unity-sentis
+tags:
+  - unity-inference-engine
 ---
+# Jets in Unity 6 using Inference Engine
+This is the [Jets](https://huggingface.co/imdanboy/jets) model running in Unity 6 with Inference Engine. It is a text-to-speech model that takes phonemes as input and outputs wav data of a voice speaking the text.
 ## How to Use
+* Create a new scene in Unity 6;
+* Install `com.unity.ai.inference` from the package manager;
+* Add the `RunJets.cs` script to the Main Camera;
+* Add an AudioSource component to the Main Camera;
+* Drag the `jets-text-to-speech.onnx` file from the `models` folder into the `Model Asset` field;
+* Drag the `phoneme_dict.txt` file from the `data` folder into the `Phoneme Asset` field;
+## Preview
+Enter play mode. If working correctly you should hear the inferred audio of the voice.
+## Inference Engine
+Inference Engine is a neural network inference library for Unity. Find out more [here](https://docs.unity3d.com/Packages/com.unity.ai.inference@latest).
 ## License
 Attribution for the original creators is required. See [Jets](https://huggingface.co/imdanboy/jets) for more details.

RunJets.cs CHANGED Viewed

@@ -1,20 +1,12 @@
 using System.Collections.Generic;
 using UnityEngine;
-using Unity.Sentis;
-using System.IO;
-//                      Jets Text-To-Speech Inference
-//                      =============================
-//
-// This file implements the Jets Text-to-speech model in Unity Sentis
-// The model uses phenomes instead of raw text so you have to convert it first.
-// Place this file on the Main Camera
-// Add an audio source
-// Change the inputText
-// When running you can press space bar to play it again
 public class RunJets : MonoBehaviour
 {
     public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
     //string inputText = "The quick brown fox jumped over the lazy dog";
     //string inputText = "There are many uses of the things she uses!";
@@ -22,21 +14,23 @@ public class RunJets : MonoBehaviour
     //Set to true if we have put the phoneme_dict.txt in the Assets/StreamingAssets folder
     bool hasPhenomeDictionary = true;
-    readonly string[] phonemes = new string[] {
-        "<blank>", "<unk>", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
-        "IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", ",", "AA1", "B",
-        "HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
-        "ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
-        "AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
-        "AH2", "AY0", "IY2", "AW2", "AA0", "\"", "ER2", "UH2", "?", "OY2", "!", "AW0",
-        "UH0", "OY0", "..", "<sos/eos>" };
     readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');
     //Can change pitch and speed with this for a slightly different voice:
     const int samplerate = 22050;
-    Dictionary<string, string> dict = new ();
     Worker worker;
@@ -51,7 +45,7 @@ public class RunJets : MonoBehaviour
     void LoadModel()
     {
-        var model = ModelLoader.Load(Path.Join(Application.streamingAssetsPath, "jets-text-to-speech.sentis"));
         worker = new Worker(model, BackendType.GPUCompute);
     }
@@ -76,10 +70,12 @@ public class RunJets : MonoBehaviour
     void ReadDictionary()
     {
         if (!hasPhenomeDictionary) return;
-        string[] words = File.ReadAllLines(Path.Join(Application.streamingAssetsPath,"phoneme_dict.txt"));
         for (int i = 0; i < words.Length; i++)
         {
             string s = words[i];
             string[] parts = s.Split();
             if (parts[0] != ";;;") //ignore comments in file
             {
@@ -93,7 +89,7 @@ public class RunJets : MonoBehaviour
         dict.Add("!", "!");
         dict.Add("?", "?");
         dict.Add("\"", "\"");
-        // You could add extra word pronounciations here e.g.
         //dict.Add("somenewword","[phonemes]");
     }
@@ -126,15 +122,15 @@ public class RunJets : MonoBehaviour
     }
     //Decode the word into phonemes by looking for the longest word in the dictionary that matches
-    //the first part of the word and so on.
     //This works fairly well but could be improved. The original paper had a model that
     //dealt with guessing the phonemes of words
     public string DecodeWord(string word)
     {
         string output = "";
         int start = 0;
-        for (int end = word.Length; end >= 0 && start < word.Length ; end--)
-        {
             if (end <= start) //no matches
             {
                 start++;
@@ -151,20 +147,20 @@ public class RunJets : MonoBehaviour
         }
         return output;
     }
     int[] GetTokens(string ptext)
     {
         string[] p = ptext.Split();
         var tokens = new int[p.Length];
         for (int i = 0; i < tokens.Length; i++)
         {
-            tokens[i] = Mathf.Max(0, System.Array.IndexOf(phonemes, p[i]));
         }
         return tokens;
     }
     public void DoInference(string ptext)
-    {
         int[] tokens = GetTokens(ptext);
         using var input = new Tensor<int>(new TensorShape(tokens.Length), tokens);
@@ -180,7 +176,8 @@ public class RunJets : MonoBehaviour
         Speak();
     }
-    private void Speak()
     {
         AudioSource audioSource = GetComponent<AudioSource>();
         if (audioSource != null)
@@ -202,7 +199,7 @@ public class RunJets : MonoBehaviour
         }
     }
-    private void OnDestroy()
     {
         worker?.Dispose();
     }

+using System;
 using System.Collections.Generic;
+using Unity.InferenceEngine;
 using UnityEngine;
 public class RunJets : MonoBehaviour
 {
+    public ModelAsset modelAsset;
+    public TextAsset phonemeAsset;
     public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
     //string inputText = "The quick brown fox jumped over the lazy dog";
     //string inputText = "There are many uses of the things she uses!";
     //Set to true if the phoneme_dict.txt file has been assigned to the Phoneme Asset field
     bool hasPhenomeDictionary = true;
+    readonly string[] phonemes =
+    {
+        "<blank>", "<unk>", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
+        "IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", ",", "AA1", "B",
+        "HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
+        "ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
+        "AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
+        "AH2", "AY0", "IY2", "AW2", "AA0", "\"", "ER2", "UH2", "?", "OY2", "!", "AW0",
+        "UH0", "OY0", "..", "<sos/eos>"
+    };
     readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');
     //Can change pitch and speed with this for a slightly different voice:
     const int samplerate = 22050;
+    Dictionary<string, string> dict = new();
     Worker worker;
     void LoadModel()
     {
+        var model = ModelLoader.Load(modelAsset);
         worker = new Worker(model, BackendType.GPUCompute);
     }
     void ReadDictionary()
     {
         if (!hasPhenomeDictionary) return;
+        string[] words = phonemeAsset.text.Split("\r\n");
         for (int i = 0; i < words.Length; i++)
         {
             string s = words[i];
+            if (string.IsNullOrWhiteSpace(s))
+                continue;
             string[] parts = s.Split();
             if (parts[0] != ";;;") //ignore comments in file
             {
         dict.Add("!", "!");
         dict.Add("?", "?");
         dict.Add("\"", "\"");
+        // You could add extra word pronunciations here e.g.
         //dict.Add("somenewword","[phonemes]");
     }
     }
     //Decode the word into phonemes by looking for the longest word in the dictionary that matches
+    //the first part of the word and so on.
     //This works fairly well but could be improved. The original paper had a model that
     //dealt with guessing the phonemes of words
     public string DecodeWord(string word)
     {
         string output = "";
         int start = 0;
+        for (int end = word.Length; end >= 0 && start < word.Length; end--)
+        {
             if (end <= start) //no matches
             {
                 start++;
         }
         return output;
     }
     int[] GetTokens(string ptext)
     {
         string[] p = ptext.Split();
         var tokens = new int[p.Length];
         for (int i = 0; i < tokens.Length; i++)
         {
+            tokens[i] = Mathf.Max(0, Array.IndexOf(phonemes, p[i]));
         }
         return tokens;
     }
     public void DoInference(string ptext)
+    {
         int[] tokens = GetTokens(ptext);
         using var input = new Tensor<int>(new TensorShape(tokens.Length), tokens);
         Speak();
     }
+    void Speak()
     {
         AudioSource audioSource = GetComponent<AudioSource>();
         if (audioSource != null)
         }
     }
+    void OnDestroy()
     {
         worker?.Dispose();
     }

phoneme_dict.txt → data/phoneme_dict.txt RENAMED Viewed

File without changes

info.json CHANGED Viewed

@@ -3,13 +3,12 @@
         "RunJets.cs"
     ],
     "models": [
-        "jets-text-to-speech.onnx",
-        "jets-text-to-speech.sentis"
     ],
     "data": [
-        "phoneme_dict.txt"
     ],
     "version": [
-        "2.1.2"
     ]
 }

         "RunJets.cs"
     ],
     "models": [
+        "models/jets-text-to-speech.onnx"
     ],
     "data": [
+        "data/phoneme_dict.txt"
     ],
     "version": [
+        "2.2.0"
     ]
 }

jets-text-to-speech.sentis DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:405a4e8d24b07142ac6109b653750b5d97ec720bbea41c5e76838e5f30ec5c70
-size 138331240

jets-text-to-speech.onnx → models/jets-text-to-speech.onnx RENAMED Viewed

File without changes