Commit
·
bb35590
1
Parent(s):
148e5e2
updated to inference engine
Browse files- README.md +17 -14
- RunJets.cs +29 -32
- phoneme_dict.txt → data/phoneme_dict.txt +0 -0
- info.json +3 -4
- jets-text-to-speech.sentis +0 -3
- jets-text-to-speech.onnx → models/jets-text-to-speech.onnx +0 -0
README.md
CHANGED
|
@@ -1,25 +1,28 @@
|
|
| 1 |
---
|
| 2 |
license: cc-by-4.0
|
| 3 |
library_name: unity-sentis
|
|
|
|
|
|
|
| 4 |
---
|
| 5 |
|
| 6 |
-
# Jets
|
| 7 |
|
| 8 |
-
This is
|
| 9 |
|
| 10 |
## How to Use
|
| 11 |
-
|
| 12 |
-
*
|
| 13 |
-
*
|
| 14 |
-
*
|
| 15 |
-
* Add an AudioSource component
|
| 16 |
-
*
|
| 17 |
-
*
|
| 18 |
-
|
| 19 |
-
##
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
|
|
|
| 23 |
|
| 24 |
## License
|
| 25 |
Attribution for the original creators is required. See [Jets](https://huggingface.co/imdanboy/jets) for more details.
|
|
|
|
| 1 |
---
|
| 2 |
license: cc-by-4.0
|
| 3 |
library_name: unity-sentis
|
| 4 |
+
tags:
|
| 5 |
+
- unity-inference-engine
|
| 6 |
---
|
| 7 |
|
| 8 |
+
# Jets in Unity 6 using Inference Engine
|
| 9 |
|
| 10 |
+
This is the [Jets](https://huggingface.co/imdanboy/jets) model running in Unity 6 with Inference Engine. It is a text-to-speech model that takes phonemes as input and outputs WAV data of a voice speaking the text.
|
| 11 |
|
| 12 |
## How to Use
|
| 13 |
+
|
| 14 |
+
* Create a new scene in Unity 6;
|
| 15 |
+
* Install `com.unity.ai.inference` from the package manager;
|
| 16 |
+
* Add the `RunJets.cs` script to the Main Camera;
|
| 17 |
+
* Add an AudioSource component to the Main Camera;
|
| 18 |
+
* Drag the `jets-text-to-speech.onnx` file from the `models` folder into the `Model Asset` field;
|
| 19 |
+
* Drag the `phoneme_dict.txt` file from the `data` folder into the `Phoneme Asset` field;
|
| 20 |
+
|
| 21 |
+
## Preview
|
| 22 |
+
Enter play mode. If working correctly you should hear the inferred audio of the voice.
|
| 23 |
+
|
| 24 |
+
## Inference Engine
|
| 25 |
+
Inference Engine is a neural network inference library for Unity. Find out more [here](https://docs.unity3d.com/Packages/com.unity.ai.inference@latest).
|
| 26 |
|
| 27 |
## License
|
| 28 |
Attribution for the original creators is required. See [Jets](https://huggingface.co/imdanboy/jets) for more details.
|
RunJets.cs
CHANGED
|
@@ -1,20 +1,12 @@
|
|
|
|
|
| 1 |
using System.Collections.Generic;
|
|
|
|
| 2 |
using UnityEngine;
|
| 3 |
-
using Unity.Sentis;
|
| 4 |
-
using System.IO;
|
| 5 |
-
|
| 6 |
-
// Jets Text-To-Speech Inference
|
| 7 |
-
// =============================
|
| 8 |
-
//
|
| 9 |
-
// This file implements the Jets Text-to-speech model in Unity Sentis
|
| 10 |
-
// The model uses phonemes instead of raw text so you have to convert it first.
|
| 11 |
-
// Place this file on the Main Camera
|
| 12 |
-
// Add an audio source
|
| 13 |
-
// Change the inputText
|
| 14 |
-
// When running you can press space bar to play it again
|
| 15 |
|
| 16 |
public class RunJets : MonoBehaviour
|
| 17 |
{
|
|
|
|
|
|
|
| 18 |
public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
|
| 19 |
//string inputText = "The quick brown fox jumped over the lazy dog";
|
| 20 |
//string inputText = "There are many uses of the things she uses!";
|
|
@@ -22,21 +14,23 @@ public class RunJets : MonoBehaviour
|
|
| 22 |
//Set to true if we have put the phoneme_dict.txt in the Assets/StreamingAssets folder
|
| 23 |
bool hasPhenomeDictionary = true;
|
| 24 |
|
| 25 |
-
readonly string[] phonemes =
|
| 26 |
-
|
| 27 |
-
"
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
|
|
|
|
|
|
| 33 |
|
| 34 |
readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');
|
| 35 |
|
| 36 |
//Can change pitch and speed with this for a slightly different voice:
|
| 37 |
const int samplerate = 22050;
|
| 38 |
|
| 39 |
-
Dictionary<string, string> dict = new
|
| 40 |
|
| 41 |
Worker worker;
|
| 42 |
|
|
@@ -51,7 +45,7 @@ public class RunJets : MonoBehaviour
|
|
| 51 |
|
| 52 |
void LoadModel()
|
| 53 |
{
|
| 54 |
-
var model = ModelLoader.Load(
|
| 55 |
worker = new Worker(model, BackendType.GPUCompute);
|
| 56 |
}
|
| 57 |
|
|
@@ -76,10 +70,12 @@ public class RunJets : MonoBehaviour
|
|
| 76 |
void ReadDictionary()
|
| 77 |
{
|
| 78 |
if (!hasPhenomeDictionary) return;
|
| 79 |
-
string[] words =
|
| 80 |
for (int i = 0; i < words.Length; i++)
|
| 81 |
{
|
| 82 |
string s = words[i];
|
|
|
|
|
|
|
| 83 |
string[] parts = s.Split();
|
| 84 |
if (parts[0] != ";;;") //ignore comments in file
|
| 85 |
{
|
|
@@ -93,7 +89,7 @@ public class RunJets : MonoBehaviour
|
|
| 93 |
dict.Add("!", "!");
|
| 94 |
dict.Add("?", "?");
|
| 95 |
dict.Add("\"", "\"");
|
| 96 |
-
// You could add extra word
|
| 97 |
//dict.Add("somenewword","[phonemes]");
|
| 98 |
}
|
| 99 |
|
|
@@ -126,15 +122,15 @@ public class RunJets : MonoBehaviour
|
|
| 126 |
}
|
| 127 |
|
| 128 |
//Decode the word into phonemes by looking for the longest word in the dictionary that matches
|
| 129 |
-
//the first part of the word and so on.
|
| 130 |
//This works fairly well but could be improved. The original paper had a model that
|
| 131 |
//dealt with guessing the phonemes of words
|
| 132 |
public string DecodeWord(string word)
|
| 133 |
{
|
| 134 |
string output = "";
|
| 135 |
int start = 0;
|
| 136 |
-
for (int end = word.Length; end >= 0 && start < word.Length
|
| 137 |
-
{
|
| 138 |
if (end <= start) //no matches
|
| 139 |
{
|
| 140 |
start++;
|
|
@@ -151,20 +147,20 @@ public class RunJets : MonoBehaviour
|
|
| 151 |
}
|
| 152 |
return output;
|
| 153 |
}
|
| 154 |
-
|
| 155 |
int[] GetTokens(string ptext)
|
| 156 |
{
|
| 157 |
string[] p = ptext.Split();
|
| 158 |
var tokens = new int[p.Length];
|
| 159 |
for (int i = 0; i < tokens.Length; i++)
|
| 160 |
{
|
| 161 |
-
tokens[i] = Mathf.Max(0,
|
| 162 |
}
|
| 163 |
return tokens;
|
| 164 |
}
|
| 165 |
|
| 166 |
public void DoInference(string ptext)
|
| 167 |
-
{
|
| 168 |
int[] tokens = GetTokens(ptext);
|
| 169 |
|
| 170 |
using var input = new Tensor<int>(new TensorShape(tokens.Length), tokens);
|
|
@@ -180,7 +176,8 @@ public class RunJets : MonoBehaviour
|
|
| 180 |
|
| 181 |
Speak();
|
| 182 |
}
|
| 183 |
-
|
|
|
|
| 184 |
{
|
| 185 |
AudioSource audioSource = GetComponent<AudioSource>();
|
| 186 |
if (audioSource != null)
|
|
@@ -202,7 +199,7 @@ public class RunJets : MonoBehaviour
|
|
| 202 |
}
|
| 203 |
}
|
| 204 |
|
| 205 |
-
|
| 206 |
{
|
| 207 |
worker?.Dispose();
|
| 208 |
}
|
|
|
|
| 1 |
+
using System;
|
| 2 |
using System.Collections.Generic;
|
| 3 |
+
using Unity.InferenceEngine;
|
| 4 |
using UnityEngine;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
public class RunJets : MonoBehaviour
|
| 7 |
{
|
| 8 |
+
public ModelAsset modelAsset;
|
| 9 |
+
public TextAsset phonemeAsset;
|
| 10 |
public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
|
| 11 |
//string inputText = "The quick brown fox jumped over the lazy dog";
|
| 12 |
//string inputText = "There are many uses of the things she uses!";
|
|
|
|
| 14 |
//Set to true if the phoneme_dict.txt file has been assigned to the phonemeAsset field
|
| 15 |
bool hasPhenomeDictionary = true;
|
| 16 |
|
| 17 |
+
readonly string[] phonemes =
|
| 18 |
+
{
|
| 19 |
+
"<blank>", "<unk>", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
|
| 20 |
+
"IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", ",", "AA1", "B",
|
| 21 |
+
"HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
|
| 22 |
+
"ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
|
| 23 |
+
"AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
|
| 24 |
+
"AH2", "AY0", "IY2", "AW2", "AA0", "\"", "ER2", "UH2", "?", "OY2", "!", "AW0",
|
| 25 |
+
"UH0", "OY0", "..", "<sos/eos>"
|
| 26 |
+
};
|
| 27 |
|
| 28 |
readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');
|
| 29 |
|
| 30 |
//Can change pitch and speed with this for a slightly different voice:
|
| 31 |
const int samplerate = 22050;
|
| 32 |
|
| 33 |
+
Dictionary<string, string> dict = new();
|
| 34 |
|
| 35 |
Worker worker;
|
| 36 |
|
|
|
|
| 45 |
|
| 46 |
void LoadModel()
|
| 47 |
{
|
| 48 |
+
var model = ModelLoader.Load(modelAsset);
|
| 49 |
worker = new Worker(model, BackendType.GPUCompute);
|
| 50 |
}
|
| 51 |
|
|
|
|
| 70 |
void ReadDictionary()
|
| 71 |
{
|
| 72 |
if (!hasPhenomeDictionary) return;
|
| 73 |
+
string[] words = phonemeAsset.text.Split("\r\n");
|
| 74 |
for (int i = 0; i < words.Length; i++)
|
| 75 |
{
|
| 76 |
string s = words[i];
|
| 77 |
+
if (string.IsNullOrWhiteSpace(s))
|
| 78 |
+
continue;
|
| 79 |
string[] parts = s.Split();
|
| 80 |
if (parts[0] != ";;;") //ignore comments in file
|
| 81 |
{
|
|
|
|
| 89 |
dict.Add("!", "!");
|
| 90 |
dict.Add("?", "?");
|
| 91 |
dict.Add("\"", "\"");
|
| 92 |
+
// You could add extra word pronunciations here e.g.
|
| 93 |
//dict.Add("somenewword","[phonemes]");
|
| 94 |
}
|
| 95 |
|
|
|
|
| 122 |
}
|
| 123 |
|
| 124 |
//Decode the word into phonemes by looking for the longest word in the dictionary that matches
|
| 125 |
+
//the first part of the word and so on.
|
| 126 |
//This works fairly well but could be improved. The original paper had a model that
|
| 127 |
//dealt with guessing the phonemes of words
|
| 128 |
public string DecodeWord(string word)
|
| 129 |
{
|
| 130 |
string output = "";
|
| 131 |
int start = 0;
|
| 132 |
+
for (int end = word.Length; end >= 0 && start < word.Length; end--)
|
| 133 |
+
{
|
| 134 |
if (end <= start) //no matches
|
| 135 |
{
|
| 136 |
start++;
|
|
|
|
| 147 |
}
|
| 148 |
return output;
|
| 149 |
}
|
| 150 |
+
|
| 151 |
int[] GetTokens(string ptext)
|
| 152 |
{
|
| 153 |
string[] p = ptext.Split();
|
| 154 |
var tokens = new int[p.Length];
|
| 155 |
for (int i = 0; i < tokens.Length; i++)
|
| 156 |
{
|
| 157 |
+
tokens[i] = Mathf.Max(0, Array.IndexOf(phonemes, p[i]));
|
| 158 |
}
|
| 159 |
return tokens;
|
| 160 |
}
|
| 161 |
|
| 162 |
public void DoInference(string ptext)
|
| 163 |
+
{
|
| 164 |
int[] tokens = GetTokens(ptext);
|
| 165 |
|
| 166 |
using var input = new Tensor<int>(new TensorShape(tokens.Length), tokens);
|
|
|
|
| 176 |
|
| 177 |
Speak();
|
| 178 |
}
|
| 179 |
+
|
| 180 |
+
void Speak()
|
| 181 |
{
|
| 182 |
AudioSource audioSource = GetComponent<AudioSource>();
|
| 183 |
if (audioSource != null)
|
|
|
|
| 199 |
}
|
| 200 |
}
|
| 201 |
|
| 202 |
+
void OnDestroy()
|
| 203 |
{
|
| 204 |
worker?.Dispose();
|
| 205 |
}
|
phoneme_dict.txt → data/phoneme_dict.txt
RENAMED
|
File without changes
|
info.json
CHANGED
|
@@ -3,13 +3,12 @@
|
|
| 3 |
"RunJets.cs"
|
| 4 |
],
|
| 5 |
"models": [
|
| 6 |
-
"jets-text-to-speech.onnx"
|
| 7 |
-
"jets-text-to-speech.sentis"
|
| 8 |
],
|
| 9 |
"data": [
|
| 10 |
-
"phoneme_dict.txt"
|
| 11 |
],
|
| 12 |
"version": [
|
| 13 |
-
"2.
|
| 14 |
]
|
| 15 |
}
|
|
|
|
| 3 |
"RunJets.cs"
|
| 4 |
],
|
| 5 |
"models": [
|
| 6 |
+
"models/jets-text-to-speech.onnx"
|
|
|
|
| 7 |
],
|
| 8 |
"data": [
|
| 9 |
+
"data/phoneme_dict.txt"
|
| 10 |
],
|
| 11 |
"version": [
|
| 12 |
+
"2.2.0"
|
| 13 |
]
|
| 14 |
}
|
jets-text-to-speech.sentis
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:405a4e8d24b07142ac6109b653750b5d97ec720bbea41c5e76838e5f30ec5c70
|
| 3 |
-
size 138331240
|
|
|
|
|
|
|
|
|
|
|
|
jets-text-to-speech.onnx → models/jets-text-to-speech.onnx
RENAMED
|
File without changes
|