update readme

Browse files
- README.md +6 -21
- assets/demo.wav +3 -0

README.md CHANGED
@@ -1013,11 +1013,12 @@ def get_video_chunk_content(video_path, flatten=True):
     return contents
 
 video_path="/path/to/video"
-sys_msg = model.get_sys_prompt(mode='omni', language='en')
 # if use voice clone prompt, please set ref_audio
-
-
-
+ref_audio_path = 'assets/demo.wav'
+ref_audio, _ = librosa.load(ref_audio_path, sr=16000, mono=True)
+sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode='omni', language='en')
+# or use default prompt
+# sys_msg = model.get_sys_prompt(mode='omni', language='en')
 
 contents = get_video_chunk_content(video_path)
 msg = {"role":"user", "content": contents}
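This hunk swaps the default omni system prompt for a voice-clone setup built on the newly added assets/demo.wav. As a minimal sketch of how the new lines slot into the surrounding README code (it assumes `model`, `tokenizer`, and `get_video_chunk_content` are already defined as elsewhere in the README, and it reuses only the `model.chat` arguments that appear in this diff):

```python
import librosa

# Load the reference voice added in this commit, at 16 kHz mono,
# matching the load in the hunk above.
ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True)
sys_msg = model.get_sys_prompt(ref_audio=ref_audio, mode='omni', language='en')

# Chunk the video into interleaved frame/audio content, as defined above.
contents = get_video_chunk_content("/path/to/video")
msgs = [sys_msg, {"role": "user", "content": contents}]

res = model.chat(
    msgs=msgs,
    tokenizer=tokenizer,
    sampling=True,
    max_new_tokens=128,
    use_tts_template=True,
    generate_audio=True,
    temperature=0.3,
)
```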
@@ -1122,7 +1123,7 @@ res = model.chat(
 <details> <summary>Click to view the Python code for enabling MiniCPM-o 2.6 to interact with you in a specified voice.</summary>
 
 ```python
-ref_audio, _ = librosa.load('
+ref_audio, _ = librosa.load('assets/demo.wav', sr=16000, mono=True) # load the reference audio
 
 # Audio RolePlay: # With this mode, model will role-play the character based on the audio prompt.
 sys_prompt = model.get_sys_prompt(ref_audio=ref_audio, mode='audio_roleplay', language='en')
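Before substituting your own recording as `ref_audio`, it can help to sanity-check the waveform. A small sketch (not part of the README) that mirrors the 16 kHz mono load used above:

```python
import librosa

ref_audio, sr = librosa.load('assets/demo.wav', sr=16000, mono=True)
assert ref_audio.ndim == 1  # mono waveform, shape (num_samples,)
print(f"reference audio: {len(ref_audio) / sr:.1f}s at {sr} Hz")
```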
@@ -1135,14 +1136,10 @@ user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, m
 ```python
 msgs = [sys_prompt, user_question]
 res = model.chat(
-    image=None,
     msgs=msgs,
-    context=None,
     tokenizer=tokenizer,
     sampling=True,
     max_new_tokens=128,
-    stream=False,
-    stream_input=True,
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
@@ -1154,14 +1151,10 @@ history = msgs.append({'role': 'assistant', 'content': res})
 user_question = {'role': 'user', 'content': [librosa.load('xxx.wav', sr=16000, mono=True)[0]]}
 msgs = history.append(user_question)
 res = model.chat(
-    image=None,
     msgs=msgs,
-    context=None,
     tokenizer=tokenizer,
     sampling=True,
     max_new_tokens=128,
-    stream=False,
-    stream_input=True,
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
@@ -1193,14 +1186,10 @@ audio_input, _ = librosa.load('xxx.wav', sr=16000, mono=True)
 msgs = [{'role': 'user', 'content': [task_prompt,audio_input]}]
 
 res = model.chat(
-    image=None,
     msgs=msgs,
-    context=None,
     tokenizer=tokenizer,
     sampling=True,
     max_new_tokens=128,
-    stream=False,
-    stream_input=True,
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
@@ -1230,14 +1219,10 @@ msgs = [{'role': 'user', 'content': [task_prompt]}] # you can try to use the sam
 
 msgs = [sys_prompt, user_question]
 res = model.chat(
-    image=None,
     msgs=msgs,
-    context=None,
     tokenizer=tokenizer,
     sampling=True,
     max_new_tokens=128,
-    stream=False,
-    stream_input=True,
     use_tts_template=True,
     generate_audio=True,
     temperature=0.3,
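All four `model.chat` hunks above make the same edit: the `image`, `context`, `stream`, and `stream_input` arguments are dropped from the call. A hedged sketch of the updated signature factored into a helper (`chat_with_tts` is a hypothetical name, not from the README, and it keeps only the arguments the README still passes):

```python
def chat_with_tts(model, tokenizer, msgs):
    # Call shape after this commit: no image/context placeholders
    # and no stream/stream_input flags.
    return model.chat(
        msgs=msgs,
        tokenizer=tokenizer,
        sampling=True,
        max_new_tokens=128,
        use_tts_template=True,
        generate_audio=True,
        temperature=0.3,
    )

res = chat_with_tts(model, tokenizer, [sys_prompt, user_question])
```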
assets/demo.wav ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d0b347d8ed0b2314c0d175fbdac79f6f3f91a6402bd7492ac5c860646a2ba309
+size 1454196
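The three lines above are a Git LFS pointer, not the audio itself; cloning without LFS support leaves this stub in place of the wav. A hedged sketch of fetching the real file via `huggingface_hub` (the repo id `openbmb/MiniCPM-o-2_6` is an assumption, since the diff does not name the repository):

```python
from huggingface_hub import hf_hub_download

# Downloads (or reuses from cache) the actual ~1.4 MB wav behind the LFS pointer.
wav_path = hf_hub_download(repo_id="openbmb/MiniCPM-o-2_6", filename="assets/demo.wav")
print(wav_path)
```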