Onsei-Tukuri / app.py
Sakalti's picture
Update app.py
bc7b359 verified
import gradio as gr
from gtts import gTTS
from pydub import AudioSegment
import tempfile
import os
import numpy as np
# Voice template presets: display name -> base speaking rate and volume.
# The table below is expanded into the {"rate": ..., "volume": ...} mapping
# that the rest of the app looks up by template name.
TEMPLATES = {
    name: {"rate": rate, "volume": volume}
    for name, rate, volume in (
        ("パラオ高め(ポーランドボール風)", 180, 1.0),
        ("低めのナレーター", 120, 0.8),
        ("普通の話し方", 150, 1.0),
        ("元気な女の子", 180, 1.2),
        ("落ち着いた男性", 130, 0.9),
        ("ロボット風(機械的)", 140, 1.0),
        ("さっぱりした女性", 160, 1.1),
        ("しっとりした声", 140, 0.9),
        ("おじさん風", 60, 0.75),
        ("怒った声", 45, 0.9),
    )
}

# Effect names offered in the UI; the first entry means "no effect".
EFFECTS = [
    "なし",
    "ふわふわ化",
    "かちかち化",
    "減衰",
    "リバーブ",
    "音揺れ",
]
def generate_tts(text, template_name, pitch_factor=1.0, speed_factor=1.0, effect_type="なし", effect_strength=1.0):
    """Synthesize Japanese speech for ``text`` and return the path of the processed MP3.

    Pipeline: gTTS synthesis -> template volume -> pitch change -> speed change
    -> optional effect -> export to a new temporary MP3 file.

    Args:
        text: Text to read aloud. Must contain at least one non-blank character.
        template_name: Key into ``TEMPLATES``; unknown names fall back to
            rate 150 / volume 1.0.
        pitch_factor: Multiplier passed to ``change_pitch``.
        speed_factor: Multiplier applied to the template's rate.
        effect_type: One of the names in ``EFFECTS``.
        effect_strength: Strength passed to ``apply_effect``.

    Returns:
        Filesystem path of the exported MP3. The caller (Gradio) owns this file.

    Raises:
        gr.Error: If ``text`` is empty or whitespace-only.
    """
    import math  # local import: only needed for the linear-to-dB volume conversion

    if not text or not text.strip():
        # gTTS rejects empty input with a bare assertion; surface a readable message instead.
        raise gr.Error("読み上げるテキストを入力してください。")

    # Resolve the template; its rate is interpreted as a percentage of normal speed.
    template = TEMPLATES.get(template_name, {"rate": 150, "volume": 1.0})
    rate = template["rate"] * speed_factor
    volume = template["volume"]

    # Reserve a unique file name, then close the handle before gTTS writes to it
    # (an open handle would block the write on platforms with exclusive file locks).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as f:
        tts_path = f.name
    try:
        gTTS(text=text, lang='ja').save(tts_path)
        sound = AudioSegment.from_mp3(tts_path)
    finally:
        # The raw synthesis result is only an intermediate; remove it so repeated
        # requests do not pile up files in the temp directory. Best-effort only.
        try:
            os.remove(tts_path)
        except OSError:
            pass

    # Apply the template volume (a linear amplitude ratio) as a gain in dB.
    if volume > 0 and volume != 1.0:
        sound = sound.apply_gain(20 * math.log10(volume))

    sound = change_pitch(sound, pitch_factor)
    sound = change_speed(sound, rate / 100)  # rate is a percentage, so 100 means unchanged
    sound = apply_effect(sound, effect_type, effect_strength)

    # Derive the output name from the extension only, so a ".mp3" occurring
    # elsewhere in the temp path cannot be rewritten by accident.
    output_path = os.path.splitext(tts_path)[0] + "_modified.mp3"
    sound.export(output_path, format="mp3")
    return output_path
def change_pitch(sound, factor):
    """Return ``sound`` re-pitched by ``factor`` and resampled to 44100 Hz.

    The raw samples are kept as they are and only the declared frame rate is
    scaled by ``factor`` (truncated to an integer); the result is then
    converted to a 44100 Hz frame rate.
    """
    overrides = dict(frame_rate=int(sound.frame_rate * factor))
    return sound._spawn(sound.raw_data, overrides=overrides).set_frame_rate(44100)
def change_speed(sound, speed=1.0):
    """Return ``sound`` played back ``speed`` times faster, resampled to 44100 Hz.

    Works by re-declaring the frame rate of the unchanged raw samples as
    ``frame_rate * speed`` (truncated to an integer) and then converting the
    result to a 44100 Hz frame rate.
    """
    scaled_rate = int(sound.frame_rate * speed)
    return sound._spawn(
        sound.raw_data,
        overrides={"frame_rate": scaled_rate},
    ).set_frame_rate(44100)
def apply_effect(sound, effect_type, effect_strength):
    """Apply the named effect to ``sound`` and return the resulting segment.

    Args:
        sound: Audio segment to process.
        effect_type: Effect name as listed in ``EFFECTS``. Unknown names
            (including "なし") return ``sound`` unchanged.
        effect_strength: Scales the effect: filter cutoff, fade length,
            attenuation, or wobble depth depending on the effect.

    Returns:
        The processed audio segment.
    """
    if effect_type == "ふわふわ化":
        # Soften: low-pass with a cutoff of 1 kHz per unit of strength.
        return sound.low_pass_filter(1000 * effect_strength)

    if effect_type == "かちかち化":
        # Harden: high-pass with a cutoff of 3 kHz per unit of strength.
        return sound.high_pass_filter(3000 * effect_strength)

    if effect_type == "減衰":
        # Fade lengths must be whole milliseconds and cannot exceed the clip;
        # a strength above 1.0 therefore fades the entire clip.
        fade_ms = min(len(sound), int(len(sound) * effect_strength))
        return sound.fade_out(fade_ms) if fade_ms > 0 else sound

    if effect_type == "リバーブ":
        # Build a faded copy of the clip and append it as a tail, then attenuate
        # the whole result by 10 dB per unit of strength.
        # The fade length is cast to int (float durations break the fade loop)
        # and clamped to the clip length.
        fade_ms = min(len(sound), int(200 * effect_strength))
        tail = sound.reverse()
        if fade_ms > 0:
            tail = tail.fade_in(fade_ms).fade_out(fade_ms)
        return (sound + tail.reverse()) - (10 * effect_strength)

    if effect_type == "音揺れ":
        return wobble(sound, effect_strength)

    return sound
def wobble(sound, strength):
    """Return ``sound`` with a randomly varying pitch.

    The audio is cut into 100 ms pieces. Each piece is passed through
    ``change_pitch`` with its own factor drawn uniformly from
    ``1 ± 0.05 * strength``, and the pieces are joined back together in order.
    """
    chunk_ms = 100
    lowest = 1 - 0.05 * strength
    highest = 1 + 0.05 * strength
    pieces = (
        change_pitch(sound[start:start + chunk_ms], np.random.uniform(lowest, highest))
        for start in range(0, len(sound), chunk_ms)
    )
    # Start from an empty segment so an empty input yields an empty result.
    return sum(pieces, AudioSegment.empty())
# Gradio UI: one text box, a template picker, pitch/speed sliders, an effect
# picker with a strength slider, and a button that runs generate_tts and
# plays the returned file.
with gr.Blocks() as app:
    gr.Markdown("# オリジナル声読み上げ機")
    # Text to be read aloud.
    with gr.Row():
        text_input = gr.Textbox(label="読み上げるテキスト", lines=2, placeholder="ここに入力...")
    # Voice template; the choices are the keys of TEMPLATES.
    with gr.Row():
        template_dropdown = gr.Dropdown(choices=list(TEMPLATES.keys()), value="パラオ高め(ポーランドボール風)", label="テンプレートを選ぶ")
    # Pitch and speed multipliers, both ranging from 0.1 to 5.0.
    with gr.Row():
        pitch_slider = gr.Slider(0.1, 5.0, value=1.0, step=0.05, label="ピッチ倍率(高く・低く)")
        speed_slider = gr.Slider(0.1, 5.0, value=1.0, step=0.05, label="速度倍率(速く・遅く)")
    # Effect selection and its strength (0.1 to 10.0).
    with gr.Row():
        effect_dropdown = gr.Dropdown(choices=EFFECTS, value="なし", label="エフェクトを選ぶ")
        effect_strength_slider = gr.Slider(0.1, 10.0, value=1.0, step=0.05, label="エフェクト強さ")
    with gr.Row():
        submit_btn = gr.Button("生成する")
    # NOTE(review): placed below the button row; confirm the intended nesting.
    audio_output = gr.Audio(label="出力音声", type="filepath")
    # The inputs are passed positionally, in generate_tts's parameter order.
    submit_btn.click(
        fn=generate_tts,
        inputs=[text_input, template_dropdown, pitch_slider, speed_slider, effect_dropdown, effect_strength_slider],
        outputs=audio_output
    )
app.launch()