import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import re
import spaces  # required for ZeroGPU compatibility; provides the @spaces.GPU decorator

# Load the models and tokenizers for each translation direction
# Faroese to English
model_faero_eng = AutoModelForSeq2SeqLM.from_pretrained("barbaroo/nllb_200_600M_fo_en")
tokenizer_faero_eng = AutoTokenizer.from_pretrained("barbaroo/nllb_200_600M_fo_en", src_lang="fao_Latn")

# English to Faroese
model_eng_faero = AutoModelForSeq2SeqLM.from_pretrained("barbaroo/nllb_200_1.3B_en_fo")
tokenizer_eng_faero = AutoTokenizer.from_pretrained("barbaroo/nllb_200_1.3B_en_fo", src_lang="eng_Latn")

# Move the models to GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_faero_eng.to(device)
model_eng_faero.to(device)
# Split text into sentence-level chunks that fit within the token limit
def chunk_text_simple(text, max_length, tokenizer):
    # Split on sentence-ending punctuation (period, question mark, or exclamation mark),
    # avoiding splits inside abbreviations such as "e.g." or "Dr."
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Combine sentences until adding another would exceed max_length tokens
        if len(tokenizer.encode(current_chunk + " " + sentence)) <= max_length:
            current_chunk += " " + sentence
        else:
            if current_chunk:  # guard against appending an empty first chunk
                chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
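# Sketch of the expected chunking behaviour (the inputs and token counts below are
# illustrative, not measured):
# chunk_text_simple("One. Two? Three!", 256, tokenizer) returns the whole string as a
# single chunk when it fits the token budget, and splits at sentence boundaries otherwise.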
# The @spaces.GPU decorator requests a ZeroGPU device for the duration of the call
@spaces.GPU
def translate_long_text(text, direction, max_length=256):  # a modest max_length leaves room for the generated output
    # Select the model and tokenizer for the requested direction
    if direction == "Faroese to English":
        model = model_faero_eng
        tokenizer = tokenizer_faero_eng
    else:
        model = model_eng_faero
        tokenizer = tokenizer_eng_faero

    # Chunk the text so each piece stays within the token limit
    chunks = chunk_text_simple(text, max_length, tokenizer)
    translated_chunks = []
    for chunk in chunks:
        # Encode and translate each chunk; unpacking the encoding also passes the attention mask
        inputs = tokenizer(chunk, return_tensors="pt", max_length=max_length, truncation=True).to(device)
        outputs = model.generate(**inputs, num_beams=4, max_length=max_length, early_stopping=True)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_chunks.append(translated_text)

    # Join the translated chunks back together
    return " ".join(translated_chunks)
# Gradio interface with a scrollable output box
iface = gr.Interface(
    fn=translate_long_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Radio(["Faroese to English", "English to Faroese"], label="Translation Direction")
    ],
    outputs=gr.Textbox(label="Translated Text", lines=20),  # scrollable output box
    title="Faroese-English Translator",
    description="Translate between Faroese and English with support for longer texts."
)

# Launch the Gradio app
iface.launch()
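# Note: in a Hugging Face Space, app.py runs on startup and iface.launch() serves the
# interface; no additional server configuration should be needed.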