import gradio as gr
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch
import re
import spaces  # required for ZeroGPU compatibility; provides the @spaces.GPU decorator

# Load the models and tokenizers for each translation direction
# Faroese to English
model_faero_eng = AutoModelForSeq2SeqLM.from_pretrained("barbaroo/nllb_200_600M_fo_en")
tokenizer_faero_eng = AutoTokenizer.from_pretrained("barbaroo/nllb_200_600M_fo_en", src_lang="fao_Latn")

# English to Faroese
model_eng_faero = AutoModelForSeq2SeqLM.from_pretrained("barbaroo/nllb_200_1.3B_en_fo")
tokenizer_eng_faero = AutoTokenizer.from_pretrained("barbaroo/nllb_200_1.3B_en_fo", src_lang="eng_Latn")

# Move the models to GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_faero_eng.to(device)
model_eng_faero.to(device)
# Split text into sentence-level chunks that fit within the token limit
def chunk_text_simple(text, max_length, tokenizer):
    # Split on sentence-ending punctuation (period, question mark, or exclamation mark),
    # avoiding splits inside abbreviations such as "e.g." or "Dr."
    sentences = re.split(r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|\!)\s', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        # Combine sentences until adding another would exceed max_length tokens
        if len(tokenizer.encode(current_chunk + " " + sentence)) <= max_length:
            current_chunk += " " + sentence
        else:
            if current_chunk:  # guard against appending an empty first chunk
                chunks.append(current_chunk.strip())
            current_chunk = sentence
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
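# Sketch of the expected chunking behaviour (the inputs and token counts below are
# illustrative, not measured):
# chunk_text_simple("One. Two? Three!", 256, tokenizer) returns the whole string as a
# single chunk when it fits the token budget, and splits at sentence boundaries otherwise.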
# The @spaces.GPU decorator requests a ZeroGPU device for the duration of the call
@spaces.GPU
def translate_long_text(text, direction, max_length=256):  # a modest max_length leaves room for the generated output
    # Select the model and tokenizer for the requested direction
    if direction == "Faroese to English":
        model = model_faero_eng
        tokenizer = tokenizer_faero_eng
    else:
        model = model_eng_faero
        tokenizer = tokenizer_eng_faero

    # Chunk the text so each piece stays within the token limit
    chunks = chunk_text_simple(text, max_length, tokenizer)
    translated_chunks = []
    for chunk in chunks:
        # Encode and translate each chunk; unpacking the encoding also passes the attention mask
        inputs = tokenizer(chunk, return_tensors="pt", max_length=max_length, truncation=True).to(device)
        outputs = model.generate(**inputs, num_beams=4, max_length=max_length, early_stopping=True)
        translated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        translated_chunks.append(translated_text)

    # Join the translated chunks back together
    return " ".join(translated_chunks)
# Gradio interface with a scrollable output box
iface = gr.Interface(
    fn=translate_long_text,
    inputs=[
        gr.Textbox(label="Input Text"),
        gr.Radio(["Faroese to English", "English to Faroese"], label="Translation Direction")
    ],
    outputs=gr.Textbox(label="Translated Text", lines=20),  # scrollable output box
    title="Faroese-English Translator",
    description="Translate between Faroese and English with support for longer texts."
)

# Launch the Gradio app
iface.launch()
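# Note: in a Hugging Face Space, app.py runs on startup and iface.launch() serves the
# interface; no additional server configuration should be needed.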