"""Print sample Nepali and Sinhala to English translations.

Loads the model and tokenizer stored under ``models/nllb-finetuned-nepali-en``
next to this script and prints the English translation of a fixed set of
sample sentences.
"""

import codecs
import os
import sys

import torch


def translate_text(text, model, tokenizer, src_lang, target_lang="eng_Latn"):
    """Translate a single text string.

    Args:
        text: The sentence to translate.
        model: Model object exposing ``generate(**inputs, ...)``. If it has a
            ``device`` attribute, the tokenized inputs are moved to it.
        tokenizer: Tokenizer matching ``model``. Its ``src_lang`` attribute is
            overwritten with ``src_lang`` as a side effect.
        src_lang: Language code of ``text`` (for example ``"nep_Npan"``).
        target_lang: Language code to translate into.

    Returns:
        The translated string. Failures are not raised: any exception is
        caught and reported as a string starting with
        ``"An error occurred during translation: "``.
    """
    try:
        tokenizer.src_lang = src_lang
        inputs = tokenizer(text, return_tensors="pt")

        # The caller may have moved the model to a GPU; generate() fails if
        # the input tensors are left on a different device than the weights.
        device = getattr(model, "device", None)
        if device is not None:
            inputs = inputs.to(device)

        # A direct token lookup avoids rebuilding the whole vocabulary dict
        # on every call. Unknown tokens map to the unk id, so reject that
        # explicitly instead of silently forcing a wrong first token.
        target_id = tokenizer.convert_tokens_to_ids(target_lang)
        if target_id is None or target_id == tokenizer.unk_token_id:
            raise ValueError(f"Unknown target language code: {target_lang!r}")

        generated_tokens = model.generate(
            **inputs,
            forced_bos_token_id=target_id,
            max_length=512,
        )
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    except Exception as e:
        return f"An error occurred during translation: {e}"


def _configure_utf8_stdout():
    """Make ``print`` emit UTF-8 so the non-Latin sample sentences can be shown."""
    reconfigure = getattr(sys.stdout, "reconfigure", None)
    if reconfigure is not None:
        # Keeps the existing text stream (and its buffering) intact.
        reconfigure(encoding="utf-8")
    elif hasattr(sys.stdout, "buffer"):
        # Fallback for stream objects without reconfigure().
        sys.stdout = codecs.getwriter("utf-8")(sys.stdout.buffer)


def _print_translations(sentences, lang_tag, src_lang, model, tokenizer):
    """Print each sentence in ``sentences`` followed by its English translation."""
    for sentence in sentences:
        print(f"\nOriginal ({lang_tag}): {sentence}")
        translated_text = translate_text(sentence, model, tokenizer, src_lang=src_lang)
        print(f"Translated (en): {translated_text}")


def main():
    """Load the model and run the sample Nepali and Sinhala translations."""
    # Imported here, where it is used, so that translate_text can be imported
    # without the transformers package being loaded.
    from transformers import M2M100ForConditionalGeneration, NllbTokenizerFast

    _configure_utf8_stdout()

    # --- Configuration ---
    script_dir = os.path.dirname(os.path.abspath(__file__))
    nepali_model_path = os.path.join(script_dir, "models", "nllb-finetuned-nepali-en")

    # --- Model Loading ---
    print("Loading Nepali model and tokenizer...")
    try:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        nepali_model = M2M100ForConditionalGeneration.from_pretrained(nepali_model_path).to(device)
        nepali_tokenizer = NllbTokenizerFast.from_pretrained(nepali_model_path)
        print("Nepali model and tokenizer loaded successfully.")
    except Exception as e:
        print(f"Error loading Nepali model or tokenizer: {e}")
        return

    # --- Nepali Translation ---
    nepali_sentences = [
        "जडान बिन्दु थप्नुहोस्",
        "स्टिकी नोट आयात पूरा भयो",
        "मोनोस्पेस १२",
        "पानी जेट पम्पमा दुईवटा भित्रिने र एउटा बाहिरिने पाइप हुन्छन् र एक भित्र अर्को सिद्धान्त अनुरूप दुईवटा पाइप हुन्छन् । पानीको प्रविष्टिमा एउटा पानी जेटले केही ठूलो पाइपमा पूरा चापले टुटीबाट बाहिर फाल्दछ । यस्तो तरिकाले पानी जेटले वायू वा तरललाई दोस्रो प्रविष्टिबाट टाढा पुर्याउदछ । ड्रिफ्टिङ तरलमा ऋणात्मक चापको कारणले यस्तो हुन्छ । त्यसैले यो हाइड्रोडायनमिक विरोधाभाषको एउटा अनुप्रयोग हो । यसले ड्रिफ्टिङ तरल नजिकका वस्तु टाढा फाल्नुको साटोमा सोस्ने कुरा बताउदछ ।",
        "वस्तुको परिवर्तन बचत गर्नुहोस् ।",
        "तिमीलाई कस्तो छ",
        "तिमी को हौ",
        "कति बज्यो",
    ]

    print("\n--- Nepali to English Translation Analysis ---")
    _print_translations(
        nepali_sentences, "ne", "nep_Npan", nepali_model, nepali_tokenizer
    )

    # --- Sinhala Translation ---
    # NOTE: No fine-tuned model for sinhala was found. Using the baseline model for now.
    print("\n\n--- Sinhala to English Translation Analysis ---")
    sinhala_sentences = [
        "ඩෝසන්මිස් දුරකථනයෙන් ඩෝසන්මිස් කවුද සර්",
        "කවුද ඩෝසන් නැතුව ඉන්නේ ඔව් සර්",
        "ඔබ එය උත්සාහ කරන්න සර්",
        "කොහොමද වැඩේ හරිද ඔව් සර්ට ස්තුතියි",
        "ඔව්, හරි, ස්තුතියි රත්තරං",
    ]
    _print_translations(
        sinhala_sentences, "si", "sin_Sinh", nepali_model, nepali_tokenizer
    )


if __name__ == "__main__":
    main()