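# Streamlit text machine translation app: choose a model and a language pair, then translate with open-source MT models.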
import streamlit as st
import polars as pl
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, logging, AutoModelForCausalLM
import torch
import os
import httpx
import languagecodes

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Language options and mappings
favourite_langs = {"Romanian": "ro", "German": "de", "English": "en", "-----": "-----"}
df = pl.read_parquet("isolanguages.parquet")
non_empty_isos = df.slice(1).filter(pl.col("ISO639-1") != "").rows()
all_langs = {iso[0]: (iso[1], iso[2], iso[3]) for iso in non_empty_isos}  # {'Romanian': ('ro', 'rum', 'ron')}
name_to_iso1 = {iso[0]: iso[1] for iso in non_empty_isos}  # {'Romanian': 'ro', 'German': 'de'}
# langs = ["German", "Romanian", "English", "French", "Spanish", "Italian"]
langs = list(favourite_langs.keys())
langs.extend(list(all_langs.keys()))  # Language options as list, favourite languages first
# all_langs = languagecodes.iso_languages_byname
# iso1_to_name = {codes[0]: lang for entry in all_langs for lang, codes in entry.items()}  # {'ro': 'Romanian', 'de': 'German'}
iso1_to_name = {iso[1]: iso[0] for iso in non_empty_isos}  # {'ro': 'Romanian', 'de': 'German'}
models = ["Helsinki-NLP", "QUICKMT", "Argos", "Lego-MT/Lego-MT", "HPLT", "HPLT-OPUS", "Google",
          "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul", "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_nld",
          "Helsinki-NLP/opus-mt-tc-bible-big-mul-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-mul",
          "Helsinki-NLP/opus-mt-tc-bible-big-roa-deu_eng_fra_por_spa", "Helsinki-NLP/opus-mt-tc-bible-big-deu_eng_fra_por_spa-roa",
          "Helsinki-NLP/opus-mt-tc-bible-big-roa-en",
          "facebook/nllb-200-distilled-600M", "facebook/nllb-200-distilled-1.3B", "facebook/nllb-200-1.3B", "facebook/nllb-200-3.3B",
          "facebook/mbart-large-50-many-to-many-mmt", "facebook/mbart-large-50-one-to-many-mmt", "facebook/mbart-large-50-many-to-one-mmt",
          "facebook/m2m100_418M", "facebook/m2m100_1.2B",
          "bigscience/mt0-small", "bigscience/mt0-base", "bigscience/mt0-large", "bigscience/mt0-xl",
          "bigscience/bloomz-560m", "bigscience/bloomz-1b1", "bigscience/bloomz-1b7", "bigscience/bloomz-3b",
          "t5-small", "t5-base", "t5-large",
          "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large", "google/flan-t5-xl",
          "google/madlad400-3b-mt", "jbochi/madlad400-3b-mt",
          "utter-project/EuroLLM-1.7B", "utter-project/EuroLLM-1.7B-Instruct",
          "Unbabel/Tower-Plus-2B", "Unbabel/TowerInstruct-7B-v0.2", "Unbabel/TowerInstruct-Mistral-7B-v0.2",
          "HuggingFaceTB/SmolLM3-3B", "winninghealth/WiNGPT-Babel-2",
          "tencent/Hunyuan-MT-7B",
          "openGPT-X/Teuken-7B-instruct-commercial-v0.4", "openGPT-X/Teuken-7B-instruct-v0.6",
          ]
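
# The Translators class bundles one backend per method: each method uses the model name,
# language codes and input text set in __init__ and returns the translated text
# (several backends also return a status message).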
class Translators:
    def __init__(self, model_name: str, sl: str, tl: str, input_text: str):
        self.model_name = model_name
        self.sl, self.tl = sl, tl
        self.input_text = input_text
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
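
    # Google translation via a web endpoint whose base URL is read from the GCLIENT environment variable.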
    def google(self):
        url = os.environ['GCLIENT'] + f'sl={self.sl}&tl={self.tl}&q={self.input_text}'
        response = httpx.get(url)
        return response.json()[0][0][0]
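
    # HPLT bilingual models (plain and OPUS-tuned variants); only the listed English-centric pairs are available.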
    def hplt(self, opus=False):
        # langs = ['ar', 'bs', 'ca', 'en', 'et', 'eu', 'fi', 'ga', 'gl', 'hi', 'hr', 'is', 'mt', 'nn', 'sq', 'sw', 'zh_hant']
        hplt_models = ['ar-en', 'bs-en', 'ca-en', 'en-ar', 'en-bs', 'en-ca', 'en-et', 'en-eu', 'en-fi',
                       'en-ga', 'en-gl', 'en-hi', 'en-hr', 'en-is', 'en-mt', 'en-nn', 'en-sq', 'en-sw',
                       'en-zh_hant', 'et-en', 'eu-en', 'fi-en', 'ga-en', 'gl-en', 'hi-en', 'hr-en',
                       'is-en', 'mt-en', 'nn-en', 'sq-en', 'sw-en', 'zh_hant-en']
        if opus:
            hplt_model = f'HPLT/translate-{self.sl}-{self.tl}-v1.0-hplt_opus'  # HPLT/translate-en-hr-v1.0-hplt_opus
        else:
            hplt_model = f'HPLT/translate-{self.sl}-{self.tl}-v1.0-hplt'  # HPLT/translate-en-hr-v1.0-hplt
        if f'{self.sl}-{self.tl}' in hplt_models:
            pipe = pipeline("translation", model=hplt_model, device=self.device)
            translation = pipe(self.input_text)
            translated_text = translation[0]['translation_text']
            message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {hplt_model}.'
        else:
            translated_text = f'HPLT model from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} not available!'
            message = f"Available models: {', '.join(hplt_models)}"
        return translated_text, message
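
    # quickmt helpers: download a model from the quickmt Hub organisation into /quickmt/models (if not cached) and run it.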
    @staticmethod
    def quickmttranslate(model_path, input_text):
        from quickmt import Translator
        # 'auto' auto-detects GPU, set to "cpu" to force CPU inference
        device = 'gpu' if torch.cuda.is_available() else 'cpu'
        translator = Translator(str(model_path), device=device)
        # translation = Translator(f"./quickmt-{self.sl}-{self.tl}/", device="auto", inter_threads=2)
        # set beam size to 1 for faster speed (but lower quality)
        translation = translator(input_text, beam_size=5, max_input_length=512, max_decoding_length=512)
        # print(model_path, input_text, translation)
        return translation
    @staticmethod
    def quickmtdownload(model_name):
        from quickmt.hub import hf_download
        from pathlib import Path
        model_path = Path("/quickmt/models") / model_name
        if not model_path.exists():
            hf_download(
                model_name=f"quickmt/{model_name}",
                output_dir=Path("/quickmt/models") / model_name,
            )
        return model_path
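
    # quickmt translation: use the direct model when the language pair exists, otherwise pivot through English.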
    def quickmt(self):
        model_name = f"quickmt-{self.sl}-{self.tl}"
        # from quickmt.hub import hf_list
        # quickmt_models = [i.split("/quickmt-")[1] for i in hf_list()]
        # quickmt_models.sort()
        # print(quickmt_models)
        quickmt_models = ['ar-en', 'bn-en', 'cs-en', 'da-en', 'de-en', 'el-en', 'en-ar', 'en-bn', 'en-cs', 'en-de', 'en-el', 'en-es',
                          'en-fa', 'en-fr', 'en-he', 'en-hi', 'en-hu', 'en-id', 'en-it', 'en-ja', 'en-ko', 'en-lv', 'en-pl', 'en-pt',
                          'en-ro', 'en-ru', 'en-th', 'en-tr', 'en-ur', 'en-vi', 'en-zh', 'es-en', 'fa-en', 'fr-en', 'he-en', 'hi-en',
                          'hu-en', 'id-en', 'it-en', 'ja-en', 'ko-en', 'lv-en', 'pl-en', 'pt-en', 'ro-en', 'ru-en', 'th-en', 'tr-en', 'ur-en', 'vi-en', 'zh-en']
        # available_languages = list(set([lang for model in quickmt_models for lang in model.split('-')]))
        # available_languages.sort()
        available_languages = ['ar', 'bn', 'cs', 'da', 'de', 'el', 'en', 'es', 'fa', 'fr', 'he', 'hi', 'hu',
                               'id', 'it', 'ja', 'ko', 'lv', 'pl', 'pt', 'ro', 'ru', 'th', 'tr', 'ur', 'vi', 'zh']
        # Direct translation model
        if f"{self.sl}-{self.tl}" in quickmt_models:
            model_path = Translators.quickmtdownload(model_name)
            translated_text = Translators.quickmttranslate(model_path, self.input_text)
            message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
        # Pivot language English
        elif self.sl in available_languages and self.tl in available_languages:
            model_name = f"quickmt-{self.sl}-en"
            model_path = Translators.quickmtdownload(model_name)
            entranslation = Translators.quickmttranslate(model_path, self.input_text)
            model_name = f"quickmt-en-{self.tl}"
            model_path = Translators.quickmtdownload(model_name)
            translated_text = Translators.quickmttranslate(model_path, entranslation)
            message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with Quickmt using pivot language English.'
        else:
            translated_text = f'No Quickmt model available for translation from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]}!'
            message = f"Available models: {', '.join(quickmt_models)}"
        return translated_text, message
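
    # Argos Translate: install the offline package for the requested pair on demand; StopIteration means no package exists.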
    @staticmethod
    def download_argos_model(from_code, to_code):
        import argostranslate.package
        print('Downloading model', from_code, to_code)
        # Download and install Argos Translate package
        argostranslate.package.update_package_index()
        available_packages = argostranslate.package.get_available_packages()
        package_to_install = next(
            filter(lambda x: x.from_code == from_code and x.to_code == to_code, available_packages)
        )
        argostranslate.package.install_from_path(package_to_install.download())
    def argos(self):
        import argostranslate.translate, argostranslate.package
        try:
            Translators.download_argos_model(self.sl, self.tl)  # Download model
            translated_text = argostranslate.translate.translate(self.input_text, self.sl, self.tl)  # Translate
        except StopIteration:
            # packages_info = ', '.join(f"{pkg.get_description()}->{str(pkg.links)} {str(pkg.source_languages)}" for pkg in argostranslate.package.get_available_packages())
            packages_info = ', '.join(f"{pkg.from_name} ({pkg.from_code}) -> {pkg.to_name} ({pkg.to_code})" for pkg in argostranslate.package.get_available_packages())
            translated_text = f"No Argos model for {self.sl} to {self.tl}. Try another model or language combination from the available Argos models: {packages_info}."
        except Exception as error:
            translated_text = str(error)
        return translated_text
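
    # Hunyuan-MT-7B: chat-style causal LM with a fixed translation instruction; only the newly generated tokens are decoded.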
    def hunyuan(self):
        # ZH_CODES = {"Chinese": "zh", "Traditional Chinese": "zh-Hant", "Cantonese": "yue"}
        # if self.sl in ZH_CODES.keys() or self.tl in ZH_CODES.keys():
        #     prompt = f"把下面的文本翻译成{self.tl},不要额外解释。\n\n{self.input_text}"
        # else:
        prompt = f"Translate the following segment into {self.tl}, without additional explanation.\n\n{self.input_text}."
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto", dtype=torch.bfloat16)
        systemprompt = {"role": "system", "content": "You are a professional translator, translating in a formal tone and providing only translation, no other comments or explanations"}
        messages = [systemprompt, {"role": "user", "content": prompt}]
        # Tokenize the conversation
        tokenized_chat = tokenizer.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        )
        # Generate response
        temperature = 0.7
        with torch.no_grad():
            outputs = model.generate(
                tokenized_chat.to(model.device),
                max_new_tokens=512,
                temperature=temperature,
                top_k=20,
                top_p=0.95,
                repetition_penalty=1.05,
                do_sample=True if temperature > 0 else False,
                pad_token_id=tokenizer.eos_token_id
            )
        # outputs = model.generate(tokenized_chat.to(model.device), max_new_tokens=512, top_k=20, top_p=0.6, repetition_penalty=1.05, temperature=0.7)
        # output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        output_text = tokenizer.decode(outputs[0][tokenized_chat.shape[-1]:], skip_special_tokens=True)  # Decode only the new tokens
        return output_text
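
    # Generic translation pipeline for Hugging Face models that need no target-language prefix.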
    def simplepipe(self):
        try:
            pipe = pipeline("translation", model=self.model_name, device=self.device)
            translation = pipe(self.input_text)
            message = f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
            return translation[0]['translation_text'], message
        except Exception as error:
            return f"Error translating with model: {self.model_name}! Try another available language combination or model.", error
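
    # Multilingual OPUS-MT models expect a >>xxx<< target token (ISO 639-3) prepended to the source text.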
    def HelsinkiNLP_mulroa(self):
        try:
            pipe = pipeline("translation", model=self.model_name, device=self.device)
            iso1to3 = {iso[1]: iso[3] for iso in non_empty_isos}  # {'ro': 'ron'}
            iso3tl = iso1to3.get(self.tl)  # 'deu', 'ron', 'eng', 'fra'
            translation = pipe(f'>>{iso3tl}<< {self.input_text}')
            return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {self.model_name}.'
        except Exception as error:
            return f"Error translating with model: {self.model_name}! Try another available language combination.", error
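
    # Helsinki-NLP fallback chain: bilingual opus-mt model, then opus-tatoeba, then the big multilingual mul-mul model.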
    def HelsinkiNLP(self):
        try:  # Standard bilingual model
            model_name = f"Helsinki-NLP/opus-mt-{self.sl}-{self.tl}"
            pipe = pipeline("translation", model=model_name, device=self.device)
            translation = pipe(self.input_text)
            return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
        except EnvironmentError:
            try:  # Tatoeba models
                model_name = f"Helsinki-NLP/opus-tatoeba-{self.sl}-{self.tl}"
                pipe = pipeline("translation", model=model_name, device=self.device)
                translation = pipe(self.input_text)
                return translation[0]['translation_text'], f'Translated from {iso1_to_name[self.sl]} to {iso1_to_name[self.tl]} with {model_name}.'
            except EnvironmentError as error:
                self.model_name = "Helsinki-NLP/opus-mt-tc-bible-big-mul-mul"  # Last resort: try multi to multi
                return self.HelsinkiNLP_mulroa()
        except KeyError as error:
            return f"Error: Translation direction {self.sl} to {self.tl} is not supported by Helsinki Translation Models", error
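
    # LLaMAX3-8B: generic instruction-tuned LM prompted with a plain translation request.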
    def LLaMAX(self):
        pipe = pipeline("text-generation", model="LLaMAX/LLaMAX3-8B")
        messages = [
            {"role": "user", "content": f"Translate the following text from {self.sl} to {self.tl}: {self.input_text}"},
        ]
        return pipe(messages)[0]["generated_text"]
    def LegoMT(self):
        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)  # "Lego-MT/Lego-MT"
        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
        tokenizer.src_lang = self.sl
        encoded = tokenizer(self.input_text, return_tensors="pt")
        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
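
    # MADLAD-400: T5-based multilingual MT; the target language is selected with a <2xx> prefix token.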
    def madlad(self):
        model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
        tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        text = f"<2{self.tl}> {self.input_text}"
        # input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
        # outputs = model.generate(input_ids=input_ids)
        # return tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Use a pipeline as a high-level helper
        translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
        translated_text = translator(text, max_length=512)
        return translated_text[0]['translation_text']
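
    # SmolLM3: plain causal LM prompted with an explicit "Translation:" cue; everything after the cue is returned.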
    def smollm(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name)
        prompt = f"""Translate the following {self.sl} text to {self.tl}, generating only the translated text and maintaining the original meaning and tone:
{self.input_text}
Translation:"""
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(
            inputs.input_ids,
            max_length=len(inputs.input_ids[0]) + 150,
            temperature=0.3,
            do_sample=True
        )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(response)
        return response.split("Translation:")[-1].strip()
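
    # T5 and FLAN-T5 use the classic text-to-text prompt "translate <source> to <target>:" with full language names.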
    def flan(self):
        tokenizer = T5Tokenizer.from_pretrained(self.model_name, legacy=False)
        model = T5ForConditionalGeneration.from_pretrained(self.model_name)
        prompt = f"translate {self.sl} to {self.tl}: {self.input_text}"
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
        outputs = model.generate(input_ids)
        return tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
    def tfive(self):
        tokenizer = T5Tokenizer.from_pretrained(self.model_name)
        model = T5ForConditionalGeneration.from_pretrained(self.model_name, device_map="auto")
        prompt = f"translate {self.sl} to {self.tl}: {self.input_text}"
        input_ids = tokenizer.encode(prompt, return_tensors="pt")
        output_ids = model.generate(input_ids, max_length=512)  # Perform translation
        translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()  # Decode the translated text
        return translated_text
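
    # mBART-50 variants: many-to-many needs source and target codes, one-to-many translates from English, many-to-one into English.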
    def mbart_many_to_many(self):
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name)
        # translate source to target
        tokenizer.src_lang = languagecodes.mbart_large_languages[self.sl]
        encoded = tokenizer(self.input_text, return_tensors="pt")
        generated_tokens = model.generate(
            **encoded,
            forced_bos_token_id=tokenizer.lang_code_to_id[languagecodes.mbart_large_languages[self.tl]]
        )
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    def mbart_one_to_many(self):
        # translate from English
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name, src_lang="en_XX")
        model_inputs = tokenizer(self.input_text, return_tensors="pt")
        langid = languagecodes.mbart_large_languages[self.tl]
        generated_tokens = model.generate(
            **model_inputs,
            forced_bos_token_id=tokenizer.lang_code_to_id[langid]
        )
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    def mbart_many_to_one(self):
        # translate to English
        from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
        model = MBartForConditionalGeneration.from_pretrained(self.model_name)
        tokenizer = MBart50TokenizerFast.from_pretrained(self.model_name)
        tokenizer.src_lang = languagecodes.mbart_large_languages[self.sl]
        encoded = tokenizer(self.input_text, return_tensors="pt")
        generated_tokens = model.generate(**encoded)
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
    def mtom(self):
        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
        model = M2M100ForConditionalGeneration.from_pretrained(self.model_name)
        tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
        tokenizer.src_lang = self.sl
        encoded = tokenizer(self.input_text, return_tensors="pt")
        generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(self.tl))
        return tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]
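
    # mt0 (seq2seq) and bloomz (causal LM) both follow a plain "Translate to <language>:" instruction with full language names.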
    def bigscience(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        self.input_text = self.input_text if self.input_text.endswith('.') else f'{self.input_text}.'
        inputs = tokenizer.encode(f"Translate to {self.tl}: {self.input_text}", return_tensors="pt")
        outputs = model.generate(inputs)
        translation = tokenizer.decode(outputs[0])
        translation = translation.replace('<pad> ', '').replace('</s>', '')
        return translation
    def bloomz(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name)
        self.input_text = self.input_text if self.input_text.endswith('.') else f'{self.input_text}.'
        # inputs = tokenizer.encode(f"Translate from {self.sl} to {self.tl}: {self.input_text} Translation:", return_tensors="pt")
        inputs = tokenizer.encode(f"Translate to {self.tl}: {self.input_text}", return_tensors="pt")
        outputs = model.generate(inputs)
        translation = tokenizer.decode(outputs[0])
        translation = translation.replace('<pad> ', '').replace('</s>', '')
        translation = translation.split('Translation:')[-1].strip() if 'Translation:' in translation else translation.strip()
        return translation
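
    # NLLB-200 expects FLORES-200 codes (e.g. ron_Latn); translate_text maps them via languagecodes.nllb_language_codes.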
    def nllb(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name, src_lang=self.sl)
        # model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name, device_map="auto", torch_dtype=torch.bfloat16)
        model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)
        translator = pipeline('translation', model=model, tokenizer=tokenizer, src_lang=self.sl, tgt_lang=self.tl)
        translated_text = translator(self.input_text, max_length=512)
        return translated_text[0]['translation_text']
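
    # WiNGPT-Babel-2: chat model; the target language goes into the system prompt and only the generated continuation is decoded.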
    def wingpt(self):
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            dtype="auto",
            device_map="auto"
        )
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # input_json = '{"input_text": self.input_text}'
        messages = [
            {"role": "system", "content": f"Translate this to {self.tl} language"},
            {"role": "user", "content": self.input_text}
        ]
        text = tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True
        )
        model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
        generated_ids = model.generate(
            **model_inputs,
            max_new_tokens=512,
            temperature=0.1
        )
        generated_ids = [
            output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
        ]
        print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True))
        output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
        result = output.split('\n')[-1].strip() if '\n' in output else output.strip()
        return result
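
    # EuroLLM: the base model continues a "<src>: text <tgt>:" prompt; the Instruct variant uses its ChatML-style template.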
    def eurollm(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name)
        prompt = f"{self.sl}: {self.input_text} {self.tl}:"
        inputs = tokenizer(prompt, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=512)
        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(output)
        # result = output.rsplit(f'{self.tl}:')[-1].strip() if f'{self.tl}:' in output else output.strip()
        result = output.rsplit(f'{self.tl}:')[-1].strip() if '\n' in output or f'{self.tl}:' in output else output.strip()
        return result
    def eurollm_instruct(self):
        tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        model = AutoModelForCausalLM.from_pretrained(self.model_name)
        text = f'<|im_start|>system\n<|im_end|>\n<|im_start|>user\nTranslate the following {self.sl} source text to {self.tl}:\n{self.sl}: {self.input_text} \n{self.tl}: <|im_end|>\n<|im_start|>assistant\n'
        inputs = tokenizer(text, return_tensors="pt")
        outputs = model.generate(**inputs, max_new_tokens=512)
        output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        if f'{self.tl}:' in output:
            output = output.rsplit(f'{self.tl}:')[-1].strip().replace('assistant\n', '').strip()
        return output
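
    # Teuken-7B: remote-code model with its own chat template (chat_template="EN"); sampled generation, full sequence decoded.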
    def teuken(self):
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16,
        )
        model = model.to(device).eval()
        tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            use_fast=False,
            trust_remote_code=True,
        )
        translation_prompt = f"Translate the following text from {self.sl} into {self.tl}: {self.input_text}"
        messages = [{"role": "User", "content": translation_prompt}]
        prompt_ids = tokenizer.apply_chat_template(messages, chat_template="EN", tokenize=True, add_generation_prompt=False, return_tensors="pt")
        prediction = model.generate(
            prompt_ids.to(model.device),
            max_length=512,
            do_sample=True,
            top_k=50,
            top_p=0.95,
            temperature=0.7,
            num_return_sequences=1,
        )
        translation = tokenizer.decode(prediction[0].tolist())
        return translation
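
    # Unbabel Tower models: chat-prompted text-generation pipeline; max_new_tokens is scaled to ~1.75x the input token count
    # and chat/turn markers are stripped from the generated text.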
    def unbabel(self):
        pipe = pipeline("text-generation", model=self.model_name, torch_dtype=torch.bfloat16, device_map="auto")
        messages = [{"role": "user",
                     "content": f"Translate the following text from {self.sl} into {self.tl}.\n{self.sl}: {self.input_text}.\n{self.tl}:"}]
        prompt = pipe.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
        tokenized_input = pipe.tokenizer(self.input_text, return_tensors="pt")
        num_input_tokens = len(tokenized_input["input_ids"][0])
        max_new_tokens = round(num_input_tokens + 0.75 * num_input_tokens)
        outputs = pipe(prompt, max_new_tokens=max_new_tokens, do_sample=False)
        translated_text = outputs[0]["generated_text"]
        print(f"Input chars: {len(self.input_text)}", f"Input tokens: {num_input_tokens}", f"max_new_tokens: {max_new_tokens}",
              "Chars to tokens ratio:", round(len(self.input_text) / num_input_tokens, 2), f"Raw translation: {translated_text}")
        markers = ["<end_of_turn>", "<|im_end|>", "<|im_start|>assistant"]  # , "\n"
        for marker in markers:
            if marker in translated_text:
                translated_text = translated_text.split(marker)[1].strip()
        translated_text = translated_text.replace('Answer:', '', 1).strip() if translated_text.startswith('Answer:') else translated_text
        translated_text = translated_text.split("Translated text:")[0].strip() if "Translated text:" in translated_text else translated_text
        split_translated_text = translated_text.split('\n', translated_text.count('\n'))
        translated_text = '\n'.join(split_translated_text[:self.input_text.count('\n') + 1])
        return translated_text
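
    # Bergamot: local bergamot-translator engine; expects a bergamot.config.yml inside ./<model_name>/ (not listed in the model selector above).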
    def bergamot(self):
        try:
            import bergamot
            config = bergamot.ServiceConfig(numWorkers=4)
            service = bergamot.Service(config)
            model = service.modelFromConfigPath(f"./{self.model_name}/bergamot.config.yml")
            options = bergamot.ResponseOptions(alignment=False, qualityScores=False, HTML=False)
            rawresponse = service.translate(model, bergamot.VectorString(self.input_text), options)
            translated_text: str = next(iter(rawresponse)).target.text
            message_text = f"Translated from {self.sl} to {self.tl} with Bergamot {self.model_name}."
        except Exception as error:
            translated_text, message_text = f"Bergamot error: {error}", str(error)
        return translated_text, message_text
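
# Dispatcher: map the selected model name to the matching Translators method.
# Some backends take ISO 639-1 codes, others full language names (NLLB gets FLORES-200 codes).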
def translate_text(model_name: str, s_language: str, t_language: str, input_text: str) -> tuple[str, str]:
    """
    Translates the input text from the source language to the target language using the specified model.

    Parameters:
        model_name (str): The selected translation model name
        s_language (str): The source language of the input text
        t_language (str): The target language into which the input text is translated
        input_text (str): The source text to be translated

    Returns:
        tuple:
            translated_text (str): The input text translated into the selected target language
            message_text (str): A descriptive message summarizing the translation process. Example: "Translated from English to German with Helsinki-NLP."

    Example:
        >>> translate_text("Helsinki-NLP", "English", "German", "Hello world")
        ("Hallo Welt", "Translated from English to German with Helsinki-NLP.")
    """
    sl = all_langs[s_language][0]
    tl = all_langs[t_language][0]
    if input_text == '':
        translated_text = 'No input text entered!'
        message_text = 'Please enter a text to translate!'
        return translated_text, message_text
    if sl == tl:
        translated_text = f'Source language {s_language} identical to target language {t_language}!'
        message_text = 'Please choose different target and source languages!'
        return translated_text, message_text
    message_text = f'Translated from {s_language} to {t_language} with {model_name}'
    translated_text = None
    try:
        if model_name == "Helsinki-NLP/opus-mt-tc-bible-big-roa-en":
            translated_text, message_text = Translators(model_name, sl, tl, input_text).simplepipe()
        elif "-mul" in model_name.lower() or "mul-" in model_name.lower() or "-roa" in model_name.lower():
            translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP_mulroa()
        elif model_name == "Helsinki-NLP":
            translated_text, message_text = Translators(model_name, sl, tl, input_text).HelsinkiNLP()
        elif model_name == "QUICKMT":
            translated_text, message_text = Translators(model_name, sl, tl, input_text).quickmt()
        elif "HPLT" in model_name:
            if model_name == "HPLT-OPUS":
                translated_text, message_text = Translators(model_name, sl, tl, input_text).hplt(opus=True)
            else:
                translated_text, message_text = Translators(model_name, sl, tl, input_text).hplt()
        elif model_name == 'Argos':
            translated_text = Translators(model_name, sl, tl, input_text).argos()
        elif model_name == 'Google':
            translated_text = Translators(model_name, sl, tl, input_text).google()
        elif "m2m" in model_name.lower():
            translated_text = Translators(model_name, sl, tl, input_text).mtom()
        elif "lego" in model_name.lower():
            translated_text = Translators(model_name, sl, tl, input_text).LegoMT()
        elif model_name.startswith('t5'):
            translated_text = Translators(model_name, s_language, t_language, input_text).tfive()
        elif 'flan' in model_name.lower():
            translated_text = Translators(model_name, s_language, t_language, input_text).flan()
        elif 'madlad' in model_name.lower():
            translated_text = Translators(model_name, sl, tl, input_text).madlad()
        elif 'mt0' in model_name.lower():
            translated_text = Translators(model_name, s_language, t_language, input_text).bigscience()
        elif 'bloomz' in model_name.lower():
            translated_text = Translators(model_name, s_language, t_language, input_text).bloomz()
        elif 'nllb' in model_name.lower():
            nnlbsl, nnlbtl = languagecodes.nllb_language_codes[s_language], languagecodes.nllb_language_codes[t_language]
            translated_text = Translators(model_name, nnlbsl, nnlbtl, input_text).nllb()
        elif model_name == "facebook/mbart-large-50-many-to-many-mmt":
            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_many()
        elif model_name == "facebook/mbart-large-50-one-to-many-mmt":
            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_one_to_many()
        elif model_name == "facebook/mbart-large-50-many-to-one-mmt":
            translated_text = Translators(model_name, s_language, t_language, input_text).mbart_many_to_one()
        elif 'teuken' in model_name.lower():
            translated_text = Translators(model_name, s_language, t_language, input_text).teuken()
        elif model_name == "utter-project/EuroLLM-1.7B-Instruct":
            translated_text = Translators(model_name, s_language, t_language, input_text).eurollm_instruct()
        elif model_name == "utter-project/EuroLLM-1.7B":
            translated_text = Translators(model_name, s_language, t_language, input_text).eurollm()
        elif 'Unbabel' in model_name:
            translated_text = Translators(model_name, s_language, t_language, input_text).unbabel()
        elif model_name == "HuggingFaceTB/SmolLM3-3B":
            translated_text = Translators(model_name, s_language, t_language, input_text).smollm()
        elif model_name == "winninghealth/WiNGPT-Babel-2":
            translated_text = Translators(model_name, s_language, t_language, input_text).wingpt()
        elif "LLaMAX" in model_name:
            translated_text = Translators(model_name, s_language, t_language, input_text).LLaMAX()
        elif model_name == "Bergamot":
            translated_text, message_text = Translators(model_name, s_language, t_language, input_text).bergamot()
        elif "Hunyuan" in model_name:
            translated_text = Translators(model_name, s_language, t_language, input_text).hunyuan()
    except Exception as error:
        translated_text = str(error)
    finally:
        print(input_text, translated_text, message_text)
    return translated_text, message_text
# App layout
st.header("Text Machine Translation", divider="gray", help="Text Machine Translation Streamlit App with Open Source Models")
input_text = st.text_area("Enter text to translate:", placeholder="Enter text to translate, maximum 512 characters!", max_chars=512)

# Initialize session state if not already set
if "sselected_language" not in st.session_state:
    st.session_state["sselected_language"] = langs[0]
if "tselected_language" not in st.session_state:
    st.session_state["tselected_language"] = langs[1]
if "model_name" not in st.session_state:
    st.session_state["model_name"] = models[1]

# Model selection FIRST
model_name = st.selectbox("Select a model:", models,
                          index=models.index(st.session_state["model_name"]))

# Create columns for language selection
scol, swapcol, tcol = st.columns([3, 1, 3])
with scol:
    sselected_language = st.selectbox("Source language:", langs,
                                      index=langs.index(st.session_state["sselected_language"]))
with swapcol:
    if st.button("🔄 Swap"):
        st.session_state["model_name"] = model_name  # Preserve model
        st.session_state["sselected_language"], st.session_state["tselected_language"] = \
            st.session_state["tselected_language"], st.session_state["sselected_language"]
        st.rerun()
with tcol:
    tselected_language = st.selectbox("Target language:", langs,
                                      index=langs.index(st.session_state["tselected_language"]))

# Language codes
sl = name_to_iso1[st.session_state["sselected_language"]]
tl = name_to_iso1[st.session_state["tselected_language"]]

# Store selections
st.session_state["sselected_language"] = sselected_language
st.session_state["tselected_language"] = tselected_language
st.session_state["model_name"] = model_name

st.write(f'Selected language combination: {sselected_language} - {tselected_language}. Selected model: {model_name}')
with st.container(border=None, width="stretch", height="content", horizontal=False, horizontal_alignment="center", vertical_alignment="center", gap="small"):
    submit_button = st.button("Translate")

# Show text area with placeholder also before translating
# translated_textarea = st.empty()
# message_textarea = st.empty()
# translated_textarea.text_area(":green[Translation:]", placeholder="Translation area", value='')
# message_textarea.text_input(":blue[Messages:]", placeholder="Messages area", value='')

if submit_button:  # Handle the submit button click
    with st.spinner("Translating...", show_time=True):
        translated_text, message = translate_text(model_name, sselected_language, tselected_language, input_text)
        print(f"Translated from {sselected_language} to {tselected_language} using {model_name}.", input_text, translated_text)
    # Display the translated text
    # translated_textarea.text_area(":green[Translation:]", value=translated_text)
    # message_textarea.text_input(":blue[Message:]", value=message)
    st.text_area(":green[Translation:]", value=translated_text)
    # st.success(message, icon=":material/check:"), st.info(message, icon="ℹ️"), st.warning(message, icon=":material/warning:"), st.error(message, icon=":material/error:"), st.exception
    st.info(message, icon=":material/info:")
    # st.text_input(":blue[Messages:]", value=message)
    # st.rerun()