# Hugging Face Space (status when captured: Sleeping)
| from typing import Optional, List, Set, Union, Tuple | |
| from huggingface_hub import hf_hub_download | |
| import gradio as gr | |
| import fasttext | |
# Fetch the NB-Nordic-LID fastText weights from the Hugging Face Hub and load
# them once at import time; every request reuses this single model instance.
model = fasttext.load_model(
    hf_hub_download(repo_id="NbAiLab/nb-nordic-lid", filename="model.bin")
)

# Language codes the model can emit, taken as the last three characters of each
# label (presumably labels look like "__label__nob" -- TODO confirm).
model_labels = {label[-3:] for label in model.get_labels()}
# Maps the three-letter language codes used by the model to the display names
# shown in the UI. "und" is the placeholder used when nothing is detected.
language_dict = {
    'dan': 'Danish',
    'eng': 'English',
    'fao': 'Faroese',
    'fin': 'Finnish',
    'isl': 'Icelandic',
    'nno': 'Norwegian Nynorsk',
    'nob': 'Norwegian Bokmål',
    'sma': 'Southern Sami',
    'sme': 'Northern Sami',
    'smj': 'Lule Sami',
    'smn': 'Inari Sami',
    'sms': 'Skolt Sami',
    'swe': 'Swedish',
    'und': 'Undetermined',
}
def detect_lang(
    text: str,
    langs: Optional[Union[List, Set]] = None,
    threshold: float = -1.0,
    return_proba: bool = False
) -> Union[str, List[Tuple[str, float]]]:
    """
    Detect the language of a text with the module-level fastText model.

    Texts with fewer than four whitespace-separated words are not sent to the
    model at all and are reported as undetermined ("und").

    Args:
    - text (str): The text to detect the language of. Newlines are replaced
      with spaces before prediction, so multi-line text is accepted.
    - langs (List or Set, optional): Language codes to restrict the result to.
      When omitted or empty, all of the model's labels are allowed.
    - threshold (float, optional): The minimum probability passed to the
      model's `predict` call. Defaults to `-1.0`.
    - return_proba (bool, optional): Whether to return every matching language
      with its probability instead of only the first code. Defaults to `False`.

    Returns:
    str or List[Tuple[str, float]]: With `return_proba=False`, the language
    code of the first matching prediction, or "und" when there is none. With
    `return_proba=True`, a list of `(code, probability)` tuples in the order
    the model returned them, or `[("und", 1.0)]` when there is none.
    """
    undetermined = [("und", 1.0)] if return_proba else "und"

    # Too little text to classify; skip the model entirely.
    if len(text.split()) < 4:
        return undetermined

    allowed = set(langs) if langs else model_labels

    # fastText predicts one line at a time and rejects input containing "\n",
    # so flatten the text before handing it over.
    labels, probabilities = model.predict(
        text.replace("\n", " "), threshold=threshold, k=-1
    )
    predictions = [
        # Keep only the trailing three-letter code; cap probabilities at 1.0
        # (presumably to hide float rounding slightly above 1 -- TODO confirm).
        (label[-3:], min(probability, 1.0))
        for label, probability in zip(labels, probabilities)
        if label[-3:] in allowed
    ]
    if not predictions:
        return undetermined
    return predictions if return_proba else predictions[0][0]
def identify(text, threshold):
    """
    Gradio callback: map display language names to predicted probabilities.

    Args:
    - text: The text entered by the user; newlines are replaced with spaces
      before detection.
    - threshold: Probability threshold as a percentage (0-100); it is divided
      by 100 before being passed to `detect_lang`.

    Returns:
    dict: Display name -> probability for every language `detect_lang`
    reports. A code with no entry in `language_dict` is shown as the raw code
    instead of raising `KeyError`.
    """
    predictions = detect_lang(
        text.replace("\n", " "),
        threshold=threshold / 100.0,
        return_proba=True,
    )
    return {language_dict.get(lang, lang): proba for lang, proba in predictions}
# Gradio UI: a text box plus a percentage threshold slider feed `identify`,
# whose name -> probability mapping is rendered by the Label component.
iface = gr.Interface(
    title="NB Nordic Language Identification",
    description="""This demo uses the [NB-Nordic-LID](https://huggingface.co/NbAiLab/nb-nordic-lid) model to classify a given text into one of the 12 Nordic languages supported. <b>At least 3 or 4 words are needed to identify the language.</b>""",
    fn=identify,
    inputs=[
        gr.Textbox(label="Text to identify language for"),
        gr.Slider(0, 100, value=80, step=1, label="Probability threshold (%)"),
    ],
    outputs=gr.Label(label="Prediction"),
    # Each example is [text, threshold in percent].
    examples=[
        ["Jeg heter Svein Arne", 80],
        ["Dán lágan li biejadusá dárogiela, rijkalasj unneplågogielaj ja dáro siejvvemgiela birra", 80],
        ["Skriftspråket har derfor helst brukt ord som kan førast attende til gammalnorsk der slike har funnest i levande talemål.", 80],
        ["Ođđadárogiela vuođđun leat leamaš Norgga suopmanat, ja dasto das eai leat nu olu dánskkagiel sánit go girjedárogielas.", 80],
    ]
)

# Start the web server for the demo.
iface.launch()