Spaces:

NbAiLab
/

language-identification

Sleeping

App Files Files Community

language-identification / app.py

versae

Adding North Sámi example

a17d6f7 over 2 years ago

raw

history blame contribute delete

3.82 kB

	from typing import Optional, List, Set, Union, Tuple
	from huggingface_hub import hf_hub_download
	import gradio as gr
	import fasttext

	# Resolve "model.bin" from the NbAiLab/nb-nordic-lid repository through the
	# Hugging Face Hub helper and load it with fastText once, at import time.
	model = fasttext.load_model(
	    hf_hub_download("NbAiLab/nb-nordic-lid", "model.bin")
	)

	# Every model label is reduced to its last three characters, which is the
	# language code the rest of this module works with.
	model_labels = {label[-3:] for label in model.get_labels()}

	# Human-readable names for the three-letter language codes; "und" is the
	# placeholder used when no language could be determined.
	language_dict = {
	    "dan": "Danish",
	    "eng": "English",
	    "fao": "Faroese",
	    "fin": "Finnish",
	    "isl": "Icelandic",
	    "nno": "Norwegian Nynorsk",
	    "nob": "Norwegian Bokmål",
	    "sma": "Southern Sami",
	    "sme": "Northern Sami",
	    "smj": "Lule Sami",
	    "smn": "Inari Sami",
	    "sms": "Skolt Sami",
	    "swe": "Swedish",
	    "und": "Undetermined",
	}

	def detect_lang(
	    text: str,
	    langs: Optional[Union[List, Set]]=None,
	    threshold: float=-1.0,
	    return_proba: bool=False
	) -> Union[str, List[Tuple[str, float]]]:
	    """
	    Identify the language of `text` with the module-level fastText model.

	    Texts with fewer than four whitespace-separated words are never sent to
	    the model and are reported as "und" (undetermined). Newlines are replaced
	    by spaces before prediction because fastText's `predict` processes one
	    line at a time and raises `ValueError` on embedded newlines.

	    Args:
	    - text (str): The text to detect the language of.
	    - langs (List or Set, optional): Language codes the result is restricted
	    to. Defaults to all languages in the model's labels; an empty collection
	    is treated the same as `None`.
	    - threshold (float, optional): The minimum probability, forwarded to the
	    model's `predict` call. Defaults to `-1.0`.
	    - return_proba (bool, optional): Whether to return the candidates with
	    their probabilities instead of a single code. Defaults to `False`.

	    Returns:
	    str or List[Tuple[str, float]]: With `return_proba=False`, the ISO-639-3
	    code of the first prediction that passes the `langs` filter, or "und" if
	    there is none. With `return_proba=True`, a list of `(code, probability)`
	    tuples in the order the model returned them, or `[("und", 1.0)]` if
	    there is none.
	    """
	    # Single definition of the "nothing detected" result for both exits below.
	    undetermined = [("und", 1.0)] if return_proba else "und"

	    # Make the function safe to call directly with multi-line text; the word
	    # count below is unaffected because split() already treats "\n" as
	    # whitespace.
	    text = text.replace("\n", " ")
	    if len(text.split()) < 4:
	        return undetermined

	    allowed = set(langs) if langs else model_labels

	    # k=-1 requests every label rather than only the top one.
	    raw_prediction = model.predict(text, threshold=threshold, k=-1)
	    predictions = [
	        # Probabilities are capped at 1.0 (presumably to absorb scores that
	        # come back marginally above 1 -- confirm against the model output).
	        (label[-3:], min(probability, 1.0))
	        for label, probability in zip(*raw_prediction)
	        if label[-3:] in allowed
	    ]
	    if not predictions:
	        return undetermined
	    return predictions if return_proba else predictions[0][0]


	def identify(text, threshold):
	    """
	    Gradio callback: map each detected language name to its probability.

	    Args:
	    - text: The text entered by the user; newlines are replaced by spaces
	    before detection.
	    - threshold: Probability threshold as a percentage; it is divided by 100
	    before being passed to `detect_lang`.

	    Returns:
	    dict: Display name of each detected language mapped to its probability.
	    A code that has no entry in `language_dict` is shown as the raw code
	    instead of raising `KeyError`.
	    """
	    scored = detect_lang(
	        text.replace("\n", " "),
	        threshold=threshold / 100.0,
	        return_proba=True,
	    )
	    # .get(lang, lang): the name table is maintained by hand, so fall back to
	    # the code itself for any label the model knows but the table does not.
	    return {language_dict.get(lang, lang): proba for lang, proba in scored}

	# Build the demo UI: a text box and a percentage slider feed `identify`,
	# whose result is rendered in a Label component.
	iface = gr.Interface(
	    fn=identify,
	    inputs=[
	        gr.Textbox(label="Text to identify language for"),
	        gr.Slider(0, 100, value=80, step=1, label="Probability threshold (%)"),
	    ],
	    outputs=gr.Label(label="Prediction"),
	    title="NB Nordic Language Identification",
	    description="""This demo uses the [NB-Nordic-LID](https://huggingface.co/NbAiLab/nb-nordic-lid) model to classify a given text into one of the 12 Nordic languages supported. <b>At least 3 or 4 words are needed to identify the language.</b>""",
	    # Each example is [text, threshold], in the same order as `inputs`.
	    examples=[
	        ["Jeg heter Svein Arne", 80],
	        ["Dán lágan li biejadusá dárogiela, rijkalasj unneplågogielaj ja dáro siejvvemgiela birra", 80],
	        ["Skriftspråket har derfor helst brukt ord som kan førast attende til gammalnorsk der slike har funnest i levande talemål.", 80],
	        ["Ođđadárogiela vuođđun leat leamaš Norgga suopmanat, ja dasto das eai leat nu olu dánskkagiel sánit go girjedárogielas.", 80],
	    ],
	)

	# Start the Gradio server when the module is executed.
	iface.launch()