Spaces:

symanto
/

generation_evaluator

Runtime error

App Files Files Community

generation_evaluator / textgen_evaluator.py

HalteroXHunter

first try

417f39c over 1 year ago

raw

history blame

3.44 kB

	import datasets
	import evaluate

	# BibTeX entry handed to the metric info as its `citation` field.
	# NOTE(review): only the ROUGE paper is cited here, while `_compute` also
	# returns a BLEU score -- consider adding the BLEU reference as well.
	_CITATION = """\
	@inproceedings{lin-2004-rouge,
	title = "{ROUGE}: A Package for Automatic Evaluation of Summaries",
	author = "Lin, Chin-Yew",
	booktitle = "Text Summarization Branches Out",
	month = jul,
	year = "2004",
	address = "Barcelona, Spain",
	publisher = "Association for Computational Linguistics",
	url = "https://www.aclweb.org/anthology/W04-1013",
	pages = "74--81",
	}
	"""

	# Human-readable summary of the module; used as the metric info `description`
	# and prepended to the metric class docstring.
	_DESCRIPTION = """\
	Text generation evaluator that scores predictions against references with two metrics, ROUGE and BLEU,
	and returns both results together.

	ROUGE, or Recall-Oriented Understudy for Gisting Evaluation, is a set of metrics and a software package used for
	evaluating automatic summarization and machine translation software in natural language processing.
	The metrics compare an automatically produced summary or translation against a reference or a set of references (human-produced) summary or translation.

	Note that ROUGE is case insensitive, meaning that upper case letters are treated the same way as lower case letters.

	Both scores are computed by the `rouge` and `bleu` metrics loaded through the `evaluate` library.
	The ROUGE metric is a wrapper around the Google Research reimplementation of ROUGE:
	https://github.com/google-research/google-research/tree/master/rouge
	"""

	# Usage documentation; used as the metric info `inputs_description` and
	# appended to the metric class docstring. It must describe exactly the
	# arguments that `_compute` accepts and the dictionary it returns.
	_KWARGS_DESCRIPTION = """
	Calculates ROUGE and BLEU scores for a list of predictions and references.
	Args:
	    predictions: list of predictions to score. Each prediction
	        should be a string with tokens separated by spaces.
	    references: list of references, one for each prediction. Each
	        reference should be a string with tokens separated by spaces.
	Returns:
	    ROUGE: output of the `rouge` metric of the `evaluate` library, computed with its default settings.
	    BLEU: output of the `bleu` metric of the `evaluate` library, computed with its default settings.
	"""


	# Evaluation module that scores generated text with both ROUGE and BLEU.
	#
	# The module is built on the `evaluate` base classes (`evaluate.Metric`,
	# `evaluate.MetricInfo`) rather than the deprecated `datasets.Metric` API,
	# which newer `datasets` releases no longer provide. `datasets` is still used
	# for the input feature schema.
	@evaluate.utils.file_utils.add_start_docstrings(
	    _DESCRIPTION, _KWARGS_DESCRIPTION
	)
	class TextGenEvaluatorTest(evaluate.Metric):
	    def _info(self):
	        """Return the module metadata: docs, citation, input schema and links."""
	        return evaluate.MetricInfo(
	            description=_DESCRIPTION,
	            citation=_CITATION,
	            inputs_description=_KWARGS_DESCRIPTION,
	            # One prediction string is paired with one reference string.
	            features=datasets.Features(
	                {
	                    "predictions": datasets.Value("string"),
	                    "references": datasets.Value("string"),
	                }
	            ),
	            codebase_urls=[
	                "https://github.com/google-research/google-research/tree/master/rouge"
	            ],
	            reference_urls=[
	                "https://en.wikipedia.org/wiki/ROUGE_(metric)",
	                "https://github.com/google-research/google-research/tree/master/rouge",
	            ],
	        )

	    def _compute(self, predictions, references):
	        """Score `predictions` against `references` with ROUGE and BLEU.

	        Args:
	            predictions: list of generated texts.
	            references: list of reference texts, one per prediction.

	        Returns:
	            dict with two entries: ``"ROUGE"`` holds the output of the
	            ``rouge`` metric and ``"BLEU"`` the output of the ``bleu``
	            metric, both computed with their default settings.
	        """
	        # Load each underlying metric only once per instance instead of on
	        # every call; repeated `evaluate.load` calls redo the module lookup.
	        if not hasattr(self, "_rouge_metric"):
	            self._rouge_metric = evaluate.load("rouge")
	        if not hasattr(self, "_bleu_metric"):
	            self._bleu_metric = evaluate.load("bleu")

	        rouge_scores = self._rouge_metric.compute(
	            predictions=predictions, references=references
	        )
	        bleu_scores = self._bleu_metric.compute(
	            predictions=predictions, references=references
	        )

	        return {"ROUGE": rouge_scores, "BLEU": bleu_scores}