Aidan Phillips
committed
Commit · d7d2fdd
Parent(s): 36599ed

accuracy scores much much better

categories/accuracy.py CHANGED (+9 -19)
@@ -5,13 +5,15 @@ import numpy as np
 from scipy.spatial.distance import cosine
 from simalign import SentenceAligner
 from transformers import AutoModel, AutoTokenizer
+from laser_encoders import LaserEncoderPipeline
 
 # setup global variables on import (bad practice, but whatever)
 # --------------------------------------------------------------
 
 aligner = SentenceAligner(model="distilbert-base-multilingual-cased", layer=6)
-tokenizer = AutoTokenizer.from_pretrained(…)
-model = AutoModel.from_pretrained(…)
+
+de_encoder = LaserEncoderPipeline(lang="deu_Latn")
+en_encoder = LaserEncoderPipeline(lang="eng_Latn")
 
 
 def accuracy(src_sentence: str, trg_sentence: str) -> dict:
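
Context for the hunk above: the commit drops the mean-pooled multilingual-BERT embeddings in favor of LASER sentence embeddings. A minimal sketch of how the new encoder pair behaves (the pipeline setup and language codes come straight from the diff; the example sentences are invented):

from laser_encoders import LaserEncoderPipeline
from scipy.spatial.distance import cosine

# One LASER pipeline per language, mirroring the new globals above.
de_encoder = LaserEncoderPipeline(lang="deu_Latn")
en_encoder = LaserEncoderPipeline(lang="eng_Latn")

# encode_sentences() takes a list of sentences and returns one
# embedding per sentence as a numpy array.
emb_de = de_encoder.encode_sentences(["Der Hund schläft."])[0]
emb_en = en_encoder.encode_sentences(["The dog is sleeping."])[0]

# LASER embeddings live in one shared multilingual space, so a
# cross-lingual cosine similarity is meaningful (closer to 1.0 =
# closer in meaning).
print(1 - cosine(emb_de, emb_en))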
@@ -66,23 +68,11 @@ def __get_bertscore(src_sentence: str, trg_sentence: str) -> float:
         float: The BERTScore.
     """
     # Tokenize and generate embeddings
-    inputs_src = tokenizer(
-        src_sentence, return_tensors="pt", padding=True, truncation=True
-    )
-    inputs_trg = tokenizer(
-        trg_sentence, return_tensors="pt", padding=True, truncation=True
-    )
-
-    with torch.no_grad():
-        outputs_src = model(**inputs_src)
-        outputs_trg = model(**inputs_trg)
-
-    # Get sentence embeddings by averaging token embeddings (from last hidden state)
-    src_embedding = torch.mean(outputs_src.last_hidden_state, dim=1).squeeze().numpy()
-    trg_embedding = torch.mean(outputs_trg.last_hidden_state, dim=1).squeeze().numpy()
+    emb_src = de_encoder.encode_sentences([src_sentence])[0]
+    emb_tgt = en_encoder.encode_sentences([trg_sentence])[0]
 
     # Calculate cosine similarity (1 - cosine distance)
-    similarity = 1 - cosine(src_embedding, trg_embedding)
+    similarity = 1 - cosine(emb_src, emb_tgt)
 
     return similarity
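
Note that the docstring still promises a BERTScore while the body now returns the cosine similarity of two LASER embeddings. A usage sketch, assuming the module-level encoders above are loaded (the sentence pair is invented):

# Hypothetical call; higher scores mean the translation is
# semantically closer to the source.
score = __get_bertscore("Ich habe den Zug verpasst.", "I missed the train.")
print(round(score, 3))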
@@ -122,8 +112,8 @@ def __get_alignment_score(src_sentence: str, trg_sentence: str) -> list:
     # Each method has a list of pairs indicating the indexes of aligned words (The alignments are zero-indexed).
     alignments = aligner.get_word_aligns(src_list, trg_list)
 
-    src_aligns = {x[0] for x in alignments["…"]}
-    trg_aligns = {x[1] for x in alignments["…"]}
+    src_aligns = {x[0] for x in alignments["mwmf"]}
+    trg_aligns = {x[1] for x in alignments["mwmf"]}
 
     mistranslations = []
     for i in range(len(src_list)):
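
For reference, a sketch of what simalign hands back and what the "mwmf" key selects (maximum-weight matching, one of the aligner's default methods); the token lists are invented:

from simalign import SentenceAligner

# Same aligner setup as the module globals above.
aligner = SentenceAligner(model="distilbert-base-multilingual-cased", layer=6)

src_list = ["Der", "Hund", "schläft"]  # invented example tokens
trg_list = ["The", "dog", "sleeps"]

# get_word_aligns returns a dict mapping method name to a list of
# (src_index, trg_index) pairs, e.g. {"mwmf": [(0, 0), (1, 1), (2, 2)], ...}
alignments = aligner.get_word_aligns(src_list, trg_list)

# Indices with at least one aligned counterpart; source tokens missing
# from src_aligns are the mistranslation candidates collected below.
src_aligns = {x[0] for x in alignments["mwmf"]}
trg_aligns = {x[1] for x in alignments["mwmf"]}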