Shuwei Hou
committed on
Commit
·
9e45db3
1
Parent(s):
37ea16b
fix_morpheme_index
Browse files- morpheme.py +38 -7
- morpheme_omission.py +41 -8
morpheme.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import stanza
|
|
|
|
| 4 |
|
| 5 |
nlp = stanza.Pipeline(
|
| 6 |
lang="en",
|
|
@@ -66,8 +67,38 @@ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
|
|
| 66 |
return None
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def extract_inflectional_morphemes(text: str):
|
| 70 |
-
|
|
|
|
|
|
|
| 71 |
results = []
|
| 72 |
|
| 73 |
for sent in doc.sentences:
|
|
@@ -86,7 +117,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 86 |
results.append({
|
| 87 |
"word": prev.text + surf,
|
| 88 |
"lemma": prev.lemma,
|
| 89 |
-
"index": i - 1,
|
| 90 |
"inflectional_morpheme": "Possessive",
|
| 91 |
"morpheme_form": "'/s",
|
| 92 |
})
|
|
@@ -94,7 +125,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 94 |
results.append({
|
| 95 |
"word": prev.text + surf,
|
| 96 |
"lemma": prev.lemma,
|
| 97 |
-
"index": i - 1,
|
| 98 |
"inflectional_morpheme": "Contraction",
|
| 99 |
"morpheme_form": "'/s",
|
| 100 |
})
|
|
@@ -106,7 +137,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 106 |
results.append({
|
| 107 |
"word": prev.text + surf,
|
| 108 |
"lemma": prev.lemma,
|
| 109 |
-
"index": i - 1,
|
| 110 |
"inflectional_morpheme": "Contraction",
|
| 111 |
"morpheme_form": low_txt,
|
| 112 |
})
|
|
@@ -120,7 +151,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 120 |
results.append({
|
| 121 |
"word": surf,
|
| 122 |
"lemma": lem,
|
| 123 |
-
"index": i,
|
| 124 |
"inflectional_morpheme": "Possessive",
|
| 125 |
"morpheme_form": morpheme_form,
|
| 126 |
})
|
|
@@ -155,7 +186,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 155 |
results.append({
|
| 156 |
"word": surf,
|
| 157 |
"lemma": lem,
|
| 158 |
-
"index": i,
|
| 159 |
"inflectional_morpheme": inflect_type,
|
| 160 |
"morpheme_form": morpheme_form,
|
| 161 |
})
|
|
@@ -187,4 +218,4 @@ def annotate_morpheme(session_id, base_dir="session_data"):
|
|
| 187 |
|
| 188 |
|
| 189 |
if __name__ == "__main__":
|
| 190 |
-
print(extract_inflectional_morphemes("
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import stanza
|
| 4 |
+
import re
|
| 5 |
|
| 6 |
nlp = stanza.Pipeline(
|
| 7 |
lang="en",
|
|
|
|
| 67 |
return None
|
| 68 |
|
| 69 |
|
| 70 |
+
def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Remove bracketed annotation tokens (e.g. ``[UH]``) from *text*.

    The text is split on whitespace. Every word that starts with a
    ``[...]`` group is dropped; all other words are kept in order and
    re-joined with single spaces.

    Args:
        text: Raw text, possibly containing bracketed tokens.

    Returns:
        A ``(cleaned_text, position_map)`` tuple where
        ``position_map[original_index]`` is the index of that word in the
        cleaned word list, or ``-1`` if the word was removed. Both indices
        count whitespace-delimited words.
    """
    # Compile once instead of resolving the pattern on every iteration.
    # NOTE: ``match`` anchors only at the start of the word, so a token with
    # trailing characters such as "[UH]," is dropped as well.
    bracketed = re.compile(r'\[.*\]')

    position_map: list[int] = []  # position_map[original_index] = cleaned_index
    cleaned_words: list[str] = []

    for word in text.split():
        if bracketed.match(word):
            # Sentinel: this original word has no counterpart in the cleaned text.
            position_map.append(-1)
        else:
            position_map.append(len(cleaned_words))
            cleaned_words.append(word)

    return ' '.join(cleaned_words), position_map
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Map a word index in the cleaned text back to the original text.

    Args:
        cleaned_index: Index of a word in the cleaned word list.
        position_map: List where ``position_map[original_index]`` is the
            cleaned index of that word, or ``-1`` for a removed word.

    Returns:
        The first original index whose map entry equals *cleaned_index*.
        If there is no such entry, or *cleaned_index* is negative,
        *cleaned_index* is returned unchanged.
    """
    # -1 marks removed words in position_map. Without this guard a negative
    # cleaned_index (e.g. ``i - 1`` with ``i == 0``) would match that sentinel
    # and return the position of the first removed word.
    if cleaned_index < 0:
        return cleaned_index

    try:
        return position_map.index(cleaned_index)
    except ValueError:
        # Not present in the map: fall back to the cleaned index itself.
        return cleaned_index
|
| 96 |
+
|
| 97 |
+
|
| 98 |
def extract_inflectional_morphemes(text: str):
|
| 99 |
+
cleaned_text, position_map = preprocess_text(text)
|
| 100 |
+
|
| 101 |
+
doc = nlp(cleaned_text)
|
| 102 |
results = []
|
| 103 |
|
| 104 |
for sent in doc.sentences:
|
|
|
|
| 117 |
results.append({
|
| 118 |
"word": prev.text + surf,
|
| 119 |
"lemma": prev.lemma,
|
| 120 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
| 121 |
"inflectional_morpheme": "Possessive",
|
| 122 |
"morpheme_form": "'/s",
|
| 123 |
})
|
|
|
|
| 125 |
results.append({
|
| 126 |
"word": prev.text + surf,
|
| 127 |
"lemma": prev.lemma,
|
| 128 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
| 129 |
"inflectional_morpheme": "Contraction",
|
| 130 |
"morpheme_form": "'/s",
|
| 131 |
})
|
|
|
|
| 137 |
results.append({
|
| 138 |
"word": prev.text + surf,
|
| 139 |
"lemma": prev.lemma,
|
| 140 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
| 141 |
"inflectional_morpheme": "Contraction",
|
| 142 |
"morpheme_form": low_txt,
|
| 143 |
})
|
|
|
|
| 151 |
results.append({
|
| 152 |
"word": surf,
|
| 153 |
"lemma": lem,
|
| 154 |
+
"index": calculate_adjusted_index(i, position_map),
|
| 155 |
"inflectional_morpheme": "Possessive",
|
| 156 |
"morpheme_form": morpheme_form,
|
| 157 |
})
|
|
|
|
| 186 |
results.append({
|
| 187 |
"word": surf,
|
| 188 |
"lemma": lem,
|
| 189 |
+
"index": calculate_adjusted_index(i, position_map),
|
| 190 |
"inflectional_morpheme": inflect_type,
|
| 191 |
"morpheme_form": morpheme_form,
|
| 192 |
})
|
|
|
|
| 218 |
|
| 219 |
|
| 220 |
if __name__ == "__main__":
|
| 221 |
+
print(extract_inflectional_morphemes("And he [UH] [UM] the rabbit [UH] makes [UH] sand castle."))
|
morpheme_omission.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import stanza
|
|
|
|
| 4 |
|
| 5 |
nlp = stanza.Pipeline(
|
| 6 |
lang="en",
|
|
@@ -66,9 +67,39 @@ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
|
|
| 66 |
return None
|
| 67 |
|
| 68 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
def extract_inflectional_morphemes(text: str):
|
| 70 |
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
results = []
|
| 73 |
|
| 74 |
for sent in doc.sentences:
|
|
@@ -85,7 +116,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 85 |
results.append({
|
| 86 |
"word": prev.text + surf,
|
| 87 |
"lemma": prev.lemma,
|
| 88 |
-
"index": i - 1,
|
| 89 |
"inflectional_morpheme": "Possessive"
|
| 90 |
if prev.upos in {"NOUN", "PROPN"} else "Contraction",
|
| 91 |
"morpheme_form": "'/s",
|
|
@@ -98,7 +129,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 98 |
results.append({
|
| 99 |
"word": prev.text + surf,
|
| 100 |
"lemma": prev.lemma,
|
| 101 |
-
"index": i - 1,
|
| 102 |
"inflectional_morpheme": "Contraction",
|
| 103 |
"morpheme_form": low_txt,
|
| 104 |
})
|
|
@@ -112,7 +143,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 112 |
results.append({
|
| 113 |
"word": surf,
|
| 114 |
"lemma": lem,
|
| 115 |
-
"index": i,
|
| 116 |
"inflectional_morpheme": "Possessive",
|
| 117 |
"morpheme_form": morpheme_form,
|
| 118 |
})
|
|
@@ -147,7 +178,7 @@ def extract_inflectional_morphemes(text: str):
|
|
| 147 |
results.append({
|
| 148 |
"word": surf,
|
| 149 |
"lemma": lem,
|
| 150 |
-
"index": i,
|
| 151 |
"inflectional_morpheme": inflect_type,
|
| 152 |
"morpheme_form": morpheme_form,
|
| 153 |
})
|
|
@@ -159,7 +190,9 @@ def extract_inflectional_morphemes(text: str):
|
|
| 159 |
|
| 160 |
def extract_morpheme_omissions(text: str):
|
| 161 |
|
| 162 |
-
|
|
|
|
|
|
|
| 163 |
omissions = []
|
| 164 |
|
| 165 |
for sent in doc.sentences:
|
|
@@ -191,7 +224,7 @@ def extract_morpheme_omissions(text: str):
|
|
| 191 |
omissions.append({
|
| 192 |
"word": surf,
|
| 193 |
"lemma": lem,
|
| 194 |
-
"index": i,
|
| 195 |
"inflectional_morpheme": inflect_type,
|
| 196 |
"morpheme_form": "<OMI>",
|
| 197 |
})
|
|
@@ -223,7 +256,7 @@ def annotate_morpheme_omission(session_id, base_dir="session_data"):
|
|
| 223 |
|
| 224 |
|
| 225 |
if __name__ == "__main__":
|
| 226 |
-
sample = "
|
| 227 |
print("Inflectional Morphemes:")
|
| 228 |
print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
|
| 229 |
print("\nMorpheme Omissions:")
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import stanza
|
| 4 |
+
import re
|
| 5 |
|
| 6 |
nlp = stanza.Pipeline(
|
| 7 |
lang="en",
|
|
|
|
| 67 |
return None
|
| 68 |
|
| 69 |
|
| 70 |
+
def preprocess_text(text: str) -> tuple[str, list[int]]:
    """Remove bracketed annotation tokens (e.g. ``[UM]``) from *text*.

    The text is split on whitespace. Every word that starts with a
    ``[...]`` group is dropped; all other words are kept in order and
    re-joined with single spaces.

    Args:
        text: Raw text, possibly containing bracketed tokens.

    Returns:
        A ``(cleaned_text, position_map)`` tuple where
        ``position_map[original_index]`` is the index of that word in the
        cleaned word list, or ``-1`` if the word was removed. Both indices
        count whitespace-delimited words.
    """
    # Compile once instead of resolving the pattern on every iteration.
    # NOTE: ``match`` anchors only at the start of the word, so a token with
    # trailing characters such as "[UM]," is dropped as well.
    bracketed = re.compile(r'\[.*\]')

    position_map: list[int] = []  # position_map[original_index] = cleaned_index
    cleaned_words: list[str] = []

    for word in text.split():
        if bracketed.match(word):
            # Sentinel: this original word has no counterpart in the cleaned text.
            position_map.append(-1)
        else:
            position_map.append(len(cleaned_words))
            cleaned_words.append(word)

    return ' '.join(cleaned_words), position_map
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
def calculate_adjusted_index(cleaned_index: int, position_map: list[int]) -> int:
    """Map a word index in the cleaned text back to the original text.

    Args:
        cleaned_index: Index of a word in the cleaned word list.
        position_map: List where ``position_map[original_index]`` is the
            cleaned index of that word, or ``-1`` for a removed word.

    Returns:
        The first original index whose map entry equals *cleaned_index*.
        If there is no such entry, or *cleaned_index* is negative,
        *cleaned_index* is returned unchanged.
    """
    # -1 marks removed words in position_map. Without this guard a negative
    # cleaned_index (e.g. ``i - 1`` with ``i == 0``) would match that sentinel
    # and return the position of the first removed word.
    if cleaned_index < 0:
        return cleaned_index

    try:
        return position_map.index(cleaned_index)
    except ValueError:
        # Not present in the map: fall back to the cleaned index itself.
        return cleaned_index
|
| 96 |
+
|
| 97 |
+
|
| 98 |
def extract_inflectional_morphemes(text: str):
|
| 99 |
|
| 100 |
+
cleaned_text, position_map = preprocess_text(text)
|
| 101 |
+
|
| 102 |
+
doc = nlp(cleaned_text)
|
| 103 |
results = []
|
| 104 |
|
| 105 |
for sent in doc.sentences:
|
|
|
|
| 116 |
results.append({
|
| 117 |
"word": prev.text + surf,
|
| 118 |
"lemma": prev.lemma,
|
| 119 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
| 120 |
"inflectional_morpheme": "Possessive"
|
| 121 |
if prev.upos in {"NOUN", "PROPN"} else "Contraction",
|
| 122 |
"morpheme_form": "'/s",
|
|
|
|
| 129 |
results.append({
|
| 130 |
"word": prev.text + surf,
|
| 131 |
"lemma": prev.lemma,
|
| 132 |
+
"index": calculate_adjusted_index(i - 1, position_map),
|
| 133 |
"inflectional_morpheme": "Contraction",
|
| 134 |
"morpheme_form": low_txt,
|
| 135 |
})
|
|
|
|
| 143 |
results.append({
|
| 144 |
"word": surf,
|
| 145 |
"lemma": lem,
|
| 146 |
+
"index": calculate_adjusted_index(i, position_map),
|
| 147 |
"inflectional_morpheme": "Possessive",
|
| 148 |
"morpheme_form": morpheme_form,
|
| 149 |
})
|
|
|
|
| 178 |
results.append({
|
| 179 |
"word": surf,
|
| 180 |
"lemma": lem,
|
| 181 |
+
"index": calculate_adjusted_index(i, position_map),
|
| 182 |
"inflectional_morpheme": inflect_type,
|
| 183 |
"morpheme_form": morpheme_form,
|
| 184 |
})
|
|
|
|
| 190 |
|
| 191 |
def extract_morpheme_omissions(text: str):
|
| 192 |
|
| 193 |
+
cleaned_text, position_map = preprocess_text(text)
|
| 194 |
+
|
| 195 |
+
doc = nlp(cleaned_text)
|
| 196 |
omissions = []
|
| 197 |
|
| 198 |
for sent in doc.sentences:
|
|
|
|
| 224 |
omissions.append({
|
| 225 |
"word": surf,
|
| 226 |
"lemma": lem,
|
| 227 |
+
"index": calculate_adjusted_index(i, position_map),
|
| 228 |
"inflectional_morpheme": inflect_type,
|
| 229 |
"morpheme_form": "<OMI>",
|
| 230 |
})
|
|
|
|
| 256 |
|
| 257 |
|
| 258 |
if __name__ == "__main__":
|
| 259 |
+
sample = "And he [UM] [UM] the rabbit [UM] [UH] [UH] make [UH] sand castle."
|
| 260 |
print("Inflectional Morphemes:")
|
| 261 |
print(json.dumps(extract_inflectional_morphemes(sample), indent=2, ensure_ascii=False))
|
| 262 |
print("\nMorpheme Omissions:")
|