Shuwei Hou committed on
Commit
a04f574
·
1 Parent(s): cedcb9f

add_simple_morpheme_omission

Browse files
Files changed (3) hide show
  1. main_socket.py +2 -0
  2. morpheme.py +8 -3
  3. morpheme_omission.py +230 -0
main_socket.py CHANGED
@@ -10,6 +10,7 @@ from repetition import annotate_repetitions
10
  from syllable import annotate_syllables
11
  from fillerword import annotate_fillerwords
12
  from morpheme import annotate_morpheme
 
13
 
14
  from annotation import annotate_transcript
15
 
@@ -71,6 +72,7 @@ def process_audio():
71
  annotate_fillerwords(session_id)
72
  # annotate_transcript(session_id)
73
  annotate_morpheme(session_id)
 
74
 
75
 
76
 
 
10
  from syllable import annotate_syllables
11
  from fillerword import annotate_fillerwords
12
  from morpheme import annotate_morpheme
13
+ from morpheme_omission import annotate_morpheme_omission
14
 
15
  from annotation import annotate_transcript
16
 
 
72
  annotate_fillerwords(session_id)
73
  # annotate_transcript(session_id)
74
  annotate_morpheme(session_id)
75
+ annotate_morpheme_omission(session_id)
76
 
77
 
78
 
morpheme.py CHANGED
@@ -101,7 +101,6 @@ def extract_inflectional_morphemes(text: str):
101
  i += 1
102
  continue
103
 
104
-
105
  if low_txt in _CONTRACTION_PARTICLES and i > 0:
106
  prev = words[i - 1]
107
  results.append({
@@ -114,7 +113,6 @@ def extract_inflectional_morphemes(text: str):
114
  i += 1
115
  continue
116
 
117
-
118
  if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
119
  low_lem, low_surf = lem.lower(), surf.lower()
120
  suf = get_suffix(low_lem, low_surf)
@@ -129,7 +127,6 @@ def extract_inflectional_morphemes(text: str):
129
  i += 1
130
  continue
131
 
132
-
133
  inflect_type = None
134
  if pos == "NOUN" and feats.get("Number") == "Plur":
135
  inflect_type = "Plural"
@@ -148,6 +145,10 @@ def extract_inflectional_morphemes(text: str):
148
  inflect_type = "Progressive"
149
 
150
  if inflect_type:
 
 
 
 
151
  raw_suffix = get_suffix(lem, low_txt)
152
  canon = normalize_suffix(lem, raw_suffix, _EXPECTED_SUFFIXES[inflect_type])
153
  morpheme_form = f"/{canon}" if canon else "<IRR>"
@@ -165,6 +166,7 @@ def extract_inflectional_morphemes(text: str):
165
 
166
 
167
  def annotate_morpheme(session_id, base_dir="session_data"):
 
168
  base_dir = base_dir or os.getcwd()
169
  json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json")
170
 
@@ -183,3 +185,6 @@ def annotate_morpheme(session_id, base_dir="session_data"):
183
  with open(json_file, "w", encoding="utf-8") as f:
184
  json.dump(data, f, ensure_ascii=False, indent=2)
185
 
 
 
 
 
101
  i += 1
102
  continue
103
 
 
104
  if low_txt in _CONTRACTION_PARTICLES and i > 0:
105
  prev = words[i - 1]
106
  results.append({
 
113
  i += 1
114
  continue
115
 
 
116
  if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
117
  low_lem, low_surf = lem.lower(), surf.lower()
118
  suf = get_suffix(low_lem, low_surf)
 
127
  i += 1
128
  continue
129
 
 
130
  inflect_type = None
131
  if pos == "NOUN" and feats.get("Number") == "Plur":
132
  inflect_type = "Plural"
 
145
  inflect_type = "Progressive"
146
 
147
  if inflect_type:
148
+ if surf.lower() == lem.lower() and inflect_type not in {"Possessive", "Comparative", "Superlative"}:
149
+ i += 1
150
+ continue
151
+
152
  raw_suffix = get_suffix(lem, low_txt)
153
  canon = normalize_suffix(lem, raw_suffix, _EXPECTED_SUFFIXES[inflect_type])
154
  morpheme_form = f"/{canon}" if canon else "<IRR>"
 
166
 
167
 
168
  def annotate_morpheme(session_id, base_dir="session_data"):
169
+
170
  base_dir = base_dir or os.getcwd()
171
  json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json")
172
 
 
185
  with open(json_file, "w", encoding="utf-8") as f:
186
  json.dump(data, f, ensure_ascii=False, indent=2)
187
 
188
+
189
+ if __name__ == "__main__":
190
+ print(extract_inflectional_morphemes("His is more better than mine, he get up in the water. He is take the buses. I like his books."))
morpheme_omission.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import stanza
4
+
5
# Stanza pipeline shared by every function in this module; it is built once,
# at import time, with tokenization, POS tagging and lemmatization enabled.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,pos,lemma",
    tokenize_pretokenized=False,
)

# Suffix spellings accepted as the regular realisation of each inflection
# type. Anything else is reported as "<IRR>" by the extraction code.
_EXPECTED_SUFFIXES = {
    "Plural": {"s", "es"},
    "Possessive": {"'s", "s"},
    "Comparative": {"er"},
    "Superlative": {"est"},
    "3rd Person Singular": {"s", "es"},
    "Past Tense": {"ed"},
    "Past Participle": {"ed", "en", "n"},
    "Progressive": {"ing"},
    "Gerund": {"ing"},
}

# Contraction clitics (lower-case, straight apostrophe) mapped to the word(s)
# they abbreviate.
_CONTRACTION_PARTICLES = {
    "'ll": "will",       # we'll, he'll
    "'d": "would/had",   # I'd, she'd
    "'ve": "have",       # we've, they've
    "'re": "are",        # you're, they're
    "'m": "am",          # I'm
    "n't": "not",        # isn't, didn't
    "'s": "is/has",      # what's, she's
}

# Straight- and curly-apostrophe spellings of the "'s" token.
_S_TOKENS = {"'s", "’s"}
34
+
35
+
36
def is_possessive_candidate(tok):
    """Tell whether *tok* is an apostrophe-s token tagged as a particle.

    Args:
        tok: Object exposing ``text`` and ``upos`` attributes (the callers in
            this module pass Stanza words).

    Returns:
        True when ``tok.text`` is one of the ``_S_TOKENS`` spellings and
        ``tok.upos`` is ``"PART"``; False otherwise.
    """
    if tok.text not in _S_TOKENS:
        return False
    return tok.upos == "PART"
38
+
39
+
40
def lcp(a: str, b: str) -> str:
    """Return the longest common prefix of *a* and *b*, ignoring case.

    Characters are compared after lower-casing, but the returned prefix is
    sliced from *a*, so it keeps *a*'s original casing.
    """
    matched = 0
    for left, right in zip(a, b):
        if left.lower() != right.lower():
            break
        matched += 1
    return a[:matched]
45
+
46
+
47
def strip_doubling(lemma: str, suf: str) -> str:
    """Undo consonant doubling at the start of a suffix.

    When the suffix begins with a repeat of the lemma's final letter (as in
    "run" + "ning"), the repeated letter is dropped, provided that what
    remains is one of the known suffix spellings in ``_EXPECTED_SUFFIXES``.

    Args:
        lemma: Base form of the word. May be empty.
        suf: Raw suffix left after removing the shared prefix.

    Returns:
        The suffix without its doubled first letter when the rule applies,
        otherwise *suf* unchanged.
    """
    # An empty lemma has no final letter to double; the original indexed
    # lemma[-1] unconditionally and raised IndexError here.
    if not suf or not lemma or len(suf) < 2:
        return suf
    # Compare case-insensitively, matching how lcp() computed the prefix.
    if suf[0].lower() != lemma[-1].lower():
        return suf
    cand = suf[1:]
    if any(cand in allowed for allowed in _EXPECTED_SUFFIXES.values()):
        return cand
    return suf
53
+
54
+
55
def get_suffix(lemma: str, surface: str) -> str:
    """Return what *surface* adds after the prefix it shares with *lemma*.

    The shared prefix is found with ``lcp`` (case-insensitive), and the
    remainder is passed through ``strip_doubling`` before being returned.
    """
    shared_prefix = lcp(lemma, surface)
    remainder = surface[len(shared_prefix):]
    return strip_doubling(lemma, remainder)
57
+
58
+
59
+ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
60
+ if raw_suf in expected_set:
61
+ return raw_suf
62
+ if lemma.lower().endswith("y") and raw_suf.startswith("i"):
63
+ alt = raw_suf[1:]
64
+ if alt in expected_set:
65
+ return alt
66
+ return None
67
+
68
+
69
def extract_inflectional_morphemes(text: str):
    """List the inflectional morphemes and contractions found in *text*.

    The text is run through the module-level Stanza pipeline and every word
    is inspected in order. Each hit is a dict with the keys ``word``,
    ``lemma``, ``index`` (the word's position inside its own sentence),
    ``inflectional_morpheme`` and ``morpheme_form``.

    Words whose surface equals their lemma are skipped for every type other
    than Comparative/Superlative; they are not reported here.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of result dicts in document order.
    """
    found = []

    for sent in nlp(text).sentences:
        words = sent.words
        for idx, word in enumerate(words):
            surface, lemma, upos = word.text, word.lemma, word.upos
            lowered = surface.lower()
            # Stanza packs features as "Key=Value|Key=Value"; feats may be None.
            feats = dict(
                item.split("=", 1)
                for item in (word.feats or "").split("|")
                if "=" in item
            )
            previous = words[idx - 1] if idx > 0 else None

            # "'s" tagged as a particle attaches to the preceding word.
            if previous is not None and is_possessive_candidate(word):
                attached_to_noun = previous.upos in {"NOUN", "PROPN"}
                found.append({
                    "word": previous.text + surface,
                    "lemma": previous.lemma,
                    "index": idx - 1,
                    "inflectional_morpheme": "Possessive" if attached_to_noun else "Contraction",
                    "morpheme_form": "'/s",
                })
                continue

            # Any other known clitic is a contraction on the preceding word.
            if previous is not None and lowered in _CONTRACTION_PARTICLES:
                found.append({
                    "word": previous.text + surface,
                    "lemma": previous.lemma,
                    "index": idx - 1,
                    "inflectional_morpheme": "Contraction",
                    "morpheme_form": lowered,
                })
                continue

            # Possessive pronouns / determiners ("his", "hers", ...).
            if feats.get("Poss") == "Yes" and upos in {"PRON", "DET"}:
                lemma_lc = lemma.lower()
                suffix = get_suffix(lemma_lc, lowered)
                regular = suffix in {"s", "es"} and lemma_lc + suffix == lowered
                found.append({
                    "word": surface,
                    "lemma": lemma,
                    "index": idx,
                    "inflectional_morpheme": "Possessive",
                    "morpheme_form": "/s" if regular else "<IRR>",
                })
                continue

            # Derive the inflection type from POS tag plus features.
            kind = None
            if upos == "NOUN":
                if feats.get("Number") == "Plur":
                    kind = "Plural"
            elif upos == "ADJ":
                degree = feats.get("Degree")
                if degree == "Cmp":
                    kind = "Comparative"
                elif degree == "Sup":
                    kind = "Superlative"
            elif upos == "VERB":
                verb_form = feats.get("VerbForm")
                tense = feats.get("Tense")
                if verb_form == "Fin":
                    if tense == "Pres" and feats.get("Person") == "3":
                        kind = "3rd Person Singular"
                    elif tense == "Past":
                        kind = "Past Tense"
                elif verb_form == "Part":
                    if tense == "Past" or word.xpos == "VBN":
                        kind = "Past Participle"
                    elif tense == "Pres" or word.xpos == "VBG":
                        kind = "Progressive"

            if kind is None:
                continue

            # Surface identical to lemma: nothing to extract for this word.
            unmarked = surface.lower() == lemma.lower()
            if unmarked and kind not in {"Possessive", "Comparative", "Superlative"}:
                continue

            canonical = normalize_suffix(
                lemma,
                get_suffix(lemma, lowered),
                _EXPECTED_SUFFIXES[kind],
            )
            found.append({
                "word": surface,
                "lemma": lemma,
                "index": idx,
                "inflectional_morpheme": kind,
                "morpheme_form": f"/{canonical}" if canonical else "<IRR>",
            })

    return found
158
+
159
+
160
# Lemmas whose inflected form is spelled exactly like the base form. For
# these, "surface == lemma" is the correct inflected spelling and therefore
# not evidence that a morpheme was dropped (e.g. past-tense "put", plural
# "sheep"). The lists are heuristic and intentionally conservative.
_ZERO_PLURAL_NOUNS = frozenset({
    "aircraft", "bison", "cattle", "deer", "fish", "moose", "offspring",
    "people", "police", "salmon", "series", "sheep", "shrimp", "spacecraft",
    "species", "swine", "trout",
})
_ZERO_PAST_VERBS = frozenset({
    "beat", "bet", "bid", "broadcast", "burst", "cast", "cost", "cut", "fit",
    "forecast", "hit", "hurt", "let", "put", "quit", "read", "rid", "set",
    "shed", "shut", "slit", "split", "spread", "thrust", "upset", "wet",
})
_ZERO_PARTICIPLE_VERBS = _ZERO_PAST_VERBS | frozenset({
    "become", "come", "outrun", "overcome", "rerun", "run",
})
_ZERO_INFLECTION_LEMMAS = {
    "Plural": _ZERO_PLURAL_NOUNS,
    "Past Tense": _ZERO_PAST_VERBS,
    "Past Participle": _ZERO_PARTICIPLE_VERBS,
}


def _omittable_inflection(word):
    """Return the inflection type the tagger assigned to *word*, or None.

    Only types that can be reported as an omission are considered (plural
    nouns and the verbal inflections). Adjective degree is deliberately not
    classified because comparatives/superlatives are never reported.
    """
    # Stanza packs features as "Key=Value|Key=Value"; feats may be None.
    feats = dict(
        item.split("=", 1)
        for item in (word.feats or "").split("|")
        if "=" in item
    )
    if word.upos == "NOUN":
        return "Plural" if feats.get("Number") == "Plur" else None
    if word.upos != "VERB":
        return None

    verb_form = feats.get("VerbForm")
    tense = feats.get("Tense")
    if verb_form == "Fin":
        if tense == "Pres" and feats.get("Person") == "3":
            return "3rd Person Singular"
        if tense == "Past":
            return "Past Tense"
        return None
    if verb_form == "Part":
        if tense == "Past" or word.xpos == "VBN":
            return "Past Participle"
        if tense == "Pres" or word.xpos == "VBG":
            return "Progressive"
    return None


def extract_morpheme_omissions(text: str):
    """Find words tagged as inflected whose surface is still the bare lemma.

    The text is run through the module-level Stanza pipeline. A word is
    reported when the tagger assigns it an inflected reading (plural, 3rd
    person singular, past tense, past participle or progressive) but its
    surface form equals its lemma, ignoring case. Lemmas whose inflected form
    is legitimately identical to the base form (see
    ``_ZERO_INFLECTION_LEMMAS``) are not reported.

    Args:
        text: Raw text to analyse.

    Returns:
        A list of dicts with the keys ``word``, ``lemma``, ``index`` (position
        of the word inside its own sentence), ``inflectional_morpheme`` and
        ``morpheme_form`` (always ``"<OMI>"``).
    """
    omissions = []

    for sent in nlp(text).sentences:
        for idx, word in enumerate(sent.words):
            inflect_type = _omittable_inflection(word)
            if inflect_type is None:
                continue

            surface, lemma = word.text, word.lemma
            lemma_lc = lemma.lower()
            if surface.lower() != lemma_lc:
                continue  # some marking is present; not an omission
            if lemma_lc in _ZERO_INFLECTION_LEMMAS.get(inflect_type, ()):
                continue  # zero-inflection form, e.g. "put" / "sheep"

            omissions.append({
                "word": surface,
                "lemma": lemma,
                "index": idx,
                "inflectional_morpheme": inflect_type,
                "morpheme_form": "<OMI>",
            })

    return omissions
202
+
203
+
204
def annotate_morpheme_omission(session_id, base_dir="session_data"):
    """Add a ``morpheme_omissions`` list to every segment of a session file.

    Reads ``<base_dir>/<session_id>/<session_id>_transcriptionCW.json``,
    stores the result of ``extract_morpheme_omissions`` for each segment's
    ``text`` under the key ``"morpheme_omissions"``, and writes the JSON back
    to the same path.

    Args:
        session_id: Session identifier used for both the folder and file name.
        base_dir: Directory holding the session folders. A falsy value means
            the current working directory.

    Raises:
        FileNotFoundError: If the transcription JSON does not exist.
    """
    base_dir = base_dir or os.getcwd()
    json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json")

    if not os.path.exists(json_file):
        raise FileNotFoundError(f"{json_file} not found, make sure transcription step ran first.")

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The file is either {"segments": [...]} or a bare list of segments.
    segments = data.get("segments", data) if isinstance(data, dict) else data

    for seg in segments:
        # Skip anything that is not a segment object instead of crashing on
        # ``seg.get`` (e.g. stray strings, or dict keys when "segments" is
        # missing).
        if not isinstance(seg, dict):
            continue
        raw_text = seg.get("text")
        text = raw_text if isinstance(raw_text, str) else ""
        # Blank or missing text has nothing to analyse; skip the NLP call.
        seg["morpheme_omissions"] = extract_morpheme_omissions(text) if text.strip() else []

    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
223
+
224
+
225
if __name__ == "__main__":
    # Manual smoke test: run both extractors on a sentence with known errors
    # and pretty-print each result under its own heading.
    demo_text = "His is more better than mine, he get up in the water. He is take the buses."
    demo_sections = (
        ("Inflectional Morphemes:", extract_inflectional_morphemes),
        ("\nMorpheme Omissions:", extract_morpheme_omissions),
    )
    for heading, extractor in demo_sections:
        print(heading)
        print(json.dumps(extractor(demo_text), indent=2, ensure_ascii=False))