Shuwei Hou committed on
Commit
cedcb9f
·
1 Parent(s): 6a467c4

morpheme_annotation_name

Browse files
Files changed (1) hide show
  1. morpheme.py +13 -64
morpheme.py CHANGED
@@ -2,22 +2,12 @@ import os
2
  import json
3
  import stanza
4
 
5
- # -----------------------------------------------------------------------------
6
- # Stanza Pipeline (English, tokenize + POS + lemma)
7
- # -----------------------------------------------------------------------------
8
- # We initialise this **once** at import‑time so that every subsequent call to
9
- # `annotate_morpheme()` re‑uses the same pipeline (avoids re‑loading models).
10
- # -----------------------------------------------------------------------------
11
-
12
  nlp = stanza.Pipeline(
13
  lang="en",
14
  processors="tokenize,pos,lemma",
15
  tokenize_pretokenized=False,
16
  )
17
 
18
- # -----------------------------------------------------------------------------
19
- # Canonical suffix sets for *inflectional* morphemes (unchanged)
20
- # -----------------------------------------------------------------------------
21
  _EXPECTED_SUFFIXES = {
22
  "Plural": {"s", "es"},
23
  "Possessive": {"'s", "s"},
@@ -30,35 +20,24 @@ _EXPECTED_SUFFIXES = {
30
  "Gerund": {"ing"},
31
  }
32
 
33
- # -----------------------------------------------------------------------------
34
- # *New* : Contraction particles (clitics)
35
- # -----------------------------------------------------------------------------
36
- # Mapping from particle → canonical meaning. These are added as a *new* type
37
- # "Contraction" in the output list.
38
- # -----------------------------------------------------------------------------
39
  _CONTRACTION_PARTICLES = {
40
- "'ll": "will", # we'll, he'll
41
- "'d": "would/had", # I'd, she'd
42
- "'ve": "have", # we've, they've
43
- "'re": "are", # you're, they're
44
- "'m": "am", # I'm
45
- "n't": "not", # isn't, didn't
46
- "'s": "is/has", # what's, she's, Tom's(?) – see disambiguation below
47
  }
48
 
49
- _S_TOKENS = {"'s", "’s"} # keep both apostrophe forms
50
 
51
- # -----------------------------------------------------------------------------
52
- # Helper functions
53
- # -----------------------------------------------------------------------------
54
 
55
  def is_possessive_candidate(tok):
56
- """Return True if token text is 's / ’s and UD tag == PART."""
57
  return tok.text in _S_TOKENS and tok.upos == "PART"
58
 
59
 
60
  def lcp(a: str, b: str) -> str:
61
- """Longest common prefix (case‑insensitive)."""
62
  i = 0
63
  while i < min(len(a), len(b)) and a[i].lower() == b[i].lower():
64
  i += 1
@@ -66,7 +45,6 @@ def lcp(a: str, b: str) -> str:
66
 
67
 
68
  def strip_doubling(lemma: str, suf: str) -> str:
69
- """Remove doubled final consonant when the suffix repeats it (stop + p + ing)."""
70
  if suf and len(suf) >= 2 and suf[0] == lemma[-1]:
71
  cand = suf[1:]
72
  if any(cand in v for v in _EXPECTED_SUFFIXES.values()):
@@ -75,12 +53,10 @@ def strip_doubling(lemma: str, suf: str) -> str:
75
 
76
 
77
  def get_suffix(lemma: str, surface: str) -> str:
78
- """Return raw suffix after common prefix is stripped and doubling handled."""
79
  return strip_doubling(lemma, surface[len(lcp(lemma, surface)):])
80
 
81
 
82
  def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
83
- """Bring irregular spelling variants back to canonical form (e.g. ies → s)."""
84
  if raw_suf in expected_set:
85
  return raw_suf
86
  if lemma.lower().endswith("y") and raw_suf.startswith("i"):
@@ -89,12 +65,8 @@ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
89
  return alt
90
  return None
91
 
92
- # -----------------------------------------------------------------------------
93
- # Core extractor
94
- # -----------------------------------------------------------------------------
95
 
96
  def extract_inflectional_morphemes(text: str):
97
- """Return list of inflectional & contraction morpheme annotations for *text*."""
98
  doc = nlp(text)
99
  results = []
100
 
@@ -107,13 +79,9 @@ def extract_inflectional_morphemes(text: str):
107
  feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}
108
  low_txt = surf.lower()
109
 
110
- # -----------------------------------------------------------------
111
- # 1) 's : Disambiguate Possessive vs Contraction
112
- # -----------------------------------------------------------------
113
  if is_possessive_candidate(w) and i > 0:
114
  prev = words[i - 1]
115
- # If the previous token is a NOUN/PROPN we *assume* possessive,
116
- # otherwise treat it as a contraction for *is/has*.
117
  if prev.upos in {"NOUN", "PROPN"}:
118
  results.append({
119
  "word": prev.text + surf,
@@ -122,7 +90,7 @@ def extract_inflectional_morphemes(text: str):
122
  "inflectional_morpheme": "Possessive",
123
  "morpheme_form": "'/s",
124
  })
125
- else: # Contraction: what’s / she’s / it’s …
126
  results.append({
127
  "word": prev.text + surf,
128
  "lemma": prev.lemma,
@@ -133,9 +101,7 @@ def extract_inflectional_morphemes(text: str):
133
  i += 1
134
  continue
135
 
136
- # -----------------------------------------------------------------
137
- # 2) Other contraction particles ( 'll, 're, 'm, 've, 'd, n't )
138
- # -----------------------------------------------------------------
139
  if low_txt in _CONTRACTION_PARTICLES and i > 0:
140
  prev = words[i - 1]
141
  results.append({
@@ -148,9 +114,7 @@ def extract_inflectional_morphemes(text: str):
148
  i += 1
149
  continue
150
 
151
- # -----------------------------------------------------------------
152
- # 3) Possessive pronouns / determiners (his, yours …)
153
- # -----------------------------------------------------------------
154
  if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
155
  low_lem, low_surf = lem.lower(), surf.lower()
156
  suf = get_suffix(low_lem, low_surf)
@@ -165,9 +129,7 @@ def extract_inflectional_morphemes(text: str):
165
  i += 1
166
  continue
167
 
168
- # -----------------------------------------------------------------
169
- # 4) Standard inflectional endings (plural, tense, degree …)
170
- # -----------------------------------------------------------------
171
  inflect_type = None
172
  if pos == "NOUN" and feats.get("Number") == "Plur":
173
  inflect_type = "Plural"
@@ -201,12 +163,8 @@ def extract_inflectional_morphemes(text: str):
201
 
202
  return results
203
 
204
- # -----------------------------------------------------------------------------
205
- # Pipeline entry‑point used by main_socket / other modules
206
- # -----------------------------------------------------------------------------
207
 
208
  def annotate_morpheme(session_id, base_dir="session_data"):
209
- """Annotate `{session_id}_transcriptionCW.json` with morpheme information."""
210
  base_dir = base_dir or os.getcwd()
211
  json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json")
212
 
@@ -216,7 +174,6 @@ def annotate_morpheme(session_id, base_dir="session_data"):
216
  with open(json_file, "r", encoding="utf-8") as f:
217
  data = json.load(f)
218
 
219
- # Support both list‑of‑segments or {segments: [...]} formats
220
  segments = data.get("segments", data) if isinstance(data, dict) else data
221
 
222
  for seg in segments:
@@ -226,11 +183,3 @@ def annotate_morpheme(session_id, base_dir="session_data"):
226
  with open(json_file, "w", encoding="utf-8") as f:
227
  json.dump(data, f, ensure_ascii=False, indent=2)
228
 
229
-
230
- # Example usage inside main_socket.py
231
- # -----------------------------------------------------------------------------
232
- # from morpheme import annotate_morpheme
233
- # def handle_session(session_id: str):
234
- # ... # other processing steps
235
- # annotate_morpheme(session_id, base_dir=session_data_dir)
236
- # ... # return/serve updated JSON
 
2
  import json
3
  import stanza
4
 
 
 
 
 
 
 
 
5
  nlp = stanza.Pipeline(
6
  lang="en",
7
  processors="tokenize,pos,lemma",
8
  tokenize_pretokenized=False,
9
  )
10
 
 
 
 
11
  _EXPECTED_SUFFIXES = {
12
  "Plural": {"s", "es"},
13
  "Possessive": {"'s", "s"},
 
20
  "Gerund": {"ing"},
21
  }
22
 
 
 
 
 
 
 
23
  _CONTRACTION_PARTICLES = {
24
+ "'ll": "will", # we'll, he'll
25
+ "'d": "would/had", # I'd, she'd
26
+ "'ve": "have", # we've, they've
27
+ "'re": "are", # you're, they're
28
+ "'m": "am", # I'm
29
+ "n't": "not", # isn't, didn't
30
+ "'s": "is/has", # what's, she's
31
  }
32
 
33
+ _S_TOKENS = {"'s", "’s"}
34
 
 
 
 
35
 
36
  def is_possessive_candidate(tok):
 
37
  return tok.text in _S_TOKENS and tok.upos == "PART"
38
 
39
 
40
  def lcp(a: str, b: str) -> str:
 
41
  i = 0
42
  while i < min(len(a), len(b)) and a[i].lower() == b[i].lower():
43
  i += 1
 
45
 
46
 
47
  def strip_doubling(lemma: str, suf: str) -> str:
 
48
  if suf and len(suf) >= 2 and suf[0] == lemma[-1]:
49
  cand = suf[1:]
50
  if any(cand in v for v in _EXPECTED_SUFFIXES.values()):
 
53
 
54
 
55
  def get_suffix(lemma: str, surface: str) -> str:
 
56
  return strip_doubling(lemma, surface[len(lcp(lemma, surface)):])
57
 
58
 
59
  def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
 
60
  if raw_suf in expected_set:
61
  return raw_suf
62
  if lemma.lower().endswith("y") and raw_suf.startswith("i"):
 
65
  return alt
66
  return None
67
 
 
 
 
68
 
69
  def extract_inflectional_morphemes(text: str):
 
70
  doc = nlp(text)
71
  results = []
72
 
 
79
  feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}
80
  low_txt = surf.lower()
81
 
 
 
 
82
  if is_possessive_candidate(w) and i > 0:
83
  prev = words[i - 1]
84
+
 
85
  if prev.upos in {"NOUN", "PROPN"}:
86
  results.append({
87
  "word": prev.text + surf,
 
90
  "inflectional_morpheme": "Possessive",
91
  "morpheme_form": "'/s",
92
  })
93
+ else:
94
  results.append({
95
  "word": prev.text + surf,
96
  "lemma": prev.lemma,
 
101
  i += 1
102
  continue
103
 
104
+
 
 
105
  if low_txt in _CONTRACTION_PARTICLES and i > 0:
106
  prev = words[i - 1]
107
  results.append({
 
114
  i += 1
115
  continue
116
 
117
+
 
 
118
  if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
119
  low_lem, low_surf = lem.lower(), surf.lower()
120
  suf = get_suffix(low_lem, low_surf)
 
129
  i += 1
130
  continue
131
 
132
+
 
 
133
  inflect_type = None
134
  if pos == "NOUN" and feats.get("Number") == "Plur":
135
  inflect_type = "Plural"
 
163
 
164
  return results
165
 
 
 
 
166
 
167
  def annotate_morpheme(session_id, base_dir="session_data"):
 
168
  base_dir = base_dir or os.getcwd()
169
  json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json")
170
 
 
174
  with open(json_file, "r", encoding="utf-8") as f:
175
  data = json.load(f)
176
 
 
177
  segments = data.get("segments", data) if isinstance(data, dict) else data
178
 
179
  for seg in segments:
 
183
  with open(json_file, "w", encoding="utf-8") as f:
184
  json.dump(data, f, ensure_ascii=False, indent=2)
185