File size: 10,107 Bytes
9e39004
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6a467c4
9e39004
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
import os
import json
import stanza

# -----------------------------------------------------------------------------
#  Stanza Pipeline (English, tokenize + POS + lemma)
# -----------------------------------------------------------------------------
#   We initialise this **once** at import‑time so that every subsequent call to
#   `annotate_morpheme()` re‑uses the same pipeline (avoids re‑loading models).
# -----------------------------------------------------------------------------

# Shared module-level pipeline object; the functions below call it as `nlp(text)`.
nlp = stanza.Pipeline(
    lang="en",
    # Only the annotators the extractor reads from: token text, UPOS/XPOS/feats
    # (pos) and lemma.
    processors="tokenize,pos,lemma",
    # Input is raw text, so Stanza performs tokenization and sentence splitting.
    tokenize_pretokenized=False,
)

# -----------------------------------------------------------------------------
#  Canonical suffix sets for *inflectional* morphemes (unchanged)
# -----------------------------------------------------------------------------
# Maps each inflection label to the set of canonical (lower-case) suffix
# spellings accepted for it.  A suffix outside the label's set is reported by
# the extractor as "<IRR>".
_EXPECTED_SUFFIXES = {
    # --- nouns ---
    "Plural": {"es", "s"},
    "Possessive": {"s", "'s"},
    # --- adjectives ---
    "Comparative": {"er"},
    "Superlative": {"est"},
    # --- verbs ---
    "3rd Person Singular": {"es", "s"},
    "Past Tense": {"ed"},
    "Past Participle": {"n", "en", "ed"},
    "Progressive": {"ing"},
    "Gerund": {"ing"},
}

# -----------------------------------------------------------------------------
#  *New* :  Contraction particles (clitics)
# -----------------------------------------------------------------------------
#  Mapping from particle → canonical meaning.  These are added as a *new* type
#  "Contraction" in the output list.
# -----------------------------------------------------------------------------
# Keys are the clitic spellings (lower-case, straight apostrophe); values name
# the full word(s) the clitic stands for.
_CONTRACTION_PARTICLES = {
    # we'll, he'll ...
    "'ll": "will",
    # I'd, she'd ...
    "'d": "would/had",
    # we've, they've ...
    "'ve": "have",
    # you're, they're ...
    "'re": "are",
    # I'm ...
    "'m": "am",
    # isn't, didn't ...
    "n't": "not",
    # what's, she's, Tom's(?) - ambiguous with the possessive; the extractor
    # decides between the two readings.
    "'s": "is/has",
}

# Apostrophe-s in both its straight (') and typographic (’) spelling.
_S_TOKENS = {"’s", "'s"}

# -----------------------------------------------------------------------------
#  Helper functions
# -----------------------------------------------------------------------------

def is_possessive_candidate(tok):
    """Tell whether *tok* may be the possessive clitic.

    A token qualifies when its surface text is one of the apostrophe-s
    spellings in ``_S_TOKENS`` and its UD part of speech is ``PART``.
    """
    if tok.text not in _S_TOKENS:
        return False
    return tok.upos == "PART"


def lcp(a: str, b: str) -> str:
    """Return the leading part of *a* that it shares with *b*, ignoring case.

    The result is sliced out of *a*, so it keeps *a*'s own casing.
    """
    shared = 0
    for ch_a, ch_b in zip(a, b):
        if ch_a.lower() != ch_b.lower():
            break
        shared += 1
    return a[:shared]


def strip_doubling(lemma: str, suf: str) -> str:
    """Drop a doubled final consonant from the front of *suf*.

    Example: lemma ``stop`` and raw suffix ``ping`` (from "stopping") give
    ``ing``.  The leading character is removed only when it repeats the last
    character of *lemma* (compared case-insensitively, like ``lcp``) and the
    remainder is one of the canonical suffixes in ``_EXPECTED_SUFFIXES``.

    Args:
        lemma: Dictionary form of the word.  May be empty, in which case
            *suf* is returned unchanged.
        suf: Raw suffix left after removing the common prefix.

    Returns:
        The suffix without the doubled consonant, or *suf* unchanged when the
        doubling pattern does not apply.
    """
    # Nothing to compare against (empty lemma) or too short to hold
    # "doubled consonant + suffix".
    if not lemma or not suf or len(suf) < 2:
        return suf
    if suf[0].lower() != lemma[-1].lower():
        return suf
    cand = suf[1:]
    if any(cand in forms for forms in _EXPECTED_SUFFIXES.values()):
        return cand
    return suf


def get_suffix(lemma: str, surface: str) -> str:
    """Return what remains of *surface* after its common prefix with *lemma*.

    The shared prefix is found with ``lcp`` and the remainder is passed
    through ``strip_doubling`` before being returned.
    """
    shared_len = len(lcp(lemma, surface))
    remainder = surface[shared_len:]
    return strip_doubling(lemma, remainder)


def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
    """Bring irregular spelling variants back to canonical form (e.g. ies → s)."""
    if raw_suf in expected_set:
        return raw_suf
    if lemma.lower().endswith("y") and raw_suf.startswith("i"):
        alt = raw_suf[1:]
        if alt in expected_set:
            return alt
    return None

# -----------------------------------------------------------------------------
#  Core extractor
# -----------------------------------------------------------------------------

def _parse_feats(feats):
    """Parse a UD feature string such as "Number=Plur|Person=3" into a dict.

    ``None`` or an empty string yields ``{}``; items without "=" are ignored.
    """
    items = (feats or "").split("|")
    return dict(item.split("=", 1) for item in items if "=" in item)


def _classify_inflection(pos, xpos, feats):
    """Return the ``_EXPECTED_SUFFIXES`` key for a word, or None if not inflected."""
    if pos == "NOUN":
        return "Plural" if feats.get("Number") == "Plur" else None
    if pos == "ADJ":
        return {"Cmp": "Comparative", "Sup": "Superlative"}.get(feats.get("Degree"))
    if pos != "VERB":
        return None

    verb_form, tense = feats.get("VerbForm"), feats.get("Tense")
    if verb_form == "Fin":
        # A form explicitly marked Number=Plur cannot be 3rd person *singular*.
        if (
            tense == "Pres"
            and feats.get("Person") == "3"
            and feats.get("Number") != "Plur"
        ):
            return "3rd Person Singular"
        if tense == "Past":
            return "Past Tense"
        return None
    if verb_form == "Part":
        if tense == "Past" or xpos == "VBN":
            return "Past Participle"
        if tense == "Pres" or xpos == "VBG":
            return "Progressive"
        return None
    if verb_form == "Ger":
        # The suffix table defines "Gerund"; without this branch such tokens
        # were silently skipped.
        return "Gerund"
    return None


def extract_inflectional_morphemes(text: str):
    """Return inflectional and contraction morpheme annotations for *text*.

    The text is run through the module-level ``nlp`` pipeline and every word
    of every sentence is checked, in this order, for:

    1. apostrophe-s tagged ``PART`` (possessive after NOUN/PROPN, otherwise
       contraction),
    2. other contraction particles listed in ``_CONTRACTION_PARTICLES``
       (straight or typographic apostrophe),
    3. possessive pronouns / determiners (``Poss=Yes``),
    4. regular inflectional endings (plural, tense, degree, -ing forms).

    Args:
        text: Raw text to analyse.

    Returns:
        A list of dicts with the keys ``word``, ``lemma``, ``index``,
        ``inflectional_morpheme`` and ``morpheme_form``.  ``index`` is the
        position of the word inside its sentence; for clitics it is the
        position of the host word (the preceding token).  ``morpheme_form``
        is ``"<IRR>"`` when the ending matches no canonical suffix.
    """
    doc = nlp(text)
    results = []

    for sent in doc.sentences:
        words = sent.words
        for i, w in enumerate(words):
            surf, pos = w.text, w.upos
            # Fall back to the surface form so a missing lemma cannot crash
            # the string helpers below.
            lem = w.lemma or surf
            low_txt = surf.lower()
            low_lem = lem.lower()
            # Contraction lookup key with the typographic apostrophe folded
            # to the straight one used in _CONTRACTION_PARTICLES.
            particle = low_txt.replace("\u2019", "'")
            prev = words[i - 1] if i > 0 else None

            # 1) 's : possessive vs. contraction, decided by the host's UPOS.
            if prev is not None and is_possessive_candidate(w):
                kind = "Possessive" if prev.upos in {"NOUN", "PROPN"} else "Contraction"
                results.append({
                    "word": prev.text + surf,
                    "lemma": prev.lemma,
                    "index": i - 1,
                    "inflectional_morpheme": kind,
                    "morpheme_form": "'/s",
                })
                continue

            # 2) Other contraction particles ('ll, 're, 'm, 've, 'd, n't, 's).
            if prev is not None and particle in _CONTRACTION_PARTICLES:
                results.append({
                    "word": prev.text + surf,
                    "lemma": prev.lemma,
                    "index": i - 1,
                    "inflectional_morpheme": "Contraction",
                    "morpheme_form": particle,
                })
                continue

            feats = _parse_feats(w.feats)

            # 3) Possessive pronouns / determiners (his, yours ...).
            if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
                suf = get_suffix(low_lem, low_txt)
                regular = suf in {"s", "es"} and low_lem + suf == low_txt
                results.append({
                    "word": surf,
                    "lemma": lem,
                    "index": i,
                    "inflectional_morpheme": "Possessive",
                    "morpheme_form": "/s" if regular else "<IRR>",
                })
                continue

            # 4) Standard inflectional endings.
            inflect_type = _classify_inflection(pos, w.xpos, feats)
            if inflect_type is None:
                continue

            expected = _EXPECTED_SUFFIXES[inflect_type]
            raw_suffix = get_suffix(lem, low_txt)
            canon = normalize_suffix(lem, raw_suffix, expected)
            # Silent-e lemmas (like -> liked, nice -> nicer) leave only
            # "d"/"r"/"st" after prefix stripping.  Restore the "e", but only
            # when the surface really is lemma + suffix, so that irregulars
            # such as have -> had stay "<IRR>".
            if (
                canon is None
                and low_lem.endswith("e")
                and low_txt == low_lem + raw_suffix
                and "e" + raw_suffix in expected
            ):
                canon = "e" + raw_suffix

            results.append({
                "word": surf,
                "lemma": lem,
                "index": i,
                "inflectional_morpheme": inflect_type,
                "morpheme_form": f"/{canon}" if canon else "<IRR>",
            })

    return results

# -----------------------------------------------------------------------------
#  Pipeline entry‑point used by main_socket / other modules
# -----------------------------------------------------------------------------

def annotate_morpheme(session_id, base_dir="session_data"):
    """Annotate `{session_id}_transcriptionCW.json` with morpheme information.

    The file ``<base_dir>/<session_id>/<session_id>_transcriptionCW.json`` is
    read, every segment receives a ``"morphemes"`` list produced by
    ``extract_inflectional_morphemes`` from its ``"text"``, and the file is
    rewritten in place.

    Args:
        session_id: Session identifier used for the directory and file name.
        base_dir: Directory containing the per-session folders.  A falsy
            value means the current working directory.

    Raises:
        FileNotFoundError: The transcription file does not exist.
        ValueError: The JSON is neither a list of segments nor a dict with a
            ``"segments"`` list.
    """
    base_dir = base_dir or os.getcwd()
    session = str(session_id)
    json_file = os.path.join(base_dir, session, f"{session}_transcriptionCW.json")

    try:
        with open(json_file, "r", encoding="utf-8") as f:
            data = json.load(f)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            f"{json_file} not found – make sure transcription step ran first."
        ) from err

    # Support both list-of-segments and {"segments": [...]} formats.  A dict
    # without "segments" used to be iterated key by key and failed with an
    # unrelated AttributeError; reject it explicitly instead.
    segments = data.get("segments") if isinstance(data, dict) else data
    if not isinstance(segments, list):
        raise ValueError(
            f"{json_file} must contain a list of segments or an object "
            f"with a 'segments' list."
        )

    for seg in segments:
        text = seg.get("text")
        # Missing, null or blank text has nothing to analyse; skip the
        # pipeline call and record an empty annotation list.
        has_text = isinstance(text, str) and text.strip()
        seg["morphemes"] = extract_inflectional_morphemes(text) if has_text else []

    # Serialise before opening for writing so a serialisation error cannot
    # leave a truncated transcription file behind.
    payload = json.dumps(data, ensure_ascii=False, indent=2)
    with open(json_file, "w", encoding="utf-8") as f:
        f.write(payload)


# Example usage inside main_socket.py
# -----------------------------------------------------------------------------
# from morpheme import annotate_morpheme
# def handle_session(session_id: str):
#     ...  # other processing steps
#     annotate_morpheme(session_id, base_dir=session_data_dir)
#     ...  # return/serve updated JSON