Spaces:

Sven33
/

SATE

Runtime error

App Files Files Community

Shuwei Hou committed on Jun 16

Commit

cedcb9f

1 Parent(s): 6a467c4

morpheme_annotation_name

Browse files

Files changed (1) hide show

morpheme.py +13 -64

morpheme.py CHANGED Viewed

@@ -2,22 +2,12 @@ import os
 import json
 import stanza
-# -----------------------------------------------------------------------------
-#  Stanza Pipeline (English, tokenize + POS + lemma)
-# -----------------------------------------------------------------------------
-#   We initialise this **once** at import‑time so that every subsequent call to
-#   `annotate_morpheme()` re‑uses the same pipeline (avoids re‑loading models).
-# -----------------------------------------------------------------------------
 nlp = stanza.Pipeline(
     lang="en",
     processors="tokenize,pos,lemma",
     tokenize_pretokenized=False,
 )
-# -----------------------------------------------------------------------------
-#  Canonical suffix sets for *inflectional* morphemes (unchanged)
-# -----------------------------------------------------------------------------
 _EXPECTED_SUFFIXES = {
     "Plural":              {"s", "es"},
     "Possessive":          {"'s", "s"},
@@ -30,35 +20,24 @@ _EXPECTED_SUFFIXES = {
     "Gerund":              {"ing"},
 }
-# -----------------------------------------------------------------------------
-#  *New* :  Contraction particles (clitics)
-# -----------------------------------------------------------------------------
-#  Mapping from particle → canonical meaning.  These are added as a *new* type
-#  "Contraction" in the output list.
-# -----------------------------------------------------------------------------
 _CONTRACTION_PARTICLES = {
-    "'ll": "will",          # we'll, he'll …
-    "'d":  "would/had",     # I'd, she'd …
-    "'ve": "have",          # we've, they've …
-    "'re": "are",           # you're, they're …
-    "'m":  "am",            # I'm …
-    "n't": "not",           # isn't, didn't …
-    "'s":  "is/has",        # what's, she's, Tom's(?) – see disambiguation below
 }
-_S_TOKENS = {"'s", "’s"}  # keep both apostrophe forms
-# -----------------------------------------------------------------------------
-#  Helper functions
-# -----------------------------------------------------------------------------
 def is_possessive_candidate(tok):
-    """Return True if token text is 's / ’s and UD tag == PART."""
     return tok.text in _S_TOKENS and tok.upos == "PART"
 def lcp(a: str, b: str) -> str:
-    """Longest common prefix (case‑insensitive)."""
     i = 0
     while i < min(len(a), len(b)) and a[i].lower() == b[i].lower():
         i += 1
@@ -66,7 +45,6 @@ def lcp(a: str, b: str) -> str:
 def strip_doubling(lemma: str, suf: str) -> str:
-    """Remove doubled final consonant when the suffix repeats it (stop + p + ing)."""
     if suf and len(suf) >= 2 and suf[0] == lemma[-1]:
         cand = suf[1:]
         if any(cand in v for v in _EXPECTED_SUFFIXES.values()):
@@ -75,12 +53,10 @@ def strip_doubling(lemma: str, suf: str) -> str:
 def get_suffix(lemma: str, surface: str) -> str:
-    """Return raw suffix after common prefix is stripped and doubling handled."""
     return strip_doubling(lemma, surface[len(lcp(lemma, surface)):])
 def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
-    """Bring irregular spelling variants back to canonical form (e.g. ies → s)."""
     if raw_suf in expected_set:
         return raw_suf
     if lemma.lower().endswith("y") and raw_suf.startswith("i"):
@@ -89,12 +65,8 @@ def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
             return alt
     return None
-# -----------------------------------------------------------------------------
-#  Core extractor
-# -----------------------------------------------------------------------------
 def extract_inflectional_morphemes(text: str):
-    """Return list of inflectional & contraction morpheme annotations for *text*."""
     doc = nlp(text)
     results = []
@@ -107,13 +79,9 @@ def extract_inflectional_morphemes(text: str):
             feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}
             low_txt = surf.lower()
-            # -----------------------------------------------------------------
-            # 1) 's : Disambiguate Possessive vs Contraction
-            # -----------------------------------------------------------------
             if is_possessive_candidate(w) and i > 0:
                 prev = words[i - 1]
-                # If the previous token is a NOUN/PROPN we *assume* possessive,
-                # otherwise treat it as a contraction for *is/has*.
                 if prev.upos in {"NOUN", "PROPN"}:
                     results.append({
                         "word": prev.text + surf,
@@ -122,7 +90,7 @@ def extract_inflectional_morphemes(text: str):
                         "inflectional_morpheme": "Possessive",
                         "morpheme_form": "'/s",
                     })
-                else:  # Contraction: what’s / she’s / it’s …
                     results.append({
                         "word": prev.text + surf,
                         "lemma": prev.lemma,
@@ -133,9 +101,7 @@ def extract_inflectional_morphemes(text: str):
                 i += 1
                 continue
-            # -----------------------------------------------------------------
-            # 2) Other contraction particles ( 'll, 're, 'm, 've, 'd, n't )
-            # -----------------------------------------------------------------
             if low_txt in _CONTRACTION_PARTICLES and i > 0:
                 prev = words[i - 1]
                 results.append({
@@ -148,9 +114,7 @@ def extract_inflectional_morphemes(text: str):
                 i += 1
                 continue
-            # -----------------------------------------------------------------
-            # 3) Possessive pronouns / determiners (his, yours …)
-            # -----------------------------------------------------------------
             if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
                 low_lem, low_surf = lem.lower(), surf.lower()
                 suf = get_suffix(low_lem, low_surf)
@@ -165,9 +129,7 @@ def extract_inflectional_morphemes(text: str):
                 i += 1
                 continue
-            # -----------------------------------------------------------------
-            # 4) Standard inflectional endings (plural, tense, degree …)
-            # -----------------------------------------------------------------
             inflect_type = None
             if pos == "NOUN" and feats.get("Number") == "Plur":
                 inflect_type = "Plural"
@@ -201,12 +163,8 @@ def extract_inflectional_morphemes(text: str):
     return results
-# -----------------------------------------------------------------------------
-#  Pipeline entry‑point used by main_socket / other modules
-# -----------------------------------------------------------------------------
 def annotate_morpheme(session_id, base_dir="session_data"):
-    """Annotate `{session_id}_transcriptionCW.json` with morpheme information."""
     base_dir = base_dir or os.getcwd()
     json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json")
@@ -216,7 +174,6 @@ def annotate_morpheme(session_id, base_dir="session_data"):
     with open(json_file, "r", encoding="utf-8") as f:
         data = json.load(f)
-    # Support both list‑of‑segments or {segments: [...]} formats
     segments = data.get("segments", data) if isinstance(data, dict) else data
     for seg in segments:
@@ -226,11 +183,3 @@ def annotate_morpheme(session_id, base_dir="session_data"):
     with open(json_file, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=2)
-# Example usage inside main_socket.py
-# -----------------------------------------------------------------------------
-# from morpheme import annotate_morpheme
-# def handle_session(session_id: str):
-#     ...  # other processing steps
-#     annotate_morpheme(session_id, base_dir=session_data_dir)
-#     ...  # return/serve updated JSON

 import json
 import stanza
 nlp = stanza.Pipeline(
     lang="en",
     processors="tokenize,pos,lemma",
     tokenize_pretokenized=False,
 )
 _EXPECTED_SUFFIXES = {
     "Plural":              {"s", "es"},
     "Possessive":          {"'s", "s"},
     "Gerund":              {"ing"},
 }
 _CONTRACTION_PARTICLES = {
+    "'ll": "will",          # we'll, he'll
+    "'d":  "would/had",     # I'd, she'd
+    "'ve": "have",          # we've, they've
+    "'re": "are",           # you're, they're
+    "'m":  "am",            # I'm
+    "n't": "not",           # isn't, didn't
+    "'s":  "is/has",        # what's, she's
 }
+_S_TOKENS = {"'s", "’s"}
 def is_possessive_candidate(tok):
     """Return True when *tok* may be a possessive/contracted ``'s`` marker.

     The token qualifies only if its ``text`` is a member of ``_S_TOKENS``
     (the straight- and curly-apostrophe spellings of ``'s``) and its
     ``upos`` attribute equals ``"PART"``.
     """
     return tok.text in _S_TOKENS and tok.upos == "PART"
 def lcp(a: str, b: str) -> str:
     i = 0
     while i < min(len(a), len(b)) and a[i].lower() == b[i].lower():
         i += 1
 def strip_doubling(lemma: str, suf: str) -> str:
     if suf and len(suf) >= 2 and suf[0] == lemma[-1]:
         cand = suf[1:]
         if any(cand in v for v in _EXPECTED_SUFFIXES.values()):
 def get_suffix(lemma: str, surface: str) -> str:
     """Return the trailing part of *surface* that is not shared with *lemma*.

     The first ``len(lcp(lemma, surface))`` characters of *surface* are
     sliced off and the remainder is passed through ``strip_doubling``
     together with *lemma*; that call's result is returned unchanged.
     """
     return strip_doubling(lemma, surface[len(lcp(lemma, surface)):])
 def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
     if raw_suf in expected_set:
         return raw_suf
     if lemma.lower().endswith("y") and raw_suf.startswith("i"):
             return alt
     return None
 def extract_inflectional_morphemes(text: str):
     doc = nlp(text)
     results = []
             feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}
             low_txt = surf.lower()
             if is_possessive_candidate(w) and i > 0:
                 prev = words[i - 1]
                 if prev.upos in {"NOUN", "PROPN"}:
                     results.append({
                         "word": prev.text + surf,
                         "inflectional_morpheme": "Possessive",
                         "morpheme_form": "'/s",
                     })
+                else:
                     results.append({
                         "word": prev.text + surf,
                         "lemma": prev.lemma,
                 i += 1
                 continue
             if low_txt in _CONTRACTION_PARTICLES and i > 0:
                 prev = words[i - 1]
                 results.append({
                 i += 1
                 continue
             if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
                 low_lem, low_surf = lem.lower(), surf.lower()
                 suf = get_suffix(low_lem, low_surf)
                 i += 1
                 continue
             inflect_type = None
             if pos == "NOUN" and feats.get("Number") == "Plur":
                 inflect_type = "Plural"
     return results
 def annotate_morpheme(session_id, base_dir="session_data"):
     base_dir = base_dir or os.getcwd()
     json_file = os.path.join(base_dir, f"{session_id}/{session_id}_transcriptionCW.json")
     with open(json_file, "r", encoding="utf-8") as f:
         data = json.load(f)
     segments = data.get("segments", data) if isinstance(data, dict) else data
     for seg in segments:
     with open(json_file, "w", encoding="utf-8") as f:
         json.dump(data, f, ensure_ascii=False, indent=2)