# SATE / morpheme.py
# Author: Shuwei Hou
# Feature branch: morpheme_annotation_name (revision 6a467c4)
import os
import json
import stanza
# -----------------------------------------------------------------------------
# Stanza Pipeline (English, tokenize + POS + lemma)
# -----------------------------------------------------------------------------
# We initialise this **once** at import‑time so that every subsequent call to
# `annotate_morpheme()` re‑uses the same pipeline (avoids re‑loading models).
# -----------------------------------------------------------------------------
# Single shared pipeline instance (module-level singleton). Constructing a
# stanza.Pipeline loads the English neural models, so this is done exactly
# once at import time rather than per call.
nlp = stanza.Pipeline(
    lang="en",
    processors="tokenize,pos,lemma",
    # Let Stanza perform its own sentence splitting and tokenization on the
    # raw text (input is plain strings, not pre-tokenized word lists).
    tokenize_pretokenized=False,
)
# -----------------------------------------------------------------------------
# Canonical suffix sets for *inflectional* morphemes (unchanged)
# -----------------------------------------------------------------------------
# Canonical "regular" surface suffixes per inflection type. A detected suffix
# that cannot be reconciled with this table is reported as "<IRR>" downstream.
# NOTE(review): "Gerund" is defined here but never assigned as an
# inflect_type by the extractor below — confirm whether it is still needed.
_EXPECTED_SUFFIXES = {
    "Plural": {"s", "es"},
    "Possessive": {"'s", "s"},
    "Comparative": {"er"},
    "Superlative": {"est"},
    "3rd Person Singular": {"s", "es"},
    "Past Tense": {"ed"},
    "Past Participle": {"ed", "en", "n"},
    "Progressive": {"ing"},
    "Gerund": {"ing"},
}
# -----------------------------------------------------------------------------
# *New* : Contraction particles (clitics)
# -----------------------------------------------------------------------------
# Mapping from particle → canonical meaning. These are added as a *new* type
# "Contraction" in the output list.
# -----------------------------------------------------------------------------
# Clitic particle -> expanded meaning. Matched against the lowercased token
# text; hits are emitted with type "Contraction" in the output list.
_CONTRACTION_PARTICLES = {
    "'ll": "will",      # we'll, he'll ...
    "'d": "would/had",  # I'd, she'd ...
    "'ve": "have",      # we've, they've ...
    "'re": "are",       # you're, they're ...
    "'m": "am",         # I'm ...
    "n't": "not",       # isn't, didn't ...
    "'s": "is/has",     # what's, she's — possessive vs contraction is
                        # disambiguated in the extractor, not here
}
# Both straight (U+0027) and curly (U+2019) apostrophe spellings of 's,
# since tokenizers/transcripts may emit either form.
_S_TOKENS = {"'s", "’s"}
# -----------------------------------------------------------------------------
# Helper functions
# -----------------------------------------------------------------------------
def is_possessive_candidate(tok):
    """Return True if token text is 's / ’s and UD tag == PART."""
    # Cheap tag check first, then the two accepted apostrophe spellings.
    if tok.upos != "PART":
        return False
    return tok.text in _S_TOKENS
def lcp(a: str, b: str) -> str:
    """Longest common prefix of *a* and *b*, compared case-insensitively.

    The returned prefix is sliced from *a*, so it keeps *a*'s casing.
    """
    matched = 0
    for ch_a, ch_b in zip(a, b):
        if ch_a.lower() != ch_b.lower():
            break
        matched += 1
    return a[:matched]
def strip_doubling(lemma: str, suf: str) -> str:
    """Remove a doubled final consonant from *suf* (stop + p + ing -> ing).

    When the first character of *suf* repeats the last character of *lemma*
    and the remainder is a known inflectional suffix, the duplicated
    consonant is dropped; otherwise *suf* is returned unchanged.
    """
    # Guard both strings: the original indexed lemma[-1] unconditionally,
    # which raised IndexError for an empty lemma with a multi-char suffix.
    if not lemma or len(suf) < 2:
        return suf
    if suf[0] != lemma[-1]:
        return suf
    cand = suf[1:]
    # Only strip when the remainder is a recognised suffix spelling.
    if any(cand in variants for variants in _EXPECTED_SUFFIXES.values()):
        return cand
    return suf
def get_suffix(lemma: str, surface: str) -> str:
    """Return raw suffix after common prefix is stripped and doubling handled."""
    shared = lcp(lemma, surface)
    remainder = surface[len(shared):]
    return strip_doubling(lemma, remainder)
def normalize_suffix(lemma: str, raw_suf: str, expected_set: set) -> str | None:
"""Bring irregular spelling variants back to canonical form (e.g. ies → s)."""
if raw_suf in expected_set:
return raw_suf
if lemma.lower().endswith("y") and raw_suf.startswith("i"):
alt = raw_suf[1:]
if alt in expected_set:
return alt
return None
# -----------------------------------------------------------------------------
# Core extractor
# -----------------------------------------------------------------------------
def extract_inflectional_morphemes(text: str) -> list[dict]:
    """Return list of inflectional & contraction morpheme annotations for *text*.

    Runs the shared Stanza pipeline over *text* and scans each sentence's
    words in order. Each annotation is a dict with keys:
      word                 -- surface form (clitics are re-joined to the host,
                              e.g. "she" + "'s" -> "she's")
      lemma                -- Stanza lemma of the annotated word
      index                -- word position *within its sentence* (restarts at
                              0 every sentence, so it is not unique across the
                              whole text)
      inflectional_morpheme -- one of the _EXPECTED_SUFFIXES keys,
                              "Possessive", or "Contraction"
      morpheme_form        -- "/<suffix>", "'/s", the clitic itself, or
                              "<IRR>" for irregular forms
    """
    doc = nlp(text)
    results = []
    for sent in doc.sentences:
        words = sent.words
        i = 0
        while i < len(words):
            w = words[i]
            surf, lem, pos = w.text, w.lemma, w.upos
            # Parse Stanza's "Key=Val|Key=Val" feature string into a dict;
            # w.feats may be None, hence the `or ""` fallback.
            feats = {k: v for k, v in (f.split("=", 1) for f in (w.feats or "").split("|") if "=" in f)}
            low_txt = surf.lower()
            # -----------------------------------------------------------------
            # 1) 's : Disambiguate Possessive vs Contraction
            #    (needs i > 0 because the clitic attaches to the previous word)
            # -----------------------------------------------------------------
            if is_possessive_candidate(w) and i > 0:
                prev = words[i - 1]
                # If the previous token is a NOUN/PROPN we *assume* possessive,
                # otherwise treat it as a contraction for *is/has*.
                if prev.upos in {"NOUN", "PROPN"}:
                    results.append({
                        "word": prev.text + surf,
                        "lemma": prev.lemma,
                        "index": i - 1,  # annotation is attached to the host word
                        "inflectional_morpheme": "Possessive",
                        "morpheme_form": "'/s",
                    })
                else:  # Contraction: what’s / she’s / it’s …
                    results.append({
                        "word": prev.text + surf,
                        "lemma": prev.lemma,
                        "index": i - 1,
                        "inflectional_morpheme": "Contraction",
                        "morpheme_form": "'/s",
                    })
                i += 1
                continue
            # -----------------------------------------------------------------
            # 2) Other contraction particles ( 'll, 're, 'm, 've, 'd, n't )
            # -----------------------------------------------------------------
            if low_txt in _CONTRACTION_PARTICLES and i > 0:
                prev = words[i - 1]
                results.append({
                    "word": prev.text + surf,
                    "lemma": prev.lemma,
                    "index": i - 1,
                    "inflectional_morpheme": "Contraction",
                    "morpheme_form": low_txt,
                })
                i += 1
                continue
            # -----------------------------------------------------------------
            # 3) Possessive pronouns / determiners (his, yours …)
            # -----------------------------------------------------------------
            if feats.get("Poss") == "Yes" and pos in {"PRON", "DET"}:
                low_lem, low_surf = lem.lower(), surf.lower()
                suf = get_suffix(low_lem, low_surf)
                # "/s" only when lemma + suffix rebuilds the surface exactly
                # (e.g. your + s == yours); anything else is irregular (his, my).
                morpheme_form = "/s" if suf in {"s", "es"} and low_lem + suf == low_surf else "<IRR>"
                results.append({
                    "word": surf,
                    "lemma": lem,
                    "index": i,
                    "inflectional_morpheme": "Possessive",
                    "morpheme_form": morpheme_form,
                })
                i += 1
                continue
            # -----------------------------------------------------------------
            # 4) Standard inflectional endings (plural, tense, degree …)
            #    classified from UD morphological features (+ xpos fallback
            #    for participles, where Tense is sometimes absent)
            # -----------------------------------------------------------------
            inflect_type = None
            if pos == "NOUN" and feats.get("Number") == "Plur":
                inflect_type = "Plural"
            elif pos == "ADJ" and feats.get("Degree") == "Cmp":
                inflect_type = "Comparative"
            elif pos == "ADJ" and feats.get("Degree") == "Sup":
                inflect_type = "Superlative"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Pres" and feats.get("Person") == "3":
                inflect_type = "3rd Person Singular"
            elif pos == "VERB" and feats.get("VerbForm") == "Fin" and feats.get("Tense") == "Past":
                inflect_type = "Past Tense"
            elif pos == "VERB" and feats.get("VerbForm") == "Part":
                if feats.get("Tense") == "Past" or w.xpos == "VBN":
                    inflect_type = "Past Participle"
                elif feats.get("Tense") == "Pres" or w.xpos == "VBG":
                    inflect_type = "Progressive"
            if inflect_type:
                # NOTE(review): Stanza can return lemma=None for some tokens;
                # get_suffix would then fail — confirm upstream guarantees.
                raw_suffix = get_suffix(lem, low_txt)
                canon = normalize_suffix(lem, raw_suffix, _EXPECTED_SUFFIXES[inflect_type])
                morpheme_form = f"/{canon}" if canon else "<IRR>"
                results.append({
                    "word": surf,
                    "lemma": lem,
                    "index": i,
                    "inflectional_morpheme": inflect_type,
                    "morpheme_form": morpheme_form,
                })
            i += 1
    return results
# -----------------------------------------------------------------------------
# Pipeline entry‑point used by main_socket / other modules
# -----------------------------------------------------------------------------
def annotate_morpheme(session_id, base_dir="session_data"):
    """Annotate `{session_id}_transcriptionCW.json` with morpheme information.

    Reads the transcription JSON produced by the transcription step, attaches
    a "morphemes" list to every segment, and rewrites the file in place.

    Parameters
    ----------
    session_id : str
        Identifier used both as the per-session directory name and the file
        name prefix.
    base_dir : str, optional
        Root directory containing per-session folders; a falsy value falls
        back to the current working directory.

    Raises
    ------
    FileNotFoundError
        If the expected transcription JSON does not exist.
    """
    base_dir = base_dir or os.getcwd()
    # Join path components individually instead of embedding "/" in an
    # f-string so the path is correct on every OS.
    json_file = os.path.join(base_dir, session_id, f"{session_id}_transcriptionCW.json")
    if not os.path.exists(json_file):
        raise FileNotFoundError(f"{json_file} not found – make sure transcription step ran first.")
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)
    # Support both list-of-segments and {"segments": [...]} formats. `segments`
    # aliases the structure inside `data`, so mutating segments updates `data`.
    segments = data.get("segments", data) if isinstance(data, dict) else data
    for seg in segments:
        # Missing/empty "text" yields an empty annotation list.
        seg["morphemes"] = extract_inflectional_morphemes(seg.get("text", ""))
    with open(json_file, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
# Example usage inside main_socket.py
# -----------------------------------------------------------------------------
# from morpheme import annotate_morpheme
# def handle_session(session_id: str):
# ... # other processing steps
# annotate_morpheme(session_id, base_dir=session_data_dir)
# ... # return/serve updated JSON