| import os | |
| import json | |
| def annotate_repetitions(session_id, base_dir="session_data"): | |
| session_dir = os.path.join(base_dir, session_id) | |
| json_file = os.path.join(session_dir, f"{session_id}_transcriptionCW.json") | |
| if not os.path.exists(json_file): | |
| print(f"Error: could not find {json_file}") | |
| return | |
| with open(json_file, "r", encoding="utf-8") as f: | |
| data = json.load(f) | |
| segments = data.get("segments", []) | |
| for segment in segments: | |
| if "repetitions" in segment: | |
| del segment["repetitions"] | |
| words_list = segment.get("words", []) | |
| tokens = [w.get("word", "") for w in words_list] | |
| reps = [] | |
| i = 0 | |
| n = len(tokens) | |
| while i < n: | |
| found = False | |
| maxL = (n - i) // 2 | |
| for L in range(maxL, 0, -1): | |
| if tokens[i:i+L] == tokens[i+L:i+2*L]: | |
| count = 2 | |
| while i + count * L <= n and tokens[i:i+L] == tokens[i+(count-1)*L:i+count*L]: | |
| count += 1 | |
| count -= 1 | |
| rep_count = count - 1 | |
| rep_obj = { | |
| "content": " ".join(tokens[i:i+L] * rep_count), | |
| "words": list(range(i, i + rep_count * L)), | |
| "mark_location": i + rep_count * L - 1 | |
| } | |
| reps.append(rep_obj) | |
| i += count * L | |
| found = True | |
| break | |
| if not found: | |
| i += 1 | |
| segment["repetitions"] = reps | |
| with open(json_file, "w", encoding="utf-8") as f: | |
| json.dump(data, f, ensure_ascii=False, indent=4) | |
| print(f"Session {session_id} repetition annotation done: {json_file}") | |
| return data | |
| if __name__ == "__main__": | |
| annotate_repetitions("000030") | |