# scripts/create_test_set.py
import os

# --- Configuration ---
DATA_DIR = "data/processed"
TEST_DIR = "data/test_sets"
SOURCE_FILE = os.path.join(DATA_DIR, "nepali.ne")
TARGET_FILE = os.path.join(DATA_DIR, "nepali.en")
NUM_TEST_LINES = 500
# ---

print("--- Creating a held-back test set for Nepali ---")

os.makedirs(TEST_DIR, exist_ok=True)

# Read all lines from the original files
with open(SOURCE_FILE, "r", encoding="utf-8") as f:
    source_lines = f.readlines()
with open(TARGET_FILE, "r", encoding="utf-8") as f:
    target_lines = f.readlines()

# Ensure the files form a parallel corpus (same number of lines)
assert len(source_lines) == len(target_lines), "Source and target files have different lengths!"
# Guard against a corpus smaller than the requested test set; otherwise the
# negative slices below would silently produce an empty training set.
assert len(source_lines) > NUM_TEST_LINES, "Corpus has too few lines for the requested test set!"

# Split the data: the last NUM_TEST_LINES pairs become the test set
train_source_lines = source_lines[:-NUM_TEST_LINES]
test_source_lines = source_lines[-NUM_TEST_LINES:]
train_target_lines = target_lines[:-NUM_TEST_LINES]
test_target_lines = target_lines[-NUM_TEST_LINES:]

# Write the new, smaller training files (overwriting the old ones)
with open(SOURCE_FILE, "w", encoding="utf-8") as f:
    f.writelines(train_source_lines)
with open(TARGET_FILE, "w", encoding="utf-8") as f:
    f.writelines(train_target_lines)

# Write the new test files
with open(os.path.join(TEST_DIR, "test.ne"), "w", encoding="utf-8") as f:
    f.writelines(test_source_lines)
with open(os.path.join(TEST_DIR, "test.en"), "w", encoding="utf-8") as f:
    f.writelines(test_target_lines)

print(f"Successfully created a test set with {NUM_TEST_LINES} lines for Nepali.")
print(f"The original training files in '{DATA_DIR}' have been updated.")
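
# Usage note (a sketch, assuming the repo layout implied by the paths above):
# run this once from the repository root, before any tokenization or training
# step, so the held-back pairs never leak into training:
#
#   python scripts/create_test_set.py
#
# Because the script overwrites the files in DATA_DIR in place, running it a
# second time would carve another NUM_TEST_LINES pairs off the already-shrunken
# training files; it is intended as a one-shot step in the pipeline.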