#!/usr/bin/env python
# coding: utf-8

# ### Experiment 019-4
# - Model: Qwen/Qwen3-Embedding-8B

import os

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, make_scorer, classification_report
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import time
import pickle
import numpy as np
import pandas as pd
import torch
from torch import Tensor
from transformers import AutoModel, AutoTokenizer
from transformers.utils import is_flash_attn_2_available
import wandb
from wandb import AlertLevel

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
os.environ["WANDB_PROJECT"] = "GermEval2025-Substask1"
os.environ["WANDB_LOG_MODEL"] = "false"

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
    print("CUDA not available, using CPU")

experiment_name = "exp019-4"
testing_mode = False

# Load data
comments = pd.read_csv("../../share-GermEval2025-data/Data/training data/comments.csv")
task1 = pd.read_csv("../../share-GermEval2025-data/Data/training data/task1.csv")
comments = comments.merge(task1, on=["document", "comment_id"])

# Remove duplicates
df = comments.drop_duplicates(subset=['comment', 'flausch'])
df.reset_index(drop=True, inplace=True)

# Use only a small subset for testing
if testing_mode:
    os.environ["WANDB_MODE"] = "offline"
    testing_mode_sample_size = 1000
    df = df.sample(n=testing_mode_sample_size, random_state=42).reset_index(drop=True)
    print(f"Testing mode: using only {testing_mode_sample_size} samples for quick testing.")


def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])
    if left_padding:
        return last_hidden_states[:, -1]
    else:
        sequence_lengths = attention_mask.sum(dim=1) - 1
        batch_size = last_hidden_states.shape[0]
        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]
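# A minimal sketch (not part of the original experiment) illustrating what
# last_token_pool does, using tiny dummy tensors rather than model output.
# With right padding it picks each sequence's last *real* hidden state; with
# pure left padding, position -1 already is the last real token. All _demo_*
# names below are illustrative.
_demo_hidden = torch.arange(12, dtype=torch.float32).reshape(2, 3, 2)  # (batch=2, seq=3, dim=2)
_demo_mask = torch.tensor([[1, 1, 0],   # row 0: two real tokens, right-padded
                           [1, 1, 1]])  # row 1: three real tokens
# Selects _demo_hidden[0, 1] and _demo_hidden[1, 2], i.e. [[2., 3.], [10., 11.]]
print("last_token_pool demo:", last_token_pool(_demo_hidden, _demo_mask))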
class Qwen3Embedder:
    def __init__(self, model_name='Qwen/Qwen3-Embedding-8B', instruction=None, max_length=1024):
        if instruction is None:
            instruction = 'Classify a given comment as either flausch (a positive, supportive expression) or non-flausch.'
        self.instruction = instruction

        if is_flash_attn_2_available():
            self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True,
                                                   attn_implementation="flash_attention_2",
                                                   torch_dtype=torch.float16)
        else:
            self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True,
                                                   torch_dtype=torch.float16)
        self.model = self.model.cuda()
        self.model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side='left')
        self.max_length = max_length

    def get_detailed_instruct(self, query: str) -> str:
        return f'Instruct: {self.instruction}\nQuery:{query}'

    def encode_batch(self, texts, batch_size=32):
        """Encode texts in batches to handle memory efficiently."""
        all_embeddings = []
        for i in range(0, len(texts), batch_size):
            batch_texts = [self.get_detailed_instruct(comment) for comment in texts[i:i + batch_size]]

            # Tokenize batch
            inputs = self.tokenizer(
                batch_texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors='pt'
            ).to(device)

            # Get embeddings
            with torch.no_grad():
                outputs = self.model(**inputs)

            # Last-token pooling (not mean pooling)
            embeddings = last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])
            #embeddings = embeddings.float()
            all_embeddings.append(embeddings.cpu().numpy())

        # Normalize embeddings (should I?)
        #import torch.nn.functional as F
        #output = F.normalize(all_embeddings, p=2, dim=1)

        return np.vstack(all_embeddings)


# Initialize embedder
print("Loading Qwen3-Embedding-8B...")
embedder = Qwen3Embedder(instruction='Classify a given comment as either flausch (a positive, supportive expression) or non-flausch')

X, y = df["comment"], df["flausch"].map(dict(yes=1, no=0))

# Load embeddings if they exist
embeddings_file = f'{"testing_" if testing_mode else ""}Qwen3-Embedding-8B-{experiment_name}.npy'
if os.path.exists(embeddings_file):
    print(f"Loading existing embeddings from {embeddings_file}")
    X_embeddings = np.load(embeddings_file)
else:
    print("Embeddings not found, generating new embeddings...")
    # Encode texts in batches to avoid memory issues
    X_embeddings = embedder.encode_batch(X.tolist(), batch_size=64)
    print(f"Generated embeddings with shape: {X_embeddings.shape}")
    # Save embeddings to avoid recomputation
    np.save(embeddings_file, X_embeddings)

wandb.init(
    project=os.environ["WANDB_PROJECT"],
    dir='./wandb_logs',
    name=f"{experiment_name}",
)

# 5-fold stratified cross-validation
kf_splits = 5

pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("svm", SVC(random_state=42, cache_size=2000))
])

param_grid = [
    {   # Fitting 5 folds for each of 48 candidates, totalling 240 fits
        'svm__kernel': ['rbf'],
        'svm__C': [5, 6, 7, 8, 9, 10],
        'svm__gamma': [0.00008, 0.0001, 0.0002, 1/4096, 0.0003, 0.0004, 0.0005, 0.0006]
        # This range is chosen because Qwen3-Embedding-8B has 4096 dimensions,
        # so gamma='auto' would land at 1/4096, i.e. roughly 2.4e-4.
    },
    # {
    #     'kernel': ['poly'],
    #     'C': [0.1, 1, 10, 100],
    #     'degree': [2, 3, 4],
    #     'gamma': ['scale', 'auto', 0.001, 0.01],
    #     'coef0': [0.0, 0.1, 0.5, 1]
    # }
]

f1_pos_scorer = make_scorer(f1_score, pos_label=1, average='binary')

X_train = X_embeddings
y_train = y

# 5-fold stratified CV for the grid search
cv_inner = StratifiedKFold(n_splits=kf_splits, shuffle=True, random_state=42)

grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=cv_inner,
    scoring=f1_pos_scorer,
    n_jobs=63,
    verbose=3,
    return_train_score=True
)

grid.fit(X_train, y_train)
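# Optional sanity check (a sketch, not part of the original evaluation protocol):
# GridSearchCV refits the best pipeline on the full training set by default
# (refit=True), so best_estimator_ can be probed directly. Training-set scores
# are optimistic and only catch gross failures; the CV score below remains the
# metric that matters. `train_pred` is a new name introduced here.
train_pred = grid.best_estimator_.predict(X_train)
print("Training-set classification report (optimistic):")
print(classification_report(y_train, train_pred, target_names=["non-flausch", "flausch"]))
print("Training-set confusion matrix:")
print(confusion_matrix(y_train, train_pred))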
# Print results
print("Best F1 (pos) on CV:", grid.best_score_)
print("Best parameters:", grid.best_params_)
print("Best estimator:", grid.best_estimator_)

with open(f'scores.{experiment_name}.txt', 'a') as f:
    f.write(f'[{time.strftime("%Y-%m-%d %H:%M:%S")}] {kf_splits}Fold CV\n')
    f.write(f'[{experiment_name}] Best F1 (pos) on CV: {grid.best_score_}\n')
    f.write(f'[{experiment_name}] Best parameters: {grid.best_params_}\n')
    f.write(f'[{experiment_name}] Best estimator: {grid.best_estimator_}\n')

results = pd.DataFrame(grid.cv_results_).sort_values("rank_test_score")
print("grid.cv_results_:")
print(results)
results.to_csv(f'grid_cv_results.{experiment_name}.csv', index=False)

with open(f"grid_cv.{experiment_name}.pkl", "wb") as f:
    pickle.dump(grid, f)

print(f"GridSearchCV results saved to grid_cv_results.{experiment_name}.csv")
print(f"Training completed with {len(X_train)} samples...")
print("Experiment completed!")

wandb.alert(
    title=f'Experiment {experiment_name} finished!',
    text=f'Best F1 (pos): {grid.best_score_:.4f}\nBest Params: {grid.best_params_}',
    level=AlertLevel.INFO
)
wandb.finish()
print("Notification sent via Weights & Biases.")
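# Reload sketch (illustrative, not part of the original experiment): verify the
# pickled GridSearchCV round-trips cleanly, so it can be reloaded in a later
# session for inference on new embeddings. `reloaded_grid` is a new name
# introduced here.
with open(f"grid_cv.{experiment_name}.pkl", "rb") as fh:
    reloaded_grid = pickle.load(fh)
assert reloaded_grid.best_score_ == grid.best_score_
print("Pickle round-trip OK; reloaded best F1 (pos):", reloaded_grid.best_score_)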