{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": [ "## Experiment 019-4\n", "\n", "SVM mit RBF Kernel, C=5 und Gamma=0.0002" ], "id": "8d9679176b5367c7" }, { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-06-23T18:30:56.081332Z", "start_time": "2025-06-23T18:30:55.935044Z" } }, "source": [ "import os\n", "from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, make_scorer, classification_report\n", "from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.svm import SVC\n", "import time\n", "import pickle\n", "import numpy as np\n", "import pandas as pd\n", "import torch\n", "from torch import Tensor\n", "from transformers import AutoModel, AutoTokenizer\n", "from transformers.utils import is_flash_attn_2_available\n", "import wandb\n", "from wandb import AlertLevel\n", "\n", "\n", "# Environment: silence TF logging, pin the run to the second GPU, configure W&B\n", "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'\n", "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = '1'\n", "os.environ[\"WANDB_PROJECT\"] = \"GermEval2025-Substask1\"\n", "os.environ[\"WANDB_LOG_MODEL\"] = \"false\"\n", "\n", "# Fall back to CPU so the notebook still runs on a machine without CUDA\n", "if torch.cuda.is_available():\n", "    device = torch.device('cuda')\n", "else:\n", "    device = torch.device('cpu')\n", "    print(\"CUDA not available, using CPU\")\n", "\n", "experiment_name = \"exp019-4\"\n", "\n", "testing_mode = False\n", "\n", "# Load data: comment texts plus task-1 labels, joined on (document, comment_id)\n", "comments = pd.read_csv(\"./share-GermEval2025-data/Data/training data/comments.csv\")\n", "task1 = pd.read_csv(\"./share-GermEval2025-data/Data/training data/task1.csv\")\n", "comments = comments.merge(task1, on=[\"document\", \"comment_id\"])\n", "\n", "# Remove duplicate (comment, label) pairs. Chaining instead of an inplace\n", "# reset_index avoids mutating a derived frame and keeps the cell idempotent.\n", "df = comments.drop_duplicates(subset=['comment', 'flausch']).reset_index(drop=True)" ], "outputs": [], "execution_count": 2 }, {
"metadata": {}, "cell_type": "code", "source": [ "def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:\n", "    \"\"\"Pool each sequence's hidden states to one vector by taking the\n", "    hidden state of the last real (non-padding) token.\"\"\"\n", "    # With left padding every sequence ends at position -1, so the last\n", "    # attention-mask column is all ones.\n", "    left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])\n", "    if left_padding:\n", "        return last_hidden_states[:, -1]\n", "    else:\n", "        # Right padding: index each row at its own last attended position.\n", "        sequence_lengths = attention_mask.sum(dim=1) - 1\n", "        batch_size = last_hidden_states.shape[0]\n", "        return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]\n", "\n", "class Qwen3Embedder:\n", "    \"\"\"Wrapper around a Qwen3 embedding model: tokenizes instruction-prefixed\n", "    comments and returns last-token-pooled embeddings as a numpy array.\"\"\"\n", "\n", "    def __init__(self, model_name='Qwen/Qwen3-Embedding-8B', instruction=None, max_length=1024):\n", "        if instruction is None:\n", "            instruction = 'Classify a given comment as either flausch (a positive, supportive expression) or non-flausch.'\n", "        self.instruction = instruction\n", "\n", "        # Prefer flash attention when the kernel is installed; fp16 either way.\n", "        if is_flash_attn_2_available():\n", "            self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)\n", "        else:\n", "            self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)\n", "\n", "        # Move to the globally selected device instead of an unconditional\n", "        # .cuda(): inputs below are sent to `device`, so a hard-coded .cuda()\n", "        # would crash on the CPU-only fallback configured in the first cell.\n", "        self.model = self.model.to(device)\n", "        self.model.eval()\n", "\n", "        # Left padding so last-token pooling can use position -1.\n", "        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side='left')\n", "        self.max_length = max_length\n", "\n", "    def get_detailed_instruct(self, query: str) -> str:\n", "        # NOTE(review): Qwen's reference template uses 'Query: ' (with a space);\n", "        # kept byte-identical here so cached embeddings stay reproducible.\n", "        return f'Instruct: {self.instruction}\\nQuery:{query}'\n", "\n", "    def encode_batch(self, texts, batch_size=32):\n", "        \"\"\"Encode texts in batches to handle memory efficiently.\n", "\n", "        Returns an array of shape (len(texts), hidden_size).\"\"\"\n", "        all_embeddings = []\n", "\n", "        for i in range(0, len(texts), batch_size):\n", "            batch_texts = [self.get_detailed_instruct(comment) for comment in texts[i:i + batch_size]]\n", "\n", "            # Tokenize batch\n", "            inputs = self.tokenizer(\n", "                batch_texts,\n", "                padding=True,\n", "                truncation=True,\n", "                max_length=self.max_length,\n", "                return_tensors='pt'\n", "            ).to(device)\n", "\n", "            # Get embeddings\n", "            with torch.no_grad():\n", "                outputs = self.model(**inputs)\n", "                # Last-token pooling (not mean pooling; see last_token_pool)\n", "                embeddings = last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])\n", "\n", "            all_embeddings.append(embeddings.cpu().numpy())\n", "\n", "        # NOTE(review): embeddings are deliberately not L2-normalized; the\n", "        # StandardScaler in the pipeline below rescales each feature anyway.\n", "        return np.vstack(all_embeddings)\n", "\n", "# Initialize embedder\n", "print(\"Loading Qwen3 Embeddings v3...\")\n", "embedder = Qwen3Embedder(instruction='Classify a given comment as either flausch (a positive, supportive expression) or non-flausch')\n", "\n", "X, y = df[\"comment\"], df[\"flausch\"].map(dict(yes=1, no=0))\n", "\n", "# Load cached embeddings if they exist (encoding ~9k comments is expensive)\n", "embeddings_file = f'Qwen3-Embedding-8B-{experiment_name}.npy'\n", "if os.path.exists(embeddings_file):\n", "    print(f\"Loading existing embeddings from {embeddings_file}\")\n", "    X_embeddings = np.load(embeddings_file)\n", "else:\n", "    print(\"Embeddings not found, generating new embeddings...\")\n", "    # Encode texts in batches to avoid memory issues\n", "    X_embeddings = embedder.encode_batch(X.tolist(), batch_size=64)\n", "    print(f\"Generated embeddings with shape: {X_embeddings.shape}\")\n", "\n", "    # save embeddings to avoid recomputation\n", "    np.save(embeddings_file, X_embeddings)\n", "\n", "# Scale features, then RBF-kernel SVM with this experiment's fixed C / gamma\n", "pipe = Pipeline([\n", "    (\"scaler\", StandardScaler()),\n", "    (\"svm\", SVC(random_state=42, C=5, gamma=0.0002, cache_size=2000))\n", "])\n", "\n", "# F1 on the positive (flausch) class; kept for use by later evaluation cells\n", "f1_pos_scorer = make_scorer(f1_score, pos_label=1, average='binary')\n", "\n", "X_train = X_embeddings\n", "y_train = y\n", "\n", "pipe.fit(X_train, y_train)" ], "id": "59ef5a54cb69530f", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-23T18:30:59.602524Z", "start_time": "2025-06-23T18:30:59.570290Z" } }, "cell_type": "code", "source": [ "test_data: pd.DataFrame = 
pd.read_csv(\"./share-GermEval2025-data/Data/test data/comments.csv\")\n", "test_data" ], "id": "a842bfa29d59c84b", "outputs": [ { "data": { "text/plain": [ " document comment_id comment\n", "0 NDY-004 1 Lol i love lochis\n", "1 NDY-004 2 ihr singt voll gut :)\n", "2 NDY-004 3 Junge fick dich\n", "3 NDY-004 4 Ihr seit die besten\n", "4 NDY-004 5 ihr seit die ALLER besten ich finde euch soooo...\n", "... ... ... ...\n", "9224 NDY-203 522 hihi kannst du mich grΓΌΓen π π π Achso wusstes...\n", "9225 NDY-203 523 #Glocke aktiviert π Ich liebe deine Videos π π...\n", "9226 NDY-203 524 Bist die beste β€ Bitte GrΓΌΓe mich π β€ π π\n", "9227 NDY-203 525 Hi Bonny β€οΈ War letztens auf'm Flughafen , und...\n", "9228 NDY-203 526 du bist die beste ich bin neu ich hab dich sof...\n", "\n", "[9229 rows x 3 columns]" ], "text/html": [ "
| \n", " | document | \n", "comment_id | \n", "comment | \n", "
|---|---|---|---|
| 0 | \n", "NDY-004 | \n", "1 | \n", "Lol i love lochis | \n", "
| 1 | \n", "NDY-004 | \n", "2 | \n", "ihr singt voll gut :) | \n", "
| 2 | \n", "NDY-004 | \n", "3 | \n", "Junge fick dich | \n", "
| 3 | \n", "NDY-004 | \n", "4 | \n", "Ihr seit die besten | \n", "
| 4 | \n", "NDY-004 | \n", "5 | \n", "ihr seit die ALLER besten ich finde euch soooo... | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 9224 | \n", "NDY-203 | \n", "522 | \n", "hihi kannst du mich grΓΌΓen π π π Achso wusstes... | \n", "
| 9225 | \n", "NDY-203 | \n", "523 | \n", "#Glocke aktiviert π Ich liebe deine Videos π π... | \n", "
| 9226 | \n", "NDY-203 | \n", "524 | \n", "Bist die beste β€ Bitte GrΓΌΓe mich π β€ π π | \n", "
| 9227 | \n", "NDY-203 | \n", "525 | \n", "Hi Bonny β€οΈ War letztens auf'm Flughafen , und... | \n", "
| 9228 | \n", "NDY-203 | \n", "526 | \n", "du bist die beste ich bin neu ich hab dich sof... | \n", "
9229 rows Γ 3 columns
\n", "| \n", " | document | \n", "comment_id | \n", "comment | \n", "flausch | \n", "
|---|---|---|---|---|
| 0 | \n", "NDY-004 | \n", "1 | \n", "Lol i love lochis | \n", "no | \n", "
| 1 | \n", "NDY-004 | \n", "2 | \n", "ihr singt voll gut :) | \n", "yes | \n", "
| 2 | \n", "NDY-004 | \n", "3 | \n", "Junge fick dich | \n", "no | \n", "
| 3 | \n", "NDY-004 | \n", "4 | \n", "Ihr seit die besten | \n", "yes | \n", "
| 4 | \n", "NDY-004 | \n", "5 | \n", "ihr seit die ALLER besten ich finde euch soooo... | \n", "yes | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
| 9224 | \n", "NDY-203 | \n", "522 | \n", "hihi kannst du mich grΓΌΓen π π π Achso wusstes... | \n", "no | \n", "
| 9225 | \n", "NDY-203 | \n", "523 | \n", "#Glocke aktiviert π Ich liebe deine Videos π π... | \n", "yes | \n", "
| 9226 | \n", "NDY-203 | \n", "524 | \n", "Bist die beste β€ Bitte GrΓΌΓe mich π β€ π π | \n", "yes | \n", "
| 9227 | \n", "NDY-203 | \n", "525 | \n", "Hi Bonny β€οΈ War letztens auf'm Flughafen , und... | \n", "yes | \n", "
| 9228 | \n", "NDY-203 | \n", "526 | \n", "du bist die beste ich bin neu ich hab dich sof... | \n", "yes | \n", "
9229 rows Γ 4 columns
\n", "| \n", " | document | \n", "comment_id | \n", "flausch | \n", "
|---|---|---|---|
| 0 | \n", "NDY-004 | \n", "1 | \n", "no | \n", "
| 1 | \n", "NDY-004 | \n", "2 | \n", "yes | \n", "
| 2 | \n", "NDY-004 | \n", "3 | \n", "no | \n", "
| 3 | \n", "NDY-004 | \n", "4 | \n", "yes | \n", "
| 4 | \n", "NDY-004 | \n", "5 | \n", "yes | \n", "
| ... | \n", "... | \n", "... | \n", "... | \n", "
| 9224 | \n", "NDY-203 | \n", "522 | \n", "no | \n", "
| 9225 | \n", "NDY-203 | \n", "523 | \n", "yes | \n", "
| 9226 | \n", "NDY-203 | \n", "524 | \n", "yes | \n", "
| 9227 | \n", "NDY-203 | \n", "525 | \n", "yes | \n", "
| 9228 | \n", "NDY-203 | \n", "526 | \n", "yes | \n", "
9229 rows Γ 3 columns
\n", "