{ "cells": [ { "metadata": {}, "cell_type": "markdown", "source": [ "## Experiment 019-4\n", "\n", "SVM mit RBF Kernel, C=5 und Gamma=0.0002" ], "id": "8d9679176b5367c7" }, { "cell_type": "code", "id": "initial_id", "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2025-06-23T18:30:56.081332Z", "start_time": "2025-06-23T18:30:55.935044Z" } }, "source": [ "import os\n", "from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, make_scorer, classification_report\n", "from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.svm import SVC\n", "import time\n", "import pickle\n", "import numpy as np\n", "import pandas as pd\n", "import torch\n", "from torch import Tensor\n", "from transformers import AutoModel, AutoTokenizer\n", "from transformers.utils import is_flash_attn_2_available\n", "import wandb\n", "from wandb import AlertLevel\n", "\n", "\n", "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'\n", "os.environ[\"CUDA_DEVICE_ORDER\"] = \"PCI_BUS_ID\"\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = '1'\n", "os.environ[\"WANDB_PROJECT\"] = \"GermEval2025-Substask1\"\n", "os.environ[\"WANDB_LOG_MODEL\"] = \"false\"\n", "\n", "if torch.cuda.is_available():\n", " device = torch.device('cuda')\n", "else:\n", " device = torch.device('cpu')\n", " print(\"CUDA not available, using CPU\")\n", "\n", "experiment_name = \"exp019-4\"\n", "\n", "testing_mode = False\n", "\n", "# Load data\n", "comments = pd.read_csv(\"./share-GermEval2025-data/Data/training data/comments.csv\")\n", "task1 = pd.read_csv(\"./share-GermEval2025-data/Data/training data/task1.csv\")\n", "comments = comments.merge(task1, on=[\"document\", \"comment_id\"])\n", "\n", "# Remove duplicates\n", "df = comments.drop_duplicates(subset=['comment', 'flausch'])\n", "df.reset_index(drop=True, inplace=True)" ], "outputs": [], "execution_count": 2 }, { "metadata": {}, "cell_type": "code", "source": [ "def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:\n", " left_padding = (attention_mask[:, -1].sum() == attention_mask.shape[0])\n", " if left_padding:\n", " return last_hidden_states[:, -1]\n", " else:\n", " sequence_lengths = attention_mask.sum(dim=1) - 1\n", " batch_size = last_hidden_states.shape[0]\n", " return last_hidden_states[torch.arange(batch_size, device=last_hidden_states.device), sequence_lengths]\n", "\n", "class Qwen3Embedder:\n", " def __init__(self, model_name='Qwen/Qwen3-Embedding-8B', instruction=None, max_length=1024):\n", " if instruction is None:\n", " instruction = 'Classify a given comment as either flausch (a positive, supportive expression) or non-flausch.'\n", " self.instruction = instruction\n", "\n", " if is_flash_attn_2_available():\n", " self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, attn_implementation=\"flash_attention_2\", torch_dtype=torch.float16)\n", " else:\n", " self.model = AutoModel.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)\n", "\n", " self.model = self.model.cuda()\n", " self.model.eval()\n", "\n", " self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, padding_side='left')\n", " self.max_length = max_length\n", "\n", " def get_detailed_instruct(self, query: str) -> str:\n", " return f'Instruct: {self.instruction}\\nQuery:{query}'\n", "\n", " def encode_batch(self, texts, batch_size=32):\n", " \"\"\"Encode texts in batches to handle memory efficiently\"\"\"\n", " all_embeddings = []\n", "\n", " for i in range(0, len(texts), batch_size):\n", " batch_texts = [self.get_detailed_instruct(comment) for comment in texts[i:i + batch_size]]\n", "\n", " # Tokenize batch\n", " inputs = self.tokenizer(\n", " batch_texts,\n", " padding=True,\n", " truncation=True,\n", " max_length=self.max_length,\n", " return_tensors='pt'\n", " ).to(device)\n", "\n", " # Get embeddings\n", " with torch.no_grad():\n", " outputs = self.model(**inputs)\n", " # Mean pooling\n", " embeddings = last_token_pool(outputs.last_hidden_state, inputs['attention_mask'])\n", " #embeddings = embeddings.float()\n", "\n", " all_embeddings.append(embeddings.cpu().numpy())\n", "\n", " # Normalize embeddings (sollte ich?)\n", " #import torch.nn.functional as F\n", " #output = F.normalize(all_embeddings, p=2, dim=1)\n", " return np.vstack(all_embeddings)\n", "\n", "# Initialize embedder\n", "print(\"Loading Qwen3 Embeddings v3...\")\n", "embedder = Qwen3Embedder(instruction='Classify a given comment as either flausch (a positive, supportive expression) or non-flausch')\n", "\n", "X, y = df[\"comment\"], df[\"flausch\"].map(dict(yes=1, no=0))\n", "\n", "# load embeddings if they exist\n", "embeddings_file = f'Qwen3-Embedding-8B-{experiment_name}.npy'\n", "if os.path.exists(embeddings_file):\n", " print(f\"Loading existing embeddings from {embeddings_file}\")\n", " X_embeddings = np.load(embeddings_file)\n", "else:\n", " print(\"Embeddings not found, generating new embeddings...\")\n", " # Encode texts in batches to avoid memory issues\n", " X_embeddings = embedder.encode_batch(X.tolist(), batch_size=64)\n", " print(f\"Generated embeddings with shape: {X_embeddings.shape}\")\n", "\n", " # save embeddings to avoid recomputation\n", " np.save(embeddings_file, X_embeddings)\n", "\n", "pipe = Pipeline([\n", " (\"scaler\", StandardScaler()),\n", " (\"svm\", SVC(random_state=42, C=5, gamma=0.0002, cache_size=2000))\n", "])\n", "\n", "f1_pos_scorer = make_scorer(f1_score, pos_label=1, average='binary')\n", "\n", "X_train = X_embeddings\n", "y_train = y\n", "\n", "pipe.fit(X_train, y_train)" ], "id": "59ef5a54cb69530f", "outputs": [], "execution_count": null }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-23T18:30:59.602524Z", "start_time": "2025-06-23T18:30:59.570290Z" } }, "cell_type": "code", "source": [ "test_data: pd.DataFrame = pd.read_csv(\"./share-GermEval2025-data/Data/test data/comments.csv\")\n", "test_data" ], "id": "a842bfa29d59c84b", "outputs": [ { "data": { "text/plain": [ " document comment_id comment\n", "0 NDY-004 1 Lol i love lochis\n", "1 NDY-004 2 ihr singt voll gut :)\n", "2 NDY-004 3 Junge fick dich\n", "3 NDY-004 4 Ihr seit die besten\n", "4 NDY-004 5 ihr seit die ALLER besten ich finde euch soooo...\n", "... ... ... ...\n", "9224 NDY-203 522 hihi kannst du mich grüßen πŸ’• πŸ‘‹ 😍 Achso wusstes...\n", "9225 NDY-203 523 #Glocke aktiviert πŸ‘‘ Ich liebe deine Videos πŸ’ πŸ’Ž...\n", "9226 NDY-203 524 Bist die beste ❀ Bitte Grüße mich πŸ’• ❀ 😘 😍\n", "9227 NDY-203 525 Hi Bonny ❀️ War letztens auf'm Flughafen , und...\n", "9228 NDY-203 526 du bist die beste ich bin neu ich hab dich sof...\n", "\n", "[9229 rows x 3 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
documentcomment_idcomment
0NDY-0041Lol i love lochis
1NDY-0042ihr singt voll gut :)
2NDY-0043Junge fick dich
3NDY-0044Ihr seit die besten
4NDY-0045ihr seit die ALLER besten ich finde euch soooo...
............
9224NDY-203522hihi kannst du mich grüßen πŸ’• πŸ‘‹ 😍 Achso wusstes...
9225NDY-203523#Glocke aktiviert πŸ‘‘ Ich liebe deine Videos πŸ’ πŸ’Ž...
9226NDY-203524Bist die beste ❀ Bitte Grüße mich πŸ’• ❀ 😘 😍
9227NDY-203525Hi Bonny ❀️ War letztens auf'm Flughafen , und...
9228NDY-203526du bist die beste ich bin neu ich hab dich sof...
\n", "

9229 rows Γ— 3 columns

\n", "
" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 3 }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-23T19:22:07.211246Z", "start_time": "2025-06-23T19:17:34.390901Z" } }, "cell_type": "code", "source": "X_test_data = embedder.encode_batch(test_data['comment'].tolist(), batch_size=64)", "id": "b2f18769fe09b609", "outputs": [], "execution_count": 6 }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-23T19:25:42.858436Z", "start_time": "2025-06-23T19:22:07.287233Z" } }, "cell_type": "code", "source": "y_prediction = pipe.predict(X_test_data)", "id": "3a7abacf1694b415", "outputs": [], "execution_count": 7 }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-23T19:31:30.676051Z", "start_time": "2025-06-23T19:31:30.667660Z" } }, "cell_type": "code", "source": [ "test_data['flausch'] = y_prediction\n", "test_data['flausch'] = test_data['flausch'].map({1: 'yes', 0: 'no'})\n", "test_data" ], "id": "d342aed9b9070ad4", "outputs": [ { "data": { "text/plain": [ " document comment_id comment \\\n", "0 NDY-004 1 Lol i love lochis \n", "1 NDY-004 2 ihr singt voll gut :) \n", "2 NDY-004 3 Junge fick dich \n", "3 NDY-004 4 Ihr seit die besten \n", "4 NDY-004 5 ihr seit die ALLER besten ich finde euch soooo... \n", "... ... ... ... \n", "9224 NDY-203 522 hihi kannst du mich grüßen πŸ’• πŸ‘‹ 😍 Achso wusstes... \n", "9225 NDY-203 523 #Glocke aktiviert πŸ‘‘ Ich liebe deine Videos πŸ’ πŸ’Ž... \n", "9226 NDY-203 524 Bist die beste ❀ Bitte Grüße mich πŸ’• ❀ 😘 😍 \n", "9227 NDY-203 525 Hi Bonny ❀️ War letztens auf'm Flughafen , und... \n", "9228 NDY-203 526 du bist die beste ich bin neu ich hab dich sof... \n", "\n", " flausch \n", "0 no \n", "1 yes \n", "2 no \n", "3 yes \n", "4 yes \n", "... ... \n", "9224 no \n", "9225 yes \n", "9226 yes \n", "9227 yes \n", "9228 yes \n", "\n", "[9229 rows x 4 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
documentcomment_idcommentflausch
0NDY-0041Lol i love lochisno
1NDY-0042ihr singt voll gut :)yes
2NDY-0043Junge fick dichno
3NDY-0044Ihr seit die bestenyes
4NDY-0045ihr seit die ALLER besten ich finde euch soooo...yes
...............
9224NDY-203522hihi kannst du mich grüßen πŸ’• πŸ‘‹ 😍 Achso wusstes...no
9225NDY-203523#Glocke aktiviert πŸ‘‘ Ich liebe deine Videos πŸ’ πŸ’Ž...yes
9226NDY-203524Bist die beste ❀ Bitte Grüße mich πŸ’• ❀ 😘 😍yes
9227NDY-203525Hi Bonny ❀️ War letztens auf'm Flughafen , und...yes
9228NDY-203526du bist die beste ich bin neu ich hab dich sof...yes
\n", "

9229 rows Γ— 4 columns

\n", "
" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 11 }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-23T19:33:51.519362Z", "start_time": "2025-06-23T19:33:51.512704Z" } }, "cell_type": "code", "source": "test_data[['document', 'comment_id', 'flausch']]", "id": "ac4077f355d0a379", "outputs": [ { "data": { "text/plain": [ " document comment_id flausch\n", "0 NDY-004 1 no\n", "1 NDY-004 2 yes\n", "2 NDY-004 3 no\n", "3 NDY-004 4 yes\n", "4 NDY-004 5 yes\n", "... ... ... ...\n", "9224 NDY-203 522 no\n", "9225 NDY-203 523 yes\n", "9226 NDY-203 524 yes\n", "9227 NDY-203 525 yes\n", "9228 NDY-203 526 yes\n", "\n", "[9229 rows x 3 columns]" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
documentcomment_idflausch
0NDY-0041no
1NDY-0042yes
2NDY-0043no
3NDY-0044yes
4NDY-0045yes
............
9224NDY-203522no
9225NDY-203523yes
9226NDY-203524yes
9227NDY-203525yes
9228NDY-203526yes
\n", "

9229 rows Γ— 3 columns

\n", "
" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "execution_count": 12 }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-23T19:34:57.446239Z", "start_time": "2025-06-23T19:34:57.431741Z" } }, "cell_type": "code", "source": "test_data[['document', 'comment_id', 'flausch']].to_csv(f'./submissions/subtask1_submission1.csv', index=False)", "id": "ce927f8936231813", "outputs": [], "execution_count": 16 }, { "metadata": { "ExecuteTime": { "end_time": "2025-06-23T19:37:22.875657Z", "start_time": "2025-06-23T19:37:22.653931Z" } }, "cell_type": "code", "source": "!head -n 10 './submissions/subtask1_submission1.csv'", "id": "e358ae2660d91769", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "document,comment_id,flausch\r\n", "NDY-004,1,no\r\n", "NDY-004,2,yes\r\n", "NDY-004,3,no\r\n", "NDY-004,4,yes\r\n", "NDY-004,5,yes\r\n", "NDY-004,6,yes\r\n", "NDY-004,7,no\r\n", "NDY-004,8,no\r\n", "NDY-004,9,no\r\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", "To disable this warning, you can either:\n", "\t- Avoid using `tokenizers` before the fork if possible\n", "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" ] } ], "execution_count": 19 }, { "metadata": {}, "cell_type": "code", "source": "!cp './submissions/subtask1_submission1.csv' './submissions/task1-predicted.csv'", "id": "e820c01a833df1db", "outputs": [], "execution_count": null }, { "metadata": {}, "cell_type": "markdown", "source": [ " Score fΓΌr Subtask 1:\n", "\n", " β†’ 0.88" ], "id": "c441568bcdde6462" } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 5 }