Spaces:

inflaton-ai
/

logical-reasoning

Build error

App Files Files Community

nicoleathy committed on Jul 15, 2024

Commit

d071d21

verified ·

1 Parent(s): 387046f

Delete competition/Gemma-2-9b.ipynb

Browse files

Files changed (1) hide show

competition/Gemma-2-9b.ipynb +0 -132

competition/Gemma-2-9b.ipynb DELETED Viewed

@@ -1,132 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer\n",
-    "from datasets import Dataset\n",
-    "import pandas as pd\n",
-    "from sklearn.model_selection import train_test_split\n",
-    "\n",
-    "# Source CSV and the text -> class-id table used to encode the labels\n",
-    "file_path = 'train_en.csv'\n",
-    "label_mapping = {\n",
-    "    \"Yes\": 0,\n",
-    "    \"No\": 1,\n",
-    "    \"It doesn't matter\": 2,\n",
-    "    \"Unimportant\": 2,  # treated as the same class as \"It doesn't matter\"\n",
-    "    \"Incorrect questioning\": 3,\n",
-    "    \"Correct answers\": 4\n",
-    "}\n",
-    "\n",
-    "# Read the CSV, encode labels as ids, discard rows whose label is missing or\n",
-    "# not in the table (they map to NaN), then store the ids as integers\n",
-    "dataset = (\n",
-    "    pd.read_csv(file_path)\n",
-    "    .assign(label=lambda frame: frame['label'].map(label_mapping))\n",
-    "    .dropna(subset=['label'])\n",
-    "    .astype({'label': int})\n",
-    ")\n",
-    "\n",
-    "# Hold out 20% of the rows for validation (fixed seed for a repeatable split)\n",
-    "train_df, val_df = train_test_split(dataset, test_size=0.2, random_state=42)\n",
-    "\n",
-    "# Wrap each pandas split in a datasets.Dataset\n",
-    "train_dataset, val_dataset = (Dataset.from_pandas(split_df) for split_df in (train_df, val_df))\n",
-    "\n",
-    "# Pretrained tokenizer plus a sequence-classification model with five output classes\n",
-    "model_name = \"google/gemma-2-9b\"\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_name)\n",
-    "model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)\n",
-    "\n",
-    "# Tokenize the data\n",
-    "def tokenize_function(examples):\n",
-    "    \"\"\"Tokenize a batch's 'text' entries, truncated and padded to exactly 128 tokens.\"\"\"\n",
-    "    encode_options = dict(truncation=True, padding='max_length', max_length=128)\n",
-    "    return tokenizer(examples['text'], **encode_options)\n",
-    "\n",
-    "# Tokenize both splits\n",
-    "train_dataset = train_dataset.map(tokenize_function, batched=True)\n",
-    "val_dataset = val_dataset.map(tokenize_function, batched=True)\n",
-    "\n",
-    "# Expose only the model inputs and the label, as PyTorch tensors\n",
-    "for split in (train_dataset, val_dataset):\n",
-    "    split.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])\n",
-    "\n",
-    "# Define training arguments.\n",
-    "# `eval_strategy` is the current name of the keyword formerly spelled\n",
-    "# `evaluation_strategy`, which transformers has deprecated. Gemma 2 support\n",
-    "# landed after the rename (presumably 4.42 vs 4.41 -- TODO confirm), so any\n",
-    "# transformers release able to load this model accepts the new name.\n",
-    "training_args = TrainingArguments(\n",
-    "    output_dir='./results',\n",
-    "    eval_strategy='epoch',  # run evaluation at the end of every epoch\n",
-    "    learning_rate=2e-5,\n",
-    "    per_device_train_batch_size=8,\n",
-    "    per_device_eval_batch_size=8,\n",
-    "    num_train_epochs=3,\n",
-    "    weight_decay=0.01,\n",
-    ")\n",
-    "\n",
-    "# Initialize the Trainer with the training and validation splits\n",
-    "trainer = Trainer(\n",
-    "    model=model,\n",
-    "    args=training_args,\n",
-    "    train_dataset=train_dataset,\n",
-    "    eval_dataset=val_dataset,\n",
-    ")\n",
-    "\n",
-    "# Train the model\n",
-    "trainer.train()\n",
-    "\n",
-    "# Save the fine-tuned model and its tokenizer side by side so they can be reloaded together\n",
-    "model.save_pretrained('trained_gemma_model')\n",
-    "tokenizer.save_pretrained('trained_gemma_model')\n",
-    "\n",
-    "# Evaluate on the validation split; the returned metrics are the cell's output\n",
-    "trainer.evaluate()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Reload the tokenizer and model from the directory they were saved to\n",
-    "checkpoint_dir = 'trained_gemma_model'\n",
-    "tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)\n",
-    "model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)\n",
-    "\n",
-    "# Function to make predictions\n",
-    "def predict(texts, batch_size=8):\n",
-    "    \"\"\"Return the predicted class id for every string in `texts`.\n",
-    "\n",
-    "    The texts are tokenized and scored `batch_size` at a time with gradient\n",
-    "    tracking disabled, so memory use is bounded by one batch instead of growing\n",
-    "    with the size of the whole dataset.\n",
-    "\n",
-    "    Args:\n",
-    "        texts: list of input strings.\n",
-    "        batch_size: number of texts sent through the model per forward pass.\n",
-    "\n",
-    "    Returns:\n",
-    "        A list of int class ids, one per input text, in input order. An empty\n",
-    "        `texts` yields an empty list.\n",
-    "    \"\"\"\n",
-    "    import torch  # local import: the notebook's import cell does not bring in torch\n",
-    "\n",
-    "    model.eval()  # make sure dropout is off even if the model was left in training mode\n",
-    "    predictions = []\n",
-    "    with torch.no_grad():  # inference only: do not build an autograd graph\n",
-    "        for start in range(0, len(texts), batch_size):\n",
-    "            batch = texts[start:start + batch_size]\n",
-    "            inputs = tokenizer(batch, return_tensors=\"pt\", truncation=True, padding='max_length', max_length=128)\n",
-    "            inputs = inputs.to(model.device)  # keep the inputs on the same device as the model\n",
-    "            outputs = model(**inputs)\n",
-    "            predictions.extend(outputs.logits.argmax(dim=-1).tolist())\n",
-    "    return predictions\n",
-    "\n",
-    "# Predict a class id for every row of the dataset\n",
-    "predicted_ids = predict(dataset['text'].tolist())\n",
-    "\n",
-    "# Invert label_mapping (class id -> response text). Two texts share id 2, so the\n",
-    "# one listed later in label_mapping (\"Unimportant\") is the text kept for that id.\n",
-    "reverse_label_mapping = dict((class_id, text) for text, class_id in label_mapping.items())\n",
-    "\n",
-    "# Store the predictions as response texts, aligned with the dataset's rows\n",
-    "dataset['predicted_label'] = pd.Series(predicted_ids, index=dataset.index).map(reverse_label_mapping)\n",
-    "\n",
-    "# Write the dataset, including the new column, to CSV without the index\n",
-    "dataset.to_csv('gemma-2-9b_predicted_results.csv', index=False)"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "base",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "name": "python",
-   "version": "3.11.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}