{ "cells": [ { "cell_type": "markdown", "id": "fce70006-809b-4c98-b89c-00910b8bbea1", "metadata": {}, "source": [ "Implementation for blog post" ] }, { "cell_type": "code", "execution_count": 1, "id": "1eaa3a9f-0b39-4d77-91d6-f935d226ac98", "metadata": {}, "outputs": [], "source": [ "import math\n", "import pickle\n", "import os\n", "import time\n", "import matplotlib.pyplot as plt\n", "from tabulate import tabulate\n", "\n", "from transformers import pipeline\n", "import json\n", "import pandas as pd\n", "\n", "# Get candidate labels\n", "with open(\"packing_label_structure.json\", \"r\") as file:\n", " candidate_labels = json.load(file)\n", "keys_list = list(candidate_labels.keys())\n", "\n", "# Load test data (list of dictionaries)\n", "# with open(\"test_data.json\", \"r\") as file:\n", "# packing_data = json.load(file)\n", "# Extract trip descriptions and classification (trip_types)\n", "# trip_descriptions = [trip['description'] for trip in packing_data]\n", "# trip_types = [trip['trip_types'] for trip in packing_data]\n" ] }, { "cell_type": "code", "execution_count": 2, "id": "bb1bc7ed-227e-4c0b-b769-ead4daf01c57", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", " activity_type :\n", "\t hut trek (summer)\n", "\t hut trek (winter)\n", "\t camping trip (wild camping)\n", "\t camping trip (campground)\n", "\t ski tour / skitour\n", "\t snowboard / splitboard trip\n", "\t long-distance hike / thru-hike\n", "\t digital nomad trip\n", "\t city trip\n", "\t road trip (car/camper)\n", "\t festival trip\n", "\t yoga / wellness retreat\n", "\t micro-adventure / weekend trip\n", "\t beach vacation\n", "\t cultural exploration\n", "\t nature escape\n", "\n", " activities :\n", "\t swimming\n", "\t going to the beach\n", "\t relaxing\n", "\t sightseeing\n", "\t biking\n", "\t running\n", "\t skiing\n", "\t cross-country skiing\n", "\t ski touring\n", "\t hiking\n", "\t hut-to-hut hiking\n", "\t rock climbing\n", "\t ice climbing\n", "\t snowshoe hiking\n", "\t kayaking / canoeing\n", "\t stand-up paddleboarding (SUP)\n", "\t snorkeling\n", "\t scuba diving\n", "\t surfing\n", "\t paragliding\n", "\t horseback riding\n", "\t photography\n", "\t fishing\n", "\t rafting\n", "\t yoga\n", "\n", " climate_or_season :\n", "\t cold destination / winter\n", "\t warm destination / summer\n", "\t variable weather / spring / autumn\n", "\t tropical / humid\n", "\t dry / desert-like\n", "\t rainy climate\n", "\n", " style_or_comfort :\n", "\t ultralight\n", "\t lightweight (but comfortable)\n", "\t luxury (including evening wear)\n", "\t minimalist\n", "\n", " dress_code :\n", "\t casual\n", "\t formal (business trip)\n", "\t conservative\n", "\n", " accommodation :\n", "\t indoor\n", "\t huts with half board\n", "\t sleeping in a tent\n", "\t sleeping in a car\n", "\n", " transportation :\n", "\t own vehicle\n", "\t no own vehicle\n", "\n", " special_conditions :\n", "\t off-grid / no electricity\n", "\t self-supported (bring your own cooking gear)\n", "\t travel with children\n", "\t pet-friendly\n", "\t snow and ice\n", "\t high alpine terrain\n", "\t snow, ice and avalanche-prone terrain\n", "\t no special conditions to consider\n", "\n", " trip_length_days :\n", "\t 1 day\n", "\t 2 days\n", "\t 3 days\n", "\t 4 days\n", "\t 5 days\n", "\t 6 days\n", "\t 7 days\n", "\t 7+ days\n" ] } ], "source": [ "for key in candidate_labels:\n", " print(\"\\n\", key, \":\")\n", " for item in candidate_labels[key]:\n", " print(\"\\t\", item)\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "4b3a1bcb-3450-4128-b941-952f145baf99", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " Label Score\n", "0 beach vacation 0.376311\n", "1 micro-adventure / weekend trip 0.350168\n", "2 nature escape 0.133974\n", "3 digital nomad trip 0.031636\n", "4 cultural exploration 0.031271\n", "5 yoga / wellness retreat 0.012846\n", "6 festival trip 0.012700\n", "7 long-distance hike / thru-hike 0.009527\n", "8 hut trek (summer) 0.008148\n", "9 city trip 0.007793\n", "10 road trip (car/camper) 0.006512\n", "11 ski tour / skitour 0.005670\n", "12 camping trip (campground) 0.004448\n", "13 snowboard / splitboard trip 0.004113\n", "14 camping trip (wild camping) 0.002714\n", "15 hut trek (winter) 0.002170\n" ] } ], "source": [ "key = keys_list[0]\n", "model_name = \"facebook/bart-large-mnli\"\n", "trip_descr = \"I am planning a trip to Greece with my boyfriend, where we will visit two islands. We have booked an apartment on each island for a few days and plan to spend most of our time relaxing. Our main goals are to enjoy the beach, try delicious local food, and possibly go on a hike—if it’s not too hot. We will be relying solely on public transport. We’re in our late 20s and traveling from the Netherlands.\"\n", "classifier = pipeline(\"zero-shot-classification\", model = model_name)\n", "result = classifier(trip_descr, candidate_labels[keys_list[0]])\n", "# Create DataFrame\n", "df = pd.DataFrame({\n", " \"Label\": result[\"labels\"],\n", " \"Score\": result[\"scores\"]\n", "})\n", "print(df)" ] }, { "cell_type": "code", "execution_count": 4, "id": "04208f9e-59bb-408b-92c6-941d064bf43d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "beach vacation\n" ] } ], "source": [ "# the labels are sorted by score. We choose the first one as our best guess for a class label\n", "class_label = result[\"labels\"][0]\n", "print(class_label)" ] }, { "cell_type": "code", "execution_count": 5, "id": "9f5f1c45-b411-4de1-a0a6-a7ecde5d8eae", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " Label Score\n", "0 going to the beach 0.991486\n", "1 relaxing 0.977136\n", "2 hiking 0.942628\n", "3 swimming 0.219020\n", "4 sightseeing 0.175862\n", "5 running 0.098545\n", "6 hut-to-hut hiking 0.083704\n", "7 biking 0.036792\n", "8 photography 0.036690\n", "9 surfing 0.030993\n", "10 stand-up paddleboarding (SUP) 0.025300\n", "11 snorkeling 0.021451\n", "12 yoga 0.011070\n", "13 kayaking / canoeing 0.007511\n", "14 rock climbing 0.006307\n", "15 fishing 0.003497\n", "16 paragliding 0.002656\n", "17 rafting 0.001970\n", "18 horseback riding 0.001560\n", "19 snowshoe hiking 0.001528\n", "20 cross-country skiing 0.001502\n", "21 ice climbing 0.001434\n", "22 skiing 0.001169\n", "23 scuba diving 0.000789\n", "24 ski touring 0.000491\n", "['going to the beach', 'relaxing', 'hiking']\n" ] } ], "source": [ "# we do this for each superclass and receive a list of class labels for our trip. We did do things differently for activities\n", "cut_off = 0.5\n", "result_activ = classifier(trip_descr, candidate_labels[\"activities\"], multi_label=True)\n", "indices = [i for i, score in enumerate(result_activ['scores']) if score > cut_off]\n", "classes = [result_activ['labels'][i] for i in indices]\n", "\n", "df = pd.DataFrame({\n", " \"Label\": result_activ[\"labels\"],\n", " \"Score\": result_activ[\"scores\"]\n", "})\n", "print(df)\n", "print(classes)" ] }, { "cell_type": "code", "execution_count": 6, "id": "3a7287c2-78f0-4a53-af72-1bc0f62da36f", "metadata": {}, "outputs": [], "source": [ "# doing this for all superclasses, depending on local machine this might take a while\n", "def pred_trip(model_name, trip_descr, cut_off = 0.5):\n", " \"\"\"\n", " Classifies trip\n", " \n", " Parameters:\n", " model_name: name of hugging-face model\n", " trip_descr: text describing the trip\n", " cut_off: cut_off for choosing activities\n", "\n", " Returns:\n", " pd Dataframe: with class predictions and true values\n", " \"\"\"\n", " \n", " classifier = pipeline(\"zero-shot-classification\", model=model_name)\n", " df = pd.DataFrame(columns=['superclass', 'pred_class'])\n", " for i, key in enumerate(keys_list):\n", " # print(f\"\\rProcessing {i + 1}/{len(keys_list)}\", end=\"\", flush=True)\n", " if key == 'activities':\n", " result = classifier(trip_descr, candidate_labels[key], multi_label=True)\n", " indices = [i for i, score in enumerate(result['scores']) if score > cut_off]\n", " classes = [result['labels'][i] for i in indices]\n", " else:\n", " result = classifier(trip_descr, candidate_labels[key])\n", " classes = result[\"labels\"][0]\n", " df.loc[i] = [key, classes]\n", " return df" ] }, { "cell_type": "code", "execution_count": 7, "id": "43481d4c-039a-4a37-bd6d-dfe638bf9732", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ " superclass pred_class\n", "0 activity_type beach vacation\n", "1 activities [going to the beach, relaxing, hiking]\n", "2 climate_or_season warm destination / summer\n", "3 style_or_comfort minimalist\n", "4 dress_code casual\n", "5 accommodation huts with half board\n", "6 transportation no own vehicle\n", "7 special_conditions off-grid / no electricity\n", "8 trip_length_days 7+ days\n" ] } ], "source": [ "result = pred_trip(model_name, trip_descr, cut_off = 0.5)\n", "print(result)" ] }, { "cell_type": "markdown", "id": "c4799d6b-6ab5-42da-a992-afe3666d0015", "metadata": {}, "source": [ "Now use gradio app" ] }, { "cell_type": "code", "execution_count": 9, "id": "35e14ac8-4445-4586-a115-081cf1ef2686", "metadata": {}, "outputs": [], "source": [ "# Prerequisites\n", "from transformers import pipeline\n", "import json\n", "import pandas as pd\n", "import gradio as gr" ] }, { "cell_type": "code", "execution_count": 10, "id": "8eefd4cc-c375-4cc0-956b-472b36bafdb7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Running on local URL: http://127.0.0.1:7860\n", "\n", "To create a public link, set `share=True` in `launch()`.\n" ] }, { "data": { "text/html": [ "
" ], "text/plain": [ "