File size: 11,242 Bytes
f8e3778
 
 
 
 
 
 
 
d9dfc08
 
 
 
 
f8e3778
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
# -*- coding: utf-8 -*-
"""
IT Support Chatbot Application
- Converts the original Colab notebook into a deployable Gradio app.
- Loads data from a local CSV file.
- Uses environment variables for API keys.
- Implements a RAG pipeline with LLaMA 3.1, Qdrant, and Hybrid Retrieval.
"""
# --- CELL 0: load secrets from env vars ---
# `os` must be imported here: this cell runs before the main import block
# further down, and without it the os.getenv calls raise NameError at startup.
import os

# Each value is None when the corresponding environment variable is unset.
QDRANT_HOST = os.getenv("QDRANT_HOST")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
HF_TOKEN = os.getenv("HF_TOKEN")


# --- CELL 1: Imports, Logging & Reproducibility ---
import os
import random
import logging
import numpy as np
import torch
import nest_asyncio
import pandas as pd
import gradio as gr
from typing import List

# Llama-Index & Transformers
from llama_index.core import (
    SimpleDirectoryReader, VectorStoreIndex, StorageContext,
    PromptTemplate, Settings, QueryBundle, Document
)
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.retrievers import BaseRetriever
from llama_index.retrievers.bm25 import BM25Retriever
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core.node_parser import SentenceSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, BitsAndBytesConfig
from huggingface_hub import login
import qdrant_client

# Logging: timestamped, INFO level and above
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s %(levelname)s: %(message)s',
)
logger = logging.getLogger(__name__)

# nest_asyncio is applied for notebook-like environments
nest_asyncio.apply()

# Reproducibility: seed Python, NumPy and PyTorch with the same value
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)


# --- CELL 2: Environment & Qdrant Connection Setup ---

# Fail fast when any of the required secrets is unset or empty.
if not (QDRANT_HOST and QDRANT_API_KEY and HF_TOKEN):
    raise EnvironmentError(
        "Please set QDRANT_HOST, QDRANT_API_KEY, and HF_TOKEN environment variables."
    )

# Authenticate against the Hugging Face Hub
login(token=HF_TOKEN)

# Name of the Qdrant collection used by the vector store
COLLECTION_NAME = "it_support_rag"

# Qdrant client (gRPC not preferred)
qdrant = qdrant_client.QdrantClient(
    api_key=QDRANT_API_KEY,
    url=QDRANT_HOST,
    prefer_grpc=False,
)


# --- CELL 3: Load Dataset & Build Documents ---
# Load data from a local CSV file.
# Make sure this CSV file is in the same directory as app.py when deploying.
CSV_PATH = "data.csv"  # Or whatever you name your CSV file
if not os.path.exists(CSV_PATH):
    raise FileNotFoundError(
        f"The data file was not found at {CSV_PATH}. "
        "Please upload your data CSV and name it correctly."
    )

df = pd.read_csv(CSV_PATH, encoding="ISO-8859-1")


def _cell_text(row, column, limit=None):
    """Return ``row[column]`` as a string, optionally truncated to ``limit``.

    Missing columns and empty cells (which pandas reads as NaN) yield "" rather
    than the literal string "nan" that ``str(float("nan"))`` would produce.
    """
    value = row.get(column, "")
    if pd.isna(value):
        return ""
    text = str(value)
    return text if limit is None else text[:limit]


case_docs: List[Document] = []
skipped_rows = 0
for _, row in df.iterrows():
    text = _cell_text(row, "text_chunk")
    if not text.strip():
        # A row without text has nothing to retrieve; don't index it.
        skipped_rows += 1
        continue
    # Metadata values are truncated to keep per-document metadata small.
    meta = {
        "source_dataset": _cell_text(row, "source_dataset", 50),
        "category": _cell_text(row, "category", 100),
        "orig_query": _cell_text(row, "original_query", 200),
        "orig_solution": _cell_text(row, "original_solution", 200),
    }
    case_docs.append(Document(text=text, metadata=meta))
logger.info(f"Loaded {len(case_docs)} documents from {CSV_PATH}.")
if skipped_rows:
    logger.warning("Skipped %d rows with an empty text_chunk.", skipped_rows)


# --- CELL 4: Create Vector Index ---
# Embedding model: placed on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
logger.info(f"Using device: {device}")
embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-large-en-v1.5",
    device=device
)

# Node parser for chunking. The same parser instance is reused further down
# to produce the nodes handed to the BM25 retriever.
node_parser = SentenceSplitter(
    chunk_size=1024,
    chunk_overlap=100,
    paragraph_separator="\n\n"
)

# Qdrant-backed vector store bound to COLLECTION_NAME on the shared client.
vector_store = QdrantVectorStore(
    client=qdrant,
    collection_name=COLLECTION_NAME,
    prefer_grpc=False
)

# Build the index from every loaded document.
# NOTE(review): from_documents is called unconditionally -- nothing in this
# block checks whether the collection already exists, so this presumably
# re-embeds and re-uploads all documents on every start. Confirm, and consider
# reusing an existing collection instead.
# NOTE(review): verify that from_documents honours the `node_parser` keyword in
# the installed llama-index version (it may expect `transformations` instead).
# This step can be slow, especially the first time it's run.
logger.info("Initializing VectorStoreIndex...")
index = VectorStoreIndex.from_documents(
    documents=case_docs,
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    embed_model=embed_model,
    node_parser=node_parser,
    show_progress=True
)
logger.info("VectorStoreIndex initialized successfully.")


# --- CELL 5: Define Hybrid Retriever & Reranker ---
# No LlamaIndex-managed LLM is configured: replies are generated by the
# Hugging Face text-generation pipeline built later in this file.
Settings.llm = None

class HybridRetriever(BaseRetriever):
    """Retriever that merges the hits of a dense and a BM25 retriever.

    Both underlying retrievers are queried with the same bundle; their hits
    are concatenated (dense first) and de-duplicated by node id.
    """

    def __init__(self, dense, bm25):
        super().__init__()
        self.dense = dense
        self.bm25 = bm25

    def _retrieve(self, query_bundle: QueryBundle) -> List[Document]:
        """Return the combined hits, keeping the first hit seen for each node id."""
        first_hit_by_id = {}
        merged_hits = [
            *self.dense.retrieve(query_bundle),
            *self.bm25.retrieve(query_bundle),
        ]
        for candidate in merged_hits:
            # setdefault keeps the earliest hit; dict order preserves ranking order
            first_hit_by_id.setdefault(candidate.node.node_id, candidate)
        return list(first_hit_by_id.values())

# Imported here so this cell is self-contained.
from llama_index.core.query_engine import RetrieverQueryEngine

# Instantiate retrievers
dense_retriever = index.as_retriever(similarity_top_k=10)
# BM25 works on the chunked nodes produced by the same node parser.
bm25_nodes = node_parser.get_nodes_from_documents(case_docs)
bm25_retriever = BM25Retriever.from_defaults(
    nodes=bm25_nodes,
    similarity_top_k=10,
)
hybrid_retriever = HybridRetriever(dense=dense_retriever, bm25=bm25_retriever)

# Cross-encoder reranker: keeps the 4 best candidates after hybrid retrieval.
reranker = SentenceTransformerRerank(
    model="cross-encoder/ms-marco-MiniLM-L-2-v2",
    top_n=4,
    device=device
)

# Build the engine directly around the hybrid retriever.
# index.as_query_engine() always creates its own retriever from the index and
# forwards the remaining keyword arguments, so a `retriever=` keyword passed to
# it does not plug in a custom retriever.
query_engine = RetrieverQueryEngine.from_args(
    retriever=hybrid_retriever,
    node_postprocessors=[reranker],
    llm=None,
)


# --- CELL 6: Load & Quantize LLaMA Model ---
# 4-bit NF4 quantization with double quantization; compute dtype is bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16
)

MODEL_ID = "meta-llama/Llama-3.1-8B-Instruct"
logger.info(f"Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
# The model is loaded with the quantization config above and device_map="auto".
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=quant_config,
    device_map="auto"
)
logger.info("Model loaded successfully.")

# Text-generation pipeline wrapping the already-loaded model and tokenizer;
# chat() calls it to produce replies.
# NOTE(review): device_map="auto" is passed again here although the model was
# already loaded with it -- confirm whether it is needed.
generator = pipeline(
    task="text-generation",
    model=llm,
    tokenizer=tokenizer,
    device_map="auto"
)


# --- CELL 7: Chat Logic and Prompting ---
# System instruction: a space-joined intro followed by four numbered steps,
# one per line.
SYSTEM_PROMPT = "\n".join([
    " ".join([
        "You are a friendly and helpful Level 0 IT Support Assistant.",
        "Use a conversational tone and guide users step-by-step.",
        "If the user's question lacks details or clarity, ask a concise follow-up question",
        "to gather the information you need before providing a solution.",
        "Once clarified, then:",
    ]),
    "1. Diagnose the problem.",
    "2. Provide step-by-step solutions with bullet points.",
    "3. Offer additional recommendations or safety warnings.",
    "4. End with a polite closing.",
])

# Chat-markup fragments: one header per role, plus the end-of-turn marker.
HDR = {
    key: f"<|start_header_id|>{role}<|end_header_id|>"
    for key, role in (("sys", "system"), ("usr", "user"), ("ast", "assistant"))
}
HDR["eot"] = "<|eot_id|>"

# Conversation memory: a list of (user message, assistant reply) pairs.
chat_history = list()

# Messages that, on their own, are answered with a canned greeting.
GREETINGS = {"hello", "hi", "hey"} | {
    f"good {part_of_day}" for part_of_day in ("morning", "afternoon", "evening")
}

def format_history(history):
    """Render past turns as one string of user/assistant chat markup.

    Args:
        history: Iterable of ``(user_message, assistant_reply)`` pairs.

    Returns:
        The concatenation, in order, of each turn wrapped in the user and
        assistant headers and end-of-turn markers from ``HDR``; "" when
        ``history`` is empty.
    """
    user_hdr, assistant_hdr, end_of_turn = HDR['usr'], HDR['ast'], HDR['eot']
    rendered_turns = []
    for user_msg, assistant_msg in history:
        rendered_turns.append(
            f"{user_hdr}\n{user_msg}{end_of_turn}"
            f"{assistant_hdr}\n{assistant_msg}{end_of_turn}"
        )
    return "".join(rendered_turns)

def build_prompt(query, context, history):
    """Classify the query and, for real questions, assemble the model prompt.

    Args:
        query: The user's message.
        context: Retrieved nodes; each must expose a ``text`` attribute.
        history: List of ``(user, assistant)`` pairs; only the last three
            turns are included in the prompt.

    Returns:
        A ``(payload, mode)`` pair:
        * ``(None, "greeting")`` when the message is just a greeting,
        * ``(clarification text, "clarify")`` when it has fewer than 3 words,
        * ``(prompt string, "rag")`` otherwise.
    """
    # Pure greeting: nothing to build.
    if query.lower().strip() in GREETINGS:
        return None, "greeting"

    # Too short to act on: hand back a request for more detail.
    if len(query.split()) < 3:
        clarification = (
            "Could you provide more detail about what you're experiencing? "
            "Any error messages or steps you've tried will help me assist you."
        )
        return clarification, "clarify"

    if context:
        context_str = "\n---\n".join(node.text for node in context)
    else:
        context_str = "No context provided."

    # System turn, recent history, then the user turn carrying context and
    # question, ending with an open assistant header for the model to continue.
    sections = [
        "<|begin_of_text|>",
        f"{HDR['sys']}\n{SYSTEM_PROMPT}{HDR['eot']}",
        format_history(history[-3:]),
        f"{HDR['usr']}\nContext:\n{context_str}\n\nQuestion: {query}{HDR['eot']}",
        f"{HDR['ast']}\n",
    ]
    return "".join(sections), "rag"

def chat(query, temperature=0.7, top_p=0.9):
    """Answer one user message and record the turn in the global chat history.

    Greetings and very short messages get a canned reply without touching the
    retriever or the model. Everything else is answered from retrieved
    context: the query engine supplies the source nodes, a prompt is built
    from them, and the text-generation pipeline produces the reply.

    Args:
        query: The user's message.
        temperature: Sampling temperature. Values <= 0 switch to greedy
            decoding, because sampling requires a strictly positive value.
        top_p: Nucleus-sampling threshold (used only when sampling).

    Returns:
        A ``(reply, context_nodes)`` tuple on every path, so callers can
        always unpack two values. ``context_nodes`` is an empty list for
        greeting and clarification replies.
    """
    global chat_history
    # First pass with no context, only to classify the message.
    prompt, mode = build_prompt(query, [], chat_history)

    if mode == "greeting":
        reply = "Hello there! How can I help with your IT support question today?"
        chat_history.append((query, reply))
        return reply, []

    if mode == "clarify":
        # In this mode build_prompt returns the clarification text itself.
        reply = prompt
        chat_history.append((query, reply))
        return reply, []

    # Only the retrieved (and post-processed) source nodes are used from the
    # query engine response; the answer text comes from our own pipeline.
    response = query_engine.query(query)
    context_nodes = response.source_nodes

    prompt, _ = build_prompt(query, context_nodes, chat_history)

    gen_args = {
        "max_new_tokens": 350,
        "eos_token_id": tokenizer.eos_token_id
    }
    if temperature and temperature > 0:
        gen_args.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # temperature == 0 is not valid for sampling; decode greedily instead.
        gen_args["do_sample"] = False

    output = generator(prompt, **gen_args)
    text = output[0]["generated_text"]
    # The generated text includes the prompt; keep what follows the last
    # assistant header.
    answer = text.split(HDR["ast"])[-1].strip()

    chat_history.append((query, answer))
    return answer, context_nodes


# --- CELL 8: Gradio Interface ---
# Layout: chat column (history, input box, buttons) on the left; settings
# (retrieval depth, sampling parameters) and retrieved context on the right.
with gr.Blocks(theme=gr.themes.Soft(), title="💬 Level 0 IT Support Chatbot") as demo:
    gr.Markdown("### 🤖 Level 0 IT Support Chatbot (RAG + Qdrant + LLaMA3)")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Chat", height=500, bubble_full_width=False)
            inp = gr.Textbox(placeholder="Ask your IT support question...", label="Your Message", lines=2)
            with gr.Row():
                send_btn = gr.Button("Send", variant="primary")
                clear_btn = gr.Button("Clear Chat", variant="secondary")
        with gr.Column(scale=1):
            gr.Markdown("### ⚙️ Settings")
            k_slider = gr.Slider(1, 20, value=10, step=1, label="Context Hits (k)")
            temp_slider = gr.Slider(0.0, 1.0, value=0.7, step=0.01, label="Temperature")
            top_p_slider = gr.Slider(0.0, 1.0, value=0.9, step=0.01, label="Top-p")
            with gr.Accordion("Show Retrieved Context", open=False):
                context_display = gr.Textbox(label="Retrieved Context", interactive=False, lines=10)

    def respond(message, history, k, temp, top_p):
        """Handle one submitted message.

        Args:
            message: Text from the input box.
            history: Current Chatbot value (list of [user, bot] pairs).
            k: Number of hits each retriever should return.
            temp: Sampling temperature passed to chat().
            top_p: Top-p value passed to chat().

        Returns:
            ("", updated history, formatted context) -- clears the input box,
            refreshes the chat and fills the context panel.
        """
        # Update retriever k value (sliders may deliver floats; top-k must be int)
        top_k = int(k)
        dense_retriever.similarity_top_k = top_k
        bm25_retriever.similarity_top_k = top_k

        # Get response and context. Accept a bare reply string as well as a
        # (reply, context_nodes) pair so a context-free reply cannot break
        # the unpacking.
        result = chat(message, temperature=temp, top_p=top_p)
        if isinstance(result, tuple):
            reply, context_nodes = result
        else:
            reply, context_nodes = result, []

        # Format context for display; a hit without a score is shown as "n/a".
        ctx_parts = []
        for i, node in enumerate(context_nodes or []):
            score = "n/a" if node.score is None else f"{node.score:.4f}"
            ctx_parts.append(f"**Source {i+1} (Score: {score})**\n{node.text}")
        ctx_text = "\n\n---\n\n".join(ctx_parts)

        history.append([message, reply])
        return "", history, ctx_text

    def clear_chat():
        """Reset the stored conversation and blank the chat and context panels."""
        global chat_history
        chat_history = []
        return [], None

    # Event Listeners: Enter in the textbox and the Send button do the same thing.
    inp.submit(respond, [inp, chatbot, k_slider, temp_slider, top_p_slider], [inp, chatbot, context_display])
    send_btn.click(respond, [inp, chatbot, k_slider, temp_slider, top_p_slider], [inp, chatbot, context_display])
    clear_btn.click(clear_chat, None, [chatbot, context_display], queue=False)

# --- Main execution block ---
if __name__ == "__main__":
    # The launch() command will start a web server that serves the interface.
    # It will block the script from exiting.
    # The server is bound to host 0.0.0.0 on port 7860.
    logger.info("Launching Gradio interface...")
    demo.launch(server_name="0.0.0.0", server_port=7860)