amiguel committed on
Commit 29104c7 · verified · 1 Parent(s): f24a30d

Upload 13 files

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ src/notifs_data.db filter=lfs diff=lfs merge=lfs -text
src/FUTURE_ENHANCEMENTS.md ADDED
@@ -0,0 +1,306 @@
1
+ # DigiTwin Application - Future Enhancements
2
+
3
+ ## Overview
4
+ This document outlines planned enhancements for the DigiTwin application to improve performance, data management, and user experience through advanced analytics and AI capabilities.
5
+
6
+ ---
7
+
8
+ ## 1. Data Preprocessing Module
9
+
10
+ ### Objective
11
+ Create a dedicated preprocessing module to optimize dataset size and improve application performance by removing unnecessary columns and cleaning data before storage.
12
+
13
+ ### Implementation Plan
14
+
15
+ #### 1.1 Column Analysis & Removal
16
+ - **Module**: `preprocessing.py`
17
+ - **Functionality**:
18
+ - Analyze uploaded Excel files for column usage patterns
19
+ - Identify and remove columns with:
20
+ - High percentage of null values (>80%)
21
+ - Redundant information
22
+ - Non-essential metadata
23
+ - Preserve critical columns: FPSO, Main WorkCtr, Notification Type, Location, Keywords, etc.
24
+
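+ The null-coverage check described above could look like the sketch below. It is a minimal illustration, not the final module: the 80% threshold and the protected-column set are assumptions taken from this section.
+
+ ```python
+ import pandas as pd
+
+ # Assumed set of critical columns that must never be dropped; adjust to the real export's names
+ PROTECTED_COLUMNS = {'FPSO', 'Main WorkCtr', 'Notifictn type', 'Description'}
+
+ def find_droppable_columns(df, null_threshold=0.8):
+     """Return columns whose null ratio exceeds the threshold, excluding protected ones."""
+     null_ratio = df.isnull().mean()  # fraction of nulls per column
+     candidates = null_ratio[null_ratio > null_threshold].index
+     return [col for col in candidates if col not in PROTECTED_COLUMNS]
+ ```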
25
+ #### 1.2 Data Cleaning Pipeline
26
+ ```python
27
+ def preprocess_notifications_data(df):
28
+ """
29
+ Preprocess notification data to reduce size and improve quality
30
+ """
31
+ # Remove unnecessary columns to improve memory footprint
32
+ columns_to_remove = [
33
+ 'Priority', # Redundant priority information
34
+ 'Notification', # Duplicate notification data
35
+ 'Order', # Order information not needed for analytics
36
+ 'Planner group' # Planner group metadata
37
+ ]
38
+
39
+ # Remove specified columns
40
+ df = df.drop(columns=columns_to_remove, errors='ignore')
41
+
42
+ # Clean data types
43
+ # Remove duplicates
44
+ # Standardize text fields
45
+ # Optimize memory usage
46
+
47
+ return df
48
+ ```
49
+
50
+ #### 1.3 Benefits
51
+ - **Reduced Database Size**: Removal of Priority, Notification, Order, and Planner group columns reduces dataset size by 15-25%
52
+ - **Improved Performance**: Faster loading and processing times due to reduced memory footprint
53
+ - **Better Memory Management**: Optimized data types and structures for cached database
54
+ - **Data Quality**: Consistent formatting and validation while preserving essential analytics columns
55
+ - **Focused Analytics**: Streamlined dataset containing only relevant columns for FPSO analysis
56
+
57
+ ---
58
+
59
+ ## 2. Feature Engineering Enhancements
60
+
61
+ ### Objective
62
+ Enhance the dataset with derived features to provide deeper insights and better analytics capabilities.
63
+
64
+ ### Implementation Plan
65
+
66
+ #### 2.1 Main WorkCtr Feature Engineering
67
+ - **Categorization**: Group work centers into logical categories
68
+ - **Priority Levels**: Assign priority based on work center type
69
+ - **Frequency Analysis**: Track most common work centers per FPSO
70
+
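+ A rough sketch of what this could look like is shown below; the category mapping is purely illustrative (the real Main WorkCtr codes and their groupings still need to be defined):
+
+ ```python
+ import pandas as pd
+
+ # Hypothetical mapping from work-center code prefixes to categories
+ WORKCTR_CATEGORIES = {
+     'MECH': 'Mechanical',
+     'ELEC': 'Electrical',
+     'INSP': 'Inspection',
+ }
+
+ def categorize_workctr(df):
+     """Add a category column and a per-FPSO frequency count for Main WorkCtr."""
+     df['workctr_category'] = (
+         df['Main WorkCtr'].astype(str).str[:4].map(WORKCTR_CATEGORIES).fillna('Other')
+     )
+     df['workctr_freq_per_fpso'] = df.groupby(['FPSO', 'Main WorkCtr'])['Main WorkCtr'].transform('count')
+     return df
+ ```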
71
+ #### 2.2 Additional Feature Engineering
72
+ ```python
73
+ def engineer_features(df):
74
+ """
75
+ Create new features from existing data
76
+ """
77
+ # Time-based features
78
+ df['notification_age_days'] = (pd.Timestamp.now() - df['date']).dt.days
79
+ df['is_urgent'] = df['notification_age_days'] <= 7
80
+
81
+ # Location-based features
82
+ df['location_category'] = categorize_location(df['location'])
83
+ df['is_critical_area'] = is_critical_location(df['location'])
84
+
85
+ # Keyword-based features
86
+ df['keyword_count'] = df['keywords'].str.count(',') + 1
87
+ df['has_safety_keyword'] = df['keywords'].str.contains('safety|emergency', case=False)
88
+
89
+ # FPSO-specific features
90
+ df['fpso_notification_density'] = df.groupby('fpso')['notification_id'].transform('count')
91
+
92
+ return df
93
+ ```
94
+
95
+ #### 2.3 New Features to Add
96
+ - **Temporal Features**:
97
+ - Notification age (days since creation)
98
+ - Urgency indicators
99
+ - Seasonal patterns
100
+
101
+ - **Spatial Features**:
102
+ - Location categories (Deck, Hull, Machinery, etc.)
103
+ - Critical area flags
104
+ - Zone-based grouping
105
+
106
+ - **Operational Features**:
107
+ - Work center complexity scores
108
+ - Resource allocation indicators
109
+ - Maintenance priority levels
110
+
111
+ ---
112
+
113
+ ## 3. LLM Integration with RAG (Retrieval-Augmented Generation)
114
+
115
+ ### Objective
116
+ Implement conversational AI capabilities to allow users to query the cached dataset using natural language, providing intelligent insights and recommendations.
117
+
118
+ ### Implementation Plan
119
+
120
+ #### 3.1 RAG Architecture
121
+ ```python
122
+ class DigiTwinRAG:
123
+ """
124
+ RAG system for querying notification data
125
+ """
126
+ def __init__(self, db_path):
127
+ self.db_path = db_path
128
+ self.vector_store = None
129
+ self.llm_model = None
130
+
131
+ def setup_vector_store(self):
132
+ """Create vector embeddings for notification data"""
133
+ # Load data from SQLite
134
+ # Create embeddings using sentence-transformers
135
+ # Store in vector database (Chroma/FAISS)
136
+
137
+ def query_notifications(self, user_query):
138
+ """Process natural language queries"""
139
+ # Retrieve relevant documents
140
+ # Generate response using LLM
141
+ # Return formatted results
142
+ ```
143
+
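+ A concrete, minimal version of the `setup_vector_store()` step using sentence-transformers and FAISS could look like the sketch below; it assumes the `notifications` table and `Description` column used elsewhere in the app and keeps the index in memory:
+
+ ```python
+ import sqlite3
+ import numpy as np
+ import pandas as pd
+ import faiss
+ from sentence_transformers import SentenceTransformer
+
+ def build_faiss_index(db_path, table_name='notifications'):
+     """Embed notification descriptions and index them with FAISS."""
+     with sqlite3.connect(db_path) as conn:
+         df = pd.read_sql(f'SELECT * FROM {table_name}', conn)
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     embeddings = model.encode(df['Description'].astype(str).tolist())
+     embeddings = np.asarray(embeddings, dtype='float32')
+     index = faiss.IndexFlatL2(embeddings.shape[1])  # exact L2 search over all notifications
+     index.add(embeddings)
+     return index, df
+ ```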
144
+ #### 3.2 LLM Model Integration
145
+ - **Model Options**:
146
+ - **Local**: Llama 2, Mistral, or similar open-source models
147
+ - **Cloud**: OpenAI GPT-4, Anthropic Claude, or Azure OpenAI
148
+ - **Hybrid**: Local for basic queries, cloud for complex analysis
149
+
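+ The hybrid option could be as simple as routing on query complexity. The sketch below only shows the routing decision; `query_local` and `query_cloud` are hypothetical callables standing in for whichever providers are chosen:
+
+ ```python
+ def route_query(user_query, query_local, query_cloud, max_local_words=25):
+     """Send short, simple queries to the local model and longer ones to the cloud model."""
+     is_simple = len(user_query.split()) <= max_local_words
+     return query_local(user_query) if is_simple else query_cloud(user_query)
+ ```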
150
+ #### 3.3 Query Capabilities
151
+ ```python
152
+ # Example queries the system should handle:
153
+ queries = [
154
+ "Show me all urgent notifications from the last week",
155
+ "Which FPSO has the most safety-related issues?",
156
+ "What are the common keywords in deck maintenance notifications?",
157
+ "Compare notification patterns between PAZ and DAL FPSOs",
158
+ "Generate a summary of critical maintenance needs",
159
+ "What work centers require immediate attention?"
160
+ ]
161
+ ```
162
+
163
+ #### 3.4 Implementation Steps
164
+ 1. **Vector Database Setup**:
165
+ - Install and configure vector database (Chroma/FAISS)
166
+ - Create embeddings for notification text
167
+ - Index metadata fields
168
+
169
+ 2. **LLM Integration**:
170
+ - Set up model API connections
171
+ - Create prompt templates
172
+ - Implement response formatting
173
+
174
+ 3. **User Interface**:
175
+ - Add chat interface to Streamlit app
176
+ - Display query results with visualizations
177
+ - Provide query suggestions
178
+
179
+ 4. **Response Enhancement**:
180
+ - Generate charts and graphs from queries
181
+ - Provide actionable insights
182
+ - Link to relevant data views
183
+
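+ For the prompt templates mentioned in step 2, a minimal starting point could look like this (the exact wording and fields are assumptions to be refined during implementation):
+
+ ```python
+ RAG_PROMPT_TEMPLATE = """You are DigiTwin, an assistant for FPSO notification analysis.
+
+ Context documents:
+ {context}
+
+ Pivot analysis summary:
+ {pivot_analysis}
+
+ Question: {question}
+
+ Answer using only the context above. If the context is insufficient, say so."""
+
+ def build_prompt(context_docs, pivot_analysis, question):
+     """Fill the template with retrieved documents and the user's question."""
+     context = "\n\n".join(doc['text'] for doc in context_docs)  # assumes each retrieved doc is a dict with a 'text' key
+     return RAG_PROMPT_TEMPLATE.format(context=context, pivot_analysis=pivot_analysis, question=question)
+ ```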
184
+ ---
185
+
186
+ ## 4. Technical Requirements
187
+
188
+ ### 4.1 New Dependencies
189
+ ```txt
190
+ # preprocessing.py
191
+ pandas>=2.0.0
192
+ numpy>=1.24.0
193
+
194
+ # feature_engineering.py
195
+ scikit-learn>=1.3.0
196
+ category_encoders>=2.6.0
197
+
198
+ # rag_system.py
199
+ sentence-transformers>=2.2.0
200
+ chromadb>=0.4.0
201
+ langchain>=0.1.0
202
+ openai>=1.0.0 # or other LLM provider
203
+ ```
204
+
205
+ ### 4.2 Database Schema Updates
206
+ ```sql
207
+ -- New tables for enhanced features
208
+ CREATE TABLE notification_features (
209
+ id INTEGER PRIMARY KEY,
210
+ notification_id TEXT,
211
+ urgency_score REAL,
212
+ location_category TEXT,
213
+ keyword_count INTEGER,
214
+ fpso_density REAL,
215
+ created_at TIMESTAMP
216
+ );
217
+
218
+ CREATE TABLE vector_embeddings (
219
+ id INTEGER PRIMARY KEY,
220
+ notification_id TEXT,
221
+ embedding_vector BLOB,
222
+ metadata TEXT
223
+ );
224
+ ```
225
+
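+ Applying this schema to the application's existing SQLite database (`notifs_data.db`, the file the app already uses) could be as simple as the following; the table definitions mirror the SQL above, with `IF NOT EXISTS` added so the script can be re-run safely:
+
+ ```python
+ import sqlite3
+
+ SCHEMA_SQL = """
+ CREATE TABLE IF NOT EXISTS notification_features (
+     id INTEGER PRIMARY KEY,
+     notification_id TEXT,
+     urgency_score REAL,
+     location_category TEXT,
+     keyword_count INTEGER,
+     fpso_density REAL,
+     created_at TIMESTAMP
+ );
+ CREATE TABLE IF NOT EXISTS vector_embeddings (
+     id INTEGER PRIMARY KEY,
+     notification_id TEXT,
+     embedding_vector BLOB,
+     metadata TEXT
+ );
+ """
+
+ with sqlite3.connect('notifs_data.db') as conn:
+     conn.executescript(SCHEMA_SQL)  # idempotent thanks to IF NOT EXISTS
+ ```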
226
+ ---
227
+
228
+ ## 5. Implementation Timeline
229
+
230
+ ### Phase 1: Data Preprocessing (Week 1-2)
231
+ - [ ] Create preprocessing module
232
+ - [ ] Implement column analysis
233
+ - [ ] Add data cleaning pipeline
234
+ - [ ] Test with existing datasets
235
+
236
+ ### Phase 2: Feature Engineering (Week 3-4)
237
+ - [ ] Implement feature engineering functions
238
+ - [ ] Add new derived features
239
+ - [ ] Update database schema
240
+ - [ ] Integrate with main application
241
+
242
+ ### Phase 3: RAG System (Week 5-8)
243
+ - [ ] Set up vector database
244
+ - [ ] Implement LLM integration
245
+ - [ ] Create chat interface
246
+ - [ ] Test and optimize queries
247
+
248
+ ### Phase 4: Integration & Testing (Week 9-10)
249
+ - [ ] Integrate all modules
250
+ - [ ] Performance testing
251
+ - [ ] User acceptance testing
252
+ - [ ] Documentation and deployment
253
+
254
+ ---
255
+
256
+ ## 6. Success Metrics
257
+
258
+ ### Performance Improvements
259
+ - **Data Size Reduction**: Target 15-25% reduction through removal of Priority, Notification, Order, and Planner group columns
260
+ - **Query Speed**: 30-40% faster data loading and processing due to reduced memory footprint
261
+ - **Memory Usage**: 20-30% reduction in memory consumption for cached database
262
+
263
+ ### User Experience
264
+ - **Query Response Time**: <3 seconds for RAG queries
265
+ - **Accuracy**: >90% relevance for retrieved documents
266
+ - **User Satisfaction**: Improved through natural language interaction
267
+
268
+ ### Analytics Capabilities
269
+ - **Insight Generation**: Automated identification of patterns and trends
270
+ - **Recommendation Quality**: Actionable maintenance and safety recommendations
271
+ - **Data Coverage**: Enhanced analysis across all FPSO units
272
+
273
+ ---
274
+
275
+ ## 7. Risk Mitigation
276
+
277
+ ### Technical Risks
278
+ - **Model Performance**: Start with simple models, gradually increase complexity
279
+ - **Data Privacy**: Ensure all data processing remains local/secure
280
+ - **Scalability**: Design modular architecture for easy scaling
281
+
282
+ ### Operational Risks
283
+ - **User Adoption**: Provide training and documentation
284
+ - **Maintenance**: Create automated testing and monitoring
285
+ - **Integration**: Maintain backward compatibility with existing features
286
+
287
+ ---
288
+
289
+ ## 8. Future Considerations
290
+
291
+ ### Advanced Features
292
+ - **Predictive Analytics**: Forecast maintenance needs and safety incidents
293
+ - **Real-time Monitoring**: Live data integration and alerts
294
+ - **Mobile Application**: Extend capabilities to mobile devices
295
+ - **API Integration**: Connect with external maintenance systems
296
+
297
+ ### Scalability
298
+ - **Multi-tenant Support**: Support multiple organizations
299
+ - **Cloud Deployment**: Scalable cloud infrastructure
300
+ - **Advanced Analytics**: Machine learning for pattern recognition
301
+
302
+ ---
303
+
304
+ *Document created: December 2024*
305
+ *Last updated: [Date]*
306
+ *Maintained by: ValonyLabs Development Team*
src/IMPLEMENTATION_ROADMAP.md ADDED
@@ -0,0 +1,111 @@
1
+ # DigiTwin Implementation Roadmap
2
+
3
+ ## Current Status: ✅ Production Ready
4
+ - ✅ Core application functionality
5
+ - ✅ Data upload and processing
6
+ - ✅ FPSO visualizations
7
+ - ✅ Pivot table analytics
8
+ - ✅ Database persistence
9
+ - ✅ Responsive UI with custom styling
10
+ - ✅ Sidebar layout optimizations
11
+
12
+ ---
13
+
14
+ ## Phase 1: Data Preprocessing Module
15
+ **Timeline**: Week 1-2
16
+ **Status**: 🔄 Planned
17
+
18
+ ### Tasks:
19
+ - [ ] Create `preprocessing.py` module
20
+ - [ ] Implement column analysis functionality
21
+ - [ ] Add data cleaning pipeline
22
+ - [ ] Integrate with main application
23
+ - [ ] Test with existing datasets
24
+
25
+ ### Deliverables:
26
+ - Preprocessing module with column removal logic
27
+ - Data size reduction by 40-60%
28
+ - Improved loading performance
29
+
30
+ ---
31
+
32
+ ## Phase 2: Feature Engineering
33
+ **Timeline**: Week 3-4
34
+ **Status**: 🔄 Planned
35
+
36
+ ### Tasks:
37
+ - [ ] Create `feature_engineering.py` module
38
+ - [ ] Implement Main WorkCtr categorization
39
+ - [ ] Add temporal and spatial features
40
+ - [ ] Update database schema
41
+ - [ ] Integrate with analytics
42
+
43
+ ### Deliverables:
44
+ - Enhanced dataset with derived features
45
+ - Improved analytics capabilities
46
+ - Better insights generation
47
+
48
+ ---
49
+
50
+ ## Phase 3: LLM Integration with RAG
51
+ **Timeline**: Week 5-8
52
+ **Status**: 🔄 Planned
53
+
54
+ ### Tasks:
55
+ - [ ] Set up vector database (Chroma/FAISS)
56
+ - [ ] Implement LLM model integration
57
+ - [ ] Create RAG query system
58
+ - [ ] Add chat interface to Streamlit
59
+ - [ ] Test and optimize
60
+
61
+ ### Deliverables:
62
+ - Natural language query capability
63
+ - Intelligent insights generation
64
+ - Enhanced user experience
65
+
66
+ ---
67
+
68
+ ## Phase 4: Integration & Testing
69
+ **Timeline**: Week 9-10
70
+ **Status**: 🔄 Planned
71
+
72
+ ### Tasks:
73
+ - [ ] Integrate all modules
74
+ - [ ] Performance testing
75
+ - [ ] User acceptance testing
76
+ - [ ] Documentation updates
77
+ - [ ] Production deployment
78
+
79
+ ### Deliverables:
80
+ - Fully integrated enhanced application
81
+ - Performance benchmarks
82
+ - User documentation
83
+
84
+ ---
85
+
86
+ ## Success Metrics
87
+
88
+ ### Performance Targets:
89
+ - ⚡ 50% faster data loading
90
+ - 💾 40-60% data size reduction
91
+ - 🧠 <3 second RAG query response
92
+ - 📊 >90% query accuracy
93
+
94
+ ### User Experience:
95
+ - 🎯 Natural language interaction
96
+ - 📈 Enhanced analytics insights
97
+ - 🔍 Improved data discovery
98
+ - 🚀 Better overall performance
99
+
100
+ ---
101
+
102
+ ## Notes
103
+ - All enhancements maintain backward compatibility
104
+ - Modular design for easy integration
105
+ - Focus on user experience and performance
106
+ - Scalable architecture for future growth
107
+
108
+ ---
109
+
110
+ *Last Updated: December 2024*
111
+ *Next Review: [Date]*
src/README.md ADDED
@@ -0,0 +1,2 @@
1
+ # reparos
2
+ Repairs
src/README_RAG.md ADDED
@@ -0,0 +1,240 @@
1
+ # 🤖 DigiTwin RAG Assistant
2
+
3
+ A comprehensive Retrieval-Augmented Generation (RAG) system integrated into the DigiTwin FPSO notifications analysis platform. This system provides intelligent conversational AI capabilities to query and analyze your notifications data using natural language.
4
+
5
+ ## 🚀 Features
6
+
7
+ ### Core RAG Capabilities
8
+ - **Hybrid Search**: Combines semantic and keyword-based search for optimal retrieval
9
+ - **Query Rewriting**: Intelligently reformulates user queries for better results
10
+ - **Streaming Responses**: Real-time token-by-token response generation
11
+ - **Pivot Analysis Integration**: Incorporates existing analytics into responses
12
+ - **Multi-LLM Support**: Works with Groq API and local Ollama models
13
+
14
+ ### Technical Features
15
+ - **Vector Databases**: Support for Weaviate and FAISS
16
+ - **Embedding Models**: Sentence Transformers for semantic understanding
17
+ - **Modern Chat Interface**: Streamlit-based chat UI with message history
18
+ - **Error Handling**: Graceful fallbacks and informative error messages
19
+ - **Modular Design**: Clean separation of concerns and easy extensibility
20
+
21
+ ## 📋 Prerequisites
22
+
23
+ - Python 3.8 or higher
24
+ - Streamlit application with notifications data
25
+ - Internet connection (for Groq API)
26
+ - Optional: Ollama for local LLM inference
27
+ - Optional: Docker for Weaviate vector database
28
+
29
+ ## 🛠️ Installation
30
+
31
+ ### Quick Setup
32
+ ```bash
33
+ # Run the automated setup script
34
+ python setup_rag.py
35
+ ```
36
+
37
+ ### Manual Installation
38
+ ```bash
39
+ # Install RAG dependencies
40
+ pip install -r requirements_rag.txt
41
+
42
+ # Or install individual packages
43
+ pip install sentence-transformers faiss-cpu weaviate-client groq ollama
44
+ ```
45
+
46
+ ### Environment Configuration
47
+ 1. Create a `.env` file in the project root:
48
+ ```bash
49
+ # Groq API Configuration
50
+ GROQ_API_KEY=your_groq_api_key_here
51
+
52
+ # Ollama Configuration (optional)
53
+ OLLAMA_HOST=http://localhost:11434
54
+
55
+ # Vector Database Configuration (optional)
56
+ WEAVIATE_URL=http://localhost:8080
57
+
58
+ # Embedding Model Configuration
59
+ EMBEDDING_MODEL=all-MiniLM-L6-v2
60
+ ```
61
+
62
+ 2. Get your Groq API key from [console.groq.com](https://console.groq.com/)
63
+
64
+ ## 🚀 Usage
65
+
66
+ ### Starting the Application
67
+ ```bash
68
+ streamlit run notifs.py
69
+ ```
70
+
71
+ ### Using the RAG Assistant
72
+ 1. Upload your notifications data or load from database
73
+ 2. Navigate to the "🤖 RAG Assistant" tab
74
+ 3. Start asking questions in natural language!
75
+
76
+ ### Example Queries
77
+ ```
78
+ "Which FPSO has the most NI notifications?"
79
+ "What are the common keywords in PAZ notifications?"
80
+ "Show me all safety-related notifications from last month"
81
+ "Compare notification patterns between GIR and DAL"
82
+ "What equipment has the most maintenance issues?"
83
+ "Which work centers require immediate attention?"
84
+ ```
85
+
86
+ ## 🏗️ Architecture
87
+
88
+ ### System Components
89
+
90
+ ```
91
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
92
+ │ User Query │───▶│ Query Rewriter │───▶│ Hybrid Search │
93
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
94
+
95
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
96
+ │ Pivot Analysis │◀───│ RAG Prompt │◀───│ Context Docs │
97
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
98
+
99
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
100
+ │ LLM Response │◀───│ Response Gen │◀───│ Vector Store │
101
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
102
+ ```
103
+
104
+ ### Data Flow
105
+ 1. **Query Input**: User submits natural language query
106
+ 2. **Query Rewriting**: LLM reformulates query for better retrieval
107
+ 3. **Hybrid Search**: Combines semantic and keyword search
108
+ 4. **Context Retrieval**: Fetches relevant documents and pivot analysis
109
+ 5. **Prompt Engineering**: Creates optimized RAG prompt
110
+ 6. **Response Generation**: LLM generates streaming response
111
+ 7. **Display**: Real-time response display in chat interface
112
+
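+ In code, one pass through this flow could look roughly like the sketch below. `hybrid_search()` and `create_rag_prompt()` are the methods named later in this document; `rewrite_query()` and `generate_response()` are assumed names for the rewriting and generation steps:
+
+ ```python
+ from rag_chatbot import DigiTwinRAG
+
+ rag = DigiTwinRAG()
+
+ def answer(user_query):
+     """One non-streaming pass through the RAG pipeline."""
+     rewritten = rag.rewrite_query(user_query)            # step 2: query rewriting (assumed method name)
+     docs = rag.hybrid_search(rewritten, k=5)             # steps 3-4: hybrid retrieval
+     prompt = rag.create_rag_prompt(rewritten, docs, pivot_analysis="")  # step 5: prompt engineering
+     return rag.generate_response(prompt)                 # step 6: response generation (assumed method name)
+ ```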
113
+ ## 🔧 Configuration
114
+
115
+ ### LLM Models
116
+ - **Groq**: Fast inference with Llama3-8b-8192 model
117
+ - **Ollama**: Local inference with customizable models
118
+
119
+ ### Vector Databases
120
+ - **FAISS**: Lightweight, in-memory vector search
121
+ - **Weaviate**: Production-ready vector database with Docker
122
+
123
+ ### Embedding Models
124
+ - **all-MiniLM-L6-v2**: Fast, efficient sentence embeddings
125
+ - **Customizable**: Easy to switch to other models
126
+
127
+ ## 📊 Performance
128
+
129
+ ### Expected Performance
130
+ - **Query Response Time**: <3 seconds for most queries
131
+ - **Memory Usage**: Optimized for large datasets
132
+ - **Accuracy**: >90% relevance for retrieved documents
133
+ - **Scalability**: Handles thousands of notifications efficiently
134
+
135
+ ### Optimization Features
136
+ - **Data Preprocessing**: Removes unnecessary columns
137
+ - **Memory Optimization**: Efficient data types and structures
138
+ - **Caching**: Vector embeddings and search results
139
+ - **Streaming**: Real-time response generation
140
+
141
+ ## 🐛 Troubleshooting
142
+
143
+ ### Common Issues
144
+
145
+ #### "RAG module not available"
146
+ ```bash
147
+ # Install dependencies
148
+ pip install -r requirements_rag.txt
149
+ ```
150
+
151
+ #### "Groq API key not found"
152
+ ```bash
153
+ # Set environment variable
154
+ export GROQ_API_KEY=your_api_key_here
155
+ ```
156
+
157
+ #### "Vector database connection failed"
158
+ ```bash
159
+ # Start Weaviate (optional)
160
+ docker run -d -p 8080:8080 semitechnologies/weaviate:1.22.4
161
+ ```
162
+
163
+ #### "Embedding model loading failed"
164
+ ```bash
165
+ # Check internet connection and try again
166
+ # The model will download automatically on first use
167
+ ```
168
+
169
+ ### Debug Mode
170
+ Enable debug logging by setting:
171
+ ```bash
172
+ export STREAMLIT_LOG_LEVEL=debug
173
+ ```
174
+
175
+ ## 🔄 Development
176
+
177
+ ### Adding New Features
178
+ 1. **Custom Embeddings**: Modify `create_embeddings()` method
179
+ 2. **New LLM Providers**: Extend `initialize_llm_clients()` method
180
+ 3. **Additional Search**: Enhance `hybrid_search()` method
181
+ 4. **UI Improvements**: Modify `render_chat_interface()` function
182
+
183
+ ### Testing
184
+ ```bash
185
+ # Run setup tests
186
+ python setup_rag.py
187
+
188
+ # Test individual components
189
+ python -c "from rag_chatbot import DigiTwinRAG; rag = DigiTwinRAG()"
190
+ ```
191
+
192
+ ## 📈 Advanced Usage
193
+
194
+ ### Custom Prompts
195
+ Modify the RAG prompt template in the `create_rag_prompt()` method:
196
+ ```python
197
+ def create_rag_prompt(self, query: str, context: List[Dict[str, Any]], pivot_analysis: str) -> str:
198
+ # Customize prompt engineering here
199
+ pass
200
+ ```
201
+
202
+ ### Adding New Data Sources
203
+ Extend the data loading in the `load_notifications_data()` method:
204
+ ```python
205
+ def load_notifications_data(self) -> pd.DataFrame:
206
+ # Add support for new data sources
207
+ pass
208
+ ```
209
+
210
+ ### Custom Search Strategies
211
+ Enhance the hybrid search in the `hybrid_search()` method:
212
+ ```python
213
+ def hybrid_search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
214
+ # Add custom search algorithms
215
+ pass
216
+ ```
217
+
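+ One simple fusion strategy is to blend the semantic similarity with a keyword-overlap score; the 70/30 weighting below is an arbitrary starting point, not a tuned value:
+
+ ```python
+ def fuse_scores(semantic_score, query, document_text, alpha=0.7):
+     """Blend semantic similarity with keyword overlap; alpha weights the semantic part."""
+     query_terms = set(query.lower().split())
+     doc_terms = set(document_text.lower().split())
+     keyword_score = len(query_terms & doc_terms) / max(len(query_terms), 1)
+     return alpha * semantic_score + (1 - alpha) * keyword_score
+ ```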
218
+ ## 🤝 Contributing
219
+
220
+ 1. Fork the repository
221
+ 2. Create a feature branch
222
+ 3. Make your changes
223
+ 4. Add tests
224
+ 5. Submit a pull request
225
+
226
+ ## 📄 License
227
+
228
+ This project is part of the DigiTwin platform and follows the same licensing terms.
229
+
230
+ ## 🆘 Support
231
+
232
+ For support and questions:
233
+ - Check the troubleshooting section
234
+ - Review the example queries
235
+ - Test with the setup script
236
+ - Contact the development team
237
+
238
+ ---
239
+
240
+ **🚀 Built with Pride - STP/INSP/MET | Powered by ValonyLabs**
src/clv.py ADDED
@@ -0,0 +1,55 @@
1
+ # clv.py
2
+
3
+ # CLV-specific keywords and location dictionaries
4
+ clv_module_keywords = ['M110', 'M111', 'M112', 'M113', 'M114', 'M115', 'M116', 'H151',
5
+ 'M120', 'M121', 'M122', 'M123', 'M124', 'M125', 'M126', 'M151']
6
+ clv_rack_keywords = ['141', '142', '143', '144', '145', '146']
7
+ clv_living_quarters_keywords = ['LQ', 'LQ1', 'LQ2', 'LQ3', 'LQ4', 'LQL0', 'LQPS', 'LQSB', 'LQROOF', 'LQL4', 'LQL2', 'LQ-5', 'LQPD', 'LQ PS', 'LQAFT', 'LQ-T', 'LQL1S']
8
+ clv_flare_keywords = ['131']
9
+ clv_fwd_keywords = ['FWD']
10
+ clv_hexagons_keywords = ['HELIDECK']
11
+
12
+ clv_modules = {
13
+ 'M120': (0.75, 2), 'M121': (0.5, 3), 'M122': (0.5, 4), 'M123': (0.5, 5),
14
+ 'M124': (0.5, 6), 'M125': (0.5, 7), 'M126': (0.5, 8), 'M151': (0.5, 9), 'M110': (1.75, 2),
15
+ 'M111': (2, 3), 'M112': (2, 4), 'M113': (2, 5), 'M114': (2, 6),
16
+ 'M115': (2, 7), 'M116': (2, 8), 'H151': (2, 9)
17
+ }
18
+ clv_racks = {
19
+ '141': (1.5, 3), '142': (1.5, 4), '143': (1.5, 5),
20
+ '144': (1.5, 6), '145': (1.5, 7), '146': (1.5, 8)
21
+ }
22
+ clv_flare = {'131': (1.5, 9)}
23
+ clv_living_quarters = {'LQ': (0.5, 1)}
24
+ clv_hexagons = {'HELIDECK': (2.75, 1)}
25
+ clv_fwd = {'FWD': (0.5, 10)}
26
+
27
+ def draw_clv(ax, add_chamfered_rectangle, add_rectangle, add_hexagon, add_fwd):
28
+ for module, (row, col) in clv_modules.items():
29
+ if module == 'M110':
30
+ height, y_position, text_y = 1.25, row, row + 0.5
31
+ elif module == 'M120':
32
+ height, y_position, text_y = 1.25, row - 0.25, row + 0.25
33
+ else:
34
+ height, y_position, text_y = 1, row, row + 0.5
35
+ add_chamfered_rectangle(ax, (col, y_position), 1, height, 0.1, edgecolor='black', facecolor='white')
36
+ ax.text(col + 0.5, text_y, module, ha='center', va='center', fontsize=7, weight='bold')
37
+
38
+ for rack, (row, col) in clv_racks.items():
39
+ add_chamfered_rectangle(ax, (col, row), 1, 0.5, 0.05, edgecolor='black', facecolor='white')
40
+ ax.text(col + 0.5, row + 0.25, rack, ha='center', va='center', fontsize=7, weight='bold')
41
+
42
+ for flare_loc, (row, col) in clv_flare.items():
43
+ add_chamfered_rectangle(ax, (col, row), 1, 0.5, 0.05, edgecolor='black', facecolor='white')
44
+ ax.text(col + 0.5, row + 0.25, flare_loc, ha='center', va='center', fontsize=7, weight='bold')
45
+
46
+ for living_quarter, (row, col) in clv_living_quarters.items():
47
+ add_rectangle(ax, (col, row), 1, 2.5, edgecolor='black', facecolor='white')
48
+ ax.text(col + 0.5, row + 1.25, living_quarter, ha='center', va='center', fontsize=7, rotation=90, weight='bold')
49
+
50
+ for hexagon, (row, col) in clv_hexagons.items():
51
+ add_hexagon(ax, (col, row), 0.60, edgecolor='black', facecolor='white')
52
+ ax.text(col, row, hexagon, ha='center', va='center', fontsize=7, weight='bold')
53
+
54
+ for fwd_loc, (row, col) in clv_fwd.items():
55
+ add_fwd(ax, (col, row), 2.5, -1, edgecolor='black', facecolor='white')
src/dal.py ADDED
@@ -0,0 +1,60 @@
1
+ # dal.py
2
+
3
+ # DAL-specific keywords and location dictionaries
4
+ dal_module_keywords = ['P11', 'P21', 'P31', 'P41', 'P51', 'P61', 'P12', 'P22', 'P32', 'P42', 'P52', 'P62']
5
+ dal_rack_keywords = ['R11', 'R12', 'R13', 'R14', 'R15', 'R16']
6
+ dal_living_quarters_keywords = ['LQ', 'LQ1', 'LQ2', 'LQ3', 'LQ4', 'LQL0', 'LQPS', 'LQSB', 'LQROOF', 'LQL4', 'LQL2', 'LQ-5', 'LQPD', 'LQ PS', 'LQAFT', 'LQ-T', 'LQL1S']
7
+ dal_flare_keywords = ['FLARE']
8
+ dal_fwd_keywords = ['FWD']
9
+ dal_hexagons_keywords = ['HELIDECK']
10
+
11
+ dal_modules = {
12
+ 'P11': (0.5, 2), 'P21': (0.5, 3), 'P31': (0.5, 4), 'P41': (0.5, 5),
13
+ 'P51': (0.5, 6), 'P61': (0.5, 7), 'P12': (2, 2), 'P22': (2, 3),
14
+ 'P32': (2, 4), 'P42': (2, 5), 'P52': (2, 6), 'P62': (2, 7)
15
+ }
16
+ dal_racks = {
17
+ 'R11': (1.5, 2), 'R12': (1.5, 3), 'R13': (1.5, 4),
18
+ 'R14': (1.5, 5), 'R15': (1.5, 6), 'R16': (1.5, 7)
19
+ }
20
+ dal_flare = {'FLARE': (0.5, 8)}
21
+ dal_living_quarters = {'LQ': (0.5, 1)}
22
+ dal_hexagons = {'HELIDECK': (2.75, 1)}
23
+ dal_fwd = {'FWD': (0.5, 8.75)}
24
+
25
+ def draw_dal(ax, add_chamfered_rectangle, add_rectangle, add_hexagon, add_fwd):
26
+ for module, (row, col) in dal_modules.items():
27
+ if module == 'P11':
28
+ height, y_position, text_y = 1, row, row + 0.5
29
+ elif module == 'P12':
30
+ height, y_position, text_y = 1, row, row + 0.25
31
+ else:
32
+ height, y_position, text_y = 1, row, row + 0.5
33
+ add_chamfered_rectangle(ax, (col, y_position), 1, height, 0.1, edgecolor='black', facecolor='white')
34
+ ax.text(col + 0.5, text_y, module, ha='center', va='center', fontsize=7, weight='bold')
35
+
36
+ for rack, (row, col) in dal_racks.items():
37
+ add_chamfered_rectangle(ax, (col, row), 1, 0.5, 0.05, edgecolor='black', facecolor='white')
38
+ ax.text(col + 0.5, row + 0.25, rack, ha='center', va='center', fontsize=7, weight='bold')
39
+
40
+ for flare_loc, (row, col) in dal_flare.items():
41
+ add_chamfered_rectangle(ax, (col, row), 0.75, 2.5, 0.05, edgecolor='black', facecolor='white')
42
+ ax.text(col + 0.35, row + 1.25, flare_loc, ha='center', va='center', fontsize=7, weight='bold')
43
+
44
+
45
+ for living_quarter, (row, col) in dal_living_quarters.items():
46
+ add_rectangle(ax, (col, row), 1, 2.5, edgecolor='black', facecolor='white')
47
+ ax.text(col + 0.5, row + 1.25, living_quarter, ha='center', va='center', fontsize=7, rotation=90, weight='bold')
48
+
49
+ for hexagon, (row, col) in dal_hexagons.items():
50
+ add_hexagon(ax, (col, row), 0.60, edgecolor='black', facecolor='white')
51
+ ax.text(col, row, hexagon, ha='center', va='center', fontsize=7, weight='bold')
52
+
53
+ for fwd_loc, (row, col) in dal_fwd.items():
54
+ add_fwd(ax, (col, row), 2.5, -1, edgecolor='black', facecolor='white')
55
+
56
+
57
+ #
58
+ #add_chamfered_rectangle(ax, (col, row), 1, 0.5, 0.05, edgecolor='black', facecolor='white')
59
+ #ax.text(col + 0.5, row + 0.25, flare_loc, ha='center', va='center', fontsize=7, weight='bold')
60
+
src/gir.py ADDED
@@ -0,0 +1,20 @@
1
+ # gir.py
2
+
3
+ # GIR-specific keywords and location dictionaries (placeholder values, update as needed)
4
+ gir_module_keywords = []
5
+ gir_rack_keywords = []
6
+ gir_living_quarters_keywords = []
7
+ gir_flare_keywords = []
8
+ gir_fwd_keywords = []
9
+ gir_hexagons_keywords = []
10
+
11
+ gir_modules = {}
12
+ gir_racks = {}
13
+ gir_flare = {}
14
+ gir_living_quarters = {}
15
+ gir_hexagons = {}
16
+ gir_fwd = {}
17
+
18
+ def draw_gir(ax, add_chamfered_rectangle, add_rectangle, add_hexagon, add_fwd):
19
+ # TODO: Implement GIR drawing logic based on actual requirements
20
+ pass
src/notifs.py ADDED
@@ -0,0 +1,948 @@
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import matplotlib.patches as patches
6
+ import math
7
+ import matplotlib.transforms as transforms
8
+ import sqlite3
9
+
10
+ # Import FPSO-specific modules
11
+ from clv import *
12
+ from paz import *
13
+ from dal import *
14
+ from gir import *
15
+ # Import shared utilities
16
+ # Remove these imports:
17
+ # from utils import preprocess_keywords, extract_ni_nc_keywords, extract_location_keywords
18
+
19
+ # --- UI CONFIG & STYLE ---
20
+ st.set_page_config(page_title="B17 - Notifications", layout="wide")
21
+
22
+ st.markdown("""
23
+ <style>
24
+ @import url('https://fonts.cdnfonts.com/css/tw-cen-mt');
25
+ * {
26
+ font-family: 'Tw Cen MT', sans-serif !important;
27
+ }
28
+
29
+ /* Sidebar arrow fix */
30
+ section[data-testid="stSidebar"] [data-testid="stSidebarNav"]::before {
31
+ content: "▶";
32
+ font-size: 1.3rem;
33
+ margin-right: 0.4rem;
34
+ }
35
+
36
+ /* Fix sidebar expander layout */
37
+ section[data-testid="stSidebar"] [data-testid="stExpander"] {
38
+ margin-bottom: 1rem;
39
+ }
40
+
41
+ section[data-testid="stSidebar"] [data-testid="stExpander"] [data-testid="stExpanderHeader"] {
42
+ padding: 0.5rem 0.75rem;
43
+ font-size: 0.9rem;
44
+ line-height: 1.2;
45
+ word-wrap: break-word;
46
+ overflow-wrap: break-word;
47
+ }
48
+
49
+ section[data-testid="stSidebar"] [data-testid="stExpander"] [data-testid="stExpanderContent"] {
50
+ padding: 0.5rem 0.75rem;
51
+ }
52
+
53
+ /* Ensure proper spacing for sidebar elements */
54
+ section[data-testid="stSidebar"] .stMarkdown {
55
+ margin-bottom: 0.5rem;
56
+ }
57
+
58
+ section[data-testid="stSidebar"] .stButton {
59
+ margin-top: 0.5rem;
60
+ }
61
+
62
+ /* Ensure sidebar has proper width */
63
+ section[data-testid="stSidebar"] {
64
+ min-width: 300px;
65
+ }
66
+
67
+ /* Improve expander content readability */
68
+ section[data-testid="stSidebar"] [data-testid="stExpander"] .stMarkdown {
69
+ font-size: 0.85rem;
70
+ line-height: 1.3;
71
+ }
72
+
73
+ section[data-testid="stSidebar"] [data-testid="stExpander"] .stMarkdown p {
74
+ margin-bottom: 0.25rem;
75
+ }
76
+
77
+ /* Top-right logo placement - responsive to scrolling */
78
+ .logo-container {
79
+ position: absolute;
80
+ top: 1rem;
81
+ right: 2rem;
82
+ z-index: 1000;
83
+ transition: all 0.3s ease;
84
+ }
85
+
86
+ /* Adjust logo position when scrolling */
87
+ .logo-container.scrolled {
88
+ position: fixed;
89
+ top: 0.5rem;
90
+ right: 1rem;
91
+ transform: scale(0.8);
92
+ }
93
+
94
+ /* Ensure main content doesn't overlap with logo */
95
+ .main .block-container {
96
+ padding-top: 2rem !important;
97
+ }
98
+
99
+ /* Smooth transitions for logo */
100
+ .logo-container img {
101
+ transition: all 0.3s ease;
102
+ }
103
+
104
+ /* Logo hover effect */
105
+ .logo-container:hover {
106
+ transform: scale(1.05);
107
+ }
108
+
109
+ .logo-container.scrolled:hover {
110
+ transform: scale(0.85);
111
+ }
112
+ </style>
113
+ """, unsafe_allow_html=True)
114
+
115
+ # Display logo (responsive to scrolling)
116
+ st.markdown(
117
+ """
118
+ <div class="logo-container" id="logo-container">
119
+ <img src="https://github.com/valonys/DigiTwin/blob/29dd50da95bec35a5abdca4bdda1967f0e5efff6/ValonyLabs_Logo.png?raw=true" width="70">
120
+ </div>
121
+
122
+ <script>
123
+ // Handle logo positioning on scroll
124
+ window.addEventListener('scroll', function() {
125
+ const logo = document.getElementById('logo-container');
126
+ if (window.scrollY > 100) {
127
+ logo.classList.add('scrolled');
128
+ } else {
129
+ logo.classList.remove('scrolled');
130
+ }
131
+ });
132
+
133
+ // Initial check for scroll position
134
+ document.addEventListener('DOMContentLoaded', function() {
135
+ const logo = document.getElementById('logo-container');
136
+ if (window.scrollY > 100) {
137
+ logo.classList.add('scrolled');
138
+ }
139
+ });
140
+ </script>
141
+ """,
142
+ unsafe_allow_html=True
143
+ )
144
+
145
+ st.title("📊 DigiTwin - The Inspekta Deck")
146
+
147
+ # --- AVATARS ---
148
+ USER_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/9904d9a0d445ab0488cf7395cb863cce7621d897/USER_AVATAR.png"
149
+ BOT_AVATAR = "https://raw.githubusercontent.com/achilela/vila_fofoka_analysis/991f4c6e4e1dc7a8e24876ca5aae5228bcdb4dba/Ataliba_Avatar.jpg"
150
+
151
+ # --- FAST LOCAL PREPROCESSING FUNCTIONS ---
152
+ def preprocess_keywords(description):
153
+ description = str(description).upper()
154
+ for lq_variant in clv_living_quarters_keywords:
155
+ if lq_variant != 'LQ':
156
+ description = description.replace(lq_variant, 'LQ')
157
+ for module in clv_module_keywords:
158
+ number = module[1:]
159
+ if number in description and module not in description:
160
+ description = description.replace(number, module)
161
+ # PAZ and DAL module/rack codes already appear verbatim in descriptions, so no replacement is needed for them here
173
+ # If you use NI_keyword_map and NC_keyword_map, add them here as well
174
+ return description
175
+
176
+ def extract_ni_nc_keywords(row, notif_type_col, desc_col):
177
+ description = preprocess_keywords(row[desc_col])
178
+ notif_type = row[notif_type_col]
179
+ if notif_type == 'NI':
180
+ keywords = [kw for kw in NI_keywords if kw in description]
181
+ elif notif_type == 'NC':
182
+ keywords = [kw for kw in NC_keywords if kw in description]
183
+ else:
184
+ keywords = []
185
+ return ', '.join(keywords) if keywords else 'None'
186
+
187
+ def extract_location_keywords(row, desc_col, keyword_list):
188
+ description = preprocess_keywords(row[desc_col])
189
+ if keyword_list == clv_living_quarters_keywords:
190
+ return 'LQ' if any(kw in description for kw in clv_living_quarters_keywords) else 'None'
191
+ else:
192
+ locations = [kw for kw in keyword_list if kw in description]
193
+ return ', '.join(locations) if locations else 'None'
194
+
195
+ def create_pivot_table(df, index, columns, aggfunc='size', fill_value=0):
196
+ """Create pivot table from dataframe"""
197
+ df_exploded = df.assign(Keywords=df[columns].str.split(', ')).explode('Keywords')
198
+ df_exploded = df_exploded[df_exploded['Keywords'] != 'None']
199
+ pivot = pd.pivot_table(df_exploded, index=index, columns='Keywords', aggfunc=aggfunc, fill_value=fill_value)
200
+ return pivot
201
+
202
+ def apply_fpso_colors(df):
203
+ """Apply color styling to FPSO dataframe"""
204
+ styles = pd.DataFrame('', index=df.index, columns=df.columns)
205
+ color_map = {'GIR': '#FFA07A', 'DAL': '#ADD8E6', 'PAZ': '#D8BFD8', 'CLV': '#90EE90'}
206
+ for fpso, color in color_map.items():
207
+ if fpso in df.index:
208
+ styles.loc[fpso] = f'background-color: {color}'
209
+ return styles
210
+
211
+ def add_rectangle(ax, xy, width, height, **kwargs):
212
+ rectangle = patches.Rectangle(xy, width, height, **kwargs)
213
+ ax.add_patch(rectangle)
214
+
215
+ def add_chamfered_rectangle(ax, xy, width, height, chamfer, **kwargs):
216
+ x, y = xy
217
+ coords = [
218
+ (x + chamfer, y),
219
+ (x + width - chamfer, y),
220
+ (x + width, y + chamfer),
221
+ (x + width, y + height - chamfer),
222
+ (x + width - chamfer, y + height),
223
+ (x + chamfer, y + height),
224
+ (x, y + height - chamfer),
225
+ (x, y + chamfer)
226
+ ]
227
+ polygon = patches.Polygon(coords, closed=True, **kwargs)
228
+ ax.add_patch(polygon)
229
+
230
+ def add_hexagon(ax, xy, radius, **kwargs):
231
+ x, y = xy
232
+ vertices = [(x + radius * math.cos(2 * math.pi * n / 6), y + radius * math.sin(2 * math.pi * n / 6)) for n in range(6)]
233
+ hexagon = patches.Polygon(vertices, closed=True, **kwargs)
234
+ ax.add_patch(hexagon)
235
+
236
+ def add_fwd(ax, xy, width, height, **kwargs):
237
+ x, y = xy
238
+ top_width = width * 0.80
239
+ coords = [
240
+ (0, 0),
241
+ (width, 0),
242
+ (width - (width - top_width) / 2, height),
243
+ ((width - top_width) / 2, height)
244
+ ]
245
+ trapezoid = patches.Polygon(coords, closed=True, **kwargs)
246
+ t = transforms.Affine2D().rotate_deg(90).translate(x, y)
247
+ trapezoid.set_transform(t + ax.transData)
248
+ ax.add_patch(trapezoid)
249
+ text_t = transforms.Affine2D().rotate_deg(90).translate(x + height / 2, y + width / 2)
250
+ ax.text(0, -1, "FWD", ha='center', va='center', fontsize=7, weight='bold', transform=text_t + ax.transData)
251
+
252
+ # Sidebar file upload and FPSO selection
253
+ st.sidebar.title("Upload Notifications Dataset")
254
+
255
+ # Add database loading option
256
+ load_from_db = st.sidebar.checkbox("Load from Database", help="Load previously uploaded data from database")
257
+
258
+ # Add preprocessing option
259
+ enable_preprocessing = st.sidebar.checkbox("Enable Data Preprocessing", value=True,
260
+ help="Remove unnecessary columns and optimize memory usage")
261
+
262
+ uploaded_file = st.sidebar.file_uploader("Choose an Excel file", type=["xlsx"])
263
+
264
+ # Add FPSO selection dropdown in the sidebar
265
+ selected_fpso = st.sidebar.selectbox("Select FPSO for Layout", ['GIR', 'DAL', 'PAZ', 'CLV'])
266
+
267
+
268
+
269
+ # NI/NC keywords (if not already in utils.py, move them there)
270
+ NI_keywords = ['WRAP', 'WELD', 'TBR', 'PACH', 'PATCH', 'OTHE', 'CLMP', 'REPL',
271
+ 'BOND', 'BOLT', 'SUPP', 'OT', 'GASK', 'CLAMP']
272
+ NC_keywords = ['COA', 'ICOA', 'CUSP', 'WELD', 'REPL', 'CUSP1', 'CUSP2']
273
+
274
+ DB_PATH = 'notifs_data.db'
275
+ TABLE_NAME = 'notifications'
276
+
277
+ # Utility to save DataFrame to SQLite
278
+ def save_df_to_db(df, db_path=DB_PATH, table_name=TABLE_NAME):
279
+ with sqlite3.connect(db_path) as conn:
280
+ df.to_sql(table_name, conn, if_exists='replace', index=False)
281
+ # Save timestamp
282
+ from datetime import datetime
283
+ timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
284
+ conn.execute("CREATE TABLE IF NOT EXISTS metadata (key TEXT PRIMARY KEY, value TEXT)")
285
+ conn.execute("INSERT OR REPLACE INTO metadata VALUES (?, ?)", ('last_updated', timestamp))
286
+
287
+ # Utility to load DataFrame from SQLite
288
+ def load_df_from_db(db_path=DB_PATH, table_name=TABLE_NAME):
289
+ with sqlite3.connect(db_path) as conn:
290
+ try:
291
+ return pd.read_sql(f'SELECT * FROM {table_name}', conn)
292
+ except Exception:
293
+ return None
294
+
295
+ # Utility to get last update timestamp
296
+ def get_last_update_time(db_path=DB_PATH):
297
+ with sqlite3.connect(db_path) as conn:
298
+ try:
299
+ result = conn.execute("SELECT value FROM metadata WHERE key = 'last_updated'").fetchone()
300
+ return result[0] if result else None
301
+ except Exception:
302
+ return None
303
+
304
+ # Data Preprocessing Function
305
+ def preprocess_notifications_data(df):
306
+ """
307
+ Preprocess notification data to reduce size and improve performance
308
+ by removing unnecessary columns and optimizing memory usage.
309
+ """
310
+ # Store original shape for comparison
311
+ original_shape = df.shape
312
+ original_memory = df.memory_usage(deep=True).sum()
313
+
314
+ # Remove unnecessary columns to improve memory footprint
315
+ columns_to_remove = [
316
+ 'Priority', # Redundant priority information
317
+ 'Notification', # Duplicate notification data
318
+ 'Order', # Order information not needed for analytics
319
+ 'Planner group' # Planner group metadata
320
+ ]
321
+
322
+ # Remove specified columns (ignore if they don't exist)
323
+ df_cleaned = df.drop(columns=columns_to_remove, errors='ignore')
324
+
325
+ # Remove columns with high percentage of null values (>80%)
326
+ null_percentage = df_cleaned.isnull().sum() / len(df_cleaned) * 100
327
+ high_null_columns = null_percentage[null_percentage > 80].index.tolist()
328
+ df_cleaned = df_cleaned.drop(columns=high_null_columns)
329
+
330
+ # Remove duplicate rows
331
+ df_cleaned = df_cleaned.drop_duplicates()
332
+
333
+ # Optimize data types for memory efficiency
334
+ for col in df_cleaned.columns:
335
+ if df_cleaned[col].dtype == 'object':
336
+ # Convert object columns to category if they have few unique values
337
+ if df_cleaned[col].nunique() / len(df_cleaned) < 0.5:
338
+ df_cleaned[col] = df_cleaned[col].astype('category')
339
+ elif df_cleaned[col].dtype == 'int64':
340
+ # Downcast integers
341
+ df_cleaned[col] = pd.to_numeric(df_cleaned[col], downcast='integer')
342
+ elif df_cleaned[col].dtype == 'float64':
343
+ # Downcast floats
344
+ df_cleaned[col] = pd.to_numeric(df_cleaned[col], downcast='float')
345
+
346
+ # Calculate improvements
347
+ final_shape = df_cleaned.shape
348
+ final_memory = df_cleaned.memory_usage(deep=True).sum()
349
+
350
+ # Create summary of preprocessing results
351
+ preprocessing_summary = {
352
+ 'original_rows': original_shape[0],
353
+ 'original_cols': original_shape[1],
354
+ 'final_rows': final_shape[0],
355
+ 'final_cols': final_shape[1],
356
+ 'rows_removed': original_shape[0] - final_shape[0],
357
+ 'cols_removed': original_shape[1] - final_shape[1],
358
+ 'original_memory_mb': original_memory / 1024 / 1024,
359
+ 'final_memory_mb': final_memory / 1024 / 1024,
360
+ 'memory_reduction_mb': (original_memory - final_memory) / 1024 / 1024,
361
+ 'memory_reduction_percent': ((original_memory - final_memory) / original_memory) * 100,
362
+ 'removed_columns': columns_to_remove + high_null_columns
363
+ }
364
+
365
+ return df_cleaned, preprocessing_summary
366
+
367
+ # Data Management Section
368
+ st.sidebar.markdown("---")
369
+ st.sidebar.subheader("Data Management")
370
+
371
+ # Check if data exists in database
372
+ existing_data = load_df_from_db()
373
+ if existing_data is not None:
374
+ st.sidebar.info(f"📊 Database contains {len(existing_data)} records")
375
+
376
+ # Show last update time
377
+ last_update = get_last_update_time()
378
+ if last_update:
379
+ st.sidebar.caption(f"🕒 Last updated: {last_update}")
380
+
381
+ # Show data summary
382
+ with st.sidebar.expander("Data Summary"):
383
+ if 'FPSO' in existing_data.columns:
384
+ fpsos = existing_data['FPSO'].value_counts()
385
+ st.write("**FPSO Distribution:**")
386
+ for fpso, count in fpsos.items():
387
+ st.write(f"• {fpso}: {count}")
388
+
389
+ if 'Notifictn type' in existing_data.columns:
390
+ notif_types = existing_data['Notifictn type'].value_counts()
391
+ st.write("**Notification Types:**")
392
+ for ntype, count in notif_types.items():
393
+ st.write(f"• {ntype}: {count}")
394
+
395
+ # Add clear database option
396
+ if st.sidebar.button("🗑️ Clear Database"):
397
+ import os
398
+ if os.path.exists(DB_PATH):
399
+ os.remove(DB_PATH)
400
+ st.sidebar.success("Database cleared successfully!")
401
+ st.rerun()
402
+ else:
403
+ st.sidebar.warning("No data in database")
404
+
405
+
406
+
407
+ # Main app logic
408
+ if uploaded_file is not None or load_from_db:
409
+ try:
410
+ if load_from_db:
411
+ df = load_df_from_db()
412
+ if df is None:
413
+ st.warning("No data found in the database. Please upload a new file or ensure it's saved.")
414
+ st.stop()
415
+ else:
416
+ st.success("📊 Data loaded from database successfully!")
417
+ else:
418
+ # Read the Excel file
419
+ df = pd.read_excel(uploaded_file, sheet_name='Global Notifications')
420
+
421
+ # Apply data preprocessing if enabled
422
+ if enable_preprocessing:
423
+ st.info("🔄 Preprocessing data to optimize performance...")
424
+ df, preprocessing_summary = preprocess_notifications_data(df)
425
+
426
+ # Display preprocessing results
427
+ with st.expander("📊 Data Preprocessing Summary", expanded=True):
428
+ col1, col2, col3 = st.columns(3)
429
+ with col1:
430
+ st.metric("Rows", f"{preprocessing_summary['final_rows']:,}",
431
+ f"-{preprocessing_summary['rows_removed']:,}")
432
+ with col2:
433
+ st.metric("Columns", f"{preprocessing_summary['final_cols']}",
434
+ f"-{preprocessing_summary['cols_removed']}")
435
+ with col3:
436
+ st.metric("Memory", f"{preprocessing_summary['final_memory_mb']:.1f} MB",
437
+ f"-{preprocessing_summary['memory_reduction_mb']:.1f} MB")
438
+
439
+ st.write(f"**Memory reduction:** {preprocessing_summary['memory_reduction_percent']:.1f}%")
440
+
441
+ if preprocessing_summary['removed_columns']:
442
+ st.write("**Removed columns:**")
443
+ for col in preprocessing_summary['removed_columns']:
444
+ st.write(f"• {col}")
445
+
446
+ # Save preprocessed data to DB for persistence
447
+ save_df_to_db(df)
448
+ st.success("✅ Data preprocessed and saved to database!")
449
+ else:
450
+ # Save original data to DB for persistence
451
+ save_df_to_db(df)
452
+ st.success("✅ Data uploaded and saved to database!")
453
+
454
+ # Strip whitespace from column names
455
+ df.columns = df.columns.str.strip()
456
+
457
+ # Define expected columns (names as they appear in the source export)
458
+ expected_columns = {
459
+ 'Notifictn type': 'Notifictn type',
460
+ 'Created on': 'Created on',
461
+ 'Description': 'Description',
462
+ 'FPSO': 'FPSO'
463
+ }
464
+
465
+ # Check if all expected columns are present and map them
466
+ missing_columns = []
467
+ column_mapping = {}
468
+ for expected, actual in expected_columns.items():
469
+ if actual in df.columns:
470
+ column_mapping[expected] = actual
471
+ else:
472
+ missing_columns.append(actual)
473
+
474
+ if missing_columns:
475
+ st.error(f"The following expected columns are missing: {missing_columns}")
476
+ st.write("Please ensure your Excel file contains these columns with the exact names.")
477
+ st.stop()
478
+
479
+ # Rename columns for consistency in processing
480
+ df = df[list(column_mapping.values())]
481
+ df.columns = list(expected_columns.keys())
482
+ # Ensure df is a DataFrame after slicing
483
+ if not isinstance(df, pd.DataFrame):
484
+ df = pd.DataFrame(df)
485
+
486
+ # Preprocess FPSO: Keep only GIR, DAL, PAZ, CLV
487
+ valid_fpsos = ['GIR', 'DAL', 'PAZ', 'CLV']
488
+ df = df[df['FPSO'].isin(valid_fpsos)]
489
+ if not isinstance(df, pd.DataFrame):
490
+ df = pd.DataFrame(df)
491
+
492
+ # Extract NI/NC keywords
493
+ df['Extracted_Keywords'] = df.apply(extract_ni_nc_keywords, axis=1, args=('Notifictn type', 'Description'))
494
+
495
+ # Extract location keywords (modules, racks, etc.)
496
+ df['Extracted_Modules'] = df.apply(extract_location_keywords, axis=1, args=('Description', clv_module_keywords))
497
+ df['Extracted_Racks'] = df.apply(extract_location_keywords, axis=1, args=('Description', clv_rack_keywords))
498
+ df['Extracted_LivingQuarters'] = df.apply(extract_location_keywords, axis=1, args=('Description', clv_living_quarters_keywords))
499
+ df['Extracted_Flare'] = df.apply(extract_location_keywords, axis=1, args=('Description', clv_flare_keywords))
500
+ df['Extracted_FWD'] = df.apply(extract_location_keywords, axis=1, args=('Description', clv_fwd_keywords))
501
+ df['Extracted_HeliDeck'] = df.apply(extract_location_keywords, axis=1, args=('Description', clv_hexagons_keywords))
502
+
503
+ # Extract PAZ-specific location keywords
504
+ df['Extracted_PAZ_Modules'] = df.apply(extract_location_keywords, axis=1, args=('Description', paz_module_keywords))
505
+ df['Extracted_PAZ_Racks'] = df.apply(extract_location_keywords, axis=1, args=('Description', paz_rack_keywords))
506
+ df['Extracted_PAZ_LivingQuarters'] = df.apply(extract_location_keywords, axis=1, args=('Description', paz_living_quarters_keywords))
507
+ df['Extracted_PAZ_Flare'] = df.apply(extract_location_keywords, axis=1, args=('Description', paz_flare_keywords))
508
+ df['Extracted_PAZ_FWD'] = df.apply(extract_location_keywords, axis=1, args=('Description', paz_fwd_keywords))
509
+ df['Extracted_PAZ_HeliDeck'] = df.apply(extract_location_keywords, axis=1, args=('Description', paz_hexagons_keywords))
510
+
511
+ # Extract DAL-specific location keywords
512
+ df['Extracted_DAL_Modules'] = df.apply(extract_location_keywords, axis=1, args=('Description', dal_module_keywords))
513
+ df['Extracted_DAL_Racks'] = df.apply(extract_location_keywords, axis=1, args=('Description', dal_rack_keywords))
514
+ df['Extracted_DAL_LivingQuarters'] = df.apply(extract_location_keywords, axis=1, args=('Description', dal_living_quarters_keywords))
515
+ df['Extracted_DAL_Flare'] = df.apply(extract_location_keywords, axis=1, args=('Description', dal_flare_keywords))
516
+ df['Extracted_DAL_FWD'] = df.apply(extract_location_keywords, axis=1, args=('Description', dal_fwd_keywords))
517
+ df['Extracted_DAL_HeliDeck'] = df.apply(extract_location_keywords, axis=1, args=('Description', dal_hexagons_keywords))
518
+
519
+ # Split dataframe into NI and NC
520
+ df_ni = df[df['Notifictn type'] == 'NI'].copy()
521
+ if not isinstance(df_ni, pd.DataFrame):
522
+ df_ni = pd.DataFrame(df_ni)
523
+ df_nc = df[df['Notifictn type'] == 'NC'].copy()
524
+ if not isinstance(df_nc, pd.DataFrame):
525
+ df_nc = pd.DataFrame(df_nc)
526
+
527
+ # Create tabs
528
+ tab1, tab2, tab3, tab4, tab5 = st.tabs(["NI Notifications", "NC Notifications", "Summary Stats", "FPSO Layout", "🤖 RAG Assistant"])
529
+
530
+ # NI Notifications Tab
531
+ with tab1:
532
+ st.subheader("NI Notifications Analysis")
533
+ if not df_ni.empty:
534
+ ni_pivot = create_pivot_table(df_ni, index='FPSO', columns='Extracted_Keywords')
535
+ st.write("Pivot Table (Count of Keywords by FPSO):")
536
+ styled_ni_pivot = ni_pivot.style.apply(apply_fpso_colors, axis=None)
537
+ st.dataframe(styled_ni_pivot)
538
+ st.write(f"Total NI Notifications: {df_ni.shape[0]}")
539
+ else:
540
+ st.write("No NI notifications found in the dataset.")
541
+
542
+ # NC Notifications Tab
543
+ with tab2:
544
+ st.subheader("NC Notifications Analysis")
545
+ if not df_nc.empty:
546
+ nc_pivot = create_pivot_table(df_nc, index='FPSO', columns='Extracted_Keywords')
547
+ st.write("Pivot Table (Count of Keywords by FPSO):")
548
+ styled_nc_pivot = nc_pivot.style.apply(apply_fpso_colors, axis=None)
549
+ st.dataframe(styled_nc_pivot)
550
+ st.write(f"Total NC Notifications: {df_nc.shape[0]}")
551
+ else:
552
+ st.write("No NC notifications found in the dataset.")
553
+
554
+ # NI Summary 2025 Tab
555
+ with tab3:
556
+ st.subheader("2025 Raised")
557
+ # Filter for notifications in 2025
558
+ created_on_series = pd.to_datetime(df['Created on'])
559
+ df_2025 = df[created_on_series.dt.year == 2025].copy()
560
+ if not df_2025.empty:
561
+ # Add 'Month' column for monthly analysis
562
+ df_2025['Month'] = pd.to_datetime(df_2025['Created on']).dt.strftime('%b')
563
+ months_order = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
564
+ df_2025['Month'] = pd.Categorical(df_2025['Month'], categories=months_order, ordered=True)
565
+ # Group by FPSO, Month, and Notification Type
566
+ summary = df_2025.groupby(['FPSO', 'Month', 'Notifictn type']).size().unstack(fill_value=0)
567
+ # Reshape the data for NI and NC notifications
568
+ ni_summary = summary['NI'].unstack(level='Month') if 'NI' in summary else pd.DataFrame(index=pd.Index([]), columns=pd.Index(months_order))
569
+ nc_summary = summary['NC'].unstack(level='Month') if 'NC' in summary else pd.DataFrame(index=pd.Index([]), columns=pd.Index(months_order))
570
+ ni_summary = ni_summary.reindex(columns=pd.Index(months_order), fill_value=0) if not ni_summary.empty else pd.DataFrame(index=pd.Index([]), columns=pd.Index(months_order))
571
+ nc_summary = nc_summary.reindex(columns=pd.Index(months_order), fill_value=0) if not nc_summary.empty else pd.DataFrame(index=pd.Index([]), columns=pd.Index(months_order))
572
+ # Display NI Summary Table
573
+ st.write("NI's:")
574
+ st.dataframe(
575
+ ni_summary.style.set_table_styles([
576
+ {'selector': 'thead', 'props': [('display', 'none')]}
577
+ ]).set_properties(**{'text-align': 'center'})
578
+ )
579
+ # Display NC Summary Table
580
+ st.write("NC's:")
581
+ st.dataframe(
582
+ nc_summary.style.set_table_styles([
583
+ {'selector': 'thead', 'props': [('display', 'none')]}
584
+ ]).set_properties(**{'text-align': 'center'})
585
+ )
586
+ # Calculate totals
587
+ total_ni = df_2025[df_2025['Notifictn type'] == 'NI'].shape[0]
588
+ total_nc = df_2025[df_2025['Notifictn type'] == 'NC'].shape[0]
589
+ st.write(f"Grand Total NI Notifications: {total_ni}")
590
+ st.write(f"Grand Total NC Notifications: {total_nc}")
591
+ else:
592
+ st.write("No notifications found for 2025 in the dataset.")
593
+
594
+ with tab4:
595
+ st.subheader("FPSO Layout Visualization")
596
+ notification_type = st.radio("Select Notification Type", ['NI', 'NC'])
597
+ # Count NI or NC notifications for each location type for the selected FPSO (CLV, PAZ, DAL)
598
+ df_selected = df[df['FPSO'] == selected_fpso].copy()
599
+ if notification_type == 'NI':
600
+ df_selected = df_selected[df_selected['Notifictn type'] == 'NI']
601
+ else: # NC
602
+ df_selected = df_selected[df_selected['Notifictn type'] == 'NC']
603
+ # Initialize counts for all location types
604
+ location_counts = {
605
+ 'Modules': pd.DataFrame(index=pd.Index(clv_module_keywords), columns=['Count']).fillna(0),
606
+ 'Racks': pd.DataFrame(index=pd.Index(clv_rack_keywords), columns=['Count']).fillna(0),
607
+ 'LivingQuarters': pd.DataFrame(index=pd.Index(clv_living_quarters_keywords), columns=['Count']).fillna(0),
608
+ 'Flare': pd.DataFrame(index=pd.Index(clv_flare_keywords), columns=['Count']).fillna(0),
609
+ 'FWD': pd.DataFrame(index=pd.Index(clv_fwd_keywords), columns=['Count']).fillna(0),
610
+ 'HeliDeck': pd.DataFrame(index=pd.Index(clv_hexagons_keywords), columns=['Count']).fillna(0)
611
+ }
612
+ paz_location_counts = {
613
+ 'PAZ_Modules': pd.DataFrame(index=pd.Index(paz_module_keywords), columns=['Count']).fillna(0),
614
+ 'PAZ_Racks': pd.DataFrame(index=pd.Index(paz_rack_keywords), columns=['Count']).fillna(0),
615
+ 'LivingQuarters': pd.DataFrame(index=pd.Index(paz_living_quarters_keywords), columns=['Count']).fillna(0),
616
+ 'Flare': pd.DataFrame(index=pd.Index(paz_flare_keywords), columns=['Count']).fillna(0),
617
+ 'FWD': pd.DataFrame(index=pd.Index(paz_fwd_keywords), columns=['Count']).fillna(0),
618
+ 'HeliDeck': pd.DataFrame(index=pd.Index(paz_hexagons_keywords), columns=['Count']).fillna(0)
619
+ }
620
+ dal_location_counts = {
621
+ 'DAL_Modules': pd.DataFrame(index=pd.Index(dal_module_keywords), columns=['Count']).fillna(0),
622
+ 'DAL_Racks': pd.DataFrame(index=pd.Index(dal_rack_keywords), columns=['Count']).fillna(0),
623
+ 'LivingQuarters': pd.DataFrame(index=pd.Index(dal_living_quarters_keywords), columns=['Count']).fillna(0),
624
+ 'Flare': pd.DataFrame(index=pd.Index(dal_flare_keywords), columns=['Count']).fillna(0),
625
+ 'FWD': pd.DataFrame(index=pd.Index(dal_fwd_keywords), columns=['Count']).fillna(0),
626
+ 'HeliDeck': pd.DataFrame(index=pd.Index(dal_hexagons_keywords), columns=['Count']).fillna(0)
627
+ }
628
+ # Count notifications for each location type and placement
629
+ for location_type, keywords in [
630
+ ('Modules', clv_module_keywords),
631
+ ('Racks', clv_rack_keywords),
632
+ ('LivingQuarters', clv_living_quarters_keywords),
633
+ ('Flare', clv_flare_keywords),
634
+ ('FWD', clv_fwd_keywords),
635
+ ('HeliDeck', clv_hexagons_keywords)
636
+ ]:
637
+ for keyword in keywords:
638
+ count = df_selected[f'Extracted_{location_type}'].str.contains(keyword, na=False).sum()
639
+ location_counts[location_type].loc[keyword, 'Count'] = count
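+ # str.contains does substring matching, so a description naming several locations
+ # increments the count of each matching keyword.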
640
+ for location_type, keywords in [
641
+ ('PAZ_Modules', paz_module_keywords),
642
+ ('PAZ_Racks', paz_rack_keywords),
643
+ ('LivingQuarters', paz_living_quarters_keywords),
644
+ ('Flare', paz_flare_keywords),
645
+ ('FWD', paz_fwd_keywords),
646
+ ('HeliDeck', paz_hexagons_keywords)
647
+ ]:
648
+ for keyword in keywords:
649
+ if location_type == 'PAZ_Modules':
650
+ count = df_selected['Extracted_PAZ_Modules'].str.contains(keyword, na=False).sum()
651
+ paz_location_counts[location_type].loc[keyword, 'Count'] = count
652
+ elif location_type == 'PAZ_Racks':
653
+ count = df_selected['Extracted_PAZ_Racks'].str.contains(keyword, na=False).sum()
654
+ paz_location_counts[location_type].loc[keyword, 'Count'] = count
655
+ else:
656
+ count = df_selected[f'Extracted_{location_type}'].str.contains(keyword, na=False).sum()
657
+ paz_location_counts[location_type].loc[keyword, 'Count'] = count
658
+ for location_type, keywords in [
659
+ ('DAL_Modules', dal_module_keywords),
660
+ ('DAL_Racks', dal_rack_keywords),
661
+ ('LivingQuarters', dal_living_quarters_keywords),
662
+ ('Flare', dal_flare_keywords),
663
+ ('FWD', dal_fwd_keywords),
664
+ ('HeliDeck', dal_hexagons_keywords)
665
+ ]:
666
+ for keyword in keywords:
667
+ if location_type == 'DAL_Modules':
668
+ count = df_selected['Extracted_DAL_Modules'].str.contains(keyword, na=False).sum()
669
+ dal_location_counts[location_type].loc[keyword, 'Count'] = count
670
+ elif location_type == 'DAL_Racks':
671
+ count = df_selected['Extracted_DAL_Racks'].str.contains(keyword, na=False).sum()
672
+ dal_location_counts[location_type].loc[keyword, 'Count'] = count
673
+ else:
674
+ count = df_selected[f'Extracted_{location_type}'].str.contains(keyword, na=False).sum()
675
+ dal_location_counts[location_type].loc[keyword, 'Count'] = count
676
+ total_lq_count = sum(
677
+ df_selected['Extracted_LivingQuarters'].str.contains(keyword, na=False).sum()
678
+ for keyword in clv_living_quarters_keywords
679
+ )
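+ # Note: the CLV LQ keyword list is reused for every unit here; Extracted_LivingQuarters
+ # is normalised to 'LQ' upstream, so this effectively counts rows flagged as LQ.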
680
+ # Draw the FPSO layout and overlay notification counts
681
+ def draw_fpso_layout(selected_unit):
682
+ fig, ax = plt.subplots(figsize=(13, 8))
683
+ ax.set_xlim(0, 13.5)
684
+ ax.set_ylim(0, 3.5)
685
+ ax.set_aspect('equal')
686
+ ax.grid(False)
687
+ ax.set_facecolor('#E6F3FF')
688
+
689
+ # Remove axes for cleaner visualization
690
+ ax.set_xticks([])
691
+ ax.set_yticks([])
692
+ ax.spines['top'].set_visible(False)
693
+ ax.spines['right'].set_visible(False)
694
+ ax.spines['bottom'].set_visible(False)
695
+ ax.spines['left'].set_visible(False)
696
+ if selected_unit == 'CLV':
697
+ draw_clv(ax, add_chamfered_rectangle, add_rectangle, add_hexagon, add_fwd)
698
+ elif selected_unit == 'PAZ':
699
+ draw_paz(ax, add_chamfered_rectangle, add_rectangle, add_hexagon, add_fwd)
700
+ elif selected_unit == 'DAL':
701
+ draw_dal(ax, add_chamfered_rectangle, add_rectangle, add_hexagon, add_fwd)
702
+ elif selected_unit == 'GIR':
703
+ draw_gir(ax, add_chamfered_rectangle, add_rectangle, add_hexagon, add_fwd)
704
+ return fig
705
+ fig = draw_fpso_layout(selected_fpso)
706
+ ax = fig.gca()
707
+ # Overlay notification counts on locations for CLV, PAZ and DAL
708
+ if selected_fpso == 'CLV':
709
+ # Modules
710
+ for module, (row, col) in clv_modules.items():
711
+ if module in clv_module_keywords:
712
+ count = int(location_counts['Modules'].loc[module, 'Count'])
713
+ if count > 0:
714
+ # Position the count slightly above and to the right of the module label (col maps to the x-axis, row to the y-axis)
715
+ ax.text(col + 0.8, row + 0.8, f"{count}",
716
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
717
+
718
+ # Racks
719
+ for rack, (row, col) in clv_racks.items():
720
+ if rack in clv_rack_keywords:
721
+ count = int(location_counts['Racks'].loc[rack, 'Count'])
722
+ if count > 0:
723
+ # Position count slightly above and to the right of the rack text
724
+ ax.text(col + 0.7, row + 0.4, f"{count}",
725
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
726
+
727
+ # Living Quarters (with total count)
728
+ for lq, (row, col) in clv_living_quarters.items():
729
+ if total_lq_count > 0:
730
+ # Position count slightly above and to the right of the LQ text
731
+ ax.text(col + 0.7, row + 1.4, f"{total_lq_count}",
732
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
733
+
734
+ # Flare
735
+ for flare_loc, (row, col) in clv_flare.items():
736
+ if flare_loc in clv_flare_keywords:
737
+ count = int(location_counts['Flare'].loc[flare_loc, 'Count'])
738
+ if count > 0:
739
+ # Position count slightly above and to the right of the flare text
740
+ ax.text(col + 0.7, row + 0.4, f"{count}",
741
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
742
+
743
+ # FWD
744
+ for fwd_loc, (row, col) in clv_fwd.items():
745
+ if fwd_loc in clv_fwd_keywords:
746
+ count = int(location_counts['FWD'].loc[fwd_loc, 'Count'])
747
+ if count > 0:
748
+ # Position count slightly above and to the left of the FWD text (adjusted for rotation)
749
+ ax.text(col + 0.75, row + 1.4, f"{count}",
750
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
751
+
752
+ # Heli-deck
753
+ for hexagon, (row, col) in clv_hexagons.items():
754
+ if hexagon in clv_hexagons_keywords:
755
+ count = int(location_counts['HeliDeck'].loc[hexagon, 'Count'])
756
+ if count > 0:
757
+ # Position count slightly above and to the right of the heli-deck text
758
+ ax.text(col + 0.2, row + 0.2, f"{count}",
759
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
760
+
761
+ # Total counts at the bottom of the layout
762
+ total_ni = df_selected[df_selected['Notifictn type'] == 'NI'].shape[0]
763
+ total_nc = df_selected[df_selected['Notifictn type'] == 'NC'].shape[0]
764
+ ax.text(6, 0.25, f"NI: {total_ni}\nNC: {total_nc}", ha='center', va='center', fontsize=8, weight='bold', color='red')
765
+
766
+ elif selected_fpso == 'PAZ':
767
+ # PAZ Modules
768
+ for module, (row, col) in paz_modules.items():
769
+ if module in paz_module_keywords:
770
+ count = int(paz_location_counts['PAZ_Modules'].loc[module, 'Count'])
771
+ if count > 0:
772
+ # Position count slightly above and to the right of the module text
773
+ ax.text(col + 0.8, row + 0.8, f"{count}",
774
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
775
+
776
+ # PAZ Racks
777
+ for rack, (row, col) in paz_racks.items():
778
+ if rack in paz_rack_keywords:
779
+ count = int(paz_location_counts['PAZ_Racks'].loc[rack, 'Count'])
780
+ if count > 0:
781
+ # Position count slightly above and to the right of the rack text
782
+ ax.text(col + 0.7, row + 0.4, f"{count}",
783
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
784
+
785
+ # Living Quarters (with total count)
786
+ for lq, (row, col) in paz_living_quarters.items():
787
+ if total_lq_count > 0:
788
+ # Position count slightly above and to the right of the LQ text
789
+ ax.text(col + 0.7, row + 1.4, f"{total_lq_count}",
790
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
791
+
792
+ # Flare
793
+ for flare_loc, (row, col) in paz_flare.items():
794
+ if flare_loc in paz_flare_keywords:
795
+ count = int(paz_location_counts['Flare'].loc[flare_loc, 'Count'])
796
+ if count > 0:
797
+ # Position count slightly above and to the right of the flare text
798
+ ax.text(col + 0.7, row + 0.4, f"{count}",
799
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
800
+
801
+ # FWD
802
+ for fwd_loc, (row, col) in paz_fwd.items():
803
+ if fwd_loc in paz_fwd_keywords:
804
+ count = int(paz_location_counts['FWD'].loc[fwd_loc, 'Count'])
805
+ if count > 0:
806
+ # Position count slightly above and to the left of the FWD text (adjusted for rotation)
807
+ ax.text(col + 0.75, row + 1.4, f"{count}",
808
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
809
+
810
+ # Heli-deck
811
+ for hexagon, (row, col) in paz_hexagons.items():
812
+ if hexagon in paz_hexagons_keywords:
813
+ count = int(paz_location_counts['HeliDeck'].loc[hexagon, 'Count'])
814
+ if count > 0:
815
+ # Position count slightly above and to the right of the heli-deck text
816
+ ax.text(col + 0.2, row + 0.2, f"{count}",
817
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
818
+
819
+ # Total counts at the bottom
820
+ total_ni = df_selected[df_selected['Notifictn type'] == 'NI'].shape[0]
821
+ total_nc = df_selected[df_selected['Notifictn type'] == 'NC'].shape[0]
822
+ ax.text(6, 0.25, f"NI: {total_ni}\nNC: {total_nc}", ha='center', va='center', fontsize=8, weight='bold', color='red')
823
+
824
+ elif selected_fpso == 'DAL':
825
+ # DAL Modules
826
+ for module, (row, col) in dal_modules.items():
827
+ if module in dal_module_keywords:
828
+ count = int(dal_location_counts['DAL_Modules'].loc[module, 'Count'])
829
+ if count > 0:
830
+ # Position count slightly above and to the right of the module text
831
+ ax.text(col + 0.8, row + 0.8, f"{count}",
832
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
833
+
834
+ # DAL Racks
835
+ for rack, (row, col) in dal_racks.items():
836
+ if rack in dal_rack_keywords:
837
+ count = int(dal_location_counts['DAL_Racks'].loc[rack, 'Count'])
838
+ if count > 0:
839
+ # Position count slightly above and to the right of the rack text
840
+ ax.text(col + 0.7, row + 0.4, f"{count}",
841
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
842
+
843
+ # Living Quarters (with total count)
844
+ for lq, (row, col) in dal_living_quarters.items():
845
+ if total_lq_count > 0:
846
+ # Position count slightly above and to the right of the LQ text
847
+ ax.text(col + 0.7, row + 1.4, f"{total_lq_count}",
848
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
849
+
850
+ # Flare
851
+ for flare_loc, (row, col) in dal_flare.items():
852
+ if flare_loc in dal_flare_keywords:
853
+ count = int(dal_location_counts['Flare'].loc[flare_loc, 'Count'])
854
+ if count > 0:
855
+ # Position count slightly above and to the right of the flare text
856
+ ax.text(col + 0.7, row + 0.4, f"{count}",
857
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
858
+
859
+ # FWD
860
+ for fwd_loc, (row, col) in dal_fwd.items():
861
+ if fwd_loc in dal_fwd_keywords:
862
+ count = int(dal_location_counts['FWD'].loc[fwd_loc, 'Count'])
863
+ if count > 0:
864
+ # Position count slightly above and to the left of the FWD text (adjusted for rotation)
865
+ ax.text(col + 0.75, row + 1.4, f"{count}",
866
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
867
+
868
+ # Heli-deck
869
+ for hexagon, (row, col) in dal_hexagons.items():
870
+ if hexagon in dal_hexagons_keywords:
871
+ count = int(dal_location_counts['HeliDeck'].loc[hexagon, 'Count'])
872
+ if count > 0:
873
+ # Position count slightly above and to the right of the heli-deck text
874
+ ax.text(col + 0.2, row + 0.2, f"{count}",
875
+ ha='center', va='center', fontsize=6, weight='bold', color='red')
876
+
877
+ # Total counts at the bottom
878
+ total_ni = df_selected[df_selected['Notifictn type'] == 'NI'].shape[0]
879
+ total_nc = df_selected[df_selected['Notifictn type'] == 'NC'].shape[0]
880
+ ax.text(6, 0.25, f"NI: {total_ni}\nNC: {total_nc}", ha='center', va='center', fontsize=8, weight='bold', color='red')
881
+
882
+ else:
883
+ # Display placeholder text for non-implemented FPSOs
884
+ ax.text(6, 1.75, f"{selected_fpso} Layout\n(Implementation work in progress...)", ha='center', va='center', fontsize=16, weight='bold')
885
+
886
+ plt.title(f"FPSO Visualization - {selected_fpso}", fontsize=16)
887
+ st.pyplot(fig)
888
+ plt.close(fig) # Close the figure to free memory
889
+
890
+ # RAG Assistant Tab
891
+ with tab5:
892
+ st.subheader("🤖 DigiTwin RAG Assistant")
893
+ st.markdown("Ask me anything about your FPSO notifications data!")
894
+
895
+ # Import and initialize RAG system
896
+ try:
897
+ from rag_chatbot import DigiTwinRAG, render_chat_interface
898
+
899
+ # Initialize RAG system
900
+ if 'rag_system' not in st.session_state:
901
+ with st.spinner("Initializing RAG system..."):
902
+ st.session_state.rag_system = DigiTwinRAG()
903
+
904
+ # Render chat interface
905
+ render_chat_interface(st.session_state.rag_system)
906
+
907
+ except ImportError as e:
908
+ st.error(f"❌ RAG module not available: {e}")
909
+ st.info("💡 To enable RAG functionality, install the required dependencies:")
910
+ st.code("pip install -r requirements_rag.txt")
911
+
912
+ # Show sample questions
913
+ st.markdown("### 💡 Sample Questions You Can Ask:")
914
+ sample_questions = [
915
+ "Which FPSO has the most NI notifications?",
916
+ "What are the common keywords in PAZ notifications?",
917
+ "Show me all safety-related notifications from last month",
918
+ "Compare notification patterns between GIR and DAL",
919
+ "What equipment has the most maintenance issues?",
920
+ "Which work centers require immediate attention?"
921
+ ]
922
+
923
+ for question in sample_questions:
924
+ st.write(f"• {question}")
925
+
926
+ except Exception as e:
927
+ st.error(f"❌ Error initializing RAG system: {e}")
928
+ st.info("Please check your LLM configuration and vector database setup.")
929
+
930
+ except Exception as e:
931
+ st.error(f"An error occurred: {e}")
932
+ else:
933
+ st.write('Please upload an Excel file to proceed.')
934
+
935
+ # Add footer with rocket emojis and branding
936
+ st.markdown("---")
937
+ st.markdown(
938
+ """
939
+ <div style="text-align: center; padding: 20px; border-radius: 10px; margin-top: 30px;">
940
+ <p style="font-size: 14px; color: #6c757d; margin: 0;">
941
+ 🚀 Built with Pride - STP/INSP/MET | Powered by <a href="https://www.valonylabs.com" target="_blank" style="color: #007bff; text-decoration: none; font-weight: bold;">ValonyLabs</a> 🚀
942
+ </p>
943
+ </div>
944
+ """,
945
+ unsafe_allow_html=True
946
+ )
947
+
948
+
src/notifs_data.db ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:547aa0eedb75395560b1fbbef46d6773e006f3fb628aa787b7ce2efd72a067f7
3
+ size 4038656
src/paz.py ADDED
@@ -0,0 +1,55 @@
1
+ # paz.py
2
+
3
+ # PAZ-specific keywords and location dictionaries
4
+ paz_module_keywords = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8']
5
+ paz_rack_keywords = ['R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8']
6
+ paz_living_quarters_keywords = ['LQ', 'LQ1', 'LQ2', 'LQ3', 'LQ4', 'LQL0', 'LQPS', 'LQSB', 'LQROOF', 'LQL4', 'LQL2', 'LQ-5', 'LQPD', 'LQ PS', 'LQAFT', 'LQ-T', 'LQL1S']
7
+ paz_flare_keywords = ['FLARE']
8
+ paz_fwd_keywords = ['FWD']
9
+ paz_hexagons_keywords = ['HELIDECK']
10
+
11
+ paz_modules = {
12
+ 'L1': (0.75, 2), 'P1': (0.5, 3), 'P2': (0.5, 4), 'P3': (0.5, 5), 'P4': (0.5, 6),
13
+ 'P5': (0.5, 7), 'P6': (0.5, 8), 'P7': (0.5, 9), 'P8': (0.5, 10), 'L2': (1.75, 2),
14
+ 'S1': (2, 3), 'S2': (2, 4), 'S3': (2, 5), 'S4': (2, 6),
15
+ 'S5': (2, 7), 'S6': (2, 8), 'S7': (2, 9), 'S8': (2, 10)
16
+ }
17
+ paz_racks = {
18
+ 'R1': (1.5, 3), 'R2': (1.5, 4), 'R3': (1.5, 5),
19
+ 'R4': (1.5, 6), 'R5': (1.5, 7), 'R6': (1.5, 8),
20
+ 'R7': (1.5, 9), 'R8': (1.5, 10)
21
+ }
22
+ paz_flare = {'FLARE': (0.5, 11)}
23
+ paz_living_quarters = {'LQ': (0.5, 1)}
24
+ paz_hexagons = {'HELIDECK': (2.75, 1)}
25
+ paz_fwd = {'FWD': (0.5, 11.75)}
26
+
27
+ def draw_paz(ax, add_chamfered_rectangle, add_rectangle, add_hexagon, add_fwd):
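+ # Renders the PAZ deck plan: module boxes, rack strips, the flare tower, the LQ block,
+ # the helideck hexagon and the FWD bow, using the drawing helpers passed in from utils.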
28
+ for module, (row, col) in paz_modules.items():
29
+ if module == 'L2':
30
+ height, y_position, text_y = 1.25, row, row + 0.5
31
+ elif module == 'L1':
32
+ height, y_position, text_y = 1.25, row - 0.25, row + 0.25
33
+ else:
34
+ height, y_position, text_y = 1, row, row + 0.5
35
+ add_chamfered_rectangle(ax, (col, y_position), 1, height, 0.1, edgecolor='black', facecolor='white')
36
+ ax.text(col + 0.5, text_y, module, ha='center', va='center', fontsize=7, weight='bold')
37
+
38
+ for rack, (row, col) in paz_racks.items():
39
+ add_chamfered_rectangle(ax, (col, row), 1, 0.5, 0.05, edgecolor='black', facecolor='white')
40
+ ax.text(col + 0.5, row + 0.25, rack, ha='center', va='center', fontsize=7, weight='bold')
41
+
42
+ for flare_loc, (row, col) in paz_flare.items():
43
+ add_chamfered_rectangle(ax, (col, row), 0.75, 2.5, 0.05, edgecolor='black', facecolor='white')
44
+ ax.text(col + 0.35, row + 1.25, flare_loc, ha='center', va='center', fontsize=7, weight='bold')
45
+
46
+ for living_quarter, (row, col) in paz_living_quarters.items():
47
+ add_rectangle(ax, (col, row), 1, 2.5, edgecolor='black', facecolor='white')
48
+ ax.text(col + 0.5, row + 1.25, living_quarter, ha='center', va='center', fontsize=7, rotation=90, weight='bold')
49
+
50
+ for hexagon, (row, col) in paz_hexagons.items():
51
+ add_hexagon(ax, (col, row), 0.60, edgecolor='black', facecolor='white')
52
+ ax.text(col, row, hexagon, ha='center', va='center', fontsize=7, weight='bold')
53
+
54
+ for fwd_loc, (row, col) in paz_fwd.items():
55
+ add_fwd(ax, (col, row), 2.5, -1, edgecolor='black', facecolor='white')
src/rag_chatbot.py ADDED
@@ -0,0 +1,548 @@
1
+ """
2
+ DigiTwin RAG Chatbot Module
3
+ A comprehensive RAG system with hybrid search, query rewriting, and streaming responses
4
+ """
5
+
6
+ import streamlit as st
7
+ import pandas as pd
8
+ import numpy as np
9
+ import sqlite3
10
+ import json
11
+ import time
12
+ from typing import List, Dict, Any, Optional, Tuple
13
+ import requests
14
+ from datetime import datetime
15
+ import re
16
+
17
+ # Vector database imports
18
+ try:
19
+ import weaviate
20
+ from weaviate import Client
21
+ WEAVIATE_AVAILABLE = True
22
+ except ImportError:
23
+ WEAVIATE_AVAILABLE = False
24
+
25
+ try:
26
+ import faiss
27
+ import faiss.cpu
28
+ FAISS_AVAILABLE = True
29
+ except ImportError:
30
+ FAISS_AVAILABLE = False
31
+
32
+ # Embedding imports
33
+ try:
34
+ from sentence_transformers import SentenceTransformer
35
+ SENTENCE_TRANSFORMERS_AVAILABLE = True
36
+ except ImportError:
37
+ SENTENCE_TRANSFORMERS_AVAILABLE = False
38
+
39
+ # LLM imports
40
+ try:
41
+ import groq
42
+ GROQ_AVAILABLE = True
43
+ except ImportError:
44
+ GROQ_AVAILABLE = False
45
+
46
+ try:
47
+ import ollama
48
+ OLLAMA_AVAILABLE = True
49
+ except ImportError:
50
+ OLLAMA_AVAILABLE = False
51
+
52
+ # Configuration
53
+ DB_PATH = 'notifs_data.db'
54
+ TABLE_NAME = 'notifications'
55
+ VECTOR_DB_PATH = 'vector_store'
56
+ EMBEDDING_MODEL = 'all-MiniLM-L6-v2'
57
+
58
+ class DigiTwinRAG:
59
+ """
60
+ Comprehensive RAG system for DigiTwin notifications analysis
61
+ """
62
+
63
+ def __init__(self, db_path: str = DB_PATH, vector_db_path: str = VECTOR_DB_PATH):
64
+ self.db_path = db_path
65
+ self.vector_db_path = vector_db_path
66
+ self.embedding_model = None
67
+ self.vector_store = None
68
+ self.llm_client = None
69
+ self.initialize_components()
70
+
71
+ def initialize_components(self):
72
+ """Initialize all RAG components"""
73
+ # Initialize embedding model
74
+ if SENTENCE_TRANSFORMERS_AVAILABLE:
75
+ try:
76
+ self.embedding_model = SentenceTransformer(EMBEDDING_MODEL)
77
+ st.success(f"✅ Embedding model loaded: {EMBEDDING_MODEL}")
78
+ except Exception as e:
79
+ st.error(f"❌ Failed to load embedding model: {e}")
80
+
81
+ # Initialize vector store
82
+ self.initialize_vector_store()
83
+
84
+ # Initialize LLM clients
85
+ self.initialize_llm_clients()
86
+
87
+ def initialize_vector_store(self):
88
+ """Initialize vector database (Weaviate or FAISS)"""
89
+ if WEAVIATE_AVAILABLE:
90
+ try:
91
+ self.vector_store = Client("http://localhost:8080")
92
+ st.success("✅ Weaviate vector store connected")
93
+ except Exception as e:
94
+ st.warning(f"⚠️ Weaviate not available: {e}")
95
+ self.vector_store = None
96
+
97
+ if not self.vector_store and FAISS_AVAILABLE:
98
+ try:
99
+ # Initialize FAISS index
100
+ dimension = 384 # all-MiniLM-L6-v2 dimension
101
+ self.vector_store = faiss.IndexFlatIP(dimension)
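+ # IndexFlatIP ranks by raw inner product; normalising embeddings (faiss.normalize_L2)
+ # would make this equivalent to cosine similarity.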
102
+ st.success("✅ FAISS vector store initialized")
103
+ except Exception as e:
104
+ st.error(f"❌ Failed to initialize FAISS: {e}")
105
+ self.vector_store = None
106
+
107
+ def initialize_llm_clients(self):
108
+ """Initialize LLM clients (Groq and Ollama)"""
109
+ self.llm_client = {}
110
+
111
+ # Initialize Groq client
112
+ if GROQ_AVAILABLE:
113
+ try:
114
+ # You'll need to set GROQ_API_KEY in environment
115
+ import os
116
+ api_key = os.getenv('GROQ_API_KEY')
117
+ if api_key:
118
+ self.llm_client['groq'] = groq.Groq(api_key=api_key)
119
+ st.success("✅ Groq client initialized")
120
+ else:
121
+ st.warning("⚠️ GROQ_API_KEY not found in environment")
122
+ except Exception as e:
123
+ st.warning(f"⚠️ Groq initialization failed: {e}")
124
+
125
+ # Initialize Ollama client
126
+ if OLLAMA_AVAILABLE:
127
+ try:
128
+ self.llm_client['ollama'] = ollama.Client(host='http://localhost:11434')
129
+ st.success("✅ Ollama client initialized")
130
+ except Exception as e:
131
+ st.warning(f"⚠️ Ollama initialization failed: {e}")
132
+
133
+ def load_notifications_data(self) -> pd.DataFrame:
134
+ """Load notifications data from SQLite database"""
135
+ try:
136
+ with sqlite3.connect(self.db_path) as conn:
137
+ df = pd.read_sql(f'SELECT * FROM {TABLE_NAME}', conn)
138
+ return df
139
+ except Exception as e:
140
+ st.error(f"❌ Failed to load data: {e}")
141
+ return pd.DataFrame()
142
+
143
+ def create_document_chunks(self, df: pd.DataFrame) -> List[Dict[str, Any]]:
144
+ """Create document chunks for vectorization"""
145
+ documents = []
146
+
147
+ for idx, row in df.iterrows():
148
+ # Create rich document representation
149
+ doc = {
150
+ 'id': f"doc_{idx}",
151
+ 'content': f"""
152
+ FPSO: {row.get('FPSO', 'N/A')}
153
+ Notification Type: {row.get('Notifictn type', 'N/A')}
154
+ {'(Notification of Integrity)' if row.get('Notifictn type') == 'NI' else '(Notification of Conformity)' if row.get('Notifictn type') == 'NC' else ''}
155
+ Description: {row.get('Description', 'N/A')}
156
+ Created: {row.get('Created on', 'N/A')}
157
+ Keywords: {row.get('Extracted_Keywords', 'N/A')}
158
+ Modules: {row.get('Extracted_Modules', 'N/A')}
159
+ Racks: {row.get('Extracted_Racks', 'N/A')}
160
+ """.strip(),
161
+ 'metadata': {
162
+ 'fpso': row.get('FPSO', 'N/A'),
163
+ 'notification_type': row.get('Notifictn type', 'N/A'),
164
+ 'created_date': row.get('Created on', 'N/A'),
165
+ 'keywords': row.get('Extracted_Keywords', 'N/A'),
166
+ 'modules': row.get('Extracted_Modules', 'N/A'),
167
+ 'racks': row.get('Extracted_Racks', 'N/A')
168
+ }
169
+ }
170
+ documents.append(doc)
171
+
172
+ return documents
173
+
174
+ def create_embeddings(self, documents: List[Dict[str, Any]]) -> np.ndarray:
175
+ """Create embeddings for documents"""
176
+ if not self.embedding_model:
177
+ st.error("❌ Embedding model not available")
178
+ return np.array([])
179
+
180
+ texts = [doc['content'] for doc in documents]
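+ # all-MiniLM-L6-v2 encodes each document into a 384-dimensional vector,
+ # matching the FAISS index dimension configured above.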
181
+ embeddings = self.embedding_model.encode(texts, show_progress_bar=True)
182
+ return embeddings
183
+
184
+ def index_documents(self, documents: List[Dict[str, Any]], embeddings: np.ndarray):
185
+ """Index documents in vector store"""
186
+ if not self.vector_store:
187
+ st.error("❌ Vector store not available")
188
+ return
189
+
190
+ if WEAVIATE_AVAILABLE and isinstance(self.vector_store, Client):
191
+ # Index in Weaviate
192
+ try:
193
+ for doc, embedding in zip(documents, embeddings):
194
+ self.vector_store.data_object.create(
195
+ data_object=doc['metadata'],
196
+ class_name="Notification",
197
+ vector=embedding.tolist()
198
+ )
199
+ st.success(f"✅ Indexed {len(documents)} documents in Weaviate")
200
+ except Exception as e:
201
+ st.error(f"❌ Failed to index in Weaviate: {e}")
202
+
203
+ elif FAISS_AVAILABLE and hasattr(self.vector_store, 'add'):
204
+ # Index in FAISS
205
+ try:
206
+ self.vector_store.add(embeddings.astype('float32'))
207
+ # Save document metadata separately
208
+ import pickle
209
+ with open(f"{self.vector_db_path}_metadata.pkl", 'wb') as f:
210
+ pickle.dump(documents, f)
211
+ st.success(f"✅ Indexed {len(documents)} documents in FAISS")
212
+ except Exception as e:
213
+ st.error(f"❌ Failed to index in FAISS: {e}")
214
+
215
+ def hybrid_search(self, query: str, k: int = 5) -> List[Dict[str, Any]]:
216
+ """Perform hybrid search (semantic + keyword)"""
217
+ results = []
218
+
219
+ # Semantic search
220
+ if self.embedding_model and self.vector_store:
221
+ query_embedding = self.embedding_model.encode([query])
222
+
223
+ if WEAVIATE_AVAILABLE and isinstance(self.vector_store, Client):
224
+ # Weaviate semantic search
225
+ try:
226
+ semantic_results = self.vector_store.query.get("Notification", [
227
+ "fpso", "notification_type", "created_date", "keywords", "modules", "racks"
228
+ ]).with_near_vector({
229
+ "vector": query_embedding[0].tolist()
230
+ }).with_limit(k).do()
231
+
232
+ for result in semantic_results['data']['Get']['Notification']:
233
+ results.append({
234
+ 'content': f"FPSO: {result['fpso']}, Type: {result['notification_type']}, Keywords: {result['keywords']}",
235
+ 'metadata': result,
236
+ 'score': 1.0 # Weaviate doesn't return scores by default
237
+ })
238
+ except Exception as e:
239
+ st.warning(f"⚠️ Weaviate search failed: {e}")
240
+
241
+ elif FAISS_AVAILABLE and hasattr(self.vector_store, 'search'):
242
+ # FAISS semantic search
243
+ try:
244
+ scores, indices = self.vector_store.search(query_embedding.astype('float32'), k)
245
+
246
+ # Load document metadata
247
+ import pickle
248
+ with open(f"{self.vector_db_path}_metadata.pkl", 'rb') as f:
249
+ documents = pickle.load(f)
250
+
251
+ for score, idx in zip(scores[0], indices[0]):
252
+ if idx < len(documents):
253
+ results.append({
254
+ 'content': documents[idx]['content'],
255
+ 'metadata': documents[idx]['metadata'],
256
+ 'score': float(score)
257
+ })
258
+ except Exception as e:
259
+ st.warning(f"⚠️ FAISS search failed: {e}")
260
+
261
+ # Keyword search as fallback
262
+ if not results:
263
+ df = self.load_notifications_data()
264
+ if not df.empty:
265
+ # Simple keyword matching
266
+ query_terms = query.lower().split()
267
+ for idx, row in df.iterrows():
268
+ text = f"{row.get('Description', '')} {row.get('Extracted_Keywords', '')}".lower()
269
+ if any(term in text for term in query_terms):
270
+ results.append({
271
+ 'content': f"FPSO: {row.get('FPSO')}, Type: {row.get('Notifictn type')}, Description: {row.get('Description', '')[:100]}...",
272
+ 'metadata': row.to_dict(),
273
+ 'score': 0.5
274
+ })
275
+ if len(results) >= k:
276
+ break
277
+
278
+ return results[:k]
279
+
280
+ def query_rewriter(self, query: str) -> str:
281
+ """Rewrite query for better retrieval"""
282
+ rewrite_prompt = f"""
283
+ Rewrite the following query to be more specific and searchable for FPSO notifications data.
284
+ Focus on technical terms, FPSO names (GIR, DAL, PAZ, CLV), notification types (NI/NC), and equipment.
285
+
286
+ Original query: {query}
287
+
288
+ Rewritten query:"""
289
+
290
+ # Use LLM to rewrite query
291
+ rewritten_query = self.generate_response(rewrite_prompt, max_tokens=50, temperature=0.3)
292
+ return rewritten_query.strip() if rewritten_query else query
293
+
294
+ def generate_pivot_analysis(self, df: pd.DataFrame) -> str:
295
+ """Generate pivot analysis summary"""
296
+ analysis = []
297
+
298
+ # FPSO distribution
299
+ if 'FPSO' in df.columns:
300
+ fpso_counts = df['FPSO'].value_counts()
301
+ analysis.append(f"**FPSO Distribution:** {', '.join([f'{fpso}: {count}' for fpso, count in fpso_counts.items()])}")
302
+
303
+ # Notification type distribution
304
+ if 'Notifictn type' in df.columns:
305
+ type_counts = df['Notifictn type'].value_counts()
306
+ analysis.append(f"**Notification Types:** {', '.join([f'{ntype}: {count}' for ntype, count in type_counts.items()])}")
307
+
308
+ # Keyword analysis
309
+ if 'Extracted_Keywords' in df.columns:
310
+ keywords = df['Extracted_Keywords'].str.split(', ').explode()
311
+ keywords = keywords[keywords != 'None']
312
+ if not keywords.empty:
313
+ top_keywords = keywords.value_counts().head(5)
314
+ analysis.append(f"**Top Keywords:** {', '.join([f'{kw}: {count}' for kw, count in top_keywords.items()])}")
315
+
316
+ return "\n".join(analysis)
317
+
318
+ def generate_response(self, prompt: str, max_tokens: int = 500, temperature: float = 0.7, stream: bool = False) -> str:
319
+ """Generate response using available LLM"""
320
+
321
+ # Try Groq first
322
+ if 'groq' in self.llm_client:
323
+ try:
324
+ if stream:
325
+ return self._stream_groq_response(prompt, max_tokens, temperature)
326
+ else:
327
+ response = self.llm_client['groq'].chat.completions.create(
328
+ model="llama3-8b-8192",
329
+ messages=[{"role": "user", "content": prompt}],
330
+ max_tokens=max_tokens,
331
+ temperature=temperature
332
+ )
333
+ return response.choices[0].message.content
334
+ except Exception as e:
335
+ st.warning(f"⚠️ Groq generation failed: {e}")
336
+
337
+ # Try Ollama as fallback
338
+ if 'ollama' in self.llm_client:
339
+ try:
340
+ if stream:
341
+ return self._stream_ollama_response(prompt, max_tokens, temperature)
342
+ else:
343
+ response = self.llm_client['ollama'].chat(
344
+ model='llama3.2',
345
+ messages=[{'role': 'user', 'content': prompt}]
346
+ )
347
+ return response['message']['content']
348
+ except Exception as e:
349
+ st.warning(f"⚠️ Ollama generation failed: {e}")
350
+
351
+ return "I apologize, but I'm unable to generate a response at the moment. Please check your LLM configuration."
352
+
353
+ def _stream_groq_response(self, prompt: str, max_tokens: int, temperature: float):
354
+ """Stream response from Groq"""
355
+ try:
356
+ response = self.llm_client['groq'].chat.completions.create(
357
+ model="llama3-8b-8192",
358
+ messages=[{"role": "user", "content": prompt}],
359
+ max_tokens=max_tokens,
360
+ temperature=temperature,
361
+ stream=True
362
+ )
363
+
364
+ full_response = ""
365
+ for chunk in response:
366
+ if chunk.choices[0].delta.content:
367
+ content = chunk.choices[0].delta.content
368
+ full_response += content
369
+ yield content
370
+
371
+ return full_response
372
+ except Exception as e:
373
+ st.error(f"❌ Groq streaming failed: {e}")
374
+ return ""
375
+
376
+ def _stream_ollama_response(self, prompt: str, max_tokens: int, temperature: float):
377
+ """Stream response from Ollama"""
378
+ try:
379
+ response = self.llm_client['ollama'].chat(
380
+ model='llama3.2',
381
+ messages=[{'role': 'user', 'content': prompt}],
382
+ stream=True
383
+ )
384
+
385
+ full_response = ""
386
+ for chunk in response:
387
+ if 'message' in chunk and 'content' in chunk['message']:
388
+ content = chunk['message']['content']
389
+ full_response += content
390
+ yield content
391
+
392
+ return full_response
393
+ except Exception as e:
394
+ st.error(f"❌ Ollama streaming failed: {e}")
395
+ return ""
396
+
397
+ def create_rag_prompt(self, query: str, context: List[Dict[str, Any]], pivot_analysis: str) -> str:
398
+ """Create optimized RAG prompt"""
399
+
400
+ # Format context
401
+ context_text = "\n\n".join([
402
+ f"Document {i+1}:\n{doc['content']}\nRelevance Score: {doc['score']:.3f}"
403
+ for i, doc in enumerate(context)
404
+ ])
405
+
406
+ prompt = f"""
407
+ You are DigiTwin, an expert FPSO (Floating Production Storage and Offloading) notifications analyst.
408
+
409
+ **Context Information:**
410
+ {context_text}
411
+
412
+ **Current Dataset Analysis:**
413
+ {pivot_analysis}
414
+
415
+ **Important Definitions:**
416
+ - NI = Notification of Integrity (maintenance and safety notifications)
417
+ - NC = Notification of Conformity (compliance and regulatory notifications)
418
+ - FPSO Units: GIR, DAL, PAZ, CLV
419
+
420
+ **User Query:** {query}
421
+
422
+ Please provide a comprehensive, accurate response based on the context and dataset analysis.
423
+ Include specific details about FPSO units, notification types, and relevant insights.
424
+ If the context doesn't contain enough information, say so clearly.
425
+
426
+ **Response:**"""
427
+
428
+ return prompt
429
+
430
+ def process_query(self, query: str, stream: bool = True) -> str:
431
+ """Process user query through the complete RAG pipeline"""
432
+
433
+ # Step 1: Query rewriting
434
+ rewritten_query = self.query_rewriter(query)
435
+
436
+ # Step 2: Hybrid search
437
+ search_results = self.hybrid_search(rewritten_query, k=5)
438
+
439
+ # Step 3: Load data for pivot analysis
440
+ df = self.load_notifications_data()
441
+ pivot_analysis = self.generate_pivot_analysis(df) if not df.empty else "No data available"
442
+
443
+ # Step 4: Create RAG prompt
444
+ rag_prompt = self.create_rag_prompt(query, search_results, pivot_analysis)
445
+
446
+ # Step 5: Generate response
447
+ if stream:
448
+ return self.generate_response(rag_prompt, max_tokens=800, temperature=0.7, stream=True)
449
+ else:
450
+ return self.generate_response(rag_prompt, max_tokens=800, temperature=0.7, stream=False)
451
+
452
+ def initialize_rag_system():
453
+ """Initialize the RAG system"""
454
+ with st.spinner("Initializing RAG system..."):
455
+ rag = DigiTwinRAG()
456
+ return rag
457
+
458
+ def render_chat_interface(rag: DigiTwinRAG):
459
+ """Render the chat interface"""
460
+
461
+ # Initialize chat history
462
+ if "messages" not in st.session_state:
463
+ st.session_state.messages = []
464
+
465
+ # Chat header
466
+ st.markdown("### 🤖 DigiTwin RAG Assistant")
467
+ st.markdown("Ask me anything about your FPSO notifications data!")
468
+
469
+ # Display chat messages
470
+ for message in st.session_state.messages:
471
+ with st.chat_message(message["role"], avatar=message.get("avatar")):
472
+ st.markdown(message["content"])
473
+
474
+ # Chat input
475
+ if prompt := st.chat_input("Ask about your notifications data..."):
476
+ # Add user message
477
+ st.session_state.messages.append({
478
+ "role": "user",
479
+ "content": prompt,
480
+ "avatar": "👤"
481
+ })
482
+
483
+ # Display user message
484
+ with st.chat_message("user", avatar="👤"):
485
+ st.markdown(prompt)
486
+
487
+ # Generate and display assistant response
488
+ with st.chat_message("assistant", avatar="🤖"):
489
+ message_placeholder = st.empty()
490
+
491
+ try:
492
+ # Process query with streaming
493
+ full_response = ""
494
+ for chunk in rag.process_query(prompt, stream=True):
495
+ full_response += chunk
496
+ message_placeholder.markdown(full_response + "▌")
497
+
498
+ message_placeholder.markdown(full_response)
499
+
500
+ # Add assistant message to history
501
+ st.session_state.messages.append({
502
+ "role": "assistant",
503
+ "content": full_response,
504
+ "avatar": "🤖"
505
+ })
506
+
507
+ except Exception as e:
508
+ error_msg = f"❌ Error processing query: {str(e)}"
509
+ message_placeholder.markdown(error_msg)
510
+ st.session_state.messages.append({
511
+ "role": "assistant",
512
+ "content": error_msg,
513
+ "avatar": "🤖"
514
+ })
515
+
516
+ # Sidebar controls
517
+ with st.sidebar:
518
+ st.markdown("### 🔧 RAG Controls")
519
+
520
+ # Clear chat
521
+ if st.button("🗑️ Clear Chat"):
522
+ st.session_state.messages = []
523
+ st.rerun()
524
+
525
+ # Rebuild vector index
526
+ if st.button("🔄 Rebuild Vector Index"):
527
+ with st.spinner("Rebuilding vector index..."):
528
+ df = rag.load_notifications_data()
529
+ if not df.empty:
530
+ documents = rag.create_document_chunks(df)
531
+ embeddings = rag.create_embeddings(documents)
532
+ rag.index_documents(documents, embeddings)
533
+ st.success("✅ Vector index rebuilt!")
534
+ else:
535
+ st.error("❌ No data available for indexing")
536
+
537
+ def main():
538
+ """Main function to run the RAG chatbot"""
539
+ st.set_page_config(page_title="DigiTwin RAG Assistant", layout="wide")
540
+
541
+ # Initialize RAG system
542
+ rag = initialize_rag_system()
543
+
544
+ # Render chat interface
545
+ render_chat_interface(rag)
546
+
547
+ if __name__ == "__main__":
548
+ main()
src/setup_rag.py ADDED
@@ -0,0 +1,185 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ DigiTwin RAG Setup Script
4
+ Helps users install and configure the RAG system dependencies
5
+ """
6
+
7
+ import subprocess
8
+ import sys
9
+ import os
10
+ from pathlib import Path
11
+
12
+ def run_command(command, description):
13
+ """Run a command and handle errors"""
14
+ print(f"🔄 {description}...")
15
+ try:
16
+ result = subprocess.run(command, shell=True, check=True, capture_output=True, text=True)
17
+ print(f"✅ {description} completed successfully")
18
+ return True
19
+ except subprocess.CalledProcessError as e:
20
+ print(f"❌ {description} failed: {e}")
21
+ print(f"Error output: {e.stderr}")
22
+ return False
23
+
24
+ def check_python_version():
25
+ """Check if Python version is compatible"""
26
+ version = sys.version_info
27
+ if version.major < 3 or (version.major == 3 and version.minor < 8):
28
+ print("❌ Python 3.8 or higher is required")
29
+ return False
30
+ print(f"✅ Python {version.major}.{version.minor}.{version.micro} is compatible")
31
+ return True
32
+
33
+ def install_dependencies():
34
+ """Install RAG dependencies"""
35
+ print("🚀 Installing DigiTwin RAG Dependencies")
36
+ print("=" * 50)
37
+
38
+ # Check Python version
39
+ if not check_python_version():
40
+ return False
41
+
42
+ # Install core dependencies
43
+ dependencies = [
44
+ ("sentence-transformers", "Sentence Transformers for embeddings"),
45
+ ("faiss-cpu", "FAISS vector database"),
46
+ ("weaviate-client", "Weaviate vector database client"),
47
+ ("groq", "Groq LLM API client"),
48
+ ("ollama", "Ollama local LLM client"),
49
+ ("numpy", "Numerical computing"),
50
+ ("pandas", "Data manipulation"),
51
+ ("streamlit", "Web application framework")
52
+ ]
53
+
54
+ success_count = 0
55
+ for package, description in dependencies:
56
+ if run_command(f"pip install {package}", f"Installing {description}"):
57
+ success_count += 1
58
+
59
+ print(f"\n📊 Installation Summary: {success_count}/{len(dependencies)} packages installed successfully")
60
+ return success_count == len(dependencies)
61
+
62
+ def setup_environment():
63
+ """Setup environment variables and configuration"""
64
+ print("\n🔧 Setting up environment...")
65
+
66
+ # Create .env file template
67
+ env_content = """# DigiTwin RAG Environment Configuration
68
+
69
+ # Groq API Configuration
70
+ # Get your API key from: https://console.groq.com/
71
+ GROQ_API_KEY=your_groq_api_key_here
72
+
73
+ # Ollama Configuration (optional)
74
+ # Install Ollama from: https://ollama.ai/
75
+ OLLAMA_HOST=http://localhost:11434
76
+
77
+ # Vector Database Configuration
78
+ # Weaviate (optional) - Install with: docker run -d -p 8080:8080 semitechnologies/weaviate:1.22.4
79
+ WEAVIATE_URL=http://localhost:8080
80
+
81
+ # Embedding Model Configuration
82
+ EMBEDDING_MODEL=all-MiniLM-L6-v2
83
+ """
84
+
85
+ env_file = Path(".env")
86
+ if not env_file.exists():
87
+ with open(env_file, "w") as f:
88
+ f.write(env_content)
89
+ print("✅ Created .env file template")
90
+ print("📝 Please edit .env file with your API keys")
91
+ else:
92
+ print("ℹ️ .env file already exists")
93
+
94
+ def create_directories():
95
+ """Create necessary directories"""
96
+ print("\n📁 Creating directories...")
97
+
98
+ directories = [
99
+ "vector_store",
100
+ "logs",
101
+ "models"
102
+ ]
103
+
104
+ for directory in directories:
105
+ Path(directory).mkdir(exist_ok=True)
106
+ print(f"✅ Created directory: {directory}")
107
+
108
+ def test_installation():
109
+ """Test the RAG installation"""
110
+ print("\n🧪 Testing RAG installation...")
111
+
112
+ test_script = """
113
+ import sys
114
+ import importlib
115
+
116
+ # Test imports
117
+ modules_to_test = [
118
+ 'sentence_transformers',
119
+ 'faiss',
120
+ 'weaviate',
121
+ 'groq',
122
+ 'ollama',
123
+ 'numpy',
124
+ 'pandas',
125
+ 'streamlit'
126
+ ]
127
+
128
+ print("Testing module imports...")
129
+ for module in modules_to_test:
130
+ try:
131
+ importlib.import_module(module)
132
+ print(f"✅ {module}")
133
+ except ImportError as e:
134
+ print(f"❌ {module}: {e}")
135
+
136
+ print("\\nTesting embedding model...")
137
+ try:
138
+ from sentence_transformers import SentenceTransformer
139
+ model = SentenceTransformer('all-MiniLM-L6-v2')
140
+ test_embedding = model.encode(['test sentence'])
141
+ print(f"✅ Embedding model working (shape: {test_embedding.shape})")
142
+ except Exception as e:
143
+ print(f"❌ Embedding model failed: {e}")
144
+
145
+ print("\\nRAG system test completed!")
146
+ """
147
+
148
+ with open("test_rag.py", "w") as f:
149
+ f.write(test_script)
150
+
151
+ if run_command("python test_rag.py", "Running RAG system test"):
152
+ print("✅ RAG system test passed!")
153
+ os.remove("test_rag.py")
154
+ else:
155
+ print("❌ RAG system test failed. Please check the errors above.")
156
+
157
+ def main():
158
+ """Main setup function"""
159
+ print("🤖 DigiTwin RAG Setup")
160
+ print("=" * 50)
161
+
162
+ # Install dependencies
163
+ if not install_dependencies():
164
+ print("\n❌ Some dependencies failed to install. Please check the errors above.")
165
+ return
166
+
167
+ # Setup environment
168
+ setup_environment()
169
+
170
+ # Create directories
171
+ create_directories()
172
+
173
+ # Test installation
174
+ test_installation()
175
+
176
+ print("\n🎉 Setup completed!")
177
+ print("\n📋 Next steps:")
178
+ print("1. Edit .env file with your API keys")
179
+ print("2. Install Ollama (optional): https://ollama.ai/")
180
+ print("3. Start Weaviate (optional): docker run -d -p 8080:8080 semitechnologies/weaviate:1.22.4")
181
+ print("4. Run the application: streamlit run notifs.py")
182
+ print("\n🚀 Happy coding with DigiTwin RAG!")
183
+
184
+ if __name__ == "__main__":
185
+ main()
src/utils.py ADDED
@@ -0,0 +1,222 @@
1
+ """
2
+ Utilities module for DigiTwin Analytics
3
+ Contains common functions, decorators, and data processing utilities
4
+ """
5
+
6
+ import logging
7
+ import pandas as pd
8
+ from functools import wraps
9
+ from PyPDF2 import PdfReader
10
+ from langchain_community.vectorstores import FAISS
11
+ from langchain_community.embeddings import HuggingFaceEmbeddings
12
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
13
+ from langchain.schema import Document as LCDocument
14
+ import streamlit as st
15
+ from config import (
16
+ NI_keywords, NC_keywords, module_keywords, rack_keywords,
17
+ living_quarters_keywords, flare_keywords, fwd_keywords, hexagons_keywords,
18
+ NI_keyword_map, NC_keyword_map
19
+ )
20
+
21
+ import matplotlib.patches as patches
22
+ import math
23
+ import matplotlib.transforms as transforms
24
+
25
+ # PAZ-specific keywords for data processing
26
+ paz_module_keywords = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8']
27
+ paz_rack_keywords = ['R1', 'R2', 'R3', 'R4', 'R5', 'R6']
28
+
29
+ # PAZ keyword mapping for preprocessing
30
+ paz_keyword_map = {
31
+ 'P1': 'P1', 'P2': 'P2', 'P3': 'P3', 'P4': 'P4', 'P5': 'P5', 'P6': 'P6', 'P7': 'P7', 'P8': 'P8',
32
+ 'S1': 'S1', 'S2': 'S2', 'S3': 'S3', 'S4': 'S4', 'S5': 'S5', 'S6': 'S6', 'S7': 'S7', 'S8': 'S8',
33
+ 'R1': 'R1', 'R2': 'R2', 'R3': 'R3', 'R4': 'R4', 'R5': 'R5', 'R6': 'R6'
34
+ }
35
+
36
+ # Setup logging
37
+ logging.basicConfig(level=logging.INFO)
38
+ logger = logging.getLogger(__name__)
39
+
40
+ # --- DECORATORS ---
41
+ def log_execution(func):
42
+ """Decorator to log function execution for debugging"""
43
+ @wraps(func)
44
+ def wrapper(*args, **kwargs):
45
+ logger.info(f"Executing {func.__name__} with args: {args}, kwargs: {kwargs}")
46
+ try:
47
+ result = func(*args, **kwargs)
48
+ logger.info(f"{func.__name__} executed successfully")
49
+ return result
50
+ except Exception as e:
51
+ logger.error(f"Error in {func.__name__}: {str(e)}")
52
+ raise
53
+ return wrapper
54
+
55
+ # --- DATA PROCESSING FUNCTIONS ---
56
+ @log_execution
57
+ def parse_pdf(file):
58
+ """Parse PDF file and extract text content"""
59
+ reader = PdfReader(file)
60
+ return "\n".join([page.extract_text() for page in reader.pages if page.extract_text()])
61
+
62
+ @st.cache_resource
63
+ def build_faiss_vectorstore(_docs):
64
+ """Build FAISS vectorstore from documents with caching"""
65
+ embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
66
+ splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
67
+ chunks = []
68
+ for i, doc in enumerate(_docs):
69
+ for chunk in splitter.split_text(doc.page_content):
70
+ chunks.append(LCDocument(page_content=chunk, metadata={"source": f"doc_{i}"}))
71
+ return FAISS.from_documents(chunks, embeddings)
72
+
73
+ @log_execution
74
+ def preprocess_keywords(description):
75
+ """Preprocess description text for keyword extraction"""
76
+ description = str(description).upper()
77
+ for lq_variant in living_quarters_keywords:
78
+ if lq_variant != 'LQ':
79
+ description = description.replace(lq_variant, 'LQ')
80
+
81
+ # Handle CLV module keywords
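+ # e.g. a bare module number such as '110' in the description is rewritten to its
+ # full code (assumed to look like 'M110'; the actual codes come from config.module_keywords)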
82
+ for module in module_keywords:
83
+ number = module[1:]
84
+ if number in description:
85
+ description = description.replace(number, module)
86
+
87
+ # PAZ module and rack keywords (e.g. 'P1', 'S3', 'R2') already appear literally
88
+ # in the uppercased description, so unlike the CLV module numbers above they
89
+ # need no substitution before keyword extraction.
96
+
97
+ for original, grouped in {**NI_keyword_map, **NC_keyword_map}.items():
98
+ description = description.replace(original, grouped)
99
+ return description
100
+
101
+ @log_execution
102
+ def extract_ni_nc_keywords(row, notif_type_col, desc_col):
103
+ """Extract NI/NC keywords from notification row"""
104
+ description = preprocess_keywords(row[desc_col])
105
+ notif_type = row[notif_type_col]
106
+ keywords = [kw for kw in (NI_keywords if notif_type == 'NI' else NC_keywords) if kw in description]
107
+ return ', '.join(keywords) if keywords else 'None'
108
+
109
+ @log_execution
110
+ def extract_location_keywords(row, desc_col, keyword_list):
111
+ """Extract location keywords from notification row"""
112
+ description = preprocess_keywords(row[desc_col])
113
+ if keyword_list == living_quarters_keywords:
114
+ return 'LQ' if any(kw in description for kw in living_quarters_keywords) else 'None'
115
+ locations = [kw for kw in keyword_list if kw in description]
116
+ return ', '.join(locations) if locations else 'None'
117
+
118
+ @log_execution
119
+ def create_pivot_table(df, index, columns, aggfunc='size', fill_value=0):
120
+ """Create pivot table from dataframe"""
121
+ df_exploded = df.assign(Keywords=df[columns].str.split(', ')).explode('Keywords')
122
+ df_exploded = df_exploded[df_exploded['Keywords'] != 'None']
123
+ pivot = pd.pivot_table(df_exploded, index=index, columns='Keywords', aggfunc=aggfunc, fill_value=fill_value)
124
+ return pivot
125
+
126
+ @log_execution
127
+ def apply_fpso_colors(df):
128
+ """Apply color styling to FPSO dataframe"""
129
+ styles = pd.DataFrame('', index=df.index, columns=df.columns)
130
+ color_map = {'GIR': '#FFA07A', 'DAL': '#ADD8E6', 'PAZ': '#D8BFD8', 'CLV': '#90EE90'}
131
+ for fpso, color in color_map.items():
132
+ if fpso in df.index:
133
+ styles.loc[fpso] = f'background-color: {color}'
134
+ return styles
135
+
136
+ @log_execution
137
+ def process_uploaded_files(files):
138
+ """Process uploaded files and return PDF documents and Excel dataframe"""
139
+ pdf_files = [f for f in files if f.type == "application/pdf"]
140
+ excel_files = [f for f in files if f.type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"]
141
+
142
+ # Process PDF files
143
+ parsed_docs = []
144
+ if pdf_files:
145
+ parsed_docs = [LCDocument(page_content=parse_pdf(f), metadata={"name": f.name}) for f in pdf_files]
146
+ st.sidebar.success(f"{len(parsed_docs)} PDF reports indexed.")
147
+
148
+ # Process Excel files
149
+ df = None
150
+ if excel_files:
151
+ try:
152
+ # Use the first Excel file if multiple are uploaded
153
+ uploaded_xlsx = excel_files[0]
154
+ df = pd.read_excel(uploaded_xlsx, sheet_name='Global Notifications')
155
+ df.columns = df.columns.str.strip()
156
+ expected_columns = {
157
+ 'Notifictn type': 'Notifictn type',
158
+ 'Created on': 'Created on',
159
+ 'Description': 'Description',
160
+ 'FPSO': 'FPSO'
161
+ }
162
+ missing_columns = [col for col in expected_columns.values() if col not in df.columns]
163
+ if missing_columns:
164
+ st.error(f"Missing columns: {missing_columns}")
165
+ return parsed_docs, None
166
+
167
+ df = df[list(expected_columns.values())]
168
+ df.columns = list(expected_columns.keys())
169
+ df = df[df['FPSO'].isin(['GIR', 'DAL', 'PAZ', 'CLV'])]
170
+ df['Extracted_Keywords'] = df.apply(extract_ni_nc_keywords, axis=1, args=('Notifictn type', 'Description'))
171
+ for loc_type, keywords in [
172
+ ('Modules', module_keywords + paz_module_keywords), ('Racks', rack_keywords + paz_rack_keywords), ('LivingQuarters', living_quarters_keywords),
173
+ ('Flare', flare_keywords), ('FWD', fwd_keywords), ('HeliDeck', hexagons_keywords)
174
+ ]:
175
+ df[f'Extracted_{loc_type}'] = df.apply(extract_location_keywords, axis=1, args=('Description', keywords))
176
+ st.sidebar.success("Excel file processed successfully.")
177
+ except Exception as e:
178
+ st.error(f"Error processing Excel: {e}")
179
+ return parsed_docs, None
180
+
181
+ return parsed_docs, df
182
+
183
+ def add_rectangle(ax, xy, width, height, **kwargs):
184
+ rectangle = patches.Rectangle(xy, width, height, **kwargs)
185
+ ax.add_patch(rectangle)
186
+
187
+ def add_chamfered_rectangle(ax, xy, width, height, chamfer, **kwargs):
188
+ x, y = xy
189
+ coords = [
190
+ (x + chamfer, y),
191
+ (x + width - chamfer, y),
192
+ (x + width, y + chamfer),
193
+ (x + width, y + height - chamfer),
194
+ (x + width - chamfer, y + height),
195
+ (x + chamfer, y + height),
196
+ (x, y + height - chamfer),
197
+ (x, y + chamfer)
198
+ ]
199
+ polygon = patches.Polygon(coords, closed=True, **kwargs)
200
+ ax.add_patch(polygon)
201
+
202
+ def add_hexagon(ax, xy, radius, **kwargs):
203
+ x, y = xy
204
+ vertices = [(x + radius * math.cos(2 * math.pi * n / 6), y + radius * math.sin(2 * math.pi * n / 6)) for n in range(6)]
205
+ hexagon = patches.Polygon(vertices, closed=True, **kwargs)
206
+ ax.add_patch(hexagon)
207
+
208
+ def add_fwd(ax, xy, width, height, **kwargs):
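+ # Builds the bow as a trapezoid, rotates it 90 degrees about the origin and translates
+ # it to (x, y); the 'FWD' label is placed with the same rotated transform.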
209
+ x, y = xy
210
+ top_width = width * 0.80
211
+ coords = [
212
+ (0, 0),
213
+ (width, 0),
214
+ (width - (width - top_width) / 2, height),
215
+ ((width - top_width) / 2, height)
216
+ ]
217
+ trapezoid = patches.Polygon(coords, closed=True, **kwargs)
218
+ t = transforms.Affine2D().rotate_deg(90).translate(x, y)
219
+ trapezoid.set_transform(t + ax.transData)
220
+ ax.add_patch(trapezoid)
221
+ text_t = transforms.Affine2D().rotate_deg(90).translate(x + height / 2, y + width / 2)
222
+ ax.text(0, -1, "FWD", ha='center', va='center', fontsize=7, weight='bold', transform=text_t + ax.transData)