Remostart committed on
Commit
2891d1d
·
verified ·
1 Parent(s): bc078c8

milestone one and two AI BRAIN AND MULTILINGUA commit

Browse files
Files changed (40) hide show
  1. .dockerignore +0 -0
  2. .dockerigore +0 -0
  3. .gitattributes +4 -0
  4. Dockerfile +57 -0
  5. app/__init__.py +0 -0
  6. app/__pycache__/__init__.cpython-311.pyc +0 -0
  7. app/__pycache__/__init__.cpython-312.pyc +0 -0
  8. app/__pycache__/main.cpython-311.pyc +0 -0
  9. app/__pycache__/main.cpython-312.pyc +0 -0
  10. app/agents/__init__.py +0 -0
  11. app/agents/__pycache__/__init__.cpython-311.pyc +0 -0
  12. app/agents/__pycache__/__init__.cpython-312.pyc +0 -0
  13. app/agents/__pycache__/crew_pipeline.cpython-311.pyc +0 -0
  14. app/agents/__pycache__/crew_pipeline.cpython-312.pyc +0 -0
  15. app/agents/crew_pipeline.py +245 -0
  16. app/main.py +86 -0
  17. app/models/__init__.py +0 -0
  18. app/models/intent_classifier_v2.joblib +3 -0
  19. app/tasks/__init__.py +0 -0
  20. app/tasks/__pycache__/__init__.cpython-311.pyc +0 -0
  21. app/tasks/__pycache__/__init__.cpython-312.pyc +0 -0
  22. app/tasks/__pycache__/rag_updater.cpython-311.pyc +0 -0
  23. app/tasks/__pycache__/rag_updater.cpython-312.pyc +0 -0
  24. app/tasks/rag_updater.py +141 -0
  25. app/utils/__init__.py +0 -0
  26. app/utils/__pycache__/__init__.cpython-311.pyc +0 -0
  27. app/utils/__pycache__/__init__.cpython-312.pyc +0 -0
  28. app/utils/__pycache__/config.cpython-311.pyc +0 -0
  29. app/utils/__pycache__/config.cpython-312.pyc +0 -0
  30. app/utils/config.py +54 -0
  31. app/vectorstore/__init__.py +0 -0
  32. app/vectorstore/faiss_index/index.faiss +3 -0
  33. app/vectorstore/faiss_index/index.pkl +3 -0
  34. app/vectorstore/live_rag_index/index.faiss +0 -0
  35. app/vectorstore/live_rag_index/index.pkl +3 -0
  36. app/venv/bin/python +3 -0
  37. app/venv/bin/python3 +3 -0
  38. app/venv/bin/python3.11 +3 -0
  39. app/venv/pyvenv.cfg +5 -0
  40. requirements.txt +21 -0
.dockerignore ADDED
File without changes
.dockerigore ADDED
File without changes
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ app/vectorstore/faiss_index/index.faiss filter=lfs diff=lfs merge=lfs -text
37
+ app/venv/bin/python filter=lfs diff=lfs merge=lfs -text
38
+ app/venv/bin/python3 filter=lfs diff=lfs merge=lfs -text
39
+ app/venv/bin/python3.11 filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
FROM python:3.10-slim

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1

WORKDIR /code

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    git \
    curl \
    libopenblas-dev \
    libomp-dev \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies first so this layer is cached across code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN pip install --no-cache-dir huggingface-hub sentencepiece accelerate fasttext

# All Hugging Face caches share one directory that is filled at build time
ENV HF_HOME=/models/huggingface
ENV TRANSFORMERS_CACHE=/models/huggingface
ENV HUGGINGFACE_HUB_CACHE=/models/huggingface
ENV HF_HUB_CACHE=/models/huggingface

# Create cache dir
RUN mkdir -p /models/huggingface

# Pre-download models at build time (Qwen + SentenceTransformer + FastText + NLLB finetuned)
RUN python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='Qwen/Qwen3-4B-Instruct-2507')" \
    && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')" \
    && python -c "from huggingface_hub import hf_hub_download; hf_hub_download(repo_id='facebook/fasttext-language-identification', filename='model.bin')" \
    && python -c "from huggingface_hub import snapshot_download; snapshot_download(repo_id='drrobot9/nllb-ig-yo-ha-finetuned')"

# Preload tokenizers (avoid runtime delays)
RUN python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('Qwen/Qwen3-4B-Instruct-2507', use_fast=True)" \
    && python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', use_fast=True)" \
    && python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('drrobot9/nllb-ig-yo-ha-finetuned', use_fast=True)"

COPY . .

EXPOSE 7860

# Run the FastAPI app with a single uvicorn worker. Every worker process imports
# app.main, which loads the Qwen, NLLB, embedding and FastText models and starts
# its own RAG update scheduler; a second worker would double the model memory
# and have two schedulers writing the same FAISS index directory.
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860", "--workers", "1"]
app/__init__.py ADDED
File without changes
app/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (166 Bytes). View file
 
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (154 Bytes). View file
 
app/__pycache__/main.cpython-311.pyc ADDED
Binary file (3.31 kB). View file
 
app/__pycache__/main.cpython-312.pyc ADDED
Binary file (3.41 kB). View file
 
app/agents/__init__.py ADDED
File without changes
app/agents/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (173 Bytes). View file
 
app/agents/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (161 Bytes). View file
 
app/agents/__pycache__/crew_pipeline.cpython-311.pyc ADDED
Binary file (8.73 kB). View file
 
app/agents/__pycache__/crew_pipeline.cpython-312.pyc ADDED
Binary file (11 kB). View file
 
app/agents/crew_pipeline.py ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # farmlingua/app/agents/crew_pipeline.pyversion3multilingua
2
+ import os
3
+ import sys
4
+ import requests
5
+ import joblib
6
+ import faiss
7
+ import numpy as np
8
+ import torch
9
+ import fasttext
10
+ from huggingface_hub import hf_hub_download
11
+ from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
12
+ from sentence_transformers import SentenceTransformer
13
+ from app.utils import config
14
+ import re
15
+
16
+
17
# Point every Hugging Face cache variable at one shared directory.
hf_cache = "/models/huggingface"
os.environ["HF_HOME"] = hf_cache
os.environ["TRANSFORMERS_CACHE"] = hf_cache
os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache
os.makedirs(hf_cache, exist_ok=True)

# Put the directory two levels above this file on sys.path.
BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
if BASE_DIR not in sys.path:
    sys.path.insert(0, BASE_DIR)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# The intent classifier is optional: detect_intent() falls back to keyword
# rules when `classifier` is None. Report the failure instead of hiding it so
# a missing or corrupt model file is visible in the logs.
try:
    classifier = joblib.load(config.CLASSIFIER_PATH)
except Exception as exc:
    print(
        f"Warning: intent classifier could not be loaded from "
        f"{config.CLASSIFIER_PATH}: {exc}"
    )
    classifier = None

print(f"Loading Qwen expert model ({config.EXPERT_MODEL_NAME})...")
tokenizer = AutoTokenizer.from_pretrained(config.EXPERT_MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    config.EXPERT_MODEL_NAME,
    torch_dtype="auto",
    device_map="auto"
)

# Sentence embedder used to encode queries for vector search.
embedder = SentenceTransformer(config.EMBEDDING_MODEL)

print(f"Loading FastText language identifier ({config.LANG_ID_MODEL_REPO})...")
lang_model_path = hf_hub_download(
    repo_id=config.LANG_ID_MODEL_REPO,
    filename=config.LANG_ID_MODEL_FILE
)
lang_identifier = fasttext.load_model(lang_model_path)
53
+
54
def detect_language(text, top_k=1):
    """Return the FastText language guesses for *text*.

    Newlines are replaced with spaces before prediction. The result is a list
    of ``(label, probability)`` tuples with the ``__label__`` prefix removed
    from each label and the probability converted to ``float``.
    """
    single_line = " ".join(text.split("\n")).strip()
    raw_labels, raw_probs = lang_identifier.predict(single_line, k=top_k)

    guesses = []
    for raw_label, raw_prob in zip(raw_labels, raw_probs):
        guesses.append((raw_label.replace("__label__", ""), float(raw_prob)))
    return guesses
59
+
60
+
61
print(f"Loading NLLB translation model ({config.TRANSLATION_MODEL_NAME})...")
# transformers pipelines take a CUDA device ordinal, or -1 for CPU.
translation_pipeline = pipeline(
    "translation",
    model=config.TRANSLATION_MODEL_NAME,
    device=-1 if DEVICE != "cuda" else 0,
    max_new_tokens=512,
)

# Language codes the pipeline accepts, mapped to their display names.
SUPPORTED_LANGS = dict([
    ("eng_Latn", "English"),
    ("ibo_Latn", "Igbo"),
    ("yor_Latn", "Yoruba"),
    ("hau_Latn", "Hausa"),
    ("swh_Latn", "Swahili"),
    ("amh_Ethi", "Amharic"),
])
77
+
78
+
79
+
80
def chunk_text(text, max_len=400):
    """Split *text* into chunks of whole sentences.

    Sentences are detected by whitespace following ``.``, ``!`` or ``?`` and
    are packed greedily: a sentence joins the current chunk only while the
    joined chunk stays shorter than ``max_len`` characters. A single sentence
    longer than ``max_len`` is kept whole as its own chunk, because sentences
    are never cut.

    Args:
        text: The text to split.
        max_len: Upper bound (exclusive) on the length of a multi-sentence
            chunk.

    Returns:
        A list of non-empty, stripped chunks. Empty or whitespace-only input
        yields an empty list.
    """
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks, current = [], ""

    for sent in sentences:
        if not sent.strip():
            continue
        if not current:
            # Start a chunk directly; appending the (empty) previous chunk
            # here is what used to produce blank entries.
            current = sent
        elif len(current) + 1 + len(sent) < max_len:
            current = f"{current} {sent}"
        else:
            chunks.append(current.strip())
            current = sent

    if current.strip():
        chunks.append(current.strip())
    return chunks
94
+
95
+
96
def translate_text(text, src_lang, tgt_lang):
    """Translate *text* chunk by chunk and join the results with spaces.

    Args:
        text: Text to translate. ``None``, empty or whitespace-only input is
            returned unchanged without calling the model.
        src_lang: Source language code passed to the translation pipeline.
        tgt_lang: Target language code passed to the translation pipeline.

    Returns:
        The translated chunks joined by single spaces.
    """
    if not text or not text.strip():
        return text

    results = []
    for chunk in chunk_text(text):
        # Never send a blank chunk to the model: it has nothing to translate.
        if not chunk.strip():
            continue
        out = translation_pipeline(chunk, src_lang=src_lang, tgt_lang=tgt_lang)
        results.append(out[0]['translation_text'])
    return " ".join(results)
106
+
107
+
108
+
109
def retrieve_docs(query, vs_path):
    """Return the text of the documents nearest to *query*, or ``None``.

    Two on-disk layouts are supported:

    * a directory holding ``index.faiss`` and ``index.pkl``, which is the
      layout written by LangChain's ``FAISS.save_local`` (used by the RAG
      updater for the live index);
    * a single FAISS index file with a sibling ``<path>_meta.npy`` dictionary
      mapping stringified row ids to document text.

    Args:
        query: Query text; it is embedded with the module-level ``embedder``.
        vs_path: ``str`` or ``os.PathLike`` location of the vector store.

    Returns:
        Up to three documents joined by blank lines, or ``None`` when the
        store is missing, unreadable, or has no usable hit.
    """
    import logging
    import pickle

    if not vs_path:
        return None
    vs_path = os.fspath(vs_path)  # config supplies pathlib.Path objects
    if not os.path.exists(vs_path):
        return None

    is_dir = os.path.isdir(vs_path)
    index_file = os.path.join(vs_path, "index.faiss") if is_dir else vs_path
    try:
        index = faiss.read_index(index_file)
    except Exception as exc:
        logging.warning("Could not read FAISS index %s: %s", index_file, exc)
        return None

    query_vec = np.array([embedder.encode(query)], dtype=np.float32)
    _, neighbours = index.search(query_vec, k=3)
    # FAISS pads the result with -1 when fewer than k vectors are available.
    hit_ids = [int(i) for i in neighbours[0] if i >= 0]
    if not hit_ids:
        return None

    docs = []
    store_file = os.path.join(vs_path, "index.pkl")
    meta_path = vs_path + "_meta.npy"
    if is_dir and os.path.exists(store_file):
        # SECURITY: pickle executes arbitrary code on load. Only index files
        # produced by this application may be placed at this path.
        try:
            with open(store_file, "rb") as fh:
                docstore, id_map = pickle.load(fh)
        except Exception as exc:
            logging.warning("Could not read docstore %s: %s", store_file, exc)
            return None
        for row in hit_ids:
            if row not in id_map:
                continue
            # The docstore returns a plain string for unknown ids; only real
            # documents carry page_content.
            found = docstore.search(id_map[row])
            docs.append(getattr(found, "page_content", ""))
    elif os.path.exists(meta_path):
        metadata = np.load(meta_path, allow_pickle=True).item()
        docs = [metadata.get(str(row), "") for row in hit_ids]

    docs = [doc for doc in docs if doc]
    return "\n\n".join(docs) if docs else None
127
+
128
+
129
+
130
def get_weather(state_name):
    """Fetch the current weather for a Nigerian state from WeatherAPI.

    Args:
        state_name: Name of the state; ``", Nigeria"`` is appended to form
            the API query.

    Returns:
        A short multi-line summary (condition, temperature, humidity, wind),
        or an "Unable to retrieve weather" message when the request fails,
        times out, returns a non-200 status, or the payload is malformed.
    """
    # HTTPS keeps the API key in the query string off the wire in clear text.
    url = "https://api.weatherapi.com/v1/current.json"
    params = {
        "key": config.WEATHER_API_KEY,
        "q": f"{state_name}, Nigeria",
        "aqi": "no"
    }
    failure = f"Unable to retrieve weather for {state_name}."

    try:
        # A timeout stops a stalled upstream from hanging the request.
        r = requests.get(url, params=params, timeout=10)
    except requests.RequestException:
        return failure
    if r.status_code != 200:
        return failure

    try:
        current = r.json()["current"]
        return (
            f"Weather in {state_name}:\n"
            f"- Condition: {current['condition']['text']}\n"
            f"- Temperature: {current['temp_c']}°C\n"
            f"- Humidity: {current['humidity']}%\n"
            f"- Wind: {current['wind_kph']} kph"
        )
    except (KeyError, TypeError, ValueError):
        # ValueError covers a body that is not valid JSON.
        return failure
148
+
149
+
150
+
151
def detect_intent(query):
    """Classify an English query into an intent.

    Keyword rules are tried first, then the optional ML classifier.

    Args:
        query: The user query, already translated to English.

    Returns:
        A ``(intent, extra)`` tuple. ``intent`` is ``"weather"``,
        ``"live_update"``, ``"low_confidence"``, a label predicted by the
        classifier, or ``"normal"``. ``extra`` is the matched state name for
        weather queries that mention one, otherwise ``None``.
    """
    q_lower = query.lower()

    if any(keyword in q_lower for keyword in ("weather condition", "forecast")):
        for state in config.STATES:
            # Match whole words only, so "Nigeria" is not taken for the state
            # "Niger" and "volcano" is not taken for "Kano".
            if re.search(rf"\b{re.escape(state.lower())}\b", q_lower):
                return "weather", state
        return "weather", None

    if any(keyword in q_lower for keyword in ("update", "breaking", "news", "current")):
        return "live_update", None

    # `classifier` is None when the model file could not be loaded.
    if hasattr(classifier, "predict") and hasattr(classifier, "predict_proba"):
        predicted_intent = classifier.predict([query])[0]
        confidence = max(classifier.predict_proba([query])[0])
        if confidence < config.CLASSIFIER_CONFIDENCE_THRESHOLD:
            return "low_confidence", None
        return predicted_intent, None

    return "normal", None
171
+
172
+
173
+
174
def run_qwen(messages, max_new_tokens=1000):
    """Generate a reply from the expert model for a chat *messages* list.

    The messages are rendered with the tokenizer's chat template, passed to
    ``model.generate``, and only the newly generated tokens (everything after
    the prompt) are decoded and returned as stripped text.
    """
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    encoded = tokenizer([prompt], return_tensors="pt").to(model.device)

    generation_kwargs = dict(
        max_new_tokens=max_new_tokens,
        temperature=0.4,
        repetition_penalty=1.1,
    )
    sequences = model.generate(**encoded, **generation_kwargs)

    # Drop the echoed prompt tokens from the front of the first sequence.
    prompt_length = len(encoded.input_ids[0])
    completion_ids = sequences[0][prompt_length:].tolist()
    decoded = tokenizer.decode(completion_ids, skip_special_tokens=True)
    return decoded.strip()
190
+
191
+
192
+
193
def run_pipeline(user_query: str):
    """Answer a farmer's question in the language it was asked in.

    Steps: detect the language, translate to English if needed, detect the
    intent, build a prompt (optionally enriched with weather data or
    retrieved documents), run the expert model, and translate the answer
    back to the user's language.

    Args:
        user_query: The raw question from the user.

    Returns:
        A dict with:
        ``detected_language`` - display name of the language used;
        ``confidence`` - FastText probability of the top language guess
        (of the original guess, even when an unsupported language falls
        back to English);
        ``answer`` - the final answer text.
    """
    lang, prob = detect_language(user_query, top_k=1)[0]
    if lang not in SUPPORTED_LANGS:
        lang = "eng_Latn"

    # Translate to English if needed
    translated_query = user_query
    if lang != "eng_Latn":
        translated_query = translate_text(user_query, src_lang=lang, tgt_lang="eng_Latn")

    # Detect intent
    intent, extra = detect_intent(translated_query)

    if intent == "weather" and extra:
        weather_text = get_weather(extra)
        messages = [
            {"role": "system", "content": "You are FarmLingua, an AI assistant for Nigerian farmers."},
            {"role": "user", "content": f"Rewrite this weather update simply:\n{weather_text}"}
        ]
        english_answer = run_qwen(messages, max_new_tokens=256)
    else:
        if intent == "live_update":
            context = retrieve_docs(translated_query, config.LIVE_VS_PATH)
            if context:
                translated_query += f"\n\nLatest agricultural updates:\n{context}"

        if intent == "low_confidence":
            context = retrieve_docs(translated_query, config.STATIC_VS_PATH)
            if context:
                translated_query += f"\n\nReference information:\n{context}"

        messages = [
            {"role": "system", "content": (
                "You are FarmLingua, an AI assistant for Nigerian farmers. "
                "Answer directly without repeating the question. "
                "Use short, clear farmer-friendly English. "
                # Trailing space added: the two sentences used to run together.
                "Avoid scientific jargon, focus on practical farming advice. "
                "When a user ask you who built you or created you say it jackson kelvin a sophisticated AI engineer from Remostart AI company, he built my core brain"
            )},
            {"role": "user", "content": translated_query}
        ]
        english_answer = run_qwen(messages, max_new_tokens=700)

    # Translate back to original language
    if lang != "eng_Latn":
        final_answer = translate_text(english_answer, src_lang="eng_Latn", tgt_lang=lang)
    else:
        final_answer = english_answer

    return {
        "detected_language": SUPPORTED_LANGS.get(lang, "Unknown"),
        # The /ask handler reads this key for its log line.
        "confidence": prob,
        "answer": final_answer
    }
app/main.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # farmlingua_backend/app/main.py
2
+ import os
3
+ import sys
4
+ import logging
5
+ from fastapi import FastAPI, Body
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ import uvicorn
8
+
9
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
10
+ if BASE_DIR not in sys.path:
11
+ sys.path.insert(0, BASE_DIR)
12
+
13
+ from app.tasks.rag_updater import schedule_updates
14
+ from app.utils import config
15
+ from app.agents.crew_pipeline import run_pipeline
16
+
17
+ logging.basicConfig(
18
+ format="%(asctime)s [%(levelname)s] %(message)s",
19
+ level=logging.INFO
20
+ )
21
+
22
+
23
app = FastAPI(
    title="FarmLingua Backend",
    description="Backend service for FarmLingua with RAG updates, multilingual support, and expert AI pipeline",
    version="1.1.0"
)

# Credentialed CORS must not be combined with a wildcard origin: that would
# let any website make authenticated cross-origin calls. Credentials are
# therefore enabled only when config.ALLOWED_ORIGINS lists explicit origins.
_allowed_origins = getattr(config, "ALLOWED_ORIGINS", ["*"])

app.add_middleware(
    CORSMiddleware,
    allow_origins=_allowed_origins,
    allow_credentials="*" not in _allowed_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)
37
+
38
+
39
@app.on_event("startup")
def startup_event():
    """Log the start-up message and start the background RAG update scheduler."""
    root_logger = logging.getLogger()
    root_logger.info("Starting FarmLingua backend...")
    schedule_updates()
43
+
44
+
45
@app.get("/")
def home():
    """Health check endpoint: report status, version and the vector store path."""
    payload = dict(
        status="FarmLingua backend running",
        version="1.1.0",
        vectorstore_path=config.VECTORSTORE_PATH,
    )
    return payload
53
+
54
@app.post("/ask")
def ask_farmbot(query: str = Body(..., embed=True)):
    """
    Ask FarmLingua a farming-related question.
    Supports Hausa, Igbo, Yoruba, and English.
    Automatically detects user language, translates if needed,
    and returns response in the same language.

    The request body is a JSON object with a single ``query`` field. The
    response echoes the query and carries the pipeline's ``answer``.
    """
    logging.info(f"Received query: {query}")
    answer_data = run_pipeline(query)

    detected_lang = answer_data.get("detected_language", "Unknown")
    confidence = answer_data.get("confidence")
    # Compare against None so that a confidence of 0.0 is still logged.
    confidence_note = f" (confidence={confidence:.2f})" if confidence is not None else ""
    logging.info(f"Detected language: {detected_lang}{confidence_note}")

    # The detected language is logged only; it is not part of the response.
    return {
        "query": query,
        "answer": answer_data.get("answer")
    }
78
+
79
+
80
if __name__ == "__main__":
    # Direct execution: serve the app with uvicorn, taking the port and the
    # auto-reload flag from config when they are defined there.
    listen_port = getattr(config, "PORT", 7860)
    auto_reload = bool(getattr(config, "DEBUG", False))
    uvicorn.run("app.main:app", host="0.0.0.0", port=listen_port, reload=auto_reload)
app/models/__init__.py ADDED
File without changes
app/models/intent_classifier_v2.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ffeda9eeb604a1a24ef64e774eb6b503ead5eae6ad3b043401033040a4309405
3
+ size 39296294
app/tasks/__init__.py ADDED
File without changes
app/tasks/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes). View file
 
app/tasks/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (160 Bytes). View file
 
app/tasks/__pycache__/rag_updater.cpython-311.pyc ADDED
Binary file (8.43 kB). View file
 
app/tasks/__pycache__/rag_updater.cpython-312.pyc ADDED
Binary file (7.42 kB). View file
 
app/tasks/rag_updater.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # farmlingua_backend/app/tasks/rag_updater.py
2
+ import os
3
+ import sys
4
+ from datetime import datetime, date
5
+ import logging
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ from apscheduler.schedulers.background import BackgroundScheduler
9
+
10
+ from langchain.vectorstores import FAISS
11
+ from langchain.embeddings import SentenceTransformerEmbeddings
12
+ from langchain.docstore.document import Document
13
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
14
+
15
+ from app.utils import config
16
+
17
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
18
+ if BASE_DIR not in sys.path:
19
+ sys.path.insert(0, BASE_DIR)
20
+
21
+ logging.basicConfig(
22
+ format="%(asctime)s [%(levelname)s] %(message)s",
23
+ level=logging.INFO
24
+ )
25
+
26
+ session = requests.Session()
27
+
28
def fetch_weather_now():
    """Fetch current weather for all configured states.

    Returns:
        A list with one ``Document`` per state whose request succeeded. Each
        document holds a one-line weather summary and metadata with the
        source, the state and a UTC timestamp. A failure for one state is
        logged and does not stop the remaining states.
    """
    # HTTPS keeps the API key in the query string off the wire in clear text.
    url = "https://api.weatherapi.com/v1/current.json"
    docs = []
    for state in config.STATES:
        try:
            params = {
                "key": config.WEATHER_API_KEY,
                "q": f"{state}, Nigeria",
                "aqi": "no"
            }
            res = session.get(url, params=params, timeout=10)
            res.raise_for_status()
            data = res.json()

            if "current" in data:
                condition = data['current']['condition']['text']
                temp_c = data['current']['temp_c']
                humidity = data['current']['humidity']
                text = (
                    f"Weather in {state}: {condition}, "
                    f"Temperature: {temp_c}°C, Humidity: {humidity}%"
                )
                docs.append(Document(
                    page_content=text,
                    metadata={
                        "source": "WeatherAPI",
                        "location": state,
                        "timestamp": datetime.utcnow().isoformat()
                    }
                ))
        except Exception as e:
            logging.error(f"Weather fetch failed for {state}: {e}")
    return docs
62
+
63
def fetch_harvestplus_articles():
    """Fetch every article listed on the configured "harvestplus" news page.

    All ``<article>`` elements with more than 100 characters of text are
    kept; no date filtering is applied.

    Returns:
        A list of ``Document`` objects tagged with source ``"HarvestPlus"``
        and a UTC timestamp, or an empty list when the page cannot be
        fetched or parsed (the error is logged).
    """
    try:
        res = session.get(config.DATA_SOURCES["harvestplus"], timeout=10)
        res.raise_for_status()
        soup = BeautifulSoup(res.text, "html.parser")

        docs = []
        for article in soup.find_all("article"):
            # A space separator keeps words from adjacent tags (headline,
            # date, summary) from being glued together.
            content = article.get_text(" ", strip=True)
            if content and len(content) > 100:
                docs.append(Document(
                    page_content=content,
                    metadata={
                        "source": "HarvestPlus",
                        "timestamp": datetime.utcnow().isoformat()
                    }
                ))
        return docs
    except Exception as e:
        logging.error(f"HarvestPlus fetch failed: {e}")
        return []
90
+
91
def build_rag_vectorstore(reset=False):
    """Fetch fresh documents and write them to the live FAISS vector store.

    Args:
        reset: When true, delete the files in the existing store and build a
            new index from the fetched documents only. When false, append
            the new chunks to the existing index if one can be loaded,
            otherwise build a new one.

    Returns:
        None. The store is saved under ``config.LIVE_VS_PATH``; nothing is
        written when no documents were fetched.
    """
    job_type = "FULL REBUILD" if reset else "INCREMENTAL UPDATE"
    logging.info(f"RAG update started — {job_type}")

    all_docs = fetch_weather_now() + fetch_harvestplus_articles()

    weather_count = sum(1 for d in all_docs if d.metadata['source'] == 'WeatherAPI')
    news_count = sum(1 for d in all_docs if d.metadata['source'] == 'HarvestPlus')
    logging.info(f"Weather docs fetched: {weather_count}")
    logging.info(f"News docs fetched: {news_count}")

    if not all_docs:
        logging.warning("No documents fetched, skipping update")
        return

    splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=64)
    chunks = splitter.split_documents(all_docs)

    embedder = SentenceTransformerEmbeddings(model_name=config.EMBEDDING_MODEL)

    vectorstore_path = config.LIVE_VS_PATH
    index_file = os.path.join(vectorstore_path, "index.faiss")

    if reset and os.path.exists(vectorstore_path):
        for file in os.listdir(vectorstore_path):
            file_path = os.path.join(vectorstore_path, file)
            try:
                os.remove(file_path)
                logging.info(f"Deleted old file: {file_path}")
            except Exception as e:
                logging.error(f"Failed to delete {file_path}: {e}")

    vs = None
    # Test for the index file, not just the directory: an empty directory or
    # one left half-cleaned by a failed reset has nothing to load.
    if not reset and os.path.exists(index_file):
        try:
            vs = FAISS.load_local(
                vectorstore_path,
                embedder,
                allow_dangerous_deserialization=True
            )
        except Exception as e:
            logging.error(f"Could not load existing vectorstore, rebuilding: {e}")

    if vs is None:
        vs = FAISS.from_documents(chunks, embedder)
    else:
        vs.add_documents(chunks)

    os.makedirs(vectorstore_path, exist_ok=True)
    vs.save_local(vectorstore_path)

    logging.info(f"Vectorstore updated at {vectorstore_path}")
134
+
135
def schedule_updates():
    """Start the background scheduler for the live RAG index.

    Two interval jobs are registered: an incremental update every 12 hours
    and a full rebuild every 7 days.

    Returns:
        The started ``BackgroundScheduler``.
    """
    # One worker thread serialises the jobs. Both intervals come due at the
    # same moment every 7 days, and the rebuild deletes the index files that
    # the incremental job loads and rewrites.
    scheduler = BackgroundScheduler(
        executors={"default": {"type": "threadpool", "max_workers": 1}}
    )
    scheduler.add_job(build_rag_vectorstore, 'interval', hours=12, kwargs={"reset": False})
    scheduler.add_job(build_rag_vectorstore, 'interval', days=7, kwargs={"reset": True})
    scheduler.start()
    logging.info("Scheduler started — 12-hour incremental updates + weekly full rebuild")
    return scheduler
app/utils/__init__.py ADDED
File without changes
app/utils/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (172 Bytes). View file
 
app/utils/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (160 Bytes). View file
 
app/utils/__pycache__/config.cpython-311.pyc ADDED
Binary file (1.85 kB). View file
 
app/utils/__pycache__/config.cpython-312.pyc ADDED
Binary file (2.33 kB). View file
 
app/utils/config.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ # farmlingua_backend/app/utils/config.py
4
+ from pathlib import Path
5
+ import os
6
+ import sys
7
+
8
+
9
# Directory two levels above this file.
BASE_DIR = Path(__file__).resolve().parents[2]

if str(BASE_DIR) not in sys.path:
    sys.path.insert(0, str(BASE_DIR))

# Embedding model and vector store locations.
EMBEDDING_MODEL = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
STATIC_VS_PATH = BASE_DIR / "app" / "vectorstore" / "faiss_index"
LIVE_VS_PATH = BASE_DIR / "app" / "vectorstore" / "live_rag_index"

VECTORSTORE_PATH = LIVE_VS_PATH

# SECURITY: the key must come from the environment (e.g. a deployment secret).
# A key used to be hard-coded here as the default; it is exposed in version
# control history and should be revoked. With no key set, weather lookups
# fail with an API error instead of using a leaked credential.
WEATHER_API_KEY = os.getenv("WEATHER_API_KEY", "")

# Intent classifier artefact and the minimum probability for trusting it.
CLASSIFIER_PATH = BASE_DIR / "app" / "models" / "intent_classifier_v2.joblib"
CLASSIFIER_CONFIDENCE_THRESHOLD = float(os.getenv("CLASSIFIER_CONFIDENCE_THRESHOLD", "0.6"))

# Model identifiers, each overridable through an environment variable.
EXPERT_MODEL_NAME = os.getenv("EXPERT_MODEL_NAME", "Qwen/Qwen3-4B-Instruct-2507")
#FORMATTER_MODEL_NAME = os.getenv("FORMATTER_MODEL_NAME", "google/flan-t5-large")

LANG_ID_MODEL_REPO = os.getenv("LANG_ID_MODEL_REPO", "facebook/fasttext-language-identification")
LANG_ID_MODEL_FILE = os.getenv("LANG_ID_MODEL_FILE", "model.bin")

TRANSLATION_MODEL_NAME = os.getenv("TRANSLATION_MODEL_NAME", "drrobot9/nllb-ig-yo-ha-finetuned")

# Pages scraped by the RAG updater.
DATA_SOURCES = {
    "harvestplus": "https://agronigeria.ng/category/news/",
}

# Locations used for weather lookups and for matching queries.
STATES = [
    "Abuja", "Lagos", "Kano", "Kaduna", "Rivers", "Enugu", "Anambra", "Ogun",
    "Oyo", "Delta", "Edo", "Katsina", "Borno", "Benue", "Niger", "Plateau",
    "Bauchi", "Adamawa", "Cross River", "Akwa Ibom", "Ekiti", "Osun", "Ondo",
    "Imo", "Abia", "Ebonyi", "Taraba", "Kebbi", "Zamfara", "Yobe", "Gombe",
    "Sokoto", "Kogi", "Bayelsa", "Nasarawa", "Jigawa"
]

# Point every Hugging Face cache variable at one shared directory.
hf_cache = "/models/huggingface"
os.environ["HF_HOME"] = hf_cache
os.environ["TRANSFORMERS_CACHE"] = hf_cache
os.environ["HUGGINGFACE_HUB_CACHE"] = hf_cache
os.makedirs(hf_cache, exist_ok=True)
app/vectorstore/__init__.py ADDED
File without changes
app/vectorstore/faiss_index/index.faiss ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4faefcc68ae5a575b18f559e04cd2c68e166a73c4c89c9550e1794ccbf90695
3
+ size 19648557
app/vectorstore/faiss_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1c75f31eab757e90e9c9771b62368c2de5dc11ed776629521fb007d8d47b84a
3
+ size 5863908
app/vectorstore/live_rag_index/index.faiss ADDED
Binary file (70.7 kB). View file
 
app/vectorstore/live_rag_index/index.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:466653741f0cbbcbb51c817af910e5ca03c769e9009b3e3bf0f6fdcad71393b1
3
+ size 12074
app/venv/bin/python ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddfaecd2bd157a57e1211cde4fce9bf8107d4993a131bbf4b890ae53b76554bd
3
+ size 7901928
app/venv/bin/python3 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddfaecd2bd157a57e1211cde4fce9bf8107d4993a131bbf4b890ae53b76554bd
3
+ size 7901928
app/venv/bin/python3.11 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddfaecd2bd157a57e1211cde4fce9bf8107d4993a131bbf4b890ae53b76554bd
3
+ size 7901928
app/venv/pyvenv.cfg ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ home = /usr/bin
2
+ include-system-site-packages = false
3
+ version = 3.11.13
4
+ executable = /usr/bin/python3.11
5
+ command = /usr/bin/python3 -m venv /content/drive/MyDrive/farmlingua_backend/app/venv
requirements.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ crewai
2
+ langchain
3
+ langchain-community
4
+ faiss-cpu
5
+ transformers
6
+ sentence-transformers
7
+ pydantic
8
+ joblib
9
+ pyyaml
10
+ torch
11
+ fastapi
12
+ uvicorn
13
+ apscheduler
14
+ numpy<2
15
+ requests
16
+ beautifulsoup4
17
+ huggingface-hub
18
+ python-dotenv
19
+ blobfile
20
+ sentencepiece
21
+ fasttext