Spaces:
Runtime error
Runtime error
Commit
·
a018b00
1
Parent(s):
6e651a2
Fresh start
Browse files
- app.py +0 -35
- config.py +0 -14
- utils/data_loader.py +0 -42
- utils/logger.py +0 -11
- utils/vector_utils.py +0 -52
- vector_db/metadata.json +0 -0
app.py
DELETED
|
# app.py
#
# Flask entry point for the vector-database builder Space.

import os
from flask import Flask, jsonify
from utils.data_loader import download_dataset, save_metadata
from utils.vector_utils import create_vector_db
from config import v_auth_token, v_vector_folder, v_metadata_file

app = Flask(__name__)


@app.route('/health', methods=['GET'])
def health_check():
    """Liveness probe: always reports healthy."""
    return jsonify({"status": "healthy"}), 200


@app.route('/initialize', methods=['POST'])
def initialize():
    """Build or refresh the vector database from the remote dataset."""
    if not v_auth_token:
        # Without the HF token we can neither download nor upload.
        return jsonify({"error": "Authentication token not found"}), 500

    # Ensure writable base directory and subdirectories are initialized
    os.makedirs(v_vector_folder, exist_ok=True)
    if not os.path.exists(v_metadata_file):
        save_metadata(v_metadata_file, {})

    print("Starting Vector Database Creation...")

    # download_dataset returns a falsy value when upstream is unchanged.
    v_dataset_path = download_dataset(v_auth_token, v_metadata_file, v_vector_folder)
    if not v_dataset_path:
        return jsonify({"message": "Vector database is up-to-date"}), 200
    else:
        create_vector_db(v_dataset_path, v_vector_folder, v_auth_token)
        return jsonify({"message": "Vector database created successfully"}), 200


if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
config.py
DELETED
|
# config.py
#
# Central configuration: dataset location, auth token, and the writable
# paths used for the vector index and its metadata fingerprint.

import os

# Dataset configuration
v_dataset_url = "https://huggingface.co/datasets/vishalsh13/Dataset1/tree/main"

# Authentication token retrieved from Hugging Face secret
v_auth_token = os.getenv("hkey")  # The secret name is `hkey`

# Paths for vector database and metadata
v_base_path = "/tmp/vector_db"  # Writable directory
v_vector_folder = os.path.join(v_base_path, "vectors")
v_metadata_file = os.path.join(v_base_path, "metadata.json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/data_loader.py
DELETED
|
# utils/data_loader.py
#
# Download the source dataset from the Hugging Face Hub and track a
# lightweight metadata fingerprint so unchanged data is not re-processed.

import os
import json


def download_dataset(v_auth_token, v_metadata_file, v_vector_folder):
    """Fetch the dataset file if the upstream metadata changed.

    Returns the local path of the downloaded file, or ``False`` when the
    stored fingerprint already matches upstream (kept falsy on purpose:
    callers test ``if not result``).
    """
    v_current_metadata = fetch_metadata()
    v_existing_metadata = load_metadata(v_metadata_file)

    if v_current_metadata == v_existing_metadata:
        print("No updates detected. Skipping vector creation.")
        return False

    # Imported lazily so the metadata helpers remain usable even when
    # huggingface_hub is not installed.
    from huggingface_hub import hf_hub_download

    # Define the specific file to download
    v_filename = "train.csv"  # Replace this with the actual filename in your repository
    v_dataset_path = hf_hub_download(
        repo_id="vishalsh13/Dataset1",
        repo_type="dataset",
        # subfolder="data",  # Adjust or remove subfolder as needed
        filename=v_filename,
        token=v_auth_token,
    )

    print("Dataset downloaded successfully.")
    # Record the fingerprint only after a successful download.
    save_metadata(v_metadata_file, v_current_metadata)
    return v_dataset_path


def fetch_metadata():
    """Return the upstream dataset fingerprint (currently a static stub)."""
    # Simulate fetching metadata (e.g., hash or timestamp)
    return {"dataset_version": "v1.0"}


def load_metadata(v_metadata_file):
    """Load the stored fingerprint; an absent file reads as empty."""
    if os.path.exists(v_metadata_file):
        with open(v_metadata_file, "r") as file:
            return json.load(file)
    return {}


def save_metadata(v_metadata_file, v_metadata):
    """Persist the fingerprint, creating the parent directory if needed."""
    os.makedirs(os.path.dirname(v_metadata_file), exist_ok=True)
    with open(v_metadata_file, "w") as file:
        json.dump(v_metadata, file)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/logger.py
DELETED
|
# utils/logger.py
#
# Minimal file-based logging setup.

import logging


def setup_logger(v_log_file="app.log"):
    """Configure root logging to *v_log_file* and return the root logger.

    Uses ``logging.basicConfig``, so only the first configuring call in a
    process has any effect.
    """
    v_format = "%(asctime)s - %(levelname)s - %(message)s"
    logging.basicConfig(filename=v_log_file, level=logging.INFO, format=v_format)
    return logging.getLogger()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
utils/vector_utils.py
DELETED
|
# utils/vector_utils.py
#
# Build a FAISS index from the downloaded dataset and push the index file
# back to the Hugging Face dataset repository.

import os
import faiss
import numpy as np
from huggingface_hub import Repository
from sentence_transformers import SentenceTransformer


def create_vector_db(v_dataset_path, v_vector_folder, v_auth_token):
    """Embed every line of the dataset, persist a FAISS L2 index, upload it."""
    # Initialize the model
    obj_model = SentenceTransformer('all-MiniLM-L6-v2')

    # Load and process data. NOTE(review): lines are embedded verbatim,
    # trailing newlines included — confirm this is intended for CSV input.
    with open(v_dataset_path, 'r') as file:
        v_data = file.readlines()

    v_embeddings = obj_model.encode(v_data)

    # Save vectors locally
    os.makedirs(v_vector_folder, exist_ok=True)
    v_vector_file = os.path.join(v_vector_folder, "vector_index")
    v_index = faiss.IndexFlatL2(v_embeddings.shape[1])
    v_index.add(np.array(v_embeddings))
    faiss.write_index(v_index, v_vector_file)

    print(f"Vector database created and saved locally at {v_vector_file}")

    # Save vector file back to Hugging Face dataset repository
    upload_to_huggingface_repo(v_vector_file, v_auth_token)
    print("Vector file successfully uploaded to Hugging Face dataset repository.")


def upload_to_huggingface_repo(v_file_path, v_auth_token):
    """
    Upload the given file to the Hugging Face dataset repository.
    """
    # NOTE(review): huggingface_hub.Repository is deprecated in favour of
    # HfApi.upload_file — consider migrating when touching this code.
    v_repo_id = "vishalsh13/Dataset1"  # Replace with your repository name
    v_repo = Repository(
        local_dir="temp_repo",
        clone_from=v_repo_id,
        use_auth_token=v_auth_token
    )

    # Copy the file to the repository directory
    os.makedirs(v_repo.local_dir, exist_ok=True)
    v_dest_path = os.path.join(v_repo.local_dir, os.path.basename(v_file_path))
    os.replace(v_file_path, v_dest_path)

    # Commit and push the changes
    v_repo.git_add(v_dest_path)
    v_repo.git_commit("Upload updated vector file.")
    v_repo.git_push()
    print(f"Uploaded {os.path.basename(v_file_path)} to Hugging Face repository: {v_repo_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
vector_db/metadata.json
DELETED
|
File without changes
|