File size: 2,470 Bytes
2dcd5ce
91c65e4
2dcd5ce
91c65e4
 
2dcd5ce
 
91c65e4
 
 
 
2dcd5ce
91c65e4
2dcd5ce
32817e1
2dcd5ce
91c65e4
 
 
 
2dcd5ce
 
 
91c65e4
 
2dcd5ce
 
 
32817e1
2dcd5ce
 
32817e1
2dcd5ce
 
 
32817e1
2dcd5ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91c65e4
2dcd5ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
91c65e4
2dcd5ce
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# space/tools/predict_tool.py
import json
import logging
import os
from typing import Optional, List

import joblib
import pandas as pd
from huggingface_hub import hf_hub_download

from utils.config import AppConfig
from utils.tracing import Tracer


class PredictTool:
    """
    Loads a sklearn-compatible tabular model from a HF repo and runs predictions.

    ``model.pkl`` and the optional ``feature_metadata.json`` are downloaded
    lazily, on the first call to :meth:`run`, from the repo named by
    ``cfg.hf_model_repo``. The ``HF_TOKEN`` environment variable (if set) is
    passed to the hub as the access token.

    Metadata keys that are read:
        prediction_column: name of the output column (default ``"prediction"``).
        feature_order: column names to select, in order, before calling the
            model. When absent or empty, every input column is passed through.
    """

    _DEFAULT_PRED_COL = "prediction"
    _logger = logging.getLogger(__name__)

    def __init__(self, cfg: "AppConfig", tracer: "Tracer"):
        """Store collaborators; nothing is downloaded until :meth:`run`.

        Args:
            cfg: Application config; ``cfg.hf_model_repo`` is read on first load.
            tracer: Receives a ``"predict"`` event after each non-empty run.
        """
        self.cfg = cfg
        self.tracer = tracer
        # None doubles as the "not loaded yet" marker checked by _ensure_loaded.
        self._model = None
        self._feature_meta = {}
        self._pred_col = self._DEFAULT_PRED_COL
        self._feature_order: Optional[List[str]] = None

    def _ensure_loaded(self):
        """Download and cache the model and its metadata; no-op once loaded.

        Raises:
            Exception: whatever ``hf_hub_download`` or ``joblib.load`` raise
                for the model file. Metadata problems never raise.
        """
        if self._model is not None:
            return

        token = os.getenv("HF_TOKEN")
        repo = self.cfg.hf_model_repo

        model_path = hf_hub_download(repo_id=repo, filename="model.pkl", token=token)
        # SECURITY: joblib.load unpickles the file, and unpickling can execute
        # arbitrary code. Only point hf_model_repo at repositories you trust.
        model = joblib.load(model_path)

        meta = self._load_feature_meta(repo, token)

        # A null/empty prediction_column would otherwise create a column
        # literally named None, so fall back to the default name.
        pred_col = meta.get("prediction_column") or self._DEFAULT_PRED_COL

        order = meta.get("feature_order")
        if isinstance(order, str):
            # A bare string would be iterated character by character and would
            # select a Series instead of a DataFrame; treat it as one column.
            order = [order]

        # Publish the model last: it is the "loaded" marker, so a failure
        # anywhere above leaves the tool unloaded and the next call retries
        # instead of running with half-initialised metadata.
        self._feature_meta = meta
        self._pred_col = pred_col
        self._feature_order = order
        self._model = model

    def _load_feature_meta(self, repo, token) -> dict:
        """Fetch ``feature_metadata.json``; return ``{}`` if missing or malformed."""
        try:
            meta_path = hf_hub_download(
                repo_id=repo, filename="feature_metadata.json", token=token
            )
            with open(meta_path, "r", encoding="utf-8") as f:
                meta = json.load(f)
        except Exception as exc:
            # Metadata is optional, so stay best-effort, but leave a trace:
            # falling back to defaults changes which columns reach the model.
            self._logger.warning(
                "feature_metadata.json unavailable for %s; using defaults (%s)",
                repo,
                exc,
            )
            return {}

        if not isinstance(meta, dict):
            if meta:
                self._logger.warning(
                    "feature_metadata.json for %s is not a JSON object; ignoring it",
                    repo,
                )
            return {}
        return meta

    def _select_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``df`` restricted to the model's feature columns.

        Raises:
            ValueError: if a column listed in ``feature_order`` is not in ``df``.
        """
        if not self._feature_order:
            return df.copy()

        missing = [c for c in self._feature_order if c not in df.columns]
        if missing:
            raise ValueError(f"Missing required features for model: {missing}")
        return df[self._feature_order].copy()

    def run(self, df: Optional[pd.DataFrame]) -> pd.DataFrame:
        """Score ``df`` and return a copy with the prediction column added.

        The score is taken from the first method the model offers:
        ``predict_proba`` (last column), ``decision_function`` (squashed
        through a logistic function), or ``predict``.

        Args:
            df: Input rows. ``None`` or a frame with no rows yields an empty
                ``DataFrame``.

        Returns:
            A copy of ``df`` with the prediction column appended (or
            overwritten if a column of that name already exists). The input
            frame is not modified.

        Raises:
            ValueError: if required feature columns are missing from ``df``.
        """
        # Load first so that configuration/download errors surface even when
        # the caller happens to pass an empty frame.
        self._ensure_loaded()
        if df is None or len(df) == 0:
            return pd.DataFrame()

        X = self._select_features(df)
        model = self._model

        if hasattr(model, "predict_proba"):
            preds = model.predict_proba(X)[:, -1]
        elif hasattr(model, "decision_function"):
            import numpy as np

            raw = np.asarray(model.decision_function(X), dtype=float)
            # sigmoid(raw) written as exp(-log(1 + exp(-raw))): logaddexp keeps
            # large negative scores from overflowing exp().
            preds = np.exp(-np.logaddexp(0.0, -raw))
        else:
            preds = model.predict(X)

        out = df.copy()
        out[self._pred_col] = preds
        try:
            self.tracer.trace_event("predict", {"rows": len(out)})
        except Exception:
            # Tracing must never break a prediction, but do not hide the error.
            self._logger.debug("trace_event('predict') failed", exc_info=True)
        return out