Spaces:

AshenH
/

ALM_LLM

Sleeping

App Files Files Community

AshenH committed on Oct 8

Commit

2dcd5ce

verified ·

1 Parent(s): 7caa980

Update tools/predict_tool.py

Browse files

Files changed (1) hide show

tools/predict_tool.py +86 -16

tools/predict_tool.py CHANGED Viewed

@@ -1,32 +1,102 @@
 import os
 import pandas as pd
 import joblib
 from huggingface_hub import hf_hub_download
 from utils.config import AppConfig
 from utils.tracing import Tracer
 class PredictTool:
     def __init__(self, cfg: AppConfig, tracer: Tracer):
         self.cfg = cfg
         self.tracer = tracer
         self._model = None
-        self._feature_meta = None
     def _ensure_loaded(self):
-        if self._model is None:
-            path = hf_hub_download(repo_id=self.cfg.hf_model_repo, filename="model.pkl", token=os.getenv("HF_TOKEN"))
-            self._model = joblib.load(path)
-            meta = hf_hub_download(repo_id=self.cfg.hf_model_repo, filename="feature_metadata.json", token=os.getenv("HF_TOKEN"))
-            import json
-            with open(meta, "r") as f:
-                self._feature_meta = json.load(f)
-    def run(self, df: pd.DataFrame) -> pd.DataFrame:
         self._ensure_loaded()
-        use_cols = self._feature_meta.get("feature_order", list(df.columns))
-        X = df[use_cols].copy()
-        preds = self._model.predict_proba(X)[:, 1] if hasattr(self._model, "predict_proba") else self._model.predict(X)
         out = df.copy()
-        out[self._feature_meta.get("prediction_column", "prediction")] = preds
-        self.tracer.trace_event("predict", {"rows": len(out)})
-        return out

+# space/tools/predict_tool.py
 import os
+import json
 import pandas as pd
 import joblib
+from typing import Optional, List
 from huggingface_hub import hf_hub_download
 from utils.config import AppConfig
 from utils.tracing import Tracer
 class PredictTool:
+    """
+    Loads a sklearn-compatible tabular model artifact from a private/public
+    Hugging Face repo and runs batch predictions on a DataFrame.
+    Expects:
+      - model.pkl
+      - feature_metadata.json  (optional but recommended)
+        {
+          "feature_order": ["col1","col2",...],
+          "prediction_column": "prediction",
+          "task": "classification" | "regression"
+        }
+    """
     def __init__(self, cfg: AppConfig, tracer: Tracer):
         self.cfg = cfg
         self.tracer = tracer
         self._model = None
+        self._feature_meta = {}
+        self._pred_col = "prediction"
+        self._feature_order: Optional[List[str]] = None
     def _ensure_loaded(self):
+        if self._model is not None:
+            return
+        token = os.getenv("HF_TOKEN")  # OK if None for public repos
+        repo = self.cfg.hf_model_repo
+        model_path = hf_hub_download(
+            repo_id=repo,
+            filename="model.pkl",
+            token=token
+        )
+        self._model = joblib.load(model_path)
+        # feature metadata is optional; handle gracefully
+        try:
+            meta_path = hf_hub_download(
+                repo_id=repo,
+                filename="feature_metadata.json",
+                token=token
+            )
+            with open(meta_path, "r", encoding="utf-8") as f:
+                self._feature_meta = json.load(f) or {}
+        except Exception:
+            self._feature_meta = {}
+        self._pred_col = self._feature_meta.get("prediction_column", "prediction")
+        self._feature_order = self._feature_meta.get("feature_order")
+    def _select_features(self, df: pd.DataFrame) -> pd.DataFrame:
+        if self._feature_order:
+            # keep only features in the trained order, ignore extras
+            missing = [c for c in self._feature_order if c not in df.columns]
+            if missing:
+                raise ValueError(f"Missing required features for model: {missing}")
+            return df[self._feature_order].copy()
+        # default: use everything present
+        return df.copy()
+    def run(self, df: Optional[pd.DataFrame]) -> pd.DataFrame:
+        """
+        If df is None, returns an empty DataFrame.
+        """
         self._ensure_loaded()
+        if df is None or len(df) == 0:
+            return pd.DataFrame()
+        X = self._select_features(df)
+        model = self._model
+        # classification with probabilities preferred
+        if hasattr(model, "predict_proba"):
+            preds = model.predict_proba(X)[:, -1]
+        elif hasattr(model, "decision_function"):
+            # fallback: map decision function to a score
+            import numpy as np
+            raw = model.decision_function(X)
+            # simple sigmoid to scale-ish if binary
+            preds = 1 / (1 + np.exp(-raw))
+        else:
+            preds = model.predict(X)
         out = df.copy()
+        out[self._pred_col] = preds
+        try:
+            self.tracer.trace_event("predict", {"rows": len(out)})
+        except Exception:
+            pass
+        return out