Molbap HF Staff committed on
Commit
31b1b7e
·
1 Parent(s): 4c965e0
Files changed (3) hide show
  1. app.py +13 -6
  2. build_cache.py +5 -5
  3. modular_graph_and_candidates.py +42 -16
app.py CHANGED
@@ -11,7 +11,7 @@ from huggingface_hub import hf_hub_download
11
 
12
  import gradio as gr
13
 
14
- from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
15
 
16
  def _escape_srcdoc(text: str) -> str:
17
  """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
@@ -33,14 +33,21 @@ def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimo
33
  latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
34
  info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
35
  sha = info.get("sha")
36
- key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
37
- html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html", repo_type="dataset")
38
  json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
39
- raw_html = Path(html_fp).read_text(encoding="utf-8")
40
- json_text = Path(json_fp).read_text(encoding="utf-8")
 
 
 
 
 
 
 
 
41
  iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
42
  tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1])
43
- tmp.write_text(json_text, encoding="utf-8")
44
  return iframe_html, str(tmp)
45
 
46
 
 
11
 
12
  import gradio as gr
13
 
14
+ from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html, filter_graph_by_threshold
15
 
16
  def _escape_srcdoc(text: str) -> str:
17
  """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
 
33
  latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
34
  info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
35
  sha = info.get("sha")
36
+ key = f"{sha}/{sim_method}-m{int(multimodal)}"
 
37
  json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
38
+
39
+ raw_data = json.loads(Path(json_fp).read_text(encoding="utf-8"))
40
+ filtered_data = filter_graph_by_threshold(raw_data, threshold)
41
+
42
+ if kind == "timeline":
43
+ from modular_graph_and_candidates import generate_timeline_html
44
+ raw_html = generate_timeline_html(filtered_data)
45
+ else:
46
+ raw_html = generate_html(filtered_data)
47
+
48
  iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
49
  tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1])
50
+ tmp.write_text(json.dumps(filtered_data), encoding="utf-8")
51
  return iframe_html, str(tmp)
52
 
53
 
build_cache.py CHANGED
@@ -14,7 +14,7 @@ from modular_graph_and_candidates import (
14
 
15
  REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
16
  CACHE_REPO = "Molbap/hf_cached_embeds_log"
17
- THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
18
  MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1","true","True","YES","yes"}
19
  SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
20
 
@@ -24,19 +24,19 @@ def main():
24
  sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
25
  repo_path = tmp / "repo"
26
 
27
- graph = build_graph_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
28
- timeline = build_timeline_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
29
  graph_html = generate_html(graph)
30
  timeline_html = generate_timeline_html(timeline)
31
 
32
  api = HfApi()
33
  api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)
34
 
35
- key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
36
  latest = {
37
  "sha": sha,
38
  "updated_utc": datetime.now(timezone.utc).isoformat(),
39
- "defaults": {"sim_method": SIM_METHOD, "threshold": THRESH, "multimodal": MULTIMODAL},
40
  "paths": {
41
  "graph_json": f"graph/{key}.json",
42
  "graph_html": f"graph/{key}.html",
 
14
 
15
  REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
16
  CACHE_REPO = "Molbap/hf_cached_embeds_log"
17
+ MIN_THRESH = 0.1 # Minimum threshold for caching similarities
18
  MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1","true","True","YES","yes"}
19
  SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
20
 
 
24
  sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
25
  repo_path = tmp / "repo"
26
 
27
+ graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
28
+ timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
29
  graph_html = generate_html(graph)
30
  timeline_html = generate_timeline_html(timeline)
31
 
32
  api = HfApi()
33
  api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)
34
 
35
+ key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
36
  latest = {
37
  "sha": sha,
38
  "updated_utc": datetime.now(timezone.utc).isoformat(),
39
+ "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
40
  "paths": {
41
  "graph_json": f"graph/{key}.json",
42
  "graph_html": f"graph/{key}.html",
modular_graph_and_candidates.py CHANGED
@@ -87,8 +87,7 @@ def build_token_bags(models_root: Path) -> Tuple[Dict[str, List[Set[str]]], Dict
87
  def _jaccard(a: Set[str], b: Set[str]) -> float:
88
  return 0.0 if (not a or not b) else len(a & b) / len(a | b)
89
 
90
- def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tuple[str,str], float]:
91
- """Return {(modelA, modelB): score} for pairs with Jaccard ≥ *thr*."""
92
  largest = {m: max(ts, key=len) for m, ts in bags.items() if ts}
93
  out: Dict[Tuple[str,str], float] = {}
94
  for m1, m2 in combinations(sorted(largest.keys()), 2):
@@ -98,8 +97,8 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
98
  return out
99
 
100
  @spaces.GPU
101
- def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
102
- model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
103
 
104
  try:
105
  cfg = model[0].auto_model.config
@@ -254,7 +253,27 @@ def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], f
254
  print(f"⚠️ Failed to compute from cache: {e}")
255
  return {}
256
 
 
 
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
 
260
 
@@ -316,21 +335,19 @@ def get_missing_models(models_root: Path, multimodal: bool = False) -> Tuple[Lis
316
 
317
  return missing, bags, pix_hits
318
 
319
- def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
320
  threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
321
- """Compute similarities between missing models using specified method."""
322
  if sim_method == "jaccard":
323
- return similarity_clusters({m: bags[m] for m in missing}, threshold)
324
  else:
325
- # Try to use cached embeddings first
326
  embeddings_path = Path("embeddings_cache.npz")
327
  if embeddings_path.exists():
328
- cached_sims = compute_similarities_from_cache(threshold)
329
- if cached_sims: # Cache exists and worked
330
  return cached_sims
331
-
332
- # Fallback to full computation
333
- return embedding_similarity_clusters(models_root, missing, threshold)
334
 
335
  def build_graph_json(
336
  transformers_dir: Path,
@@ -347,7 +364,7 @@ def build_graph_json(
347
  if sim_method == "embedding" and embeddings_cache.exists():
348
  try:
349
  # Try to compute from cache without accessing repo
350
- cached_sims = compute_similarities_from_cache(threshold)
351
  print(f"🔍 Got {len(cached_sims)} cached similarities")
352
 
353
  if cached_sims:
@@ -393,8 +410,13 @@ def build_graph_json(
393
  cls = "derived"
394
  nodelist.append({"id": n, "cls": cls, "sz": 1})
395
 
 
396
  print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
397
- return {"nodes": nodelist, "links": links}
 
 
 
 
398
  except Exception as e:
399
  print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
400
 
@@ -452,7 +474,11 @@ def build_graph_json(
452
  cls = "derived"
453
  nodelist.append({"id": n, "cls": cls, "sz": 1 + 2*(deg[n]/max_deg)})
454
 
455
- graph = {"nodes": nodelist, "links": links}
 
 
 
 
456
  return graph
457
 
458
 
 
87
  def _jaccard(a: Set[str], b: Set[str]) -> float:
88
  return 0.0 if (not a or not b) else len(a & b) / len(a | b)
89
 
90
+ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float = 0.1) -> Dict[Tuple[str,str], float]:
 
91
  largest = {m: max(ts, key=len) for m, ts in bags.items() if ts}
92
  out: Dict[Tuple[str,str], float] = {}
93
  for m1, m2 in combinations(sorted(largest.keys()), 2):
 
97
  return out
98
 
99
  @spaces.GPU
100
+ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float = 0.1) -> Dict[Tuple[str, str], float]:
101
+ model = SentenceTransformer("microsoft/codebert-base", trust_remote_code=True)
102
 
103
  try:
104
  cfg = model[0].auto_model.config
 
253
  print(f"⚠️ Failed to compute from cache: {e}")
254
  return {}
255
 
256
+ def filter_similarities_by_threshold(similarities: Dict[Tuple[str, str], float], threshold: float) -> Dict[Tuple[str, str], float]:
257
+ return {pair: score for pair, score in similarities.items() if score >= threshold}
258
 
259
+ def filter_graph_by_threshold(graph_data: dict, threshold: float) -> dict:
260
+ filtered_links = []
261
+ for link in graph_data["links"]:
262
+ if link.get("cand", False):
263
+ try:
264
+ score = float(link["label"].rstrip('%')) / 100.0
265
+ if score >= threshold:
266
+ filtered_links.append(link)
267
+ except (ValueError, AttributeError):
268
+ filtered_links.append(link)
269
+ else:
270
+ filtered_links.append(link)
271
+
272
+ return {
273
+ "nodes": graph_data["nodes"],
274
+ "links": filtered_links,
275
+ **{k: v for k, v in graph_data.items() if k not in ["nodes", "links"]}
276
+ }
277
 
278
 
279
 
 
335
 
336
  return missing, bags, pix_hits
337
 
338
+ def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
339
  threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
340
+ min_threshold = 0.1
341
  if sim_method == "jaccard":
342
+ return similarity_clusters({m: bags[m] for m in missing}, min_threshold)
343
  else:
 
344
  embeddings_path = Path("embeddings_cache.npz")
345
  if embeddings_path.exists():
346
+ cached_sims = compute_similarities_from_cache(min_threshold)
347
+ if cached_sims:
348
  return cached_sims
349
+
350
+ return embedding_similarity_clusters(models_root, missing, min_threshold)
 
351
 
352
  def build_graph_json(
353
  transformers_dir: Path,
 
364
  if sim_method == "embedding" and embeddings_cache.exists():
365
  try:
366
  # Try to compute from cache without accessing repo
367
+ cached_sims = compute_similarities_from_cache(0.1)
368
  print(f"🔍 Got {len(cached_sims)} cached similarities")
369
 
370
  if cached_sims:
 
410
  cls = "derived"
411
  nodelist.append({"id": n, "cls": cls, "sz": 1})
412
 
413
+ graph = {"nodes": nodelist, "links": links}
414
  print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
415
+
416
+ if threshold > 0.1:
417
+ graph = filter_graph_by_threshold(graph, threshold)
418
+
419
+ return graph
420
  except Exception as e:
421
  print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
422
 
 
474
  cls = "derived"
475
  nodelist.append({"id": n, "cls": cls, "sz": 1 + 2*(deg[n]/max_deg)})
476
 
477
+ graph = {"nodes": nodelist, "links": links}
478
+
479
+ if threshold > 0.1:
480
+ graph = filter_graph_by_threshold(graph, threshold)
481
+
482
  return graph
483
 
484