Commit: update

Files changed:
- app.py +13 -6
- build_cache.py +5 -5
- modular_graph_and_candidates.py +42 -16
app.py
CHANGED

@@ -11,7 +11,7 @@ from huggingface_hub import hf_hub_download
 
 import gradio as gr
 
-from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
+from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html, filter_graph_by_threshold
 
 def _escape_srcdoc(text: str) -> str:
     """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
@@ -33,14 +33,21 @@ def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimo
     latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
     info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
     sha = info.get("sha")
-    key = f"{sha}/{sim_method}-
-    html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html", repo_type="dataset")
+    key = f"{sha}/{sim_method}-m{int(multimodal)}"
     json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
-
-
+
+    raw_data = json.loads(Path(json_fp).read_text(encoding="utf-8"))
+    filtered_data = filter_graph_by_threshold(raw_data, threshold)
+
+    if kind == "timeline":
+        from modular_graph_and_candidates import generate_timeline_html
+        raw_html = generate_timeline_html(filtered_data)
+    else:
+        raw_html = generate_html(filtered_data)
+
     iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
     tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1])
-    tmp.write_text(
+    tmp.write_text(json.dumps(filtered_data), encoding="utf-8")
     return iframe_html, str(tmp)
 
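The gist of this hunk: the cache key no longer encodes a threshold, so one pre-built artifact per (sha, sim_method, multimodal) combination serves every slider value, and filter_graph_by_threshold narrows it after download. A minimal sketch of the key scheme, with hypothetical values:

    # Hypothetical inputs; the key format mirrors the diff above.
    sha = "0123abcd"                    # commit sha read from latest.json
    sim_method, multimodal = "jaccard", False
    key = f"{sha}/{sim_method}-m{int(multimodal)}"
    assert key == "0123abcd/jaccard-m0"
    # The requested threshold is applied client-side afterwards:
    #   filtered = filter_graph_by_threshold(raw_data, threshold)
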
build_cache.py
CHANGED

@@ -14,7 +14,7 @@ from modular_graph_and_candidates import (
 
 REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
 CACHE_REPO = "Molbap/hf_cached_embeds_log"
-
+MIN_THRESH = 0.1  # Minimum threshold for caching similarities
 MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1","true","True","YES","yes"}
 SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
 
@@ -24,19 +24,19 @@ def main():
     sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
     repo_path = tmp / "repo"
 
-    graph = build_graph_json(repo_path, threshold=
-    timeline = build_timeline_json(repo_path, threshold=
+    graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
+    timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
     graph_html = generate_html(graph)
     timeline_html = generate_timeline_html(timeline)
 
     api = HfApi()
     api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)
 
-    key = f"{sha}/{SIM_METHOD}-
+    key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
     latest = {
         "sha": sha,
         "updated_utc": datetime.now(timezone.utc).isoformat(),
-        "defaults": {"sim_method": SIM_METHOD, "
+        "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
         "paths": {
             "graph_json": f"graph/{key}.json",
             "graph_html": f"graph/{key}.html",
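For reference, an illustrative shape of the latest.json that main() publishes above; all values are hypothetical and only the fields visible in the diff are shown:

    latest = {
        "sha": "0123abcd",
        "updated_utc": "2025-01-01T00:00:00+00:00",
        "defaults": {"sim_method": "jaccard", "min_threshold": 0.1, "multimodal": False},
        "paths": {
            "graph_json": "graph/0123abcd/jaccard-m0.json",
            "graph_html": "graph/0123abcd/jaccard-m0.html",
        },
    }
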
modular_graph_and_candidates.py
CHANGED

@@ -87,8 +87,7 @@ def build_token_bags(models_root: Path) -> Tuple[Dict[str, List[Set[str]]], Dict
 def _jaccard(a: Set[str], b: Set[str]) -> float:
     return 0.0 if (not a or not b) else len(a & b) / len(a | b)
 
-def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tuple[str,str], float]:
-    """Return {(modelA, modelB): score} for pairs with Jaccard ≥ *thr*."""
+def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float = 0.1) -> Dict[Tuple[str,str], float]:
     largest = {m: max(ts, key=len) for m, ts in bags.items() if ts}
     out: Dict[Tuple[str,str], float] = {}
     for m1, m2 in combinations(sorted(largest.keys()), 2):
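A tiny sanity check of the Jaccard scoring that similarity_clusters relies on (the token names are made up):

    a = {"attention", "mlp", "layernorm"}
    b = {"attention", "mlp", "rope"}
    assert len(a & b) / len(a | b) == 0.5    # 2 shared / 4 distinct tokens
    # With the new default thr=0.1, a pair scoring 0.5 is kept as a candidate.
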
@@ -98,8 +97,8 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
     return out
 
 @spaces.GPU
-def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
-    model = SentenceTransformer("
+def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float = 0.1) -> Dict[Tuple[str, str], float]:
+    model = SentenceTransformer("microsoft/codebert-base", trust_remote_code=True)
 
     try:
         cfg = model[0].auto_model.config
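A hedged sketch of what the embedding path amounts to: encode code text through the SentenceTransformer wrapper around microsoft/codebert-base and score pairs by cosine similarity. This is illustrative only; the repo's actual pooling and batching may differ, and wrapping a plain BERT-style checkpoint this way makes sentence-transformers add default mean pooling:

    from sentence_transformers import SentenceTransformer, util

    model = SentenceTransformer("microsoft/codebert-base", trust_remote_code=True)
    texts = ["...modeling_a.py source...", "...modeling_b.py source..."]   # hypothetical inputs
    emb = model.encode(texts, convert_to_tensor=True, normalize_embeddings=True)
    score = util.cos_sim(emb[0], emb[1]).item()   # pair kept when score >= thr (default 0.1)
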
@@ -254,7 +253,27 @@ def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], f
         print(f"⚠️ Failed to compute from cache: {e}")
         return {}
 
+def filter_similarities_by_threshold(similarities: Dict[Tuple[str, str], float], threshold: float) -> Dict[Tuple[str, str], float]:
+    return {pair: score for pair, score in similarities.items() if score >= threshold}
+
+def filter_graph_by_threshold(graph_data: dict, threshold: float) -> dict:
+    filtered_links = []
+    for link in graph_data["links"]:
+        if link.get("cand", False):
+            try:
+                score = float(link["label"].rstrip('%')) / 100.0
+                if score >= threshold:
+                    filtered_links.append(link)
+            except (ValueError, AttributeError):
+                filtered_links.append(link)
+        else:
+            filtered_links.append(link)
+
+    return {
+        "nodes": graph_data["nodes"],
+        "links": filtered_links,
+        **{k: v for k, v in graph_data.items() if k not in ["nodes", "links"]}
+    }
 
 
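Usage sketch for filter_graph_by_threshold as defined above. The percent-string "label" format is taken from the parsing logic in the hunk; the node and link fields are hypothetical but follow the shapes used elsewhere in the diff:

    graph = {
        "nodes": [{"id": "llama", "cls": "base", "sz": 1},
                  {"id": "mistral", "cls": "derived", "sz": 1}],
        "links": [
            {"source": "llama", "target": "mistral", "cand": True, "label": "57%"},
            {"source": "llama", "target": "mistral", "cand": False},   # non-candidate edges always survive
        ],
    }
    assert len(filter_graph_by_threshold(graph, 0.5)["links"]) == 2   # 0.57 >= 0.5, both kept
    assert len(filter_graph_by_threshold(graph, 0.6)["links"]) == 1   # 0.57 < 0.6 drops the candidate
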
@@ -316,21 +335,19 @@ def get_missing_models(models_root: Path, multimodal: bool = False) -> Tuple[Lis
 
     return missing, bags, pix_hits
 
-def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
+def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
                          threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
-
+    min_threshold = 0.1
     if sim_method == "jaccard":
-        return similarity_clusters({m: bags[m] for m in missing},
+        return similarity_clusters({m: bags[m] for m in missing}, min_threshold)
     else:
-        # Try to use cached embeddings first
         embeddings_path = Path("embeddings_cache.npz")
         if embeddings_path.exists():
-            cached_sims = compute_similarities_from_cache(
-            if cached_sims:
+            cached_sims = compute_similarities_from_cache(min_threshold)
+            if cached_sims:
                 return cached_sims
-
-
-        return embedding_similarity_clusters(models_root, missing, threshold)
+
+        return embedding_similarity_clusters(models_root, missing, min_threshold)
 
 def build_graph_json(
     transformers_dir: Path,
@@ -347,7 +364,7 @@ def build_graph_json(
     if sim_method == "embedding" and embeddings_cache.exists():
         try:
             # Try to compute from cache without accessing repo
-            cached_sims = compute_similarities_from_cache(
+            cached_sims = compute_similarities_from_cache(0.1)
             print(f"🔍 Got {len(cached_sims)} cached similarities")
 
             if cached_sims:
@@ -393,8 +410,13 @@
                 cls = "derived"
             nodelist.append({"id": n, "cls": cls, "sz": 1})
 
+            graph = {"nodes": nodelist, "links": links}
             print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
-
+
+            if threshold > 0.1:
+                graph = filter_graph_by_threshold(graph, threshold)
+
+            return graph
         except Exception as e:
             print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
 
@@ -452,7 +474,11 @@
             cls = "derived"
         nodelist.append({"id": n, "cls": cls, "sz": 1 + 2*(deg[n]/max_deg)})
 
-    graph = {"nodes": nodelist, "links": links}
+    graph = {"nodes": nodelist, "links": links}
+
+    if threshold > 0.1:
+        graph = filter_graph_by_threshold(graph, threshold)
+
     return graph
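Taken together, the build_graph_json hunks implement a compute-once, filter-on-demand pattern: similarities are always computed (and cached) at the 0.1 floor, and stricter thresholds are applied as a cheap post-filter. A condensed sketch, with a hypothetical helper name:

    MIN_THRESH = 0.1    # mirrors MIN_THRESH in build_cache.py

    def get_graph(threshold: float) -> dict:
        graph = build_graph_at_floor()    # hypothetical: cached or full build at the 0.1 floor
        if threshold > MIN_THRESH:
            graph = filter_graph_by_threshold(graph, threshold)
        return graph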