Molbap HF Staff committed on
Commit
31b1b7e
·
1 Parent(s): 4c965e0
Files changed (3) hide show
  1. app.py +13 -6
  2. build_cache.py +5 -5
  3. modular_graph_and_candidates.py +42 -16
app.py CHANGED
@@ -11,7 +11,7 @@ from huggingface_hub import hf_hub_download
11
 
12
  import gradio as gr
13
 
14
- from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html
15
 
16
  def _escape_srcdoc(text: str) -> str:
17
  """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
@@ -33,14 +33,21 @@ def _fetch_from_cache_repo(kind: str, sim_method: str, threshold: float, multimo
33
  latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
34
  info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
35
  sha = info.get("sha")
36
- key = f"{sha}/{sim_method}-{threshold:.2f}-m{int(multimodal)}"
37
- html_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.html", repo_type="dataset")
38
  json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
39
- raw_html = Path(html_fp).read_text(encoding="utf-8")
40
- json_text = Path(json_fp).read_text(encoding="utf-8")
 
 
 
 
 
 
 
 
41
  iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
42
  tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1])
43
- tmp.write_text(json_text, encoding="utf-8")
44
  return iframe_html, str(tmp)
45
 
46
 
 
11
 
12
  import gradio as gr
13
 
14
+ from modular_graph_and_candidates import build_graph_json, generate_html, build_timeline_json, generate_timeline_html, filter_graph_by_threshold
15
 
16
  def _escape_srcdoc(text: str) -> str:
17
  """Escape for inclusion inside an <iframe srcdoc="…"> attribute."""
 
33
  latest_fp = hf_hub_download(repo_id=repo_id, filename="latest.json", repo_type="dataset")
34
  info = json.loads(Path(latest_fp).read_text(encoding="utf-8"))
35
  sha = info.get("sha")
36
+ key = f"{sha}/{sim_method}-m{int(multimodal)}"
 
37
  json_fp = hf_hub_download(repo_id=repo_id, filename=f"{kind}/{key}.json", repo_type="dataset")
38
+
39
+ raw_data = json.loads(Path(json_fp).read_text(encoding="utf-8"))
40
+ filtered_data = filter_graph_by_threshold(raw_data, threshold)
41
+
42
+ if kind == "timeline":
43
+ from modular_graph_and_candidates import generate_timeline_html
44
+ raw_html = generate_timeline_html(filtered_data)
45
+ else:
46
+ raw_html = generate_html(filtered_data)
47
+
48
  iframe_html = f'<iframe style="width:100%;height:85vh;border:none;" srcdoc="{_escape_srcdoc(raw_html)}"></iframe>'
49
  tmp = Path(tempfile.mkstemp(suffix=("_timeline.json" if kind == "timeline" else ".json"))[1])
50
+ tmp.write_text(json.dumps(filtered_data), encoding="utf-8")
51
  return iframe_html, str(tmp)
52
 
53
 
build_cache.py CHANGED
@@ -14,7 +14,7 @@ from modular_graph_and_candidates import (
14
 
15
  REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
16
  CACHE_REPO = "Molbap/hf_cached_embeds_log"
17
- THRESH = float(os.getenv("SIM_THRESHOLD", "0.50"))
18
  MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1","true","True","YES","yes"}
19
  SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
20
 
@@ -24,19 +24,19 @@ def main():
24
  sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
25
  repo_path = tmp / "repo"
26
 
27
- graph = build_graph_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
28
- timeline = build_timeline_json(repo_path, threshold=THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
29
  graph_html = generate_html(graph)
30
  timeline_html = generate_timeline_html(timeline)
31
 
32
  api = HfApi()
33
  api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)
34
 
35
- key = f"{sha}/{SIM_METHOD}-{THRESH:.2f}-m{int(MULTIMODAL)}"
36
  latest = {
37
  "sha": sha,
38
  "updated_utc": datetime.now(timezone.utc).isoformat(),
39
- "defaults": {"sim_method": SIM_METHOD, "threshold": THRESH, "multimodal": MULTIMODAL},
40
  "paths": {
41
  "graph_json": f"graph/{key}.json",
42
  "graph_html": f"graph/{key}.html",
 
14
 
15
  REPO_URL = os.getenv("REPO_URL", "https://github.com/huggingface/transformers")
16
  CACHE_REPO = "Molbap/hf_cached_embeds_log"
17
+ MIN_THRESH = 0.1 # Minimum threshold for caching similarities
18
  MULTIMODAL = os.getenv("MULTIMODAL", "0") in {"1","true","True","YES","yes"}
19
  SIM_METHOD = os.getenv("SIM_METHOD", "jaccard")
20
 
 
24
  sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=tmp / "repo", text=True).strip()
25
  repo_path = tmp / "repo"
26
 
27
+ graph = build_graph_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
28
+ timeline = build_timeline_json(repo_path, threshold=MIN_THRESH, multimodal=MULTIMODAL, sim_method=SIM_METHOD)
29
  graph_html = generate_html(graph)
30
  timeline_html = generate_timeline_html(timeline)
31
 
32
  api = HfApi()
33
  api.create_repo(repo_id=CACHE_REPO, repo_type="dataset", exist_ok=True)
34
 
35
+ key = f"{sha}/{SIM_METHOD}-m{int(MULTIMODAL)}"
36
  latest = {
37
  "sha": sha,
38
  "updated_utc": datetime.now(timezone.utc).isoformat(),
39
+ "defaults": {"sim_method": SIM_METHOD, "min_threshold": MIN_THRESH, "multimodal": MULTIMODAL},
40
  "paths": {
41
  "graph_json": f"graph/{key}.json",
42
  "graph_html": f"graph/{key}.html",
modular_graph_and_candidates.py CHANGED
@@ -87,8 +87,7 @@ def build_token_bags(models_root: Path) -> Tuple[Dict[str, List[Set[str]]], Dict
87
  def _jaccard(a: Set[str], b: Set[str]) -> float:
88
  return 0.0 if (not a or not b) else len(a & b) / len(a | b)
89
 
90
- def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tuple[str,str], float]:
91
- """Return {(modelA, modelB): score} for pairs with Jaccard ≥ *thr*."""
92
  largest = {m: max(ts, key=len) for m, ts in bags.items() if ts}
93
  out: Dict[Tuple[str,str], float] = {}
94
  for m1, m2 in combinations(sorted(largest.keys()), 2):
@@ -98,8 +97,8 @@ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float) -> Dict[Tup
98
  return out
99
 
100
  @spaces.GPU
101
- def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float) -> Dict[Tuple[str, str], float]:
102
- model = SentenceTransformer("codesage/codesage-large-v2", device="cuda", trust_remote_code=True)
103
 
104
  try:
105
  cfg = model[0].auto_model.config
@@ -254,7 +253,27 @@ def compute_similarities_from_cache(threshold: float) -> Dict[Tuple[str, str], f
254
  print(f"⚠️ Failed to compute from cache: {e}")
255
  return {}
256
 
 
 
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
 
260
 
@@ -316,21 +335,19 @@ def get_missing_models(models_root: Path, multimodal: bool = False) -> Tuple[Lis
316
 
317
  return missing, bags, pix_hits
318
 
319
- def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
320
  threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
321
- """Compute similarities between missing models using specified method."""
322
  if sim_method == "jaccard":
323
- return similarity_clusters({m: bags[m] for m in missing}, threshold)
324
  else:
325
- # Try to use cached embeddings first
326
  embeddings_path = Path("embeddings_cache.npz")
327
  if embeddings_path.exists():
328
- cached_sims = compute_similarities_from_cache(threshold)
329
- if cached_sims: # Cache exists and worked
330
  return cached_sims
331
-
332
- # Fallback to full computation
333
- return embedding_similarity_clusters(models_root, missing, threshold)
334
 
335
  def build_graph_json(
336
  transformers_dir: Path,
@@ -347,7 +364,7 @@ def build_graph_json(
347
  if sim_method == "embedding" and embeddings_cache.exists():
348
  try:
349
  # Try to compute from cache without accessing repo
350
- cached_sims = compute_similarities_from_cache(threshold)
351
  print(f"🔍 Got {len(cached_sims)} cached similarities")
352
 
353
  if cached_sims:
@@ -393,8 +410,13 @@ def build_graph_json(
393
  cls = "derived"
394
  nodelist.append({"id": n, "cls": cls, "sz": 1})
395
 
 
396
  print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
397
- return {"nodes": nodelist, "links": links}
 
 
 
 
398
  except Exception as e:
399
  print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
400
 
@@ -452,7 +474,11 @@ def build_graph_json(
452
  cls = "derived"
453
  nodelist.append({"id": n, "cls": cls, "sz": 1 + 2*(deg[n]/max_deg)})
454
 
455
- graph = {"nodes": nodelist, "links": links}
 
 
 
 
456
  return graph
457
 
458
 
 
87
  def _jaccard(a: Set[str], b: Set[str]) -> float:
88
  return 0.0 if (not a or not b) else len(a & b) / len(a | b)
89
 
90
+ def similarity_clusters(bags: Dict[str, List[Set[str]]], thr: float = 0.1) -> Dict[Tuple[str,str], float]:
 
91
  largest = {m: max(ts, key=len) for m, ts in bags.items() if ts}
92
  out: Dict[Tuple[str,str], float] = {}
93
  for m1, m2 in combinations(sorted(largest.keys()), 2):
 
97
  return out
98
 
99
  @spaces.GPU
100
+ def embedding_similarity_clusters(models_root: Path, missing: List[str], thr: float = 0.1) -> Dict[Tuple[str, str], float]:
101
+ model = SentenceTransformer("microsoft/codebert-base", trust_remote_code=True)
102
 
103
  try:
104
  cfg = model[0].auto_model.config
 
253
  print(f"⚠️ Failed to compute from cache: {e}")
254
  return {}
255
 
256
+ def filter_similarities_by_threshold(similarities: Dict[Tuple[str, str], float], threshold: float) -> Dict[Tuple[str, str], float]:
257
+ return {pair: score for pair, score in similarities.items() if score >= threshold}
258
 
259
+ def filter_graph_by_threshold(graph_data: dict, threshold: float) -> dict:
260
+ filtered_links = []
261
+ for link in graph_data["links"]:
262
+ if link.get("cand", False):
263
+ try:
264
+ score = float(link["label"].rstrip('%')) / 100.0
265
+ if score >= threshold:
266
+ filtered_links.append(link)
267
+ except (ValueError, AttributeError):
268
+ filtered_links.append(link)
269
+ else:
270
+ filtered_links.append(link)
271
+
272
+ return {
273
+ "nodes": graph_data["nodes"],
274
+ "links": filtered_links,
275
+ **{k: v for k, v in graph_data.items() if k not in ["nodes", "links"]}
276
+ }
277
 
278
 
279
 
 
335
 
336
  return missing, bags, pix_hits
337
 
338
+ def compute_similarities(models_root: Path, missing: List[str], bags: Dict[str, List[Set[str]]],
339
  threshold: float, sim_method: str) -> Dict[Tuple[str, str], float]:
340
+ min_threshold = 0.1
341
  if sim_method == "jaccard":
342
+ return similarity_clusters({m: bags[m] for m in missing}, min_threshold)
343
  else:
 
344
  embeddings_path = Path("embeddings_cache.npz")
345
  if embeddings_path.exists():
346
+ cached_sims = compute_similarities_from_cache(min_threshold)
347
+ if cached_sims:
348
  return cached_sims
349
+
350
+ return embedding_similarity_clusters(models_root, missing, min_threshold)
 
351
 
352
  def build_graph_json(
353
  transformers_dir: Path,
 
364
  if sim_method == "embedding" and embeddings_cache.exists():
365
  try:
366
  # Try to compute from cache without accessing repo
367
+ cached_sims = compute_similarities_from_cache(0.1)
368
  print(f"🔍 Got {len(cached_sims)} cached similarities")
369
 
370
  if cached_sims:
 
410
  cls = "derived"
411
  nodelist.append({"id": n, "cls": cls, "sz": 1})
412
 
413
+ graph = {"nodes": nodelist, "links": links}
414
  print(f"⚡ Built graph from cache: {len(nodelist)} nodes, {len(links)} links")
415
+
416
+ if threshold > 0.1:
417
+ graph = filter_graph_by_threshold(graph, threshold)
418
+
419
+ return graph
420
  except Exception as e:
421
  print(f"⚠️ Cache-only build failed: {e}, falling back to full build")
422
 
 
474
  cls = "derived"
475
  nodelist.append({"id": n, "cls": cls, "sz": 1 + 2*(deg[n]/max_deg)})
476
 
477
+ graph = {"nodes": nodelist, "links": links}
478
+
479
+ if threshold > 0.1:
480
+ graph = filter_graph_by_threshold(graph, threshold)
481
+
482
  return graph
483
 
484