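"""
Collect dataset cards from the Hugging Face Hub, save each README locally,
and classify every card as "rich" or "minimal" using simple length/metadata
heuristics. Summary rows are written to all_minimal_dataset_cards.csv and
all_rich_dataset_cards.csv.
"""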
from huggingface_hub import list_datasets, DatasetCard
import re
import pandas as pd
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor, as_completed
from requests.exceptions import HTTPError

# ---------- Retry helper ----------
# def retry_load_card(dataset_id, retries=5, base_wait=60):
#     """
#     Try to load a dataset card, retrying if a 429 (rate limit) occurs.
#     Uses the Retry-After header if available, otherwise exponential backoff.
#     """
#     for attempt in range(retries):
#         try:
#             return DatasetCard.load(dataset_id)
#         except HTTPError as e:
#             if e.response is not None and e.response.status_code == 429:
#                 wait_time = e.response.headers.get("Retry-After")
#                 if wait_time is not None:
#                     wait_time = int(wait_time)
#                 else:
#                     wait_time = base_wait * (2 ** attempt) + random.randint(0, 10)
#                 print(f"[429] Rate limit hit for {dataset_id}. Sleeping {wait_time}s (attempt {attempt+1}/{retries})...")
#                 time.sleep(wait_time)
#                 continue
#             else:
#                 raise  # don't retry other HTTP errors
#         except Exception as e:
#             print(f"[ERROR] {dataset_id}: {e}")
#             raise
#     raise RuntimeError(f"Failed to load {dataset_id} after {retries} retries.")
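# Note: retries are currently disabled; to re-enable them, uncomment
# retry_load_card above and call it in process_dataset in place of
# DatasetCard.load(ds.id).
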
# ---------- Heuristic functions with reasons ----------
def check_card_quality(card_text, metadata, dataset_url):
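    """
    Classify a dataset card as "rich" or "minimal": cards with no YAML metadata,
    or with a description shorter than 200 characters, are treated as minimal.
    Returns (category, reasons, word_count).
    """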
    reasons = []
    length = len(card_text)
    word_count = len(card_text.split())
    if metadata is None or len(metadata) == 0:
        # Debug output for cards that have no YAML metadata at all
        print(length, word_count, dataset_url)
        if length < 200:
            reasons.append("No metadata and no description")
            return "minimal", reasons, word_count
        else:
            reasons.append("No metadata but has description")
            return "minimal", reasons, word_count
    else:
        if length < 200:
            reasons.append(f"Short description (char count={length}, words={word_count})")
            return "minimal", reasons, word_count
        else:
            return "rich", reasons, word_count

# ---------- Worker function for one dataset ----------
def process_dataset(ds, save_dir):
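    """
    Load the card for a single dataset, save its README under save_dir, and
    return a summary row (ids, download count, selected metadata fields, and
    the rich/minimal classification). On failure, a placeholder row marked
    "minimal" is returned with the error recorded in "reason".
    """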
    try:
        card = DatasetCard.load(ds.id)
        card_text = card.text or ""
        metadata = card.data.to_dict() if card.data else {}
        dataset_url = f"https://huggingface.co/datasets/{ds.id}"
        # Save README locally
        readme_path = os.path.join(save_dir, f"{ds.id.replace('/', '__')}_README.md")
        with open(readme_path, "w", encoding="utf-8") as f:
            f.write(card_text)
        category, reasons, word_count = check_card_quality(card_text, metadata, dataset_url)
        row = {
            "dataset_id": ds.id,
            "dataset_url": dataset_url,
            "downloads": getattr(ds, "downloads", None),
            "author": metadata.get("author", None),
            "license": metadata.get("license", None),
            "tags": ", ".join(metadata.get("tags", [])) if metadata.get("tags") else None,
            "task_categories": ", ".join(metadata.get("task_categories", [])) if metadata.get("task_categories") else None,
            # Attribute name differs across huggingface_hub versions
            "last_modified": getattr(ds, "last_modified", getattr(ds, "lastModified", None)),
            "reason": "; ".join(reasons),
            "readme_path": readme_path,
            "word_count": word_count,
            "category": category,
        }
        return row
    except Exception as e:
        return {
            "dataset_id": ds.id,
            "dataset_url": f"https://huggingface.co/datasets/{ds.id}",
            "downloads": getattr(ds, "downloads", None),
            "author": None,
            "license": None,
            "tags": None,
            "task_categories": None,
            "last_modified": None,
            "reason": f"Failed to load card: {e}",
            "readme_path": None,
            "word_count": 0,
            "category": "minimal",
        }

# ---------- Main ----------
def collect_dataset_ids(limit=100, save_dir="dataset_readmes", max_workers=16):
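    """
    Fetch up to `limit` datasets from the Hub, process their cards in parallel,
    and return (minimal_results, rich_results) as lists of summary rows.
    """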
    minimal_results = []
    rich_results = []
    os.makedirs(save_dir, exist_ok=True)
    print(f"Fetching up to {limit} datasets...")
    datasets = list_datasets(limit=limit)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_dataset, ds, save_dir) for ds in datasets]
        for f in as_completed(futures):
            row = f.result()
            if row["category"] == "minimal":
                minimal_results.append(row)
            else:
                rich_results.append(row)
    return minimal_results, rich_results

if __name__ == "__main__":
    minimal, rich = collect_dataset_ids(limit=100, max_workers=16)
    # Save separate CSV files
    if minimal:
        pd.DataFrame(minimal).to_csv("all_minimal_dataset_cards.csv", index=False)
    if rich:
        pd.DataFrame(rich).to_csv("all_rich_dataset_cards.csv", index=False)
    print("\nSaved results to:")
    if minimal:
        print(" - all_minimal_dataset_cards.csv")
    if rich:
        print(" - all_rich_dataset_cards.csv")
    print(" - README files in ./dataset_readmes/")
    print("\nSummary:")
    print(f"Minimal: {len(minimal)}")
    print(f"Rich: {len(rich)}")