Spaces:
Paused
Paused
| import streamlit as st | |
| import pandas as pd | |
| import os | |
| from huggingface_hub import HfApi, list_datasets | |
| from datasets import load_dataset | |
def search_huggingface_datasets(query, limit=20):
    """
    Search for datasets on Hugging Face Hub.

    Args:
        query: Search query string
        limit: Maximum number of results to return

    Returns:
        List of dataset metadata dicts with the keys 'id', 'name',
        'description', 'author', 'tags' and 'downloads'. An empty list is
        returned when the search fails; the error is reported via st.error.
    """
    try:
        # `filter=` restricts results by Hub tags; free-text matching of the
        # user's query is done with `search=`.
        datasets = list_datasets(search=query, limit=limit)

        # Convert to list of dicts with relevant info. The listing is
        # consumed inside the try block so failures while fetching results
        # are reported the same way as failures starting the search.
        return [
            {
                'id': dataset.id,
                'name': dataset.id.split('/')[-1],
                # Not every huggingface_hub version exposes `description`
                # on listed datasets, so read it defensively.
                'description': getattr(dataset, 'description', None) or "No description available",
                'author': dataset.author or "Unknown",
                'tags': getattr(dataset, 'tags', None) or [],
                'downloads': dataset.downloads,
            }
            for dataset in datasets
        ]
    except Exception as e:
        st.error(f"Error searching Hugging Face Hub: {str(e)}")
        return []
def load_huggingface_dataset(dataset_id, split='train'):
    """
    Fetch one split of a Hugging Face Hub dataset as a DataFrame.

    Args:
        dataset_id: ID of the dataset on HF Hub (e.g., 'mnist', 'glue', etc.)
        split: Dataset split to load (e.g., 'train', 'test', 'validation')

    Returns:
        Pandas DataFrame containing the requested split

    Raises:
        Exception: any error raised while loading or converting the dataset
            is shown via st.error and then re-raised unchanged.
    """
    try:
        # Load the split and convert it to pandas in one step.
        return load_dataset(dataset_id, split=split).to_pandas()
    except Exception as exc:
        st.error(f"Error loading dataset '{dataset_id}': {exc!s}")
        raise
def upload_to_huggingface(dataset, dataset_name, token=None):
    """
    Upload a dataset to Hugging Face Hub.

    Args:
        dataset: Pandas DataFrame to upload
        dataset_name: Name for the dataset
        token: Hugging Face API token (optional, will use the HF_TOKEN
            environment variable if not provided)

    Returns:
        URL to the uploaded dataset

    Raises:
        ValueError: if no token is passed and HF_TOKEN is not set.
        Exception: any error raised while converting or pushing the dataset
            is shown via st.error and then re-raised unchanged.
    """
    # Get token from environment if not provided
    if token is None:
        token = os.getenv("HF_TOKEN")
    if not token:
        raise ValueError("No Hugging Face token provided. Set the HF_TOKEN environment variable or pass a token.")

    try:
        # Convert to HF dataset
        from datasets import Dataset
        hf_dataset = Dataset.from_pandas(dataset)
        # Upload to HF Hub
        push_result = hf_dataset.push_to_hub(
            dataset_name,
            token=token
        )
    except Exception as e:
        st.error(f"Error uploading to Hugging Face Hub: {str(e)}")
        raise

    # push_to_hub returns a CommitInfo (or None on older `datasets`
    # releases); neither has a `repo_id` attribute. The resolved repo id
    # lives on CommitInfo.repo_url, so look it up defensively and fall back
    # to the name the caller passed in rather than failing after a
    # successful upload.
    repo_url = getattr(push_result, "repo_url", None)
    repo_id = (
        getattr(push_result, "repo_id", None)
        or getattr(repo_url, "repo_id", None)
        or dataset_name
    )
    return f"https://huggingface.co/datasets/{repo_id}"