Update app.py
Browse files
app.py
CHANGED
|
@@ -3,11 +3,7 @@ import gradio as gr
|
|
| 3 |
import pandas as pd
|
| 4 |
import requests
|
| 5 |
import io
|
| 6 |
-
import dask.dataframe as dd
|
| 7 |
from datasets import load_dataset, Image
|
| 8 |
-
from mlcroissant import Dataset as CroissantDataset
|
| 9 |
-
from huggingface_hub import get_token
|
| 10 |
-
import polars as pl
|
| 11 |
import warnings
|
| 12 |
import traceback
|
| 13 |
import json
|
|
@@ -16,37 +12,42 @@ import tempfile # Added for creating temporary files
|
|
| 16 |
# 🤫 Let's ignore those pesky warnings, shall we?
|
| 17 |
warnings.filterwarnings("ignore")
|
| 18 |
|
| 19 |
-
# --- ⚙️ Configuration & Constants ---
|
| 20 |
DATASET_CONFIG = {
|
| 21 |
"caselaw": {
|
| 22 |
"name": "common-pile/caselaw_access_project", "emoji": "βοΈ",
|
| 23 |
-
"methods": ["π¨ API (requests)",
|
|
|
|
| 24 |
},
|
| 25 |
"prompts": {
|
| 26 |
"name": "fka/awesome-chatgpt-prompts", "emoji": "π€",
|
| 27 |
-
"methods": ["πΌ Pandas", "π¨ API (requests)",
|
|
|
|
| 28 |
},
|
| 29 |
"finance": {
|
| 30 |
"name": "snorkelai/agent-finance-reasoning", "emoji": "π°",
|
| 31 |
-
"methods": ["πΌ Pandas", "
|
|
|
|
| 32 |
},
|
| 33 |
"medical": {
|
| 34 |
"name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "π©Ί",
|
| 35 |
-
"methods": ["πΌ Pandas",
|
|
|
|
| 36 |
},
|
| 37 |
"inscene": {
|
| 38 |
"name": "peteromallet/InScene-Dataset", "emoji": "πΌοΈ",
|
| 39 |
-
"methods": ["π€ Datasets", "
|
|
|
|
| 40 |
},
|
| 41 |
}
|
| 42 |
|
| 43 |
# --- ν¬ Helpers & Utility Functions ---
|
| 44 |
|
| 45 |
def get_auth_headers():
|
|
|
|
| 46 |
token = get_token()
|
| 47 |
return {"Authorization": f"Bearer {token}"} if token else {}
|
| 48 |
|
| 49 |
-
# --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
|
| 50 |
def dataframe_to_outputs(df: pd.DataFrame):
|
| 51 |
"""
|
| 52 |
π Takes a DataFrame and transforms it into various formats.
|
|
@@ -58,24 +59,17 @@ def dataframe_to_outputs(df: pd.DataFrame):
|
|
| 58 |
df_str = df.astype(str)
|
| 59 |
markdown_output = df_str.to_markdown(index=False)
|
| 60 |
|
| 61 |
-
# Create a temporary CSV file
|
| 62 |
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
|
| 63 |
df.to_csv(tmp_csv.name, index=False)
|
| 64 |
csv_path = tmp_csv.name
|
| 65 |
|
| 66 |
-
# Create a temporary XLSX file
|
| 67 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
|
| 68 |
df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
|
| 69 |
xlsx_path = tmp_xlsx.name
|
| 70 |
|
| 71 |
tab_delimited_output = df.to_csv(sep='\t', index=False)
|
| 72 |
|
| 73 |
-
return (
|
| 74 |
-
markdown_output,
|
| 75 |
-
csv_path,
|
| 76 |
-
xlsx_path,
|
| 77 |
-
tab_delimited_output,
|
| 78 |
-
)
|
| 79 |
|
| 80 |
def handle_error(e: Exception, request=None, response=None):
|
| 81 |
"""
|
|
@@ -131,15 +125,13 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
|
|
| 131 |
MAX_PAGES = 5
|
| 132 |
PAGE_SIZE = 100
|
| 133 |
|
| 134 |
-
if not query:
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
yield tuple(outputs)
|
| 138 |
|
| 139 |
for page in range(MAX_PAGES):
|
| 140 |
-
if query:
|
| 141 |
-
|
| 142 |
-
yield tuple(outputs)
|
| 143 |
|
| 144 |
offset = page * PAGE_SIZE
|
| 145 |
url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
|
|
@@ -155,11 +147,8 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
|
|
| 155 |
yield tuple(outputs)
|
| 156 |
break
|
| 157 |
|
| 158 |
-
# --- ✨ FIXED: JSON processing logic ---
|
| 159 |
-
# Extract the actual data from the 'row' key of each item in the list
|
| 160 |
rows_data = [item['row'] for item in data['rows']]
|
| 161 |
page_df = pd.json_normalize(rows_data)
|
| 162 |
-
|
| 163 |
found_in_page = search_dataframe(page_df, query)
|
| 164 |
|
| 165 |
if not found_in_page.empty:
|
|
@@ -167,16 +156,13 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
|
|
| 167 |
outputs[0] = all_results_df
|
| 168 |
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
|
| 169 |
outputs[2] = f"β
Found **{len(all_results_df)}** results so far..."
|
| 170 |
-
|
| 171 |
-
if dataset_key == 'inscene':
|
| 172 |
-
gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if 'image' in row and isinstance(row['image'], Image.Image)]
|
| 173 |
-
outputs[1] = gr.Gallery(gallery_data, label="πΌοΈ Image Results", height=400)
|
| 174 |
yield tuple(outputs)
|
| 175 |
|
| 176 |
outputs[2] = f"π Search complete. Found a total of **{len(all_results_df)}** results."
|
| 177 |
yield tuple(outputs)
|
| 178 |
return
|
| 179 |
|
|
|
|
| 180 |
outputs[2] = f"β³ Loading data via `{access_method}`..."
|
| 181 |
yield tuple(outputs)
|
| 182 |
|
|
@@ -185,13 +171,21 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
|
|
| 185 |
file_path = f"hf://datasets/{repo_id}/"
|
| 186 |
if repo_id == "fka/awesome-chatgpt-prompts": file_path += "prompts.csv"; df = pd.read_csv(file_path)
|
| 187 |
else:
|
| 188 |
-
try: df = pd.read_parquet(f"{file_path}
|
| 189 |
-
except:
|
| 190 |
-
|
| 191 |
-
except: df = pd.read_json(f"{file_path}medical_o1_sft.json")
|
| 192 |
elif "Datasets" in access_method:
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
outputs[2] = "π Searching loaded data..."
|
| 197 |
yield tuple(outputs)
|
|
@@ -202,9 +196,19 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
|
|
| 202 |
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
|
| 203 |
outputs[2] = f"π Search complete. Found **{len(final_df)}** results."
|
| 204 |
|
|
|
|
| 205 |
if dataset_key == 'inscene' and not final_df.empty:
|
| 206 |
-
gallery_data = [
|
| 207 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
yield tuple(outputs)
|
| 210 |
|
|
@@ -238,7 +242,6 @@ def create_dataset_tab(dataset_key: str):
|
|
| 238 |
copy_output = gr.Code(label="π Copy-Paste (Tab-Delimited)")
|
| 239 |
|
| 240 |
code_output = gr.Code(label="π» Python Code Snippet", language="python")
|
| 241 |
-
|
| 242 |
debug_log_output = gr.Code(label="π Debug Log", visible=False)
|
| 243 |
|
| 244 |
fetch_button.click(
|
|
@@ -255,8 +258,8 @@ def create_dataset_tab(dataset_key: str):
|
|
| 255 |
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
|
| 256 |
gr.Markdown("# π€ Hugging Face Dataset Explorer")
|
| 257 |
gr.Markdown(
|
| 258 |
-
"Select a dataset
|
| 259 |
-
"
|
| 260 |
)
|
| 261 |
with gr.Tabs():
|
| 262 |
for key in DATASET_CONFIG.keys():
|
|
|
|
| 3 |
import pandas as pd
|
| 4 |
import requests
|
| 5 |
import io
|
|
|
|
| 6 |
from datasets import load_dataset, Image
|
|
|
|
|
|
|
|
|
|
| 7 |
import warnings
|
| 8 |
import traceback
|
| 9 |
import json
|
|
|
|
| 12 |
# 🤫 Let's ignore those pesky warnings, shall we?
|
| 13 |
warnings.filterwarnings("ignore")
|
| 14 |
|
| 15 |
+
# --- ⚙️ Configuration & Constants (Updated per user feedback) ---
|
| 16 |
DATASET_CONFIG = {
|
| 17 |
"caselaw": {
|
| 18 |
"name": "common-pile/caselaw_access_project", "emoji": "βοΈ",
|
| 19 |
+
"methods": ["π¨ API (requests)"], # Kept only working method
|
| 20 |
+
"is_public": True,
|
| 21 |
},
|
| 22 |
"prompts": {
|
| 23 |
"name": "fka/awesome-chatgpt-prompts", "emoji": "π€",
|
| 24 |
+
"methods": ["πΌ Pandas", "π¨ API (requests)"], # Kept only working methods
|
| 25 |
+
"is_public": True,
|
| 26 |
},
|
| 27 |
"finance": {
|
| 28 |
"name": "snorkelai/agent-finance-reasoning", "emoji": "π°",
|
| 29 |
+
"methods": ["πΌ Pandas", "π¨ API (requests)"], # Kept only working methods
|
| 30 |
+
"is_public": False,
|
| 31 |
},
|
| 32 |
"medical": {
|
| 33 |
"name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "π©Ί",
|
| 34 |
+
"methods": ["πΌ Pandas"], # Kept only working method
|
| 35 |
+
"is_public": False,
|
| 36 |
},
|
| 37 |
"inscene": {
|
| 38 |
"name": "peteromallet/InScene-Dataset", "emoji": "πΌοΈ",
|
| 39 |
+
"methods": ["π€ Datasets", "πΌοΈ Datasets (with Images)"], # Refined methods for images
|
| 40 |
+
"is_public": False,
|
| 41 |
},
|
| 42 |
}
|
| 43 |
|
| 44 |
# --- ν¬ Helpers & Utility Functions ---
|
| 45 |
|
| 46 |
def get_auth_headers():
|
| 47 |
+
from huggingface_hub import get_token
|
| 48 |
token = get_token()
|
| 49 |
return {"Authorization": f"Bearer {token}"} if token else {}
|
| 50 |
|
|
|
|
| 51 |
def dataframe_to_outputs(df: pd.DataFrame):
|
| 52 |
"""
|
| 53 |
π Takes a DataFrame and transforms it into various formats.
|
|
|
|
| 59 |
df_str = df.astype(str)
|
| 60 |
markdown_output = df_str.to_markdown(index=False)
|
| 61 |
|
|
|
|
| 62 |
with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
|
| 63 |
df.to_csv(tmp_csv.name, index=False)
|
| 64 |
csv_path = tmp_csv.name
|
| 65 |
|
|
|
|
| 66 |
with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
|
| 67 |
df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
|
| 68 |
xlsx_path = tmp_xlsx.name
|
| 69 |
|
| 70 |
tab_delimited_output = df.to_csv(sep='\t', index=False)
|
| 71 |
|
| 72 |
+
return (markdown_output, csv_path, xlsx_path, tab_delimited_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
def handle_error(e: Exception, request=None, response=None):
|
| 75 |
"""
|
|
|
|
| 125 |
MAX_PAGES = 5
|
| 126 |
PAGE_SIZE = 100
|
| 127 |
|
| 128 |
+
if not query: MAX_PAGES = 1
|
| 129 |
+
outputs[2] = "β³ Fetching data from API..."
|
| 130 |
+
yield tuple(outputs)
|
|
|
|
| 131 |
|
| 132 |
for page in range(MAX_PAGES):
|
| 133 |
+
if query: outputs[2] = f"β³ Searching API page {page + 1}..."
|
| 134 |
+
yield tuple(outputs)
|
|
|
|
| 135 |
|
| 136 |
offset = page * PAGE_SIZE
|
| 137 |
url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
|
|
|
|
| 147 |
yield tuple(outputs)
|
| 148 |
break
|
| 149 |
|
|
|
|
|
|
|
| 150 |
rows_data = [item['row'] for item in data['rows']]
|
| 151 |
page_df = pd.json_normalize(rows_data)
|
|
|
|
| 152 |
found_in_page = search_dataframe(page_df, query)
|
| 153 |
|
| 154 |
if not found_in_page.empty:
|
|
|
|
| 156 |
outputs[0] = all_results_df
|
| 157 |
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
|
| 158 |
outputs[2] = f"β
Found **{len(all_results_df)}** results so far..."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
yield tuple(outputs)
|
| 160 |
|
| 161 |
outputs[2] = f"π Search complete. Found a total of **{len(all_results_df)}** results."
|
| 162 |
yield tuple(outputs)
|
| 163 |
return
|
| 164 |
|
| 165 |
+
# --- All other methods (Pandas, Datasets) ---
|
| 166 |
outputs[2] = f"β³ Loading data via `{access_method}`..."
|
| 167 |
yield tuple(outputs)
|
| 168 |
|
|
|
|
| 171 |
file_path = f"hf://datasets/{repo_id}/"
|
| 172 |
if repo_id == "fka/awesome-chatgpt-prompts": file_path += "prompts.csv"; df = pd.read_csv(file_path)
|
| 173 |
else:
|
| 174 |
+
try: df = pd.read_parquet(f"{file_path}train.parquet")
|
| 175 |
+
except: df = pd.read_json(f"{file_path}medical_o1_sft.json")
|
| 176 |
+
|
|
|
|
| 177 |
elif "Datasets" in access_method:
|
| 178 |
+
# --- ✨ NEW: Specific logic for image vs. text-only loading ---
|
| 179 |
+
if "πΌοΈ" in access_method:
|
| 180 |
+
outputs[2] = "β³ Loading dataset with full images (may take a moment)..."
|
| 181 |
+
yield tuple(outputs)
|
| 182 |
+
ds = load_dataset(repo_id, split='train[:25]') # Load small, non-streaming slice
|
| 183 |
+
df = pd.DataFrame(ds)
|
| 184 |
+
else: # "π€ Datasets"
|
| 185 |
+
outputs[2] = "β³ Loading text data via streaming..."
|
| 186 |
+
yield tuple(outputs)
|
| 187 |
+
ds = load_dataset(repo_id, split='train', streaming=True).take(500)
|
| 188 |
+
df = pd.DataFrame(ds)
|
| 189 |
|
| 190 |
outputs[2] = "π Searching loaded data..."
|
| 191 |
yield tuple(outputs)
|
|
|
|
| 196 |
outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
|
| 197 |
outputs[2] = f"π Search complete. Found **{len(final_df)}** results."
|
| 198 |
|
| 199 |
+
# --- ✨ NEW: Robust gallery population ---
|
| 200 |
if dataset_key == 'inscene' and not final_df.empty:
|
| 201 |
+
gallery_data = []
|
| 202 |
+
if 'image' in final_df.columns:
|
| 203 |
+
for _, row in final_df.iterrows():
|
| 204 |
+
# Check if the image data is a valid PIL Image object
|
| 205 |
+
if hasattr(row['image'], 'save'):
|
| 206 |
+
gallery_data.append((row['image'], row.get('text', '')))
|
| 207 |
+
|
| 208 |
+
if gallery_data:
|
| 209 |
+
outputs[1] = gr.Gallery(gallery_data, label="πΌοΈ Image Results", height=400)
|
| 210 |
+
else:
|
| 211 |
+
outputs[2] += "\n\n*Note: No images were found or loaded. Use the 'πΌοΈ Datasets (with Images)' method to see visuals.*"
|
| 212 |
|
| 213 |
yield tuple(outputs)
|
| 214 |
|
|
|
|
| 242 |
copy_output = gr.Code(label="π Copy-Paste (Tab-Delimited)")
|
| 243 |
|
| 244 |
code_output = gr.Code(label="π» Python Code Snippet", language="python")
|
|
|
|
| 245 |
debug_log_output = gr.Code(label="π Debug Log", visible=False)
|
| 246 |
|
| 247 |
fetch_button.click(
|
|
|
|
| 258 |
with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
|
| 259 |
gr.Markdown("# π€ Hugging Face Dataset Explorer")
|
| 260 |
gr.Markdown(
|
| 261 |
+
"Select a dataset and a working access method to search. "
|
| 262 |
+
"For the InScene dataset, choose the 'with Images' option to view pictures."
|
| 263 |
)
|
| 264 |
with gr.Tabs():
|
| 265 |
for key in DATASET_CONFIG.keys():
|