Spaces:

awacke1
/

Gradio-Med-Law-Fin-Scene-Gemini

Sleeping

App Files Files Community

awacke1 committed on Jul 19

Commit

4a6b54a

verified ·

1 Parent(s): d1eb676

Update app.py

Browse files

Files changed (1) hide show

app.py +47 -44

app.py CHANGED Viewed

@@ -3,11 +3,7 @@ import gradio as gr
 import pandas as pd
 import requests
 import io
-import dask.dataframe as dd
 from datasets import load_dataset, Image
-from mlcroissant import Dataset as CroissantDataset
-from huggingface_hub import get_token
-import polars as pl
 import warnings
 import traceback
 import json
@@ -16,37 +12,42 @@ import tempfile # Added for creating temporary files
 # 🤫 Let's ignore those pesky warnings, shall we?
 warnings.filterwarnings("ignore")
-# --- ⚙️ Configuration & Constants ---
 DATASET_CONFIG = {
     "caselaw": {
         "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
-        "methods": ["💨 API (requests)", "🧊 Dask", "🥐 Croissant"], "is_public": True,
     },
     "prompts": {
         "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
-        "methods": ["🐼 Pandas", "💨 API (requests)", "🥐 Croissant"], "is_public": True,
     },
     "finance": {
         "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
-        "methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
     },
     "medical": {
         "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
-        "methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
     },
     "inscene": {
         "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
-        "methods": ["🤗 Datasets", "🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
     },
 }
 # --- Helpers & Utility Functions ---
 def get_auth_headers():
     token = get_token()
     return {"Authorization": f"Bearer {token}"} if token else {}
-# --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
 def dataframe_to_outputs(df: pd.DataFrame):
     """
     📜 Takes a DataFrame and transforms it into various formats.
@@ -58,24 +59,17 @@ def dataframe_to_outputs(df: pd.DataFrame):
     df_str = df.astype(str)
     markdown_output = df_str.to_markdown(index=False)
-    # Create a temporary CSV file
     with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
         df.to_csv(tmp_csv.name, index=False)
         csv_path = tmp_csv.name
-    # Create a temporary XLSX file
     with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
         df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
         xlsx_path = tmp_xlsx.name
     tab_delimited_output = df.to_csv(sep='\t', index=False)
-    return (
-        markdown_output,
-        csv_path,
-        xlsx_path,
-        tab_delimited_output,
-    )
 def handle_error(e: Exception, request=None, response=None):
     """
@@ -131,15 +125,13 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
             MAX_PAGES = 5
             PAGE_SIZE = 100
-            if not query:
-                MAX_PAGES = 1
-                outputs[2] = "⏳ No search term. Fetching first 100 records as a sample..."
-                yield tuple(outputs)
             for page in range(MAX_PAGES):
-                if query:
-                    outputs[2] = f"⏳ Searching page {page + 1}..."
-                    yield tuple(outputs)
                 offset = page * PAGE_SIZE
                 url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
@@ -155,11 +147,8 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
                     yield tuple(outputs)
                     break
-                # --- ✨ FIXED: JSON processing logic ---
-                # Extract the actual data from the 'row' key of each item in the list
                 rows_data = [item['row'] for item in data['rows']]
                 page_df = pd.json_normalize(rows_data)
                 found_in_page = search_dataframe(page_df, query)
                 if not found_in_page.empty:
@@ -167,16 +156,13 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
                     outputs[0] = all_results_df
                     outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
                     outputs[2] = f"✅ Found **{len(all_results_df)}** results so far..."
-                    if dataset_key == 'inscene':
-                        gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if 'image' in row and isinstance(row['image'], Image.Image)]
-                        outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
                     yield tuple(outputs)
             outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
             yield tuple(outputs)
             return
         outputs[2] = f"⏳ Loading data via `{access_method}`..."
         yield tuple(outputs)
@@ -185,13 +171,21 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
             file_path = f"hf://datasets/{repo_id}/"
             if repo_id == "fka/awesome-chatgpt-prompts": file_path += "prompts.csv"; df = pd.read_csv(file_path)
             else:
-                try: df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet")
-                except:
-                     try: df = pd.read_parquet(f"{file_path}train.parquet")
-                     except: df = pd.read_json(f"{file_path}medical_o1_sft.json")
         elif "Datasets" in access_method:
-            ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
-            df = pd.DataFrame(ds)
         outputs[2] = "🔍 Searching loaded data..."
         yield tuple(outputs)
@@ -202,9 +196,19 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
         outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
         outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
         if dataset_key == 'inscene' and not final_df.empty:
-            gallery_data = [(row['image'], row.get('text', '')) for _, row in final_df.iterrows() if 'image' in row and isinstance(row.get('image'), Image.Image)]
-            outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
         yield tuple(outputs)
@@ -238,7 +242,6 @@ def create_dataset_tab(dataset_key: str):
             copy_output = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")
         code_output = gr.Code(label="💻 Python Code Snippet", language="python")
         debug_log_output = gr.Code(label="🐞 Debug Log", visible=False)
         fetch_button.click(
@@ -255,8 +258,8 @@ def create_dataset_tab(dataset_key: str):
 with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
     gr.Markdown("# 🤗 Hugging Face Dataset Explorer")
     gr.Markdown(
-        "Select a dataset, choose an access method, and type a query. "
-        "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
     )
     with gr.Tabs():
         for key in DATASET_CONFIG.keys():

 import pandas as pd
 import requests
 import io
 from datasets import load_dataset, Image
 import warnings
 import traceback
 import json
 # 🤫 Let's ignore those pesky warnings, shall we?
 warnings.filterwarnings("ignore")
+# --- ⚙️ Configuration & Constants (Updated per user feedback) ---
 DATASET_CONFIG = {
     "caselaw": {
         "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
+        "methods": ["💨 API (requests)"],  # Kept only working method
+        "is_public": True,
     },
     "prompts": {
         "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
+        "methods": ["🐼 Pandas", "💨 API (requests)"], # Kept only working methods
+        "is_public": True,
     },
     "finance": {
         "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
+        "methods": ["🐼 Pandas", "💨 API (requests)"], # Kept only working methods
+        "is_public": False,
     },
     "medical": {
         "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
+        "methods": ["🐼 Pandas"], # Kept only working method
+        "is_public": False,
     },
     "inscene": {
         "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
+        "methods": ["🤗 Datasets", "🖼️ Datasets (with Images)"], # Refined methods for images
+        "is_public": False,
     },
 }
 # --- Helpers & Utility Functions ---
 def get_auth_headers():
+    from huggingface_hub import get_token
     token = get_token()
     return {"Authorization": f"Bearer {token}"} if token else {}
 def dataframe_to_outputs(df: pd.DataFrame):
     """
     📜 Takes a DataFrame and transforms it into various formats.
     df_str = df.astype(str)
     markdown_output = df_str.to_markdown(index=False)
     with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
         df.to_csv(tmp_csv.name, index=False)
         csv_path = tmp_csv.name
     with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
         df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
         xlsx_path = tmp_xlsx.name
     tab_delimited_output = df.to_csv(sep='\t', index=False)
+    return (markdown_output, csv_path, xlsx_path, tab_delimited_output)
 def handle_error(e: Exception, request=None, response=None):
     """
             MAX_PAGES = 5
             PAGE_SIZE = 100
+            if not query: MAX_PAGES = 1
+            outputs[2] = "⏳ Fetching data from API..."
+            yield tuple(outputs)
             for page in range(MAX_PAGES):
+                if query: outputs[2] = f"⏳ Searching API page {page + 1}..."
+                yield tuple(outputs)
                 offset = page * PAGE_SIZE
                 url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
                     yield tuple(outputs)
                     break
                 rows_data = [item['row'] for item in data['rows']]
                 page_df = pd.json_normalize(rows_data)
                 found_in_page = search_dataframe(page_df, query)
                 if not found_in_page.empty:
                     outputs[0] = all_results_df
                     outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
                     outputs[2] = f"✅ Found **{len(all_results_df)}** results so far..."
                     yield tuple(outputs)
             outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
             yield tuple(outputs)
             return
+        # --- All other methods (Pandas, Datasets) ---
         outputs[2] = f"⏳ Loading data via `{access_method}`..."
         yield tuple(outputs)
             file_path = f"hf://datasets/{repo_id}/"
             if repo_id == "fka/awesome-chatgpt-prompts": file_path += "prompts.csv"; df = pd.read_csv(file_path)
             else:
+                try: df = pd.read_parquet(f"{file_path}train.parquet")
+                except: df = pd.read_json(f"{file_path}medical_o1_sft.json")
         elif "Datasets" in access_method:
+            # --- ✨ NEW: Specific logic for image vs. text-only loading ---
+            if "🖼️" in access_method:
+                outputs[2] = "⏳ Loading dataset with full images (may take a moment)..."
+                yield tuple(outputs)
+                ds = load_dataset(repo_id, split='train[:25]') # Load small, non-streaming slice
+                df = pd.DataFrame(ds)
+            else: # "🤗 Datasets"
+                outputs[2] = "⏳ Loading text data via streaming..."
+                yield tuple(outputs)
+                ds = load_dataset(repo_id, split='train', streaming=True).take(500)
+                df = pd.DataFrame(ds)
         outputs[2] = "🔍 Searching loaded data..."
         yield tuple(outputs)
         outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
         outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
+        # --- ✨ NEW: Robust gallery population ---
         if dataset_key == 'inscene' and not final_df.empty:
+            gallery_data = []
+            if 'image' in final_df.columns:
+                for _, row in final_df.iterrows():
+                    # Check if the image data is a valid PIL Image object
+                    if hasattr(row['image'], 'save'):
+                        gallery_data.append((row['image'], row.get('text', '')))
+            if gallery_data:
+                outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
+            else:
+                outputs[2] += "\n\n*Note: No images were found or loaded. Use the '🖼️ Datasets (with Images)' method to see visuals.*"
         yield tuple(outputs)
             copy_output = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")
         code_output = gr.Code(label="💻 Python Code Snippet", language="python")
         debug_log_output = gr.Code(label="🐞 Debug Log", visible=False)
         fetch_button.click(
 with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
     gr.Markdown("# 🤗 Hugging Face Dataset Explorer")
     gr.Markdown(
+        "Select a dataset and a working access method to search. "
+        "For the InScene dataset, choose the 'with Images' option to view pictures."
     )
     with gr.Tabs():
         for key in DATASET_CONFIG.keys():