Spaces:

awacke1
/

Gradio-Med-Law-Fin-Scene-Claude

Sleeping

App Files Files Community

awacke1 committed on Jul 19

Commit

3537d14

verified ·

1 Parent(s): 55c99d6

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -102

app.py CHANGED Viewed

@@ -47,41 +47,26 @@ except ImportError:
     POLARS_AVAILABLE = False
 # --- ⚙️ Configuration & Constants ---
-def get_available_methods():
-    """🔧 Get available methods based on installed dependencies"""
-    base_methods = ["💨 API (requests)", "🐼 Pandas"]
-    if DATASETS_AVAILABLE:
-        base_methods.append("🤗 Datasets")
-    if POLARS_AVAILABLE:
-        base_methods.append("🧊 Polars")
-    if DASK_AVAILABLE:
-        base_methods.append("🧊 Dask")
-    if CROISSANT_AVAILABLE:
-        base_methods.append("🥐 Croissant")
-    return base_methods
 DATASET_CONFIG = {
     "caselaw": {
         "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
-        "methods": get_available_methods(), "is_public": True,
     },
     "prompts": {
         "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
-        "methods": get_available_methods(), "is_public": True,
     },
     "finance": {
         "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
-        "methods": get_available_methods(), "is_public": False,
     },
     "medical": {
         "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
-        "methods": get_available_methods(), "is_public": False,
     },
     "inscene": {
         "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
-        "methods": get_available_methods(), "is_public": False,
     },
 }
@@ -211,11 +196,13 @@ else:
     elif "Pandas" in access_method:
         file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
         return f'''# 🐼 Pandas Access for {repo_id}
 import pandas as pd
 # You may need: huggingface-cli login
-df = pd.read_{"csv" if "csv" in file_path else "parquet"}("hf://datasets/{repo_id}/{file_path}")
 # Search for: "{query}"
 if "{query}":
@@ -230,7 +217,33 @@ print(df.head())
 '''
     elif "Datasets" in access_method:
-        return f'''# 🤗 Datasets Library Access for {repo_id}
 from datasets import load_dataset
 import pandas as pd
@@ -358,65 +371,51 @@ def fetch_data(dataset_key: str, access_method: str, query: str):
         elif "Datasets" in access_method:
             if not DATASETS_AVAILABLE:
                 raise ImportError("datasets library not available. Install with: pip install datasets")
-            ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
-            df = pd.DataFrame(ds)
-        elif "Polars" in access_method:
-            if not POLARS_AVAILABLE:
-                raise ImportError("polars library not available. Install with: pip install polars")
-            outputs[2] = "⏳ Loading with Polars..."
-            yield tuple(outputs)
-            if repo_id == "fka/awesome-chatgpt-prompts":
-                pl_df = pl.read_csv(f"hf://datasets/{repo_id}/prompts.csv")
-            else:
-                pl_df = pl.read_parquet(f"hf://datasets/{repo_id}/train.parquet")
-            df = pl_df.to_pandas()
-        elif "Dask" in access_method:
-            if not DASK_AVAILABLE:
-                raise ImportError("dask library not available. Install with: pip install dask")
-            outputs[2] = "⏳ Loading with Dask..."
-            yield tuple(outputs)
-            dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
-            df = dask_df.head(1000)  # Convert to pandas for processing
-        elif "Croissant" in access_method:
-            if not CROISSANT_AVAILABLE:
-                raise ImportError("mlcroissant library not available. Install with: pip install mlcroissant")
-            outputs[2] = "⏳ Loading with Croissant..."
-            yield tuple(outputs)
-            try:
-                headers = get_auth_headers() if not config["is_public"] else {}
-                croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
-                response = requests.get(croissant_url, headers=headers)
-                response.raise_for_status()
-                jsonld = response.json()
-                # Suppress MLCroissant warnings during dataset creation
-                with warnings.catch_warnings():
-                    warnings.simplefilter("ignore")
-                    ds = CroissantDataset(jsonld=jsonld)
-                    records = list(ds.records("default"))[:1000]  # Take first 1000
-                    df = pd.DataFrame(records)
-            except Exception as croissant_error:
-                # If Croissant fails, fall back to API method
-                outputs[2] = f"⚠️ Croissant method failed, falling back to API method..."
                 yield tuple(outputs)
-                # Retry with API method
-                url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset=0&length=100"
-                headers = get_auth_headers() if not config["is_public"] else {}
-                response = requests.get(url, headers=headers)
-                response.raise_for_status()
-                data = response.json()
-                if data.get('rows'):
-                    rows_data = [item['row'] for item in data['rows']]
-                    df = pd.json_normalize(rows_data)
                 else:
-                    raise Exception("No data available from fallback API method")
         outputs[2] = "🔍 Searching loaded data..."
         yield tuple(outputs)
@@ -464,8 +463,10 @@ def create_dataset_tab(dataset_key: str):
         # Show available methods for this dataset
         available_methods = config['methods']
-        if len(available_methods) < 5:  # Some methods missing
-            gr.Markdown(f"**Available methods:** {len(available_methods)} of 6 possible methods")
         with gr.Row():
             access_method = gr.Radio(
@@ -481,7 +482,10 @@ def create_dataset_tab(dataset_key: str):
         fetch_button = gr.Button("🚀 Go Fetch!")
         status_output = gr.Markdown("🏁 Ready to search.")
         df_output = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)
-        gallery_output = gr.Gallery(visible=(dataset_key == 'inscene'), label="🖼️ Image Results")
         with gr.Accordion("📂 View/Export Full Results", open=False):
             markdown_output = gr.Markdown(label="📝 Markdown View")
@@ -512,44 +516,46 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as
         "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
     )
-    # Show dependency status
     def get_dependency_status():
-        status = "### 🔧 Available Libraries:\n"
-        status += f"- **💨 API**: ✅ Always available\n"
         status += f"- **🐼 Pandas**: ✅ Available\n"
         status += f"- **🤗 Datasets**: {'✅ Available' if DATASETS_AVAILABLE else '❌ Not installed'}\n"
-        status += f"- **🧊 Polars**: {'✅ Available' if POLARS_AVAILABLE else '❌ Not installed'}\n"
-        status += f"- **🧊 Dask**: {'✅ Available' if DASK_AVAILABLE else '❌ Not installed'}\n"
-        status += f"- **🥐 Croissant**: {'✅ Available' if CROISSANT_AVAILABLE else '❌ Not installed'}\n"
-        status += f"- **🔑 HF Authentication**: {'✅ Available' if HF_HUB_AVAILABLE else '❌ Not installed'}\n"
         return status
     with gr.Accordion("🔧 Library Status & Quick Start Guide", open=False):
         gr.Markdown(get_dependency_status())
         gr.Markdown("""
-        ### 🚀 Quick Start:
-        1. **🤖 Prompts Tab**: Try API method, search for "translator" or "linux"
-        2. **⚖️ Caselaw Tab**: Try API method, search for "contract" or "court"
-        3. **💰 Finance Tab**: Requires login, try API method first
-        4. **🩺 Medical Tab**: Requires login, try API method first
-        5. **🖼️ InScene Tab**: Requires login, try Datasets method for images
         ### 🔑 Authentication:
-        For gated datasets, run in terminal: `huggingface-cli login`
-        ### 🛠️ Methods:
         - **💨 API**: Fast, reliable, works without login (100 rows max)
         - **🐼 Pandas**: Full dataset access, requires login for gated datasets
-        - **🤗 Datasets**: Good for streaming large datasets
-        - **🧊 Polars/Dask**: Alternative fast data processing
-        - **🥐 Croissant**: Metadata-aware loading (has fallback to API)
-        ### 📦 Missing Libraries:
-        If methods are missing, install with:
-        ```bash
-        pip install datasets polars dask mlcroissant GitPython
-        ```
         """)
     with gr.Tabs():
         for key in DATASET_CONFIG.keys():

     POLARS_AVAILABLE = False
 # --- ⚙️ Configuration & Constants ---
 DATASET_CONFIG = {
     "caselaw": {
         "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
+        "methods": ["💨 API (requests)"], "is_public": True,
     },
     "prompts": {
         "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
+        "methods": ["🐼 Pandas", "💨 API (requests)"], "is_public": True,
     },
     "finance": {
         "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
+        "methods": ["🐼 Pandas", "💨 API (requests)"], "is_public": False,
     },
     "medical": {
         "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
+        "methods": ["🐼 Pandas"], "is_public": False,
     },
     "inscene": {
         "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
+        "methods": ["🤗 Datasets", "🖼️ Datasets with Images"], "is_public": False,
     },
 }
     elif "Pandas" in access_method:
         file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
+        read_function = "read_csv" if "csv" in file_path else "read_parquet"
         return f'''# 🐼 Pandas Access for {repo_id}
 import pandas as pd
 # You may need: huggingface-cli login
+df = pd.{read_function}("hf://datasets/{repo_id}/{file_path}")
 # Search for: "{query}"
 if "{query}":
 '''
     elif "Datasets" in access_method:
+        if "Images" in access_method:
+            return f'''# 🖼️ Datasets Library with Image Access for {repo_id}
+from datasets import load_dataset
+import pandas as pd
+# You may need: huggingface-cli login
+ds = load_dataset("{repo_id}", split="train", streaming=True)
+data = list(ds.take(50))  # Smaller sample for images
+df = pd.DataFrame(data)
+# Process images
+images = []
+for item in data:
+    if 'image' in item and item['image'] is not None:
+        images.append((item['image'], item.get('text', '')))
+print(f"Found {{len(df)}} records with {{len(images)}} images")
+print(df.head())
+# Display first image
+if images:
+    first_image, caption = images[0]
+    first_image.show()  # If PIL Image
+    print(f"Caption: {{caption}}")
+'''
+        else:
+            return f'''# 🤗 Datasets Library Access for {repo_id}
 from datasets import load_dataset
 import pandas as pd
         elif "Datasets" in access_method:
             if not DATASETS_AVAILABLE:
                 raise ImportError("datasets library not available. Install with: pip install datasets")
+            # Special handling for image datasets
+            if dataset_key == 'inscene' and "Images" in access_method:
+                outputs[2] = "🖼️ Loading InScene dataset with image processing..."
                 yield tuple(outputs)
+                # Load with image processing
+                ds = load_dataset(repo_id, split='train', streaming=True)
+                data_list = list(ds.take(50))  # Smaller sample for images
+                df = pd.DataFrame(data_list)
+                # Process images for gallery display
+                gallery_data = []
+                for i, item in enumerate(data_list):
+                    try:
+                        if 'image' in item and item['image'] is not None:
+                            image = item['image']
+                            caption = item.get('text', f'Image {i+1}')
+                            # Keep PIL-style images (objects with .save) and string paths/URLs as-is
+                            if hasattr(image, 'save'):
+                                gallery_data.append((image, caption))
+                            elif isinstance(image, str):
+                                gallery_data.append((image, caption))
+                        # Limit to first 20 images for performance
+                        if len(gallery_data) >= 20:
+                            break
+                    except Exception as img_error:
+                        continue
+                # Update gallery with images
+                if gallery_data:
+                    outputs[1] = gr.Gallery(gallery_data, label=f"🖼️ Found {len(gallery_data)} Images", height=400, columns=4, rows=2)
+                    outputs[2] = f"🖼️ Loaded {len(df)} records with {len(gallery_data)} images"
                 else:
+                    outputs[2] = "🖼️ Loaded data but no images found to display"
+            else:
+                # Regular datasets loading
+                ds = load_dataset(repo_id, split='train', streaming=True)
+                data_list = list(ds.take(1000))
+                df = pd.DataFrame(data_list)
+                outputs[2] = f"📚 Loaded {len(df)} records via Datasets library"
         outputs[2] = "🔍 Searching loaded data..."
         yield tuple(outputs)
         # Show available methods for this dataset
         available_methods = config['methods']
+        methods_note = f"**Available methods:** {len(available_methods)} tested and working methods"
+        if dataset_key == 'inscene':
+            methods_note += " (🖼️ = Image viewer included)"
+        gr.Markdown(methods_note)
         with gr.Row():
             access_method = gr.Radio(
         fetch_button = gr.Button("🚀 Go Fetch!")
         status_output = gr.Markdown("🏁 Ready to search.")
         df_output = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)
+        # Show the gallery only for the InScene dataset
+        show_gallery = (dataset_key == 'inscene')
+        gallery_output = gr.Gallery(visible=show_gallery, label="🖼️ Image Results", height=400, columns=4, rows=2)
         with gr.Accordion("📂 View/Export Full Results", open=False):
             markdown_output = gr.Markdown(label="📝 Markdown View")
         "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
     )
+    # Show dependency status and dataset-specific methods
     def get_dependency_status():
+        status = "### 🔧 Dataset-Specific Methods (Only Working Methods Shown):\n"
+        for key, config in DATASET_CONFIG.items():
+            methods_str = ", ".join(config['methods'])
+            auth_status = "🔐 Requires Auth" if not config['is_public'] else "✅ Public"
+            status += f"- **{config['emoji']} {key.capitalize()}**: {methods_str} ({auth_status})\n"
+        status += "\n### 📚 Library Dependencies:\n"
         status += f"- **🐼 Pandas**: ✅ Available\n"
+        status += f"- **💨 Requests**: ✅ Available\n"
         status += f"- **🤗 Datasets**: {'✅ Available' if DATASETS_AVAILABLE else '❌ Not installed'}\n"
         return status
     with gr.Accordion("🔧 Library Status & Quick Start Guide", open=False):
         gr.Markdown(get_dependency_status())
         gr.Markdown("""
+        ### 🚀 Quick Start Guide:
+        1. **🤖 Prompts**: Try Pandas or API method, search for "translator", "linux", or "writer"
+        2. **⚖️ Caselaw**: Try API method only, search for "contract", "court", or "appeal"
+        3. **💰 Finance**: Try Pandas or API method (requires auth), search for "interest" or "market"
+        4. **🩺 Medical**: Try Pandas method only (requires auth), search for "diagnosis" or "treatment"
+        5. **🖼️ InScene**: Try "🖼️ Datasets with Images" to see actual images, search for "kitchen" or "outdoor"
         ### 🔑 Authentication:
+        For gated datasets (Finance, Medical, InScene), run: `huggingface-cli login`
+        ### 🛠️ Method Explanations:
         - **💨 API**: Fast, reliable, works without login (100 rows max)
         - **🐼 Pandas**: Full dataset access, requires login for gated datasets
+        - **🤗 Datasets**: Standard HuggingFace datasets library
+        - **🖼️ Datasets with Images**: Special image viewer for InScene dataset
+        ### ⚠️ Note:
+        Only working methods are shown for each dataset. Non-functional methods have been removed.
         """)
+        if not DATASETS_AVAILABLE:
+            gr.Markdown("**⚠️ Install datasets library for image viewing:** `pip install datasets`")
     with gr.Tabs():
         for key in DATASET_CONFIG.keys():