Spaces:

awacke1
/

Gradio-Med-Law-Fin-Scene-Claude

Sleeping

App Files Files Community

awacke1 committed on Jul 19

Commit

e293bcf

verified ·

1 Parent(s): 8e8efbf

Create app.py.v1

Browse files

Files changed (1) hide show

app.py.v1 +413 -0

app.py.v1 ADDED Viewed

	@@ -0,0 +1,413 @@

+# app.py
+import gradio as gr
+import pandas as pd
+import requests
+import io
+import dask.dataframe as dd
+from datasets import load_dataset, Image
+from mlcroissant import Dataset as CroissantDataset
+from huggingface_hub import get_token
+import polars as pl
+import warnings
+import traceback
+import json
+import tempfile # Added for creating temporary files
+# 🤫 Let's ignore those pesky warnings, shall we?
+warnings.filterwarnings("ignore")
+# --- ⚙️ Configuration & Constants ---
+DATASET_CONFIG = {
+    "caselaw": {
+        "name": "common-pile/caselaw_access_project", "emoji": "⚖️",
+        "methods": ["💨 API (requests)", "🧊 Dask", "🥐 Croissant"], "is_public": True,
+    },
+    "prompts": {
+        "name": "fka/awesome-chatgpt-prompts", "emoji": "🤖",
+        "methods": ["🐼 Pandas", "💨 API (requests)", "🥐 Croissant"], "is_public": True,
+    },
+    "finance": {
+        "name": "snorkelai/agent-finance-reasoning", "emoji": "💰",
+        "methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
+    },
+    "medical": {
+        "name": "FreedomIntelligence/medical-o1-reasoning-SFT", "emoji": "🩺",
+        "methods": ["🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
+    },
+    "inscene": {
+        "name": "peteromallet/InScene-Dataset", "emoji": "🖼️",
+        "methods": ["🤗 Datasets", "🐼 Pandas", "🧊 Polars", "💨 API (requests)", "🥐 Croissant"], "is_public": False,
+    },
+}
+# --- 🔧 Helpers & Utility Functions ---
+def get_auth_headers():
+    token = get_token()
+    return {"Authorization": f"Bearer {token}"} if token else {}
+# --- ✨ FIXED: dataframe_to_outputs to use temporary files ---
+def dataframe_to_outputs(df: pd.DataFrame):
+    """
+    📜 Takes a DataFrame and transforms it into various formats.
+    Now uses temporary files for maximum Gradio compatibility.
+    """
+    if df.empty:
+        return "No results found. 🤷", None, None, "No results to copy."
+    df_str = df.astype(str)
+    markdown_output = df_str.to_markdown(index=False)
+    # Create a temporary CSV file
+    with tempfile.NamedTemporaryFile(mode='w+', delete=False, suffix='.csv', encoding='utf-8') as tmp_csv:
+        df.to_csv(tmp_csv.name, index=False)
+        csv_path = tmp_csv.name
+    # Create a temporary XLSX file
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.xlsx') as tmp_xlsx:
+        df.to_excel(tmp_xlsx.name, index=False, engine='openpyxl')
+        xlsx_path = tmp_xlsx.name
+    tab_delimited_output = df.to_csv(sep='\t', index=False)
+    return (
+        markdown_output,
+        csv_path,
+        xlsx_path,
+        tab_delimited_output,
+    )
+def handle_error(e: Exception, request=None, response=None):
+    """
+    😱 Oh no! An error! This function now creates a detailed debug log.
+    """
+    error_message = f"🚨 An error occurred: {str(e)}\n"
+    auth_tip = "🔑 For gated datasets, did you log in? Try `huggingface-cli login` in your terminal."
+    full_trace = traceback.format_exc()
+    print(full_trace)
+    if "401" in str(e) or "Gated" in str(e):
+        error_message += auth_tip
+    debug_log = f"""--- 🐞 DEBUG LOG ---\nTraceback:\n{full_trace}\n\nException Type: {type(e).__name__}\nException Details: {e}\n"""
+    if request:
+        debug_log += f"""\n--- REQUEST ---\nMethod: {request.method}\nURL: {request.url}\nHeaders: {json.dumps(dict(request.headers), indent=2)}\n"""
+    if response is not None:
+        try:
+            response_text = json.dumps(response.json(), indent=2)
+        except json.JSONDecodeError:
+            response_text = response.text
+        debug_log += f"""\n--- RESPONSE ---\nStatus Code: {response.status_code}\nHeaders: {json.dumps(dict(response.headers), indent=2)}\nContent:\n{response_text}\n"""
+    return (
+        pd.DataFrame(), gr.Gallery(None), f"### 🚨 Error\nAn error occurred. See the debug log below for details.",
+        "", None, None, "", f"```python\n# 🚨 Error during execution:\n# {e}\n```",
+        gr.Code(value=debug_log, visible=True)
+    )
+def search_dataframe(df: pd.DataFrame, query: str):
+    if not query:
+        return df.head(100)
+    string_cols = df.select_dtypes(include=['object', 'string']).columns
+    if string_cols.empty:
+        return pd.DataFrame()
+    mask = pd.Series([False] * len(df))
+    for col in string_cols:
+        mask |= df[col].astype(str).str.contains(query, case=False, na=False)
+    return df[mask]
+def generate_code_snippet(dataset_key: str, access_method: str, query: str):
+    """
+    💻 Generate Python code snippet for the current operation
+    """
+    config = DATASET_CONFIG[dataset_key]
+    repo_id = config["name"]
+    if "API" in access_method:
+        return f'''# 🌐 API Access for {repo_id}
+import requests
+import pandas as pd
+url = "https://datasets-server.huggingface.co/rows"
+params = {{
+    "dataset": "{repo_id}",
+    "config": "default",
+    "split": "train",
+    "offset": 0,
+    "length": 100
+}}
+headers = {{"Authorization": "Bearer YOUR_HF_TOKEN"}} if needed else {{}}
+response = requests.get(url, params=params, headers=headers)
+if response.status_code == 200:
+    data = response.json()
+    rows_data = [item['row'] for item in data['rows']]
+    df = pd.json_normalize(rows_data)
+    # Search for: "{query}"
+    if "{query}":
+        string_cols = df.select_dtypes(include=['object', 'string']).columns
+        mask = pd.Series([False] * len(df))
+        for col in string_cols:
+            mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
+        df = df[mask]
+    print(f"Found {{len(df)}} results")
+    print(df.head())
+else:
+    print(f"Error: {{response.status_code}} - {{response.text}}")
+'''
+    elif "Pandas" in access_method:
+        file_path = "prompts.csv" if repo_id == "fka/awesome-chatgpt-prompts" else "train.parquet"
+        return f'''# 🐼 Pandas Access for {repo_id}
+import pandas as pd
+# You may need: huggingface-cli login
+df = pd.read_{"csv" if "csv" in file_path else "parquet"}("hf://datasets/{repo_id}/{file_path}")
+# Search for: "{query}"
+if "{query}":
+    string_cols = df.select_dtypes(include=['object', 'string']).columns
+    mask = pd.Series([False] * len(df))
+    for col in string_cols:
+        mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
+    df = df[mask]
+print(f"Found {{len(df)}} results")
+print(df.head())
+'''
+    elif "Datasets" in access_method:
+        return f'''# 🤗 Datasets Library Access for {repo_id}
+from datasets import load_dataset
+import pandas as pd
+# You may need: huggingface-cli login
+ds = load_dataset("{repo_id}", split="train", streaming=True)
+data = list(ds.take(1000))
+df = pd.DataFrame(data)
+# Search for: "{query}"
+if "{query}":
+    string_cols = df.select_dtypes(include=['object', 'string']).columns
+    mask = pd.Series([False] * len(df))
+    for col in string_cols:
+        mask |= df[col].astype(str).str.contains("{query}", case=False, na=False)
+    df = df[mask]
+print(f"Found {{len(df)}} results")
+print(df.head())
+'''
+    else:
+        return f"# Code generation for {access_method} not implemented yet"
+# --- 🎣 Data Fetching & Processing Functions ---
+def fetch_data(dataset_key: str, access_method: str, query: str):
+    """
+    🚀 Main mission control. Always yields a tuple of 9 values to match the UI components.
+    """
+    outputs = [pd.DataFrame(), None, "🏁 Ready.", "", None, None, "", "", gr.Code(visible=False)]
+    req, res = None, None
+    try:
+        config = DATASET_CONFIG[dataset_key]
+        repo_id = config["name"]
+        # Generate code snippet
+        code_snippet = generate_code_snippet(dataset_key, access_method, query)
+        outputs[7] = code_snippet
+        if "API" in access_method:
+            all_results_df = pd.DataFrame()
+            MAX_PAGES = 5
+            PAGE_SIZE = 100
+            if not query:
+                MAX_PAGES = 1
+                outputs[2] = "⏳ No search term. Fetching first 100 records as a sample..."
+                yield tuple(outputs)
+            for page in range(MAX_PAGES):
+                if query:
+                    outputs[2] = f"⏳ Searching page {page + 1}..."
+                    yield tuple(outputs)
+                offset = page * PAGE_SIZE
+                url = f"https://datasets-server.huggingface.co/rows?dataset={repo_id}&config=default&split=train&offset={offset}&length={PAGE_SIZE}"
+                headers = get_auth_headers() if not config["is_public"] else {}
+                res = requests.get(url, headers=headers)
+                req = res.request
+                res.raise_for_status()
+                data = res.json()
+                if not data.get('rows'):
+                    outputs[2] = "🏁 No more data to search."
+                    yield tuple(outputs)
+                    break
+                # --- ✨ FIXED: JSON processing logic ---
+                # Extract the actual data from the 'row' key of each item in the list
+                rows_data = [item['row'] for item in data['rows']]
+                page_df = pd.json_normalize(rows_data)
+                found_in_page = search_dataframe(page_df, query)
+                if not found_in_page.empty:
+                    all_results_df = pd.concat([all_results_df, found_in_page]).reset_index(drop=True)
+                    outputs[0] = all_results_df
+                    outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(all_results_df)
+                    outputs[2] = f"✅ Found **{len(all_results_df)}** results so far..."
+                    if dataset_key == 'inscene':
+                        gallery_data = [(row['image'], row.get('text', '')) for _, row in all_results_df.iterrows() if 'image' in row and isinstance(row['image'], Image.Image)]
+                        outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
+                    yield tuple(outputs)
+            outputs[2] = f"🏁 Search complete. Found a total of **{len(all_results_df)}** results."
+            yield tuple(outputs)
+            return
+        outputs[2] = f"⏳ Loading data via `{access_method}`..."
+        yield tuple(outputs)
+        df = pd.DataFrame()
+        if "Pandas" in access_method:
+            file_path = f"hf://datasets/{repo_id}/"
+            if repo_id == "fka/awesome-chatgpt-prompts":
+                file_path += "prompts.csv"
+                df = pd.read_csv(file_path)
+            else:
+                try:
+                    df = pd.read_parquet(f"{file_path}data/train-00000-of-00001.parquet")
+                except:
+                     try:
+                         df = pd.read_parquet(f"{file_path}train.parquet")
+                     except:
+                         df = pd.read_json(f"{file_path}medical_o1_sft.json")
+        elif "Datasets" in access_method:
+            ds = load_dataset(repo_id, split='train', streaming=True).take(1000)
+            df = pd.DataFrame(ds)
+        elif "Polars" in access_method:
+            outputs[2] = "⏳ Loading with Polars..."
+            yield tuple(outputs)
+            if repo_id == "fka/awesome-chatgpt-prompts":
+                pl_df = pl.read_csv(f"hf://datasets/{repo_id}/prompts.csv")
+            else:
+                pl_df = pl.read_parquet(f"hf://datasets/{repo_id}/train.parquet")
+            df = pl_df.to_pandas()
+        elif "Dask" in access_method:
+            outputs[2] = "⏳ Loading with Dask..."
+            yield tuple(outputs)
+            dask_df = dd.read_json(f"hf://datasets/{repo_id}/**/*.jsonl.gz")
+            df = dask_df.head(1000)  # Convert to pandas for processing
+        elif "Croissant" in access_method:
+            outputs[2] = "⏳ Loading with Croissant..."
+            yield tuple(outputs)
+            headers = get_auth_headers() if not config["is_public"] else {}
+            croissant_url = f"https://huggingface.co/api/datasets/{repo_id}/croissant"
+            response = requests.get(croissant_url, headers=headers)
+            response.raise_for_status()
+            jsonld = response.json()
+            ds = CroissantDataset(jsonld=jsonld)
+            records = list(ds.records("default"))[:1000]  # Take first 1000
+            df = pd.DataFrame(records)
+        outputs[2] = "🔍 Searching loaded data..."
+        yield tuple(outputs)
+        final_df = search_dataframe(df, query)
+        outputs[0] = final_df
+        outputs[3], outputs[4], outputs[5], outputs[6] = dataframe_to_outputs(final_df)
+        outputs[2] = f"🏁 Search complete. Found **{len(final_df)}** results."
+        if dataset_key == 'inscene' and not final_df.empty:
+            gallery_data = [(row['image'], row.get('text', '')) for _, row in final_df.iterrows() if 'image' in row and isinstance(row.get('image'), Image.Image)]
+            outputs[1] = gr.Gallery(gallery_data, label="🖼️ Image Results", height=400)
+        yield tuple(outputs)
+    except Exception as e:
+        yield handle_error(e, req, res)
+# --- 🖼️ UI Generation ---
+def create_dataset_tab(dataset_key: str):
+    config = DATASET_CONFIG[dataset_key]
+    with gr.Tab(f"{config['emoji']} {dataset_key.capitalize()}"):
+        gr.Markdown(f"## {config['emoji']} Query the `{config['name']}` Dataset")
+        if not config['is_public']:
+            gr.Markdown("**Note:** This is a gated dataset. Please log in via `huggingface-cli login` in your terminal first.")
+        with gr.Row():
+            access_method = gr.Radio(config['methods'], label="🔑 Access Method", value=config['methods'][0])
+            query = gr.Textbox(label="🔍 Search Query", placeholder="Enter any text to search, or leave blank for samples...")
+        fetch_button = gr.Button("🚀 Go Fetch!")
+        status_output = gr.Markdown("🏁 Ready to search.")
+        df_output = gr.DataFrame(label="📊 Results DataFrame", interactive=False, wrap=True)
+        gallery_output = gr.Gallery(visible=(dataset_key == 'inscene'), label="🖼️ Image Results")
+        with gr.Accordion("📂 View/Export Full Results", open=False):
+            markdown_output = gr.Markdown(label="📝 Markdown View")
+            with gr.Row():
+                csv_output = gr.File(label="⬇️ Download CSV")
+                xlsx_output = gr.File(label="⬇️ Download XLSX")
+            copy_output = gr.Code(label="📋 Copy-Paste (Tab-Delimited)")
+        code_output = gr.Code(label="💻 Python Code Snippet", language="python")
+        debug_log_output = gr.Code(label="🐞 Debug Log", visible=False)
+        fetch_button.click(
+            fn=fetch_data,
+            inputs=[gr.State(dataset_key), access_method, query],
+            outputs=[
+                df_output, gallery_output, status_output, markdown_output,
+                csv_output, xlsx_output, copy_output, code_output,
+                debug_log_output
+            ]
+        )
+# --- 🚀 Main App ---
+with gr.Blocks(theme=gr.themes.Soft(), title="Hugging Face Dataset Explorer") as demo:
+    gr.Markdown("# 🤗 Hugging Face Dataset Explorer")
+    gr.Markdown(
+        "Select a dataset, choose an access method, and type a query. "
+        "If an error occurs, a detailed debug log will appear to help troubleshoot the issue."
+    )
+    with gr.Accordion("🔧 Quick Start Guide", open=False):
+        gr.Markdown("""
+        ### 🚀 Quick Start:
+        1. **🤖 Prompts Tab**: Try API method, search for "translator" or "linux"
+        2. **⚖️ Caselaw Tab**: Try API method, search for "contract" or "court"
+        3. **💰 Finance Tab**: Requires login, try API method first
+        4. **🩺 Medical Tab**: Requires login, try API method first
+        5. **🖼️ InScene Tab**: Requires login, try Datasets method for images
+        ### 🔑 Authentication:
+        For gated datasets, run in terminal: `huggingface-cli login`
+        ### 🛠️ Methods:
+        - **💨 API**: Fast, reliable, works without login (100 rows max)
+        - **🐼 Pandas**: Full dataset access, requires login for gated datasets
+        - **🤗 Datasets**: Good for streaming large datasets
+        - **🧊 Polars/Dask**: Alternative fast data processing
+        - **🥐 Croissant**: Metadata-aware loading
+        """)
+    with gr.Tabs():
+        for key in DATASET_CONFIG.keys():
+            create_dataset_tab(key)
+if __name__ == "__main__":
+    demo.launch(debug=True)