Santosh
		
	committed on
		
		
					Commit 
							
							·
						
						bff727e
	
1
								Parent(s):
							
							ebb0ec5
								
Fresh push: Dataset Insight Portal with Parquet files via LFS
Browse files
- all_minimal_dataset_cards.parquet +3 -0
- all_rich_dataset_cards.parquet +3 -0
- app.py +107 -0
    	
        all_minimal_dataset_cards.parquet
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:5adb59f94fb6f08f5c0859e21e55ed56ec40f40d9cde349427bf24065e775d60
         | 
| 3 | 
            +
            size 17318878
         | 
    	
        all_rich_dataset_cards.parquet
    ADDED
    
    | @@ -0,0 +1,3 @@ | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            version https://git-lfs.github.com/spec/v1
         | 
| 2 | 
            +
            oid sha256:94ac600eb5100aa7acaeeec3d05becbee7ac11eba9595a0f9e38286879285349
         | 
| 3 | 
            +
            size 5475858
         | 
    	
        app.py
    ADDED
    
    | @@ -0,0 +1,107 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            import gradio as gr
         | 
| 2 | 
            +
            import polars as pl
         | 
| 3 | 
            +
             | 
| 4 | 
            +
            # Paths or HF Hub URLs for Parquet files
         | 
| 5 | 
            +
            RICH_PARQUET_PATH = "all_rich_dataset_cards.parquet"
         | 
| 6 | 
            +
            MISSING_PARQUET_PATH = "all_minimal_dataset_cards.parquet"
         | 
| 7 | 
            +
             | 
| 8 | 
            +
            ROWS_PER_PAGE = 50
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            # Lazy load datasets
         | 
| 11 | 
            +
            lazy_rich = pl.scan_parquet(RICH_PARQUET_PATH)
         | 
| 12 | 
            +
            lazy_missing = pl.scan_parquet(MISSING_PARQUET_PATH)
         | 
| 13 | 
            +
             | 
| 14 | 
            +
            current_lazy_df = lazy_missing  # Default dataset
         | 
| 15 | 
            +
             | 
| 16 | 
            +
            # Helper function to fetch a page
         | 
| 17 | 
            +
            def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
         | 
| 18 | 
            +
                filtered_df = lazy_df
         | 
| 19 | 
            +
                if column and query:
         | 
| 20 | 
            +
                    query_lower = query.lower().strip()
         | 
| 21 | 
            +
                    # Case-insensitive search
         | 
| 22 | 
            +
                    filtered_df = filtered_df.with_columns([
         | 
| 23 | 
            +
                        pl.col(column).cast(pl.Utf8).str.to_lowercase().alias(column)
         | 
| 24 | 
            +
                    ]).filter(pl.col(column).str.contains(query_lower, literal=False))
         | 
| 25 | 
            +
                start = page * ROWS_PER_PAGE
         | 
| 26 | 
            +
                page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()
         | 
| 27 | 
            +
                total_rows = filtered_df.collect().height
         | 
| 28 | 
            +
                total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
         | 
| 29 | 
            +
                return page_df, total_pages
         | 
| 30 | 
            +
             | 
| 31 | 
            +
            # Initialize first page
         | 
| 32 | 
            +
            initial_df, total_pages = get_page(current_lazy_df, 0)
         | 
| 33 | 
            +
            columns = list(initial_df.columns)
         | 
| 34 | 
            +
             | 
| 35 | 
            +
            with gr.Blocks() as demo:
         | 
| 36 | 
            +
                gr.Markdown("## Dataset Insight Portal")
         | 
| 37 | 
            +
             | 
| 38 | 
            +
                # Dataset selection
         | 
| 39 | 
            +
                dataset_select = gr.Dropdown(
         | 
| 40 | 
            +
                    choices=["DatasetCards rich in information", "DatasetCards missing information"],
         | 
| 41 | 
            +
                    value="DatasetCards missing information",
         | 
| 42 | 
            +
                    label="Select Dataset"
         | 
| 43 | 
            +
                )
         | 
| 44 | 
            +
             | 
| 45 | 
            +
                # Pagination controls
         | 
| 46 | 
            +
                with gr.Row():
         | 
| 47 | 
            +
                    prev_btn = gr.Button("Previous", elem_id="small-btn")
         | 
| 48 | 
            +
                    next_btn = gr.Button("Next", elem_id="small-btn")
         | 
| 49 | 
            +
                    page_number = gr.Number(value=0, label="Page", precision=0)
         | 
| 50 | 
            +
                    total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
         | 
| 51 | 
            +
             | 
| 52 | 
            +
                # Data table
         | 
| 53 | 
            +
                data_table = gr.Dataframe(
         | 
| 54 | 
            +
                    value=initial_df, headers=columns, datatype="str",
         | 
| 55 | 
            +
                    interactive=False, row_count=ROWS_PER_PAGE
         | 
| 56 | 
            +
                )
         | 
| 57 | 
            +
             | 
| 58 | 
            +
                # Column search
         | 
| 59 | 
            +
                with gr.Row():
         | 
| 60 | 
            +
                    col_dropdown = gr.Dropdown(choices=columns, label="Column")
         | 
| 61 | 
            +
                    search_text = gr.Textbox(label="Search")
         | 
| 62 | 
            +
                    search_btn = gr.Button("Search", elem_id="small-btn")
         | 
| 63 | 
            +
                    reset_btn = gr.Button("Reset", elem_id="small-btn")
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                # --- Functions ---
         | 
| 66 | 
            +
                def load_dataset(dataset_choice):
         | 
| 67 | 
            +
                    global current_lazy_df
         | 
| 68 | 
            +
                    current_lazy_df = lazy_rich if dataset_choice == "DatasetCards rich in information" else lazy_missing
         | 
| 69 | 
            +
                    initial_df, total_pages = get_page(current_lazy_df, 0)
         | 
| 70 | 
            +
                    columns = list(initial_df.columns)
         | 
| 71 | 
            +
                    return (
         | 
| 72 | 
            +
                        gr.update(value=initial_df, headers=columns),
         | 
| 73 | 
            +
                        f"Total Pages: {total_pages}",
         | 
| 74 | 
            +
                        0,
         | 
| 75 | 
            +
                        gr.update(choices=columns, value=columns[0])
         | 
| 76 | 
            +
                    )
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                def next_page_func(page, column, query):
         | 
| 79 | 
            +
                    page += 1
         | 
| 80 | 
            +
                    page_df, total_pages = get_page(current_lazy_df, page, column, query)
         | 
| 81 | 
            +
                    if page >= total_pages:
         | 
| 82 | 
            +
                        page = total_pages - 1
         | 
| 83 | 
            +
                        page_df, total_pages = get_page(current_lazy_df, page, column, query)
         | 
| 84 | 
            +
                    return page_df, f"Total Pages: {total_pages}", page
         | 
| 85 | 
            +
             | 
| 86 | 
            +
                def prev_page_func(page, column, query):
         | 
| 87 | 
            +
                    page -= 1
         | 
| 88 | 
            +
                    page = max(0, page)
         | 
| 89 | 
            +
                    page_df, total_pages = get_page(current_lazy_df, page, column, query)
         | 
| 90 | 
            +
                    return page_df, f"Total Pages: {total_pages}", page
         | 
| 91 | 
            +
             | 
| 92 | 
            +
                def search_func(column, query):
         | 
| 93 | 
            +
                    page_df, total_pages = get_page(current_lazy_df, 0, column, query)
         | 
| 94 | 
            +
                    return page_df, f"Total Pages: {total_pages}", 0
         | 
| 95 | 
            +
             | 
| 96 | 
            +
                def reset_func():
         | 
| 97 | 
            +
                    page_df, total_pages = get_page(current_lazy_df, 0)
         | 
| 98 | 
            +
                    return page_df, f"Total Pages: {total_pages}", 0
         | 
| 99 | 
            +
             | 
| 100 | 
            +
                # --- Event Listeners ---
         | 
| 101 | 
            +
                dataset_select.change(load_dataset, dataset_select, [data_table, total_pages_display, page_number, col_dropdown])
         | 
| 102 | 
            +
                next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
         | 
| 103 | 
            +
                prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
         | 
| 104 | 
            +
                search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
         | 
| 105 | 
            +
                reset_btn.click(reset_func, [], [data_table, total_pages_display, page_number])
         | 
| 106 | 
            +
             | 
| 107 | 
            +
            demo.launch()
         | 
