| # import gradio as gr | |
| # import polars as pl | |
| # # Paths or HF Hub URLs for Parquet files | |
| # RICH_PARQUET_PATH = "all_rich_dataset_cards.parquet" | |
| # MISSING_PARQUET_PATH = "all_minimal_dataset_cards.parquet" | |
| # ROWS_PER_PAGE = 50 | |
| # # Lazy load datasets | |
| # lazy_rich = pl.scan_parquet(RICH_PARQUET_PATH) | |
| # lazy_missing = pl.scan_parquet(MISSING_PARQUET_PATH) | |
| # current_lazy_df = lazy_missing # Default dataset | |
| # # Helper function to fetch a page | |
| # def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""): | |
| # filtered_df = lazy_df | |
| # if column and query: | |
| # query_lower = query.lower().strip() | |
| # # Case-insensitive search | |
| # filtered_df = filtered_df.with_columns([ | |
| # pl.col(column).cast(pl.Utf8).str.to_lowercase().alias(column) | |
| # ]).filter(pl.col(column).str.contains(query_lower, literal=False)) | |
| # start = page * ROWS_PER_PAGE | |
| # page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas() | |
| # total_rows = filtered_df.collect().height | |
| # total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1 | |
| # return page_df, total_pages | |
| # # Initialize first page | |
| # initial_df, total_pages = get_page(current_lazy_df, 0) | |
| # columns = list(initial_df.columns) | |
| # with gr.Blocks() as demo: | |
| # gr.Markdown("## Dataset Insight Portal") | |
| # # Dataset selection | |
| # dataset_select = gr.Dropdown( | |
| # choices=["DatasetCards rich in information", "DatasetCards missing information"], | |
| # value="DatasetCards missing information", | |
| # label="Select Dataset" | |
| # ) | |
| # # Pagination controls | |
| # with gr.Row(): | |
| # prev_btn = gr.Button("Previous", elem_id="small-btn") | |
| # next_btn = gr.Button("Next", elem_id="small-btn") | |
| # page_number = gr.Number(value=0, label="Page", precision=0) | |
| # total_pages_display = gr.Label(value=f"Total Pages: {total_pages}") | |
| # # Data table | |
| # data_table = gr.Dataframe( | |
| # value=initial_df, headers=columns, datatype="str", | |
| # interactive=False, row_count=ROWS_PER_PAGE | |
| # ) | |
| # # Column search | |
| # with gr.Row(): | |
| # col_dropdown = gr.Dropdown(choices=columns, label="Column") | |
| # search_text = gr.Textbox(label="Search") | |
| # search_btn = gr.Button("Search", elem_id="small-btn") | |
| # reset_btn = gr.Button("Reset", elem_id="small-btn") | |
| # # --- Functions --- | |
| # def load_dataset(dataset_choice): | |
| # global current_lazy_df | |
| # current_lazy_df = lazy_rich if dataset_choice == "DatasetCards rich in information" else lazy_missing | |
| # initial_df, total_pages = get_page(current_lazy_df, 0) | |
| # columns = list(initial_df.columns) | |
| # return ( | |
| # gr.update(value=initial_df, headers=columns), | |
| # f"Total Pages: {total_pages}", | |
| # 0, | |
| # gr.update(choices=columns, value=columns[0]) | |
| # ) | |
| # def next_page_func(page, column, query): | |
| # page += 1 | |
| # page_df, total_pages = get_page(current_lazy_df, page, column, query) | |
| # if page >= total_pages: | |
| # page = total_pages - 1 | |
| # page_df, total_pages = get_page(current_lazy_df, page, column, query) | |
| # return page_df, f"Total Pages: {total_pages}", page | |
| # def prev_page_func(page, column, query): | |
| # page -= 1 | |
| # page = max(0, page) | |
| # page_df, total_pages = get_page(current_lazy_df, page, column, query) | |
| # return page_df, f"Total Pages: {total_pages}", page | |
| # def search_func(column, query): | |
| # page_df, total_pages = get_page(current_lazy_df, 0, column, query) | |
| # return page_df, f"Total Pages: {total_pages}", 0 | |
| # def reset_func(): | |
| # page_df, total_pages = get_page(current_lazy_df, 0) | |
| # return page_df, f"Total Pages: {total_pages}", 0 | |
| # # --- Event Listeners --- | |
| # dataset_select.change(load_dataset, dataset_select, [data_table, total_pages_display, page_number, col_dropdown]) | |
| # next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number]) | |
| # prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number]) | |
| # search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number]) | |
| # reset_btn.click(reset_func, [], [data_table, total_pages_display, page_number]) | |
| # demo.launch() | |
| import gradio as gr | |
| import polars as pl | |
| # Path for the combined Parquet file | |
COMBINED_PARQUET_PATH: str = "datasetcards.parquet"
# Page size used when slicing the dataset and sizing the table
ROWS_PER_PAGE: int = 50
# Build a lazy scan only; nothing is read until a query is collected
lazy_df: pl.LazyFrame = pl.scan_parquet(COMBINED_PARQUET_PATH)
| # Helper function to fetch a page | |
def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
    """Return one page of rows and the total number of pages.

    Args:
        lazy_df: Lazy frame to read from.
        page: Zero-based page index. Negative values are treated as 0.
        column: Name of the column to search in. No filtering is applied
            when this is empty or ``None``.
        query: Search text. It is stripped and lowercased, then handed to
            ``str.contains`` with ``literal=False``, i.e. as a pattern rather
            than a literal string. An empty or whitespace-only query applies
            no filter.

    Returns:
        A ``(page_df, total_pages)`` tuple: the requested page as a pandas
        DataFrame (at most ``ROWS_PER_PAGE`` rows) and the number of pages in
        the (possibly filtered) data. ``total_pages`` is 0 when no row matches.
    """
    filtered_df = lazy_df
    # Normalise first so that a whitespace-only query means "no filter".
    needle = query.strip().lower() if query else ""
    if column and needle:
        # Case-insensitive search: lowercase a *derived* expression and filter
        # on it. The stored column is left untouched so the rows shown to the
        # user keep their original casing.
        haystack = pl.col(column).cast(pl.Utf8).str.to_lowercase()
        filtered_df = filtered_df.filter(haystack.str.contains(needle, literal=False))

    # A negative offset would make ``slice`` count from the end of the frame.
    start = max(page, 0) * ROWS_PER_PAGE
    page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()

    # Only the row count is needed here, so project a single column instead of
    # materialising every column of the whole dataset.
    total_rows = filtered_df.select(pl.first()).collect().height
    total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
    return page_df, total_pages
| # Initialize first page | |
# Page 0 of the unfiltered data seeds both the table and the column list
initial_df, total_pages = get_page(lazy_df, page=0)
columns = [*initial_df.columns]
with gr.Blocks() as demo:
    gr.Markdown("## Dataset Insight Portal")
    gr.Markdown(
        "This space allows you to explore the combined dataset of DatasetCards. "
        "You can navigate pages, search within columns, and inspect the dataset easily."
    )

    # Pagination controls
    with gr.Row():
        # "small-btn" is shared by several buttons, so it is applied as a CSS
        # class; an element id has to be unique within the page.
        prev_btn = gr.Button("Previous", elem_classes="small-btn")
        next_btn = gr.Button("Next", elem_classes="small-btn")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # Data table
    data_table = gr.Dataframe(
        value=initial_df,
        headers=columns,
        datatype="str",
        interactive=False,
        row_count=ROWS_PER_PAGE,
    )

    # Column search
    with gr.Row():
        # Pre-select the first column so that Search works without the user
        # having to pick a column first.
        col_dropdown = gr.Dropdown(
            choices=columns,
            value=columns[0] if columns else None,
            label="Column",
        )
        search_text = gr.Textbox(label="Search")
        search_btn = gr.Button("Search", elem_classes="small-btn")
        reset_btn = gr.Button("Reset", elem_classes="small-btn")

    # --- Functions ---
    current_lazy_df = lazy_df  # single dataset

    def next_page_func(page, column, query):
        """Move one page forward, stopping at the last available page."""
        # The Number field yields None when it has been cleared.
        page = int(page or 0) + 1
        page_df, total_pages = get_page(current_lazy_df, page, column, query)
        if page >= total_pages:
            # Went past the end: fall back to the last page. total_pages is 0
            # when nothing matches, so never go below page 0.
            page = max(total_pages - 1, 0)
            page_df, total_pages = get_page(current_lazy_df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page

    def prev_page_func(page, column, query):
        """Move one page back, stopping at page 0."""
        page = max(int(page or 0) - 1, 0)
        page_df, total_pages = get_page(current_lazy_df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page

    def search_func(column, query):
        """Apply the column search and jump to the first page of results."""
        page_df, total_pages = get_page(current_lazy_df, 0, column, query)
        return page_df, f"Total Pages: {total_pages}", 0

    def reset_func():
        """Show the first unfiltered page and clear the search box."""
        page_df, total_pages = get_page(current_lazy_df, 0)
        # The search box is cleared too: Previous/Next read it on every click
        # and would otherwise silently re-apply the old filter.
        return page_df, f"Total Pages: {total_pages}", 0, ""

    # --- Event Listeners ---
    pager_inputs = [page_number, col_dropdown, search_text]
    view_outputs = [data_table, total_pages_display, page_number]
    next_btn.click(next_page_func, pager_inputs, view_outputs)
    prev_btn.click(prev_page_func, pager_inputs, view_outputs)
    search_btn.click(search_func, [col_dropdown, search_text], view_outputs)
    reset_btn.click(reset_func, [], view_outputs + [search_text])

demo.launch()