Santosh
updated ryan science tags
2ccb279
raw
history blame
7.92 kB
# import gradio as gr
# import polars as pl
# # Paths or HF Hub URLs for Parquet files
# RICH_PARQUET_PATH = "all_rich_dataset_cards.parquet"
# MISSING_PARQUET_PATH = "all_minimal_dataset_cards.parquet"
# ROWS_PER_PAGE = 50
# # Lazy load datasets
# lazy_rich = pl.scan_parquet(RICH_PARQUET_PATH)
# lazy_missing = pl.scan_parquet(MISSING_PARQUET_PATH)
# current_lazy_df = lazy_missing # Default dataset
# # Helper function to fetch a page
# def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
# filtered_df = lazy_df
# if column and query:
# query_lower = query.lower().strip()
# # Case-insensitive search
# filtered_df = filtered_df.with_columns([
# pl.col(column).cast(pl.Utf8).str.to_lowercase().alias(column)
# ]).filter(pl.col(column).str.contains(query_lower, literal=False))
# start = page * ROWS_PER_PAGE
# page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()
# total_rows = filtered_df.collect().height
# total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
# return page_df, total_pages
# # Initialize first page
# initial_df, total_pages = get_page(current_lazy_df, 0)
# columns = list(initial_df.columns)
# with gr.Blocks() as demo:
# gr.Markdown("## Dataset Insight Portal")
# # Dataset selection
# dataset_select = gr.Dropdown(
# choices=["DatasetCards rich in information", "DatasetCards missing information"],
# value="DatasetCards missing information",
# label="Select Dataset"
# )
# # Pagination controls
# with gr.Row():
# prev_btn = gr.Button("Previous", elem_id="small-btn")
# next_btn = gr.Button("Next", elem_id="small-btn")
# page_number = gr.Number(value=0, label="Page", precision=0)
# total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
# # Data table
# data_table = gr.Dataframe(
# value=initial_df, headers=columns, datatype="str",
# interactive=False, row_count=ROWS_PER_PAGE
# )
# # Column search
# with gr.Row():
# col_dropdown = gr.Dropdown(choices=columns, label="Column")
# search_text = gr.Textbox(label="Search")
# search_btn = gr.Button("Search", elem_id="small-btn")
# reset_btn = gr.Button("Reset", elem_id="small-btn")
# # --- Functions ---
# def load_dataset(dataset_choice):
# global current_lazy_df
# current_lazy_df = lazy_rich if dataset_choice == "DatasetCards rich in information" else lazy_missing
# initial_df, total_pages = get_page(current_lazy_df, 0)
# columns = list(initial_df.columns)
# return (
# gr.update(value=initial_df, headers=columns),
# f"Total Pages: {total_pages}",
# 0,
# gr.update(choices=columns, value=columns[0])
# )
# def next_page_func(page, column, query):
# page += 1
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# if page >= total_pages:
# page = total_pages - 1
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page
# def prev_page_func(page, column, query):
# page -= 1
# page = max(0, page)
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page
# def search_func(column, query):
# page_df, total_pages = get_page(current_lazy_df, 0, column, query)
# return page_df, f"Total Pages: {total_pages}", 0
# def reset_func():
# page_df, total_pages = get_page(current_lazy_df, 0)
# return page_df, f"Total Pages: {total_pages}", 0
# # --- Event Listeners ---
# dataset_select.change(load_dataset, dataset_select, [data_table, total_pages_display, page_number, col_dropdown])
# next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
# prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
# search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
# reset_btn.click(reset_func, [], [data_table, total_pages_display, page_number])
# demo.launch()
import gradio as gr
import polars as pl
# Number of table rows shown on a single page.
ROWS_PER_PAGE = 50
# Location of the single Parquet file that backs the whole app.
COMBINED_PARQUET_PATH = "datasetcards.parquet"
# Scan the file lazily; rows are only read when a query is collected.
lazy_df = pl.scan_parquet(COMBINED_PARQUET_PATH)
# Helper function to fetch a page
def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
    """Return one page of rows and the total number of pages.

    Args:
        lazy_df: Lazy frame to read from.
        page: Zero-based page index. ``None`` and negative values are
            treated as page 0.
        column: Name of the column to search, or ``None`` for no filter.
        query: Search pattern. It is stripped, lower-cased and matched
            case-insensitively as a regular expression (``literal=False``)
            against the string form of ``column``. A blank query disables
            filtering.

    Returns:
        A ``(page_df, total_pages)`` tuple: ``page_df`` is a pandas
        DataFrame with at most ``ROWS_PER_PAGE`` rows, ``total_pages`` is an
        int that is always at least 1 (an empty result is one empty page).
    """
    filtered_df = lazy_df
    needle = query.lower().strip() if query else ""
    if column and needle:
        # Lower-case only inside the predicate so the values returned to the
        # caller keep their original casing and dtype.
        haystack = pl.col(column).cast(pl.Utf8).str.to_lowercase()
        filtered_df = filtered_df.filter(
            haystack.str.contains(needle, literal=False)
        )

    # A negative offset would make slice() count from the end of the frame.
    page_index = max(0, int(page or 0))
    start = page_index * ROWS_PER_PAGE
    page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()

    # Count rows inside the query instead of materialising every row.
    total_rows = filtered_df.select(pl.len()).collect().item()
    # Ceiling division, floored at one page so callers never compute page -1.
    total_pages = max(1, (total_rows - 1) // ROWS_PER_PAGE + 1)
    return page_df, total_pages
# Load page 0 once at start-up: it seeds the table and supplies the column
# names offered in the search dropdown.
initial_df, total_pages = get_page(lazy_df, page=0)
columns = [*initial_df.columns]
# Gradio UI: a paginated table over the dataset with a per-column search.
with gr.Blocks() as demo:
    gr.Markdown("## Dataset Insight Portal")
    gr.Markdown("This space allows you to explore the combined dataset of DatasetCards. "
                "You can navigate pages, search within columns, and inspect the dataset easily.")

    # Pagination controls
    # NOTE(review): several buttons share elem_id="small-btn"; elem_classes is
    # the usual way to attach one CSS hook to multiple components.
    with gr.Row():
        prev_btn = gr.Button("Previous", elem_id="small-btn")
        next_btn = gr.Button("Next", elem_id="small-btn")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # Data table
    data_table = gr.Dataframe(
        value=initial_df, headers=columns, datatype="str",
        interactive=False, row_count=ROWS_PER_PAGE
    )

    # Column search
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Column")
        search_text = gr.Textbox(label="Search")
        search_btn = gr.Button("Search", elem_id="small-btn")
        reset_btn = gr.Button("Reset", elem_id="small-btn")

    # --- Functions ---
    current_lazy_df = lazy_df  # single dataset

    def next_page_func(page, column, query):
        """Advance one page under the current filter, stopping at the last page.

        Returns (table data, total-pages label text, new page index).
        """
        # The Number field yields None when the user clears it.
        page = int(page or 0) + 1
        page_df, total_pages = get_page(current_lazy_df, page, column, query)
        # Never go below 0, even if get_page reports zero pages.
        last_page = max(0, total_pages - 1)
        if page > last_page:
            page = last_page
            page_df, total_pages = get_page(current_lazy_df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page

    def prev_page_func(page, column, query):
        """Go back one page under the current filter, stopping at page 0.

        Returns (table data, total-pages label text, new page index).
        """
        page = max(0, int(page or 0) - 1)
        page_df, total_pages = get_page(current_lazy_df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page

    def search_func(column, query):
        """Apply the column search and show the first page of matches."""
        page_df, total_pages = get_page(current_lazy_df, 0, column, query)
        return page_df, f"Total Pages: {total_pages}", 0

    def reset_func():
        """Drop the filter, clear the search inputs and show page 0.

        The inputs are cleared as well because Previous/Next read them on
        every click; leaving them populated would re-apply the old search.
        """
        page_df, total_pages = get_page(current_lazy_df, 0)
        return page_df, f"Total Pages: {total_pages}", 0, gr.update(value=None), ""

    # --- Event Listeners ---
    next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
    prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
    search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
    reset_btn.click(
        reset_func,
        [],
        [data_table, total_pages_display, page_number, col_dropdown, search_text],
    )

demo.launch()