File size: 7,923 Bytes
2ccb279 bff727e 2ccb279 bff727e 2ccb279 bff727e 2ccb279 bff727e 2ccb279 bff727e 2ccb279 bff727e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 |
# import gradio as gr
# import polars as pl
# # Paths or HF Hub URLs for Parquet files
# RICH_PARQUET_PATH = "all_rich_dataset_cards.parquet"
# MISSING_PARQUET_PATH = "all_minimal_dataset_cards.parquet"
# ROWS_PER_PAGE = 50
# # Lazy load datasets
# lazy_rich = pl.scan_parquet(RICH_PARQUET_PATH)
# lazy_missing = pl.scan_parquet(MISSING_PARQUET_PATH)
# current_lazy_df = lazy_missing # Default dataset
# # Helper function to fetch a page
# def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
# filtered_df = lazy_df
# if column and query:
# query_lower = query.lower().strip()
# # Case-insensitive search
# filtered_df = filtered_df.with_columns([
# pl.col(column).cast(pl.Utf8).str.to_lowercase().alias(column)
# ]).filter(pl.col(column).str.contains(query_lower, literal=False))
# start = page * ROWS_PER_PAGE
# page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()
# total_rows = filtered_df.collect().height
# total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
# return page_df, total_pages
# # Initialize first page
# initial_df, total_pages = get_page(current_lazy_df, 0)
# columns = list(initial_df.columns)
# with gr.Blocks() as demo:
# gr.Markdown("## Dataset Insight Portal")
# # Dataset selection
# dataset_select = gr.Dropdown(
# choices=["DatasetCards rich in information", "DatasetCards missing information"],
# value="DatasetCards missing information",
# label="Select Dataset"
# )
# # Pagination controls
# with gr.Row():
# prev_btn = gr.Button("Previous", elem_id="small-btn")
# next_btn = gr.Button("Next", elem_id="small-btn")
# page_number = gr.Number(value=0, label="Page", precision=0)
# total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")
# # Data table
# data_table = gr.Dataframe(
# value=initial_df, headers=columns, datatype="str",
# interactive=False, row_count=ROWS_PER_PAGE
# )
# # Column search
# with gr.Row():
# col_dropdown = gr.Dropdown(choices=columns, label="Column")
# search_text = gr.Textbox(label="Search")
# search_btn = gr.Button("Search", elem_id="small-btn")
# reset_btn = gr.Button("Reset", elem_id="small-btn")
# # --- Functions ---
# def load_dataset(dataset_choice):
# global current_lazy_df
# current_lazy_df = lazy_rich if dataset_choice == "DatasetCards rich in information" else lazy_missing
# initial_df, total_pages = get_page(current_lazy_df, 0)
# columns = list(initial_df.columns)
# return (
# gr.update(value=initial_df, headers=columns),
# f"Total Pages: {total_pages}",
# 0,
# gr.update(choices=columns, value=columns[0])
# )
# def next_page_func(page, column, query):
# page += 1
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# if page >= total_pages:
# page = total_pages - 1
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page
# def prev_page_func(page, column, query):
# page -= 1
# page = max(0, page)
# page_df, total_pages = get_page(current_lazy_df, page, column, query)
# return page_df, f"Total Pages: {total_pages}", page
# def search_func(column, query):
# page_df, total_pages = get_page(current_lazy_df, 0, column, query)
# return page_df, f"Total Pages: {total_pages}", 0
# def reset_func():
# page_df, total_pages = get_page(current_lazy_df, 0)
# return page_df, f"Total Pages: {total_pages}", 0
# # --- Event Listeners ---
# dataset_select.change(load_dataset, dataset_select, [data_table, total_pages_display, page_number, col_dropdown])
# next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
# prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
# search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
# reset_btn.click(reset_func, [], [data_table, total_pages_display, page_number])
# demo.launch()
import gradio as gr
import polars as pl
# Path for the combined Parquet file
COMBINED_PARQUET_PATH = "datasetcards.parquet"
# Page size: get_page slices and counts pages in units of this many rows, and
# the Dataframe widget is created with the same row_count.
ROWS_PER_PAGE = 50
# Lazy load dataset: the scan is kept lazy here; get_page calls .collect() on
# it (after optional filtering and slicing) to materialise rows.
lazy_df = pl.scan_parquet(COMBINED_PARQUET_PATH)
# Helper function to fetch a page
def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
    """Return one page of rows plus the page count for the current filter.

    Args:
        lazy_df: Lazy frame to read from.
        page: Zero-based page index. Negative values are treated as 0.
        column: Name of the column to search. ``None`` or ``""`` disables
            filtering.
        query: Text to look for in ``column``. Matching is a case-insensitive
            literal substring test; surrounding whitespace is ignored and a
            blank query disables filtering.

    Returns:
        A ``(page_df, total_pages)`` tuple. ``page_df`` is a pandas DataFrame
        with at most ``ROWS_PER_PAGE`` rows; ``total_pages`` is the number of
        pages in the (filtered) data and is 0 when no row matches.
    """
    needle = query.strip().lower() if query else ""

    filtered_df = lazy_df
    if column and needle:
        # Lowercase only inside the predicate: the rows handed back keep
        # their original casing and dtype instead of being overwritten.
        haystack = pl.col(column).cast(pl.Utf8).str.to_lowercase()
        # literal=True: the user's text is a plain substring, not a regex, so
        # characters such as "(" or "+" cannot raise a pattern error.
        filtered_df = filtered_df.filter(
            haystack.str.contains(needle, literal=True)
        )

    # A negative offset would make slice() count from the end of the frame.
    start = max(0, page) * ROWS_PER_PAGE
    page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()

    # Count rows without materialising every column of the filtered frame.
    total_rows = filtered_df.select(pl.len()).collect().item()
    total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
    return page_df, total_pages
# Initialize first page: fetch page 0 with no filter once at import time.
# The result seeds the table's initial value, the initial "Total Pages" label,
# and the column names offered as table headers and in the search dropdown.
initial_df, total_pages = get_page(lazy_df, 0)
columns = list(initial_df.columns)
with gr.Blocks() as demo:
    gr.Markdown("## Dataset Insight Portal")
    gr.Markdown("This space allows you to explore the combined dataset of DatasetCards. "
                "You can navigate pages, search within columns, and inspect the dataset easily.")

    # Pagination controls
    with gr.Row():
        # elem_classes rather than elem_id: the same tag is shared by several
        # buttons, and DOM ids are meant to be unique.
        prev_btn = gr.Button("Previous", elem_classes="small-btn")
        next_btn = gr.Button("Next", elem_classes="small-btn")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # Data table
    data_table = gr.Dataframe(
        value=initial_df, headers=columns, datatype="str",
        interactive=False, row_count=ROWS_PER_PAGE
    )

    # Column search
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Column")
        search_text = gr.Textbox(label="Search")
        search_btn = gr.Button("Search", elem_classes="small-btn")
        reset_btn = gr.Button("Reset", elem_classes="small-btn")

    # --- Functions ---
    current_lazy_df = lazy_df  # single dataset

    def _page_label(page_count):
        """Format the text shown in the total-pages label."""
        return f"Total Pages: {page_count}"

    def _current_page(page):
        """Coerce the Page widget value to a non-negative int.

        A cleared Number box may arrive as None (assumption about the widget;
        handled defensively), in which case page 0 is used.
        """
        return max(0, int(page or 0))

    def next_page_func(page, column, query):
        """Advance one page, stopping at the last page of the current filter.

        Returns the page rows, the total-pages label text and the page index
        actually shown.
        """
        page = _current_page(page) + 1
        page_df, total_pages = get_page(current_lazy_df, page, column, query)
        # total_pages is 0 when nothing matches; never fall back to page -1.
        last_page = max(0, total_pages - 1)
        if page > last_page:
            page = last_page
            page_df, total_pages = get_page(current_lazy_df, page, column, query)
        return page_df, _page_label(total_pages), page

    def prev_page_func(page, column, query):
        """Go back one page, stopping at page 0.

        Returns the page rows, the total-pages label text and the page index
        actually shown.
        """
        page = max(0, _current_page(page) - 1)
        page_df, total_pages = get_page(current_lazy_df, page, column, query)
        return page_df, _page_label(total_pages), page

    def search_func(column, query):
        """Apply the column/query filter and show its first page."""
        page_df, total_pages = get_page(current_lazy_df, 0, column, query)
        return page_df, _page_label(total_pages), 0

    def reset_func():
        """Drop the filter and return to the first unfiltered page.

        Also clears the column dropdown and the search box; otherwise the
        next Previous/Next click would silently re-apply the old filter,
        because those handlers read both inputs.
        """
        page_df, total_pages = get_page(current_lazy_df, 0)
        return page_df, _page_label(total_pages), 0, None, ""

    # --- Event Listeners ---
    next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
    prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
    search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
    reset_btn.click(reset_func, [], [data_table, total_pages_display, page_number, col_dropdown, search_text])

demo.launch()
|