File size: 7,923 Bytes
2ccb279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bff727e
 
 
2ccb279
 
bff727e
 
 
2ccb279
 
bff727e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ccb279
bff727e
 
 
 
2ccb279
 
bff727e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2ccb279
bff727e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
# import gradio as gr
# import polars as pl

# # Paths or HF Hub URLs for Parquet files
# RICH_PARQUET_PATH = "all_rich_dataset_cards.parquet"
# MISSING_PARQUET_PATH = "all_minimal_dataset_cards.parquet"

# ROWS_PER_PAGE = 50

# # Lazy load datasets
# lazy_rich = pl.scan_parquet(RICH_PARQUET_PATH)
# lazy_missing = pl.scan_parquet(MISSING_PARQUET_PATH)

# current_lazy_df = lazy_missing  # Default dataset

# # Helper function to fetch a page
# def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
#     filtered_df = lazy_df
#     if column and query:
#         query_lower = query.lower().strip()
#         # Case-insensitive search
#         filtered_df = filtered_df.with_columns([
#             pl.col(column).cast(pl.Utf8).str.to_lowercase().alias(column)
#         ]).filter(pl.col(column).str.contains(query_lower, literal=False))
#     start = page * ROWS_PER_PAGE
#     page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()
#     total_rows = filtered_df.collect().height
#     total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1
#     return page_df, total_pages

# # Initialize first page
# initial_df, total_pages = get_page(current_lazy_df, 0)
# columns = list(initial_df.columns)

# with gr.Blocks() as demo:
#     gr.Markdown("## Dataset Insight Portal")

#     # Dataset selection
#     dataset_select = gr.Dropdown(
#         choices=["DatasetCards rich in information", "DatasetCards missing information"],
#         value="DatasetCards missing information",
#         label="Select Dataset"
#     )

#     # Pagination controls
#     with gr.Row():
#         prev_btn = gr.Button("Previous", elem_id="small-btn")
#         next_btn = gr.Button("Next", elem_id="small-btn")
#         page_number = gr.Number(value=0, label="Page", precision=0)
#         total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

#     # Data table
#     data_table = gr.Dataframe(
#         value=initial_df, headers=columns, datatype="str",
#         interactive=False, row_count=ROWS_PER_PAGE
#     )

#     # Column search
#     with gr.Row():
#         col_dropdown = gr.Dropdown(choices=columns, label="Column")
#         search_text = gr.Textbox(label="Search")
#         search_btn = gr.Button("Search", elem_id="small-btn")
#         reset_btn = gr.Button("Reset", elem_id="small-btn")

#     # --- Functions ---
#     def load_dataset(dataset_choice):
#         global current_lazy_df
#         current_lazy_df = lazy_rich if dataset_choice == "DatasetCards rich in information" else lazy_missing
#         initial_df, total_pages = get_page(current_lazy_df, 0)
#         columns = list(initial_df.columns)
#         return (
#             gr.update(value=initial_df, headers=columns),
#             f"Total Pages: {total_pages}",
#             0,
#             gr.update(choices=columns, value=columns[0])
#         )

#     def next_page_func(page, column, query):
#         page += 1
#         page_df, total_pages = get_page(current_lazy_df, page, column, query)
#         if page >= total_pages:
#             page = total_pages - 1
#             page_df, total_pages = get_page(current_lazy_df, page, column, query)
#         return page_df, f"Total Pages: {total_pages}", page

#     def prev_page_func(page, column, query):
#         page -= 1
#         page = max(0, page)
#         page_df, total_pages = get_page(current_lazy_df, page, column, query)
#         return page_df, f"Total Pages: {total_pages}", page

#     def search_func(column, query):
#         page_df, total_pages = get_page(current_lazy_df, 0, column, query)
#         return page_df, f"Total Pages: {total_pages}", 0

#     def reset_func():
#         page_df, total_pages = get_page(current_lazy_df, 0)
#         return page_df, f"Total Pages: {total_pages}", 0

#     # --- Event Listeners ---
#     dataset_select.change(load_dataset, dataset_select, [data_table, total_pages_display, page_number, col_dropdown])
#     next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
#     prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
#     search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
#     reset_btn.click(reset_func, [], [data_table, total_pages_display, page_number])

# demo.launch()


import gradio as gr
import polars as pl

# Path of the combined Parquet file, given relative to the working directory.
COMBINED_PARQUET_PATH = "datasetcards.parquet"

# Number of rows shown per page of the table.
ROWS_PER_PAGE = 50

# Lazily scan the dataset; rows are only materialised when a query on this
# frame is collected (see get_page).
lazy_df = pl.scan_parquet(COMBINED_PARQUET_PATH)

# Helper function to fetch a page
def get_page(lazy_df: pl.LazyFrame, page: int, column: str = None, query: str = ""):
    """Return one page of rows together with the total number of pages.

    Args:
        lazy_df: Lazy frame to read from.
        page: Zero-based page index. ``None`` and negative values are treated
            as page 0.
        column: Name of the column to search in. No filtering is applied when
            this is empty or ``None``.
        query: Text to look for. It is matched case-insensitively as a plain
            substring (not as a regular expression). Blank or whitespace-only
            queries apply no filter.

    Returns:
        A ``(page_df, total_pages)`` tuple: ``page_df`` is a pandas DataFrame
        holding at most ``ROWS_PER_PAGE`` rows with their original values, and
        ``total_pages`` is the page count for the (filtered) data, never less
        than 1 so callers can always clamp to ``total_pages - 1``.
    """
    filtered_df = lazy_df
    needle = query.lower().strip() if query else ""
    if column and needle:
        # Build the lower-cased text only inside the filter predicate so the
        # rows returned to the caller keep their original values and dtypes.
        # literal=True keeps characters such as "(" or "+" typed by the user
        # from being interpreted as a regex pattern.
        matches = (
            pl.col(column)
            .cast(pl.Utf8)
            .str.to_lowercase()
            .str.contains(needle, literal=True)
        )
        filtered_df = filtered_df.filter(matches)

    # A negative offset would make slice() count from the end of the frame.
    start = max(int(page or 0), 0) * ROWS_PER_PAGE
    page_df = filtered_df.slice(start, ROWS_PER_PAGE).collect().to_pandas()

    # Count rows with an aggregation instead of materialising every row.
    total_rows = filtered_df.select(pl.len()).collect().item()
    total_pages = max(1, (total_rows - 1) // ROWS_PER_PAGE + 1)
    return page_df, total_pages

# Load page 0 once at import time: it seeds the table shown on start-up and
# supplies the column names used for the headers and the search dropdown.
initial_df, total_pages = get_page(lazy_df, page=0)
columns = initial_df.columns.tolist()

# Build the UI: a heading, pagination controls, the data table and a
# column-search row. The components created here are wired to their handlers
# further down inside this same Blocks context.
with gr.Blocks() as demo:
    gr.Markdown("## Dataset Insight Portal")
    gr.Markdown("This space allows you to explore the combined dataset of DatasetCards. "
                "You can navigate pages, search within columns, and inspect the dataset easily.")

    # Pagination controls (page numbers are zero-based: the field starts at 0)
    # NOTE(review): the same elem_id "small-btn" is assigned to four buttons in
    # this layout; confirm whether a shared CSS class (elem_classes) was intended.
    with gr.Row():
        prev_btn = gr.Button("Previous", elem_id="small-btn")
        next_btn = gr.Button("Next", elem_id="small-btn")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # Data table, pre-filled with the first page fetched at import time
    data_table = gr.Dataframe(
        value=initial_df, headers=columns, datatype="str",
        interactive=False, row_count=ROWS_PER_PAGE
    )

    # Column search: pick a column, type a query, then Search or Reset
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Column")
        search_text = gr.Textbox(label="Search")
        search_btn = gr.Button("Search", elem_id="small-btn")
        reset_btn = gr.Button("Reset", elem_id="small-btn")

    # --- Functions ---
    # Frame read by all handlers below; it is simply the module-level lazy_df.
    current_lazy_df = lazy_df  # single dataset

    def next_page_func(page, column, query):
        """Advance one page, stopping at the last page of the current search.

        Args:
            page: Current zero-based page number from the Page field. An empty
                field (``None``) is treated as page 0.
            column: Column selected for searching, or ``None``.
            query: Current search text.

        Returns:
            ``(page_df, label_text, page)`` for the table, the page-count
            label and the Page field.
        """
        # Never go below page 0, even if a negative number was typed in.
        page = max(int(page or 0) + 1, 0)
        page_df, total_pages = get_page(current_lazy_df, page, column, query)

        # Guard against a page count of zero so the page never becomes -1.
        last_page = max(total_pages - 1, 0)
        if page > last_page:
            # Stepped past the end: show the last page instead.
            page = last_page
            page_df, total_pages = get_page(current_lazy_df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page

    def prev_page_func(page, column, query):
        """Go back one page, staying within the pages of the current search.

        Args:
            page: Current zero-based page number from the Page field. An empty
                field (``None``) is treated as page 0.
            column: Column selected for searching, or ``None``.
            query: Current search text.

        Returns:
            ``(page_df, label_text, page)`` for the table, the page-count
            label and the Page field.
        """
        page = max(int(page or 0) - 1, 0)
        page_df, total_pages = get_page(current_lazy_df, page, column, query)

        # If a page number beyond the end was typed in, land on the last page
        # rather than showing an empty table (mirrors next_page_func).
        last_page = max(total_pages - 1, 0)
        if page > last_page:
            page = last_page
            page_df, total_pages = get_page(current_lazy_df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page

    def search_func(column, query):
        """Apply the search and show the first page of the results.

        Returns ``(page_df, label_text, page)`` with ``page`` always 0.
        """
        first_page = 0
        matches, page_count = get_page(current_lazy_df, first_page, column, query)
        label_text = f"Total Pages: {page_count}"
        return matches, label_text, first_page

    def reset_func():
        """Clear the search and return to the first unfiltered page.

        Returns:
            ``(page_df, label_text, page, search_text, column)`` in the order
            wired to ``reset_btn.click`` below: the first page of the full
            dataset, the page-count label, page 0, an empty search string and
            a cleared column selection.
        """
        page_df, total_pages = get_page(current_lazy_df, 0)
        return page_df, f"Total Pages: {total_pages}", 0, "", None

    # --- Event Listeners ---
    next_btn.click(next_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
    prev_btn.click(prev_page_func, [page_number, col_dropdown, search_text], [data_table, total_pages_display, page_number])
    search_btn.click(search_func, [col_dropdown, search_text], [data_table, total_pages_display, page_number])
    # Reset also clears the search inputs; otherwise the next Previous/Next
    # click would re-apply the stale filter it reads from those fields.
    reset_btn.click(
        reset_func,
        [],
        [data_table, total_pages_display, page_number, search_text, col_dropdown],
    )

# Start the Gradio app. This runs whenever the module is executed or imported,
# since there is no __main__ guard around it.
demo.launch()