Santosh committed on
Commit
0e48604
·
1 Parent(s): aa595c3

made changes

Browse files
Files changed (1) hide show
  1. app.py +263 -98
app.py CHANGED
@@ -485,119 +485,284 @@
485
 
486
  import gradio as gr
487
  import polars as pl
488
- import os
489
- import subprocess
490
  import threading
491
  import time
 
 
492
 
493
- # --- Config ---
494
- COMBINED_PARQUET_PATH = "datasetcards_new.parquet"
495
- UPDATED_PARQUET_PATH = "datasetcards_new.parquet" # overwrite same file
496
- ROWS_PER_PAGE = 50
497
- ORG_NAME = "hugging-science" # replace with your org
498
- SPACE_NAME = "dataset-insight-portal" # replace with your space
499
-
500
- # --- Load dataset ---
501
- df = pl.read_parquet(COMBINED_PARQUET_PATH).with_columns([
502
- pl.lit("").alias("assigned_to"),
503
- pl.lit("todo").alias("status")
504
- ])
505
- columns = df.columns
506
- total_pages = (len(df) + ROWS_PER_PAGE - 1) // ROWS_PER_PAGE
507
-
508
- # --- Git push helpers ---
509
- def save_and_push():
510
- """Commit and push parquet file changes to the repo."""
511
- try:
512
- subprocess.run(["git", "config", "--global", "user.email", "[email protected]"])
513
- subprocess.run(["git", "config", "--global", "user.name", "Santosh Sanjeev"])
514
-
515
- hf_token = os.environ["HF_TOKEN"]
516
- repo_url = f"https://user:{hf_token}@huggingface.co/spaces/{ORG_NAME}/{SPACE_NAME}"
517
- subprocess.run(["git", "remote", "set-url", "origin", repo_url])
518
-
519
- # Commit only if parquet changed
520
- subprocess.run(["git", "add", UPDATED_PARQUET_PATH])
521
- result = subprocess.run(["git", "diff", "--cached", "--quiet"])
522
- if result.returncode != 0:
523
- subprocess.run(["git", "commit", "-m", "Auto-update parquet file"])
524
- subprocess.run(["git", "push", "origin", "main"])
525
- print("✅ Pushed parquet to repo")
526
- else:
527
- print("ℹ️ No parquet changes to push")
528
- except Exception as e:
529
- print("⚠️ Push failed:", e)
530
 
531
- def auto_push_loop(interval=300):
532
- """Run save_and_push every `interval` seconds (default 5 min)."""
533
- while True:
534
- save_and_push()
535
- time.sleep(interval)
536
 
537
- # --- Gradio app functions ---
538
- def get_page(page_num, col, search_text, search_dropdown):
539
- global df
540
- filtered = df
541
-
542
- if col and col in df.columns:
543
- if col in DROPDOWN_COLUMNS and search_dropdown:
544
- filtered = filtered.filter(pl.col(col) == search_dropdown)
545
- elif search_text:
546
- filtered = filtered.filter(pl.col(col).cast(str).str.contains(search_text, literal=False))
547
-
548
- total_pages = (len(filtered) + ROWS_PER_PAGE - 1) // ROWS_PER_PAGE
549
- start, end = (page_num - 1) * ROWS_PER_PAGE, page_num * ROWS_PER_PAGE
550
- page_df = filtered[start:end]
551
- return page_df.to_pandas(), f"of {total_pages}", page_num, "", "", ""
552
-
553
- def save_changes(dataset_id, assigned_to, status):
554
- global df
555
- mask = df["dataset_id"] == dataset_id
556
- if mask.any():
557
- df = df.with_columns([
558
- pl.when(mask).then(assigned_to).otherwise(df["assigned_to"]).alias("assigned_to"),
559
- pl.when(mask).then(status).otherwise(df["status"]).alias("status")
560
- ])
561
- df.write_parquet(UPDATED_PARQUET_PATH)
562
- save_and_push() # push immediately after change
563
- return f"Saved for {dataset_id} ✅"
564
 
565
- def refresh_all(page_num, col, search_text, search_dropdown):
566
- return get_page(page_num, col, search_text, search_dropdown)
 
 
567
 
568
- # --- UI ---
569
- DROPDOWN_COLUMNS = ["status", "assigned_to"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570
 
571
  with gr.Blocks() as demo:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  with gr.Row():
573
- col_dropdown = gr.Dropdown(choices=columns, label="Search Column")
574
- search_text = gr.Textbox(label="Search Text")
575
- search_dropdown = gr.Dropdown(choices=["todo", "inprogress", "PR submitted", "PR merged"], label="Status")
576
-
 
 
 
 
 
 
 
 
 
 
 
577
  with gr.Row():
578
- page_number = gr.Number(value=1, precision=0, label="Page #")
579
- total_pages_display = gr.Textbox(value=f"of {total_pages}", interactive=False)
580
-
581
- data_table = gr.Dataframe(headers=columns, datatype=["str"] * len(columns), row_count=ROWS_PER_PAGE)
 
582
 
583
- selected_dataset_id = gr.Textbox(label="Selected Dataset ID", interactive=False)
584
- assigned_to_input = gr.Textbox(label="Assigned To")
585
- status_input = gr.Dropdown(choices=["todo", "inprogress", "PR submitted", "PR merged"], label="Status")
 
586
 
587
  save_btn = gr.Button("Save Changes")
588
- refresh_btn = gr.Button("Refresh")
589
-
590
- output_msg = gr.Textbox(label="Message", interactive=False)
591
 
592
- page_number.change(get_page, inputs=[page_number, col_dropdown, search_text, search_dropdown],
593
- outputs=[data_table, total_pages_display, page_number,
594
- selected_dataset_id, assigned_to_input, status_input])
595
- save_btn.click(save_changes, inputs=[selected_dataset_id, assigned_to_input, status_input], outputs=[output_msg])
596
- refresh_btn.click(refresh_all, inputs=[page_number, col_dropdown, search_text, search_dropdown],
597
- outputs=[data_table, total_pages_display, page_number,
598
- selected_dataset_id, assigned_to_input, status_input])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
 
600
- # 🔄 Start auto-push loop
601
  threading.Thread(target=auto_push_loop, args=(300,), daemon=True).start()
602
 
603
- demo.launch()
 
485
 
486
  import gradio as gr
487
  import polars as pl
488
+ from huggingface_hub import HfApi
489
+ import re
490
  import threading
491
  import time
492
+ import subprocess
493
+ import os
494
 
495
# --- Hugging Face organization used throughout the app ---
api = HfApi()
org_name = "hugging-science"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
def fetch_members():
    """Return the usernames of all members of the configured HF org."""
    return [m.username for m in api.list_organization_members(org_name)]


# Snapshot of org members taken at startup; refreshed by "Refresh All".
member_list = fetch_members()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
504
 
505
# --- Dataset ---
COMBINED_PARQUET_PATH = "datasetcards_new.parquet"
UPDATED_PARQUET_PATH = "datasetcards_new.parquet"  # overwritten in place on save
ROWS_PER_PAGE = 50

# Load the dataset, most-downloaded / most-recently-modified first.
df = pl.read_parquet(COMBINED_PARQUET_PATH).sort(
    by=["downloads", "last_modified", "usedStorage"],
    descending=[True, True, True],
)

# Normalise the "reason" column: collapse every variant containing
# "short description" to that canonical label, and map nulls to "".
if "reason" in df.columns:
    df = df.with_columns([
        pl.Series(
            "reason",
            ["short description" if x and "short description" in x.lower() else (x if x is not None else "") for x in df["reason"]]
        )
    ])

# Ensure the editable columns exist WITHOUT clobbering values already
# persisted in the parquet. (Previously `status`/`assigned_to` were
# unconditionally reset with pl.lit(...) on every restart, wiping all
# saved assignments and making this loop dead code.)
for col in ["assigned_to", "status"]:
    default_val = "" if col == "assigned_to" else "todo"
    if col not in df.columns:
        df = df.with_columns(pl.lit(default_val).alias(col))
    else:
        # Column exists (e.g. written by a previous save): keep its values,
        # only replace nulls with the default.
        df = df.with_columns(pl.col(col).fill_null(default_val))
533
+
534
# --- Columns ---
DROPDOWN_COLUMNS = ["reason", "category", "field", "keyword", "assigned_to", "status"]
STATUS_OPTIONS = ["todo", "inprogress", "PR submitted", "PR merged"]

# Distinct values offered by the search dropdown, per filterable column.
# `assigned_to` and `status` are skipped in the comprehension because their
# choices come from the org member list / fixed status options below —
# computing them from the frame would be wasted work that was immediately
# overwritten (and sorting a column containing None could raise).
unique_values = {
    col: sorted(df[col].drop_nulls().unique().to_list())
    for col in DROPDOWN_COLUMNS
    if col not in ("assigned_to", "status")
}
unique_values['assigned_to'] = sorted(member_list)
unique_values['status'] = STATUS_OPTIONS
542
+
543
# --- Helper to get page ---
def get_page(df, page, column=None, query=None):
    """Return one page of `df` as a pandas frame, plus the total page count.

    Parameters:
        df: polars DataFrame to page through.
        page: zero-based page index (no clamping is done here).
        column: optional column name to filter on.
        query: exact value for DROPDOWN_COLUMNS, substring otherwise.
    """
    filtered_df = df
    if column and query:
        if column in DROPDOWN_COLUMNS:
            # Categorical columns: exact match against the chosen value.
            filtered_df = filtered_df.filter(pl.col(column) == query)
        else:
            q = query.lower().strip()
            # Case-insensitive substring match. Filtering on a derived
            # expression (instead of overwriting the column with its
            # lowercased form, as before) keeps the displayed values
            # intact; the Utf8 cast also makes numeric columns searchable
            # instead of raising.
            filtered_df = filtered_df.filter(
                pl.col(column).cast(pl.Utf8).str.to_lowercase().str.contains(q, literal=False)
            )
    start = page * ROWS_PER_PAGE
    page_df = filtered_df[start:start + ROWS_PER_PAGE].to_pandas().fillna("")
    total_rows = filtered_df.height
    # Ceiling division; an empty result still reports one page.
    total_pages = (total_rows - 1) // ROWS_PER_PAGE + 1 if total_rows > 0 else 1
    return page_df, total_pages


# First page shown at startup; `columns` drives the table headers.
initial_df, total_pages = get_page(df, 0)
columns = list(initial_df.columns)
563
 
564
with gr.Blocks() as demo:
    gr.Markdown("""
# Dataset Insight Portal

Welcome! This portal helps you explore and manage datasets from our Hugging Face organization.

## What is this space for?
This space provides a table of datasets along with metadata. You can:
- Browse datasets with pagination.
- Search datasets by various fields.
- Assign responsibility for reviewing datasets (`assigned_to`).
- Track progress using `status`.

## Why the table?
The table gives a structured view of all datasets, making it easy to sort, filter, and update information for each dataset. It consists of all datasets until 20-09-2025.

## What does the table contain?
Each row represents a dataset. Columns include:
- **dataset_id**: Unique identifier of the dataset.
- **dataset_url**: Link to the dataset page on Hugging Face.
- **downloads**: Number of downloads.
- **author**: Dataset author.
- **license**: License type.
- **tags**: Tags describing the dataset. Obtained from the dataset card.
- **task_categories**: Categories of tasks the dataset is useful for. Obtained from the dataset card.
- **last_modified**: Date of last update.
- **field, keyword**: Metadata columns describing dataset purpose based on heuristics. Use the `field` and `keyword` to filter for science based datasets.
- **category**: Category of the dataset (`rich` means it is good dataset card. `minimal` means it needs improvement for the reasons below).
- **reason**: Reason why the dataset is classified as `minimal`. Options: `Failed to load card`, `No metadata and no description`, `No metadata and has description`, `Short description`.
- **usedStorage**: Storage used by the dataset (bytes).
- **assigned_to**: Person responsible for the dataset (editable).
- **status**: Progress status (editable). Options: `todo`, `inprogress`, `PR submitted`, `PR merged`.

## How to use search
- Select a **column** from the dropdown.
- If the column is textual, type your query in the text box.
- If the column is a dropdown (like `assigned_to` or `status`), select the value from the dropdown.
- Click **Search** to filter the table.

## How to add or update `assigned_to` and `status`
1. Search for the **dataset_id** initially.
2. Then, select the **dataset_id** from the dropdown below the table.
3. Choose the person responsible in **Assigned To**. If you are a member of the organization, your username should appear in the list. Else refresh and try again.
4. Select the current status in **Status**.
5. Click **Save Changes** to update the table and persist the changes.
6. Use **Refresh All** to reload the table and the latest members list.

This portal makes it easy to keep track of dataset reviews, assignments, and progress all in one place.
""")

    # --- Pagination controls ---
    with gr.Row():
        prev_btn = gr.Button("Previous")
        next_btn = gr.Button("Next")
        page_number = gr.Number(value=0, label="Page", precision=0)
        total_pages_display = gr.Label(value=f"Total Pages: {total_pages}")

    # --- Data table ---
    data_table = gr.Dataframe(
        value=initial_df,
        headers=columns,
        datatype="str",
        interactive=False,
        row_count=ROWS_PER_PAGE
    )

    # --- Search controls ---
    with gr.Row():
        col_dropdown = gr.Dropdown(choices=columns, label="Column to Search")
        search_text = gr.Textbox(label="Search Text")
        search_dropdown = gr.Dropdown(choices=[], label="Select Value", visible=False)
        search_btn = gr.Button("Search")
        reset_btn = gr.Button("Reset")

    # --- Dataset selection & editable fields ---
    selected_dataset_id = gr.Dropdown(label="Select dataset_id", choices=initial_df['dataset_id'].tolist())
    assigned_to_input = gr.Dropdown(choices=member_list, label="Assigned To")
    status_input = gr.Dropdown(choices=STATUS_OPTIONS, label="Status", value="todo")

    save_btn = gr.Button("Save Changes")
    refresh_btn = gr.Button("Refresh All")
    save_message = gr.Textbox(label="Save Status", interactive=False)

    # --- Show the value dropdown for categorical columns, textbox otherwise ---
    def update_search_input(column):
        if column in DROPDOWN_COLUMNS:
            return gr.update(choices=unique_values[column], visible=True), gr.update(visible=False)
        return gr.update(visible=False), gr.update(visible=True)

    col_dropdown.change(update_search_input, col_dropdown, [search_dropdown, search_text])

    # --- Prefill editable fields for the selected dataset ---
    def prefill_fields(dataset_id):
        """Return (assigned_to, status) for `dataset_id`, with safe defaults."""
        if not dataset_id:
            return "", "todo"
        dataset_id = str(dataset_id)
        # Row lookup via a polars filter instead of materialising the whole
        # frame with to_dicts() on every selection (that was O(n) per click).
        match = df.filter(pl.col("dataset_id").cast(pl.Utf8) == dataset_id)
        if match.height == 0:
            return "", "todo"
        row = match.row(0, named=True)
        return row.get("assigned_to", ""), row.get("status", "todo")

    selected_dataset_id.change(prefill_fields, selected_dataset_id, [assigned_to_input, status_input])

    # --- Search ---
    def search_func(page, column, txt, ddl):
        """Apply the filter and jump back to the first page of results."""
        query = ddl if column in DROPDOWN_COLUMNS else txt
        # Fetch page 0, matching the page indicator we return (previously the
        # stale `page` of data was shown while the indicator was reset to 0).
        page_df, total_pages = get_page(df, 0, column, query)
        return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist())

    # --- Pagination ---
    def next_page(page, column, txt, ddl):
        """Advance one page, clamping to the last page of the filtered view."""
        page += 1
        query = ddl if column in DROPDOWN_COLUMNS else txt
        page_df, total_pages = get_page(df, page, column, query)
        if page >= total_pages:
            page = total_pages - 1
            page_df, total_pages = get_page(df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist())

    def prev_page(page, column, txt, ddl):
        """Go back one page, never below page 0."""
        page = max(0, page - 1)
        query = ddl if column in DROPDOWN_COLUMNS else txt
        page_df, total_pages = get_page(df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist())

    def reset_func():
        """Clear any filter and return to the first page."""
        page_df, total_pages = get_page(df, 0)
        return page_df, f"Total Pages: {total_pages}", 0, gr.update(choices=page_df['dataset_id'].tolist())

    # --- Save changes ---
    def save_changes(dataset_id, assigned_to_val, status_val, page_val, col, txt, ddl):
        """Persist assignment/status for one dataset and refresh the visible page."""
        global df
        if not dataset_id:
            # gr.update() with no fields is a no-op for each component;
            # returning None here would *clear* the table and dropdowns.
            return gr.update(value="Please select a row first."), gr.update(), gr.update(), gr.update()
        df = df.with_columns([
            pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(assigned_to_val)).otherwise(pl.col("assigned_to")).alias("assigned_to"),
            pl.when(pl.col("dataset_id") == dataset_id).then(pl.lit(status_val)).otherwise(pl.col("status")).alias("status")
        ])
        df.write_parquet(UPDATED_PARQUET_PATH)
        query = txt if col not in DROPDOWN_COLUMNS else ddl
        page_df, total_pages = get_page(df, page_val, col, query)
        return (
            gr.update(value=f"Saved changes for dataset_id: {dataset_id}"),
            page_df,
            gr.update(choices=page_df['dataset_id'].tolist()),
            f"Total Pages: {total_pages}"
        )

    # --- Refresh All: reload members, dataset, and the current page ---
    def refresh_all(page, column, txt, ddl):
        global df, member_list, unique_values
        # Refresh the org member list shown in the Assigned To dropdown.
        member_list = fetch_members()
        unique_values['assigned_to'] = sorted(member_list)
        # Reload the table from disk; keep the in-memory copy if missing.
        try:
            df = pl.read_parquet(UPDATED_PARQUET_PATH)
        except FileNotFoundError:
            pass
        query = txt if column not in DROPDOWN_COLUMNS else ddl
        page_df, total_pages = get_page(df, page, column, query)
        return page_df, f"Total Pages: {total_pages}", page, gr.update(choices=page_df['dataset_id'].tolist()), gr.update(choices=member_list)

    # --- Wire buttons ---
    inputs_search = [page_number, col_dropdown, search_text, search_dropdown]
    outputs_search = [data_table, total_pages_display, page_number, selected_dataset_id]

    search_btn.click(search_func, inputs_search, outputs_search)
    next_btn.click(next_page, inputs_search, outputs_search)
    prev_btn.click(prev_page, inputs_search, outputs_search)
    reset_btn.click(reset_func, [], outputs_search)
    save_btn.click(
        save_changes,
        [selected_dataset_id, assigned_to_input, status_input, page_number, col_dropdown, search_text, search_dropdown],
        [save_message, data_table, selected_dataset_id, total_pages_display]
    )
    refresh_btn.click(
        refresh_all,
        inputs=[page_number, col_dropdown, search_text, search_dropdown],
        outputs=[data_table, total_pages_display, page_number, selected_dataset_id, assigned_to_input]
    )

    # --- Background auto-push of the parquet file ---
    def auto_push_loop(interval=300):
        """Commit & push the parquet to the Space repo every `interval` seconds."""
        while True:
            try:
                subprocess.run(["git", "config", "--global", "user.email", "[email protected]"])
                subprocess.run(["git", "config", "--global", "user.name", "Santosh Sanjeev"])
                hf_token = os.environ["HF_TOKEN"]
                # Use the real Space name taken from the previous revision's
                # config — the literal "<your-space-name>" placeholder here
                # made every push fail.
                repo_url = f"https://user:{hf_token}@huggingface.co/spaces/{org_name}/dataset-insight-portal"
                subprocess.run(["git", "remote", "set-url", "origin", repo_url])
                subprocess.run(["git", "add", UPDATED_PARQUET_PATH])
                # `git diff --cached --quiet` exits non-zero when something is staged.
                result = subprocess.run(["git", "diff", "--cached", "--quiet"])
                if result.returncode != 0:
                    subprocess.run(["git", "commit", "-m", "Auto-update parquet file"])
                    subprocess.run(["git", "push", "origin", "main"])
                    print("✅ Parquet pushed")
            except Exception as e:
                print("⚠️ Push failed:", e)
            time.sleep(interval)
765
 
 
766
# Start the background auto-push loop, then hand control to the UI.
pusher = threading.Thread(target=auto_push_loop, args=(300,), daemon=True)
pusher.start()

demo.launch()