Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import regex as re | |
| import csv | |
| import pandas as pd | |
| from analyzer import combine_repo_files_for_llm, analyze_combined_file, parse_llm_json_response | |
| from hf_utils import download_space_repo, search_top_spaces | |
| from chatbot_page import chat_with_user, extract_keywords_from_conversation | |
| # Import chatbot logic | |
| from analyzer import analyze_code | |
# System prompt describing the assistant's role in the repo-discovery chat:
# ask clarifying questions, then produce roughly five search keywords.
CHATBOT_SYSTEM_PROMPT = "".join(
    (
        "You are a helpful assistant. Your goal is to help the user describe their ideal open-source repo. ",
        "Ask questions to clarify what they want, their use case, preferred language, features, etc. ",
        "When the user clicks 'End Chat', analyze the conversation and return about 5 keywords for repo search. ",
        "Return only the keywords as a comma-separated list.",
    )
)
def read_csv_as_text(csv_filename):
    """Load a CSV file into a DataFrame whose cells are all text.

    Args:
        csv_filename: Path of the CSV file to read.

    Returns:
        A ``pandas.DataFrame`` in which every column is read as ``str``.
        Blank cells come back as empty strings rather than ``NaN``.
    """
    # dtype=str alone still lets pandas convert blank cells (and literal
    # values such as "NA" or "null") to float NaN, which later shows up as
    # the text "nan". Disabling the default NA markers keeps every cell as
    # the exact text stored in the file.
    return pd.read_csv(csv_filename, dtype=str, keep_default_na=False)
def process_repo_input(text):
    """Turn free-text repo IDs into the analysis table stored in repo_ids.csv.

    Args:
        text: Repo IDs separated by commas and/or newlines.

    Returns:
        An empty DataFrame with the five analysis columns when ``text`` is
        falsy; otherwise the table read back from ``repo_ids.csv`` after it
        has been rewritten with one row per repo ID and blank analysis cells.
    """
    columns = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
    if not text:
        return pd.DataFrame(columns=columns)

    # Commas and newlines both act as separators; blank pieces are dropped.
    stripped = (piece.strip() for piece in re.split(r'[\n,]+', text))
    repo_ids = [piece for piece in stripped if piece]

    # Header row first, then one row per repo with empty analysis fields.
    rows = [columns] + [[repo_id, "", "", "", ""] for repo_id in repo_ids]
    csv_filename = "repo_ids.csv"
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
        csv.writer(csvfile).writerows(rows)

    # Return what is actually on disk so the display matches the file.
    return read_csv_as_text(csv_filename)
# Module-level state shared between the button callbacks.
# last_repo_ids: the most recently submitted/searched repo ids, in order.
# current_repo_idx: position in last_repo_ids of the next repo to analyze.
last_repo_ids = []
current_repo_idx = 0
# Keywords extracted at the end of the chatbot conversation.
generated_keywords = []
def process_repo_input_and_store(text):
    """Parse repo IDs, remember them for the analysis step, and rebuild the CSV.

    Updates the module globals ``last_repo_ids`` and ``current_repo_idx`` so
    that the analysis callback starts again from the first submitted repo.

    Args:
        text: Repo IDs separated by commas and/or newlines.

    Returns:
        An empty DataFrame with the five analysis columns when ``text`` is
        falsy (the stored repo list is cleared too); otherwise the table read
        back from the freshly written ``repo_ids.csv``.
    """
    global last_repo_ids, current_repo_idx
    columns = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]

    if not text:
        last_repo_ids, current_repo_idx = [], 0
        return pd.DataFrame(columns=columns)

    parsed_ids = [chunk.strip() for chunk in re.split(r'[\n,]+', text) if chunk.strip()]
    # A new submission replaces the queue and restarts from its first entry.
    last_repo_ids, current_repo_idx = parsed_ids, 0

    csv_filename = "repo_ids.csv"
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
        out = csv.writer(csvfile)
        out.writerow(columns)
        out.writerows([repo_id, "", "", "", ""] for repo_id in parsed_ids)

    return read_csv_as_text(csv_filename)
def keyword_search_and_update(keyword):
    """Search Spaces for each keyword and make the hits the current repo list.

    Each keyword is passed to ``search_top_spaces(kw, limit=5)``; the results
    (assumed to be hashable repo id values -- confirm against ``hf_utils``)
    are de-duplicated in first-seen order, stored in the module globals, and
    written to ``repo_ids.csv`` with blank analysis cells.

    Args:
        keyword: One or more keywords separated by commas and/or newlines.

    Returns:
        An empty DataFrame with the five analysis columns when ``keyword`` is
        falsy (stored state is left untouched); otherwise the table read back
        from the freshly written ``repo_ids.csv``.
    """
    global last_repo_ids, current_repo_idx
    columns = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
    if not keyword:
        return pd.DataFrame(columns=columns)

    # Accept multiple keywords, comma or newline separated.
    terms = [term.strip() for term in re.split(r'[\n,]+', keyword) if term.strip()]

    # Gather hits keyword by keyword, then drop repeats while keeping the
    # first occurrence of each id (dict keys preserve insertion order).
    hits = [rid for term in terms for rid in search_top_spaces(term, limit=5)]
    unique_repo_ids = list(dict.fromkeys(hits))

    last_repo_ids, current_repo_idx = unique_repo_ids, 0

    csv_filename = "repo_ids.csv"
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
        out = csv.writer(csvfile)
        out.writerow(columns)
        out.writerows([repo_id, "", "", "", ""] for repo_id in unique_repo_ids)

    return read_csv_as_text(csv_filename)
def show_combined_repo_and_llm():
    """Download, combine and analyze the next queued repo, then record the result.

    Works on ``last_repo_ids[current_repo_idx]``:

    1. downloads the repo into ``repo_files``,
    2. combines its files into one text file and reads that file,
    3. runs the LLM analysis and parses its JSON reply,
    4. writes strength / weaknesses / speciality / relevance rating into the
       matching row of ``repo_ids.csv``,
    5. advances ``current_repo_idx`` for the next click.

    The index only advances once the analysis step has run, so a repo whose
    download or combined-file read fails is retried on the next click.

    Returns:
        A 3-tuple ``(combined_content, summary, df)``: the combined file text
        (or a status/error message), a summary of the extraction status with
        strengths and weaknesses (empty on early exits), and the current
        contents of ``repo_ids.csv`` as a DataFrame.
    """
    global current_repo_idx
    csv_filename = "repo_ids.csv"

    if not last_repo_ids:
        return "No repo ID available. Please submit repo IDs first.", "", pd.DataFrame()
    if current_repo_idx >= len(last_repo_ids):
        return "All repo IDs have been processed.", "", read_csv_as_text(csv_filename)

    repo_id = last_repo_ids[current_repo_idx]
    # Broad catches below are deliberate: this is a UI callback and any
    # failure should be reported in the output box instead of crashing.
    try:
        download_space_repo(repo_id, local_dir="repo_files")
    except Exception as e:
        return f"Error downloading repo: {e}", "", read_csv_as_text(csv_filename)

    txt_path = combine_repo_files_for_llm()
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            combined_content = f.read()
    except Exception as e:
        return f"Error reading {txt_path}: {e}", "", read_csv_as_text(csv_filename)

    llm_output = analyze_combined_file(txt_path)
    llm_json = parse_llm_json_response(llm_output)

    # Update CSV for the current repo id
    extraction_status = ""
    strengths = ""
    weaknesses = ""
    try:
        df = read_csv_as_text(csv_filename)
        for col in ["strength", "weaknesses", "speciality", "relevance rating"]:
            # Blank cells may load as NaN; astype(str) alone would turn them
            # into the literal text "nan" and persist that for every repo
            # that has not been analyzed yet.
            df[col] = df[col].fillna("").astype(str)
        for idx, row in df.iterrows():
            if row["repo id"] == repo_id:
                if isinstance(llm_json, dict) and "error" not in llm_json:
                    extraction_status = "JSON extraction: SUCCESS"
                    strengths = llm_json.get("strength", "")
                    weaknesses = llm_json.get("weaknesses", "")
                    df.at[idx, "strength"] = strengths
                    df.at[idx, "weaknesses"] = weaknesses
                    df.at[idx, "speciality"] = llm_json.get("speciality", "")
                    df.at[idx, "relevance rating"] = llm_json.get("relevance rating", "")
                else:
                    extraction_status = f"JSON extraction: FAILED\nRaw: {llm_json.get('raw', '') if isinstance(llm_json, dict) else llm_json}"
                # Only the first row matching this repo id is updated.
                break
        df.to_csv(csv_filename, index=False)
    except Exception as e:
        extraction_status = f"CSV update error: {e}"
        # The re-read can fail for the same reason the update did (e.g. the
        # file is missing or unreadable); fall back to an empty table rather
        # than letting the callback raise.
        try:
            df = read_csv_as_text(csv_filename)
        except Exception:
            df = pd.DataFrame()

    # Move to next repo for next click
    current_repo_idx += 1
    summary = f"{extraction_status}\n\nStrengths:\n{strengths}\n\nWeaknesses:\n{weaknesses}"
    return combined_content, summary, df
def go_to_analysis():
    """Return four visibility updates: hide, show, hide, hide.

    The updates are positional; which page each one affects is decided by the
    ``outputs`` list of the event this function is attached to.
    """
    visibility = (False, True, False, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
def go_to_input():
    """Return four visibility updates: hide, show, hide, hide.

    The updates are positional; which page each one affects is decided by the
    ``outputs`` list of the event this function is attached to, so the page to
    reveal must be the second entry of that list.
    """
    visibility = (False, True, False, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
def go_to_chatbot():
    """Return four visibility updates: hide, hide, show, hide.

    The updates are positional; the page to reveal must be the third entry of
    the ``outputs`` list this function is wired to.
    """
    visibility = (False, False, True, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
def go_to_start():
    """Return four visibility updates: show, hide, hide, hide.

    The updates are positional; the page to reveal must be the first entry of
    the ``outputs`` list this function is wired to.
    """
    visibility = (True, False, False, False)
    return tuple(gr.update(visible=shown) for shown in visibility)
def go_to_results():
    """Return four visibility updates: hide, hide, hide, show.

    The updates are positional; the page to reveal must be the fourth entry of
    the ``outputs`` list this function is wired to.
    """
    visibility = (False, False, False, True)
    return tuple(gr.update(visible=shown) for shown in visibility)
# Components built ahead of the layout; they are placed on the input page
# later by calling .render() inside the Blocks context.
repo_id_input = gr.Textbox(
    label="Enter repo IDs (comma or newline separated)",
    lines=5,
    placeholder="repo1, repo2\nrepo3",
)
# NOTE(review): the table declares a sixth "Usecase" column that none of the
# CSV-writing callbacks in this file populate -- confirm whether it is needed.
df_output = gr.Dataframe(
    headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
    datatype=["str"] * 6,
)
def use_keywords_to_search_and_update_csv(keywords):
    """Search Spaces for chatbot-extracted keywords and rebuild the repo list.

    Each keyword is passed to ``search_top_spaces(kw, limit=3)``; the results
    (assumed to be hashable repo id values -- confirm against ``hf_utils``)
    are de-duplicated in first-seen order, stored in the module globals
    ``last_repo_ids`` / ``current_repo_idx``, and written to ``repo_ids.csv``
    with blank analysis cells.

    Args:
        keywords: Keywords separated by commas and/or newlines.

    Returns:
        An empty DataFrame with the five analysis columns when ``keywords``
        is falsy (stored state is left untouched); otherwise the table read
        back from the freshly written ``repo_ids.csv``.
    """
    global last_repo_ids, current_repo_idx
    columns = ["repo id", "strength", "weaknesses", "speciality", "relevance rating"]
    if not keywords:
        return pd.DataFrame(columns=columns)

    # Split on commas *and* newlines, like the manual keyword search does.
    # The keywords come from free-form model output, so a list spread over
    # several lines would otherwise collapse into one multi-line query.
    keyword_list = [k.strip() for k in re.split(r'[\n,]+', keywords) if k.strip()]

    repo_ids = []
    for kw in keyword_list:
        repo_ids.extend(search_top_spaces(kw, limit=3))  # limit=3 per keyword

    # Remove duplicates while preserving order (dict keys keep insertion order).
    unique_repo_ids = list(dict.fromkeys(repo_ids))

    last_repo_ids = unique_repo_ids
    current_repo_idx = 0

    csv_filename = "repo_ids.csv"
    with open(csv_filename, mode="w", newline='', encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(columns)
        for repo_id in unique_repo_ids:
            writer.writerow([repo_id, "", "", "", ""])

    return read_csv_as_text(csv_filename)
# UI definition: five pages stacked as Columns inside one Blocks app. Only one
# page is meant to be visible at a time; buttons switch pages by returning
# visibility updates for the Columns listed in each event's ``outputs``.
with gr.Blocks() as demo:
    page_state = gr.State(0)

    # --- Start Page: Option Selection ---
    with gr.Column(visible=True) as start_page:
        gr.Markdown("## Welcome! How would you like to proceed?")
        option_a_btn = gr.Button("A) I know which repos I want to search and research about")
        option_b_btn = gr.Button("B) I don't know exactly what I want (Chatbot)")

    # --- Page 1: Input ---
    with gr.Column(visible=False) as input_page:
        gr.Markdown("## Enter Keyword or Repo IDs")
        keyword_input = gr.Textbox(label="Enter keywords to search repos (comma or newline separated)", lines=2, placeholder="e.g. audio, vision\ntext")
        keyword_btn = gr.Button("Search and Update Repo List")
        repo_id_box = repo_id_input.render()
        df_box = df_output.render()
        submit_btn = gr.Button("Submit Repo IDs")
        next_btn = gr.Button("Next: Go to Analysis")
        back_to_start_btn = gr.Button("Back to Start")

    # --- Page 2: Analysis ---
    with gr.Column(visible=False) as analysis_page:
        gr.Markdown("## Combine and Display Repo Files")
        combine_btn = gr.Button("Download, Combine & Show .py/.md Files from Next Repo and Analyze")
        combined_txt = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt = gr.Textbox(label="LLM Analysis Output", lines=10)
        df_display = gr.Dataframe(
            headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
            datatype=["str", "str", "str", "str", "str", "str"]
        )
        back_btn = gr.Button("Back to Input")
        back_to_start_btn2 = gr.Button("Back to Start")

    # --- Page 3: Chatbot ---
    with gr.Column(visible=False) as chatbot_page:
        gr.Markdown("## Repo Recommendation Chatbot")
        chatbot = gr.Chatbot()
        state = gr.State([])
        user_input = gr.Textbox(label="Your message", placeholder="Describe your ideal repo or answer the assistant's questions...")
        send_btn = gr.Button("Send")
        end_btn = gr.Button("End Chat and Extract Keywords")
        keywords_output = gr.Textbox(label="Extracted Keywords for Repo Search", interactive=False)
        go_to_results_btn = gr.Button("Find Repos with These Keywords")
        back_to_start_btn3 = gr.Button("Back to Start")

    # --- Page 4: Results after Chatbot ---
    with gr.Column(visible=False) as results_page:
        gr.Markdown("## Repo Results Based on Your Conversation")
        results_df = gr.Dataframe(
            headers=["repo id", "strength", "weaknesses", "speciality", "relevance rating", "Usecase"],
            datatype=["str", "str", "str", "str", "str", "str"]
        )
        analyze_next_btn = gr.Button("Download, Combine & Analyze Next Repo")
        combined_txt_results = gr.Textbox(label="Combined Repo Files", lines=20)
        llm_output_txt_results = gr.Textbox(label="LLM Analysis Output", lines=10)
        back_to_start_btn4 = gr.Button("Back to Start")

    def leave_analysis_page():
        """Return two updates: show the first output page, hide the second."""
        return gr.update(visible=True), gr.update(visible=False)

    # Navigation logic
    option_a_btn.click(go_to_input, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])
    option_b_btn.click(go_to_chatbot, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])
    next_btn.click(go_to_analysis, inputs=None, outputs=[input_page, analysis_page, chatbot_page, results_page])
    # "Back to Input" used go_to_input (hide, show, hide, hide) against
    # [input_page, analysis_page, ...], which hid the input page and kept the
    # analysis page on screen. Show the input page and hide analysis instead.
    back_btn.click(leave_analysis_page, inputs=None, outputs=[input_page, analysis_page])
    back_to_start_btn.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])
    # From the analysis page the only visible page is analysis_page, which the
    # generic start-page output list does not contain; hide it explicitly so
    # it does not stay visible next to the start page.
    back_to_start_btn2.click(leave_analysis_page, inputs=None, outputs=[start_page, analysis_page])
    back_to_start_btn3.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])
    back_to_start_btn4.click(go_to_start, inputs=None, outputs=[start_page, input_page, chatbot_page, results_page])

    # Keyword and repo input logic
    keyword_btn.click(keyword_search_and_update, inputs=keyword_input, outputs=df_box)
    submit_btn.click(process_repo_input_and_store, inputs=repo_id_box, outputs=df_box)

    # Analysis logic
    combine_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt, llm_output_txt, df_display])

    # Chatbot logic
    def user_send(user_message, history):
        """Get the assistant's reply and append the exchange to the history.

        Returns the updated history twice (for the Chatbot display and for
        the State holder) plus an empty string that clears the input box.
        """
        assistant_reply = chat_with_user(user_message, history)
        history = history + [[user_message, assistant_reply]]
        return history, history, ""

    def end_chat(history):
        """Extract search keywords from the conversation and cache them.

        The comma-separated keyword string is returned for display; its
        stripped, non-empty parts replace the contents of the module-level
        ``generated_keywords`` list.
        """
        global generated_keywords
        keywords = extract_keywords_from_conversation(history)
        generated_keywords.clear()
        generated_keywords.extend([k.strip() for k in keywords.split(",") if k.strip()])
        return keywords

    def go_to_results_from_chatbot(keywords):
        """Run the keyword search and switch to the results page.

        Returns four visibility updates (only the fourth output is shown)
        followed by the DataFrame of found repos.
        """
        # Use the keywords to search and update the CSV, then display the DataFrame
        df = use_keywords_to_search_and_update_csv(keywords)
        return gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=True), df

    send_btn.click(user_send, inputs=[user_input, state], outputs=[chatbot, state, user_input])
    end_btn.click(end_chat, inputs=state, outputs=keywords_output)
    go_to_results_btn.click(
        go_to_results_from_chatbot,
        inputs=keywords_output,
        outputs=[chatbot_page, input_page, analysis_page, results_page, results_df]
    )

    # Add logic for the new button on results_page
    analyze_next_btn.click(show_combined_repo_and_llm, inputs=None, outputs=[combined_txt_results, llm_output_txt_results, results_df])

demo.launch()