Spaces:

davanstrien
/

collections-info

Runtime error

App Files Files Community

collections-info / app.py

davanstrien HF Staff

open by default

d4089a6 about 2 years ago

raw

history blame contribute delete

3.57 kB

	from huggingface_hub import get_collection, Collection, CollectionItem
	from toolz import groupby, valmap
	from typing import Dict, List
	import pandas as pd
	from huggingface_hub import model_info
	import gradio as gr
	from functools import lru_cache

	# Slug of the Hugging Face Hub collection this app reports on; load_data()
	# passes it to group_collection_by_repo_type().
	test_slug = "HF-IA-archiving/models-to-archive-65006a7fdadb8c628f33aac9"


	def group_collection_by_repo_type(
	    collection_slug: str,
	) -> Dict[str, List[CollectionItem]]:
	    """Fetch a Hub collection and bucket its items by repository type.

	    Args:
	        collection_slug: Slug identifying the collection; passed unchanged
	            to ``get_collection``.

	    Returns:
	        A mapping from each item's ``repoType`` value to the list of
	        collection items that share that value.
	    """

	    def repo_type_of(entry):
	        # Grouping key: the item's ``repoType`` attribute.
	        return entry.repoType

	    fetched = get_collection(collection_slug)
	    return groupby(repo_type_of, fetched.items)


	def render_model_hub_link(hub_id):
	    """Return an HTML anchor that opens ``hub_id``'s Hub page in a new tab.

	    Args:
	        hub_id: Repository id; appended to ``https://huggingface.co/`` to
	            form the link target and also used as the visible link text.

	    Returns:
	        The ``<a>`` element as a string.
	    """
	    anchor_style = (
	        "color: var(--link-text-color);"
	        " text-decoration: underline;text-decoration-style: dotted;"
	    )
	    url = f"https://huggingface.co/{hub_id}"
	    return f'<a target="_blank" href="{url}" style="{anchor_style}">{hub_id}</a>'


	def load_to_dataframe(data):
	    """Build a DataFrame of Hub metadata for a list of collection items.

	    Each item's ``__dict__`` is reduced to a fixed set of keys, enriched
	    with selected model-card fields looked up through ``model_info``, and
	    the ``item_id`` column is rendered as an HTML link.

	    Args:
	        data: Iterable of objects exposing their metadata through
	            ``__dict__`` (``load_data`` passes the collection's "model"
	            items).

	    Returns:
	        A ``pandas.DataFrame`` with one row per item. For empty input an
	        empty frame carrying the expected column names is returned.
	    """
	    # Columns to keep
	    columns = [
	        "item_id",
	        "downloads",
	        "author",
	        "likes",
	        "pipeline_tag",
	        "lastModified",
	    ]
	    required_info_keys = ["language", "tags", "license", "datasets"]

	    # Convert each item to a plain dict and keep only the wanted keys.
	    filtered_data = [
	        {key: attrs[key] for key in columns if key in attrs}
	        for attrs in map(vars, data)
	    ]

	    if not filtered_data:
	        # pd.DataFrame([]) has no "item_id" column, so the link rendering
	        # below would raise KeyError; return an empty, well-shaped frame.
	        return pd.DataFrame(columns=columns + required_info_keys)

	    for item in filtered_data:
	        try:
	            card = model_info(item["item_id"]).cardData
	            for key in required_info_keys:
	                item[key] = card.get(key)
	        except AttributeError as e:
	            # No usable card data (e.g. ``cardData`` is None, so ``.get``
	            # fails): report it and record every card field as missing.
	            print(e)
	            for key in required_info_keys:
	                item[key] = None

	    # Load into a DataFrame
	    df = pd.DataFrame(filtered_data)
	    df["item_id"] = df["item_id"].apply(render_model_hub_link)
	    return df


	def summary_of_na_values(df):
	    """Summarise how many values are missing in each column of ``df``.

	    Columns without any missing value are left out of the summary.

	    Args:
	        df: DataFrame to inspect.

	    Returns:
	        A DataFrame with the columns ``Metadata`` (the source column name),
	        ``Missing Count`` and ``Missing Percent`` (rounded to two decimals).
	    """
	    missing = df.isna().sum()
	    missing = missing.loc[missing > 0]
	    percent = (missing / len(df) * 100).round(2)
	    summary = pd.DataFrame({"Missing Count": missing, "Missing Percent": percent})
	    summary = summary.rename_axis(index="Metadata")
	    return summary.reset_index()

	def value_counts(df, column_name):
	    """Count how often each distinct value occurs in one column of ``df``.

	    Args:
	        df: DataFrame holding the column.
	        column_name: Label of the column to tally.

	    Returns:
	        The result of ``value_counts()`` on the selected column.
	    """
	    selected_column = df[column_name]
	    return selected_column.value_counts()



	@lru_cache(maxsize=10)
	def load_data():
	    """Load the collection and the metadata table for its models.

	    The result is memoised by ``lru_cache``, so repeated calls return the
	    same objects instead of fetching the collection again.

	    Returns:
	        A 3-tuple of: the collection items grouped by repository type, the
	        list of column names of the models DataFrame, and the models
	        DataFrame itself.
	    """
	    grouped_items = group_collection_by_repo_type(test_slug)
	    models_frame = load_to_dataframe(grouped_items["model"])
	    return grouped_items, models_frame.columns.to_list(), models_frame



	def generate_markdown_summary_of_collection(
	    # Quoted so the annotation is not evaluated when the function is defined.
	    grouped_collection: "Dict[str, List[CollectionItem]]",
	):
	    """Render a Markdown bullet list counting the items of each repo type.

	    Args:
	        grouped_collection: Mapping from a repository type name to the
	            list of collection items of that type.

	    Returns:
	        A Markdown string: an introductory line followed by one
	        ``- <count> <type>s`` bullet per key, in the mapping's own order.
	        An empty mapping yields only the introductory line.
	    """
	    header = "This collection contains the following items:\n"
	    # Build all bullets in one join instead of growing a string with ``+=``.
	    bullets = (
	        f"- {len(items)} {repo_type}s\n"
	        for repo_type, items in grouped_collection.items()
	    )
	    return header + "".join(bullets)


	# Load everything once at import time so the UI below can be built from it;
	# load_data() is wrapped in lru_cache, so later calls (e.g. from filter_df)
	# reuse this result.
	repos_grouped_by_type, column_names, df = load_data()


	def filter_df(columns_to_show=None):
	    """Return the models DataFrame, optionally restricted to some columns.

	    Args:
	        columns_to_show: Column labels to keep. ``None`` (the default)
	            returns the full DataFrame.

	    Returns:
	        The DataFrame from ``load_data()``, or its selection
	        ``df[columns_to_show]``.
	    """
	    # The DataFrame is the last element of load_data()'s result tuple.
	    models_frame = load_data()[-1]
	    if columns_to_show is None:
	        return models_frame
	    return models_frame[columns_to_show]


	# Compute the static page content up front; the layout below only places it.
	collection_overview = generate_markdown_summary_of_collection(repos_grouped_by_type)
	missing_values_table = summary_of_na_values(df)

	with gr.Blocks() as demo:
	    # Overview of the collection and of its missing metadata.
	    gr.Markdown("## Info about models in this collection")
	    gr.Markdown(collection_overview)
	    gr.Markdown("### Summary of missing metadata values")
	    gr.DataFrame(missing_values_table)

	    # Model table with a column picker.
	    gr.Markdown("# Models in this collection")
	    with gr.Accordion("Models", open=True):
	        columns_to_show = gr.Dropdown(
	            choices=column_names,
	            value=column_names,
	            multiselect=True,
	            label="Columns to show",
	        )
	        models_df = gr.DataFrame(filter_df, datatype="markdown")
	        # Re-render the table whenever the column selection changes.
	        columns_to_show.change(filter_df, columns_to_show, models_df)

	demo.launch(debug=True)