Spaces:

mariagrandury
/

language-gap-in-hf-hub

Running

App Files Files Community

language-gap-in-hf-hub / hub_models_by_language.py

mariagrandury

specify resource type in plot names

a938b8a 6 months ago

raw

history blame

4.06 kB

	import os
	import pickle
	from datetime import datetime

	import matplotlib.pyplot as plt
	import pandas as pd
	from huggingface_hub import HfApi

	# Fill colour used for each language's band in the stacked area plot,
	# keyed by the same language names used for the model groups.
	LANGUAGE_COLORS = {
	    "english": "orange",
	    "spanish": "blue",
	}


	def fetch_models(cache_file="models_cache.pkl"):
	    """Return Hub models tagged with exactly one language: English or Spanish.

	    A pickle cache at *cache_file* is reused while it is less than 24 hours
	    old. Otherwise the Hub listing is streamed, filtered, and the result is
	    written back to *cache_file*.

	    Args:
	        cache_file: Path of the pickle file used as the cache.

	    Returns:
	        On a fresh fetch, a dict with the keys ``"english"`` and ``"spanish"``;
	        each value is the list of model records whose only ``language:*`` tag
	        is ``language:en`` / ``language:es`` respectively. On a cache hit,
	        whatever object was pickled into *cache_file* is returned as-is.
	    """
	    # Check if cached data exists and is less than 24 hours old
	    if os.path.exists(cache_file):
	        cache_age = datetime.now().timestamp() - os.path.getmtime(cache_file)
	        if cache_age < 24 * 3600:  # 24 hours in seconds
	            print("Loading models from cache...")
	            try:
	                # NOTE: pickle executes arbitrary code on load; only point
	                # cache_file at a file this script wrote itself.
	                with open(cache_file, "rb") as f:
	                    return pickle.load(f)
	            except (
	                pickle.UnpicklingError,
	                AttributeError,
	                EOFError,
	                ImportError,
	                IndexError,
	            ) as err:
	                # A truncated or incompatible cache should trigger a refetch
	                # rather than abort the whole run.
	                print(f"Cache could not be read ({err}), fetching fresh data...")
	        else:
	            print("Cache is older than 24 hours, fetching fresh data...")
	    else:
	        print("No cache found, fetching models from Hugging Face Hub...")

	    hf_api = HfApi()

	    # Map the one language tag we accept per group to the group's name.
	    group_by_tag = {"language:en": "english", "language:es": "spanish"}
	    filtered_models = {group: [] for group in group_by_tag.values()}

	    # Stream the listing once instead of materialising every model and
	    # scanning the full list separately for each language.
	    for model in hf_api.list_models(full=True):
	        # NOTE(review): tags is assumed to be possibly None on some records
	        # (declared Optional in huggingface_hub) - confirm.
	        language_tags = {
	            tag for tag in (model.tags or ()) if tag.startswith("language:")
	        }
	        # Keep monolingual models only: exactly one distinct language tag.
	        if len(language_tags) != 1:
	            continue
	        group = group_by_tag.get(next(iter(language_tags)))
	        if group is not None:
	            filtered_models[group].append(model)

	    # Cache the filtered models
	    print("Saving models to cache...")
	    with open(cache_file, "wb") as f:
	        pickle.dump(filtered_models, f)

	    return filtered_models


	def create_stack_area_plot(models, output_dir):
	    """Save a stacked area plot of cumulative English and Spanish model counts.

	    Models are bucketed by the month of their ``created_at`` date, and the
	    running total per language is drawn as one band of the stack. The image
	    is written to ``models_stack_area_en_es.png`` inside *output_dir*.

	    Args:
	        models: Mapping with the keys ``"english"`` and ``"spanish"``, each a
	            list of records exposing a ``created_at`` datetime.
	        output_dir: Directory the PNG is written to.

	    Returns:
	        None. When no usable creation date exists, a message is printed and
	        no file is written.
	    """
	    languages = ["english", "spanish"]

	    # Records without a creation date cannot be placed on the time axis, so
	    # they are skipped instead of raising on ``None.date()``.
	    dates_by_lang = {
	        lang: [
	            model.created_at.date()
	            for model in models[lang]
	            if model.created_at is not None
	        ]
	        for lang in languages
	    }

	    all_dates = []
	    for lang in languages:
	        all_dates.extend(dates_by_lang[lang])

	    if not all_dates:
	        print("No models found for any language. Skipping plot creation.")
	        return

	    # Create a common month-start range for all languages. The start is
	    # snapped to the first day of its month: the monthly grouping below labels
	    # each bin with the month's first day, and a range starting mid-month
	    # would begin at the *next* month and drop the earliest bin on reindex.
	    first_month = min(all_dates).replace(day=1)
	    last_date = max(all_dates)
	    date_range = pd.date_range(start=first_month, end=last_date, freq="MS")

	    # Build one cumulative monthly series per language
	    dfs = {}
	    for lang in languages:
	        df = pd.DataFrame({"Date": dates_by_lang[lang]})
	        df["Count"] = 1
	        df["Date"] = pd.to_datetime(df["Date"])
	        df_grouped = df.groupby(pd.Grouper(key="Date", freq="MS")).sum()
	        # Reindex to the common range so months without models count as 0
	        df_grouped = df_grouped.reindex(date_range, fill_value=0)
	        dfs[lang] = df_grouped.cumsum()

	    # Plot stacked area for English and Spanish
	    plt.figure(figsize=(10, 6))
	    plt.stackplot(
	        date_range,
	        [dfs[lang]["Count"].values for lang in languages],
	        labels=[lang.capitalize() for lang in languages],
	        colors=[LANGUAGE_COLORS[lang] for lang in languages],
	    )

	    plt.xlabel("Date", fontsize=10)
	    plt.ylabel("Cumulative Number of Models", fontsize=10)
	    plt.xticks(rotation=45, fontsize=10)
	    plt.legend(loc="upper left")
	    plt.tight_layout()
	    plt.savefig(os.path.join(output_dir, "models_stack_area_en_es.png"))
	    plt.close()


	def main():
	    """Fetch the language-filtered models, report their counts and plot them."""
	    output_dir = "plots"
	    # The plot is written here, so make sure the folder is present first.
	    os.makedirs(output_dir, exist_ok=True)

	    # Load the per-language model lists
	    print("Fetching models from Hugging Face Hub...")
	    models = fetch_models()

	    # Report how many models were kept for each language
	    print("\nModel counts:")
	    for lang in models:
	        models_list = models[lang]
	        print(f"{lang.capitalize()}: {len(models_list)}")

	    # Render the cumulative stacked area chart
	    print("\nCreating stack area plot...")
	    create_stack_area_plot(models, output_dir)

	    print(f"Plot has been saved to the '{output_dir}' directory")


	# Run the pipeline only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()