dependency-history

Build error

App Files Community

patrickvonplaten committed on Dec 6, 2023

Commit

109623e

1 Parent(s): 5d18ec4

improve

Browse files

Files changed (1) hide show

app.py +122 -107

app.py CHANGED Viewed

@@ -9,7 +9,9 @@ import streamlit as st
 from datetime import datetime, timedelta
 import matplotlib.pyplot as plt
-libraries = [
     "open-source-metrics/transformers-dependents",
     "open-source-metrics/diffusers-dependents",
     "open-source-metrics/pytorch-image-models-dependents",
@@ -21,130 +23,143 @@ libraries = [
     "open-source-metrics/optimum-dependents",
     "open-source-metrics/hub-docs-dependents",
     "open-source-metrics/huggingface_hub-dependents",
-]
-option = st.selectbox(
-    'Choose library',
-    libraries
-)
-cached_folder = snapshot_download(option, repo_type="dataset")
-num_dependents = defaultdict(int)
-num_stars_all_dependents = defaultdict(int)
-def load_json_files(directory):
-    for subdir, dirs, files in os.walk(directory):
-        for file in files:
-            if file.endswith('.json'):
-                file_path = os.path.join(subdir, file)
-                date = "_".join(file_path.split(".")[-2].split("/")[-3:])
-                with open(file_path, 'r') as f:
-                    data = json.load(f)
-                    # Process the JSON data as needed
-                    if "name" in data and "stars" in data:
-                        num_dependents[date] = len(data["name"])
-                        num_stars_all_dependents[date] = sum(data["stars"])
-# Replace 'your_directory_path' with the path to the directory containing your '11' and '12' folders
-load_json_files(cached_folder)
-def sort_dict_by_date(d):
-    # Convert date strings to datetime objects and sort
-    sorted_tuples = sorted(d.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
-    # Convert back to dictionary if needed
-    return defaultdict(int, sorted_tuples)
-def remove_incorrect_entries(data):
-    # Convert string dates to datetime objects for easier comparison
-    sorted_data = sorted(data.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
-    # Initialize a new dictionary to store the corrected data
-    corrected_data = defaultdict(int)
-    # Variable to keep track of the number of dependents on the previous date
-    previous_dependents = None
-    for date, dependents in sorted_data:
-        # If the current number of dependents is not less than the previous, add it to the corrected data
-        if previous_dependents is None or dependents >= previous_dependents:
-            corrected_data[date] = dependents
-            previous_dependents = dependents
-    return corrected_data
-def interpolate_missing_dates(data):
-    # Convert string dates to datetime objects
-    temp_data = {datetime.strptime(date, '%Y_%m_%d'): value for date, value in data.items()}
-    # Find the min and max dates to establish the range
-    min_date, max_date = min(temp_data.keys()), max(temp_data.keys())
-    # Generate a date range
-    current_date = min_date
-    while current_date <= max_date:
-        # If the current date is missing
-        if current_date not in temp_data:
-            # Find previous and next dates that are present
-            prev_date = current_date - timedelta(days=1)
-            next_date = current_date + timedelta(days=1)
-            while prev_date not in temp_data:
-                prev_date -= timedelta(days=1)
-            while next_date not in temp_data:
-                next_date += timedelta(days=1)
-            # Linear interpolation
-            prev_value = temp_data[prev_date]
-            next_value = temp_data[next_date]
-            interpolated_value = prev_value + ((next_value - prev_value) * ((current_date - prev_date) / (next_date - prev_date)))
-            temp_data[current_date] = interpolated_value
-        current_date += timedelta(days=1)
-    # Convert datetime objects back to string format
-    interpolated_data = defaultdict(int, {date.strftime('%Y_%m_%d'): int(value) for date, value in temp_data.items()})
-    return interpolated_data
-num_dependents = remove_incorrect_entries(num_dependents)
-num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)
-num_dependents = interpolate_missing_dates(num_dependents)
-num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)
-num_dependents = sort_dict_by_date(num_dependents)
-num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)
-num_dependents_df = pd.DataFrame(list(num_dependents.items()), columns=['Date', 'Value'])
-num_cum_stars_df = pd.DataFrame(list(num_stars_all_dependents.items()), columns=['Date', 'Value'])
-num_dependents_df['Date'] = pd.to_datetime(num_dependents_df['Date'], format='%Y_%m_%d')
-num_cum_stars_df['Date'] = pd.to_datetime(num_cum_stars_df['Date'], format='%Y_%m_%d')
-num_dependents_df.set_index('Date', inplace=True)
-num_dependents_df = num_dependents_df.resample('D').asfreq()
-num_dependents_df['Value'] = num_dependents_df['Value'].interpolate()
-num_cum_stars_df.set_index('Date', inplace=True)
-num_cum_stars_df = num_cum_stars_df.resample('D').asfreq()
-num_cum_stars_df['Value'] = num_cum_stars_df['Value'].interpolate()
-filename = "_".join(option.split("/"))
-# Plotting
-plt.figure(figsize=(10, 6))
-plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
-plt.plot(num_dependents_df.index, num_dependents_df['Value'], marker='o')
 plt.xlabel('Date')
-plt.ylabel('Number of Dependents')
 plt.title('Dependencies History')
 st.pyplot(plt)
 # Display in Streamlit
-plt.figure(figsize=(10, 6))
 plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
-plt.plot(num_cum_stars_df.index, num_cum_stars_df['Value'], marker='o')
 plt.xlabel('Date')
-plt.ylabel('Cumulative number of stars of Dependents')
 plt.title('Dependents Stars History')
 st.pyplot(plt)

 from datetime import datetime, timedelta
 import matplotlib.pyplot as plt
+plt.rcParams.update({'font.size': 40})
+libraries = {
     "open-source-metrics/transformers-dependents",
     "open-source-metrics/diffusers-dependents",
     "open-source-metrics/pytorch-image-models-dependents",
     "open-source-metrics/optimum-dependents",
     "open-source-metrics/hub-docs-dependents",
     "open-source-metrics/huggingface_hub-dependents",
+}
+MAP = {k.split("/")[-1].split("-")[0]: k for k in libraries}
+selected_libraries = st.multiselect(
+    'Choose libraries',
+    list(MAP.keys())
+)
+def get_frames(option):
+    cached_folder = snapshot_download(option, repo_type="dataset")
+    num_dependents = defaultdict(int)
+    num_stars_all_dependents = defaultdict(int)
+    def load_json_files(directory):
+        for subdir, dirs, files in os.walk(directory):
+            for file in files:
+                if file.endswith('.json'):
+                    file_path = os.path.join(subdir, file)
+                    date = "_".join(file_path.split(".")[-2].split("/")[-3:])
+                    with open(file_path, 'r') as f:
+                        data = json.load(f)
+                        # Process the JSON data as needed
+                        if "name" in data and "stars" in data:
+                            num_dependents[date] = len(data["name"])
+                            num_stars_all_dependents[date] = sum(data["stars"])
+    # Walk the downloaded snapshot folder and fill the per-date dependents and stars counters
+    load_json_files(cached_folder)
+    def sort_dict_by_date(d):
+        # Convert date strings to datetime objects and sort
+        sorted_tuples = sorted(d.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
+        # Convert back to dictionary if needed
+        return defaultdict(int, sorted_tuples)
+    def remove_incorrect_entries(data):
+        # Convert string dates to datetime objects for easier comparison
+        sorted_data = sorted(data.items(), key=lambda x: datetime.strptime(x[0], '%Y_%m_%d'))
+        # Initialize a new dictionary to store the corrected data
+        corrected_data = defaultdict(int)
+        # Variable to keep track of the number of dependents on the previous date
+        previous_dependents = None
+        for date, dependents in sorted_data:
+            # If the current number of dependents is not less than the previous, add it to the corrected data
+            if previous_dependents is None or dependents >= previous_dependents:
+                corrected_data[date] = dependents
+                previous_dependents = dependents
+        return corrected_data
+    def interpolate_missing_dates(data):
+        # Convert string dates to datetime objects
+        temp_data = {datetime.strptime(date, '%Y_%m_%d'): value for date, value in data.items()}
+        # Find the min and max dates to establish the range
+        min_date, max_date = min(temp_data.keys()), max(temp_data.keys())
+        # Generate a date range
+        current_date = min_date
+        while current_date <= max_date:
+            # If the current date is missing
+            if current_date not in temp_data:
+                # Find previous and next dates that are present
+                prev_date = current_date - timedelta(days=1)
+                next_date = current_date + timedelta(days=1)
+                while prev_date not in temp_data:
+                    prev_date -= timedelta(days=1)
+                while next_date not in temp_data:
+                    next_date += timedelta(days=1)
+                # Linear interpolation
+                prev_value = temp_data[prev_date]
+                next_value = temp_data[next_date]
+                interpolated_value = prev_value + ((next_value - prev_value) * ((current_date - prev_date) / (next_date - prev_date)))
+                temp_data[current_date] = interpolated_value
+            current_date += timedelta(days=1)
+        # Convert datetime objects back to string format
+        interpolated_data = defaultdict(int, {date.strftime('%Y_%m_%d'): int(value) for date, value in temp_data.items()})
+        return interpolated_data
+    num_dependents = remove_incorrect_entries(num_dependents)
+    num_stars_all_dependents = remove_incorrect_entries(num_stars_all_dependents)
+    num_dependents = interpolate_missing_dates(num_dependents)
+    num_stars_all_dependents = interpolate_missing_dates(num_stars_all_dependents)
+    num_dependents = sort_dict_by_date(num_dependents)
+    num_stars_all_dependents = sort_dict_by_date(num_stars_all_dependents)
+    num_dependents_df = pd.DataFrame(list(num_dependents.items()), columns=['Date', 'Value'])
+    num_cum_stars_df = pd.DataFrame(list(num_stars_all_dependents.items()), columns=['Date', 'Value'])
+    num_dependents_df['Date'] = pd.to_datetime(num_dependents_df['Date'], format='%Y_%m_%d')
+    num_cum_stars_df['Date'] = pd.to_datetime(num_cum_stars_df['Date'], format='%Y_%m_%d')
+    num_dependents_df.set_index('Date', inplace=True)
+    num_dependents_df = num_dependents_df.resample('D').asfreq()
+    num_dependents_df['Value'] = num_dependents_df['Value'].interpolate()
+    num_cum_stars_df.set_index('Date', inplace=True)
+    num_cum_stars_df = num_cum_stars_df.resample('D').asfreq()
+    num_cum_stars_df['Value'] = num_cum_stars_df['Value'].interpolate()
+    return num_dependents_df, num_cum_stars_df
+lib_frames = {l: get_frames(MAP[l]) for l in selected_libraries}
+plt.figure(figsize=(40, 24))
+plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
+for l, (df_dep, _) in lib_frames.items():
+    plt.plot(df_dep.index, df_dep['Value'], label=l, marker='o')
 plt.xlabel('Date')
+plt.ylabel('# Dependencies')
+plt.legend()
 plt.title('Dependencies History')
 st.pyplot(plt)
 # Display in Streamlit
+plt.figure(figsize=(40, 24))
 plt.gca().yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
+for l, (_, df_stars) in lib_frames.items():
+    plt.plot(df_stars.index, df_stars['Value'], label=l, marker='o')
 plt.xlabel('Date')
+plt.ylabel('SUM stars of dependencies')
+plt.legend()
 plt.title('Dependents Stars History')
 st.pyplot(plt)