Spaces:

meg
/

FineWebBiasAnalyses

No application file

meg HF Staff committed on May 29, 2024

Commit

2fa1451

verified ·

1 Parent(s): 4f084e5

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,15 +1,26 @@
 from sklearn.feature_extraction.text import CountVectorizer
 import numpy as np
 from datasets import load_dataset
 text_dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=True, columns=['text'])
-bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")
-co_occurrences = bigram_vectorizer.fit_transform(doc['text'] for doc in text_dataset)
-print('Printing sparse matrix:')
-print(co_occurrences)
-print('Printing dense matrix')
-print(co_occurrences.todense())
-sum_occ = np.sum(co_occurrences.todense(), axis=0)
-print('Sum of word-word occurrences:')
-print(sum_occ)

 from sklearn.feature_extraction.text import CountVectorizer
 import numpy as np
 from datasets import load_dataset
+import gradio as gr
 text_dataset = load_dataset("HuggingFaceFW/fineweb", name="sample-10BT", split="train", streaming=True, columns=['text'])
+def update(text_dataset):
+    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), analyzer="word")
+    co_occurrences = bigram_vectorizer.fit_transform(doc['text'] for doc in text_dataset)
+    print('Printing sparse matrix:')
+    print(co_occurrences)
+    print('Printing dense matrix')
+    print(co_occurrences.todense())
+    sum_occ = np.sum(co_occurrences.todense(), axis=0)
+    print('Sum of word-word occurrences:')
+    print(sum_occ)
+    return sum_occ
+with gr.Blocks() as app:
+    gr.Markdown("Click **Run** to start calculating.")
+    btn = gr.Button("Run")
+    btn.click(fn=update, inputs=text_dataset, outputs=out)
+app.launch()