Spaces:
Build error
Build error
Update Space (evaluate main: c447fc8e)
Browse files
- requirements.txt +1 -1
- word_count.py +3 -18
requirements.txt
CHANGED
|
@@ -1,2 +1,2 @@
|
|
| 1 |
-
git+https://github.com/huggingface/evaluate.git@
|
| 2 |
sklearn~=0.0
|
|
|
|
| 1 |
+
git+https://github.com/huggingface/evaluate.git@c447fc8eda9c62af501bfdc6988919571050d950
|
| 2 |
sklearn~=0.0
|
word_count.py
CHANGED
|
@@ -12,9 +12,6 @@
|
|
| 12 |
# See the License for the specific language governing permissions and
|
| 13 |
# limitations under the License.
|
| 14 |
|
| 15 |
-
from dataclasses import dataclass
|
| 16 |
-
from typing import Optional
|
| 17 |
-
|
| 18 |
import datasets
|
| 19 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 20 |
|
|
@@ -44,30 +41,18 @@ Examples:
|
|
| 44 |
_CITATION = ""
|
| 45 |
|
| 46 |
|
| 47 |
-
@dataclass
|
| 48 |
-
class WordCount(evaluate.info.Config):
|
| 49 |
-
|
| 50 |
-
name: str = "default"
|
| 51 |
-
|
| 52 |
-
max_vocab: Optional[int] = None
|
| 53 |
-
|
| 54 |
-
|
| 55 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
| 56 |
class WordCount(evaluate.Measurement):
|
| 57 |
"""This measurement returns the total number of words and the number of unique words
|
| 58 |
in the input string(s)."""
|
| 59 |
|
| 60 |
-
|
| 61 |
-
ALLOWED_CONFIG_NAMES = ["default"]
|
| 62 |
-
|
| 63 |
-
def _info(self, config):
|
| 64 |
return evaluate.MeasurementInfo(
|
| 65 |
# This is the description that will appear on the modules page.
|
| 66 |
module_type="measurement",
|
| 67 |
description=_DESCRIPTION,
|
| 68 |
citation=_CITATION,
|
| 69 |
inputs_description=_KWARGS_DESCRIPTION,
|
| 70 |
-
config=config,
|
| 71 |
features=datasets.Features(
|
| 72 |
{
|
| 73 |
"data": datasets.Value("string"),
|
|
@@ -75,9 +60,9 @@ class WordCount(evaluate.Measurement):
|
|
| 75 |
),
|
| 76 |
)
|
| 77 |
|
| 78 |
-
def _compute(self, data):
|
| 79 |
"""Returns the number of unique words in the input data"""
|
| 80 |
-
count_vectorizer = CountVectorizer(max_features=
|
| 81 |
document_matrix = count_vectorizer.fit_transform(data)
|
| 82 |
word_count = document_matrix.sum()
|
| 83 |
unique_words = document_matrix.shape[1]
|
|
|
|
| 12 |
# See the License for the specific language governing permissions and
|
| 13 |
# limitations under the License.
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
import datasets
|
| 16 |
from sklearn.feature_extraction.text import CountVectorizer
|
| 17 |
|
|
|
|
| 41 |
_CITATION = ""
|
| 42 |
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
|
| 45 |
class WordCount(evaluate.Measurement):
|
| 46 |
"""This measurement returns the total number of words and the number of unique words
|
| 47 |
in the input string(s)."""
|
| 48 |
|
| 49 |
+
def _info(self):
|
|
|
|
|
|
|
|
|
|
| 50 |
return evaluate.MeasurementInfo(
|
| 51 |
# This is the description that will appear on the modules page.
|
| 52 |
module_type="measurement",
|
| 53 |
description=_DESCRIPTION,
|
| 54 |
citation=_CITATION,
|
| 55 |
inputs_description=_KWARGS_DESCRIPTION,
|
|
|
|
| 56 |
features=datasets.Features(
|
| 57 |
{
|
| 58 |
"data": datasets.Value("string"),
|
|
|
|
| 60 |
),
|
| 61 |
)
|
| 62 |
|
| 63 |
+
def _compute(self, data, max_vocab=None):
|
| 64 |
"""Returns the number of unique words in the input data"""
|
| 65 |
+
count_vectorizer = CountVectorizer(max_features=max_vocab)
|
| 66 |
document_matrix = count_vectorizer.fit_transform(data)
|
| 67 |
word_count = document_matrix.sum()
|
| 68 |
unique_words = document_matrix.shape[1]
|