Adds an example of how to use the HearClassifier
README.md (CHANGED)

@@ -105,6 +105,8 @@ of audio, we recommend that you create a production version using [the Vertex
Model
Garden](https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/hear).

#### Audio representation with embeddings

```python
import torch
from fcv_detector.models.hear import (

@@ -143,6 +145,97 @@ inputs = fe(raw_audio_batch, return_tensors="pt")
output = model(**inputs)
```

```
You are using a model of type vit to instantiate a model of type hear. This is not supported for all configurations of models and can yield errors.
BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 0.1638, 0.0311, -0.3071, ..., -0.1555, -0.0380, -0.3294],
         [ 0.1879, 0.9123, 0.3434, ..., 2.1157, -0.2212, -0.5031],
         [ 0.4474, -0.3095, 0.1068, ..., -1.1577, -0.1871, -1.1114],
         ...,
         [-1.1620, -0.6956, 0.0340, ..., 0.2741, -0.3230, -0.7366],
         [-0.2818, -0.1758, -0.1667, ..., 0.3051, -0.3197, -0.6817],
         [-0.5189, -0.3460, 0.0631, ..., 0.2027, -0.5678, -0.2382]],

        [[ 0.1788, 0.0652, -0.2803, ..., -0.1490, -0.0312, -0.2837],
         [-0.1547, 0.6340, 0.0806, ..., 2.1374, -0.3951, -0.5316],
         [-0.2770, 0.7531, 0.4323, ..., 0.9180, -0.3570, -0.1897],
         ...,
         [-1.3322, -0.0332, -0.2455, ..., 0.4821, -0.0645, -0.9346],
         [-1.3276, -0.6403, -0.0455, ..., 0.6166, -0.4472, -0.4335],
         [-1.0610, 0.2751, -0.2439, ..., 0.7873, -0.1567, -0.4248]],

        [[ 0.1755, 0.1288, -0.2913, ..., -0.1226, -0.0644, -0.3382],
         [ 0.1055, 1.1124, -0.2281, ..., 3.2376, -0.3979, -0.5840],
         [-0.6490, -0.3893, 0.4327, ..., 2.4446, -0.2480, -0.9221],
         ...,
         [-1.5817, -0.0733, -0.7567, ..., 1.0221, -0.4246, -0.9694],
         [ 0.1373, -0.0258, 0.2139, ..., 1.2905, -0.2469, -0.8213],
         [-1.2737, 0.2838, -0.1167, ..., 0.8610, -0.2919, -0.8152]],

        [[ 0.1398, 0.1110, -0.2897, ..., -0.1562, -0.0699, -0.3052],
         [-0.1940, 0.1297, 0.1607, ..., 3.2720, -0.0289, -1.0005],
         [-0.3104, 0.6009, -0.1392, ..., 2.7523, -0.0829, -0.6996],
         ...,
         [-0.9739, -0.4732, 0.0499, ..., 1.8665, -0.2438, -0.7332],
         [-0.3944, 0.1800, -0.0829, ..., 1.2693, -0.6084, -0.7625],
         [-1.5253, 0.4868, -0.3012, ..., 1.5606, -0.0050, -0.4669]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.3807, 0.9901, 0.5437, ..., 1.0000, 0.5777, 0.9752],
        [-0.4004, 0.9932, 0.7021, ..., 1.0000, 0.7681, 0.9804],
        [-0.3874, 0.9964, 0.5076, ..., 1.0000, 0.8015, 0.9823],
        [-0.3838, 0.9970, 0.5793, ..., 1.0000, 0.8024, 0.9895]],
       grad_fn=<TanhBackward0>), hidden_states=None, attentions=None)
```

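If you only need one vector per clip, a common pattern is to take `pooler_output` (one row per input clip, as in the printed output above) as the clip-level embedding. The following is a minimal sketch, not part of the example above, assuming the pooled vector is the embedding you want; it L2-normalizes the four embeddings and compares them with cosine similarity:

```python
import torch.nn.functional as F

# One pooled vector per input clip, taken from the `output` printed above.
embeddings = output.pooler_output             # shape: (batch, hidden_size)
embeddings = F.normalize(embeddings, dim=-1)  # unit-length rows
similarity = embeddings @ embeddings.T        # (4, 4) cosine-similarity matrix
print(similarity)
```
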
#### Audio classification

```python
import torch
from fcv_detector.models.hear import (
    HearConfig,
    HearModel,
    HearForAudioClassification,
    HearFeatureExtractor,
)

from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForAudioClassification,
    AutoFeatureExtractor,
)

# Register the custom "hear" model type with the transformers Auto* classes.
AutoConfig.register("hear", HearConfig)
AutoModel.register(HearConfig, HearModel)
AutoModelForAudioClassification.register(HearConfig, HearForAudioClassification)
AutoFeatureExtractor.register(HearConfig, HearFeatureExtractor)

# Authenticate with the Hugging Face Hub if no token is stored locally.
from huggingface_hub.utils import HfFolder
from huggingface_hub import notebook_login

if HfFolder.get_token() is None:
    notebook_login()

model_id = "audiblehealthai/hear-pytorch"
classifier = HearForAudioClassification.from_pretrained(model_id)
fe = HearFeatureExtractor.from_pretrained(model_id)

# A batch of four random waveforms (32,000 samples each) as stand-in audio.
raw_audio_batch = torch.rand((4, 32000), dtype=torch.float32)
inputs = fe(raw_audio_batch, return_tensors="pt")
cls_output = classifier(**inputs)
print(cls_output)
```

```
You are using a model of type vit to instantiate a model of type hear. This is not supported for all configurations of models and can yield errors.
Some weights of HearForAudioClassification were not initialized from the model checkpoint at audiblehealthai/hear-pytorch and are newly initialized: ['classifier.layernorm.bias', 'classifier.dense.weight', 'classifier.dense.bias', 'classifier.layernorm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
SequenceClassifierOutput(loss=None, logits=tensor([[-0.0135, 0.0895],
        [-0.0071, 0.1055],
        [-0.0082, 0.0801],
        [ 0.0145, 0.1028]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
```

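The warnings above show that the classification head (`classifier.dense`, `classifier.layernorm`) is newly initialized, so these logits are only meaningful after the model has been fine-tuned on a labeled downstream task. Assuming a fine-tuned checkpoint with the two-class head shown above, a minimal sketch for turning the logits into class probabilities and predicted labels:

```python
# Convert the raw logits (batch, num_labels) into probabilities and top predictions.
# With an untrained head, as in the output above, these values are not meaningful yet.
probs = torch.softmax(cls_output.logits, dim=-1)
predicted_ids = probs.argmax(dim=-1)  # index of the highest-scoring class per clip
print(probs)
print(predicted_ids)
```
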
### Examples

See the following Colab notebooks for examples of how to use HeAR: