Spaces: Sleeping
Upload scripts/preprocess.py with huggingface_hub
Browse files
- scripts/preprocess.py +2 -1
scripts/preprocess.py
CHANGED
|
@@ -4,7 +4,7 @@ sys.path.append(sys.path[0].replace('scripts', ''))
|
|
| 4 |
from urllib.request import urlretrieve
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
-
from config.data_paths import
|
| 8 |
import re
|
| 9 |
|
| 10 |
from scripts.utils import load_config
|
|
@@ -28,6 +28,7 @@ def clean_corpus():
|
|
| 28 |
Utility function to clean and preprocess the prompt corpus.
|
| 29 |
"""
|
| 30 |
if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
|
|
|
|
| 31 |
df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
|
| 32 |
assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
|
| 33 |
df = df[df['prompt'].notna()][['prompt']] # drop missing rows
|
|
|
|
| 4 |
from urllib.request import urlretrieve
|
| 5 |
import pandas as pd
|
| 6 |
|
| 7 |
+
from config.data_paths import PROCESSED_DATA_PATH
|
| 8 |
import re
|
| 9 |
|
| 10 |
from scripts.utils import load_config
|
|
|
|
| 28 |
Utility function to clean and preprocess the prompt corpus.
|
| 29 |
"""
|
| 30 |
if not os.path.isfile(os.path.join(PROCESSED_DATA_PATH, 'prompt_corpus_clean.parquet')): # to speed up the process
|
| 31 |
+
os.makedirs(PROCESSED_DATA_PATH, exist_ok=True)
|
| 32 |
df = pd.read_parquet(PROMPTS_URL).sample(10000, random_state=123)
|
| 33 |
assert 'prompt' in df.columns, "Parquet file must contain a 'prompt' column."
|
| 34 |
df = df[df['prompt'].notna()][['prompt']] # drop missing rows
|