Spaces:
Sleeping
Sleeping
| import re | |
| from transformers import AutoTokenizer | |
| from app.config import EMBEDDING_MODEL | |
class TextPreprocessor:
    """
    A simple text preprocessor for cleaning and tokenizing text.

    Wraps a Hugging Face tokenizer (loaded from the configured embedding
    model by default) and provides whitespace/control-character cleanup
    plus token counting.
    """

    # Compiled once at class-creation time and shared by all instances,
    # instead of re-deriving the pattern on every clean_text() call.
    # \s already matches \t and \n, so the original [\s\t\n]+ reduces to \s+.
    _WHITESPACE_RE = re.compile(r"\s+")
    _CONTROL_CHAR_RE = re.compile(r"[\x00-\x1F\x7F]")

    def __init__(self, model_name: str = EMBEDDING_MODEL):
        """
        Load the tokenizer for the given model.

        Args:
            model_name: Hugging Face model identifier; defaults to the
                project's configured embedding model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

    def clean_text(self, text: str) -> str:
        """
        Remove extra whitespace and control characters from text.

        Whitespace runs are collapsed first, so whitespace-class control
        characters (\\t, \\n, \\r, \\f, \\v) become single spaces; the
        remaining control characters (\\x00-\\x08, \\x0E-\\x1F, \\x7F) are
        then deleted outright, which can fuse adjacent words together.

        Args:
            text: The text to clean.

        Returns:
            The cleaned text, stripped of leading/trailing whitespace.
        """
        text = self._WHITESPACE_RE.sub(" ", text)
        text = self._CONTROL_CHAR_RE.sub("", text)
        return text.strip()

    def count_tokens(self, text: str) -> int:
        """
        Count the number of tokens in the text using the loaded tokenizer.

        Args:
            text: The text to tokenize.

        Returns:
            The number of token IDs produced by the tokenizer.
            NOTE(review): most HF tokenizers add special tokens (e.g.
            [CLS]/[SEP]) by default, so this may exceed the raw word-piece
            count — confirm if exact content-token counts are required.
        """
        return len(self.tokenizer(text).input_ids)