Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
| """Extractor functions to retrieve sentences by character chunks from a file | |
| This script contains the logic that allows the user to process and filter | |
| sentences of the original corpus. By default, this considers a minimum sentence | |
| length, and removes newlines and multiple consecutive spaces. | |
| Configuration for existing functionality is at the top of the file. Feel free to | |
| add new processing and/or filter functions. The "process_line" and "filter_line" | |
| functions contain the pipeline for processing the scripts as needed. | |
| """ | |
| import regex as re | |
| import argparse | |
| from pathlib import Path | |
| from functools import partial | |
| from typing import Union | |
# Minimum number of words a processed line must contain to be kept;
# filter_line passes this value to is_short_sentence.
MIN_LINE_LENGTH = 8 # words
def parse_args():
    """Parse the command-line options of the sentence extractor.

    Returns:
        The argparse namespace with the attributes `file` and `outdir`
        (each is None when the corresponding option is omitted).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", "--file", help="Path to .txt file to analyze and annotate")
    # The output written by main() is "<input stem>.txt", so the help text
    # advertises a .txt file rather than a pickle.
    parser.add_argument("-o", "--outdir", help="Path of directory in which to store the extracted sentences as a .txt")
    return parser.parse_args()
| # ============================================================ | |
| # Helper functions | |
| # ============================================================ | |
| # String -> String | |
def replace_newlines(s:str) -> str:
    """Return `s` with every run of consecutive newlines turned into one space."""
    newline_run = re.compile(r"\n+")
    return newline_run.sub(r" ", s)
| # String -> String | |
def replace_multispace(s:str) -> str:
    """Return `s` with every run of whitespace collapsed into one space."""
    collapsed, _count = re.subn(r"\s+", r" ", s)
    return collapsed
def is_short_sentence(s:str, min_len=8) -> bool:
    """Returns True if the sentence has less than `min_len` number of words

    Args:
        s: Sentence to measure
        min_len: Minimum number of words for a sentence not to count as short

    Returns:
        True when `s` contains fewer than `min_len` whitespace-separated words.
    """
    # split() without an argument ignores leading/trailing whitespace, so the
    # trailing space left behind by a replaced newline is not counted as a word
    # (split(' ') would yield an extra empty string for it).
    return len(s.split()) < min_len
def contains_char(char:str, s:str) -> bool:
    """Returns True if `char` occurs anywhere in `s`

    Args:
        char: Character (or substring) to look for
        s: String to search

    Returns:
        The result of the membership test `char in s`.
    """
    return char in s
| # ============================================================ | |
| # Compilation functions | |
| # ============================================================ | |
def process_line(line:str) -> str:
    """Normalize the whitespace of a chunk of text.

    Newline runs are first replaced by a space, then every run of whitespace
    is collapsed into a single space.

    Args:
        line: Chunk of text

    Returns:
        The chunk without newlines and without consecutive whitespace.
    """
    without_newlines = replace_newlines(line)
    return replace_multispace(without_newlines)
def filter_line(line:str) -> bool:
    """Decide whether a processed line should be kept.

    A line is kept when it is not a short sentence according to
    MIN_LINE_LENGTH. Redefine this function with other helper functions to
    change the filter; return True for every line that should be kept.
    """
    return not is_short_sentence(line, MIN_LINE_LENGTH)
| # ============================================================ | |
| # Main Logic | |
| # ============================================================ | |
def read_outcomes(chars:str) -> Union[str, None]:
    """Turn a raw chunk of characters into a processed line, or reject it.

    Args:
        chars: Chunk of text to process

    Returns:
        The processed chunk when it passes `filter_line`, otherwise None.

    Raises:
        StopIteration: If `chars` is the empty string "", which is what the
            readers produce once the file is exhausted.
    """
    if chars == '':
        raise StopIteration
    processed = process_line(chars)
    return processed if filter_line(processed) else None
def get_chars(n:int, f) -> Union[str, None]:
    """Read the next `n` characters of the opened file `f` and process them.

    Args:
        n: Number of characters to read from the opened file
        f: Opened file, as returned by `open(fname)`

    Returns:
        The processed chunk of text, or None if the chunk is filtered out.

    Raises:
        Nothing of its own; the StopIteration raised by `read_outcomes` for an
        empty read is passed up to the caller.
    """
    return read_outcomes(f.read(n))
def get_line(f):
    """Read the next line of the opened file `f` and process it.

    The outcome is decided by `read_outcomes`:
    1. StopIteration is raised when the read returns the empty string
    2. The processed line is returned if it passes the filter
    3. None is returned if the line does not pass the filter
    """
    return read_outcomes(f.readline())
def read_on(reader, f):
    """Keep reading from the opened file `f` with `reader` until it is exhausted.

    Args:
        reader: A unary function taking the opened file and returning either a
            string or None; it signals the end of input by raising StopIteration
        f: An opened file, as returned by `open(fname)`

    Yields:
        Every non-None value produced by `reader`, in order.
    """
    while True:
        try:
            item = reader(f)
        except StopIteration:
            # The reader reported the end of the input.
            return
        if item is None:
            # Filtered-out chunk: skip it and keep reading.
            continue
        yield item
def extract_chars(infile, n=10000):
    """Yield the processed chunks of a file, reading `n` characters at a time.

    The file is opened when iteration starts and is closed as soon as the
    generator is exhausted or closed (previously the close call sat after the
    return statement and was never executed).

    Args:
        infile: Path of the file to read
        n: Number of characters to read per chunk

    Yields:
        Each processed chunk that passes the filter.
    """
    reader = partial(get_chars, n)
    with open(infile, 'r') as src:
        yield from read_on(reader, src)
def extract_lines(infile):
    """Given a file, yield the processed lines from that file

    The file is opened when iteration starts and is closed as soon as the
    generator is exhausted or closed (previously the close call sat after the
    return statement and was never executed).

    Args:
        infile: Path of the file to read

    Yields:
        Each processed line that passes the filter.
    """
    with open(infile, 'r') as src:
        yield from read_on(get_line, src)
def extract_sentences_to_file(infile, outfname:str):
    """Extract sentences from a file into a new file indicated by `outfname`.

    Args:
        infile: Path of the file to read sentences from
        outfname: Path of the file to create; it must not exist yet because it
            is opened in exclusive-creation mode ('x')

    Returns:
        `outfname`, so that callers can report where the sentences were written.

    Raises:
        FileExistsError: If `outfname` already exists.
    """
    # The context manager closes the output even if reading or writing fails.
    with open(outfname, 'x') as out:
        for line in extract_lines(infile):
            out.write(line + "\n")
    return outfname
def main(infile, outdir):
    """Main function for creating the outdir and saving the processed sentences to that file

    Args:
        infile: Path of the text file to extract sentences from
        outdir: Directory in which the output file is created; missing parent
            directories are created as well

    Returns:
        Path of the written file, `<outdir>/<stem of infile>.txt`.
    """
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    outfile = outdir / (Path(infile).stem + '.txt')
    extract_sentences_to_file(infile, outfile)
    # Return the path computed here rather than the callee's result, so the
    # caller always gets the location of the output file.
    return outfile
if __name__ == "__main__":
    # Script entry point: forward the parsed command-line options to main().
    args = parse_args()
    main(infile=args.file, outdir=args.outdir)