Spaces:

exbert-project
/

exbert

Running on CPU Upgrade

App Files Files Community

exbert / server /data_processing /sentence_extracting.py

bhoov

First commit

63858e7 almost 6 years ago

raw

history blame

5.6 kB

	"""Extractor functions to retrieve sentences by character chunks from a file

	This script contains the logic that allows the user to process and filter
	sentences of the original corpus. By default, this considers a minimum sentence
	length, and removes newlines and multiple consecutive spaces.

	Configuration for existing functionality is at the top of the file. Feel free to
	add new processing and/or filter functions. The "process_line" and "filter_line"
	functions contain the pipeline for processing the scripts as needed.

	"""
	import regex as re
	import argparse
	from pathlib import Path
	from functools import partial
	from typing import Union

	# Minimum number of words a processed line must contain to be kept;
	# `filter_line` passes this value to `is_short_sentence`.
	MIN_LINE_LENGTH = 8 # words

	def parse_args():
	    """Parse the command-line arguments of this script.

	    Both options are required: without them the script would only fail later
	    with an opaque error when the missing value is turned into a path.

	    Returns:
	        An `argparse.Namespace` with the attributes `file` (path of the input
	        text file) and `outdir` (directory that receives the output file).
	    """
	    parser = argparse.ArgumentParser()
	    parser.add_argument(
	        "-f", "--file", required=True,
	        help="Path to .txt file to analyze and annotate",
	    )
	    # The extracted sentences are written as plain text (`<input stem>.txt`),
	    # so the help text must not promise a pickle file.
	    parser.add_argument(
	        "-o", "--outdir", required=True,
	        help="Path of directory in which to store the extracted sentences as a .txt file",
	    )

	    return parser.parse_args()

	# ============================================================
	# Helper functions
	# ============================================================
	# String -> String
	def replace_newlines(s:str) -> str:
	    """Return `s` with every run of one or more newlines replaced by a single space."""
	    newline_run = r"\n+"
	    single_space = " "
	    return re.sub(newline_run, single_space, s)

	# String -> String
	def replace_multispace(s:str) -> str:
	    """Return `s` with every run of whitespace collapsed into a single space.

	    The pattern is `\\s+`, so tabs and newlines are collapsed as well, not only
	    literal space characters.
	    """
	    whitespace_run = r"\s+"
	    single_space = " "
	    return re.sub(whitespace_run, single_space, s)

	def is_short_sentence(s: str, min_len=8) -> bool:
	    """Return True if the sentence has fewer than `min_len` words.

	    Args:
	        s: Sentence to measure.
	        min_len: Minimum number of words a sentence needs to not count as short.

	    Returns:
	        True when `s` contains fewer than `min_len` whitespace-separated words.
	    """
	    # Split on any whitespace run rather than on a single ' ': splitting on ' '
	    # produces empty strings for leading/trailing/repeated spaces (e.g. the
	    # space left behind by a replaced trailing newline), which inflated the
	    # word count and let sentences that were one word too short slip through.
	    return len(s.split()) < min_len

	def contains_char(char: str, s: str) -> bool:
	    """Return True if `char` occurs anywhere in `s`.

	    Args:
	        char: Character (or, more generally, substring) to look for.
	        s: String to search.

	    Returns:
	        The result of the membership test `char in s`.
	    """
	    return char in s

	# ============================================================
	# Compilation functions
	# ============================================================

	def process_line(line:str) -> str:
	    """Normalize the whitespace of a chunk of text.

	    The chunk is passed through the processing steps in order: newlines are
	    replaced first, then runs of whitespace are collapsed. Add further steps to
	    the tuple below to extend the pipeline.

	    Args:
	        line: Chunk of text

	    Returns:
	        The chunk with newlines replaced by spaces and consecutive whitespace
	        collapsed into single spaces.
	    """
	    cleaned = line
	    for step in (replace_newlines, replace_multispace):
	        cleaned = step(cleaned)
	    return cleaned

	def filter_line(line:str) -> bool:
	    """Decide whether a processed line should be kept.

	    A line is kept when it is not shorter than MIN_LINE_LENGTH words. Redefine
	    this function with the desired helper functions to change the filtering;
	    return True for lines that should be kept.
	    """
	    too_short = is_short_sentence(line, MIN_LINE_LENGTH)
	    if too_short:
	        return False
	    return True

	# ============================================================
	# Main Logic
	# ============================================================

	def read_outcomes(chars:str) -> Union[str, None]:
	    """Turn a raw chunk of characters into a processed chunk, or reject it.

	    Args:
	        chars: Chunk of text to process

	    Returns:
	        The processed chunk if it passes `filter_line`, otherwise None.

	    Raises:
	        StopIteration: If `chars` is the empty string "", which is what a file
	            read returns once the end of the file has been reached.
	    """
	    if chars == '':
	        raise StopIteration

	    processed = process_line(chars)
	    return processed if filter_line(processed) else None

	def get_chars(n:int, f) -> Union[str, None]:
	    """Read the next `n` characters from the opened file `f` and process them.

	    Args:
	        n: Number of characters to read from the opened file
	        f: Opened file, as returned by `open(fname)`

	    Returns:
	        The processed chunk of text, or None if the chunk does not pass the
	        filtering.

	    Raises:
	        Nothing of its own; the StopIteration raised by `read_outcomes` for an
	        empty read is passed up to the caller.
	    """
	    return read_outcomes(f.read(n))

	def get_line(f):
	    """Read the next line from the opened file `f` and process it.

	    There are three possible outcomes, all decided by `read_outcomes`:

	    1. StopIteration is raised when the read returns the empty string, i.e. the
	       end of the file has been reached
	    2. The processed line is returned if it passes the filter
	    3. None is returned if the line does not pass the filter
	    """
	    return read_outcomes(f.readline())

	def read_on(reader, f):
	    """Repeatedly apply `reader` to the open file `f` and yield what it returns.

	    Args:
	        reader: A unary function taking the opened file. It returns the next
	            item, returns None for an item that should be skipped, and raises
	            StopIteration once there is nothing left to read.
	        f: An opened file, as returned by `open(fname)`

	    Yields:
	        Every non-None value produced by `reader`, until `reader` raises
	        StopIteration.
	    """
	    while True:
	        try:
	            item = reader(f)
	        except StopIteration:
	            # The reader signalled the end of the input: finish the generator.
	            return

	        if item is None:
	            # Filtered-out item: skip it and keep reading.
	            continue
	        yield item


	def extract_chars(infile, n=10000):
	    """Yield the processed chunks of `n` characters read from a file.

	    Args:
	        infile: Path of the text file to read.
	        n: Number of characters to read per chunk.

	    Returns:
	        A generator over the processed chunks that pass the filter. The file is
	        opened immediately and closed once the generator is exhausted or closed.
	    """
	    reader = partial(get_chars, n)
	    src = open(infile, 'r')

	    def _chunks():
	        # A `close()` placed after the `return` could never run and leaked the
	        # file handle; the `with` block closes the file when iteration ends.
	        with src:
	            yield from read_on(reader, src)

	    return _chunks()


	def extract_lines(infile):
	    """Yield the processed lines of a file.

	    Args:
	        infile: Path of the text file to read.

	    Returns:
	        A generator over the processed lines that pass the filter. The file is
	        opened immediately and closed once the generator is exhausted or closed.
	    """
	    src = open(infile, 'r')

	    def _lines():
	        # A `close()` placed after the `return` could never run and leaked the
	        # file handle; the `with` block closes the file when iteration ends.
	        with src:
	            yield from read_on(get_line, src)

	    return _lines()


	def extract_sentences_to_file(infile, outfname:str):
	    """Extract sentences from a file into a new file indicated by `outfname`.

	    Args:
	        infile: Path of the text file to read.
	        outfname: Path of the file to create; one extracted sentence is written
	            per line.

	    Returns:
	        `outfname`, so that callers can report where the sentences were stored.

	    Raises:
	        FileExistsError: If `outfname` already exists (it is opened in
	            exclusive-creation mode 'x' so existing results are never
	            overwritten).
	    """
	    # The context manager closes the output file even if reading or writing
	    # fails part-way through.
	    with open(outfname, 'x') as out:
	        for line in extract_lines(infile):
	            out.write(line + "\n")

	    return outfname

	def main(infile, outdir):
	    """Create `outdir` and save the processed sentences of `infile` into it.

	    Args:
	        infile: Path of the text file to read.
	        outdir: Directory in which to store the result; it is created (including
	            missing parents) if it does not exist yet.

	    Returns:
	        Path of the written file: `outdir` joined with the stem of `infile` plus
	        the '.txt' suffix.
	    """
	    outfname = Path(infile).stem + '.txt'
	    outdir = Path(outdir)
	    outdir.mkdir(parents=True, exist_ok=True)
	    outfile = outdir / outfname
	    extract_sentences_to_file(infile, outfile)

	    # Return the path computed here instead of forwarding the result of the
	    # extraction call, so the caller always receives the output location.
	    return outfile

	if __name__ == "__main__":
	    # Script entry point: read the CLI options and run the extraction.
	    cli_args = parse_args()
	    main(infile=cli_args.file, outdir=cli_args.outdir)