Commit 8f89f98b authored by weirdwizardthomas

Added documentation to preprocessor.py and word_prunner.py

parent 0b0b7f0f
Usage example:

from src.preprocessing.preprocessor import preprocess_collection, load_documents

preprocess_collection("./data/sample/", "./data/persistence/documents.json")
documents = load_documents("./data/persistence/documents.json")
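Note: the pipeline relies on NLTK's tokeniser, stop-word list and WordNet lemmatiser, so the corresponding NLTK data packages must be downloaded once before the snippet above runs:

import nltk

nltk.download('punkt')      # backs nltk.word_tokenize
nltk.download('stopwords')  # backs WordPrunner's stop-word set
nltk.download('wordnet')    # backs WordNetLemmatizer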
# src/preprocessing/preprocessor.py
import json
import os

import nltk
from nltk import WordNetLemmatizer

from src.preprocessing.word_prunner import WordPrunner
 
 
def preprocess_collection(input_folder_path: str, output_persistence_path: str):
    """
    Parses all documents from input_folder_path and saves them to output_persistence_path
    :param input_folder_path: path to the document collection to parse
    :param output_persistence_path: path to the output persistence file
    :return: None
    """
    documents = parse_collection(input_folder_path)

    with open(output_persistence_path, 'w') as file:
        json.dump(documents, file)
 
 
def parse_collection(input_folder_path: str) -> dict:
    """
    Parses all text files in input_folder_path
    :param input_folder_path: path to the document collection to parse
    :return: dictionary, where key: file path, value: dictionary of terms and their frequencies
    """
    preprocessor = Preprocessor()
    documents = {}
    for file in os.listdir(input_folder_path):
        if file.endswith(".txt"):
            # os.path.join avoids depending on a trailing slash in input_folder_path
            path, words = preprocessor.process_file(os.path.join(input_folder_path, file))
            documents[path] = words

    return documents
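parse_collection can also be used on its own when persistence is not needed (the folder path is the sample one from the usage snippet above):

documents = parse_collection("./data/sample/")
print(len(documents), "documents parsed")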
 
 
def load_documents(path: str) -> dict:
    """
    Loads processed documents from a persistence file
    :param path: path to the persistence file
    :return: dictionary of documents, where key: file path, value: dictionary of terms and their frequencies
    """
    with open(path, 'r') as file:
        return json.load(file)
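For orientation, the persistence file is plain JSON mapping each document path to its term-frequency dictionary; a sketch of reading it back (the paths and counts below are invented):

# illustrative shape of documents.json (paths and counts are made up):
# {"./data/sample/a.txt": {"cat": 2, "mat": 1}, "./data/sample/b.txt": {"dog": 1}}
documents = load_documents("./data/persistence/documents.json")
for path, frequencies in documents.items():
    print(path, "->", sum(frequencies.values()), "term occurrences")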
 
 
class Preprocessor:
    """A class that processes a document for analysis.

    Attributes
    ----------
    words : dict
        Dictionary of terms and their frequencies in the parsed document
    lemmatiser : WordNetLemmatizer
        Tool that lemmatises the document's terms
    prunner : WordPrunner
        Tool that removes stop words, punctuation & other redundant terms from the document

    Methods
    -------
    process_file(path: str) -> (str, dict)
        Loads the document at path and processes it into terms and their frequencies
    """

    def __init__(self):
        self.words = {}
        self.lemmatiser = WordNetLemmatizer()
        self.prunner = WordPrunner()
 
    def process_file(self, path: str) -> (str, dict):
        """
        Reads a document from a file and processes it into terms and their frequencies
        :param path: path to the document to open
        :return: tuple of document path & dictionary of terms and their frequencies
        """
        self.words = {}  # reset state left over from any previously processed document
        with open(path, 'r') as file:
            for line in file:
                tokens = self.prunner.prune(nltk.word_tokenize(line))
                for word in tokens:
                    self.__add_word(word)
        return path, self.words
 
    def __add_word(self, term: str):
        """
        Adds a term to the document's dictionary, incrementing its frequency
        :param term: term to be added
        :return: None
        """
        # normalise the term to its base form (lemma)
        word = self.lemmatiser.lemmatize(term)
        # add to words, counting how many times the lemma occurs
        self.words[word] = self.words.get(word, 0) + 1
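A minimal sketch of using Preprocessor directly on a single document (the file path and output are invented for illustration):

preprocessor = Preprocessor()
path, words = preprocessor.process_file("./data/sample/example.txt")
print(words)  # e.g. {'cat': 2, 'sat': 1, 'mat': 1}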

# src/preprocessing/word_prunner.py
import string
from nltk.corpus import stopwords
 
 
class WordPrunner:
    """A class that removes stop words, numbers & punctuation from a collection of strings.

    Attributes
    ----------
    stop_words : set
        Set of stop-word strings

    Methods
    -------
    prune(tokens: list) -> list
        Removes all stop words, numbers & punctuation from tokens
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
 
    def prune(self, tokens: list) -> list:
        """
        Removes all stop words, numbers & punctuation from tokens
        :param tokens: list of strings to be pruned
        :return: list of strings containing no stop words, numbers, or punctuation
        """
        # lower-case first so stop-word matching is case-insensitive
        tokens = [token.lower() for token in tokens]
        # keep only purely alphabetic terms that are not stop words
        return [term for term in tokens if term.isalpha() and term not in self.stop_words]
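A quick sanity check of the pruning behaviour (the sentence is an invented example, and it assumes the NLTK data downloads noted at the top):

import nltk
from src.preprocessing.word_prunner import WordPrunner

prunner = WordPrunner()
tokens = nltk.word_tokenize("The 3 quick brown foxes jumped over 2 lazy dogs!")
print(prunner.prune(tokens))
# -> ['quick', 'brown', 'foxes', 'jumped', 'lazy', 'dogs']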