Newer
Older
import nltk
weirdwizardthomas
committed
weirdwizardthomas
committed
from src.preprocessing.word_prunner import WordPrunner
def preprocess_collection(input_folder_path: str, output_persistence_path):
    """
    Parses the whole collection and persists the results as two JSON files
    The parsed documents go to 'documents.json' and the highest term frequencies
    go to 'most_frequent_words.json'; both names are appended directly to output_persistence_path
    :param input_folder_path: path to the document collection to parse
    :param output_persistence_path: prefix (typically a folder path ending with a separator) of the output files
    :return: None
    """
    parsed_documents, highest_frequencies = __parse_collection(input_folder_path)

    # File name suffix -> payload, written in this order
    outputs = (
        ('documents.json', parsed_documents),
        ('most_frequent_words.json', highest_frequencies),
    )
    for file_name, payload in outputs:
        with open(output_persistence_path + file_name, 'w') as handle:
            json.dump(payload, handle)
def __parse_collection(input_folder_path: str) -> (dict, dict):
    """
    Parses all files in the input_folder_path
    :param input_folder_path: path to the document collection to parse
    :return: tuple of two dictionaries:
        documents, where key: file path, value: dictionary of terms and their frequencies,
        and the highest frequency of each term across the parsed collection
    """
    # One preprocessor for the whole collection so that it can track
    # the highest frequency of every term across all documents
    preprocessor = Preprocessor()
    documents = {}

    # List the folder once; the same listing drives both the progress total and the loop
    file_names = os.listdir(input_folder_path)
    total = len(file_names)

    for position, file_name in enumerate(file_names, start=1):
        print("[Processing file", position, "/", total, "]", file_name)
        # os.path.join also works when input_folder_path lacks a trailing separator
        path, words = preprocessor.process_file(os.path.join(input_folder_path, file_name))
        documents[path] = words

    return documents, preprocessor.get_most_frequent_words()
def load_documents(path: str) -> dict:
    """
    Reads previously persisted documents back from a JSON file
    :param path: Path to the persistence file
    :return: the deserialised content of the file; for a file written by preprocess_collection
        this is a dictionary where key: file path, value: dictionary of terms and their frequencies
    """
    with open(path, 'r') as persisted:
        documents = json.load(persisted)
    return documents
weirdwizardthomas
committed
class Preprocessor:
    """A class that processes a document for analysis

    Attributes
    ----------
    terms: dict
        Dictionary of terms and their frequencies in the parsed document
    lemmatiser: WordNetLemmatizer
        Tool that lemmatises the document
    prunner: WordPrunner
        Tool that removes stop words, punctuation & other redundant terms from the document
    terms_highest_frequencies: dict
        Dictionary of terms and their highest frequency in the collection

    Methods
    -------
    process_file(path: str) -> (str,dict)
        Loads the document defined by path and processes it into terms and their frequencies
    lemmatise(word)
        Lemmatises a single word
    get_most_frequent_words() -> dict
        Returns the highest frequency recorded for every term so far
    """

    def __init__(self):
        # Must exist before the first __add_word call; replaced by a fresh
        # dictionary at the start of every process_file call
        self.terms = {}
        self.lemmatiser = WordNetLemmatizer()
        self.prunner = WordPrunner()
        self.terms_highest_frequencies = {}

    def process_file(self, path: str) -> (str, dict):
        """
        Reads a document from file and processes it into terms and their frequencies
        :param path: path to the document to open
        :return: tuple of document path & dictionary of terms and their frequencies
        """
        # Fresh dictionary per document: counts must not leak between documents
        # and every caller must receive its own dictionary object
        self.terms = {}

        with open(path, 'r') as file:
            for line in file:
                tokens = self.prunner.prune(nltk.word_tokenize(line))
                for word in tokens:
                    self.__add_word(self.lemmatise(word))

        self.__update_frequencies()
        return path, self.terms

    def lemmatise(self, word):
        """
        Lemmatises a single word using the instance's lemmatiser
        :param word: word to lemmatise
        :return: the lemmatiser's output for the word
        """
        return self.lemmatiser.lemmatize(word)

    def get_most_frequent_words(self) -> dict:
        """
        :return: dictionary of terms and their highest frequency among the documents processed so far
        """
        return self.terms_highest_frequencies

    def __add_word(self, term: str):
        """
        Adds a term to the document's dictionary, or increments its frequency if already present
        :param term: Term to be added
        :return: None
        """
        self.terms[term] = self.terms.get(term, 0) + 1

    def __update_frequencies(self):
        """
        Updates all frequencies to contain the highest current frequency of a given term
        If the frequency of a term in the currently processed document is higher than the current highest, replace it
        :return: None
        """
        highest = self.terms_highest_frequencies
        for term, frequency in self.terms.items():
            # A term seen for the first time starts at its frequency in this document
            highest[term] = max(frequency, highest.get(term, frequency))