import json
import os

import nltk
from nltk.stem import WordNetLemmatizer

from src.preprocessing.word_prunner import WordPrunner


def preprocess_collection(input_folder_path: str, output_persistence_path: str):
    """
    Parses and saves all documents from input_folder_path to output_persistence_path
    :param input_folder_path: path to the document collection to parse
    :param output_persistence_path: path to the output persistence file
    :return: None
    """
    documents = parse_collection(input_folder_path)
    with open(output_persistence_path, 'w') as file:
        json.dump(documents, file)


def parse_collection(input_folder_path: str) -> dict:
    """
    Parses all text files in the input_folder_path
    :param input_folder_path: path to the document collection to parse
    :return: dictionary, where key: file path, value: dictionary of terms and their frequencies
    """
    preprocessor = Preprocessor()
    documents = {}
    for file in os.listdir(input_folder_path):
        if file.endswith(".txt"):
            # os.path.join avoids a missing-separator bug when the folder
            # path lacks a trailing slash
            path, words = preprocessor.process_file(os.path.join(input_folder_path, file))
            documents[path] = words
    return documents


def load_documents(path: str) -> dict:
    """
    Loads processed documents from a persistence file
    :param path: path to the persistence file
    :return: dictionary of documents, where key: file path, value: dictionary of terms and their frequencies
    """
    with open(path, 'r') as file:
        return json.load(file)


class Preprocessor:
    """A class that processes a document for analysis

    Attributes
    ----------
    words: dict
        Dictionary of terms and their frequencies in the parsed document
    lemmatiser: WordNetLemmatizer
        Tool that lemmatises the document
    prunner: WordPrunner
        Tool that removes stop words, punctuation & other redundant terms from the document

    Methods
    -------
    process_file(path: str) -> tuple[str, dict]
        Loads the document defined by path and processes it into terms and their frequencies
    """

    def __init__(self):
        self.words = {}
        self.lemmatiser = WordNetLemmatizer()
        self.prunner = WordPrunner()

    def process_file(self, path: str) -> tuple[str, dict]:
        """
        Reads a document from file and processes it into terms and their frequencies
        :param path: path to the document to open
        :return: tuple of document path & dictionary of terms and their frequencies
        """
        self.words = {}  # reset between documents
        with open(path, 'r') as file:
            # iterate over the file directly instead of the fragile
            # readline() sentinel loop
            for line in file:
                tokens = self.prunner.prune(nltk.word_tokenize(line))
                for word in tokens:
                    self.__add_word(word)
        return path, self.words

    def __add_word(self, term: str):
        """
        Adds a term to the document's dictionary
        :param term: term to be added
        :return: None
        """
        # lower-case before lemmatising so "The" and "the" map to the same
        # term (WordNetLemmatizer does not change case by itself)
        word = self.lemmatiser.lemmatize(term.lower())
        # add to words
        if word not in self.words:
            self.words[word] = 0
        self.words[word] += 1
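

# Minimal usage sketch. Assumptions: the NLTK "punkt" and "wordnet" data
# packages are available (downloaded below if missing; some NLTK versions
# also need "punkt_tab" or "omw-1.4"), and the data/collection/ folder and
# data/index.json path are hypothetical — adjust them to your project layout.
if __name__ == "__main__":
    # word_tokenize needs the punkt tokeniser models and WordNetLemmatizer
    # needs the wordnet corpus; these calls are no-ops if already present
    nltk.download("punkt")
    nltk.download("wordnet")

    preprocess_collection("data/collection/", "data/index.json")
    documents = load_documents("data/index.json")
    for doc_path, term_frequencies in documents.items():
        print(doc_path, len(term_frequencies), "distinct terms")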