From 1ad5f192d6f3288f7d9aa4c8da6f49b20bbffd76 Mon Sep 17 00:00:00 2001
From: Tomas Koristka <koristo1@fit.cvut.cz>
Date: Wed, 1 Apr 2020 17:09:32 +0200
Subject: [PATCH] Added command-line arguments in place of config constants;
 removed the verbose per-file progress output

---
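Notes:

With config.py deleted, the input collection, output database and frequency
file are passed on the command line instead of being hard-coded. A sketch of
an invocation that reuses the old config.py defaults (assuming it is run
from preprocessing/ so that the src package resolves; the paths are
illustrative, not built-in defaults):

    python -m src.main \
        -i ../../data/input/sample/ \
        -o ../../data/persistence/docs_and_terms.db \
        -f ../../data/persistence/most_frequent_words.json

All three flags must be supplied: with the constants gone there are no
fallback defaults. The per-file "[Processing file i / n]" progress print is
removed at the same time, so large collections now run without console
output.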
 preprocessing/src/config.py       |  7 ----
 preprocessing/src/main.py         | 12 ++++--
 preprocessing/src/preprocessor.py | 69 +++++++++++++++++--------------
 3 files changed, 47 insertions(+), 41 deletions(-)
 delete mode 100644 preprocessing/src/config.py

diff --git a/preprocessing/src/config.py b/preprocessing/src/config.py
deleted file mode 100644
index 98d632a..0000000
--- a/preprocessing/src/config.py
+++ /dev/null
@@ -1,7 +0,0 @@
-OUTPUT_PERSISTENCE_PATH = '../../data/persistence/'
-
-SAMPLE_FOLDER_PATH = '../../data/input/sample/'
-INPUT_FOLDER_PATH = '../../data/input/Gutenberg/txt/'
-
-DATABASE_FILE = 'docs_and_terms.db'
-WORD_HIGHEST_FREQUENCY_FILE = 'most_frequent_words.json'

diff --git a/preprocessing/src/main.py b/preprocessing/src/main.py
index 43e0feb..bd86849 100644
--- a/preprocessing/src/main.py
+++ b/preprocessing/src/main.py
@@ -1,4 +1,10 @@
-from config import SAMPLE_FOLDER_PATH
-from preprocessor import preprocess_collection
+import argparse
 
-preprocess_collection(SAMPLE_FOLDER_PATH)
+from src.preprocessor import preprocess_collection
+
+parser = argparse.ArgumentParser(description="Collection preprocessing")
+parser.add_argument("-i", "--input", type=str, help="Path to the input collection directory")
+parser.add_argument("-o", "--output", type=str, help="Path to the output database file")
+parser.add_argument("-f", "--frequency", type=str, help="Path to the output frequency file")
+args = parser.parse_args()
+preprocess_collection(args.input, args.frequency, args.output)

diff --git a/preprocessing/src/preprocessor.py b/preprocessing/src/preprocessor.py
index 9b38f4f..6f2f774 100644
--- a/preprocessing/src/preprocessor.py
+++ b/preprocessing/src/preprocessor.py
@@ -4,37 +4,36 @@
 import nltk
 from nltk import WordNetLemmatizer
 
-from database.database import Database
-from config import DATABASE_FILE, OUTPUT_PERSISTENCE_PATH, WORD_HIGHEST_FREQUENCY_FILE
-from word_prunner import WordPrunner
+from src.database.database import Database
+from src.word_prunner import WordPrunner
 
 
-def preprocess_collection(input_folder_path: str):
+def preprocess_collection(input_folder_path: str,
+                          output_frequency_file_path: str,
+                          output_database_file_path: str):
     """
     Parses and saves all documents from input_folder_path to output_persistence_path
-    :param input_folder_path: path to the document collection to parse
-    :param output_persistence_path: path to the output persistence file
+    :param input_folder_path: Path to the folder containing the documents (files) to preprocess
+    :param output_frequency_file_path: Path to an output file into which terms' highest frequencies will be persisted
+    :param output_database_file_path: Path to an output database into which the processed terms will be persisted
     :return: None
     """
-    Database(OUTPUT_PERSISTENCE_PATH + DATABASE_FILE).drop()
-    frequencies = __parse_collection(input_folder_path)
+    Database(output_database_file_path).drop()
+    frequencies = __parse_collection(input_folder_path, output_database_file_path)
 
-    with open(OUTPUT_PERSISTENCE_PATH + WORD_HIGHEST_FREQUENCY_FILE, 'w') as file:
+    with open(output_frequency_file_path, 'w') as file:
         json.dump(frequencies, file)
 
 
-def __parse_collection(input_folder_path: str) -> dict:
+def __parse_collection(input_folder_path: str,
+                       output_database_file_path: str) -> dict:
     """
     Parses all text files in the input_folder_path
     :param input_folder_path: path to the document collection to parse
     :return: dictionary, where key: file path, value: dictionary of terms and their frequencies
     """
-    preprocessor = Preprocessor()
-    index = 1
-    max_index = len(os.listdir(input_folder_path))
+    preprocessor = Preprocessor(output_database_file_path)
+
     for file in os.listdir(input_folder_path):
-        print("[Processing file", index, "/", max_index, "]", file)
-        index += 1
         if file.endswith(".txt"):
             preprocessor.process_file(input_folder_path + file)
@@ -64,45 +63,47 @@ class Preprocessor:
         Tool that removes stop words, punctuation & other redundant terms from the document
     terms_highest_frequencies: dict
         Dictionary of terms and their highest frequency in the collection
-
+    database_path: str
+        Path to the database file in which results are persisted
     Methods
     -------
     process_file(path: str) -> (str,dict)
         Loads the document defined by path and processes it into terms and their frequencies
     """
 
-    def __init__(self):
+    def __init__(self, database_path: str):
         self.terms = {}
         self.lemmatiser = WordNetLemmatizer()
         self.prunner = WordPrunner()
         self.terms_highest_frequencies = {}
+        self.database_path = database_path
 
-    def process_file(self, path: str):
+    def process_file(self, file_path: str):
         """
         Reads a document from file and processes it into terms and their frequencies
-        :param path: path to the document to open
+        :param file_path: path to the document to open
         :return: tuple of document path & dictionary of terms and their frequencies
         """
         self.terms = {}  # reset
         try:
-            self.__process_terms(path)
+            self.__process_terms(file_path)
             self.__update_frequencies()
-            self.__persist(path)
+            self.__persist(file_path)
         except FileNotFoundError:
             pass
 
-    def __process_terms(self, path):
-        with open(path, 'r') as file:
+    def __process_terms(self, file_path: str):
+        with open(file_path, 'r') as file:
             line = " "
             while line:
                 try:
                     line = file.readline()
                     for word in self.prunner.prune(nltk.word_tokenize(line)):
-                        self.__add_term(self.lemmatise(word))
+                        self.__add_term(self.__lemmatise(word))
                 except UnicodeDecodeError:
                     pass
 
-    def lemmatise(self, word):
+    def __lemmatise(self, word):
         return self.lemmatiser.lemmatize(word)
 
     def get_most_frequent_words(self) -> dict:
@@ -110,14 +111,14 @@ class Preprocessor:
 
     def __add_term(self, term: str):
         """
-        Adds a term to the document's dictionary
-        :param term: Term to be added
+        Adds a term to the document's dictionary and records its frequency
+        :param term: Term to add
         :return: None
         """
         # add to terms
         if term not in self.terms:  # is a new term
             self.terms[term] = 0
-        self.terms[term] += 1
+        self.terms[term] += 1  # increase frequency
 
     def __update_frequencies(self):
         """
@@ -129,12 +130,18 @@ class Preprocessor:
         if term not in self.terms_highest_frequencies:  # is a new word
             self.terms_highest_frequencies[term] = self.terms[term]
 
+        # check if frequency in the latest document is higher
         if self.terms_highest_frequencies[term] < self.terms[term]:
             self.terms_highest_frequencies[term] = self.terms[term]
 
-    def __persist(self, input_file):
-        database = Database(OUTPUT_PERSISTENCE_PATH + DATABASE_FILE)
-        database.execute('''INSERT OR IGNORE INTO Document(filename) VALUES (?)''', [input_file])
+    def __persist(self, input_file_name):
+        """
+        Persists all terms for a given document
+        :param input_file_name: Path to the persisted document
+        :return: None
+        """
+        database = Database(self.database_path)
+        database.execute('''INSERT OR IGNORE INTO Document(filename) VALUES (?)''', [input_file_name])
         database.commit()
         document_key = database.last_primary_key()
         for term in self.terms:
-- 
GitLab
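
P.S. For callers that use the preprocessor directly rather than through
main.py, a minimal sketch of the refactored entry point, reusing the old
config.py paths purely as illustrative arguments:

    from src.preprocessor import preprocess_collection

    # Argument order is (input folder, frequency JSON, database file);
    # main.py maps -i / -f / -o onto these parameters in the same order.
    # The folder path needs its trailing slash, since files are opened via
    # the plain concatenation input_folder_path + file, and the target
    # database is dropped before parsing starts.
    preprocess_collection('../../data/input/sample/',
                          '../../data/persistence/most_frequent_words.json',
                          '../../data/persistence/docs_and_terms.db')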