Skip to content
Snippets Groups Projects
Commit 1ad5f192 authored by Tomáš Kořistka's avatar Tomáš Kořistka
Browse files

Added command-line arguments in place of hard-coded constants; removed verbose per-file process output

parent 5510dbd2
No related branches found
No related tags found
Loading
# Default locations used by the preprocessing pipeline.
# NOTE(review): paths are relative to the working directory of the process,
# not to this file — confirm against how the scripts are launched.
OUTPUT_PERSISTENCE_PATH = '../../data/persistence/'
# Small sample collection for quick test runs.
SAMPLE_FOLDER_PATH = '../../data/input/sample/'
# Full Gutenberg text collection.
INPUT_FOLDER_PATH = '../../data/input/Gutenberg/txt/'
# Database file holding documents and their terms (opened via the Database helper).
DATABASE_FILE = 'docs_and_terms.db'
# JSON file mapping each term to its highest per-document frequency.
WORD_HIGHEST_FREQUENCY_FILE = 'most_frequent_words.json'
"""Command-line entry point: preprocess a document collection.

Replaces the former hard-coded config constants with argparse options.
"""
import argparse

from src.preprocessor import preprocess_collection


def main() -> None:
    """Parse CLI arguments and run the collection preprocessing."""
    parser = argparse.ArgumentParser(description="Collection preprocessing")
    parser.add_argument("-i", "--input", type=str,
                        help="Path to the input collection directory")
    parser.add_argument("-o", "--output", type=str,
                        help="Path to the output database file")
    parser.add_argument("-f", "--frequency", type=str,
                        help="Path to the output frequency file")
    args = parser.parse_args()

    # NOTE: positional order of the callee is (input_dir, frequency_file, database_file).
    preprocess_collection(args.input, args.frequency, args.output)


if __name__ == "__main__":
    main()
...@@ -4,37 +4,36 @@ import nltk ...@@ -4,37 +4,36 @@ import nltk
   
from nltk import WordNetLemmatizer from nltk import WordNetLemmatizer
   
from database.database import Database from src.database.database import Database
from config import DATABASE_FILE, OUTPUT_PERSISTENCE_PATH, WORD_HIGHEST_FREQUENCY_FILE from src.word_prunner import WordPrunner
from word_prunner import WordPrunner
   
   
def preprocess_collection(input_folder_path: str,
                          output_frequency_file_path: str,
                          output_database_file_path: str) -> None:
    """
    Parses all documents in input_folder_path and persists the results.

    :param input_folder_path: path to the folder containing the documents to preprocess
    :param output_frequency_file_path: path to the output JSON file receiving each
        term's highest per-document frequency
    :param output_database_file_path: path to the output database file receiving
        the processed terms
    :return: None
    """
    # Start from a clean slate: drop any previous contents of the database.
    Database(output_database_file_path).drop()
    frequencies = __parse_collection(input_folder_path, output_database_file_path)

    # Persist the collection-wide highest term frequencies as JSON.
    with open(output_frequency_file_path, 'w') as file:
        json.dump(frequencies, file)
   
   
def __parse_collection(input_folder_path: str) -> dict: def __parse_collection(input_folder_path: str, output_database_file_path: str) -> dict:
""" """
Parses all text files in the input_folder_path Parses all text files in the input_folder_path
:param input_folder_path: path to the document collection to parse :param input_folder_path: path to the document collection to parse
:return: dictionary, where key: file path, value: dictionary of terms and their frequencies :return: dictionary, where key: file path, value: dictionary of terms and their frequencies
""" """
preprocessor = Preprocessor() preprocessor = Preprocessor(output_database_file_path)
index = 1
max_index = len(os.listdir(input_folder_path))
for file in os.listdir(input_folder_path): for file in os.listdir(input_folder_path):
print("[Processing file", index, "/", max_index, "]", file)
index += 1
if file.endswith(".txt"): if file.endswith(".txt"):
preprocessor.process_file(input_folder_path + file) preprocessor.process_file(input_folder_path + file)
   
...@@ -64,45 +63,47 @@ class Preprocessor: ...@@ -64,45 +63,47 @@ class Preprocessor:
Tool that removes stop words, punctuation & other redundant terms from the document Tool that removes stop words, punctuation & other redundant terms from the document
terms_highest_frequencies: dict terms_highest_frequencies: dict
Dictionary of terms and their highest frequency in the collection Dictionary of terms and their highest frequency in the collection
database_path: str
Path to the database file in which results are persisted
Methods Methods
------- -------
process_file(path: str) -> (str,dict) process_file(path: str) -> (str,dict)
Loads the document defined by path and processes it into terms and their frequencies Loads the document defined by path and processes it into terms and their frequencies
""" """
   
def __init__(self, database_path: str):
    """
    :param database_path: path to the database file in which results are persisted
    """
    self.terms = {}  # term -> frequency within the document currently being processed
    self.lemmatiser = WordNetLemmatizer()  # reduces words to their dictionary base form
    self.prunner = WordPrunner()  # removes stop words, punctuation & other redundant terms
    self.terms_highest_frequencies = {}  # term -> highest frequency seen in any single document
    self.database_path = database_path  # where __persist writes its results
   
def process_file(self, file_path: str):
    """
    Reads a document from file, processes it into terms and their frequencies,
    updates the collection-wide highest frequencies and persists the terms.

    Missing files are skipped silently.

    :param file_path: path to the document to open
    :return: None
    """
    self.terms = {}  # reset per-document term frequencies
    try:
        self.__process_terms(file_path)
        self.__update_frequencies()
        self.__persist(file_path)
    except FileNotFoundError:
        # NOTE(review): deliberate best-effort — a vanished file is ignored;
        # consider logging a warning instead of a bare pass.
        pass
   
def __process_terms(self, file_path: str):
    """
    Tokenises the file line by line, prunes redundant words, lemmatises the
    remainder and accumulates their frequencies into self.terms.

    :param file_path: path to the document to read
    :return: None
    """
    with open(file_path, 'r') as file:
        line = " "  # non-empty sentinel so the loop body runs at least once
        while line:  # file.readline() returns '' at EOF, ending the loop
            try:
                line = file.readline()
                for word in self.prunner.prune(nltk.word_tokenize(line)):
                    self.__add_term(self.__lemmatise(word))
            except UnicodeDecodeError:
                # Skip undecodable lines rather than aborting the whole document.
                pass
   
def __lemmatise(self, word):
    """Return the lemma of *word* as produced by the WordNet lemmatiser."""
    return self.lemmatiser.lemmatize(word)
   
def get_most_frequent_words(self) -> dict: def get_most_frequent_words(self) -> dict:
...@@ -110,14 +111,14 @@ class Preprocessor: ...@@ -110,14 +111,14 @@ class Preprocessor:
   
def __add_term(self, term: str):
    """
    Adds a term to the current document's dictionary and increases its frequency.

    :param term: term to add
    :return: None
    """
    # dict.get with a default replaces the manual "initialise then increment" dance.
    self.terms[term] = self.terms.get(term, 0) + 1
   
def __update_frequencies(self): def __update_frequencies(self):
""" """
...@@ -129,12 +130,18 @@ class Preprocessor: ...@@ -129,12 +130,18 @@ class Preprocessor:
if term not in self.terms_highest_frequencies: # is a new word if term not in self.terms_highest_frequencies: # is a new word
self.terms_highest_frequencies[term] = self.terms[term] self.terms_highest_frequencies[term] = self.terms[term]
   
# check if frequency in the latest document is higher
if self.terms_highest_frequencies[term] < self.terms[term]: if self.terms_highest_frequencies[term] < self.terms[term]:
self.terms_highest_frequencies[term] = self.terms[term] self.terms_highest_frequencies[term] = self.terms[term]
   
def __persist(self, input_file): def __persist(self, input_file_name):
database = Database(OUTPUT_PERSISTENCE_PATH + DATABASE_FILE) """
database.execute('''INSERT OR IGNORE INTO Document(filename) VALUES (?)''', [input_file]) Persists all terms for a given document
:param input_file_name: Path to the persisted document
:return: None
"""
database = Database(self.database_path)
database.execute('''INSERT OR IGNORE INTO Document(filename) VALUES (?)''', [input_file_name])
database.commit() database.commit()
document_key = database.last_primary_key() document_key = database.last_primary_key()
for term in self.terms: for term in self.terms:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment