From 1ad5f192d6f3288f7d9aa4c8da6f49b20bbffd76 Mon Sep 17 00:00:00 2001
From: Tomas Koristka <koristo1@fit.cvut.cz>
Date: Wed, 1 Apr 2020 17:09:32 +0200
Subject: [PATCH] Added command-line arguments in place of config constants,
 removed verbose per-file progress output

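Input, database, and frequency-file paths are now read from the command
line instead of the constants that previously lived in config.py. An
example invocation (illustrative paths, mirroring the old config
defaults; run from the preprocessing/ directory so the src. imports
resolve):

    python -m src.main \
        -i ../../data/input/sample/ \
        -o ../../data/persistence/docs_and_terms.db \
        -f ../../data/persistence/most_frequent_words.json
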
---
 preprocessing/src/config.py       |  7 ----
 preprocessing/src/main.py         | 12 ++++--
 preprocessing/src/preprocessor.py | 71 ++++++++++++++++++--------------
 3 files changed, 48 insertions(+), 42 deletions(-)
 delete mode 100644 preprocessing/src/config.py

diff --git a/preprocessing/src/config.py b/preprocessing/src/config.py
deleted file mode 100644
index 98d632a..0000000
--- a/preprocessing/src/config.py
+++ /dev/null
@@ -1,7 +0,0 @@
-OUTPUT_PERSISTENCE_PATH = '../../data/persistence/'
-
-SAMPLE_FOLDER_PATH = '../../data/input/sample/'
-INPUT_FOLDER_PATH = '../../data/input/Gutenberg/txt/'
-
-DATABASE_FILE = 'docs_and_terms.db'
-WORD_HIGHEST_FREQUENCY_FILE = 'most_frequent_words.json'
diff --git a/preprocessing/src/main.py b/preprocessing/src/main.py
index 43e0feb..bd86849 100644
--- a/preprocessing/src/main.py
+++ b/preprocessing/src/main.py
@@ -1,4 +1,10 @@
-from config import SAMPLE_FOLDER_PATH
-from preprocessor import preprocess_collection
+import argparse
 
-preprocess_collection(SAMPLE_FOLDER_PATH)
+from src.preprocessor import preprocess_collection
+
+parser = argparse.ArgumentParser(description="Collection preprocessing")
+parser.add_argument("-i", "--input", type=str, help="Path to the input collection directory")
+parser.add_argument("-o", "--output", type=str, help="Path to the output database file")
+parser.add_argument("-f", "--frequency", type=str, help="Path to the output frequency file")
+args = parser.parse_args()
+preprocess_collection(args.input, args.frequency, args.output)
diff --git a/preprocessing/src/preprocessor.py b/preprocessing/src/preprocessor.py
index 9b38f4f..6f2f774 100644
--- a/preprocessing/src/preprocessor.py
+++ b/preprocessing/src/preprocessor.py
@@ -4,37 +4,36 @@ import nltk
 
 from nltk import WordNetLemmatizer
 
-from database.database import Database
-from config import DATABASE_FILE, OUTPUT_PERSISTENCE_PATH, WORD_HIGHEST_FREQUENCY_FILE
-from word_prunner import WordPrunner
+from src.database.database import Database
+from src.word_prunner import WordPrunner
 
 
-def preprocess_collection(input_folder_path: str):
+def preprocess_collection(input_folder_path: str,
+                          output_frequency_file_path: str,
+                          output_database_file_path: str):
     """
-    Parses and saves all documents from input_folder_path to output_persistence_path
+    Parses all documents from input_folder_path and persists the terms and their frequencies
-    :param input_folder_path: path to the document collection to parse
-    :param output_persistence_path: path to the output persistence file
+    :param input_folder_path: Path to the folder containing the documents to preprocess
+    :param output_frequency_file_path: Path to an output file into which terms' highest frequencies will be persisted
+    :param output_database_file_path: Path to an output database into which the processed terms will be persisted
     :return: None
     """
-    Database(OUTPUT_PERSISTENCE_PATH + DATABASE_FILE).drop()
-    frequencies = __parse_collection(input_folder_path)
+    Database(output_database_file_path).drop()
+    frequencies = __parse_collection(input_folder_path, output_database_file_path)
 
-    with open(OUTPUT_PERSISTENCE_PATH + WORD_HIGHEST_FREQUENCY_FILE, 'w') as file:
+    with open(output_frequency_file_path, 'w') as file:
         json.dump(frequencies, file)
 
 
-def __parse_collection(input_folder_path: str) -> dict:
+def __parse_collection(input_folder_path: str, output_database_file_path: str) -> dict:
     """
     Parses all text files in the input_folder_path
     :param input_folder_path: path to the document collection to parse
     :return: dictionary, where key: file path, value: dictionary of terms and their frequencies
     """
-    preprocessor = Preprocessor()
-    index = 1
-    max_index = len(os.listdir(input_folder_path))
+    preprocessor = Preprocessor(output_database_file_path)
+
     for file in os.listdir(input_folder_path):
-        print("[Processing file", index, "/", max_index, "]", file)
-        index += 1
         if file.endswith(".txt"):
             preprocessor.process_file(input_folder_path + file)
 
@@ -64,45 +63,47 @@ class Preprocessor:
         Tool that removes stop words, punctuation & other redundant terms from the document
     terms_highest_frequencies: dict
         Dictionary of terms and their highest frequency in the collection
-
+    database_path: str
+        Path to the database file in which results are persisted
     Methods
     -------
     process_file(path: str) -> (str,dict)
         Loads the document defined by path and processes it into terms and their frequencies
     """
 
-    def __init__(self):
+    def __init__(self, database_path: str):
         self.terms = {}
         self.lemmatiser = WordNetLemmatizer()
         self.prunner = WordPrunner()
         self.terms_highest_frequencies = {}
+        self.database_path = database_path
 
-    def process_file(self, path: str):
+    def process_file(self, file_path: str):
         """
         Reads a document from file and processes it into terms and their frequencies
-        :param path: path to the document to open
+        :param file_path: path to the document to open
         :return: tuple of document path & dictionary of terms and their frequencies
         """
         self.terms = {}  # reset
         try:
-            self.__process_terms(path)
+            self.__process_terms(file_path)
             self.__update_frequencies()
-            self.__persist(path)
+            self.__persist(file_path)
         except FileNotFoundError:
             pass
 
-    def __process_terms(self, path):
-        with open(path, 'r') as file:
+    def __process_terms(self, file_path: str):
+        with open(file_path, 'r') as file:
             line = " "
             while line:
                 try:
                     line = file.readline()
                     for word in self.prunner.prune(nltk.word_tokenize(line)):
-                        self.__add_term(self.lemmatise(word))
+                        self.__add_term(self.__lemmatise(word))
                 except UnicodeDecodeError:
                     pass
 
-    def lemmatise(self, word):
+    def __lemmatise(self, word):
         return self.lemmatiser.lemmatize(word)
 
     def get_most_frequent_words(self) -> dict:
@@ -110,14 +111,14 @@ class Preprocessor:
 
     def __add_term(self, term: str):
         """
-        Adds a term to the document's dictionary
-        :param term: Term to be added
+        Adds a term to the document's dictionary and increments its frequency
+        :param term: Term to add
         :return: None
         """
         # add to terms
         if term not in self.terms:  # is a new term
             self.terms[term] = 0
-        self.terms[term] += 1
+        self.terms[term] += 1  # increase frequency
 
     def __update_frequencies(self):
         """
@@ -129,12 +130,18 @@ class Preprocessor:
             if term not in self.terms_highest_frequencies:  # is a new word
                 self.terms_highest_frequencies[term] = self.terms[term]
 
+            # check if frequency in the latest document is higher
             if self.terms_highest_frequencies[term] < self.terms[term]:
                 self.terms_highest_frequencies[term] = self.terms[term]
 
-    def __persist(self, input_file):
-        database = Database(OUTPUT_PERSISTENCE_PATH + DATABASE_FILE)
-        database.execute('''INSERT OR IGNORE INTO Document(filename) VALUES (?)''', [input_file])
+    def __persist(self, input_file_name):
+        """
+        Persists all terms for a given document
+        :param input_file_name: Path to the document whose terms are persisted
+        :return: None
+        """
+        database = Database(self.database_path)
+        database.execute('''INSERT OR IGNORE INTO Document(filename) VALUES (?)''', [input_file_name])
         database.commit()
         document_key = database.last_primary_key()
         for term in self.terms:
-- 
GitLab