Skip to content
Snippets Groups Projects
Commit 1ad5f192 authored by Tomáš Kořistka's avatar Tomáš Kořistka
Browse files

Added command-line arguments in place of hard-coded constants; removed verbose per-file process output

parent 5510dbd2
No related branches found
No related tags found
Loading
# Default locations used by the preprocessing pipeline.
# NOTE(review): paths are relative to the working directory of the process,
# not to this file — confirm against how the scripts are launched.
OUTPUT_PERSISTENCE_PATH = '../../data/persistence/'
# Small sample collection for quick test runs.
SAMPLE_FOLDER_PATH = '../../data/input/sample/'
# Full Gutenberg text collection.
INPUT_FOLDER_PATH = '../../data/input/Gutenberg/txt/'
# Database file holding documents and their terms (opened via the Database helper).
DATABASE_FILE = 'docs_and_terms.db'
# JSON file mapping each term to its highest per-document frequency.
WORD_HIGHEST_FREQUENCY_FILE = 'most_frequent_words.json'
"""Command-line entry point: preprocess a document collection.

Replaces the former hard-coded config constants with argparse options.
"""
import argparse

from src.preprocessor import preprocess_collection


def main() -> None:
    """Parse CLI arguments and run the collection preprocessing."""
    parser = argparse.ArgumentParser(description="Collection preprocessing")
    parser.add_argument("-i", "--input", type=str,
                        help="Path to the input collection directory")
    parser.add_argument("-o", "--output", type=str,
                        help="Path to the output database file")
    parser.add_argument("-f", "--frequency", type=str,
                        help="Path to the output frequency file")
    args = parser.parse_args()

    # NOTE: positional order of the callee is (input_dir, frequency_file, database_file).
    preprocess_collection(args.input, args.frequency, args.output)


if __name__ == "__main__":
    main()
...@@ -4,37 +4,36 @@ import nltk ...@@ -4,37 +4,36 @@ import nltk
   
from nltk import WordNetLemmatizer from nltk import WordNetLemmatizer
   
from database.database import Database from src.database.database import Database
from config import DATABASE_FILE, OUTPUT_PERSISTENCE_PATH, WORD_HIGHEST_FREQUENCY_FILE from src.word_prunner import WordPrunner
from word_prunner import WordPrunner
   
   
def preprocess_collection(input_folder_path: str,
                          output_frequency_file_path: str,
                          output_database_file_path: str) -> None:
    """
    Parses all documents in input_folder_path and persists the results.

    :param input_folder_path: path to the folder containing the documents to preprocess
    :param output_frequency_file_path: path to the output JSON file receiving each
        term's highest per-document frequency
    :param output_database_file_path: path to the output database file receiving
        the processed terms
    :return: None
    """
    # Start from a clean slate: drop any previous contents of the database.
    Database(output_database_file_path).drop()
    frequencies = __parse_collection(input_folder_path, output_database_file_path)

    # Persist the collection-wide highest term frequencies as JSON.
    with open(output_frequency_file_path, 'w') as file:
        json.dump(frequencies, file)
   
   
def __parse_collection(input_folder_path: str) -> dict: def __parse_collection(input_folder_path: str, output_database_file_path: str) -> dict:
""" """
Parses all text files in the input_folder_path Parses all text files in the input_folder_path
:param input_folder_path: path to the document collection to parse :param input_folder_path: path to the document collection to parse
:return: dictionary, where key: file path, value: dictionary of terms and their frequencies :return: dictionary, where key: file path, value: dictionary of terms and their frequencies
""" """
preprocessor = Preprocessor() preprocessor = Preprocessor(output_database_file_path)
index = 1
max_index = len(os.listdir(input_folder_path))
for file in os.listdir(input_folder_path): for file in os.listdir(input_folder_path):
print("[Processing file", index, "/", max_index, "]", file)
index += 1
if file.endswith(".txt"): if file.endswith(".txt"):
preprocessor.process_file(input_folder_path + file) preprocessor.process_file(input_folder_path + file)
   
...@@ -64,45 +63,47 @@ class Preprocessor: ...@@ -64,45 +63,47 @@ class Preprocessor:
Tool that removes stop words, punctuation & other redundant terms from the document Tool that removes stop words, punctuation & other redundant terms from the document
terms_highest_frequencies: dict terms_highest_frequencies: dict
Dictionary of terms and their highest frequency in the collection Dictionary of terms and their highest frequency in the collection
database_path: str
Path to the database file in which results are persisted
Methods Methods
------- -------
process_file(path: str) -> (str,dict) process_file(path: str) -> (str,dict)
Loads the document defined by path and processes it into terms and their frequencies Loads the document defined by path and processes it into terms and their frequencies
""" """
   
def __init__(self, database_path: str):
    """
    :param database_path: path to the database file in which results are persisted
    """
    self.terms = {}  # term -> frequency within the document currently being processed
    self.lemmatiser = WordNetLemmatizer()  # reduces words to their dictionary base form
    self.prunner = WordPrunner()  # removes stop words, punctuation & other redundant terms
    self.terms_highest_frequencies = {}  # term -> highest frequency seen in any single document
    self.database_path = database_path  # where __persist writes its results
   
def process_file(self, file_path: str):
    """
    Reads a document from file, processes it into terms and their frequencies,
    updates the collection-wide highest frequencies and persists the terms.

    Missing files are skipped silently.

    :param file_path: path to the document to open
    :return: None
    """
    self.terms = {}  # reset per-document term frequencies
    try:
        self.__process_terms(file_path)
        self.__update_frequencies()
        self.__persist(file_path)
    except FileNotFoundError:
        # NOTE(review): deliberate best-effort — a vanished file is ignored;
        # consider logging a warning instead of a bare pass.
        pass
   
def __process_terms(self, file_path: str):
    """
    Tokenises the file line by line, prunes redundant words, lemmatises the
    remainder and accumulates their frequencies into self.terms.

    :param file_path: path to the document to read
    :return: None
    """
    with open(file_path, 'r') as file:
        line = " "  # non-empty sentinel so the loop body runs at least once
        while line:  # file.readline() returns '' at EOF, ending the loop
            try:
                line = file.readline()
                for word in self.prunner.prune(nltk.word_tokenize(line)):
                    self.__add_term(self.__lemmatise(word))
            except UnicodeDecodeError:
                # Skip undecodable lines rather than aborting the whole document.
                pass
   
def __lemmatise(self, word):
    """Return the lemma of *word* as produced by the WordNet lemmatiser."""
    return self.lemmatiser.lemmatize(word)
   
def get_most_frequent_words(self) -> dict: def get_most_frequent_words(self) -> dict:
...@@ -110,14 +111,14 @@ class Preprocessor: ...@@ -110,14 +111,14 @@ class Preprocessor:
   
def __add_term(self, term: str):
    """
    Adds a term to the current document's dictionary and increases its frequency.

    :param term: term to add
    :return: None
    """
    # dict.get with a default replaces the manual "initialise then increment" dance.
    self.terms[term] = self.terms.get(term, 0) + 1
   
def __update_frequencies(self): def __update_frequencies(self):
""" """
...@@ -129,12 +130,18 @@ class Preprocessor: ...@@ -129,12 +130,18 @@ class Preprocessor:
if term not in self.terms_highest_frequencies: # is a new word if term not in self.terms_highest_frequencies: # is a new word
self.terms_highest_frequencies[term] = self.terms[term] self.terms_highest_frequencies[term] = self.terms[term]
   
# check if frequency in the latest document is higher
if self.terms_highest_frequencies[term] < self.terms[term]: if self.terms_highest_frequencies[term] < self.terms[term]:
self.terms_highest_frequencies[term] = self.terms[term] self.terms_highest_frequencies[term] = self.terms[term]
   
def __persist(self, input_file): def __persist(self, input_file_name):
database = Database(OUTPUT_PERSISTENCE_PATH + DATABASE_FILE) """
database.execute('''INSERT OR IGNORE INTO Document(filename) VALUES (?)''', [input_file]) Persists all terms for a given document
:param input_file_name: Path to the persisted document
:return: None
"""
database = Database(self.database_path)
database.execute('''INSERT OR IGNORE INTO Document(filename) VALUES (?)''', [input_file_name])
database.commit() database.commit()
document_key = database.last_primary_key() document_key = database.last_primary_key()
for term in self.terms: for term in self.terms:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment