Commit 8f89f98b authored by weirdwizardthomas

Added documentation to preprocessor.py and word_prunner.py

parent 0b0b7f0f
Usage example:

from src.preprocessing.preprocessor import preprocess_collection, load_documents

preprocess_collection("./data/sample/", "./data/persistence/documents.json")
documents = load_documents("./data/persistence/documents.json")
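Note: the pipeline relies on NLTK's tokeniser, stop-word list and WordNet lemmatiser, so the corresponding NLTK data packages must be downloaded once before the snippet above runs:

import nltk

nltk.download('punkt')      # backs nltk.word_tokenize
nltk.download('stopwords')  # backs WordPrunner's stop-word set
nltk.download('wordnet')    # backs WordNetLemmatizer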
# src/preprocessing/preprocessor.py
import json
import os

import nltk
from nltk import WordNetLemmatizer

from src.preprocessing.word_prunner import WordPrunner
 
 
def preprocess_collection(input_folder_path: str, output_persistence_path: str):
    """
    Parses all documents from input_folder_path and saves them to output_persistence_path
    :param input_folder_path: path to the document collection to parse
    :param output_persistence_path: path to the output persistence file
    :return: None
    """
    documents = parse_collection(input_folder_path)

    with open(output_persistence_path, 'w') as file:
        json.dump(documents, file)
 
 
def parse_collection(input_folder_path: str) -> dict:
    """
    Parses all text files in input_folder_path
    :param input_folder_path: path to the document collection to parse
    :return: dictionary, where key: file path, value: dictionary of terms and their frequencies
    """
    preprocessor = Preprocessor()
    documents = {}
    for file in os.listdir(input_folder_path):
        if file.endswith(".txt"):
            # os.path.join avoids depending on a trailing slash in input_folder_path
            path, words = preprocessor.process_file(os.path.join(input_folder_path, file))
            documents[path] = words

    return documents
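parse_collection can also be used on its own when persistence is not needed (the folder path is the sample one from the usage snippet above):

documents = parse_collection("./data/sample/")
print(len(documents), "documents parsed")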
 
 
def load_documents(path: str) -> dict:
    """
    Loads processed documents from a persistence file
    :param path: path to the persistence file
    :return: dictionary of documents, where key: file path, value: dictionary of terms and their frequencies
    """
    with open(path, 'r') as file:
        return json.load(file)
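For orientation, the persistence file is plain JSON mapping each document path to its term-frequency dictionary; a sketch of reading it back (the paths and counts below are invented):

# illustrative shape of documents.json (paths and counts are made up):
# {"./data/sample/a.txt": {"cat": 2, "mat": 1}, "./data/sample/b.txt": {"dog": 1}}
documents = load_documents("./data/persistence/documents.json")
for path, frequencies in documents.items():
    print(path, "->", sum(frequencies.values()), "term occurrences")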
 
 
class Preprocessor:
    """A class that processes a document for analysis.

    Attributes
    ----------
    words : dict
        Dictionary of terms and their frequencies in the parsed document
    lemmatiser : WordNetLemmatizer
        Tool that lemmatises the document's terms
    prunner : WordPrunner
        Tool that removes stop words, punctuation & other redundant terms from the document

    Methods
    -------
    process_file(path: str) -> (str, dict)
        Loads the document at path and processes it into terms and their frequencies
    """

    def __init__(self):
        self.words = {}
        self.lemmatiser = WordNetLemmatizer()
        self.prunner = WordPrunner()
 
    def process_file(self, path: str) -> (str, dict):
        """
        Reads a document from a file and processes it into terms and their frequencies
        :param path: path to the document to open
        :return: tuple of document path & dictionary of terms and their frequencies
        """
        self.words = {}  # reset state left over from any previously processed document
        with open(path, 'r') as file:
            for line in file:
                tokens = self.prunner.prune(nltk.word_tokenize(line))
                for word in tokens:
                    self.__add_word(word)
        return path, self.words
 
    def __add_word(self, term: str):
        """
        Adds a term to the document's dictionary, incrementing its frequency
        :param term: term to be added
        :return: None
        """
        # normalise the term to its base form (lemma)
        word = self.lemmatiser.lemmatize(term)
        # add to words, counting how many times the lemma occurs
        self.words[word] = self.words.get(word, 0) + 1
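A minimal sketch of using Preprocessor directly on a single document (the file path and output are invented for illustration):

preprocessor = Preprocessor()
path, words = preprocessor.process_file("./data/sample/example.txt")
print(words)  # e.g. {'cat': 2, 'sat': 1, 'mat': 1}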

# src/preprocessing/word_prunner.py
import string
from nltk.corpus import stopwords
 
 
class WordPrunner:
    """A class that removes stop words, numbers & punctuation from a collection of strings.

    Attributes
    ----------
    stop_words : set
        Set of stop-word strings

    Methods
    -------
    prune(tokens: list) -> list
        Removes all stop words, numbers & punctuation from tokens
    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))
 
    def prune(self, tokens: list) -> list:
        """
        Removes all stop words, numbers & punctuation from tokens
        :param tokens: list of strings to be pruned
        :return: list of strings containing no stop words, numbers, or punctuation
        """
        # lower-case first so stop-word matching is case-insensitive
        tokens = [token.lower() for token in tokens]
        # keep only purely alphabetic terms that are not stop words
        return [term for term in tokens if term.isalpha() and term not in self.stop_words]
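A quick sanity check of the pruning behaviour (the sentence is an invented example, and it assumes the NLTK data downloads noted at the top):

import nltk
from src.preprocessing.word_prunner import WordPrunner

prunner = WordPrunner()
tokens = nltk.word_tokenize("The 3 quick brown foxes jumped over 2 lazy dogs!")
print(prunner.prune(tokens))
# -> ['quick', 'brown', 'foxes', 'jumped', 'lazy', 'dogs']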