Added the Document class and its json encoder

df768bdb · weirdwizardthomas · 74c84a13 · df768bdb · df768bdb · df768bdb
Commit df768bdb authored 5 years ago by weirdwizardthomas
--- a/src/preprocessing/document.py
+++ b/src/preprocessing/document.py
+import json
+
+
+class Document:
+    """A processed text file: the path it came from and the terms it holds."""
+
+    def __init__(self, path: str, terms: dict):
+        """Keep the source path and the term mapping on the instance.
+
+        :param path: path of the file the terms were read from
+        :param terms: mapping describing the file's terms, stored as given
+        """
+        # Assignment order is the key order of the JSON produced by to_json
+        self.path = path
+        self.terms = terms
+
+    def to_json(self):
+        """Return this document as a JSON string built from its attributes."""
+
+        def attributes_of(obj):
+            # Fallback for any value json cannot encode natively
+            return obj.__dict__
+
+        return json.dumps(self, default=attributes_of)
--- a/src/preprocessing/document_encoder.py
+++ b/src/preprocessing/document_encoder.py
+from json import JSONEncoder
+
+from src.preprocessing.document import Document
+
+
+class DocumentEncoder(JSONEncoder):
+    """JSON encoder that serialises objects through their attribute dict."""
+
+    def default(self, o: Document):
+        """Return a JSON-serialisable stand-in for ``o``.
+
+        :param o: object the standard encoder could not handle
+        :return: the instance attribute dictionary of ``o``
+        :raises TypeError: if ``o`` has no attribute dictionary
+        """
+        try:
+            return o.__dict__
+        except AttributeError:
+            # Nothing to serialise from: let the base class raise the
+            # standard TypeError instead of leaking an AttributeError
+            return super().default(o)
--- a/src/preprocessing/document_parser.py
+++ b/src/preprocessing/document_parser.py
+from src.preprocessing.preprocessor import preprocess_folder
+
+# Process the .txt files of the sample folder and write the result as JSON
+preprocess_folder(
+    input_folder_path="./data/sample/",
+    output_persistence_path="./data/persistence/words.json",
+)
--- a/src/preprocessing/preprocessor.py
+++ b/src/preprocessing/preprocessor.py
@@ -4,16 +4,20 @@ import nltk
  
 from nltk import WordNetLemmatizer
  
+from src.preprocessing.document import Document
+from src.preprocessing.document_encoder import DocumentEncoder
 from src.preprocessing.word_prunner import WordPrunner
  
  
 def preprocess_folder(input_folder_path: str, output_persistence_path):
    preprocessor = Preprocessor()
-
+    documents = []  # one Document per .txt file found in the folder
    for file in os.listdir(input_folder_path):
        if file.endswith(".txt"):
-            preprocessor.read_file(input_folder_path + file)
-    preprocessor.persist(output_persistence_path)
+            documents.append(preprocessor.read_file(os.path.join(input_folder_path, file)))
+    # Write every collected Document to the output file as a single JSON array
+    with open(output_persistence_path, 'w') as file:
+        json.dump(documents, file, cls=DocumentEncoder)
  
  
 class Preprocessor:
@@ -22,7 +26,8 @@ class Preprocessor:
        self.lemmatiser = WordNetLemmatizer()
        self.prunner = WordPrunner()
  
-    def read_file(self, path: str):
+    def read_file(self, path: str) -> Document:
+        self.words = {}
        with open(path, 'r') as file:
            line = " "
            while line:
@@ -30,6 +35,7 @@ class Preprocessor:
                tokens = self.prunner.prune(nltk.word_tokenize(line))
                for word in tokens:
                    self.add_word(word)
+        return Document(path, self.words)
  
    def add_word(self, term: str):
        # change case to lower
@@ -38,7 +44,3 @@ class Preprocessor:
        if word not in self.words:
            self.words[word] = 0
        self.words[word] += 1
-
-    def persist(self, path: str):
-        with open(path, 'w') as file:
-            json.dump(self.words, file)