Skip to content
Snippets Groups Projects
Commit df768bdb authored by weirdwizardthomas's avatar weirdwizardthomas
Browse files

Added the Document class and its json encoder

parent 74c84a13
No related branches found
No related tags found
No related merge requests found
import json
class Document:
    """A processed document: the path it was read from and its terms.

    :param path: path of the source file
    :param terms: mapping holding the document's terms
    """

    def __init__(self, path: str, terms: dict):
        self.path = path
        self.terms = terms

    @staticmethod
    def _attributes_of(o):
        """Return the attribute dict of *o* for use as a ``json`` ``default`` hook.

        Objects without a ``__dict__`` (for example a ``set``) cannot be
        represented this way; a ``TypeError`` is raised for them, which is the
        error the ``json`` module uses for unserialisable values.
        """
        try:
            return vars(o)
        except TypeError:
            raise TypeError(
                f"Object of type {type(o).__name__} is not JSON serializable"
            ) from None

    def to_json(self):
        """Serialise this document to a JSON string.

        The document itself, and any nested object the ``json`` module cannot
        encode natively, is encoded as its attribute dictionary.

        :return: JSON text with the ``path`` and ``terms`` attributes
        :raises TypeError: if a nested value has no attribute dictionary
        """
        return json.dumps(self, default=self._attributes_of)
from json import JSONEncoder
from src.preprocessing.document import Document
class DocumentEncoder(JSONEncoder):
    """JSON encoder that serialises objects as their attribute dictionary."""

    def default(self, o: "Document"):
        """Return a JSON-serialisable stand-in for *o*.

        ``json`` calls this hook only for values it cannot encode natively.

        :param o: the object to encode
        :return: the attribute dictionary of *o*
        :raises TypeError: if *o* has no attribute dictionary
        """
        if hasattr(o, "__dict__"):
            return o.__dict__
        # Defer to the base class so unsupported values raise the standard
        # TypeError instead of an AttributeError.
        return super().default(o)
from src.preprocessing.preprocessor import preprocess_folder
# Preprocess the files under ./data/sample/ and write the result to
# ./data/persistence/words.json.
preprocess_folder(
    input_folder_path="./data/sample/",
    output_persistence_path="./data/persistence/words.json",
)
......@@ -4,16 +4,20 @@ import nltk
 
from nltk import WordNetLemmatizer
 
from src.preprocessing.document import Document
from src.preprocessing.document_encoder import DocumentEncoder
from src.preprocessing.word_prunner import WordPrunner
 
 
def preprocess_folder(input_folder_path: str, output_persistence_path):
    """Preprocess every ``.txt`` file in a folder and persist the results.

    Each matching file is passed to ``Preprocessor.read_file`` exactly once and
    the returned documents are written as a single JSON array, encoded with
    ``DocumentEncoder``.

    :param input_folder_path: folder whose ``.txt`` files are processed
    :param output_persistence_path: path of the JSON file to write; an
        existing file is overwritten
    """
    preprocessor = Preprocessor()
    documents = []
    for file_name in os.listdir(input_folder_path):
        if file_name.endswith(".txt"):
            # os.path.join also works when the folder path has no trailing
            # separator, which plain string concatenation did not.
            document_path = os.path.join(input_folder_path, file_name)
            documents.append(preprocessor.read_file(document_path))
    # Write once, after all files are read, so the output holds every document.
    with open(output_persistence_path, 'w') as output_file:
        json.dump(documents, output_file, cls=DocumentEncoder)
 
 
class Preprocessor:
......@@ -22,7 +26,8 @@ class Preprocessor:
self.lemmatiser = WordNetLemmatizer()
self.prunner = WordPrunner()
 
def read_file(self, path: str):
def read_file(self, path: str) -> Document:
self.words = {}
with open(path, 'r') as file:
line = " "
while line:
......@@ -30,6 +35,7 @@ class Preprocessor:
tokens = self.prunner.prune(nltk.word_tokenize(line))
for word in tokens:
self.add_word(word)
return Document(path, self.words)
 
def add_word(self, term: str):
# change case to lower
......@@ -38,7 +44,3 @@ class Preprocessor:
if word not in self.words:
self.words[word] = 0
self.words[word] += 1
def persist(self, path: str):
    """Write the current word counts (``self.words``) to *path* as JSON.

    :param path: file to write; an existing file is overwritten
    """
    with open(path, 'w') as output_file:
        json.dump(self.words, output_file)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment