Skip to content
Snippets Groups Projects
preprocessor.py 1.37 KiB
Newer Older
import json
import os
from src.preprocessing.document import Document
from src.preprocessing.document_encoder import DocumentEncoder
def preprocess_folder(input_folder_path: str, output_persistence_path):
    """Preprocess every ``.txt`` file in a folder and persist the result as JSON.

    Each file whose name ends in ``.txt`` is passed to
    ``Preprocessor.read_file``; the returned documents are collected and
    serialised in a single ``json.dump`` call using ``DocumentEncoder``.

    Args:
        input_folder_path: Directory to scan (not recursive). A trailing
            path separator is optional.
        output_persistence_path: Path of the JSON file to write; it is opened
            in ``'w'`` mode, so existing content is replaced.
    """
    preprocessor = Preprocessor()
    # os.path.join (instead of string concatenation) keeps the path valid
    # whether or not the folder path ends with a separator.
    documents = [
        preprocessor.read_file(os.path.join(input_folder_path, file_name))
        for file_name in os.listdir(input_folder_path)
        if file_name.endswith(".txt")
    ]

    with open(output_persistence_path, 'w') as output_file:
        json.dump(documents, output_file, cls=DocumentEncoder)
class Preprocessor:
    """Builds a lemma-frequency table for plain-text files.

    Every line is tokenised with ``nltk.word_tokenize``, passed through
    ``WordPrunner.prune``, and each remaining token is lemmatised with
    ``WordNetLemmatizer`` before being counted.
    """

    def __init__(self):
        # Lemma -> occurrence count for the file most recently read.
        self.words = {}
        self.lemmatiser = WordNetLemmatizer()
        # NOTE(review): presumably filters out unwanted tokens -- confirm
        # against WordPrunner, which is not defined in this module.
        self.prunner = WordPrunner()

    def read_file(self, path: str) -> Document:
        """Read the text file at ``path`` and return its counts as a Document.

        Args:
            path: Path of the text file to read.

        Returns:
            ``Document(path, counts)`` where ``counts`` maps each lemma to the
            number of times it occurred in the file.
        """
        # Start from a fresh table so counts from a previous file do not leak
        # into this document.
        self.words = {}
        with open(path, 'r') as file:
            for line in file:
                tokens = self.prunner.prune(nltk.word_tokenize(line))
                for term in tokens:
                    self._count_term(term)
        return Document(path, self.words)

    def _count_term(self, term):
        """Lemmatise ``term`` and increment its count in ``self.words``."""
        word = self.lemmatiser.lemmatize(term)
        self.words[word] = self.words.get(word, 0) + 1