Skip to content
Snippets Groups Projects
preprocessor.py 1.32 KiB
Newer Older
  • Learn to ignore specific revisions
  • import json
    import os
    
    from src import document
    from src.document import Document
    
    def preprocess_folder(input_folder_path: str, output_persistence_path):
        """Preprocess every ``.txt`` file in a folder and persist the result as JSON.

        Args:
            input_folder_path: Directory whose direct entries are scanned; only
                names ending in ``.txt`` are read.
            output_persistence_path: File the documents are dumped to. It is
                opened in ``'w'`` mode, so existing content is overwritten.
        """
        preprocessor = Preprocessor()

        # os.path.join inserts the separator when the folder path lacks a
        # trailing one; plain string concatenation produced a wrong path then.
        documents = [
            preprocessor.read_file(os.path.join(input_folder_path, file_name))
            for file_name in os.listdir(input_folder_path)
            if file_name.endswith(".txt")
        ]

        with open(output_persistence_path, 'w') as output_file:
            json.dump(documents, output_file, cls=document.Encoder)
    
    class Preprocessor:
        """Build a ``Document`` of lemmatised-term counts from a text file."""

        def __init__(self):
            # Lemmatised term -> occurrence count for the file being read;
            # read_file() replaces this mapping on every call.
            self.words = {}
            self.lemmatiser = WordNetLemmatizer()
            self.prunner = WordPrunner()

        def read_file(self, path: str) -> Document:
            """Tokenise, prune and count the terms of the file at ``path``.

            Args:
                path: Text file to read.

            Returns:
                A ``Document`` constructed from ``path`` and the mapping of
                lemmatised term to occurrence count.
            """
            # Fresh mapping so counts from a previously read file do not carry over.
            self.words = {}

            with open(path, 'r') as file:
                for line in file:
                    tokens = self.prunner.prune(nltk.word_tokenize(line))
                    for word in tokens:
                        self._count_term(word)

            return Document(path, self.words)

        def _count_term(self, term):
            """Lemmatise ``term`` and increment its count in ``self.words``."""
            word = self.lemmatiser.lemmatize(term)
            self.words[word] = self.words.get(word, 0) + 1