import json
import os

import nltk
from nltk import WordNetLemmatizer

from src.preprocessing.word_prunner import WordPrunner


def preprocess_folder(input_folder_path: str, output_persistence_path) -> None:
    """Run a single ``Preprocessor`` over every ``.txt`` file in a folder.

    Each entry of ``input_folder_path`` whose name ends with ``.txt`` is
    passed to ``Preprocessor.read_file``; afterwards the accumulated state
    is handed to ``Preprocessor.persist``.

    Args:
        input_folder_path: Directory to scan (non-recursively). A trailing
            path separator is optional.
        output_persistence_path: Destination forwarded unchanged to
            ``Preprocessor.persist``.

    Raises:
        OSError: If ``input_folder_path`` cannot be listed (propagated from
            ``os.listdir``).
    """
    preprocessor = Preprocessor()

    for file_name in os.listdir(input_folder_path):
        if not file_name.endswith(".txt"):
            continue
        # Join with os.path.join rather than "+": plain concatenation yields
        # a wrong path whenever the folder path lacks a trailing separator.
        preprocessor.read_file(os.path.join(input_folder_path, file_name))

    # NOTE(review): the original indentation of this call was lost; it is
    # assumed to run once, after all files have been read -- confirm.
    preprocessor.persist(output_persistence_path)


class Preprocessor:
    def __init__(self):
        self.words = {}