"""Word-frequency preprocessing: tokenize, prune, normalise and count words."""
from typing import Dict, Optional

import nltk
from nltk import WordNetLemmatizer

from src.preprocessing.word_prunner import WordPrunner


class Preprocessor:
    """Accumulate counts of normalised words read from text files.

    Each word is lower-cased and lemmatised before it is counted, so
    different surface forms that normalise to the same term share one entry
    in ``words``.
    """

    def __init__(self) -> None:
        # Maps each normalised term to the number of times it has been added.
        self.words: Dict[str, int] = {}
        self.lemmatiser = WordNetLemmatizer()
        self.prunner = WordPrunner()

    def read_file(self, path, encoding: Optional[str] = None) -> None:
        """Tokenize the file at *path* line by line and count its words.

        Every line is split with ``nltk.word_tokenize``, the tokens are
        passed through ``self.prunner.prune`` and each surviving token is
        handed to :meth:`add_word`.

        Args:
            path: Location of the text file to read.
            encoding: Text encoding passed to ``open``. ``None`` (the
                default) keeps the platform-default encoding.
        """
        with open(path, 'r', encoding=encoding) as file:
            # Iterating the file object yields one line at a time and stops
            # at EOF, so no sentinel value is needed and the empty string
            # that readline() returns at EOF is never tokenized.
            for line in file:
                for word in self.prunner.prune(nltk.word_tokenize(line)):
                    self.add_word(word)

    def add_word(self, term: str) -> None:
        """Normalise *term* and increment its count in ``self.words``.

        Args:
            term: Raw token. It is lower-cased and then lemmatised (the
                lemmatiser is called without a part-of-speech argument, so
                its default applies) before being counted.
        """
        term = self.lemmatiser.lemmatize(term.lower())
        # First occurrence starts from 0; later ones add to the stored count.
        self.words[term] = self.words.get(term, 0) + 1