diff --git a/src/preprocessing/preprocessor.py b/src/preprocessing/preprocessor.py
new file mode 100644
index 0000000000000000000000000000000000000000..a17e533af3ec8c79289daa938607d2d154e6c778
--- /dev/null
+++ b/src/preprocessing/preprocessor.py
@@ -0,0 +1,49 @@
+import nltk
+from nltk import WordNetLemmatizer
+
+from src.preprocessing.word_prunner import WordPrunner
+
+
+class Preprocessor:
+    """Accumulate lemmatised word frequencies from text files.
+
+    Each word yielded by ``self.prunner.prune`` is lower-cased,
+    lemmatised with ``WordNetLemmatizer`` and counted in ``self.words``.
+    """
+
+    def __init__(self):
+        # Maps each lemmatised, lower-cased term to its occurrence count.
+        self.words = {}
+        self.lemmatiser = WordNetLemmatizer()
+        self.prunner = WordPrunner()
+
+    def read_file(self, path, encoding="utf-8"):
+        """Tokenise the file at *path* and count every pruned token.
+
+        Counts are added to ``self.words``; earlier counts are kept, so
+        calling this for several files accumulates across them.
+
+        Args:
+            path: Location of the text file to read.
+            encoding: Text encoding used to decode the file. Defaults to
+                UTF-8 so results do not depend on the platform locale.
+
+        Raises:
+            OSError: If the file cannot be opened.
+            UnicodeDecodeError: If the file is not valid in *encoding*.
+        """
+        with open(path, 'r', encoding=encoding) as file:
+            # Iterating the handle streams the file one line at a time.
+            for line in file:
+                for word in self.prunner.prune(nltk.word_tokenize(line)):
+                    self.add_word(word)
+
+    def add_word(self, term: str):
+        """Normalise *term* and increment its count in ``self.words``.
+
+        Args:
+            term: Raw token; it is lower-cased and then lemmatised, and
+                the result is the key that gets counted.
+        """
+        term = self.lemmatiser.lemmatize(term.lower())
+        self.words[term] = self.words.get(term, 0) + 1