From 61ecbf8c88e769ec3ce12b38aa279a91179b8a96 Mon Sep 17 00:00:00 2001
From: weirdwizardthomas <thomas.koristka@gmail.com>
Date: Sat, 7 Mar 2020 00:33:50 +0100
Subject: [PATCH] Word prunner now compares case-insensitively

---
 src/preprocessing/preprocessor.py | 20 ++++++++++++++------
 src/preprocessing/word_prunner.py |  1 +
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/src/preprocessing/preprocessor.py b/src/preprocessing/preprocessor.py
index a17e533..d406524 100644
--- a/src/preprocessing/preprocessor.py
+++ b/src/preprocessing/preprocessor.py
@@ -1,4 +1,7 @@
+import json
+
 import nltk
+
 from nltk import WordNetLemmatizer
 
 from src.preprocessing.word_prunner import WordPrunner
@@ -10,18 +13,23 @@ class Preprocessor:
         self.lemmatiser = WordNetLemmatizer()
         self.prunner = WordPrunner()
 
-    def read_file(self, path):
+    def read_file(self, path: str):
         with open(path, 'r') as file:
             line = " "
             while line:
                 line = file.readline()
-                for word in self.prunner.prune(nltk.word_tokenize(line)):
+                tokens = self.prunner.prune(nltk.word_tokenize(line))
+                for word in tokens:
                     self.add_word(word)
 
     def add_word(self, term: str):
         # change case to lower
-        term = self.lemmatiser.lemmatize(term.lower())
+        word = self.lemmatiser.lemmatize(term)
         # add to words
-        if term not in self.words:
-            self.words[term] = 0
-        self.words[term] += 1
+        if word not in self.words:
+            self.words[word] = 0
+        self.words[word] += 1
+
+    def persist(self, path: str):
+        with open(path, 'w') as file:
+            json.dump(self.words, file)
diff --git a/src/preprocessing/word_prunner.py b/src/preprocessing/word_prunner.py
index c8d305d..23df276 100644
--- a/src/preprocessing/word_prunner.py
+++ b/src/preprocessing/word_prunner.py
@@ -9,4 +9,5 @@ class WordPrunner:
 
     def prune(self, tokens: list) -> list:
         # remove stop words and punctuation
+        tokens = [token.lower() for token in tokens]
         return [term for term in tokens if term.isalpha() and term not in self.stop_words]
-- 
GitLab