From 763077e2b3dba2fdc34d02dd969987078ce02a54 Mon Sep 17 00:00:00 2001
From: weirdwizardthomas <thomas.koristka@gmail.com>
Date: Sat, 7 Mar 2020 16:59:24 +0100
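Subject: [PATCH] Simplified tokenisation

Iterate over the pruned tokens directly instead of binding them to a
temporary variable, and rename __add_word to __add_term so the method
name matches its docstring and its `term` parameter.

Reading and tokenising a line is now wrapped in try/except: a
UnicodeDecodeError raised while processing a line is silently ignored
instead of propagating to the caller.
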
---
 src/preprocessing/preprocessor.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/preprocessing/preprocessor.py b/src/preprocessing/preprocessor.py
index 8e4c131..999ce57 100644
--- a/src/preprocessing/preprocessor.py
+++ b/src/preprocessing/preprocessor.py
@@ -87,10 +87,12 @@ class Preprocessor:
         with open(path, 'r') as file:
             line = " "
             while line:
-                line = file.readline()
-                tokens = self.prunner.prune(nltk.word_tokenize(line))
-                for word in tokens:
-                    self.__add_word(self.lemmatise(word))
+                try:
+                    line = file.readline()
+                    for word in self.prunner.prune(nltk.word_tokenize(line)):
+                        self.__add_term(self.lemmatise(word))
+                except UnicodeDecodeError:
+                    pass
         self.__update_frequencies()
         return path, self.terms
 
@@ -100,7 +102,7 @@ class Preprocessor:
     def get_most_frequent_words(self) -> dict:
         return self.terms_highest_frequencies
 
-    def __add_word(self, term: str):
+    def __add_term(self, term: str):
         """
         Adds a term to the document's dictionary
         :param term: Term to be added
-- 
GitLab