From b039fa499e711be69fafa5a6606359296df35d51 Mon Sep 17 00:00:00 2001
From: weirdwizardthomas <thomas.koristka@gmail.com>
Date: Fri, 6 Mar 2020 23:47:51 +0100
Subject: [PATCH] Added a word pruner to filter out stop words and
 non-alphabetical words (e.g. punctuation)

---
 src/preprocessing/word_prunner.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 src/preprocessing/word_prunner.py

diff --git a/src/preprocessing/word_prunner.py b/src/preprocessing/word_prunner.py
new file mode 100644
index 0000000..c8d305d
--- /dev/null
+++ b/src/preprocessing/word_prunner.py
@@ -0,0 +1,12 @@
+import string
+
+from nltk.corpus import stopwords
+
+
+class WordPrunner:
+    def __init__(self):
+        self.stop_words = set(stopwords.words('english'))
+
+    def prune(self, tokens: list) -> list:
+        # keep only purely alphabetic tokens (drops punctuation, digits, mixed tokens) that are not an exact match in self.stop_words
+        return [term for term in tokens if term.isalpha() and term not in self.stop_words]
-- 
GitLab