# Module introduced by commit b039fa499e711be69fafa5a6606359296df35d51
# (weirdwizardthomas, Fri, 6 Mar 2020): "Added a word prunner to filter out
# stop words and non alphabetical words (i.e. punctuation)".
# Target path in that commit: src/preprocessing/word_prunner.py
import string


class WordPrunner:
    """Filter stop words and non-alphabetic tokens out of a token list.

    The stop-word set is stored lowercased in ``self.stop_words`` and is
    matched case-insensitively, so capitalised stop words (e.g. a
    sentence-initial "The") are removed as well.
    """

    def __init__(self, stop_words=None, *, language='english'):
        """Build the stop-word set.

        Args:
            stop_words: Optional iterable of stop words to use. When omitted,
                the NLTK stop-word corpus for ``language`` is loaded.
            language: Name of the NLTK stop-word list to load when
                ``stop_words`` is not given. Defaults to ``'english'``.
        """
        if stop_words is None:
            # Imported lazily so that NLTK and its corpus data are only
            # required when the default stop-word list is actually requested.
            from nltk.corpus import stopwords

            stop_words = stopwords.words(language)
        # Normalise once here so that prune() can compare lowercased tokens.
        self.stop_words = {word.lower() for word in stop_words}

    def prune(self, tokens: list) -> list:
        """Return the tokens that are alphabetic and are not stop words.

        Args:
            tokens: String tokens to filter.

        Returns:
            A new list with the surviving tokens, in their original order and
            original letter case. Tokens containing any non-alphabetic
            character (punctuation, digits, the empty string) are dropped.
        """
        stop_words = self.stop_words
        # Compare the lowercased form: the stop-word set is lowercase, and a
        # case-sensitive test would let "The" / "And" through.
        return [
            term
            for term in tokens
            if term.isalpha() and term.lower() not in stop_words
        ]