# Newer / Older / weirdwizardthomas committed  (stray text from a web blame view; not part of the module)
from nltk.corpus import stopwords
class WordPrunner:
    """A class that removes stop words, numbers & punctuation from a collection of strings.

    Attributes
    ----------
    stop_words : set
        set of stop word strings, built from ``stopwords.words('english')``

    Methods
    -------
    prune(tokens: list) -> list
        lowercases tokens and removes all stop words, numbers & punctuation
    """

    def __init__(self):
        # Stored as a set so the membership test in prune() is a hash lookup.
        self.stop_words = set(stopwords.words('english'))

    def prune(self, tokens: list) -> list:
        """
        Lowercases tokens and removes all stop words, numbers & punctuation.

        A token is kept only if, once lowercased, it consists solely of
        alphabetic characters (``str.isalpha``) and is not in
        ``self.stop_words``. Tokens that mix letters with digits or
        punctuation (e.g. ``"abc123"``, ``"don't"``) and empty strings are
        dropped as a whole; nothing is stripped out of a token.

        :param tokens: list of strings to be parsed
        :return: list of lowercased strings, in their original order, not
            containing any stop words, numbers, or punctuation
        """
        # Lowercase lazily with a distinct loop name (the parameter is no
        # longer shadowed) and filter in the same pass.
        lowered = (token.lower() for token in tokens)
        return [
            term
            for term in lowered
            if term.isalpha() and term not in self.stop_words
        ]