Skip to content
Snippets Groups Projects
word_prunner.py 945 B
Newer Older
# author : koristo1@fit.cvut.cz

    """A class that removes stop words, numbers & punctuation from a collection of strings.

    Attributes
    ----------
    stop_words : set
        set of stop word strings


    Methods
    -------
    prune(tokens: list) -> list
        removes all stop words, numbers & punctuation from tokens

    """

    def __init__(self):
        self.stop_words = set(stopwords.words('english'))

    def prune(self, tokens: list) -> list:
        """
        Removes all stop words, numbers & punctuation from tokens
        :param tokens: list of strings to be parsed
        :return: list of strings not containing any stop words, numbers, or punctuation
        """
        tokens = [tokens.lower() for tokens in tokens]
        return [term for term in tokens if term.isalpha() and term not in self.stop_words]