From 61ecbf8c88e769ec3ce12b38aa279a91179b8a96 Mon Sep 17 00:00:00 2001 From: weirdwizardthomas <thomas.koristka@gmail.com> Date: Sat, 7 Mar 2020 00:33:50 +0100 Subject: [PATCH] Word prunner now compares case insensitive --- src/preprocessing/preprocessor.py | 18 ++++++++++++------ src/preprocessing/word_prunner.py | 1 + 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/src/preprocessing/preprocessor.py b/src/preprocessing/preprocessor.py index a17e533..d406524 100644 --- a/src/preprocessing/preprocessor.py +++ b/src/preprocessing/preprocessor.py @@ -1,4 +1,5 @@ import nltk + from nltk import WordNetLemmatizer from src.preprocessing.word_prunner import WordPrunner @@ -10,18 +11,23 @@ class Preprocessor: self.lemmatiser = WordNetLemmatizer() self.prunner = WordPrunner() - def read_file(self, path): + def read_file(self, path: str): with open(path, 'r') as file: line = " " while line: line = file.readline() - for word in self.prunner.prune(nltk.word_tokenize(line)): + tokens = self.prunner.prune(nltk.word_tokenize(line)) + for word in tokens: self.add_word(word) def add_word(self, term: str): # change case to lower - term = self.lemmatiser.lemmatize(term.lower()) + word = self.lemmatiser.lemmatize(term) # add to words - if term not in self.words: - self.words[term] = 0 - self.words[term] += 1 + if word not in self.words: + self.words[word] = 0 + self.words[word] += 1 + + def persist(self, path: str): + with open(path, 'w') as file: + json.dump(self.words, file) diff --git a/src/preprocessing/word_prunner.py b/src/preprocessing/word_prunner.py index c8d305d..23df276 100644 --- a/src/preprocessing/word_prunner.py +++ b/src/preprocessing/word_prunner.py @@ -9,4 +9,5 @@ class WordPrunner: def prune(self, tokens: list) -> list: # remove stop words and punctuation + tokens = [tokens.lower() for tokens in tokens] return [term for term in tokens if term.isalpha() and term not in self.stop_words] -- GitLab