Skip to content
Snippets Groups Projects
Commit 0c2405f1 authored by weirdwizardthomas's avatar weirdwizardthomas
Browse files

Added a preprocessor that processes the input file - constructs a dictionary of words in that file

parent b039fa49
No related branches found
No related tags found
No related merge requests found
import nltk
from nltk import WordNetLemmatizer
from src.preprocessing.word_prunner import WordPrunner
class Preprocessor:
    """Build a word-frequency dictionary from a text file.

    ``words`` maps each lower-cased, lemmatised term to the number of times
    it has been passed to :meth:`add_word`.
    """

    def __init__(self):
        # term -> occurrence count; populated by add_word()
        self.words = {}
        self.lemmatiser = WordNetLemmatizer()
        # WordPrunner is defined elsewhere in the project; presumably it
        # filters unwanted tokens -- confirm against its implementation.
        self.prunner = WordPrunner()

    def read_file(self, path):
        """Tokenise the file at *path* line by line and count its words.

        Every line is split with ``nltk.word_tokenize``, the resulting tokens
        are passed through ``self.prunner.prune`` and each token that comes
        back is recorded with :meth:`add_word`.

        The file is opened in text mode with the platform default encoding.
        """
        with open(path, 'r') as file:
            # Iterating the file object yields each line once and stops at
            # EOF, so the empty end-of-file string is never tokenised.
            for line in file:
                for word in self.prunner.prune(nltk.word_tokenize(line)):
                    self.add_word(word)

    def add_word(self, term: str):
        """Record one occurrence of *term* in ``self.words``.

        The term is lower-cased and lemmatised first, so different surface
        forms that share a lemma are counted under the same key.
        """
        # Normalise: lower-case, then reduce to the lemma.
        term = self.lemmatiser.lemmatize(term.lower())
        # Missing keys start from 0, so one expression covers both the first
        # and every later occurrence.
        self.words[term] = self.words.get(term, 0) + 1
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment