Commit 45209add authored by Martin Znamenáček

find image

parent e515d57d
......@@ -9,3 +9,5 @@ out
gen
models
files
java
......@@ -7,7 +7,7 @@ def load_picture_dictionary(module):
imagesDictionary = dict()
logging.info("loading image URLs")
with open('processed_data.csv') as csv_file:
with open('files/processed_data.csv') as csv_file:
csv_reader = csv.reader(csv_file, delimiter=',')
line_count = 0
for row in csv_reader:
......
......@@ -30,8 +30,8 @@ def load_module_from_file(file_name, bin=True, limit=None):
def train_local_module():
documents = list(read_input("reviews_data.txt.gz")) # FIXME
logging.info("Done reading data file")
documents = list(read_input("files/reviews_data.txt.gz"))
logging.info("reading data file done")
model = gensim.models.Word2Vec(documents, size=150, window=10, min_count=2, workers=multiprocessing.cpu_count(),
iter=ITERATIONS)
......@@ -40,14 +40,11 @@ def train_local_module():
logging.info("training done, now save model to file")
model.wv.save_word2vec_format(fname=LOCAL_MODULE, binary=True)
return model
return load_module_from_file(LOCAL_MODULE)
def read_input(input_file):
    """Lazily yield one preprocessed review per line of a gzip file.

    ``input_file`` is opened with ``gzip.open`` in binary mode and each line is
    passed, as read, to ``gensim.utils.simple_preprocess``; the result of that
    call is yielded. Because this is a generator, nothing (including the
    initial log message) happens until the first item is requested.

    A progress message is logged for line 0 and for every 10000th line after.
    """
    logging.info("reading file {0}...this may take a while".format(input_file))
    with gzip.open(input_file, 'rb') as reviews:
        line_number = 0
        for raw_line in reviews:
            # Progress is reported before the line is processed, so the count
            # shown is the number of lines already yielded.
            if not line_number % 10000:
                logging.info("read {0} reviews".format(line_number))
            yield gensim.utils.simple_preprocess(raw_line)
            line_number += 1
This diff is collapsed.
from collections import Counter
import gensim
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
......@@ -9,7 +8,7 @@ from nltk.tokenize import word_tokenize
import src.ImageManager as ImageManager
import src.ModuleManager as ModuleManager
google_module = ModuleManager.load_google_module(limit=100000)
google_module = ModuleManager.load_google_module(limit=10000000)
local_module = ModuleManager.load_local_module()
google_image_dictionary = ImageManager.load_picture_dictionary(google_module)
local_image_dictionary = ImageManager.load_picture_dictionary(local_module)
......@@ -17,11 +16,6 @@ local_image_dictionary = ImageManager.load_picture_dictionary(local_module)
stopWords = set(stopwords.words('english'))
def findImage(text):
lemma = WordNetLemmatizer()
sentence = gensim.utils.simple_preprocess(text)
#sentence = list(filter(lambda word: word not in stopWords, map(lambda word: lemma.lemmatize(word), sentence)))
all = parseText(text)
print(all)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment