Commit 2e404caa authored by Martin Znamenáček's avatar Martin Znamenáček

find image

parent d896f3a6
This diff is collapsed.
import csv
import logging
# Configure logging once at import time: "time : LEVEL : message" lines, INFO and above.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
def load_picture_dictionary(module):
    """Build a word -> image URL mapping from ``processed_data.csv``.

    The first CSV row is treated as a header and skipped. For every other row
    the first column (lower-cased) is the word and the second column is the
    URL. A word is kept only if ``module.get_vector(word)`` does not raise
    ``KeyError``. Rows with fewer than two columns (including blank lines)
    are skipped instead of raising ``IndexError``.

    :param module: object exposing ``get_vector(word)`` that raises
        ``KeyError`` for words it does not know.
    :return: dict mapping lower-cased words to the URL from the same row.
    """
    images = {}
    logging.info("loading image URLs")
    # newline='' is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open('processed_data.csv', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # skip the header row (no-op on an empty file)
        for row in csv_reader:
            if len(row) < 2:
                # Blank or malformed line: nothing to map.
                continue
            word = row[0].lower()
            try:
                module.get_vector(word)
            except KeyError:
                # The model has no vector for this word, so it is not kept.
                continue
            images[word] = row[1]
    logging.info("Loaded {0} image URLs.".format(len(images)))
    return images
\ No newline at end of file
......@@ -10,11 +10,11 @@ LOCAL_MODULE = "models/local-module.bin"
ITERATIONS = 1
def load_local_module(limit=None):
    """Return the local model, loading it from disk when available.

    If the file named by ``LOCAL_MODULE`` exists, the model is loaded from it
    and ``limit`` is forwarded to ``load_module_from_file``. Otherwise a new
    model is produced by ``train_local_module()``; ``limit`` is not used on
    that path.

    :param limit: optional value passed through as ``limit=`` to
        ``load_module_from_file``; defaults to ``None``.
    """
    module_file = Path(LOCAL_MODULE)
    if module_file.is_file():
        logging.info("local module found on disc")
        return load_module_from_file(LOCAL_MODULE, limit=limit)
    else:
        logging.info("local module will be trained")
        return train_local_module()
......@@ -47,12 +47,7 @@ def read_input(input_file):
logging.info("reading file {0}...this may take a while".format(input_file))
with gzip.open(input_file, 'rb') as f:
for i, line in enumerate(f):
if (i % 10000 == 0):
logging.info("read {0} reviews".format(i))
# do some pre-processing and return a list of words for each review text
yield gensim.utils.simple_preprocess(line)
# read the tokenized reviews into a list
# each review item becomes a series of words
# so this becomes a list of lists
This diff is collapsed.
from collections import Counter
import gensim
import nltk
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import src.ImageManager as ImageManager
import src.ModuleManager as ModuleManager
# Module-level setup: everything below runs once, when this module is imported.
# NOTE(review): limit=100000 presumably caps how many vectors are read from the
# Google model — confirm in ModuleManager.load_google_module.
google_module = ModuleManager.load_google_module(limit=100000)
local_module = ModuleManager.load_local_module()
# One image dictionary per model, each built by passing that model to the loader.
google_image_dictionary = ImageManager.load_picture_dictionary(google_module)
local_image_dictionary = ImageManager.load_picture_dictionary(local_module)
# NLTK's English stop-word list, stored as a set for membership tests.
stopWords = set(stopwords.words('english'))
def findImage(text):
    """Find images for the three most frequent words extracted from *text*.

    The words come from ``parseText(text)``; the three most common are looked
    up with ``search_module`` against both the Google and the local model.

    :param text: input text to analyse.
    :return: tuple ``(top_words, google, local)`` where ``top_words`` is the
        list of ``(word, count)`` pairs from ``Counter.most_common(3)`` and
        ``google`` / ``local`` map each of those words to whatever
        ``search_module`` returned for the respective model.
    """
    words = parseText(text)
    # Existing diagnostic output, kept so the function's visible behaviour
    # is unchanged.
    print(words)
    top_words = Counter(words).most_common(3)
    google = {}
    local = {}
    for word, _count in top_words:
        google[word] = search_module(word, google_module, google_image_dictionary)
        local[word] = search_module(word, local_module, local_image_dictionary)
    return top_words, google, local
def search_module(word, module, images):
    """Return the image whose key is most similar to *word*, or ``None``.

    The keys of ``images`` are handed to
    ``module.wv.most_similar_to_given`` as the candidate list, and the value
    stored under the chosen key is returned. A ``KeyError`` raised during the
    lookup yields ``None``.
    """
    candidates = list(images)
    try:
        closest = module.wv.most_similar_to_given(word, candidates)
        return images[closest]
    except KeyError:
        return None
def parseText(text):
    """Return the lemmatized, lower-cased nouns of *text*, minus stop words.

    The text is split into sentences, each sentence is tokenized and
    POS-tagged, and only tokens tagged NN, NNS, NNP or NNPS are kept. Each
    kept token is lower-cased and lemmatized; lemmas found in ``stopWords``
    are then dropped. Order and duplicates are preserved.
    """
    noun_tags = ("NN", "NNS", "NNP", "NNPS")
    lemmatizer = WordNetLemmatizer()

    tagged = []
    for sent in nltk.sent_tokenize(text):
        tagged.extend(nltk.pos_tag(word_tokenize(sent)))

    # Lemmatize first, then filter: the stop-word check applies to the lemma.
    lemmas = (
        lemmatizer.lemmatize(pair[0].lower())
        for pair in tagged
        if pair[1] in noun_tags
    )
    return [lemma for lemma in lemmas if lemma not in stopWords]
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment