...
 
Commits (2)
......@@ -6,110 +6,6 @@
.idea
*.iml
out
gen
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
gen
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.idea/
models
import gzip
import logging
import multiprocessing
from pathlib import Path
import gensim
# Configure the root logger once at import time: INFO level, with a timestamp
# and the level name in front of every message.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Location of the locally trained word vectors; train_local_module() writes
# this file and load_local_module() reads it back when it exists.
LOCAL_MODULE = "models/local-module.bin"
# Number of training epochs used by train_local_module().
ITERATIONS = 1
def load_local_module():
    """Return the local word-vector module, training it if no saved copy exists.

    When a regular file is present at ``LOCAL_MODULE`` it is loaded through
    ``load_module_from_file``; otherwise ``train_local_module`` is called and
    its result is returned instead.

    NOTE(review): the two paths return different objects (the loader's result
    vs. the trained model returned by ``train_local_module``) -- confirm that
    callers only rely on what both have in common.
    """
    if not Path(LOCAL_MODULE).is_file():
        logging.info("local module will be trained")
        return train_local_module()

    logging.info("local module found on disc")
    return load_module_from_file(LOCAL_MODULE)
def load_google_module(limit=None):
    """Load the GoogleNews word2vec vectors stored under ``models/``.

    :param limit: forwarded unchanged to ``load_module_from_file``.
    :return: whatever ``load_module_from_file`` returns for the GoogleNews file.
    """
    google_vectors_path = "models/GoogleNews-vectors-negative300.bin"
    logging.info("loading google module")
    return load_module_from_file(google_vectors_path, limit=limit)
def load_module_from_file(file_name, bin=True, limit=None):
    """Load word vectors saved in word2vec format from ``file_name``.

    :param file_name: path of the vector file to read.
    :param bin: passed to gensim as ``binary``; ``True`` (the default) selects
        the binary word2vec format. The name shadows the ``bin`` builtin but is
        kept because callers may pass it by keyword.
    :param limit: passed to gensim as ``limit``.
    :return: the object returned by ``KeyedVectors.load_word2vec_format``.
    """
    read_word2vec = gensim.models.KeyedVectors.load_word2vec_format
    return read_word2vec(file_name, binary=bin, limit=limit)
def train_local_module(input_file="reviews_data.txt.gz"):
    """Train a Word2Vec model on a review corpus and save its vectors.

    The corpus is read with ``read_input``, a model is trained for
    ``ITERATIONS`` epochs, and the resulting word vectors are written in binary
    word2vec format to ``LOCAL_MODULE``.

    :param input_file: gzip-compressed text file with one review per line.
        Defaults to the file name that used to be hard-coded here.
    :return: the trained ``gensim.models.Word2Vec`` model (the full model, not
        only its vectors).
    """
    # read_input() is a one-shot generator, but Word2Vec walks the corpus more
    # than once (vocabulary scan, then each training epoch), so materialise it.
    documents = list(read_input(input_file))
    logging.info("Done reading data file")

    # Passing the corpus to the constructor already builds the vocabulary and
    # trains for `iter` epochs. A further model.train() call would run extra
    # passes on top of that, so ITERATIONS would no longer be the real epoch
    # count; it is therefore deliberately not called here.
    # NOTE(review): `size`/`iter` are the gensim 3.x keyword names -- confirm
    # the pinned gensim version before upgrading.
    model = gensim.models.Word2Vec(
        documents,
        size=150,
        window=10,
        min_count=2,
        workers=multiprocessing.cpu_count(),
        iter=ITERATIONS,
    )

    logging.info("training done, now save model to file")
    # The target directory may not exist yet (e.g. on a fresh checkout), which
    # would make the save fail after the expensive training step.
    Path(LOCAL_MODULE).parent.mkdir(parents=True, exist_ok=True)
    model.wv.save_word2vec_format(fname=LOCAL_MODULE, binary=True)
    return model
def read_input(input_file):
    """Lazily yield one pre-processed review per line of a gzip file.

    Being a generator, nothing is read (or logged) until iteration starts.

    :param input_file: path of the gzip-compressed file; it is opened in
        binary mode and consumed line by line.
    :return: generator producing, for every line, the result of
        ``gensim.utils.simple_preprocess`` (a list of words).
    """
    logging.info("reading file {0}...this may take a while".format(input_file))
    with gzip.open(input_file, 'rb') as reviews:
        for line_number, raw_line in enumerate(reviews):
            # Report progress on every 10000th line, starting with line 0.
            if not line_number % 10000:
                logging.info("read {0} reviews".format(line_number))
            # Pre-process the raw line into a list of words for this review.
            yield gensim.utils.simple_preprocess(raw_line)
# read the tokenized reviews into a list
# each review item becomes a series of words
# so this becomes a list of lists
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2019-01-03 01:15:30,005 : INFO : loading google module\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2019-01-03 01:15:30,007 : INFO : loading projection weights from models/GoogleNews-vectors-negative300.bin\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2019-01-03 01:15:31,258 : INFO : loaded (50000, 300) matrix from models/GoogleNews-vectors-negative300.bin\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2019-01-03 01:15:31,259 : INFO : local module found on disc\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2019-01-03 01:15:31,262 : INFO : loading projection weights from models/local-module.bin\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"2019-01-03 01:15:32,740 : INFO : loaded (70538, 150) matrix from models/local-module.bin\n"
]
}
],
"source": [
"import ModuleManagement\n",
"\n",
"google_module = ModuleManagement.load_google_module(limit=50000)\n",
"local_module = ModuleManagement.load_local_module()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n \n/home/martin/.local/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('filthy', 0.6117340326309204),\n ('smelly', 0.5709947347640991),\n ('Dirty', 0.5670455694198608),\n ('clean', 0.5180182456970215),\n ('nasty', 0.5079717636108398),\n ('soiled', 0.4905186891555786),\n ('sleazy', 0.48590758442878723),\n ('greasy', 0.484619140625),\n ('disgusting', 0.4718772768974304),\n ('filth', 0.45772865414619446)]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"w1 = \"dirty\"\n",
"google_module.wv.most_similar (positive=w1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:1: DeprecationWarning: Call to deprecated `wv` (Attribute will be removed in 4.0.0, use self instead).\n \"\"\"Entry point for launching an IPython kernel.\n/home/martin/.local/lib/python3.6/site-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n if np.issubdtype(vec.dtype, np.int):\n"
]
},
{
"data": {
"text/plain": [
"[('filthy', 0.8682267665863037),\n ('unclean', 0.8144831657409668),\n ('stained', 0.7948288321495056),\n ('smelly', 0.7844254970550537),\n ('grubby', 0.7499089241027832),\n ('disgusting', 0.7483757138252258),\n ('moldy', 0.7422553300857544),\n ('mouldy', 0.7376161813735962),\n ('dusty', 0.733493983745575),\n ('dingy', 0.7278274297714233)]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"local_module.wv.most_similar (positive=w1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}