...
 
from gensim.models import KeyedVectors as KV
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sys import argv
import argparse
import os.path
import os
def build_parser():
    """Build the command-line parser for this script.

    Returns:
        An ``argparse.ArgumentParser`` that accepts a required ``data``
        path and an optional positional ``topn`` count (default 10).
    """
    parser = argparse.ArgumentParser(
        description='Find closest words in word2vec.')
    parser.add_argument('data', help="Path to the dataset")
    # nargs='?' is what lets the default apply: a plain positional is
    # always required, so default=10 on its own can never take effect.
    parser.add_argument('topn', type=int, nargs='?', default=10,
                        help="Amount of words to look for")
    return parser


def safe_filename_part(word):
    """Return ``word`` with path separators replaced by underscores.

    The search word is typed by the user and embedded in the output file
    names; a separator in it would point outside ``export/`` or at a
    subdirectory that does not exist.
    """
    for sep in (os.sep, os.altsep):
        if sep:  # os.altsep is None on platforms with a single separator
            word = word.replace(sep, '_')
    return word


def export_similar_words(search_word, words, xs, ys):
    """Write the word list and the annotated scatter plot under ``export/``.

    Args:
        search_word: The word the user searched for; used in the text
            header and (sanitised) in both output file names.
        words: The similar words, in the order they should be listed.
        xs: X coordinate of each entry of ``words``.
        ys: Y coordinate of each entry of ``words``.
    """
    name = safe_filename_part(search_word)
    # The with-statement closes the file; no explicit close() is needed.
    with open('export/txt-{}.txt'.format(name), 'w') as handle:
        handle.write("Most similar words to: {}\n".format(search_word))
        for word in words:
            handle.write("{}\n".format(word))

    plt.scatter(xs, ys)
    for label, x, y in zip(words, xs, ys):
        plt.annotate(label, xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.savefig('export/graph-{}.png'.format(name))
    # Reset the shared figure so the next search starts from an empty plot.
    plt.gcf().clear()


def main():
    """Load the vectors, project them to 2D and serve interactive searches.

    For every word typed at the prompt, the ``topn`` most similar words are
    printed, written to ``export/txt-<word>.txt`` and plotted to
    ``export/graph-<word>.png``. The loop ends when input reaches EOF.
    """
    args = build_parser().parse_args()

    # Single-argument print(...) and this shim keep the script runnable on
    # both Python 2 and Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    if not os.path.exists('export'):
        os.mkdir('export')

    print('Loading data {}'.format(args.data))
    model = KV.load_word2vec_format(args.data, binary=False)

    print('Preparing TSNE')
    tsne = TSNE(n_components=2, random_state=0)
    # NOTE(review): `syn0` (here) and `vocab` (below) appear to be the
    # attribute names of older gensim releases -- confirm against the
    # installed gensim version.
    Y = tsne.fit_transform(model.syn0)

    while True:
        try:
            search_word = read_line('Word: ')
        except EOFError:
            # End of input (Ctrl-D or an exhausted pipe): stop cleanly
            # instead of dying with a traceback.
            print('')
            break

        print('Searching most similar words')
        # Only the lookup is guarded, so a KeyError raised by a later step
        # is not misreported as a missing word.
        try:
            similar = model.similar_by_word(search_word, topn=args.topn)
        except KeyError:
            print('Word not in dataset?')
            continue

        # Two spaces after the colon keep the console output unchanged.
        print('Found:  {}'.format(similar))
        words = [w for w, _ in similar]
        rows = [model.vocab[w].index for w in words]
        selected_x = [Y[i, 0] for i in rows]
        selected_y = [Y[i, 1] for i in rows]

        print('Generating plot')
        export_similar_words(search_word, words, selected_x, selected_y)


if __name__ == '__main__':
    main()