# Commit c95c1664 by gijs
#
# TSNE less ambitious
#
# parent 185bfa15
from gensim.models import KeyedVectors as KV
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sys import argv
import argparse
import os.path
import os
# Command-line interface: a required dataset path plus an optional number of
# neighbours to look up (defaults to 10 when omitted).
parser = argparse.ArgumentParser(description='Find closest words in word2vec.')
parser.add_argument('data', help="Path to the dataset")
parser.add_argument('topn', type=int, default=10, nargs='?',
                    help="Amount of words to look for")
args = parser.parse_args()

# The raw argv reads that used to follow were dropped: they were never used,
# and argv[0] is the script name rather than the dataset. Use args.data and
# args.topn instead.

# Result files are written below ./export, so create it on first run.
if not os.path.exists('export'):
    os.mkdir('export')

# Single-argument parenthesised print emits the same text as the Python 2
# print statement.
print('Loading data {}'.format(args.data))
# binary=False: the dataset is read as a non-binary word2vec file.
model = KV.load_word2vec_format(args.data, binary=False)
# Interactive loop: prompt for a word, fetch its args.topn most similar words
# from the model, project their vectors to 2-D with t-SNE, then write the word
# list to export/txt-<word>.txt and a labelled scatter plot to
# export/graph-<word>.png.
while True:
    try:
        search_word = raw_input('Word: ')
    except EOFError:
        # stdin was closed (e.g. Ctrl-D): end the session instead of dying
        # with a traceback. The empty print terminates the prompt line.
        print('')
        break

    print('Searching most similar words')
    try:
        # Only the lookup is guarded, so a KeyError raised by a later step is
        # no longer misreported as a missing word.
        similar = model.similar_by_word(search_word, topn=args.topn)
    except KeyError:
        print('Word not in dataset?')
        continue
    # Emits the same text as the Python 2 statement `print 'Found: ', similar`
    # (the statement inserts a space between its two items).
    print('Found:  {}'.format(similar))

    # Each entry of `similar` unpacks into a (word, score) pair; only the word
    # is needed from here on.
    words = [word for word, _ in similar]
    vecs = [model.word_vec(word) for word in words]

    print('Preparing TSNE')
    # A fixed random_state is passed so the seed does not vary between runs.
    tsne = TSNE(n_components=2, random_state=0)
    embedding = tsne.fit_transform(vecs)
    # Column 0 is used as the x coordinate and column 1 as the y coordinate.
    selected_x = embedding[:, 0]
    selected_y = embedding[:, 1]

    print('Generating plot')
    # NOTE(review): on Python 2, formatting a non-ASCII unicode word into a
    # byte-string template may raise UnicodeEncodeError -- confirm what type
    # the model returns for words before relying on this for such vocabularies.
    # The `with` block closes the file itself; no explicit close() is needed.
    with open('export/txt-{}.txt'.format(search_word), 'w') as h:
        h.write("Most similar words to: {}\n".format(search_word))
        for w in words:
            h.write("{}\n".format(w))

    plt.scatter(selected_x, selected_y)
    for label, x, y in zip(words, selected_x, selected_y):
        # Zero offset: the label is anchored directly on its point.
        plt.annotate(label, xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.savefig('export/graph-{}.png'.format(search_word))
    # Clear the current figure so the next word starts from an empty plot.
    plt.gcf().clear()
# Markdown is supported
# 0% or
# You are about to add 0 people to the discussion. Proceed with caution.
# Finish editing this message first!
# Please register or sign in to comment