Commit ecfe5e77 authored by gijs

Script to generate graphs and wordlists of neighbouring words

parent 43d7addb
from gensim.models import KeyedVectors as KV
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sys import argv
import argparse
import os.path
import os
# Command-line interface: a required path to the word2vec dataset and an
# optional number of neighbouring words to report for each query.
parser = argparse.ArgumentParser(description='Find closest words in word2vec.')
parser.add_argument('data', help="Path to the dataset")
# nargs='?' is what makes this positional optional; without it argparse
# demands a value on the command line and the declared default is never used.
parser.add_argument('topn', type=int, nargs='?', default=10,
                    help="Amount of words to look for")
args = parser.parse_args()

# Mirror the parsed values under the script's historical names. Reading raw
# argv here was off by one (argv[0] is the script path, not the dataset) and
# skipped argparse's int conversion and default handling.
dataset = args.data
similar_amount = args.topn
# Output directory that receives the generated word lists and graphs;
# create it on first run.
export_dir = 'export'
if not os.path.exists(export_dir):
    os.mkdir(export_dir)

# Load the vectors from the path given on the command line
# (binary=False selects the non-binary loader).
print('Loading data {}'.format(args.data))
model = KV.load_word2vec_format(args.data, binary=False)

# Fit a 2-component t-SNE over model.syn0 once, up front, so later lookups
# only need to index into Y. random_state is pinned to 0 so repeated runs use
# the same seed.
print('Preparing TSNE')
tsne = TSNE(n_components=2, random_state=0)
Y = tsne.fit_transform(model.syn0)
# Interactive loop: prompt for a word, look up its most similar words in the
# model, then export them as a text list (export/txt-<word>.txt) and as a
# labelled scatter plot of their t-SNE coordinates (export/graph-<word>.png).
# The loop ends on end-of-input (Ctrl-D) or Ctrl-C at the prompt.

# raw_input only exists on Python 2; Python 3 provides the same behaviour
# under the name input.
try:
    read_word = raw_input
except NameError:
    read_word = input

while True:
    try:
        search_word = read_word('Word: ')
    except (EOFError, KeyboardInterrupt):
        # Finish the prompt line and stop cleanly instead of dying with a
        # traceback.
        print('')
        break

    print('Searching most similar words')
    try:
        similar = model.similar_by_word(search_word, topn=args.topn)
    except KeyError:
        # Only the lookup is guarded, so a KeyError raised while writing the
        # files or plotting is no longer misreported as a missing word.
        print('Word not in dataset?')
        continue
    # Two spaces after the colon reproduce the output of the former
    # `print 'Found: ', similar` statement.
    print('Found:  {}'.format(similar))

    # similar holds (word, score) pairs; only the words are exported.
    words = [w for w, _ in similar]
    # Each word's vocabulary index selects its row in the t-SNE output Y.
    indices = [model.vocab[w].index for w in words]
    selected_x = [Y[i, 0] for i in indices]
    selected_y = [Y[i, 1] for i in indices]

    print('Generating plot')
    # The with-statement closes the file; no explicit close() is needed.
    with open('export/txt-{}.txt'.format(search_word), 'w') as h:
        h.write("Most similar words to: {}\n".format(search_word))
        for w in words:
            h.write("{}\n".format(w))

    plt.scatter(selected_x, selected_y)
    for label, x, y in zip(words, selected_x, selected_y):
        plt.annotate(label, xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.savefig('export/graph-{}.png'.format(search_word))
    # Wipe the current figure so the next query starts from an empty plot.
    plt.gcf().clear()
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment