Commit 7797d84e authored by Cristina
Browse files

Merge branch 'master' of gitlab.constantvzw.org:algolit/algolit

parents 40b0a492 bde312a0
from gensim.models import KeyedVectors as KV
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sys import argv
import argparse
import os.path
import os
# Interactive word2vec explorer.
#
# Loads a text-format word2vec file, projects every vector of the model to 2D
# with t-SNE once, then repeatedly prompts for a word and exports its most
# similar words as a text list and as a labelled scatter plot under export/.
parser = argparse.ArgumentParser(description='Find closest words in word2vec.')
parser.add_argument('data', help="Path to the dataset")
# nargs='?' makes this positional optional; without it argparse ignores the
# default and always demands a value on the command line.
parser.add_argument('topn', type=int, default=10, nargs='?',
                    help="Amount of words to look for")
args = parser.parse_args()

if not os.path.exists('export'):
    os.mkdir('export')

print('Loading data {}'.format(args.data))
model = KV.load_word2vec_format(args.data, binary=False)

# The projection is computed once for all vectors; rows of Y are addressed
# below through model.vocab[word].index.
print('Preparing TSNE')
tsne = TSNE(n_components=2, random_state=0)
Y = tsne.fit_transform(model.syn0)

# The prompt builtin is raw_input on Python 2 and input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

while True:
    try:
        search_word = read_line('Word: ')
    except EOFError:
        # End of input (e.g. Ctrl-D): leave the loop instead of a traceback.
        print('')
        break

    print('Searching most similar words')
    try:
        similar = model.similar_by_word(search_word, topn=args.topn)
    except KeyError:
        print('Word not in dataset?')
        continue

    # Two spaces after the colon reproduce the original `print a, b` output.
    print('Found:  {}'.format(similar))
    words = [w for w, _ in similar]
    rows = [model.vocab[w].index for w in words]
    selected_x = [Y[i, 0] for i in rows]
    selected_y = [Y[i, 1] for i in rows]

    print('Generating plot')
    # The with-statement closes the file; no explicit close() is needed.
    with open('export/txt-{}.txt'.format(search_word), 'w') as h:
        h.write("Most similar words to: {}\n".format(search_word))
        h.writelines("{}\n".format(w) for w in words)

    plt.scatter(selected_x, selected_y)
    for label, x, y in zip(words, selected_x, selected_y):
        plt.annotate(label, xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.savefig('export/graph-{}.png'.format(search_word))
    # Reset the figure so the next query starts from an empty plot.
    plt.gcf().clear()
from gensim.models import KeyedVectors as KV
import numpy as np
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sys import argv
import argparse
import os.path
import os
# Interactive word2vec explorer.
#
# Loads a text-format word2vec file, then repeatedly prompts for a word,
# looks up its most similar words, projects only those words' vectors to 2D
# with t-SNE, and exports the result as a text list and as a labelled scatter
# plot under export/.
parser = argparse.ArgumentParser(description='Find closest words in word2vec.')
parser.add_argument('data', help="Path to the dataset")
# nargs='?' makes this positional optional so the default of 10 applies.
parser.add_argument('topn', type=int, default=10, nargs='?',
                    help="Amount of words to look for")
args = parser.parse_args()

if not os.path.exists('export'):
    os.mkdir('export')

print('Loading data {}'.format(args.data))
model = KV.load_word2vec_format(args.data, binary=False)

# The prompt builtin is raw_input on Python 2 and input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

while True:
    try:
        search_word = read_line('Word: ')
    except EOFError:
        # End of input (e.g. Ctrl-D): leave the loop instead of a traceback.
        print('')
        break

    print('Searching most similar words')
    try:
        similar = model.similar_by_word(search_word, topn=args.topn)
    except KeyError:
        print('Word not in dataset?')
        continue

    # Two spaces after the colon reproduce the original `print a, b` output.
    print('Found:  {}'.format(similar))
    words = [w for w, _ in similar]
    vecs = [model.word_vec(w) for w in words]

    # The projection is fitted per query on the neighbours' vectors only, so
    # row k of Y corresponds to words[k].
    print('Preparing TSNE')
    tsne = TSNE(n_components=2, random_state=0)
    Y = tsne.fit_transform(vecs)
    selected_x = Y[:, 0]
    selected_y = Y[:, 1]

    print('Generating plot')
    # The with-statement closes the file; no explicit close() is needed.
    with open('export/txt-{}.txt'.format(search_word), 'w') as h:
        h.write("Most similar words to: {}\n".format(search_word))
        h.writelines("{}\n".format(w) for w in words)

    plt.scatter(selected_x, selected_y)
    for label, x, y in zip(words, selected_x, selected_y):
        plt.annotate(label, xy=(x, y), xytext=(0, 0),
                     textcoords='offset points')
    plt.savefig('export/graph-{}.png'.format(search_word))
    # Reset the figure so the next query starts from an empty plot.
    plt.gcf().clear()
gensim
scikit-learn
matplotlib
\ No newline at end of file
This source diff could not be displayed because it is too large. You can view the blob instead.
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment