Commit 021b80f2 by manetta

changes in the algolit basic script of word2vec

parent 80b9da0d
@@ -23,6 +23,7 @@ import math
import os
import random
import zipfile
import codecs
import numpy as np
from six.moves import urllib
@@ -64,7 +65,7 @@ def read_data(filename):
# ALGOLIT step 1: read data from plain text file
# ***********************************************************************************
filename = 'input/Mankind-in-the-Making.txt'
filename = 'input/mankind-in-the-making_stripped.txt'
words = []
def read_input_text(filename):
@@ -73,12 +74,11 @@ def read_input_text(filename):
"""
with open(filename, 'r') as source:
for line in source:
wordsline = line.split(' ')
for word in wordsline:
words.append(word)
w = source.readlines()[0].split(' ')
for word in w:
words.append(word)
# print('Data size:', len(words))s
print('Data size:', len(words))
read_input_text(filename)
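
Note on the new reader: readlines()[0] only ever sees the first line of the file, which works here because the stripped text presumably sits on one long line. A slightly more defensive variant (our suggestion, not part of the commit) that collects the words whether or not the file contains line breaks:

words = []
def read_input_text(filename):
    # read() + split() gathers every whitespace-separated token, whether the
    # stripped text sits on one line or several; split() without an argument
    # also drops the empty tokens that split(' ') would keep
    with open(filename, 'r') as source:
        for word in source.read().split():
            words.append(word)

read_input_text('input/mankind-in-the-making_stripped.txt')
print('Data size:', len(words))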
@@ -187,6 +187,7 @@ def export_dictionary(reverse_dictionary):
"""
with open("dictionary.txt", 'w+') as out:
out.write(str(reverse_dictionary))
print('*dictionary.txt written*')
export_dictionary(reverse_dictionary)
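
dictionary.txt ends up holding str(reverse_dictionary), i.e. a Python dict literal mapping index numbers to words (something like {0: 'UNK', 1: 'the', ...}). A sketch (ours; the helper name and the example word are made up) of reading it back and inverting it, which is what the validation-word comment further down suggests doing by hand:

import ast

def load_reverse_dictionary(path='dictionary.txt'):
    # dictionary.txt holds str(reverse_dictionary), a dict literal
    # of the form {index: word, ...}, so ast can parse it back
    with open(path) as source:
        return ast.literal_eval(source.read())

reverse_dictionary = load_reverse_dictionary()
# invert it to map word -> index number
dictionary = dict((word, index) for index, word in reverse_dictionary.items())
print(dictionary.get('mankind'))   # index number of the word, or None if absent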
@@ -289,8 +290,7 @@ batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)
Step 4: Build and train a skip-gram model.
"""
batch_size = 128
embedding_size = 20 # Dimension of the embedding vector.
# > what is embedding_size?
embedding_size = 300 # Dimension of the embedding vector.
skip_window = 1 # How many words to consider left and right.
num_skips = 2 # How many times to reuse an input to generate a label.
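
For readers wondering what these parameters do: embedding_size only sets the length of each word vector (now 300 instead of 20), while skip_window and num_skips shape the training pairs. A toy illustration (ours, with a made-up sentence):

sentence = ['what', 'is', 'mankind', 'in', 'the', 'making']   # made-up toy sentence
skip_window = 1   # look one word to the left and one to the right of each center word
for i, center in enumerate(sentence):
    context = sentence[max(0, i - skip_window):i] + sentence[i + 1:i + 1 + skip_window]
    for label in context:
        print(center + ' -> ' + label)
# num_skips caps how many of these (center word, context word) pairs are actually
# sampled per center word when a training batch is generated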
@@ -309,7 +309,7 @@ graph = tf.Graph()
# ALGOLIT extension:
# Pick your own set of validation words!
# You can use the file dictionary.txt to look up the index numbers of certain words.
valid_examples = [3095, 171] # pick words by their index numbers
valid_examples = [341, 397, 2604, 3508, 4323] # pick words by their index numbers
valid_size = len(valid_examples) # Number of words to evaluate similarity on.
valid_window = 100 # Only pick the 100 samples in the head of the distribution.
num_sampled = valid_window - embedding_size - valid_size # Number of negative examples to sample.
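
One arithmetic consequence of the new settings worth flagging (our observation, not part of the commit): num_sampled is derived as valid_window - embedding_size - valid_size, which with the old values gave 100 - 20 - 2 = 78 negative samples, but with embedding_size = 300 and five validation words it evaluates to 100 - 300 - 5 = -205. tf.nn.nce_loss needs a positive number of negative samples, so this expression presumably has to be revisited alongside the embedding_size change.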
@@ -341,10 +341,10 @@ with graph.as_default():
embeddings = tf.Variable(
tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
print('embeddings', embeddings)
# >>> print the big random matrix
# output: Tensor("Variable/read:0", shape=(5000, 20), dtype=float32)
# matrix = 5000 x 20
# print('embeddings', embeddings)
# **************
# *** NODE 2 ***
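
Before node 2: the commented-out print above used to show shape=(5000, 20); with embedding_size at 300 the same variable becomes a 5000 x 300 matrix. A plain NumPy sketch (ours; the 5000 comes from the shape noted in the old comment) of what this initialisation amounts to:

import numpy as np
vocabulary_size = 5000   # the size shown in the old shape comment above
embedding_size = 300
# one row per vocabulary word, each entry drawn uniformly from [-1.0, 1.0),
# mirroring tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0)
embeddings = np.random.uniform(-1.0, 1.0, size=(vocabulary_size, embedding_size))
print(embeddings.shape)   # (5000, 300)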
@@ -354,9 +354,9 @@ with graph.as_default():
# embed = 128 x 20
embed = tf.nn.embedding_lookup(embeddings, train_inputs)
print('embed', embed)
# >>> print this embed
# output: Tensor("embedding_lookup:0", shape=(128, 20), dtype=float32)
# print('embed', embed)
# **************
# *** NODE 3 ***
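
Before node 3: tf.nn.embedding_lookup simply selects rows of the embeddings matrix by the index numbers in train_inputs, so with batch_size = 128 the tensor that used to print as shape=(128, 20) now becomes 128 x 300. A NumPy equivalent (ours, with a toy batch):

import numpy as np
embeddings = np.random.uniform(-1.0, 1.0, size=(5000, 300))
train_inputs = np.array([17, 342, 4, 999])   # a toy batch of word index numbers
embed = embeddings[train_inputs]             # row selection, like tf.nn.embedding_lookup
print(embed.shape)                           # (4, 300): one vector per input word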
@@ -368,14 +368,17 @@ with graph.as_default():
tf.truncated_normal([vocabulary_size, embedding_size],
stddev=1.0 / math.sqrt(embedding_size)))
print('nce_weights', nce_weights)
# >>> print nce_weights
# print('nce_weights', nce_weights)
# **************
# *** NODE 4 ***
# **************
nce_biases = tf.Variable(tf.zeros([vocabulary_size]))
print('nce_biases', nce_biases)
# >>> print nce_biases
# **************
# *** NODE 5 ***
# **************
@@ -387,6 +390,9 @@ with graph.as_default():
tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels,
num_sampled, vocabulary_size))
print('loss', loss)
# >>> print loss
# **************
# *** NODE 6 ***
# **************
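
tf.nn.nce_loss trains the network to tell the true context word apart from num_sampled randomly drawn "noise" words. The toy NumPy sketch below (ours, not TensorFlow's actual sampled implementation) shows the idea for a single training pair:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def toy_nce_loss(embed_vec, nce_weights, nce_biases, true_index, noise_indices):
    # score of the real context word: pushed towards 1
    true_logit = nce_weights[true_index].dot(embed_vec) + nce_biases[true_index]
    # scores of the randomly sampled noise words: pushed towards 0
    noise_logits = nce_weights[noise_indices].dot(embed_vec) + nce_biases[noise_indices]
    return -np.log(sigmoid(true_logit)) - np.sum(np.log(1.0 - sigmoid(noise_logits)))

# usage: toy_nce_loss(embed[0], nce_weights, nce_biases, true_index=42, noise_indices=[7, 99, 1833])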
@@ -425,11 +431,13 @@ with graph.as_default():
# ***********************************************************************************
# Step 5: Begin training.
# ***********************************************************************************
np.set_printoptions(threshold=np.inf) # print full arrays instead of NumPy's truncated representations
"""
Step 5: Begin training.
"""
num_steps = 100001
num_steps = 1001
# num_steps = 100001
with tf.Session(graph=graph) as session:
# We must initialize all variables before we use them.
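
Two technical effects of this hunk, for context: np.set_printoptions(threshold=np.inf) makes the later print of final_embeddings show the complete matrix rather than NumPy's truncated view, and with num_steps at 1001 instead of 100001 the run finishes quickly at the cost of far less trained vectors.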
@@ -453,9 +461,10 @@ with tf.Session(graph=graph) as session:
# The average loss is an estimate of the loss over the last 2000 batches.
print("Average loss at step ", step, ": ", average_loss)
average_loss = 0
# Note that this is expensive (~20% slowdown if computed every 500 steps)
if step % 10000 == 0:
print ("\n")
with open('logfile.txt', 'a') as destination:
sim = similarity.eval()
for i in xrange(valid_size):
@@ -469,7 +478,10 @@ with tf.Session(graph=graph) as session:
print(log_str)
destination.write(log_str+'\n')
destination.write('\n\n')
print ("\n")
print('*logfile.txt written*')
final_embeddings = normalized_embeddings.eval()
print('Final embeddings: \n\n', final_embeddings)
# ***********************************************************************************
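
Since normalized_embeddings are unit length, nearest neighbours of a word reduce to a dot product over final_embeddings. A small sketch (ours; the helper name is hypothetical) for querying the trained vectors once the session has finished:

import numpy as np

def nearest_words(word_index, final_embeddings, reverse_dictionary, top_k=8):
    # rows of final_embeddings are unit length, so the dot product is the cosine similarity
    sims = final_embeddings.dot(final_embeddings[word_index])
    order = np.argsort(-sims)[1:top_k + 1]   # skip the query word itself
    return [reverse_dictionary[i] for i in order]

# e.g. nearest_words(341, final_embeddings, reverse_dictionary),
# with 341 being one of the validation index numbers picked above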
@@ -501,7 +513,7 @@ try:
tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only,:])
labels = [reverse_dictionary[i] for i in xrange(plot_only)]
labels = [reverse_dictionary[i].decode('utf8') for i in xrange(plot_only)]
plot_with_labels(low_dim_embs, labels)
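
The added .decode('utf8') presumably converts the byte-string vocabulary words into unicode before matplotlib uses them as plot labels (the script runs under Python 2, where plain strings are bytes). For reference, a self-contained sketch (ours) of the projection-and-plot step, assuming final_embeddings and reverse_dictionary exist as computed above:

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)
plot_only = 500
low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
labels = [reverse_dictionary[i].decode('utf8') for i in range(plot_only)]

plt.figure(figsize=(18, 18))
for i, label in enumerate(labels):
    x, y = low_dim_embs[i, :]
    plt.scatter(x, y)
    plt.annotate(label, xy=(x, y), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom')
plt.savefig('tsne.png')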