Commit 3deb5c70 authored by gijs's avatar gijs

Merge branch 'master' of gitlab.constantvzw.org:algolit/algolit

parents ecfe5e77 c81dafde
This diff is collapsed.
This diff is collapsed.
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#
# This is a combination of two scripts:
# 1. racist_bias.py by Rob Speer
# 2. word2vec_basic.py from the introductory tutorial by TensorFlow
#
# 1. load word-embeddings
# 2. translate word-embeddings to graph
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# from racist_bias.py
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
import codecs
import numpy as np
# import pandas as pd
def load_embeddings(filename):
    """
    Load word vectors from the generalized text format used by word2vec,
    GloVe, fastText, and ConceptNet Numberbatch. The main point where they
    differ is whether there is an initial line with the dimensions of the
    matrix.

    Each data line is a label followed by its space-separated float
    components. A line that splits into exactly two items is treated as the
    shape header and skipped. Blank (or whitespace-only) lines are skipped
    too, so a stray empty line no longer produces an empty label and a
    zero-length vector that breaks the final stacking step.

    Returns a tuple ``(rows, labels, arr)``:
      * ``rows``   -- list of 1-D float32 arrays, one per word, in file order
      * ``labels`` -- list of the words, aligned with ``rows``
      * ``arr``    -- all ``rows`` stacked into one 2-D array

    Raises ``ValueError`` (from ``np.vstack``) if the file contains no
    vectors, or if the vectors do not all have the same length.
    """
    labels = []
    rows = []
    with codecs.open(filename, encoding='utf-8') as infile:
        for line in infile:
            stripped = line.rstrip()
            if not stripped:
                # Nothing on this line; skipping it keeps labels and rows aligned.
                continue
            items = stripped.split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            rows.append(np.array([float(x) for x in items[1:]], 'f'))
    arr = np.vstack(rows)
    return rows, labels, arr
# return pd.DataFrame(arr, index=labels, dtype='f'), rows
# Read the GloVe sample. The per-word row list is bound to `embeddings` first
# and then replaced by the stacked matrix, which is what the code below uses.
loaded = load_embeddings('data/glove.42B.300d_part.txt')
embeddings, labels, array = loaded
embeddings = array
# print(embeddings)
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# from word2vec_basic.py
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# ***********************************************************************************
# Step 6: Visualize the embeddings.
# ***********************************************************************************
def plot_with_labels(low_dim_embs, labels, wordlist, filename='graph-500-least.png', format='png'):
    """
    Step 6: Visualize the embeddings.

    Draws every row of ``low_dim_embs`` (unpacked as an ``x, y`` pair) as a
    scatter point and annotates it with the label at the same index. Labels
    that occur in ``wordlist`` are annotated in red; all others use the
    default text colour. The figure is saved to ``filename`` and a
    confirmation line is printed.

    ``format`` is accepted so existing callers keep working, but it is not
    read anywhere in the body: ``savefig`` is called with ``filename`` only.

    Uses ``plt`` (``matplotlib.pyplot``) as a module-level name; it must be
    imported before this function is called.

    Raises ``AssertionError`` when there are more labels than embedding rows.
    """
    assert low_dim_embs.shape[0] >= len(labels), "More labels than embeddings"
    plt.figure(figsize=(18, 18))  # in inches
    ax = plt.axes(frameon=False)
    ax.get_xaxis().tick_bottom()
    ax.get_xaxis().set_visible(False)
    ax.get_yaxis().set_visible(False)

    # Keyword arguments shared by every annotation; only the colour differs
    # between highlighted and ordinary labels.
    annotation_style = {
        'fontsize': 10,
        'xytext': (5, 2),
        'textcoords': 'offset points',
        'ha': 'right',
        'va': 'bottom',
    }

    for i, label in enumerate(labels):
        x, y = low_dim_embs[i, :]
        # Deliberately still one scatter call per point, as in the original;
        # presumably batching into a single call would change marker colours.
        plt.scatter(x, y)
        if label in wordlist:
            plt.annotate(label, xy=(x, y), color='red', **annotation_style)
        else:
            plt.annotate(label, xy=(x, y), **annotation_style)

    plt.savefig(filename)
    print('*graph plotted*')
# Project the last 500 embeddings to 2-D with t-SNE and plot them, printing
# an installation hint instead when an ImportError is raised.
try:
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    # Words whose labels are highlighted in red on the plot.
    wordlist = ['human', 'learning', 'system']

    # NOTE(review): newer scikit-learn releases appear to have renamed
    # `n_iter` to `max_iter` -- confirm against the installed version.
    tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)

    low_dim_embs = tsne.fit_transform(embeddings[-500:])
    plot_with_labels(low_dim_embs, labels[-500:], wordlist)
except ImportError:
    print("Please install sklearn, matplotlib, and scipy to visualize embeddings.")
This diff is collapsed.
This diff is collapsed.
numpy
pandas
matplotlib
seaborn
statsmodels
sklearn
This diff is collapsed.
[[ -5.32581545e+00 -4.87625234e-03]
[ -9.63402073e-01 -4.80607605e-01]
[ -5.26490023e-01 -8.93244815e-01]
[ -4.85272706e+00 -7.83769284e-03]
[ -9.24649498e-01 -5.05291678e-01]
[ -7.76253677e-01 -6.16420554e-01]
[ -2.08896703e+00 -1.32177985e-01]
[ -5.25071166e-01 -8.95295843e-01]
[ -3.27030114e+00 -3.87356137e-02]
[ -5.14613091e-01 -9.10636292e-01]
[ -6.84533880e-01 -7.01835315e-01]
[ -3.27177948e+00 -3.86772705e-02]
[ -5.71839496e-01 -8.31228595e-01]
[ -1.32061759e+00 -3.10569160e-01]
[ -2.21721518e+00 -1.15312077e-01]
[ -9.39246200e-01 -4.95809600e-01]
[ -1.36339423e+00 -2.95433487e-01]
[ -7.83234720e-01 -6.10508936e-01]
[ -5.41020087e-01 -8.72642578e-01]
[ -1.32669328e-01 -2.08549689e+00]
[ -1.86230749e-01 -1.77243950e+00]
[ -3.10203979e+00 -4.59993376e-02]
[ -7.04952556e-01 -6.81479548e-01]
[ -1.66952580e+00 -2.08669254e-01]
[ -2.38706527e-01 -1.54950059e+00]
[ -2.74106493e+00 -6.66758657e-02]
[ -3.96683007e-01 -1.11641131e+00]
[ -1.84229748e+00 -1.72513372e-01]
[ -4.66005859e+00 -9.51099426e-03]
[ -5.33731842e+00 -4.82034770e-03]
[ -4.24188258e+00 -1.44848953e-02]
[ -9.67458408e+00 -6.28630097e-05]
[ -1.09481985e+00 -4.07366736e-01]
[ -1.40630587e+00 -2.81099583e-01]
[ -1.81679859e+00 -1.77388098e-01]
[ -7.92157785e-01 -6.03062542e-01]
[ -3.70788732e+00 -2.48351462e-02]
[ -1.18113313e+00 -3.66625357e-01]
[ -3.70797881e+00 -2.48328455e-02]
[ -1.85511725e+00 -1.70117846e-01]
[ -4.73748276e+00 -8.79927145e-03]
[ -6.53899837e+00 -1.44698244e-03]
[ -2.81033008e+00 -6.20723627e-02]
[ -1.00348364e+00 -4.56653318e-01]
[ -5.64419678e+00 -3.54426251e-03]
[ -1.71411327e+00 -1.98601401e-01]
[ -1.44771399e+00 -2.68019497e-01]
[ -4.01619776e-01 -1.10634760e+00]]
\ No newline at end of file
['french', 'love']
[[ -9.35114579e-01 -4.98470408e-01]
[ -7.33441620e+00 -6.52897917e-04]]
\ No newline at end of file
['terrorist']
[[ -2.40510482e-05 -1.06353440e+01]]
\ No newline at end of file
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment