Commit da4dea9f by manetta

added the onehotvector script to the word2vec inspection script

parent 735d401e
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
def export_onehotvector_for_trainingbatch(batch):
    """Build the sorted vocabulary and bigram co-occurrence matrix of a batch.

    The batch is lower-cased and split on any run of whitespace. Every pair of
    adjacent tokens (a bigram) adds one to the matrix cell for that pair and
    one to the mirrored cell, so the matrix is symmetric. Despite the function
    name, the returned matrix holds co-occurrence counts, not one-hot vectors.

    Each intermediate result is printed to stdout as a debugging trace.

    Args:
        batch: The text of one training batch, as a single string.

    Returns:
        A tuple ``(sortedbatch, X)``. ``sortedbatch`` is the sorted list of
        unique lower-cased tokens. ``X`` is a square integer NumPy array with
        one row and one column per entry of ``sortedbatch``, in the same
        order. An empty or whitespace-only batch gives ``[]`` and a 0x0 array.
    """
    # Single-argument print(...) with %-formatting emits the same text under
    # both the Python 2 print statement and the Python 3 print function.
    print('> sentence:  %s' % (batch,))

    # Tokenise once and reuse the result for the vocabulary and the bigrams.
    # Splitting them separately (on " " in one place, on any whitespace in the
    # other) let bigram tokens be missing from the vocabulary and put empty
    # strings into it.
    tokens = batch.lower().split()

    # unique words of the text
    prematrix = set(tokens)
    print('> prematrix:  %s' % (prematrix,))

    # order set & turn into list
    pre2 = sorted(prematrix)
    print('> pre2:  %s' % (pre2,))

    # create bigrams: every pair of adjacent tokens
    bigrams = []
    for b in zip(tokens, tokens[1:]):
        print('> bigram: %s' % (b,))
        bigrams.append(b)
    print('> bigrams:  %s' % (bigrams,))

    # create co-occurrence matrix
    # create matrix with zeros, having the length of the vocabulary
    # (builtin int: the np.int alias no longer exists in current NumPy)
    X = np.zeros((len(pre2), len(pre2)), dtype=int)
    print('> co-occurence matrix (empty):  %s' % (X,))

    # Map each word to its row/column once instead of scanning the list with
    # pre2.index() for every lookup.
    position = {word: index for index, word in enumerate(pre2)}

    # for each bigram, add one in both directions
    for first, second in bigrams:
        row, col = position[first], position[second]
        X[row, col] += 1
        # When a word follows itself (row == col) this hits the same cell
        # again, so the diagonal entry grows by two per such bigram.
        X[col, row] += 1
    print('> co-occurence matrix:  %s' % (X,))

    sortedbatch = pre2
    return sortedbatch, X
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment