Commit 735d401e by manetta

Add the one-hot-vector scripts we wrote earlier during Algolit.

parent 0cb7be0a
# Build a symmetric word co-occurrence table from adjacent word pairs and
# print it as a fixed-width text grid (one row and one column per word).
sentences = [
    "I like deep learning",
    "I like NLP",
    "I enjoy flying",
]

# counter maps a sorted word pair to the number of times the two words appear
# next to each other.  Sorting the pair makes (a, b) and (b, a) share one
# entry, which is what makes the resulting table symmetric.
counter = dict()
vocabulary = set()

for sentence in sentences:
    tokens = sentence.split()
    # zip(tokens, tokens[1:]) yields the same consecutive pairs as
    # nltk.bigrams(tokens).  nltk is not imported anywhere before this point,
    # so the standard-library form keeps the script runnable on its own.
    for ngram in zip(tokens, tokens[1:]):
        ngram_sorted = tuple(sorted(ngram))
        counter[ngram_sorted] = counter.get(ngram_sorted, 0) + 1
        # Only words that occur in at least one pair enter the vocabulary.
        vocabulary.update(ngram_sorted)

# Sorted vocabulary; its order defines both the rows and the columns.
words = sorted(vocabulary)

# matrix[i][j] is the number of times words[i] and words[j] were adjacent.
matrix = [
    [counter.get(tuple(sorted((word1, word2))), 0) for word2 in words]
    for word1 in words
]

# 'Expanded' version of the matrix comprehension above:
#
#     matrix = []
#     for word1 in words:
#         row = []
#         for word2 in words:
#             key = tuple(sorted([word1, word2]))
#             if key in counter:
#                 row.append(counter[key])
#             else:
#                 row.append(0)
#         matrix.append(row)

# A single-argument print(...) produces the same output on Python 2 and 3.
# Header row: an empty 10-character corner cell, then the column labels.
print("{: >10}".format('') + ' ' + ''.join(["{: <10}".format(word) for word in words]))
for k, word in enumerate(words):
    # Row label right-aligned, followed by the left-aligned counts.
    print("{: >10}".format(word) + ' ' + ''.join(["{: <10}".format(c) for c in matrix[k]]))
\ No newline at end of file
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
la = np.linalg


def _show(label, value):
    """Print *label* and *value* separated by a single space.

    Produces the same text as the Python 2 statement ``print label, value``
    while also running unchanged on Python 3.
    """
    print("{} {}".format(label, value))


# Build a symmetric co-occurrence matrix over the vocabulary of `sentences`,
# counting how often two (lower-cased) words stand next to each other.
sentences = [
    "Vandaag hebben we neural networks bestudeerd",
    "Cristina was er ook, en Gijs niet",
    "vandaag was het deep",
    "net zo deep als deep learning",
]
# Alternative input:
# sentences = ["I like deep learning.", "I like NLP.", "I enjoy flying."]

# unique words of the text
prematrix = set()
# Each sentence is tokenised exactly once, so the vocabulary and the bigrams
# are guaranteed to be built from the same tokens (lower-cased, split on any
# whitespace).  Punctuation is not stripped: "ook," is a word of its own.
tokenized = []
for sentence in sentences:
    _show('> sentence: ', sentence)
    words = sentence.lower().split()
    tokenized.append(words)
    prematrix.update(words)
_show('> prematrix: ', prematrix)

# order set & turn into list; the position of a word in pre2 is its
# row/column index in the matrix
pre2 = sorted(prematrix)
_show('> pre2: ', pre2)

# create bigrams: every pair of neighbouring words within a sentence
# (zip(words, words[1:]) yields the same pairs as nltk.bigrams(words))
bigrams = []
for words in tokenized:
    for b in zip(words, words[1:]):
        _show('> bigram:', b)
        bigrams.append(b)
_show('> bigrams: ', bigrams)

# create co-occurrence matrix
# create matrix with zeros, having the length of the vocabulary
# (builtin int replaces the np.int alias, which NumPy has removed)
X = np.zeros((len(pre2), len(pre2)), dtype=int)
_show('> co-occurence matrix (empty): ', X)

# word -> index lookup, built once instead of calling pre2.index() repeatedly
position = {word: index for index, word in enumerate(pre2)}

# for each bigram, add one in both directions so the matrix stays symmetric
for first, second in bigrams:
    i, j = position[first], position[second]
    X[i, j] += 1
    X[j, i] += 1
_show('> co-occurence matrix: ', X)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment