Commit 56547527 authored by manetta

adding a script to work with word2vec in gensim

parent f58072f4
import gensim
from nltk.tokenize import sent_tokenize, word_tokenize
# Train a word2vec model on the sentences of a plain-text book and run a few
# example similarity queries against the learned word vectors.
# from: https://rare-technologies.com/word2vec-tutorial/
# Word2Vec expects a list of tokenised sentences, for example:
# trainingset = [['first', 'sentence'], ['second', 'sentence']]

# Read the whole corpus; the context manager closes the file handle as soon as
# the text has been loaded.
with open('input/frankenstein_gutenberg_plain.txt', 'r') as corpus_file:
    text = corpus_file.read()

# Split the text into sentences, then each sentence into a list of word tokens.
sentences = sent_tokenize(text)
trainingset = [word_tokenize(sentence) for sentence in sentences]
# print(trainingset)

# NOTE(review): `size` is the gensim < 4.0 spelling of the vector dimension;
# gensim >= 4.0 renamed it to `vector_size` -- adjust to the installed version.
model = gensim.models.Word2Vec(trainingset, min_count=1, workers=4, size=200)
# print(model)
# print(model.wv.vocab) # print vocabulary

# saving the model
# model.save('output/mymodel')

# loading the model again
# model = Word2Vec.load_word2vec_format('/tmp/vectors.txt', binary=False)

# training more sentences
# model.train(more_sentences)

# The similarity queries live on the model's KeyedVectors (`model.wv`); the
# same-named methods on the Word2Vec object itself are deprecated pass-throughs
# that were removed in gensim 4.
word_vectors = model.wv

most_similar = word_vectors.most_similar(positive=['human', 'fellow'], negative=['father'], topn=1)
print('most_similar: ', most_similar)

no_match = word_vectors.doesnt_match("human fellow father house".split())
print('no_match: ', no_match)

similarity = word_vectors.similarity('human', 'fellow')
print('similarity: ', similarity)
This source diff could not be displayed because it is too large. You can view the blob instead.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment