Commit 2458ba01 by manetta

adding an Algologs folder with scripts for the session tomorrow

parent 18a4a94e
import nltk

sentences = [
    "I like deep learning",
    "I like NLP",
    "I enjoy flying"
]

counter = dict()
words = list()

# count each (alphabetically sorted) bigram and collect the vocabulary
for sentence in sentences:
    for ngram in nltk.bigrams(sentence.split()):
        ngram_sorted = tuple(sorted(ngram))
        if ngram_sorted not in counter:
            counter[ngram_sorted] = 0
        counter[ngram_sorted] += 1
        for word in ngram_sorted:
            if word not in words:
                words.append(word)

words.sort()

matrix = [[counter[tuple(sorted((word1, word2)))] if tuple(sorted((word1, word2))) in counter else 0 for word2 in words] for word1 in words]

"""
'expanded' version of the matrix comprehension above:

matrix = []
for word1 in words:
    row = []
    for word2 in words:
        key = tuple(sorted([word1, word2]))
        if key in counter:
            row.append(counter[key])
        else:
            row.append(0)
    matrix.append(row)
"""

print("{: >10}".format('') + ' ' + ''.join(["{: <10}".format(word) for word in words]))
for k, word in enumerate(words):
    print("{: >10}".format(word) + ' ' + ''.join(["{: <10}".format(c) for c in matrix[k]]))
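For the three example sentences, the printed table should look roughly like this (worked out by hand from the bigram counts; note that Python's sort puts the uppercase tokens "I" and "NLP" before the lowercase words):

           I         NLP       deep      enjoy     flying    learning  like
         I 0         0         0         1         0         0         2
       NLP 0         0         0         0         0         0         1
      deep 0         0         0         0         0         1         1
     enjoy 1         0         0         0         1         0         0
    flying 0         0         0         1         0         0         0
  learning 0         0         1         0         0         0         0
      like 2         1         1         0         0         0         0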
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk

la = np.linalg

sentences = [
    "Vandaag hebben we neural networks bestudeerd",  # "Today we studied neural networks"
    "Cristina was er ook, en Gijs niet",             # "Cristina was there too, and Gijs wasn't"
    "vandaag was het deep",                          # "today it was deep"
    "net zo deep als deep learning"                  # "just as deep as deep learning"
]
# sentences = ["I like deep learning.", "I like NLP.", "I enjoy flying."]

# unique words of the text
prematrix = set()
for sentence in sentences:
    print('> sentence: ', sentence)
    words = sentence.split(" ")
    for word in words:
        word = word.lower()
        word = word.strip()
        prematrix.add(word)
print('\n> prematrix: \n', prematrix)

# order set & turn into list
pre2 = sorted(list(prematrix))
print('\n> pre2: \n', pre2, '\n')

# create bigrams
bigrams = []
for sentence in sentences:
    for b in nltk.bigrams(sentence.lower().split()):
        print('> bigram:', b)
        bigrams.append(b)
print('\n> bigrams: \n', bigrams)

# create co-occurrence matrix
# start from a matrix of zeros with the dimensions of the vocabulary
X = np.zeros((len(pre2), len(pre2)), dtype=int)  # plain int: np.int is deprecated
print('\n> co-occurrence matrix (empty): \n', X)

# for each bigram, add one (in both directions, keeping the matrix symmetric)
for b in bigrams:
    X[pre2.index(b[0]), pre2.index(b[1])] += 1
    X[pre2.index(b[1]), pre2.index(b[0])] += 1
print('\n> co-occurrence matrix: \n', X)
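The script imports matplotlib.pyplot and aliases np.linalg without using them; in the classic version of this example they serve a follow-up step, factorising the co-occurrence matrix with SVD and plotting every word in two dimensions. A hedged sketch of that continuation (not part of the original script), reusing X, pre2, la and plt from above:

# hedged sketch: reduce the co-occurrence matrix with SVD and place each word
# at the coordinates given by its first two left singular vectors
U, s, Vh = la.svd(X, full_matrices=False)
for i in range(len(pre2)):
    plt.text(U[i, 0], U[i, 1], pre2[i])
plt.axis([-1, 1, -1, 1])  # axis limits are an assumption
plt.show()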
import os
from gensim.models import Word2Vec

# based on the following tutorial: https://rare-technologies.com/word2vec-tutorial/

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        # stream the corpus one line at a time, file by file
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                # more text-processing steps can be added here
                # for example: "convert to unicode, lowercase, remove numbers, extract named entities…"
                print(line.split())
                yield line.split()  # returns a list of words

sentences = MySentences('./input/test')
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
# model = Word2Vec(sentences, size=1, window=1, min_count=1, workers=1)
print(model)
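Once trained, the model can be saved and queried for nearest neighbours in the vector space. A minimal sketch of such a follow-up (the file path and the query word 'deep' are placeholders; the word must occur at least min_count times in the corpus, and gensim 4.x renamed the size parameter to vector_size):

# hedged sketch: persist and query the model trained above
model.save('./word2vec_test.model')     # path is a placeholder
print(model.wv.most_similar('deep'))    # most similar words by cosine similarity
print(model.wv['deep'])                 # the raw 100-dimensional vector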
import sys
from nltk import ngrams

# text = 'this is a test sentence for now'
text = sys.stdin.read()

# out = ngrams(text, 3)
n = 3
out = [text[i:i+n] for i in range(len(text)-n+1)]  # character trigrams
print(list(out))
# e.g. for 'this is a test': ['thi', 'his', 'is ', 's i', ' is', 'is ', 's a', ...]

out = str(out)
sys.stdout.write(out)
import sys
from nltk.tokenize import TreebankWordTokenizer

# >>> std.in
# text = 'this is a test sentence for now'
text = sys.stdin.read()

# >>> tokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

# >>> ngrams
# from nltk import ngrams
# out = ngrams(tokens, 3)
n = 3
out = [tokens[i:i+n] for i in range(len(tokens)-n+1)]
# print(list(out))
# [['this', 'is', 'a'], ['is', 'a', 'test'], ['a', 'test', 'sentence'],
#  ['test', 'sentence', 'for'], ['sentence', 'for', 'now']]

# >>> std.out
out = str(out)
sys.stdout.write(out)
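To check the trigram slicing without piping anything through stdin, the same logic can be run on a hardcoded sentence; a small standalone sketch:

from nltk.tokenize import TreebankWordTokenizer

tokens = TreebankWordTokenizer().tokenize('this is a test sentence for now')
n = 3
print([tokens[i:i+n] for i in range(len(tokens)-n+1)])
# [['this', 'is', 'a'], ['is', 'a', 'test'], ['a', 'test', 'sentence'],
#  ['test', 'sentence', 'for'], ['sentence', 'for', 'now']]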
import sys
from nltk.tokenize import TreebankWordTokenizer

# text = 'this is a test sentence for now'
text = sys.stdin.read()

tokenizer = TreebankWordTokenizer()
out = tokenizer.tokenize(text)

out = str(out)
sys.stdout.write(out)
print(out)
import sys
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
from collections import Counter
import json

# # >>> std.in
# text = sys.stdin.read()
# print(text)
text = open('input/frankenstein_gutenberg_plain.txt', 'r').read()

# >>> tokenizer
sentences = sent_tokenize(text)
print(sentences)

tokenizer = TreebankWordTokenizer()
data = []
for sentence in sentences:
    tokens = tokenizer.tokenize(sentence)
    data.append(tokens)
# data = [['this','is','a','sentence'], ['this','is','another','sentence']]

# >>> ngrams
# from nltk import ngrams
# ngrams = ngrams(tokens, 3)
n = 3
all_ngrams = []
for sentence_tokens in data:
    current_ngrams = [sentence_tokens[i:i+n] for i in range(len(sentence_tokens)-n+1)]
    for ngram in current_ngrams:
        all_ngrams.append(ngram)
# print(all_ngrams)
# [['this', 'is', 'a'], ['is', 'a', 'test'], ['a', 'test', 'sentence'], ...]

# >>> filter: count the words in the trigram window around each target word
targets = ['human', 'fellow']
ngrams1 = []
ngrams2 = []
counter1 = Counter()
counter2 = Counter()
# e.g. Counter({u'of': 76, u'the': 50, ...})
for ngram in all_ngrams:
    if targets[0] == ngram[1]:
        ngrams1.append(ngram)
        for word in ngram:
            counter1[word] += 1
    if targets[1] == ngram[1]:
        ngrams2.append(ngram)
        for word in ngram:
            counter2[word] += 1

out = {}
for word, value in counter1.items():
    # skip the target words themselves
    if word not in targets:
        # if the word also appeared in the window of the second target word
        if word in counter2:
            val1 = value
            val2 = counter2[word]
            sentences1 = []
            sentences2 = []
            # collect the sentences in which the word directly precedes
            # or follows a target word (guarding against index over/underflow)
            for sentence in data:
                if targets[0] in sentence:
                    index = sentence.index(targets[0])
                    if index > 0 and sentence[index-1] == word:
                        sentences1.append(' '.join(sentence))
                    if index + 1 < len(sentence) and sentence[index+1] == word:
                        sentences1.append(' '.join(sentence))
                if targets[1] in sentence:
                    index = sentence.index(targets[1])
                    if index > 0 and sentence[index-1] == word:
                        sentences2.append(' '.join(sentence))
                    if index + 1 < len(sentence) and sentence[index+1] == word:
                        sentences2.append(' '.join(sentence))
            out[word] = {
                targets[0]: {
                    'freq': val1,
                    'sentences': sentences1
                },
                targets[1]: {
                    'freq': val2,
                    'sentences': sentences2
                }
            }
# out = {
#     'of': {
#         'human': {
#             'freq': 23,
#             'sentences': ['sentence 1', 'sentence 2']
#         },
#         'fellow': {
#             'freq': 111,
#             'sentences': ['sentence 1', 'sentence 2']
#         }
#     }
# }

# >>> std.out
out = json.dumps(out, sort_keys=True, indent=4, separators=(',', ': '))
sys.stdout.write(out)
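The core idea of the script above, finding context words that two target words share, can be checked on a toy list of trigrams. A small self-contained sketch (the toy data is invented for illustration):

from collections import Counter

# toy trigrams whose middle element is one of the target words
toy = [['the', 'human', 'mind'], ['a', 'fellow', 'mind'], ['the', 'fellow', 'being']]
targets = ['human', 'fellow']

c1 = Counter(w for ng in toy if ng[1] == targets[0] for w in ng)
c2 = Counter(w for ng in toy if ng[1] == targets[1] for w in ng)

# words that occur in the window of both targets (excluding the targets)
shared = {w: (c1[w], c2[w]) for w in c1 if w not in targets and w in c2}
print(shared)  # {'the': (1, 1), 'mind': (1, 1)}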
 Tensor("Variable/read:0", shape=(5000, 128), dtype=float32, device=/device:CPU:0) \ No newline at end of file
[]