Commit 2458ba01 by manetta

adding an Algologs folder with scripts for the session tomorrow

parent 18a4a94e
 import nltk

 # Build and print a symmetric word co-occurrence table: for every pair of
 # words, how often they appear directly next to each other in a sentence.

 # Toy corpus; each sentence is split on whitespace below.
 sentences = [
     "I like deep learning",
     "I like NLP",
     "I enjoy flying",
 ]

 # counter maps an alphabetically sorted (word, word) pair to the number of
 # times the two words were adjacent. Sorting the pair makes the count
 # independent of word order, so ('I', 'like') and ('like', 'I') share a key.
 counter = dict()
 # words collects every word that occurs in at least one bigram.
 words = list()

 for sentence in sentences:
     for ngram in nltk.bigrams(sentence.split()):
         ngram_sorted = tuple(sorted(ngram))
         counter[ngram_sorted] = counter.get(ngram_sorted, 0) + 1

         for word in ngram_sorted:
             if word not in words:
                 words.append(word)

 words.sort()

 # One row per word, one column per word; pairs that were never adjacent
 # get a count of 0.
 matrix = [
     [counter.get(tuple(sorted((word1, word2))), 0) for word2 in words]
     for word1 in words
 ]

 # 'Expanded' version of the comprehension above:
 #
 # matrix = []
 # for word1 in words:
 #     row = []
 #     for word2 in words:
 #         key = tuple(sorted([word1, word2]))
 #         if key in counter:
 #             row.append(counter[key])
 #         else:
 #             row.append(0)
 #     matrix.append(row)

 # Header line: an empty, right-aligned corner cell followed by the words.
 print("{: >10}".format('') + ' ' + ''.join(["{: <10}".format(word) for word in words]))

 # One line per word: the word itself, then its row of counts.
 for word, row in zip(words, matrix):
     print("{: >10}".format(word) + ' ' + ''.join(["{: <10}".format(c) for c in row]))
 import numpy as np
 import matplotlib.pyplot as plt
 from sklearn.feature_extraction.text import CountVectorizer
 import nltk

 # Build a symmetric co-occurrence matrix from the bigrams of a small corpus,
 # printing every intermediate step.

 la = np.linalg

 sentences = [
     "Vandaag hebben we neural networks bestudeerd",
     "Cristina was er ook, en Gijs niet",
     "vandaag was het deep",
     "net zo deep als deep learning",
 ]
 # alternative corpus:
 # sentences = ["I like deep learning.", "I like NLP.", "I enjoy flying."]

 # unique words of the text
 prematrix = set()
 for sentence in sentences:
     print('> sentence: ', sentence)
     words = sentence.split(" ")
     for word in words:
         word = word.lower()
         word = word.strip()
         prematrix.add(word)
 print('\n> prematrix: \n', prematrix)

 # order set & turn into list; the position of a word in this list is its
 # row/column in the matrix
 pre2 = sorted(list(prematrix))
 print('\n> pre2: \n', pre2, '\n')

 # create bigrams
 bigrams = []
 for sentence in sentences:
     for b in nltk.bigrams(sentence.lower().split()):
         print('> bigram:', b)
         bigrams.append(b)
 print('\n> bigrams: \n', bigrams)

 # create co-occurrence matrix
 # create matrix with zeros, having the length of the vocabulary.
 # The builtin `int` is used as dtype: the `np.int` alias was deprecated in
 # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
 X = np.zeros((len(pre2), len(pre2)), dtype=int)
 print('\n> co-occurence matrix (empty): \n', X)

 # Look every word's position up once instead of scanning pre2 with
 # list.index() four times per bigram.
 position = {word: i for i, word in enumerate(pre2)}

 # for each bigram, add one in both directions so the matrix stays symmetric
 for first, second in bigrams:
     row, col = position[first], position[second]
     X[row, col] += 1
     X[col, row] += 1
 print('\n> co-occurence matrix: \n', X)
 import os
 from gensim.models import Word2Vec

 # based on the following tutorial: https://rare-technologies.com/word2vec-tutorial/


 class MySentences(object):
     """Iterable that yields every line of every file in a directory as a list of words.

     Each iteration re-reads the directory from disk, so the object can be
     iterated more than once.
     """

     def __init__(self, dirname):
         # dirname: path of the directory whose entries are read in __iter__
         self.dirname = dirname

     def __iter__(self):
         for fname in os.listdir(self.dirname):
             # `with` closes each file as soon as its last line has been
             # yielded; the bare open() used before never closed the handle.
             with open(os.path.join(self.dirname, fname)) as handle:
                 for line in handle:
                     # more text-processing steps can be added here
                     # for example: "convert to unicode, lowercase, remove numbers, extract named entities..."
                     print(line.split())
                     yield line.split()  # returns a list of words


 sentences = MySentences('./input/test')

 # NOTE(review): gensim 4.0 renamed the `size` keyword to `vector_size`;
 # confirm which gensim version the session uses before changing it.
 model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
 # model = Word2Vec(sentences, size=1, window=1, min_count=1, workers=1)

 print(model)
 import sys
 from nltk import ngrams

 # Read text from stdin and emit its character n-grams on stdout.

 # text = 'this is a test sentence for now'
 text = sys.stdin.read()

 # out = ngrams(text, 3)
 n = 3
 # `text` is a string, so each slice is a run of n consecutive characters
 # (not n words). An input shorter than n characters gives an empty list.
 out = [text[i:i+n] for i in range(len(text)-n+1)]
 # print() with a single parenthesised argument behaves the same on
 # Python 2 and Python 3; the bare `print list(out)` statement was a
 # SyntaxError on Python 3.
 print(list(out))

 # The list is written a second time, as its string representation and
 # without a trailing newline.
 out = str(out)
 sys.stdout.write(out)

 # example for the input 'this is': ['thi', 'his', 'is ', 's i', ' is']
 import sys

 # Pipeline step: stdin text -> tokens -> token n-grams -> stdout.

 # >>> std.in
 # text = 'this is a test sentence for now'
 text = sys.stdin.read()

 # >>> tokenizer
 from nltk.tokenize import TreebankWordTokenizer
 tokens = TreebankWordTokenizer().tokenize(text)

 # >>> ngrams
 # from nltk import ngrams
 # out = ngrams(tokens, 3)
 n = 3
 out = []
 # Slide a window of n tokens over the token list; when there are fewer
 # than n tokens the range is empty and no window is produced.
 for start in range(len(tokens) - n + 1):
     window = tokens[start:start + n]
     out.append(window)
 # for 'this is a test sentence for now' the windows are:
 # [['this', 'is', 'a'], ['is', 'a', 'test'], ['a', 'test', 'sentence'], ['test', 'sentence', 'for'], ['sentence', 'for', 'now']]

 # >>> std.out
 sys.stdout.write(str(out))
 import sys
 from nltk.tokenize import TreebankWordTokenizer

 # Read text from stdin, tokenize it and emit the token list on stdout.

 # text = 'this is a test sentence for now'
 text = sys.stdin.read()

 tokenizer = TreebankWordTokenizer()
 out = tokenizer.tokenize(text)

 # The token list is emitted twice: once without a trailing newline and
 # once through print (with a newline).
 out = str(out)
 sys.stdout.write(out)
 # print() with a single parenthesised argument behaves the same on
 # Python 2 and Python 3; the bare `print out` statement was a SyntaxError
 # on Python 3.
 print(out)
 import sys
 import nltk
 from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
 from nltk import ngrams
 from collections import Counter
 import json

 # Compare the direct neighbours of two target words in a text: for every
 # word that appears next to both targets, report how often it does and in
 # which sentences, as JSON on stdout.

 # >>> std.in
 # text = sys.stdin.read()
 # The input is currently read from a fixed file instead of stdin. `with`
 # makes sure the file is closed again after reading.
 with open('input/frankenstein_gutenberg_plain.txt', 'r') as source:
     text = source.read()

 # >>> tokenizer
 sentences = sent_tokenize(text)
 # NOTE(review): this debug print goes to the same stdout as the JSON
 # written at the end of the script - confirm that this is intended.
 print(sentences)

 tokenizer = TreebankWordTokenizer()
 # data holds one token list per sentence, e.g.
 # [['this','is','a','sentence'], ['this','is','another','sentence']]
 data = [tokenizer.tokenize(sentence) for sentence in sentences]

 # >>> ngrams
 n = 3
 # Named all_ngrams so it does not shadow the `ngrams` function imported
 # from nltk. N-grams never cross a sentence boundary.
 all_ngrams = []
 for sentence_tokens in data:
     for i in range(len(sentence_tokens) - n + 1):
         all_ngrams.append(sentence_tokens[i:i+n])
 # e.g. [['this', 'is', 'a'], ['is', 'a', 'test'], ['a', 'test', 'sentence']]

 # >>> filter
 targets = ['human', 'fellow']
 # ngrams1 / ngrams2: the n-grams whose second token is the first / second
 # target; with n = 3 that is the middle token, so the other two tokens are
 # the target's direct neighbours.
 ngrams1 = []
 ngrams2 = []
 # counter1 / counter2: how often each word occurs in those n-grams,
 # e.g. Counter({'human': 380, 'of': 76, 'the': 50, ...})
 counter1 = Counter()
 counter2 = Counter()

 for ngram in all_ngrams:
     if targets[0] == ngram[1]:
         ngrams1.append(ngram)
         counter1.update(ngram)
     if targets[1] == ngram[1]:
         ngrams2.append(ngram)
         counter2.update(ngram)


 def _sentences_with_neighbour(target, word):
     """Return the sentences in which `word` stands directly next to `target`.

     Every occurrence of `target` in every sentence of `data` is checked.
     A sentence is added once for a match on the left and once for a match
     on the right, so it can appear more than once. Sentences are returned
     as their tokens joined with single spaces.
     """
     found = []
     for sentence in data:
         for index, token in enumerate(sentence):
             if token != target:
                 continue
             # index > 0: without it sentence[-1] would wrap around to the
             # last token when the target opens the sentence.
             if index > 0 and sentence[index - 1] == word:
                 found.append(' '.join(sentence))
             # Bounds check: without it a target at the very end of a
             # sentence raises IndexError.
             if index + 1 < len(sentence) and sentence[index + 1] == word:
                 found.append(' '.join(sentence))
     return found


 # out has the shape
 # {
 #     'of': {
 #         'human':  {'freq': 23,  'sentences': ['sentence 1', 'sentence 2']},
 #         'fellow': {'freq': 111, 'sentences': ['sentence 1', 'sentence 2']}
 #     }
 # }
 out = {}
 for word, value in counter1.items():
     # Skip the target words themselves. The earlier test
     # `word != any((...))` compared a string with a bool and was therefore
     # always true, so the targets were never filtered out.
     if word in targets:
         continue
     # Only keep words that also appeared in the window of the second target.
     if word not in counter2:
         continue

     out[word] = {
         targets[0]: {
             'freq': value,
             'sentences': _sentences_with_neighbour(targets[0], word)
         },
         targets[1]: {
             'freq': counter2[word],
             'sentences': _sentences_with_neighbour(targets[1], word)
         }
     }

 # >>> std.out
 out = json.dumps(out, sort_keys=True, indent=4, separators=(',', ': '))
 sys.stdout.write(out)
 Tensor("Variable/read:0", shape=(5000, 128), dtype=float32, device=/device:CPU:0) \ No newline at end of file
