Commit 2458ba01 authored by manetta

adding an Algologs folder with scripts for the session tomorrow

parent 18a4a94e
import nltk

sentences = [
    "I like deep learning",
    "I like NLP",
    "I enjoy flying"
]

# count each (alphabetically sorted) bigram and collect the vocabulary
counter = dict()
words = list()
for sentence in sentences:
    for ngram in nltk.bigrams(sentence.split()):
        ngram_sorted = tuple(sorted(ngram))
        if ngram_sorted not in counter:
            counter[ngram_sorted] = 0
        counter[ngram_sorted] += 1
        for word in ngram_sorted:
            if word not in words:
                words.append(word)
words.sort()

# build the symmetric co-occurrence matrix
matrix = [[counter[tuple(sorted((word1, word2)))] if tuple(sorted((word1, word2))) in counter else 0 for word2 in words] for word1 in words]
"""
'expanded' version of the one-liner above:
matrix = []
for word1 in words:
    row = []
    for word2 in words:
        key = tuple(sorted([word1, word2]))
        if key in counter:
            row.append(counter[key])
        else:
            row.append(0)
    matrix.append(row)
"""

# print the matrix with row and column labels
print("{: >10}".format('') + ' ' + ''.join(["{: <10}".format(word) for word in words]))
for k, word in enumerate(words):
    print("{: >10}".format(word) + ' ' + ''.join(["{: <10}".format(c) for c in matrix[k]]))
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk

la = np.linalg

sentences = [
    "Vandaag hebben we neural networks bestudeerd",
    "Cristina was er ook, en Gijs niet",
    "vandaag was het deep",
    "net zo deep als deep learning"
]
# sentences = ["I like deep learning.", "I like NLP.", "I enjoy flying."]

# collect the unique words of the text
prematrix = set()
for sentence in sentences:
    print('> sentence: ', sentence)
    words = sentence.split(" ")
    for word in words:
        word = word.lower()
        word = word.strip()
        prematrix.add(word)
print('\n> prematrix: \n', prematrix)

# order the set & turn it into a list
pre2 = sorted(list(prematrix))
print('\n> pre2: \n', pre2, '\n')

# create bigrams
bigrams = []
for sentence in sentences:
    for b in nltk.bigrams(sentence.lower().split()):
        print('> bigram:', b)
        bigrams.append(b)
print('\n> bigrams: \n', bigrams)

# create the co-occurrence matrix:
# start from a matrix of zeros, sized to the length of the vocabulary
X = np.zeros((len(pre2), len(pre2)), dtype=int)
print('\n> co-occurrence matrix (empty): \n', X)

# for each bigram, add one (in both directions, to keep the matrix symmetric)
for b in bigrams:
    X[pre2.index(b[0]), pre2.index(b[1])] = X[pre2.index(b[0]), pre2.index(b[1])] + 1
    X[pre2.index(b[1]), pre2.index(b[0])] = X[pre2.index(b[1]), pre2.index(b[0])] + 1
print('\n> co-occurrence matrix: \n', X)
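# The unused imports above (matplotlib and la = np.linalg) suggest the usual next step
# of this demo: reducing the co-occurrence matrix with SVD and plotting each word at
# its first two coordinates. A possible sketch of that step, assuming X and pre2 from
# above (not part of the committed script):
U, s, Vh = la.svd(X, full_matrices=False)
for i, word in enumerate(pre2):
    plt.text(U[i, 0], U[i, 1], word)
plt.xlim(U[:, 0].min() - 0.5, U[:, 0].max() + 0.5)
plt.ylim(U[:, 1].min() - 0.5, U[:, 1].max() + 0.5)
plt.show()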
import os
from gensim.models import Word2Vec

# based on the following tutorial: https://rare-technologies.com/word2vec-tutorial/

class MySentences(object):
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname, fname)):
                # more text-processing steps can be added here
                # for example: "convert to unicode, lowercase, remove numbers, extract named entities…"
                print(line.split())
                yield line.split()  # returns a list of words

sentences = MySentences('./input/test')
model = Word2Vec(sentences, size=100, window=5, min_count=5, workers=4)
# model = Word2Vec(sentences, size=1, window=1, min_count=1, workers=1)
print(model)
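# Once trained, the model can be queried. A hypothetical example: 'learning' is only a
# placeholder and works only if that word occurs at least min_count times in the input
# folder (gensim >= 1.0 API):
print(model.wv['learning'])                        # the 100-dimensional vector for one word
print(model.wv.most_similar('learning', topn=5))   # its nearest neighbours in the embedding space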
import sys
from nltk import ngrams

# text = 'this is a test sentence for now'
text = sys.stdin.read()

# out = ngrams(text, 3)
n = 3
out = [text[i:i+n] for i in range(len(text)-n+1)]  # slices of the raw string, i.e. character trigrams
print(list(out))
out = str(out)
sys.stdout.write(out)
# on the test sentence the character trigrams start with: ['thi', 'his', 'is ', 's i', ' is', ...]
# the tokenized variant below gives word trigrams instead:
# [('this', 'is', 'a'), ('is', 'a', 'test'), ('a', 'test', 'sentence'), ('test', 'sentence', 'for'), ('sentence', 'for', 'now')]
import sys

# >>> std.in
# text = 'this is a test sentence for now'
text = sys.stdin.read()

# >>> tokenizer
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(text)

# >>> ngrams
# from nltk import ngrams
# out = ngrams(tokens, 3)
n = 3
out = [tokens[i:i+n] for i in range(len(tokens)-n+1)]
# print(list(out))
# slicing the token list gives lists rather than tuples:
# [['this', 'is', 'a'], ['is', 'a', 'test'], ['a', 'test', 'sentence'], ['test', 'sentence', 'for'], ['sentence', 'for', 'now']]

# >>> std.out
out = str(out)
sys.stdout.write(out)
import sys
from nltk.tokenize import TreebankWordTokenizer

# text = 'this is a test sentence for now'
text = sys.stdin.read()
tokenizer = TreebankWordTokenizer()
out = tokenizer.tokenize(text)
out = str(out)
sys.stdout.write(out)
print(out)
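# For reference, TreebankWordTokenizer splits off punctuation and contractions rather
# than only splitting on whitespace. A small illustrative sketch; the sentence is made
# up and the expected output is given roughly, as an assumption:
print(tokenizer.tokenize("I don't like it, really!"))
# expected (roughly): ['I', 'do', "n't", 'like', 'it', ',', 'really', '!']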
import sys
import nltk
from nltk.tokenize import TreebankWordTokenizer, sent_tokenize
from nltk import ngrams
from collections import Counter
import json

# # >>> std.in
# text = sys.stdin.read()
# text = text, errors='ignore'
# print(text)
text = open('input/frankenstein_gutenberg_plain.txt', 'r').read()

# >>> tokenizer
sentences = sent_tokenize(text)
print(sentences)
tokenizer = TreebankWordTokenizer()
data = []
for sentence in sentences:
    tokens = tokenizer.tokenize(sentence)
    data.append(tokens)
    # print(tokens)
# print(data)
# [['this','is','a','sentence'], ['this','is','another','sentence']]

# >>> ngrams
# ngrams = ngrams(tokens, 3)
n = 3
ngrams = []  # note: this list shadows the ngrams function imported from nltk above
for sentence_tokens in data:
    current_ngrams = [sentence_tokens[i:i+n] for i in range(len(sentence_tokens)-n+1)]
    for ngram in current_ngrams:
        ngrams.append(ngram)
# print(list(ngrams))
# [('this', 'is', 'a'), ('is', 'a', 'test'), ('a', 'test', 'sentence'), ('test', 'sentence', 'for'), ('sentence', 'for', 'now')]

# >>> filter
targets = ['human', 'fellow']
ngrams1 = []
ngrams2 = []
counter1 = Counter()
counter2 = Counter()
# Counter({u'black': 380, u'of': 76, u'the': 50, u'militant': 25, u'people': 21, ... })
for ngram in ngrams:
    # keep the trigrams that have a target word in the middle position,
    # and count the words that appear around it
    if targets[0] == ngram[1]:
        ngrams1.append(ngram)
        for word in ngram:
            # print(word)
            counter1[word] += 1
    if targets[1] == ngram[1]:
        ngrams2.append(ngram)
        for word in ngram:
            counter2[word] += 1

# print('\n---------\n')
# for ngram in ngrams1:
#     print(ngram)
# print('\n---------\n')
# for ngram in ngrams2:
#     print(ngram)
# print('\n---------\n')
# print(counter1)
# print('\n---------\n')
# print(counter2)
# print('\n---------\n')

out = {}
for word, value in counter1.items():
    # if the word is not one of the target words
    if word not in (targets[0], targets[1]):
        # print(word, value)
        # if the word also appeared in the window of the second target word
        if word in counter2:
            val1 = value
            val2 = counter2[word]
            # print(word, val1, val2)
            sentences1 = []
            sentences2 = []
            bigrams1 = [' '+word+' '+targets[0]+' ', ' '+targets[0]+' '+word+' ']
            bigrams2 = [' '+word+' '+targets[1]+' ', ' '+targets[1]+' '+word+' ']
            # collect the sentences in which the word appears right next to a target word
            for sentence in data:
                if targets[0] in sentence:
                    index = sentence.index(targets[0])
                    if index > 0 and sentence[index-1] == word:
                        sentences1.append(' '.join(sentence))
                    if index + 1 < len(sentence) and sentence[index+1] == word:
                        sentences1.append(' '.join(sentence))
                if targets[1] in sentence:
                    index = sentence.index(targets[1])
                    if index > 0 and sentence[index-1] == word:
                        sentences2.append(' '.join(sentence))
                    if index + 1 < len(sentence) and sentence[index+1] == word:
                        sentences2.append(' '.join(sentence))
            out[word] = {
                targets[0]: {
                    'freq': val1,
                    'sentences': sentences1
                },
                targets[1]: {
                    'freq': val2,
                    'sentences': sentences2
                }
            }

# out = {
#     'of' : {
#         'human' : {
#             freq : 23,
#             sentences: [
#                 'sentence 1',
#                 'sentence 2'
#             ]
#         },
#         'black' : {
#             freq : 111,
#             sentences: [
#                 'sentence 1',
#                 'sentence 2'
#             ]
#         }
#     }
# }

# >>> std.out
out = json.dumps(out, sort_keys=True, indent=4, separators=(',', ': '))
sys.stdout.write(out)
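# A hypothetical follow-up (not part of the script): rank the context words that the
# two target words share by their combined frequency, written to stderr so it does not
# mix with the JSON on stdout.
shared = [(word, counter1[word], counter2[word])
          for word in counter1
          if word in counter2 and word not in targets]
shared.sort(key=lambda item: item[1] + item[2], reverse=True)
for word, freq1, freq2 in shared[:10]:
    sys.stderr.write('%s\t%s: %d\t%s: %d\n' % (word, targets[0], freq1, targets[1], freq2))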
Tensor("Variable/read:0", shape=(5000, 128), dtype=float32, device=/device:CPU:0)
[]
[[-0.02037157 -0.08585653 -0.11732487 ..., -0.02487704 0.14382525
-0.00787624]
[-0.0488829 0.02987167 0.03885154 ..., 0.03740711 0.08836326
-0.06355019]
[ 0.05732905 0.09197161 -0.06498306 ..., -0.10195243 0.11381164
0.05407699]
...,
[-0.14505312 0.03208925 0.12413757 ..., 0.0992361 0.09659038
0.07573354]
[ 0.0491682 0.1109554 0.09332574 ..., -0.04606175 0.11865787
-0.02952761]
[ 0.01391508 0.14312759 0.03544989 ..., 0.00776124 -0.03358401
-0.13934346]]
#!/usr/bin/python -tt
# -*- coding: utf-8 -*-
# note: this is a Python 2 script (it decodes byte strings and uses print statements)
from nltk.tokenize import word_tokenize
import re
import string
import codecs

filename = 'classification-systems/all.txt'
sentences = []
regex = re.compile('[%s]' % re.escape(string.punctuation))  # see documentation here: http://docs.python.org/2/library/string.html
decoded = ''

with open(filename, 'r') as source:
    lines = source.readlines()
    # if the whole text is on a single line, decode it and split it into sentences
    if len(lines) == 1:
        lines = lines[0].decode('utf8')
        decoded = 'yes'
        lines = lines.split('. ')
    print lines
    for line in lines:
        # print line
        # if "”" in line:
        #     line = line.replace("”","")
        # if "“" in line:
        #     line = line.replace("“","")
        if decoded != 'yes':
            line = line.decode('utf8')
        words = word_tokenize(line)
        print words
        # collect the punctuation-stripped words of this line
        cleaned_words = []
        for word in words:
            new_word = regex.sub(u'', word)
            if not new_word == u'':
                cleaned_words.append(new_word)
        sentence = ' '.join(cleaned_words)
        sentences.append(sentence)

outputfilename = filename.replace('.txt', '_stripped.txt')
with codecs.open(outputfilename, "w", "utf-8") as destination:
    for sentence in sentences:
        destination.write(sentence.strip().capitalize() + " ")
print '*text is stripped*'