Commit bd31fd75 authored by manetta
Merge branch 'master' of gitlab.constantvzw.org:algolit/algolit
parents 6aecb6bb 1f6752d8
### Following this tutorial in Python3:
### https://gist.github.com/rspeer/ef750e7e407e04894cb3b78a82d66aed#file-how-to-make-a-racist-ai-without-really-trying-ipynb
'''
This script applies a trained model to text files.
It splits the text into sentences and predicts a sentiment score for each sentence.
Each score & sentence pair is saved to a file, ordered from the smallest to the largest score.
Clean your text first using adapting_the_reading_glasses.py.
'''
### FOR THE STORYTELLER: uncomment ##time.sleep + print i/line on 45/46
import numpy as np
import pandas as pd
import re
import time
import random
import os, sys
import nltk
import nltk.data
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.externals import joblib
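# Note (added for clarity): newer scikit-learn releases have removed sklearn.externals.joblib;
# if this import fails, installing the standalone 'joblib' package and using 'import joblib'
# instead is the usual replacement.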
### FUNCTIONS
### WRITING
### ------------
def write(sentence):
    words = sentence.split(" ")
    for word in words:
        for char in word:
            sys.stdout.write('%s' % char)
            sys.stdout.flush()
            time.sleep(0.2)
        sys.stdout.write(" ")
        sys.stdout.flush()

def archive(sentence, filename):
    with open(filename, "a") as destination:
        destination.write(sentence)
### LOAD GloVe word embeddings
### ---------------------------
def load_embeddings(filename):
    """
    Load a DataFrame from the generalized text format used by word2vec, GloVe,
    fastText, and ConceptNet Numberbatch. The main point where they differ is
    whether there is an initial line with the dimensions of the matrix.
    """
    labels = []
    rows = []
    with open(filename, 'r') as infile:
        for i, line in enumerate(infile):
            # print(i)
            # print(line)
            items = line.rstrip().split(' ')
            if len(items) == 2:
                # This is a header row giving the shape of the matrix
                continue
            labels.append(items[0])
            values = np.array([float(x) for x in items[1:]], 'f')
            rows.append(values)
    arr = np.vstack(rows)
    return pd.DataFrame(arr, index=labels, dtype='f')
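# Illustrative usage (not part of the original script; values depend on the embedding file):
# embeddings = load_embeddings('data/glove.42B.300d.txt')
# embeddings.shape         # -> (vocabulary size, 300) for the 300-dimensional GloVe vectors
# embeddings.loc['good']   # -> the 300-dimensional vector for the word 'good'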
### Load Lexicon of POSITIVE and NEGATIVE words
### -------------------------------------------
def load_lexicon(filename):
    '''
    Load a file from Bing Liu's sentiment lexicon
    (https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html), containing
    English words in Latin-1 encoding.
    One file contains a list of positive words, and the other contains
    a list of negative words. The files contain comment lines starting
    with ';' and blank lines, which should be skipped.
    '''
    lexicon = []
    with open(filename, encoding='latin-1') as infile:
        for line in infile:
            line = line.rstrip()
            if line and not line.startswith(';'):
                lexicon.append(line)
    return lexicon
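# Illustrative usage (assuming the lexicon files are in data/):
# pos_words = load_lexicon('data/positive-words.txt')
# neg_words = load_lexicon('data/negative-words.txt')
# len(pos_words), len(neg_words)   # -> roughly 2000 positive and 4800 negative entries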
### See the sentiment that this classifier predicts for particular words
### --------------------------------------------------------------------
def vecs_to_sentiment(vecs):
    # predict_log_proba gives the log probability for each class
    predictions = model.predict_log_proba(vecs)
    # To see an overall positive vs. negative classification in one number,
    # we take the log probability of positive sentiment minus the log
    # probability of negative sentiment.
    return predictions[:, 1] - predictions[:, 0]
### Use sentiment function to see some examples of its predictions on the test data
### --------------------------------------------------------------------------------
def words_to_sentiment(words):
    vecs = embeddings.loc[words].dropna()
    log_odds = vecs_to_sentiment(vecs)
    return pd.DataFrame({'sentiment': log_odds}, index=vecs.index)
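# Illustrative usage (hypothetical values; needs the trained 'model' global from below):
# words_to_sentiment(['terrible', 'wonderful'])
# -> a DataFrame indexed by word with one 'sentiment' column:
#    negative log-odds for 'terrible', positive log-odds for 'wonderful'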
### Combine sentiments for word vectors into an overall sentiment score by averaging them
### --------------------------------------------------------------------------------------
TOKEN_RE = re.compile(r"\w.*?\b")
# The regex above finds tokens that start with a word-like character (\w), and keeps
# matching characters (.*?) until the next word break (\b). It's a relatively simple
# expression that manages to extract something very much like words from text.
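# Illustrative example (added for clarity) of what the tokenizer returns:
# TOKEN_RE.findall("Hello, world-wide web!")
# -> ['Hello', 'world', 'wide', 'web']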
def text_to_sentiment(text):
    tokens = [token.casefold() for token in TOKEN_RE.findall(text)]
    sentiments = words_to_sentiment(tokens)
    return sentiments['sentiment'].mean()
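# Illustrative usage (hypothetical scores; they depend on the trained model):
# text_to_sentiment("this example is pretty good")   # -> a positive average score
# text_to_sentiment("this example is awful")         # -> a negative average score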
### Use Pandas to make a table of names, their predominant ethnic background, and the predicted sentiment score
### ------------------------------------------------------------------------------------------------------------
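# Note (added for clarity): NAMES_BY_ETHNICITY is a dict mapping a group label to a list of
# first names; it is defined in the original tutorial but not in this script, so the function
# below only works once that dict is provided.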
def name_sentiment_table():
    frames = []
    for group, name_list in sorted(NAMES_BY_ETHNICITY.items()):
        lower_names = [name.lower() for name in name_list]
        sentiments = words_to_sentiment(lower_names)
        sentiments['group'] = group
        frames.append(sentiments)
    # Put together the data we got from each ethnic group into one big table
    return pd.concat(frames)
### ------------------------------------------
### ACTIONS
### LOAD EMBEDDINGS
#playsound('/path/to/file/you/want/to/play.wav')
embeddings = load_embeddings('data/glove.42B.300d.txt')
#embeddings = load_embeddings('data/glovesample.txt')
#playsound('/path/to/file/you/want/to/play.wav')
filename = 'data/1984_all_stripped.txt'
#filename = 'data/frankenstein_for_machines.txt'
pos_output = filename.replace('.txt','_pos.txt')
neg_output = filename.replace('.txt','_neg.txt')
### Welcome & choice
### ----------------
# rows = embeddings.shape[0]
# columns = embeddings.shape[1]
# pos_words = load_lexicon('data/positive-words.txt')
# neg_words = load_lexicon('data/negative-words.txt')
# ### CLEAN UP positive and negative words
# ### ------------------------------------
# #the data points here are the embeddings of these positive and negative words.
# #We use the Pandas .loc[] operation to look up the embeddings of all the words.
# pos_vectors = embeddings.loc[pos_words]
# neg_vectors = embeddings.loc[neg_words]
# #Some of these words are not in the GloVe vocabulary, particularly the misspellings such as "fancinating".
# #Those words end up with rows full of NaN to indicate their missing embeddings, so we use .dropna() to remove them.
# pos_vectors = embeddings.loc[pos_words].dropna()
# neg_vectors = embeddings.loc[neg_words].dropna()
# print("\t\tTidied up, you see that each word is represented by exactly 300 points in the vector landscape: \n", pos_vectors[:5], "\n")
# #time.sleep(10)
# len_pos = len(pos_vectors)
# len_neg = len(neg_vectors)
# '''
# Now we make arrays of the desired inputs and outputs.
# The inputs are the embeddings, and the outputs are 1 for positive words and -1 for negative words.
# We also make sure to keep track of the words they're labeled with, so we can interpret the results.
# '''
# vectors = pd.concat([pos_vectors, neg_vectors])
# targets = np.array([1 for entry in pos_vectors.index] + [-1 for entry in neg_vectors.index])
# labels = list(pos_vectors.index) + list(neg_vectors.index)
# ### TRAINING & TESTING
# ### ___________________
# '''
# Using the scikit-learn train_test_split function, we simultaneously separate the input vectors,
# output values, and labels into training and test data, with 10% of the data used for testing.
# '''
# train_vectors, test_vectors, train_targets, test_targets, train_labels, test_labels = \
# train_test_split(vectors, targets, labels, test_size=0.1, random_state=0)
# '''
# Now we make our classifier, and train it by running the training vectors through it for 100 iterations.
# We use a logistic function as the loss, so that the resulting classifier can output the probability
# that a word is positive or negative.
# '''
# model = SGDClassifier(loss='log', random_state=0, n_iter=100)
# model.fit(train_vectors, train_targets)
# '''
# ### EVALUATION - Finetuning the scoring
# ### ____________________________________
# We evaluate the classifier on the test vectors.
# It predicts the correct sentiment for sentiment words outside of its training data 95% of the time.
# Precision: (also called positive predictive value) is the fraction of relevant instances among the retrieved instances:
# -> When it predicts yes, how often is it correct?
# Recall: (also known as sensitivity) is the fraction of relevant instances that have been retrieved over
# the total amount of relevant instances: how many instances did the classifier classify correctly?
# Confusion Matrix:  True Positives  | False Negatives
#                    False Positives | True Negatives
# '''
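# As a reminder (standard definitions, added for clarity):
# precision = TP / (TP + FP)
# recall    = TP / (TP + FN)
# accuracy  = (TP + TN) / (TP + TN + FP + FN)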
# confusion_matrix = (confusion_matrix(model.predict(test_vectors), test_targets))
# #print("confusion matrix", confusion_matrix)
# cm = np.split(confusion_matrix, 2, 1)
# print("\t\tLet's ", blue("evaluate our findings!\n"))
# #time.sleep(4)
# print("\t\tFor each of 10% of test words, we predict their overall sentiment.\n")
# #time.sleep(4)
# print("\t\tWe compare our results to the given labels.\n")
# #time.sleep(4)
# print("\t\tFor this test we scored the following:\n")
# #time.sleep(4)
# TP = cm[0][0]
# FP = cm[0][1]
# TN = cm[1][1]
# FN = cm[1][0]
# print("\t\tWe matched ", green(str(TP))," items correctly as positive words.\n")
# #time.sleep(4)
# print("\t\tThese are also called ", red("True Positives."))
# #time.sleep(4)
# print("\n")
# print("\t\tWe mismatched ", green(str(FP))," items, we labeled them incorrectly as positive words.\n")
# #time.sleep(4)
# print("\t\tThese are also called ", red("False Positives."))
# #time.sleep(4)
# print("\n")
# print("\t\tWe matched ", green(str(TN))," items, we labeled them correctly as negative words.\n")
# #time.sleep(4)
# print("\t\tThese are also called ", red("True Negatives."))
# #time.sleep(4)
# print("\n")
# print("\t\tWe mismatched ", green(str(FN))," items, we labeled them incorrectly as negative words.\n")
# #time.sleep(4)
# print("\t\tThese are also called ", red("False Negatives."))
# #time.sleep(4)
# print("\n")
# ### QUESTION: how to map the weights/feature numbers back to the original words?
# ### QUESTION: how to show examples of TP/FP/TN/FN?
# #print("Weights assigned to features: ", model.coef_)
# accuracy_score = (accuracy_score(model.predict(test_vectors), test_targets))
# print("\t\tOur accuracy score is ", accuracy_score)
# '''
# ### Predicted sentiment for Particular Word
# ### ________________________________________
# Let's use the function vecs_to_sentiment(vecs) and words_to_sentiment(words) above to see the sentiment that this classifier predicts for particular words,
# to see some examples of its predictions on the test data.
# '''
# # Show 20 examples from the test set
# samples = words_to_sentiment(test_labels).ix[:20]
# print("\t\tHere are a few samples to get an idea: \n", samples)
# #time.sleep(4)
# print("\n")
# '''
# There are many ways to combine sentiments for word vectors into an overall sentiment score.
# Again, because we're following the path of least resistance, we're just going to average them.
# '''
### SAVING THE MODEL
### -----------------
# # using joblib
# joblib.dump(model, 'sentiment_thermometer_glove.pkl')
# print("\t\tAND NOW, WE TAKE A LITTLE BREAK & DANCE!")
### SCORE SENTENCES
### ---------------
# load model using joblib
loaded_model = joblib.load('sentiment_thermometer_glove.pkl')
# get sentence
finding_sentences = nltk.data.load('tokenizers/punkt/english.pickle')
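# Illustrative example (added for clarity) of the punkt sentence tokenizer:
# finding_sentences.tokenize("It was a bright cold day in April. The clocks were striking thirteen.")
# -> ['It was a bright cold day in April.', 'The clocks were striking thirteen.']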
sentences_list = []
all_sents = []
with open(filename, "r") as source:
    for line in source:
        if line:
            print("line", line)
            # this returns a list with 1 element containing the entire text, sentences separated by \n
            sentences = '\n'.join(finding_sentences.tokenize(line.strip()))
            print("sentences", sentences)
            # transform string into list of sentences
            sentences_list = sentences.split("\n")
            print('sentences_list', sentences_list)
            all_sents.append(sentences_list)
pos_scores = set()
neg_scores = set()
## CHECK: WHY DOES IT PRINT DTYPE = FLOAT
for sentences in all_sents:
    if sentences:
        for sentence in sentences:
            # tokenize sentence
            if sentence:
                TOKEN_RE = re.compile(r"\w.*?\b")
                tokens = [token.casefold() for token in TOKEN_RE.findall(sentence)]
                tokens_set = set(tokens)
                # look for words of given sentence in word embeddings
                vecs = embeddings.loc[tokens].dropna()
                # find logarithmic scores
                predictions = loaded_model.predict_log_proba(vecs)
                # print("tot1", predictions[:, 1])
                # print("tot0", predictions[:, 0])
                log_odds = predictions[:, 1] - predictions[:, 0]
                # print("log_odds", log_odds)
                # print("\t\tThe sentiment scores for each of the words are", pd.DataFrame({':': log_odds}, index=vecs.index))
                score = pd.DataFrame({':': log_odds}, index=vecs.index).mean()*10
                scorestr = str(score)
                clean_score = re.sub('[^0-9\.\-]', '', scorestr)
                scores = [clean_score, sentence]
                if int(pd.DataFrame({'': log_odds}).mean()) > 0.00:
                    pos_scores.add(tuple(scores))
                else:
                    neg_scores.add(tuple(scores))
                print(sentence, "\t", clean_score)
                print("\n")
pos_scores = sorted(pos_scores)
print("positive scores", pos_scores)
neg_scores = sorted(neg_scores)
print("neg scores", neg_scores)
for s in pos_scores:
    archive(s[0], pos_output)
    archive("\t", pos_output)
    archive(s[1], pos_output)
    archive("\n", pos_output)
for s in neg_scores:
    archive(s[0], neg_output)
    archive("\t", neg_output)
    archive(s[1], neg_output)
    archive("\n", neg_output)