Commit 697a4004 authored by ana

algoliterary exercise for linear regression halfway, 14 December 2018

parent 1ef3cb3a
interesting,9
noble,8
noble,8
unfashioned,3
human,5
helpless,2
pictured,5
guilty,1
yellow,5
alive,8
little,4
unsocial,3
happy,9
capable,9
pretended,3
happy,9
capable,9
human,5
unfit,2
bound,2
mild,6
most,6
fallen,2
favourable,6
desert,4
eternal,7
speedy,5
happy,9
young,5
fair,9
excellent,10
lovely,10
lovely,10
high,5
omnipotent,8
perfect,10
lovely,10
amiable,8
deserted,2
excellent,10
human,5
generous,9
little,4
beautiful,9
whole,8
another,5
fine,6
inaccessible,3
long,5
whole,6
tremulous,2
greater,8
future,5
insuperable,3
half finished,3
shuddering,2
human,5
angelic,7
every,5
purest,8
future,5
every,5
impenetrable,5
glorious,10
useful,8
extensive,7
rational,5
first,6
same,5
This source diff could not be displayed because it is too large. You can view the blob instead.
# This script is a first attempt to generate a story that can be used as input to train a linear regression model.
# The idea is to rewrite the story so that the R² measure gets closer and closer to 1.
# The hypothesis we start from is the following:
# the closer an adjective is to the word 'monster' or one of its synonyms, the more negative it becomes.
import nltk
import nltk.data
from nltk.tokenize import sent_tokenize, word_tokenize
import csv
from operator import itemgetter
# find adjectives for each sentence and choose adjective that is closest in position to monster word
def selecting_adjective(words, position_word):
    """Return the adjective closest to the monster word, and its position.

    The sentence is part-of-speech tagged with ``nltk.pos_tag`` and every
    token tagged ``'JJ'`` is considered a candidate. The candidate whose
    token index is nearest to ``position_word`` wins; on a tie the
    adjective that occurs first in the sentence is kept.

    Args:
        words: the tokens of one sentence, in order.
        position_word: index in ``words`` of the monster word.

    Returns:
        A tuple ``(adjective, position)`` where ``position`` is the index of
        the adjective in ``words``. When the sentence contains no token
        tagged ``'JJ'`` the result is ``('', 0)``.
    """
    adjetivo = ''
    position = 0
    min_distance = float('inf')
    # find part of speech tags for each word of the sentence;
    # enumerate gives the real index of each token, so an adjective that
    # occurs more than once is measured from its own position rather than
    # from the position of its first occurrence (which list.index returns)
    for index, (token, tag) in enumerate(nltk.pos_tag(words)):
        # only the plain adjective tag is matched (not 'JJR' / 'JJS')
        if tag != 'JJ':
            continue
        # calculate distance between adjective and monster word
        distance = abs(index - position_word)
        # keep the minimum distance; strict '<' keeps the earliest on ties
        if distance < min_distance:
            min_distance = distance
            position = index
            adjetivo = token
    # return adjective & position
    return adjetivo, position
# open novel frankenstein, cleaned up for this type of machinic exercises
with open("frankenstein_for_machines.txt", 'r') as novel:
    # read the text
    text = novel.read()
# split the text in sentences
sentences = sent_tokenize(text)

# open a dictionary of adjectives with their sentiment scores (from 0 to 10)
# --- this list needs revision!!!!
# newline='' is what the csv module asks for when reading from a file object
with open('creature_data_frankie.csv', 'r', newline='') as score_file:
    data = list(csv.reader(score_file))
# adjective -> score lookup; when an adjective is listed more than once the
# last row wins, and rows with fewer than two columns are ignored
scores = {row[0]: row[1] for row in data if len(row) >= 2}

# find sentences with monster words and for each of those sentences,
# find monster word, position monsterword, closest adj, position adj, sentiment score adj
selected_sentences = []
# list of synonyms monster
monsterly = ['monster', 'beast', 'creature', 'freak', 'giant', 'monstrosity', 'miscreation', 'demon']
# loop over sentences of the novel
for sentence in sentences:
    # split sentence in words
    words = word_tokenize(sentence)
    # check each word against list of monsterly words; enumerate captures the
    # position of this very occurrence, also when the word appears twice
    for position_w, w in enumerate(words):
        if w not in monsterly:
            continue
        # find closest adjective and its position
        adj, adj_pos = selecting_adjective(words, position_w)
        # find sentiment score for adjective in external dictionary;
        # without a score (no adjective found, or adjective not listed)
        # there is nothing to sort on, so the occurrence is left out instead
        # of reusing the score of a previous sentence
        if adj not in scores:
            continue
        # add everything as sublist to list
        monsterly_data = sentence, w, position_w, adj, adj_pos, int(scores[adj])
        selected_sentences.append(monsterly_data)

# sort sublists by sentiment score of the adjective
selected_data = sorted(selected_sentences, key=itemgetter(5))

# read the text: every sentence is printed once, at the place of its
# lowest-scoring entry, even when it holds several monster words
printed = set()
for d in selected_data:
    if d[0] in printed:
        continue
    printed.add(d[0])
    print(d[0])
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment