Commit 5fbe8c5b authored by manetta

adding the i-could-have-written-that writing-systems to the git

parent bf4baf37
Writing Machines
================
This is a writing system that works with the [cosine similarity](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html) function of scikit-learn.
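A minimal sketch of the underlying idea (not the tool itself; it assumes scikit-learn is installed and uses made-up example sentences): texts are turned into TF-IDF vectors, and cosine similarity compares those vectors.

    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity

    corpus = ["the cat sat on the mat",
              "a dog sat on the rug",
              "completely unrelated words here"]
    vectors = TfidfVectorizer().fit_transform(corpus)
    # similarities between the first sentence and every sentence in the corpus
    print(cosine_similarity(vectors[0], vectors))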
Requirements
============
The requirements for this tool are listed in the requirements.txt file.
To install these, run:
> pip install -r requirements.txt
Output folder
=============
Exported html & pdf documents are written to the 'output' folder. Before using the tool, create the following folders:
> writing-systems/cosine-similarity/output/html/regular/
> writing-systems/cosine-similarity/output/html/booklet/
> writing-systems/cosine-similarity/output/pdf/regular/
> writing-systems/cosine-similarity/output/pdf/booklet/
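For example, assuming a bash-compatible shell and starting from the repository root:
> mkdir -p writing-systems/cosine-similarity/output/{html,pdf}/{regular,booklet}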
Use
===
To run the tool locally, use Python's built-in CGI server module (CGIHTTPServer, Python 2).
Navigate to this folder in the terminal and start the server from there.
> cd ~/path/i-could-have-written-that/writing-machines/cosine-similarity/
> python -m CGIHTTPServer
This runs a CGI server on localhost:8000.
Open in a browser:
> localhost:8000/cgi-bin/cosine-similarity-morphs.cgi
---
Note: It is possible to run multiple python servers, but not on the same port.
For example, to run a server on port 9000:
> python -m CGIHTTPServer 9000
Modules
=======
Besides the .cgi interface, there are some standalone scripts in the modules folder.
#!/usr/bin/env python
from __future__ import print_function
import os, sys
import cgi
import cgitb; cgitb.enable()
from jinja2 import FileSystemLoader, Environment
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, TreebankWordTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# *******************************************************
# settings
# *******************************************************
path = "./"
env = Environment(loader=FileSystemLoader(path))
template = env.get_template("cosine-similarity-morphs.html")
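# Jinja2 looks up templates relative to the current working directory, i.e. the
# folder the CGI server is started from (see the README)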
# *******************************************************
# read form fields into variables
# *******************************************************
method = os.environ.get("REQUEST_METHOD")
form = cgi.FieldStorage()
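# REQUEST_METHOD comes from the CGI environment (typically GET on a first visit,
# POST when the form is submitted); FieldStorage parses the submitted form fields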
state = form.getvalue("state", "").decode('utf-8')
input_text = form.getvalue("input_text", "").decode('utf-8')
feature1 = form.getvalue("feature1", "word")
feature2 = form.getvalue("feature2", "bigram")
feature3 = form.getvalue("feature3", "chargram3")
feature4 = form.getvalue("feature4", "pos")
features = [feature1, feature2, feature3, feature4]
countingmethod = form.getvalue("countingmethod", "")
lowercase = form.getvalue("lowercase", "")
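# form values arrive as strings, so 'lowercase' is compared against the string
# 'True' further down rather than against a boolean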
# matrix = form.getvalue("matrix", "")
input_sentence = form.getvalue("input_sentence", "").decode('utf-8')
source = form.getvalue("source", "").decode('utf-8')
filename = form.getvalue("filename", "").decode('utf-8')
title = form.getvalue("title", "").decode('utf-8')
author = form.getlist('author')
if len(author) > 1:
    author = ', '.join(author)
# *******************************************************
# change variables (when form is submitted)
# *******************************************************
# **************************
# delete the 'none' features
# **************************
for i in range(4):
    if 'none' in features:
        features.remove('none')
# print('\nfeatures:', features, file=sys.stderr)
# **********************
# CORPUS
# **********************
corpus = input_text.replace('\n',' ')
corpus = sent_tokenize(corpus)
corpus.append(input_sentence)
print('\n*corpus:', corpus, file=sys.stderr)
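# the corpus is the input text split into sentences, with the selected input
# sentence appended as the last element, so that matrix[-1] further down always
# refers to the input sentence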
# **********************
# TF-IDF VECTORIZER
# **********************
sentences_in = []
sentences_in_vector_html = []
sentences_out = []
cossim_values = []
sentences_out_vector_index = []
sentences_out_vector_html = []
sentences_out_features_html = []
def makeTwoDigitFloat(num):
    return '{0:.2f}'.format(float(num)) # make a 0.00 float
env.globals.update(makeTwoDigitFloat=makeTwoDigitFloat)
def getFeatureName(index):
    if matrix:
        return tfidf_vectorizer.get_feature_names()[index]
env.globals.update(getFeatureName=getFeatureName)
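# registering these helpers as Jinja2 globals makes them callable from inside
# the template; getFeatureName relies on the globals 'matrix' and
# 'tfidf_vectorizer' defined further down, which works as long as the template
# is only rendered after they have been set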
if input_text:
    for feature_type in features:
        if feature_type == 'none':
            continue
        else:
            # function that the TFIDF vectorizer uses
            def createfeatures(corpus):
                words = word_tokenize(corpus)
                for w in words:
                    if lowercase == 'True':
                        w = w.lower()
                    if 'word' in feature_type:
                        yield w
                    if 'chargram2' in feature_type:
                        for i in range(len(w) - 2):
                            yield w[i:i+2]
                    if 'chargram3' in feature_type:
                        for i in range(len(w) - 3):
                            yield w[i:i+3]
                    if 'chargram4' in feature_type:
                        for i in range(len(w) - 4):
                            yield w[i:i+4]
                    if 'chargram5' in feature_type:
                        for i in range(len(w) - 5):
                            yield w[i:i+5]
                if 'bigram' in feature_type:
                    bigrams = ngrams(words, 2)
                    for bigram in bigrams:
                        bigram = '_'.join(bigram)
                        yield bigram
                if 'trigram' in feature_type:
                    trigrams = ngrams(words, 3)
                    for trigram in trigrams:
                        trigram = '_'.join(trigram)
                        yield trigram
                if 'pos_bigr' in feature_type:
                    bigrams = ngrams(words, 2)
                    for bigram in bigrams:
                        pos = nltk.pos_tag(bigram)
                        bigram_pos = '_'.join(tag for w, tag in pos)
                        yield bigram_pos
                if 'pos_trigr' in feature_type:
                    trigrams = ngrams(words, 3)
                    for trigram in trigrams:
                        pos = nltk.pos_tag(trigram)
                        trigram_pos = '_'.join(tag for w, tag in pos)
                        yield trigram_pos
                if 'pos' in feature_type:
                    pos = nltk.pos_tag(words)
                    for tag in pos:
                        yield 'POS_'+tag[1]
            if feature_type:
                print('\n>feature_type:', feature_type, file=sys.stderr)
            # for reference:
            # TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict', dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content', lowercase=True, max_df=1.0, max_features=None, min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True, stop_words=None, strip_accents=None, sublinear_tf=False, token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True, vocabulary=None)
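            # createfeatures is passed to the TfidfVectorizer below as its
            # 'analyzer', so it is called once per sentence of the corpus to
            # produce that sentence's features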
            if corpus:
                if lowercase == 'True':
                    tfidf_vectorizer = TfidfVectorizer(analyzer=createfeatures, stop_words=None, min_df=1, max_df=1.0, lowercase=True, norm=None, tokenizer=TreebankWordTokenizer().tokenize)
                    print('>lowercase: True', file=sys.stderr)
                else:
                    tfidf_vectorizer = TfidfVectorizer(analyzer=createfeatures, stop_words=None, min_df=1, max_df=1.0, lowercase=False, norm=None, tokenizer=TreebankWordTokenizer().tokenize)
                    print('>lowercase: False', file=sys.stderr)
                tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)
                tfidf_matrix_array = tfidf_matrix.toarray()
                matrix = []
                for vector in tfidf_matrix_array:
                    v = []
                    for val in vector:
                        v.append(val)
                    matrix.append(v)
                print('\n*matrix created*', file=sys.stderr)
                # **********************
                # COSINE SIMILARITY
                # **********************
                from sklearn.metrics.pairwise import cosine_similarity
                state = 'corpus-created'
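                # the last row of the matrix corresponds to the input sentence
                # appended to the corpus above; the block below compares that row
                # against every corpus row and picks the most similar other sentence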
                # if there is an input sentence selected
                if input_sentence:
                    sentence_in_vector = matrix[-1]
                    sentence_in = corpus[-1]
                    print('\n>input sentence found:', sentence_in, file=sys.stderr)
                    print('>input vector found:', sentence_in_vector, file=sys.stderr)
                    # calculate the cosine similarities
                    # between the input vector and all the other vectors
                    sentence_in_vector_similarities = cosine_similarity(sentence_in_vector, tfidf_matrix)
                    if max(sentence_in_vector_similarities[0]) != 0:
                        # transform numpy array to python list
                        tmp_similarities = []
                        for num in sentence_in_vector_similarities[0]:
                            tmp_similarities.append(num)
                        # the overall maximum is the input sentence compared with itself,
                        # so take the highest value apart from that maximum: the most
                        # similar *other* sentence in the corpus
                        highest_cossim_value = max(n for n in tmp_similarities if n!=max(tmp_similarities))
                        highest_cossim_value_utf = highest_cossim_value.astype(unicode) # numpy float -> unicode string for the template
                        sentence_out_vector_index = tmp_similarities.index(highest_cossim_value)
                        sentence_out_vector = matrix[sentence_out_vector_index]
                        sentence_out = corpus[sentence_out_vector_index]
                        print('>most similar index found:', sentence_out_vector_index +1, file=sys.stderr)
                        print('>most similar sentence found:', sentence_out, file=sys.stderr)
                        print('\n*cosine similarity calculated*', file=sys.stderr)
                        # ************************
                        # similarity visualisation
                        # ************************
                        # create feature lists
                        # ********************
                        sentence_in_vector_html = []
                        sentence_in_vector_values = []
                        sentence_in_features = []
                        sentence_out_vector_html = []
                        sentence_out_vector_values = []
                        sentence_out_features = []
                        for index, x in enumerate(sentence_in_vector):
                            # do not include features with a value of '0'
                            if x != 0:
                                value = '{0:.2f}'.format(float(x))
                                feature = tfidf_vectorizer.get_feature_names()[index]
                                sentence_in_vector_values.append(value)
                                sentence_in_features.append(feature)
                                sentence_in_vector_html.append('<div>'+value+' &nbsp; <span class="feature">'+feature+'</span></div>')
                        for index, y in enumerate(sentence_out_vector):
                            # do not include features with a value of '0'
                            if y != 0:
                                value = '{0:.2f}'.format(float(y))
                                feature = tfidf_vectorizer.get_feature_names()[index]
                                sentence_out_vector_values.append(value)
                                sentence_out_features.append(feature)
                                sentence_out_vector_html.append('<div>'+value+' &nbsp; <span class="feature">'+feature+'</span></div>')
                        # create the overlapping features list
                        # ************************************
                        sentence_out_features_html = []
                        sentence_out_features_item = []
                        for i, feature in enumerate(sentence_in_features):
                            if feature in sentence_out_features:
                                index = sentence_out_features.index(feature)
                                val_in = sentence_in_vector_values[i]
                                val_out = sentence_out_vector_values[index]
                                sentence_out_features_item.append([feature, val_in, val_out])
                                # exception for POS features:
                                # words are added to "overlapping features" list
                                if 'pos' == feature_type:
                                    sentence_in_words = [word for word in word_tokenize(sentence_in)]
                                    sentence_out_words = [word for word in word_tokenize(sentence_out)]
                                    sentence_in_pos = nltk.pos_tag(sentence_in_words)
                                    sentence_out_pos = nltk.pos_tag(sentence_out_words)
                                    feature = feature.replace("POS_","")
                                    pos_list = []
                                    for list in [sentence_in_pos, sentence_out_pos]:
                                        for pos_item in list:
                                            word = pos_item[0]
                                            pos = pos_item[1]
                                            if pos == feature:
                                                html = '<span class="pos-feature-word">'+word+'</span>'
                                                pos_list.append(html)
                                    # replace POS tags with names, following the Penn Treebank (https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
                                    pos_names = {
                                        "CC": "COORDINATING CONJUNCTION",
                                        "CD": "CARDINAL NUMBER",
                                        "DT": "DETERMINER",
                                        "EX": "EXISTENTIAL THERE",
                                        "FW": "FOREIGN WORD",
                                        "IN": "PREPOSITION OR SUBORDINATING CONJUNCTION",
                                        "JJ": "ADJECTIVE",
                                        "JJR": "ADJECTIVE, COMPARATIVE",
                                        "JJS": "ADJECTIVE, SUPERLATIVE",
                                        "LS": "LIST ITEM MARKER",
                                        "MD": "MODAL",
                                        "NN": "NOUN, SINGULAR OR MASS",
                                        "NNS": "NOUN, PLURAL",
                                        "NNP": "PROPER NOUN, SINGULAR",
                                        "NNPS": "PROPER NOUN, PLURAL",
                                        "PDT": "PREDETERMINER",
                                        "POS": "POSSESSIVE ENDING",
                                        "PRP": "PERSONAL PRONOUN",
                                        "PRP$": "POSSESSIVE PRONOUN",
                                        "RB": "ADVERB",
                                        "RBR": "ADVERB, COMPARATIVE",
                                        "RBS": "ADVERB, SUPERLATIVE",
                                        "RP": "PARTICLE",
                                        "SYM": "SYMBOL",
                                        "TO": "TO",
                                        "UH": "INTERJECTION",
                                        "VB": "VERB, BASE FORM",
                                        "VBD": "VERB, PAST TENSE",
                                        "VBG": "VERB, GERUND OR PRESENT PARTICIPLE",
                                        "VBN": "VERB, PAST PARTICIPLE",
                                        "VBP": "VERB, NON-3RD PERSON SINGULAR PRESENT",
                                        "VBZ": "VERB, 3RD PERSON SINGULAR PRESENT",
                                        "WDT": "WH-DETERMINER",
                                        "WP": "WH-PRONOUN",
                                        "WP$": "POSSESSIVE WH-PRONOUN",
                                        "WRB": "WH-ADVERB",
                                        ".": "PUNCTUATION", # my addition to this list
                                    }
                                    feature = pos_names.get(feature, feature)
                                    feature = feature+': '+' '.join(pos_list)
                                    sentence_out_features_html.append('<span><span class="feature">'+feature+'</span> ('+str(val_in)+', '+str(val_out)+'); </span>')
                                else:
                                    sentence_out_features_html.append('<span><span class="feature">'+feature+'</span> ('+str(val_in)+', '+str(val_out)+'); </span>')
                        # create similarity visualization with outlines
                        # *********************************************
                        # wrapping the features with a <span> in the sentences
                        # which uses the feature TFIDF value as thickness of the outline offset
                        print('\nINPUT >>> sentence_in:', sentence_in, file=sys.stderr)
                        print('INPUT >>> sentence_out:', sentence_out, file=sys.stderr)
                        if 'pos' == feature_type:
                            for x, sentence in enumerate([sentence_in, sentence_out]):
                                words = []
                                tmp = []
                                w = [word for word in word_tokenize(sentence)]
                                pos_list = nltk.pos_tag(w)
                                for pos_item in pos_list:
                                    done = False
                                    word = pos_item[0]
                                    pos = pos_item[1]
                                    for i, item in enumerate(sentence_out_features_item):
                                        feature = item[0].replace("POS_","")
                                        if pos == feature:
                                            outline_intensity = 0.3 # to tweak the outline
                                            value_in = float(item[1]) * outline_intensity
                                            value_out = float(item[2]) * outline_intensity
                                            html_word = '<span class="highlight" style="text-shadow: -'+str(value_in)+'px -'+str(value_in)+'px 0 #000, '+str(value_in)+'px -'+str(value_in)+'px 0 #000, -'+str(value_in)+'px '+str(value_in)+'px 0 #000, '+str(value_in)+'px '+str(value_in)+'px 0 #000;">'+word+'</span>'
                                            words.append(html_word)
                                            done = True
                                        else:
                                            if i == len(sentence_out_features_item)-1:
                                                if done == False:
                                                    words.append(word)
                                if x == 0:
                                    sentence_in = ' '.join(words)
                                if x == 1:
                                    sentence_out = ' '.join(words)
                        else:
                            for item in sentence_out_features_item:
                                outline_intensity = 0.25 # to tweak the outline offset
                                value_in = float(item[1]) * outline_intensity # value of feature in input sentence
                                value_out = float(item[2]) * outline_intensity # value of feature in output sentence
                                space_1 = False # space character on the left
                                space_2 = False # space character on the right
                                if 'chargram' in feature_type:
                                    feature = item[0].replace("_","")
                                if feature_type == 'bigram' or feature_type == 'trigram':
                                    feature = item[0].replace("_"," ")
                                    if " '" in feature:
                                        feature = feature.replace(" '","'")
                                    if "," in feature:
                                        feature = feature.replace(" ,",",")
                                    if feature[-1] == '.' or feature[-1] == '!' or feature[-1] == '?':
                                        feature = feature.replace(" ","")
                                if 'word' == feature_type:
                                    feature = item[0]
                                    words_in = [w for w in word_tokenize(sentence_in)]
                                    words_out = [w for w in word_tokenize(sentence_out)]
                                    if feature == words_in[0]:
                                        feature = feature+' '
                                    elif feature == words_in[-2]:
                                        feature = ' '+feature
                                        space_1 = True
                                    elif feature == words_in[-1]:
                                        feature = feature
                                    elif feature+'<spa' in sentence_in:
                                        feature = ' '+feature
                                        space_1 = True
                                    elif 'span>'+feature in sentence_in:
                                        feature = feature+' '
                                        space_2 = True
                                    elif feature == words_out[0]:
                                        feature = feature+' '
                                        space_2 = True
                                    elif feature == words_out[-2]:
                                        feature = ' '+feature
                                        space_1 = True
                                    else:
                                        feature = ' '+feature+' '
                                        space_1 = True
                                        space_2 = True
                                def getHtml(feature, value):
                                    return '<span class="highlight" style="text-shadow: -'+str(value)+'px -'+str(value)+'px 0 #000, '+str(value)+'px -'+str(value)+'px 0 #000, -'+str(value)+'px '+str(value)+'px 0 #000, '+str(value)+'px '+str(value)+'px 0 #000;">'+feature+'</span>'
                                # include or not include spaces in searching for features in sentences
                                if space_1 == True:
                                    if space_2 == True:
                                        html = getHtml(feature, value_in)
                                        sentence_in = sentence_in.replace(feature,' '+html+' ')
                                        html = getHtml(feature, value_out)
                                        sentence_out = sentence_out.replace(feature,' '+html+' ')
                                    if space_2 == False:
                                        html = getHtml(feature, value_in)
                                        sentence_in = sentence_in.replace(feature,' '+html)
                                        html = getHtml(feature, value_out)
                                        sentence_out = sentence_out.replace(feature,' '+html)
                                if space_2 == True:
                                    if space_1 == False:
                                        html = getHtml(feature, value_in)
                                        sentence_in = sentence_in.replace(feature, html+' ')
                                        html = getHtml(feature, value_out)
                                        sentence_out = sentence_out.replace(feature, html+' ')
                                else:
                                    html = getHtml(feature, value_in)
                                    sentence_in = sentence_in.replace(feature, html)
                                    html = getHtml(feature, value_out)
                                    sentence_out = sentence_out.replace(feature, html)
                        sentences_in.append(sentence_in)
                        sentences_in_vector_html.append(sentence_in_vector_html)
                        sentences_out.append(sentence_out)
                        cossim_values.append(highest_cossim_value_utf)
                        sentences_out_vector_index.append(sentence_out_vector_index + 1)
                        sentences_out_vector_html.append(sentence_out_vector_html)
                        sentences_out_features_html.append(sentence_out_features_html)
                        if state != 'export':
                            state = 'retracing-similarity'
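# each feature type that yields a non-zero similarity adds one entry to the
# accumulator lists above (sentences_in, sentences_out, cossim_values, ...);
# the template presumably loops over these to show one comparison per feature type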
# print('cossim_values',cossim_values, file=sys.stderr)
# print('sentences_out_vector_index',sentences_out_vector_index, file=sys.stderr)
# print('sentences_out_vector_html',sentences_out_vector_html, file=sys.stderr)
# print('sentences_out_features_html',sentences_out_features_html, file=sys.stderr)
# *******************************************************
# set jinja variables
# *******************************************************
tvars = {}
tvars['method'] = method
tvars['state'] = state
tvars['input_text'] = input_text
tvars['feature1'] = feature1
tvars['feature2'] = feature2
tvars['feature3'] = feature3
tvars['feature4'] = feature4
tvars['features'] = features
tvars['lowercase'] = lowercase
tvars['source'] = source
if corpus:
    tvars['corpus'] = corpus
# tvars['matrices'] = matrices
tvars['input_sentence'] = input_sentence
if input_sentence != '':
    # input
    tvars['sentence_in'] = sentence_in
    tvars['sentences_in'] = sentences_in
    tvars['sentences_in_vector_html'] = sentences_in_vector_html
    # outputs
    tvars['sentences_out'] = sentences_out
    tvars['cossim_values'] = cossim_values
    tvars['sentences_out_vector_index'] = sentences_out_vector_index
    tvars['sentences_out_vector_html'] = sentences_out_vector_html
    tvars['sentences_out_features_html'] = sentences_out_features_html
# export
tvars['filename'] = filename
tvars['title'] = title
tvars['source'] = source
tvars['author'] = author