Commit 5fbe8c5b authored by manetta's avatar manetta

adding the i-could-have-written-that writing-systems to the git

parent bf4baf37
Writing Machines
================
This is a writing system that works with the [cosine similarity](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.cosine_similarity.html) function of SciKit Learn.
Requirements
============
The requirements for this tool are listed in the requirements.txt file.
To install these, run:
> pip install -r requirements.txt
Output folder
=============
Exported html & pdf documents are written to the 'output' folder. Before using the tool, create the following folders:
> writing-systems/cosine-similarity/output/html/regular/
> writing-systems/cosine-similarity/output/html/booklet/
> writing-systems/cosine-similarity/output/pdf/regular/
> writing-systems/cosine-similarity/output/pdf/booklet/
Use
===
To run the tool locally, use the python server module.
Navigate to this folder in the terminal and start the server from there.
> cd ~/path/i-could-have-written-that/writing-machines/cosine-similarity/
> python -m CGIHTTPServer
This runs a CGI server on localhost:8000.
Open in a browser:
> localhost:8000/cgi-bin/cosine-similarity-morphs.cgi
---
Note: It is possible to run multiple python servers, but not on the same port.
To run a server on port 9000 for example:
> python -m CGIHTTPServer 9000
Modules
=======
Besides the .cgi interface, there are some standalone scripts in the modules folder.
#!/usr/bin/env python
from __future__ import print_function
import cgi
import cgitb; cgitb.enable()
import os, sys
from datetime import datetime
# *******************************************************
# read input fields into variables
# *******************************************************
# Parse the AJAX POST body. All fields are Python 2 bytestrings; each is
# decoded to unicode, falling back to "" when the field is absent.
form = cgi.FieldStorage()
filename = form.getvalue("filename", "").decode('utf-8')
title = form.getvalue("title", "").decode('utf-8')
source = form.getvalue("source", "").decode('utf-8')
author = form.getvalue("author", "").decode('utf-8')
results = form.getvalue("results", "").decode('utf-8')
# The four page bodies, one per booklet page.
result1 = form.getvalue("result1", "").decode('utf-8')
result2 = form.getvalue("result2", "").decode('utf-8')
result3 = form.getvalue("result3", "").decode('utf-8')
result4 = form.getvalue("result4", "").decode('utf-8')
# Debug echo to the CGI server log (stderr).
print('> printing title', title, file=sys.stderr)
print('> printing ', filename, file=sys.stderr)
# Human-readable timestamp shown in the exported document header.
date = datetime.now().strftime('%d %B %Y, at %H:%M:%S')
# ****************
# export
# ****************
def save(filename):
    """Write the four result pages as a booklet-ordered HTML file and
    convert it to PDF with wkhtmltopdf.

    Reads the module-level form fields (title, date, author, source,
    result1-4). `filename` comes straight from the CGI form and is
    untrusted: the old os.system() shell string allowed command
    injection, so wkhtmltopdf is now invoked with an argument list and
    no shell. NOTE(review): `filename` may still contain path
    separators - confirm whether escaping the output folders matters.
    """
    # ***********************
    # HTML > posted via AJAX
    # ***********************
    html_out = 'output/html/booklet/'+filename+'.html'
    with open(html_out, 'w+') as f:
        # Pages are emitted in imposition order (4, 1, 2, 3) so a
        # duplex-printed sheet folds into a booklet.
        html = '''<!DOCTYPE html>
<html class="page">
<head>
<meta charset="utf-8">
<title>'''+title+''' (booklet)</title>
<link rel="stylesheet" type="text/css" href="../../../css/stylesheet.css">
<link rel="stylesheet" type="text/css" href="../../../css/export-booklet.css">
</head>
<body>
<div id="header">
This document is generated with the Cosine Similarity Morph interface on <em>'''+date+'''</em>, and is written by the following author(s): <em>'''+author+'''</em>. It is based on the source <em>'''+source+'''</em> and set in the font <a href="https://www.checkmyworking.com/cm-web-fonts/" target="_blank">Computer Modern Unicode Typewriter</a>. Licenced under the <a href="http://artlibre.org/licence/lal/en/" target="_blank">Free Art licence</a>. </div>
<div id="wrapper" class="first">
'''+result4+'''
<div class="pagenumber">4 - <em>'''+title+'''</em></div>
</div>
<div id="wrapper" class="second">
'''+result1+'''
<div class="pagenumber">1 - <em>'''+title+'''</em></div>
</div>
<div id="wrapper" class="third">
'''+result2+'''
<div class="pagenumber">2 - <em>'''+title+'''</em></div>
</div>
<div id="wrapper" class="fourth">
'''+result3+'''
<div class="pagenumber">3 - <em>'''+title+'''</em></div>
</div>
</body>
</html>'''
        # Encode explicitly: the unicode page must reach disk as UTF-8.
        f.write(html.encode('utf8'))
        # (explicit f.close() was redundant inside `with` and was dropped)
    print('> html file written', file=sys.stderr)
    # *******************************************
    # PDF with wkhtmltopdf: html > pdf
    # *******************************************
    pdf_out = 'output/pdf/booklet/'+filename+'.pdf'
    import subprocess
    # Argument list + no shell: the untrusted filename is passed as a
    # plain argv element and cannot inject shell syntax.
    cmd = ["wkhtmltopdf", "--page-width", "210", "--page-height", "297",
           "--disable-smart-shrinking", "--zoom", "1.5", html_out, pdf_out]
    print('>>>', ' '.join(cmd), file=sys.stderr)
    subprocess.call(cmd)
    print('> pdf generated', file=sys.stderr)
    print('> done', file=sys.stderr)
# Only export when the POST actually carried a filename; a bare GET of
# the script (empty form) falls through without writing anything.
if filename:
    save(filename)
\ No newline at end of file
#!/usr/bin/env python
from __future__ import print_function
import cgi
import cgitb; cgitb.enable()
import os, sys
from datetime import datetime
# *******************************************************
# read input fields into variables
# *******************************************************
# Parse the AJAX POST body. All fields are Python 2 bytestrings; each is
# decoded to unicode, falling back to "" when the field is absent.
# (The original read "source" twice; the duplicate assignment is removed.)
form = cgi.FieldStorage()
filename = form.getvalue("filename", "").decode('utf-8')
results = form.getvalue("results", "").decode('utf-8')
source = form.getvalue("source", "").decode('utf-8')
title = form.getvalue("title", "").decode('utf-8')
author = form.getvalue("author", "").decode('utf-8')
# Debug echo to the CGI server log (stderr).
print('> printing ', filename, file=sys.stderr)
# Human-readable timestamp shown in the exported document header.
date = datetime.now().strftime('%d %B %Y, at %H:%M:%S')
# ****************
# export
# ****************
def save(filename):
    """Write the submitted results as a regular (non-booklet) HTML page
    and convert it to PDF with wkhtmltopdf.

    Reads the module-level form fields (title, date, author, source,
    results). `filename` comes straight from the CGI form and is
    untrusted: the old os.system() shell string allowed command
    injection, so wkhtmltopdf is now invoked with an argument list and
    no shell. NOTE(review): `filename` may still contain path
    separators - confirm whether escaping the output folders matters.
    """
    # ***********************
    # HTML > posted via AJAX
    # ***********************
    html_out = 'output/html/regular/'+filename+'.html'
    with open(html_out, 'w+') as f:
        html = '''<!DOCTYPE html>
<html class="page">
<head>
<meta charset="utf-8">
<title>'''+title+'''</title>
<link rel="stylesheet" type="text/css" href="../../../css/stylesheet.css">
<link rel="stylesheet" type="text/css" href="../../../css/export.css">
</head>
<body>
<div id="header">This document <b>'''+title+'''</b> is generated with the Cosine Similarity Morph interface on <b>'''+date+'''</b>, and is written by the following author(s): <b>'''+author+'''</b>. It is based on the source '''+source+''' and set in the font <a href="https://www.checkmyworking.com/cm-web-fonts/" target="_blank">Computer Modern Unicode Typewriter</a>. Licenced under the <a href="http://artlibre.org/licence/lal/en/" target="_blank">Free Art licence</a>. </div>
<div id="wrapper">
'''+results+'''
</div>
</body>
</html>'''
        # Encode explicitly: the unicode page must reach disk as UTF-8.
        f.write(html.encode('utf8'))
        # (explicit f.close() was redundant inside `with` and was dropped)
    print('> html file written', file=sys.stderr)
    # *******************************************
    # PDF with wkhtmltopdf: html > pdf
    # *******************************************
    pdf_out = 'output/pdf/regular/'+filename+'.pdf'
    import subprocess
    # Argument list + no shell: the untrusted filename is passed as a
    # plain argv element and cannot inject shell syntax.
    cmd = ["wkhtmltopdf", html_out, pdf_out]
    print('>>>', ' '.join(cmd), file=sys.stderr)
    subprocess.call(cmd)
    print('> pdf generated', file=sys.stderr)
    print('> done', file=sys.stderr)
# Only export when the POST actually carried a filename; a bare GET of
# the script (empty form) falls through without writing anything.
if filename:
    save(filename)
\ No newline at end of file
#!/usr/bin/env python
from __future__ import print_function
import os, sys
import cgi
import cgitb; cgitb.enable()
from jinja2 import FileSystemLoader, Environment
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize, TreebankWordTokenizer
from nltk.util import ngrams
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# *******************************************************
# settings
# *******************************************************
# Jinja2 resolves templates relative to the CGI script's own directory.
path = "./"
env = Environment(loader=FileSystemLoader(path))
template = env.get_template("cosine-similarity-words.html")
# *******************************************************
# read form fields into variables
# *******************************************************
# GET or POST; set by the CGI server for every request.
method = os.environ.get("REQUEST_METHOD")
form = cgi.FieldStorage()
# Text fields are Python 2 bytestrings, decoded to unicode ("" default).
input_text = form.getvalue("input_text", "").decode('utf-8')
# Stopwords arrive as one whitespace-separated string; tokenize to a list.
stopwords = word_tokenize(form.getvalue("stopwords", "").decode('utf-8'))
countingmethod = form.getvalue("countingmethod", "")
lowercase = form.getvalue("lowercase", "")
# word2vec hyperparameters, with defaults matching the interface.
window_size = int(form.getvalue("window_size", "2"))
minimum_count = int(form.getvalue("minimum_count", "3"))
layer_size = int(form.getvalue("layer_size", "250"))
iterations = int(form.getvalue("iterations", "50"))
# The word to start the recursive-similarity walk from ("" = no walk).
query = form.getvalue("query", "").decode('utf-8')
matrix = form.getvalue("matrix", "")
filename = form.getvalue("filename", "").decode('utf-8')
# *******************************************************
# change variables (as soon as the form is submitted)
# *******************************************************
# The raw form text is the working corpus: split it into sentences,
# then tokenise each sentence into its list of word tokens, giving
# the list-of-token-lists shape word2vec expects.
corpus = input_text
lines = sent_tokenize(corpus)
words = [word_tokenize(sentence) for sentence in lines]
if query != "":
    # ****************************
    # word2vec & GloVe
    # ****************************
    # based on the following tutorial:
    # http://textminingonline.com/getting-started-with-word2vec-and-glove-in-python
    # https://rare-technologies.com/word2vec-tutorial/
    # https://radimrehurek.com/gensim/models/word2vec.html
    # Glove:
    # https://nlp.stanford.edu/projects/glove/
    from gensim.models import word2vec
    from nltk.tokenize import word_tokenize, sent_tokenize
    import logging
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    # ****************************
    # SETTINGS - all taken from the form fields above. (The original
    # re-assigned window_size/minimum_count/layer_size/iterations to
    # themselves here; those no-ops are removed.)
    # ****************************
    # window:    maximum distance between the current and predicted word
    #            within a sentence
    # min_count: ignore words rarer than this (filters typos/garbage)
    # size:      dimensionality of the NN layer; bigger needs more
    #            training data but can give more accurate models
    # workers=1: single worker - without Cython only one core is usable
    model = word2vec.Word2Vec(words, window=window_size, min_count=minimum_count, size=layer_size, workers=1)

    input_word = query
    queries = []      # visited [word, similarity] pairs, in walk order
    queries_tmp = []  # visited words only, for membership tests
    value = 1  # a static starting value for the first query
    most_similar_word = ''

    def nextQuery(results):
        """Return the first entry of `results` whose word has not been
        visited yet, or None when every candidate was already seen.
        (The original had an unreachable `break` after the `return`.)"""
        for result in results:
            if result[0] not in queries_tmp:
                return result

    import datetime
    timestamp = '{:%Y-%m-%d-%H-%M-%S}'.format(datetime.datetime.now())
    log_filename = 'log_'+query+'_'+timestamp

    # Walk the similarity chain: repeatedly jump to the most similar
    # word, backtracking through nextQuery() when a word repeats.
    # (An unused `index = 0` per iteration was removed.)
    for i in range(iterations):
        # find next unique query
        if query != input_word:
            if query in queries_tmp:
                result = nextQuery(results)
                if result is not None:
                    query = result[0]
                    value = result[1]
                else:
                    # every candidate already visited: walk is exhausted
                    break
        # store in queries list
        queries.append([query, value])
        queries_tmp.append(query)
        # find most similar list of words
        try:
            results = model.most_similar([query])
        except Exception:
            # typically KeyError: query word not in the vocabulary.
            # (The bare `except:` also swallowed KeyboardInterrupt and
            # SystemExit; narrowed to Exception.)
            break
        most_similar = results[0]
        most_similar_word = str(most_similar[0])
        most_similar_value = most_similar[1]
        query = most_similar_word
# *******************************************************
# save
# *******************************************************
def save(filename, queries, window_size, minimum_count, layer_size, iterations):
    """Write the similarity walk to output/html/<filename>.html and
    convert it to PDF with weasyprint.

    `filename` comes straight from the CGI form and is untrusted: the
    old os.system() shell string allowed command injection, so
    weasyprint is now invoked with an argument list and no shell.
    The malformed '<div <div id="result-list">' in the header markup is
    fixed to a single '<div id="result-list">'.
    """
    folder = "output/html/"
    ext = ".html"
    htmlname = folder+filename+ext
    with open(htmlname, 'w+') as f:
        # Fixed header listing the word2vec settings used for this run.
        html_top = '''
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>output vectorizer: Recursive Similarity - words</title>
<link rel="stylesheet" type="text/css" href="css/stylesheet.css">
</head>
<body>
<h1>Recursive Similarity - words<br>(using word2vec)</h1>
<div id="result-list">
<div class="info"><u>window size</u>: '''+str(window_size)+'''</div>
<div class="info"><u>minimum count</u>: '''+str(minimum_count)+'''</div>
<div class="info"><u>layer size</u>: '''+str(layer_size)+'''</div>
<div class="info"><u>iterations</u>: '''+str(iterations)+'''</div>
</div>'''
        f.write(html_top.encode('utf8'))
        # One block per visited word, in walk order.
        for s in queries:
            word = s[0]
            sim_value = s[1]
            # NOTE(review): the opacity uses the module-level `value`
            # (similarity of the *last* visited query), not this item's
            # `sim_value`. That looks unintended but is kept as-is to
            # preserve output - confirm with the author.
            html = '''
<div id="result-list" style="opacity:'''+str(value)+''';">
<div class="word">'''+word+'''</div>
<div class="info"><u>similarity rate</u>: '''+str(sim_value)+'''</div>'''
            f.write(html.encode('utf8'))
        html_bottom = '''
</body>
</html>'''
        f.write(html_bottom.encode('utf8'))
        # (explicit f.close() was redundant inside `with` and was dropped)
    # *******************************************************
    # export pdfs with weasyprint: html > pdf
    # *******************************************************
    folder = "output/pdf/"
    ext = ".pdf"
    pdfname = folder+filename+ext
    import subprocess
    # Argument list + no shell: the untrusted filename is passed as a
    # plain argv element and cannot inject shell syntax.
    cmd = ["weasyprint", htmlname, pdfname, "-s", "output/html/css/stylesheet.css"]
    print('>>> pdf generated', ' '.join(cmd), file=sys.stderr)
    subprocess.call(cmd)
# Only export when the form carried a filename; plain page renders skip it.
if filename:
    save(filename, queries, window_size, minimum_count, layer_size, iterations)
from collections import Counter
# Tally token frequencies across all tokenised sentences of the corpus.
token_counts = Counter()
for sentence_tokens in words:
    token_counts.update(sentence_tokens)
# Frequency-ordered [token, count] pairs for the template.
common_words = [[token, count] for token, count in token_counts.most_common()]
# *******************************************************
# set jinja variables
# *******************************************************
# Everything the template receives, pre-stringified where needed.
tvars = {}
tvars['input_text'] = input_text
tvars['window_size'] = str(window_size)
tvars['minimum_count'] = str(minimum_count)
tvars['layer_size'] = str(layer_size)
tvars['iterations'] = str(iterations)
tvars['stopwords'] = ' '.join(stopwords)
tvars['corpus'] = corpus
tvars['corpus_length'] = len(corpus)
tvars['lowercase'] = lowercase
tvars['matrix'] = matrix
tvars['common_words'] = common_words
# `queries` only exists when a query was submitted (the word2vec branch).
if query:
    tvars['queries'] = queries
# *******************************************************
# render
# *******************************************************
print('*variables rendered*\n', file=sys.stderr)
# CGI response: Content-type header, blank separator line, then body.
print("Content-type: text/html;charset=utf-8")
print()
html = template.render(tvars).encode('utf-8')
print(html)
\ No newline at end of file
#!/usr/bin/env python
from __future__ import print_function
import cgi
import cgitb; cgitb.enable()
import os, sys
# *******************************************************
# read input fields into variables
# *******************************************************
# Parse the AJAX POST body; text fields are Python 2 bytestrings,
# decoded to unicode ("" default when absent).
form = cgi.FieldStorage()
a = form.getvalue("a", "").decode('utf-8')
b = form.getvalue("b", "").decode('utf-8')
input_csv = form.getvalue("input_csv", "")
# BUG FIX: the original did open(csv, ...) - reading the not-yet-defined
# target name instead of the submitted path - and then called .decode()
# on the *list* returned by readlines() (an AttributeError). Open the
# submitted path and decode each line individually instead.
# NOTE(review): `input_csv` is an untrusted client-supplied path -
# confirm whether arbitrary server files should be readable here.
with open(input_csv, 'r') as csv_file:
    csv = [line.decode('utf-8') for line in csv_file.readlines()]
# Debug echo to the server log.
print('a:', a, file=sys.stderr)
print('b:', b, file=sys.stderr)
print('input_csv:', input_csv, file=sys.stderr)
print('csv:', csv, file=sys.stderr)
# *******************************************************
# change variables
# *******************************************************
# *******************************************************
# set output variables
# *******************************************************
# Values echoed straight back to the requesting page.
out = {}
out['a'] = a
out['b'] = b
out['csv'] = csv
print('Your data is returned to the interface through AJAX ...', file=sys.stderr)
# *******************************************************
# return json
# *******************************************************
import json
# CGI response: JSON Content-type header, blank line, then the payload
# (encoded to UTF-8 bytes for the Python 2 stdout).
print ("Content-type: application/json")
print ()
print (json.dumps(out, indent=4).encode('utf-8'))
\ No newline at end of file
#!/usr/bin/env python
from __future__ import print_function
import cgi
import cgitb; cgitb.enable()
# from jinja2 import FileSystemLoader, Environment
import os, sys
# *******************************************************
# read input fields into variables
# *******************************************************
# Parse the AJAX POST body: two comparison slots (a/b) plus their text
# contents; Python 2 bytestrings decoded to unicode ("" default).
form = cgi.FieldStorage()
a = form.getvalue("a", "").decode('utf-8')
b = form.getvalue("b", "").decode('utf-8')
text_a = form.getvalue("text_a", "").decode('utf-8')
text_b = form.getvalue("text_b", "").decode('utf-8')
# Debug echo to the server log.
print('a:', a, file=sys.stderr)
print('b:', b, file=sys.stderr)
print('text_a:', text_a, file=sys.stderr)
print('text_b:', text_b, file=sys.stderr)
# *******************************************************
# change variables
# *******************************************************
# Accumulators filled by the per-line comparison loop that follows.
out_a = []
out_b = []
out_ab = []
# Pair each text with its label so the loop can tell the sides apart.
texts = [('a', text_a.split('\n')), ('b', text_b.split('\n'))]
for text in texts:
for line in text[1]:
if text[0] == 'a':