Commit 8754555d authored by gijs's avatar gijs

Several more

parent 1ddbf2fd
[Unit]
Description=Classifying the world service
After=network.target
[Service]
User=pi
Group=pi
Restart=on-failure
WorkingDirectory=/home/pi/Documents/mundaneum/exhibition/printer_service
ExecStart=/home/pi/Documents/mundaneum/exhibition/printer_service/venv/bin/gunicorn --config /home/pi/Documents/mundaneum/printer_service/gunicorn.conf --bind localhost:5557 --workers 1 app:app
[Install]
WantedBy=multi-user.target
\ No newline at end of file
location /classifying-the-world {
proxy_pass http://127.0.0.1:5557;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
\ No newline at end of file
......@@ -2,4 +2,5 @@ flask
pandas
sklearn
numpy
joblib
\ No newline at end of file
joblib
gunicorn
\ No newline at end of file
from flask import Flask, request
import glob
from escpos import printer
app = Flask(__name__)
header = 'Algoliterator'
# header = 'The Cleaner'
def find_printer(pattern='/dev/ttyUSB*'):
    """Return the path of the first device matching *pattern*.

    Args:
        pattern: glob pattern to search; defaults to the USB serial devices.

    Returns:
        The lexicographically first matching path, so the choice is stable
        when several devices are plugged in.

    Raises:
        IndexError: if nothing matches the pattern.
    """
    # glob gives no ordering guarantee; sort so repeated runs pick the same device.
    candidates = sorted(glob.glob(pattern))
    if not candidates:
        raise IndexError('No printer device found matching {}'.format(pattern))
    return candidates[0]
@app.route('/print', methods=["post"])
def print():
    """Print the posted ``text`` form field on the thermal printer.

    The ticket consists of the bold module-level ``header``, the submitted
    text, a fixed footer, and a paper cut.

    Returns:
        An empty ``204 No Content`` response. Flask rejects a view that
        returns ``None``, so without an explicit response every request
        ended in a server error even though the ticket had been printed.
    """
    # NOTE: the name shadows the built-in print inside this module; it is
    # kept because Flask derives the endpoint name from the function name.
    text = request.form['text']

    thermal_printer.set(bold=True)
    thermal_printer.text(header)
    thermal_printer.set(bold=False)
    thermal_printer.text('\n\n')
    thermal_printer.text(text)
    thermal_printer.text('\n\n\nDataworkers, Mons 2019')
    thermal_printer.cut()

    return '', 204
# Open the serial connection to the device reported by find_printer().
# This happens at import time, so loading the module fails when
# find_printer() raises (no matching device).
# NOTE(review): 19200 is presumably the baud rate -- confirm against the
# python-escpos Serial signature.
thermal_printer = printer.Serial(find_printer(), 19200)
# When executed directly, start Flask's built-in server with host 0.0.0.0.
if __name__ == "__main__":
    app.run(host='0.0.0.0')
\ No newline at end of file
accesslog = "/home/pi/Documents/mundaneum/printer_service/logs/gunicorn_access.log"
errorlog = "/home/pi/Documents/mundaneum/printer_service/logs/gunicorn_error.log"
location / {
proxy_pass http://127.0.0.1:5556;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
\ No newline at end of file
[Unit]
Description=Printer service
After=network.target
[Service]
User=pi
Group=pi
Restart=on-failure
WorkingDirectory=/home/pi/Documents/mundaneum/exhibition/printer_service
ExecStart=/home/pi/Documents/mundaneum/exhibition/printer_service/venv/bin/gunicorn --config /home/pi/Documents/mundaneum/printer_service/gunicorn.conf --bind localhost:5556 --workers 1 app:app
[Install]
WantedBy=multi-user.target
\ No newline at end of file
sudo apt-get install libjpeg8-dev
\ No newline at end of file
python-escpos==3.0a4
gunicorn
flask
\ No newline at end of file
../naive_bayes/classify-the-world/classify-the-world.conf
\ No newline at end of file
location /classifying-the-world {
proxy_pass http://127.0.0.1:5557;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
location /the-annotator {
proxy_pass http://127.0.0.1:5558;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
location /the-cleaner {
proxy_pass http://127.0.0.1:5559;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
\ No newline at end of file
../the-annotator/the-annotator.conf
\ No newline at end of file
../the-cleaner/the-cleaner.conf
\ No newline at end of file
baseurl = '/the-cleaner/api'
"""
Importer for a collection of texts
"""
import re
import argparse
import glob
import os.path
import random
from collections import namedtuple
from spellchecker import SpellChecker
from pattern.fr import parsetree
from langdetect import detect
# quality reasons
# NOTE(review): none of the constants below are referenced in the visible
# part of this file; presumably they label why a line was rejected or what
# state a text is in -- confirm against the code that consumes them.
NO_WORDS = 'NO_WORDS'
TOO_MUCH_UNRECOGNIZED_WORDS = 'TOO_MUCH_UNRECOGNIZED_WORDS'
TOO_MUCH_NON_ALPHABETICAL_CHARS = 'TOO_MUCH_NON_ALPHABETICAL_CHARS'
# NOTE(review): duplicate of the namedtuple import at the top of the file.
from collections import namedtuple
AVERAGE_WORD_LENGTH_TOO_SHORT = 'AVERAGE_WORD_LENGTH_TOO_SHORT'
MARKED_BY_USER = 'MARKED_BY_USER'
MARKED_CLEAN = 'MARKED_CLEAN'
CLEAN = 'CLEAN'
IMPORTED = 'IMPORTED'
CLEANED = 'CLEANED'
ANNOTATED = 'ANNOTATED'
# Shared French spell checker; filterOutNoise calls spell.unknown() on it.
spell = SpellChecker(language="fr")
## text dataset is a list of
def simpleTree(text):
    """Parse *text* with ``parsetree``: tokens, tags and lemmata, no chunks."""
    options = dict(tokenize=True, tags=True, chunks=False, lemmata=True)
    return parsetree(text, **options)
def removeHyphens(text):
    """Rejoin words split over a line break by a trailing hyphen.

    A hyphen followed by a newline and a word character is removed together
    with the newline; the word character is kept.
    """
    hyphen_break = re.compile(r'-\n(\w)', flags=re.M)
    return hyphen_break.sub('\\1', text)
def removeWordWrap(text):
    """Replace each newline that directly precedes a word character with a space."""
    wrapped_break = re.compile(r'\n(\w)', flags=re.M)
    return wrapped_break.sub(' \\1', text)
def removeMultiNewlines(text):
    """Collapse four or more consecutive (possibly padded) line breaks into two newlines."""
    blank_run = re.compile(r'(?:\s*\n){4,}', flags=re.M)
    return blank_run.sub('\n\n', text)
def clean(text):
    """Apply the three clean-up passes: hyphens, word wrap, then newline runs."""
    dehyphenated = removeHyphens(text)
    unwrapped = removeWordWrap(dehyphenated)
    return removeMultiNewlines(unwrapped)
def filterOutNoise(text):
    """Return the lines of *text* that pass every noise check.

    A line is dropped when any of the following holds:

    * it is empty or contains no ASCII letter;
    * parsing yields no token consisting only of word characters;
    * with fewer than 8 distinct lemmas, more than half of them are unknown
      to the spell checker; with 8 or more, the unknown lemmas number more
      than a third of the distinct words;
    * it is shorter than 15 characters and its non-letter characters number
      more than half its letters;
    * the average length of its distinct words is below 3.

    Kept lines are returned concatenated, each followed by a newline.
    """
    kept = []

    for line in text.splitlines():
        if not line:
            continue

        # Nothing but non-letters on the line.
        if re.fullmatch(r'[^a-z]+', line.strip('\n'), flags=re.I):
            continue

        lemmas = set()
        words = set()
        for sentence in simpleTree(line).sentences:
            for token in sentence.words:
                if re.fullmatch(r'\w+', token.string):
                    lemmas.add(token.lemma)
                    words.add(token.string)

        misspelled = spell.unknown(lemmas)

        if not words:
            continue

        lemma_count = len(lemmas)
        unknown_count = len(misspelled)
        if lemma_count < 8 and unknown_count * 2 > lemma_count:
            continue
        if lemma_count > 7 and unknown_count * 3 > len(words):
            continue

        letters = re.sub('[^a-z]', '', line, flags=re.I)
        notLetters = re.sub('[a-z]', '', line, flags=re.I)
        if len(line) < 15 and len(notLetters) * 2 > len(letters):
            continue

        avgwordlength = sum(len(w) for w in words) / float(len(words))
        if avgwordlength < 3:
            continue

        kept.append(line + '\n')

    return ''.join(kept)
def importFile(path):
    """Load one text file and return its noise-filtered French content.

    Args:
        path: path of the text file to read.

    Returns:
        The filtered text followed by a newline when the detected language
        is ``'fr'``; an empty string for any other language or for a file
        that is empty or whitespace-only.
    """
    print(path)
    with open(path, 'r') as h:
        print("Loading: {}".format(path))
        raw = h.read()

    # A blank file contributes nothing, and language detection has nothing
    # to work with, so stop before calling it.
    if not raw.strip():
        return ''

    text = clean(raw)
    language = detect(text)
    print("Detected document lanuage {}".format(language))

    # filterOutNoise uses the module-level `spell` checker, so no per-file
    # SpellChecker is built here (the previous one was never used).
    if language != 'fr':
        return ''
    return filterOutNoise(text) + '\n'
def importFolder(path, output_path='output.txt'):
    """Import every ``*.txt`` file in *path* and write the combined corpus.

    Args:
        path: folder to search (not recursive) for ``*.txt`` files.
        output_path: file the concatenated result is written to; defaults
            to ``output.txt`` in the current working directory.
    """
    # One directory scan is enough; the results of importFile are joined in
    # the order glob returns them.
    sources = glob.glob(os.path.join(path, '*.txt'))
    corpus = ''.join(importFile(fp) for fp in sources)

    with open(output_path, 'w') as w:
        w.write(corpus)

    print('done!')
# Runs the import as soon as the module is loaded, on a hard-coded folder.
# NOTE(review): argparse is imported but unused in the visible code --
# presumably the folder was meant to become a command-line argument.
importFolder("/home/gijs/Documents/Bedrijf/Projecten/Algolit/mons/git/data/re-ocred")
\ No newline at end of file
"""
Importer for a collection of texts
"""
import re
import argparse
import glob
import os.path
import random
from collections import namedtuple
from spellchecker import SpellChecker
from pattern.fr import parsetree
from langdetect import detect
# quality reasons
# NOTE(review): none of the constants below are referenced in the visible
# part of this file; presumably they label why a line was rejected or what
# state a text is in -- confirm against the code that consumes them.
NO_WORDS = 'NO_WORDS'
TOO_MUCH_UNRECOGNIZED_WORDS = 'TOO_MUCH_UNRECOGNIZED_WORDS'
TOO_MUCH_NON_ALPHABETICAL_CHARS = 'TOO_MUCH_NON_ALPHABETICAL_CHARS'
# NOTE(review): duplicate of the namedtuple import at the top of the file.
from collections import namedtuple
AVERAGE_WORD_LENGTH_TOO_SHORT = 'AVERAGE_WORD_LENGTH_TOO_SHORT'
MARKED_BY_USER = 'MARKED_BY_USER'
MARKED_CLEAN = 'MARKED_CLEAN'
CLEAN = 'CLEAN'
IMPORTED = 'IMPORTED'
CLEANED = 'CLEANED'
ANNOTATED = 'ANNOTATED'
# Shared French spell checker; markNoise calls spell.unknown() on it.
spell = SpellChecker(language="fr")
## text dataset is a list of
def simpleTree(text):
    """Parse *text* with ``parsetree``: tokens, tags and lemmata, no chunks."""
    parse_options = {
        'tokenize': True,
        'tags': True,
        'chunks': False,
        'lemmata': True,
    }
    return parsetree(text, **parse_options)
def removeHyphens(text):
    """Drop a line-ending hyphen and its newline when a word character follows."""
    joined = re.sub(r'-\n(\w)', lambda m: m.group(1), text, flags=re.M)
    return joined
def removeWordWrap(text):
    """Turn a newline directly followed by a word character into a space."""
    unwrapped = re.sub(r'\n(\w)', lambda m: ' ' + m.group(1), text, flags=re.M)
    return unwrapped
def removeMultiNewlines(text):
    """Reduce four or more consecutive (possibly padded) line breaks to two newlines."""
    paragraph_break = '\n\n'
    return re.sub(r'(?:\s*\n){4,}', paragraph_break, text, flags=re.M)
def clean(text):
    """Run removeHyphens, removeWordWrap and removeMultiNewlines, in that order."""
    for step in (removeHyphens, removeWordWrap, removeMultiNewlines):
        text = step(text)
    return text
def markNoise(text):
    """Print every line of *text* that fails a noise check, with the reason.

    For each non-empty line the first failing check is reported as two
    printed lines: the reason, then ``**NOISE**: <line>``. Lines that pass
    every check, and lines without any word-only token, print nothing.
    """
    def report(reason, line):
        # Same two-line report for every kind of noise.
        print(reason)
        print('**NOISE**: {}'.format(line))

    for line in text.splitlines():
        if not line:
            continue

        if re.fullmatch(r'[^a-z]+', line.strip('\n'), flags=re.I):
            report('no letters', line)
            continue

        lemmas = set()
        words = set()
        for sentence in simpleTree(line).sentences:
            for token in sentence.words:
                if re.fullmatch(r'\w+', token.string):
                    lemmas.add(token.lemma)
                    words.add(token.string)

        misspelled = spell.unknown(lemmas)

        if not words:
            continue

        lemma_count = len(lemmas)
        unknown_count = len(misspelled)
        if (lemma_count < 8 and unknown_count * 2 > lemma_count) or \
                (lemma_count > 7 and unknown_count * 3 > len(words)):
            report('too much misspelled', line)
            continue

        letters = re.sub('[^a-z]', '', line, flags=re.I)
        notLetters = re.sub('[a-z]', '', line, flags=re.I)
        if len(line) < 15 and len(notLetters) * 2 > len(letters):
            report('too much non words', line)
            continue

        avgwordlength = sum(len(w) for w in words) / float(len(words))
        if avgwordlength < 3:
            report('words too short', line)
def importFile(path):
    """Load one text file, report its detected language and print its noisy lines.

    Args:
        path: path of the text file to read.

    Returns:
        None. A file that is empty or whitespace-only is skipped after the
        loading messages.
    """
    print(path)
    with open(path, 'r') as h:
        print("Loading: {}".format(path))
        raw = h.read()

    # A blank file has nothing to mark, and language detection has nothing
    # to work with, so stop before calling it.
    if not raw.strip():
        return

    text = clean(raw)
    language = detect(text)
    print("Detected document lanuage {}".format(language))

    # markNoise uses the module-level `spell` checker. The SpellChecker that
    # used to be built here from the detected language was never used, and
    # constructing it could fail for a language without a dictionary.
    markNoise(text)
def importFolder(path):
    """Run importFile on one randomly chosen ``*.txt`` file from *path*.

    Args:
        path: folder to search (not recursive) for ``*.txt`` files.

    When the folder holds no ``*.txt`` file a message is printed and nothing
    is imported, instead of failing inside ``random.choice``.
    """
    files = glob.glob(os.path.join(path, '*.txt'))
    if not files:
        print("No .txt files found in {}".format(path))
        return

    # Only a single sample is inspected; to process the whole folder, call
    # importFile for every entry of `files` instead.
    importFile(random.choice(files))
# Runs the import as soon as the module is loaded, on a hard-coded folder.
# NOTE(review): argparse is imported but unused in the visible code --
# presumably the folder was meant to become a command-line argument.
importFolder("/home/gijs/Documents/Bedrijf/Projecten/Algolit/mons/git/data/re-ocred")
\ No newline at end of file
This diff is collapsed.
SpellingSuggestion = Shape{
key: string,
suggestions: Array<string>
}
LineSuggestions = Shape{
'spelling': Array<SpellingSuggestion>
}
LineVersion = Shape{
'clean': boolean,
'verified': boolean,
'line': string,
'note': reason,
'suggestions': LineSuggestions
}
Line = Array<LineVersion>
Text = {
language: string
lines: Array<Line>
}
def makeSpellingSuggestion(key, suggestions):
    """Build a SpellingSuggestion record.

    Args:
        key: the string the suggestions belong to.
        suggestions: list of suggested replacement strings.

    Returns:
        A dict with the entries ``'key'`` and ``'suggestions'``.
    """
    return {
        'key': key,
        'suggestions': suggestions,
    }
def makeLineSuggestions(spelling):
    """Wrap the spelling suggestions of a line in a LineSuggestions record."""
    record = {}
    record['spelling'] = spelling
    return record
def makeLineVersion(clean, verified, line, reason, suggestions):
    """Build a LineVersion record.

    Args:
        clean: whether the line is considered clean.
        verified: whether the verdict has been verified.
        line: the text of the line.
        reason: the reason attached to the verdict.
        suggestions: the LineSuggestions record for the line.

    Returns:
        A dict with the entries ``'clean'``, ``'verified'``, ``'line'``,
        ``'reason'`` and ``'suggestions'``.
    """
    return {
        'clean': clean,
        'verified': verified,
        'line': line,
        'reason': reason,
        # The entry had no value and a singular key; the LineVersion shape
        # names it 'suggestions'.
        'suggestions': suggestions,
    }
def makeLine(versions=None):
    """Return a Line, i.e. a list of LineVersion records.

    Args:
        versions: existing list of versions; returned as-is when given.

    Returns:
        *versions*, or a fresh empty list when none is given. A mutable
        default would hand every caller the same list object.
    """
    if versions is None:
        return []
    return versions
def addVersion(line, version):
    """Return a new list with *version* placed in front of the versions in *line*."""
    newest_first = [version]
    return newest_first + line
def makeText(language, lines=None):
    """Build a Text record.

    Args:
        language: language code of the text.
        lines: optional list of Line values; it is copied, not stored.

    Returns:
        A dict with the entries ``'language'`` and ``'lines'``.
    """
    # The parameter was misspelled `lanugage` while the body read `language`.
    return {
        'language': language,
        'lines': [] + lines if lines is not None else [],
    }
\ No newline at end of file
location /the-cleaner {
proxy_pass http://127.0.0.1:5559;
proxy_set_header Host $host;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
\ No newline at end of file
flask
gunicorn
pattern
langdetect
pyspellchecker
csvwrite
\ No newline at end of file
[Unit]
Description=The cleaner service
After=network.target
[Service]
User=algolit
Group=algolit
Restart=on-failure
WorkingDirectory=/home/pi/Documents/mundaneum/exhibition/printer_service
ExecStart=/home/pi/Documents/mundaneum/exhibition/printer_service/venv/bin/gunicorn --config /home/pi/Documents/mundaneum/printer_service/gunicorn.conf --bind localhost:5559 --workers 1 app:app
[Install]
WantedBy=multi-user.target
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment