Commit 5b7d8086 authored by gijs's avatar gijs

Utils

parent 8754555d
# Annotation codes stored in the ``clean_annotation`` column of a sentence.

# Reasons a line is considered noise.
EMPTY = "EMPTY LINE"
NO_WORDS = "NO_WORDS"
NO_LETTERS = "NO_LETTERS"
TOO_MUCH_UNRECOGNIZED_WORDS = "TOO_MUCH_UNRECOGNIZED_WORDS"
TOO_MUCH_NON_ALPHABETICAL_CHARS = "TOO_MUCH_NON_ALPHABETICAL_CHARS"
AVERAGE_WORD_LENGTH_TOO_SHORT = "AVERAGE_WORD_LENGTH_TOO_SHORT"

# Codes for lines that were marked explicitly or accepted as clean.
MARKED_BY_USER = "MARKED_BY_USER"
MARKED_CLEAN = "MARKED_CLEAN"
CLEAN = "CLEAN"
\ No newline at end of file
import sqlite3
from . import settings
from . import clean_states
from .utils import error
def connect():
    """Open the SQLite database named by ``settings.DATABASE``.

    The returned connection uses ``sqlite3.Row`` as row factory, so result
    columns can be read by name as well as by index.
    """
    connection = sqlite3.connect(settings.DATABASE)
    connection.row_factory = sqlite3.Row
    return connection
def init_table(conn):
    """Create the ``sentences`` table when it does not exist yet.

    Returns True once the statement has been committed. On any exception the
    transaction is rolled back, the problem is reported through ``error`` and
    False is returned.
    """
    create_statement = """CREATE TABLE IF NOT EXISTS sentences(
document TEXT,
line INTEGER,
sentence TEXT,
clean BOOLEAN,
clean_annotation TEXT,
checked BOOLEAN,
classification INTEGER,
detected_language CHARACTER,
misspelled TEXT
)"""
    cursor = conn.cursor()
    try:
        cursor.execute(create_statement)
        conn.commit()
    except Exception as exc:
        conn.rollback()
        error("Could not create sentences table {}".format(exc))
        return False
    return True
def add_sentence(conn, document='', line=-1, sentence='', clean=False, clean_annotation='', checked=False, classification=-1, detected_language='fr', misspelled=None):
    """Insert a single row into the ``sentences`` table and commit.

    ``misspelled`` is an iterable of strings; it is stored as one
    comma-separated string. ``None`` (the default) is treated as "no
    misspelled words" -- a ``None`` default replaces the former shared
    mutable ``[]`` default.

    Returns True on success. On any exception the transaction is rolled
    back, the problem is reported through ``error`` and False is returned.
    """
    if misspelled is None:
        misspelled = ()

    cur = conn.cursor()
    try:
        cur.execute(
            """INSERT INTO sentences(
                document,
                line,
                sentence,
                clean,
                clean_annotation,
                checked,
                classification,
                detected_language,
                misspelled)
            VALUES(
                :document,
                :line,
                :sentence,
                :clean,
                :clean_annotation,
                :checked,
                :classification,
                :detected_language,
                :misspelled
            )
            """,
            {
                'document': document,
                'line': line,
                'sentence': sentence,
                'clean': clean,
                'clean_annotation': clean_annotation,
                'checked': checked,
                'classification': classification,
                'detected_language': detected_language,
                'misspelled': ','.join(misspelled),
            })
        conn.commit()
        return True
    except Exception as e:
        conn.rollback()
        error('Could not insert sentence {}'.format(e))
        return False
def add_sentences(conn, sentences=None):
    """Insert many rows into the ``sentences`` table in one transaction.

    ``sentences`` is an iterable of mappings, each providing the keys
    ``document``, ``line``, ``sentence``, ``clean``, ``clean_annotation``,
    ``checked``, ``classification``, ``detected_language`` and ``misspelled``
    (the named placeholders of the INSERT statement). ``None`` (the default)
    inserts nothing -- a ``None`` default replaces the former shared mutable
    ``[]`` default.

    Returns True on success. On any exception the whole batch is rolled
    back, the problem is reported through ``error`` and False is returned.
    """
    if sentences is None:
        sentences = []

    cur = conn.cursor()
    try:
        cur.executemany(
            """INSERT INTO sentences(
                document,
                line,
                sentence,
                clean,
                clean_annotation,
                checked,
                classification,
                detected_language,
                misspelled)
            VALUES(
                :document,
                :line,
                :sentence,
                :clean,
                :clean_annotation,
                :checked,
                :classification,
                :detected_language,
                :misspelled
            )
            """,
            sentences)
        conn.commit()
        return True
    except Exception as e:
        conn.rollback()
        error('Could not insert sentence {}'.format(e))
        return False
def classify_sentence(conn, rowid, classification):
    """Store ``classification`` on the sentence with the given ``rowid``.

    Returns True once the update is committed, matching the insert helpers.
    On any exception the transaction is rolled back, the problem is reported
    through ``error`` and False is returned.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            "UPDATE sentences SET classification=:classification WHERE rowid=:rowid",
            {'rowid': rowid, 'classification': classification})
        conn.commit()
        # Explicit success value so callers can tell success from failure.
        return True
    except Exception as e:
        conn.rollback()
        error('Could not update sentence classification {}'.format(e))
        return False
def mark_sentence_dirty(conn, rowid):
    """Flag the sentence with the given ``rowid`` as not clean.

    Sets ``clean`` to False and ``clean_annotation`` to
    ``clean_states.MARKED_BY_USER``.

    Returns True once the update is committed, matching the insert helpers.
    On any exception the transaction is rolled back, the problem is reported
    through ``error`` and False is returned.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            "UPDATE sentences SET clean=:clean, clean_annotation=:clean_annotation WHERE rowid=:rowid",
            {'rowid': rowid, 'clean': False, 'clean_annotation': clean_states.MARKED_BY_USER})
        conn.commit()
        # Explicit success value so callers can tell success from failure.
        return True
    except Exception as e:
        conn.rollback()
        error('Could not update sentence cleanliness {}'.format(e))
        return False
def get_sentence_to_annotate(conn):
    """Pick one random clean sentence that has no classification yet.

    Selects among rows with ``clean`` true and ``classification`` equal to -1.
    Returns the fetched row (``None`` when nothing matches). On any exception
    the transaction is rolled back, the problem is reported through ``error``
    and False is returned.
    """
    query = "SELECT rowid, document, line, sentence FROM sentences WHERE clean = :clean AND classification = :classification ORDER BY RANDOM() LIMIT 1"
    params = {'clean': True, 'classification': -1}
    cursor = conn.cursor()
    try:
        cursor.execute(query, params)
        return cursor.fetchone()
    except Exception as exc:
        conn.rollback()
        error('Could not select row {}'.format(exc))
    return False
def get_prev_lines (conn, document, line, limit = 5):
    """Fetch up to ``limit`` sentences of ``document`` located before ``line``.

    Rows are ordered by descending line number, so the line closest to
    ``line`` comes first. On any exception the transaction is rolled back,
    the problem is reported through ``error`` and False is returned.
    """
    query = "SELECT sentence FROM sentences WHERE document = :document AND line < :line ORDER BY line DESC LIMIT :limit"
    params = {'document': document, 'line': line, 'limit': limit}
    cursor = conn.cursor()
    try:
        cursor.execute(query, params)
        return cursor.fetchall()
    except Exception as exc:
        conn.rollback()
        error('Could not select rows {}'.format(exc))
    return False
def get_next_lines (conn, document, line, limit = 5):
    """Fetch up to ``limit`` sentences of ``document`` located after ``line``.

    Rows are ordered by ascending line number. On any exception the
    transaction is rolled back, the problem is reported through ``error`` and
    False is returned.
    """
    query = "SELECT sentence FROM sentences WHERE document = :document AND line > :line ORDER BY line ASC LIMIT :limit"
    params = {'document': document, 'line': line, 'limit': limit}
    cursor = conn.cursor()
    try:
        cursor.execute(query, params)
        return cursor.fetchall()
    except Exception as exc:
        conn.rollback()
        error('Could not select rows {}'.format(exc))
    return False
def get_document(conn):
    """Return all sentence rows of one randomly chosen document.

    The rows carry ``line``, ``document``, ``sentence``, ``clean``,
    ``clean_annotation`` and ``checked`` and are ordered by ascending line
    number.

    Returns an empty list when the table holds no sentences. On any
    exception the transaction is rolled back, the problem is reported
    through ``error`` and False is returned.
    """
    doc_cur = conn.cursor()
    try:
        doc_cur.execute("SELECT DISTINCT document FROM sentences ORDER BY RANDOM() LIMIT 1")
        row = doc_cur.fetchone()
        if row is None:
            # Empty table: nothing to pick. Previously this surfaced as a
            # logged error caused by subscripting None.
            return []

        sent_cur = conn.cursor()
        sent_cur.execute(
            """SELECT line, document, sentence, clean, clean_annotation, checked
            FROM sentences WHERE document = :document
            ORDER BY line ASC""",
            # Positional access works with both tuple rows and sqlite3.Row.
            {'document': row[0]})
        return sent_cur.fetchall()
    except Exception as e:
        conn.rollback()
        error('Could not select document {}'.format(e))
        return False
def update_document(conn, document, lines):
    """Write reviewed lines of ``document`` back to the database.

    ``lines`` is an iterable of mappings with the keys ``line``,
    ``sentence``, ``clean`` and ``clean_annotation``. The row matching
    ``document`` and ``line`` gets the new values and is marked as checked.

    Returns True once the updates are committed. On any exception the
    transaction is rolled back, the problem is reported through ``error``
    and False is returned.
    """
    cur = conn.cursor()
    try:
        cur.executemany(
            """
            UPDATE sentences
            SET sentence = :sentence,
                clean = :clean,
                clean_annotation = :clean_annotation,
                checked = :checked
            WHERE document = :document AND line = :line
            """,
            [{
                'document': document,
                'line': entry['line'],
                'sentence': entry['sentence'],
                'clean': entry['clean'],
                'clean_annotation': entry['clean_annotation'],
                # Every line written through this function counts as checked.
                'checked': True,
            } for entry in lines])
        # Without the commit the updates stay in an open transaction and are
        # lost when the connection is closed.
        conn.commit()
        return True
    except Exception as e:
        conn.rollback()
        error('Could not update document {}'.format(e))
        return False
def update_sentence(conn, documentTitle, line, sentence, clean):
    """Overwrite one sentence of a document after a manual edit.

    The row matching ``documentTitle`` and ``line`` receives the new
    ``sentence`` and ``clean`` values, its annotation is set to
    ``clean_states.MARKED_BY_USER`` and it is marked as checked.

    Returns True once the update is committed. On any exception the
    transaction is rolled back, the problem is reported through ``error``
    and False is returned.
    """
    cur = conn.cursor()
    try:
        cur.execute(
            """
            UPDATE sentences
            SET sentence = :sentence,
                clean = :clean,
                clean_annotation = :clean_annotation,
                checked = :checked
            WHERE document = :document AND line = :line
            """,
            {
                # The parameter is called documentTitle; the column and the
                # placeholder are called document.
                'document': documentTitle,
                'line': line,
                'sentence': sentence,
                'clean': clean,
                'clean_annotation': clean_states.MARKED_BY_USER,
                'checked': True,
            })
        conn.commit()
        return True
    except Exception as e:
        conn.rollback()
        error('Could not update sentence {}'.format(e))
        return False
# cur = conn.cursor()
# try:
# cur.execute(
# "UPDATE sentences SET clean=:clean, clean_annotation=:clean_annotation WHERE rowid=:rowid",
# { 'rowid': rowid, 'clean': False, 'clean_annotation': clean_states.MARKED_BY_USER })
# conn.commit()
# except:
# conn.rollback()
# return False
\ No newline at end of file
"""
Importer for a collection of texts
"""
import re
import argparse
import glob
import os.path
import random
from collections import namedtuple
from spellchecker import SpellChecker
from pattern.fr import parsetree
from langdetect import detect
from . import db
from . import clean_states
from . import settings
from .utils import log, debug
# Cache filled by getSpeller: language code -> SpellChecker instance, or None
# when no SpellChecker could be created for that language.
spellers = {}
def simpleTree(text):
    """Parse ``text`` with ``parsetree``: tokenized, tagged and lemmatized, without chunking."""
    options = {
        'tokenize': True,
        'tags': True,
        'chunks': False,
        'lemmata': True,
    }
    return parsetree(text, **options)
def removeHyphens (text):
    """Join words that were hyphenated across a line break.

    A hyphen directly followed by a newline and a word character is dropped
    together with the newline; the word character is kept.
    """
    hyphen_break = re.compile(r'-\n(\w)', flags=re.M)
    return hyphen_break.sub('\\1', text)
def removeWordWrap (text):
    """Undo hard line wrapping.

    Every newline directly followed by a word character is replaced by a
    single space; newlines followed by anything else are kept.
    """
    wrapped_break = re.compile(r'\n(\w)', flags=re.M)
    return wrapped_break.sub(' \\1', text)
def removeMultiNewlines(text):
    """Collapse runs of four or more (possibly whitespace-padded) newlines into two newlines."""
    blank_run = re.compile(r'(?:\s*\n){4,}', flags=re.M)
    return blank_run.sub('\n\n', text)
## Chain actions to clean a wordwrapped text
def reflow (text):
    """Clean a word-wrapped text in three steps.

    Hyphenated line breaks are joined first, then wrapped lines are merged,
    and finally long runs of newlines are collapsed.
    """
    dehyphenated = removeHyphens(text)
    unwrapped = removeWordWrap(dehyphenated)
    return removeMultiNewlines(unwrapped)
def isClean (line, speller):
    """Decide heuristically whether ``line`` is usable text or noise.

    ``speller`` is either falsy (no spell checking) or an object with an
    ``unknown(words)`` method returning the unrecognized words.

    Returns a tuple ``(clean, reason, misspelled)``: ``clean`` is a bool,
    ``reason`` one of the ``clean_states`` codes and ``misspelled`` the
    collection of unrecognized lemmas (empty when the line was rejected
    before spell checking).

    The checks run in this order; the first that fires decides the result:
    empty line, no ASCII letters, no word tokens, too many unrecognized
    words, too many non-letter characters on a short line, average word
    length below 3.
    """
    if not line:
        debug('**NOISE** ({}): {}'.format(clean_states.EMPTY, line))
        return (False, clean_states.EMPTY, [])

    if re.fullmatch(r'[^a-z]+', line.strip('\n'), flags=re.I):
        debug('**NOISE** ({}): {}'.format(clean_states.NO_LETTERS, line))
        return (False, clean_states.NO_LETTERS, [])

    # Collect the purely alphanumeric tokens and their lemmas.
    tree = simpleTree(line)
    lemmas = set()
    words = set()
    for sentence in tree.sentences:
        for word in sentence.words:
            if re.fullmatch(r'\w+', word.string):
                lemmas.add(word.lemma)
                words.add(word.string)

    misspelled = speller.unknown(lemmas) if speller else set()

    if not words:
        # The line has letters but no word tokens: report NO_WORDS rather
        # than EMPTY, since the line itself is not empty.
        debug('**NOISE** ({}): {}'.format(clean_states.NO_WORDS, line))
        return (False, clean_states.NO_WORDS, [])

    # Short lines (fewer than 8 lemmas) tolerate up to half unknown lemmas;
    # longer lines tolerate up to a third of the words.
    if (len(lemmas) < 8 and len(misspelled) * 2 > len(lemmas)) or \
            (len(lemmas) > 7 and len(misspelled) * 3 > len(words)):
        debug('**NOISE** ({}): {}'.format(clean_states.TOO_MUCH_UNRECOGNIZED_WORDS, line))
        return (False, clean_states.TOO_MUCH_UNRECOGNIZED_WORDS, misspelled)

    letters = re.sub('[^a-z]', '', line, flags=re.I)
    notLetters = re.sub('[a-z]', '', line, flags=re.I)
    if len(line) < 15 and len(notLetters) * 2 > len(letters):
        # This check is about non-letter characters, so it reports the
        # dedicated code instead of TOO_MUCH_UNRECOGNIZED_WORDS.
        debug('**NOISE** ({}): {}'.format(clean_states.TOO_MUCH_NON_ALPHABETICAL_CHARS, line))
        return (False, clean_states.TOO_MUCH_NON_ALPHABETICAL_CHARS, misspelled)

    avgwordlength = sum(len(word) for word in words) / float(len(words))
    if avgwordlength < 3:
        debug('**NOISE** ({}): {}'.format(clean_states.AVERAGE_WORD_LENGTH_TOO_SHORT, line))
        return (False, clean_states.AVERAGE_WORD_LENGTH_TOO_SHORT, misspelled)

    return (True, clean_states.MARKED_CLEAN, misspelled)
def importFile(path, conn):
    """Import one text file into the ``sentences`` table.

    The file is read and reflowed, its language is detected once for the
    whole document, and every resulting line is judged by ``isClean`` and
    stored through ``db.add_sentences`` with ``checked`` False and
    ``classification`` -1. The document name is the base name of ``path``.

    NOTE(review): the file is opened with the platform default encoding and
    exceptions from ``detect`` propagate to the caller -- confirm both are
    intended.
    """
    document = os.path.basename(path)

    # Only the read needs the file handle; close it before the slow parsing.
    with open(path, 'r') as h:
        text = reflow(h.read())

    language = detect(text)
    speller = getSpeller(language)
    sentences = []

    log("Loading: {}".format(path))
    debug("Detected document language {}".format(language))

    for line, sentence in enumerate(text.splitlines()):
        clean, reason, misspelled = isClean(sentence, speller)
        sentences.append({
            'document': document,
            'line': line,
            'sentence': sentence,
            'clean': clean,
            'clean_annotation': reason,
            'checked': False,
            'classification': -1,
            'detected_language': language,
            'misspelled': ','.join(misspelled),
        })
        debug(misspelled)

    # add_sentences returns False after rolling back; only claim success
    # when the rows were really stored.
    if db.add_sentences(conn, sentences):
        log("Inserted {} sentences".format(len(sentences)))
    else:
        log("Failed to insert sentences of {}".format(path))
def getSpeller(language):
    """Return the cached ``SpellChecker`` for ``language``, creating it on first use.

    When the checker cannot be created, ``None`` is cached and returned, so
    the failing construction is not retried for every file.
    """
    if language not in spellers:
        try:
            spellers[language] = SpellChecker(language)
        except Exception:
            # Best effort: continue without spell checking for this language.
            # A bare except would also swallow KeyboardInterrupt/SystemExit.
            spellers[language] = None
    return spellers[language]
def importFolder(path):
    """Import every ``*.txt`` file directly inside ``path``.

    All files share one database connection, which is closed when the import
    finishes or is aborted by an exception.
    """
    log('Importing folder {}'.format(path))
    conn = db.connect()
    try:
        for fp in glob.glob(os.path.join(path, '*.txt')):
            importFile(fp, conn)
    finally:
        # The connection was previously left open.
        conn.close()
if __name__ == '__main__':
    # Script entry point: make sure the table exists, then import the raw data.
    log('Starting import')
    # init_table requires a connection; calling it without one raises TypeError.
    setup_conn = db.connect()
    try:
        db.init_table(setup_conn)
    finally:
        setup_conn.close()
    importFolder(settings.RAW_DATA_PATH)
    log('Finished, exporting')
\ No newline at end of file
# Path of the SQLite database file opened by the db module.
DATABASE = "/home/gijs/Documents/Bedrijf/Projecten/Algolit/mons/git/exhibition/classification_utils/mundaneum.db"

# Folder whose *.txt files are imported.
RAW_DATA_PATH = "/home/gijs/Documents/Bedrijf/Projecten/Algolit/mons/git/data/re-ocred"

# When true, utils.debug prints its messages.
DEBUG = False

# Labels of the ten classes; the list index equals the leading digit of the label.
CLASSES = [
    "0 - Science and Knowledge. Organization. Computer Science. Information Science. Documentation. Librarianship. Institutions. Publications",
    "1 - Philosophy. Psychology",
    "2 - Religion. Theology",
    "3 - Social Sciences",
    "4 - vacant",
    "5 - Mathematics. Natural Sciences",
    "6 - Applied Sciences. Medicine, Technology",
    "7 - The Arts. Entertainment. Sport",
    "8 - Linguistics. Literature",
    "9 - Geography. History",
]
from . import settings
def debug (text):
    """Print ``text`` prefixed with "Debug : ", but only when ``settings.DEBUG`` is truthy."""
    if not settings.DEBUG:
        return
    message = "Debug : {}".format(text)
    print(message)
def log (text):
    """Print ``text`` prefixed with "Log : "."""
    message = "Log : {}".format(text)
    print(message)
def error (text):
    """Print ``text`` prefixed with "Error : "."""
    message = "Error : {}".format(text)
    print(message)
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment