Commits (2)
"""
Script to inspect database coverage for all words that can be replaced
"""
from db import connect, find_one_word

conn = connect()

with open('../data/queneau_words.txt', 'r') as source:
    for word in source:
        # A word is covered if the lexicon contains it as a noun (tag NOM)
        if not find_one_word(conn, [('tag', 'NOM'), ('word', word.strip())]):
            print('Can not find word: {}'.format(word.strip()))
            # Check whether the word exists under another POS tag
            data = find_one_word(conn, [('word', word.strip())])
            if data:
                print('but found')
                for k in data.keys():
                    print('{}: {}'.format(k, data[k]))
                print()
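Note: the script above assumes that db.connect returns an open connection and that db.find_one_word takes a list of (column, value) pairs and returns a single matching row as a dict (or None). As an illustration only, such a helper could look roughly like the sketch below over SQLite; the database path and table name are guesses, not the project's actual db module:

    import sqlite3

    def connect(path='../data/lexicon.db'):
        # Hypothetical path and schema; the real db module may differ.
        conn = sqlite3.connect(path)
        conn.row_factory = sqlite3.Row
        return conn

    def find_one_word(conn, filters):
        # Build a WHERE clause from the (column, value) pairs and return one row as a dict.
        clause = ' AND '.join('{} = ?'.format(column) for column, _ in filters)
        values = [value for _, value in filters]
        row = conn.execute('SELECT * FROM words WHERE ' + clause, values).fetchone()
        return dict(row) if row else None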
@@ -37,7 +37,7 @@ def extract_rhyme (syllables):
-with open('../data/Lexique.csv', 'r') as source:
+with open('../data/Lexique383_met_alles.csv', 'r') as source:
     reader = csv.reader(source)
     conn = connect()
@@ -64,7 +64,7 @@ with open('../data/Lexique.csv', 'r') as source:
         words.append({
             'word': row[0],
-            'lemma': row[1],
+            'lemma': row[2],
             'tag': row[3],
             'gender': row[4],
             'plural': True if row[5] == 'p' else False,
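The row[1] → row[2] change is consistent with the standard Lexique 3.83 column order (ortho, phon, lemme, cgram, genre, nombre, ...), where column 1 is the phonetic form and column 2 the lemma. Assuming the new file keeps that layout, the fields used by the import loop map as follows (a sketch, not the repository's actual code):

    import csv

    # Assumed Lexique 3.83 layout: 0 = ortho, 1 = phon, 2 = lemme, 3 = cgram, 4 = genre, 5 = nombre
    with open('../data/Lexique383_met_alles.csv', 'r') as source:
        for row in csv.reader(source):
            word, lemma, tag = row[0], row[2], row[3]
            gender, plural = row[4], row[5] == 'p'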
@@ -4,7 +4,7 @@ import numpy as np
 print('Loading model')
 embeddings = KeyedVectors.load('gensimW2V/mariemont.wv')
-classifier = joblib.load('meta_classifier.pkl')
+classifier = joblib.load('../data/nature_classifier.pkl')
 # for key, label in enumerate(model.vocab):
 #     vector = model.vectors[key]
@@ -22,10 +22,17 @@ def load_queneau_replacement_words ():
 def find_replacement(word, should_rhyme=False, nature_score=None):
     # Find information about this word
-    word_data = find_one_word(conn, [('tag', 'NOM'), ('word', word)])
+    tag = 'NOM'
+    word_data = find_one_word(conn, [('tag', tag), ('word', word)])
     if not word_data:
+        # Try without POS tag
+        word_data = find_one_word(conn, [('word', word)])
         # print('Coud not find word: {}'.format(word))
+        if word_data:
+            tag = word_data['tag']
     if word_data:
         # Find words with an equal amount of syllables and the same tag
-        parameters = [ ('tag', 'NOM'),
+        parameters = [ ('tag', tag),
                        ('gender', word_data['gender']),
                        ('plural', word_data['plural']),
                        ('syllable_count', word_data['syllable_count'])]
@@ -19,8 +19,8 @@ def load_lexicon(filename):
 embeddings = KeyedVectors.load('gensimW2V/mariemont.wv')
-pos_words = load_lexicon('../data/synonymes_nature.txt')
-neg_words = load_lexicon('../data/synonymes_technique.txt')
+pos_words = load_lexicon('../data/synonymes_nature_clean.txt')
+neg_words = load_lexicon('../data/synonymes_technique_clean.txt')
 # pos_words = load_lexicon('../data/nature_mots.txt')
 # neg_words = load_lexicon('../data/technique_mots.txt')
@@ -63,8 +63,8 @@ neg_labels, neg_vectors = find_embeddings(embeddings, neg_words)
 #Those words end up with rows full of NaN to indicate their missing embeddings, so we use .dropna() to remove them.
 # pos_vectors = embeddings.loc[pos_words].dropna()
 # neg_vectors = embeddings.loc[neg_words].dropna()
-print(len(pos_vectors))
-print(len(neg_vectors))
+print('Nature words: {}'.format(len(pos_vectors)))
+print('Technique words: {}'.format(len(neg_vectors)))
 '''
 Now we make arrays of the desired inputs and outputs.
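find_embeddings itself is not part of this diff; presumably it keeps only the lexicon words that actually have a vector in the word2vec model, which is why the pandas-style .dropna() calls above can stay commented out. A rough sketch under that assumption:

    def find_embeddings(embeddings, words):
        # Keep only words present in the KeyedVectors vocabulary and collect their vectors.
        labels, vectors = [], []
        for word in words:
            if word in embeddings:
                labels.append(word)
                vectors.append(embeddings[word])
        return labels, vectors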
@@ -107,4 +107,4 @@ score = (accuracy_score(model.predict(test_vectors), test_targets))
 print(score)
 from sklearn.externals import joblib
-joblib.dump(model, 'meta_classifier.pkl')
\ No newline at end of file
+joblib.dump(model, '../data/nature_classifier.pkl')
\ No newline at end of file
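One caveat: sklearn.externals.joblib is deprecated and has been removed from recent scikit-learn releases, so both this dump and the load in the prediction script may need the standalone joblib package instead, for example:

    import joblib  # standalone package, drop-in replacement for sklearn.externals.joblib

    joblib.dump(model, '../data/nature_classifier.pkl')        # in the training script
    classifier = joblib.load('../data/nature_classifier.pkl')  # in the prediction script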