Commit 353f1a6f authored by ana's avatar ana

inspecting authorship recognition model made with naive bayes, exports to text...

Inspecting the authorship-recognition model built with naive Bayes; exports to text files. There are still some issues in export number 3.
parent 60cc5fad
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
documents/Moby Dick.txt
======================
classifications/Franz Kafka.txt 1.5247024730492327
classifications/Herman Melville.txt 1.9382362696411972
classifications/James Joyce.txt 1.3673681871109242
documents/The Trial.txt
======================
classifications/Franz Kafka.txt 1.8255517800461223
classifications/Herman Melville.txt 1.489485418454495
classifications/James Joyce.txt 1.4494049644021938
Moby Dick: Herman Melville
The Trial: Franz Kafka
'''Multinomial naive Bayes text classifier.'''
from collections import Counter
from math import log
import util
def create_frequency_dictionaries(D):
    '''Return a dict mapping each file path in D to a Counter of word counts.

    Relies on the module-level globals ``stopchars`` (passed through to
    util._get_features) and ``stopwords`` (removed from every Counter).

    Parameters
    ----------
    D : iterable of str
        File paths, typically yielded by util._yield_name().

    Returns
    -------
    dict
        {file path: Counter of word -> frequency}, stopwords removed.
    '''
    value = {}
    # look at all files yielded for the folder
    for d in D:
        # skip falsy entries (e.g. empty paths from the glob)
        if d:
            # print name of file (no extension)
            print(f'{util._name(d)}')
            # tokenize the file into a word list, stripping stopchars
            text = util._get_features(d, stopchars)
            # count frequency for each word
            value[d] = Counter(text)
            # remove stopwords; Counter.__delitem__ ignores missing keys,
            # so absent stopwords do not raise KeyError (same behavior as
            # the original side-effect list comprehension, but idiomatic)
            for w in stopwords:
                del value[d][w]
            # print total amount of features/vocabulary
            print(f'\t{len(value[d])} features')
            util._percent(value[d])
    return value
# write output in files
# Output files for the five processing stages. Opened in append mode, so
# repeated runs accumulate output — NOTE(review): confirm whether each run
# should instead start fresh ('w').
frequencies_documents = open('0_frequencies_documents.txt', 'a')
frequencies_classifications = open('1_frequencies_classifications.txt', 'a')
compare_values = open('2_compare_values.txt', 'a')
compare_dictionary = open('3_compare_dictionary.txt', 'a')
final_results= open('4_final_results.txt', 'a')
# remove endlines & stopwords
# 'P' is presumably a category code interpreted by util._remove_unicode
# (e.g. Unicode punctuation) — TODO confirm against util.
stopchars = 'P'
extension = '*.txt'
stopchars = util._remove_unicode(stopchars)
stopwords = util._get_words("stopwords")
# find frequency count documents
path = "documents"
D = util._yield_name(path, extension)
#print(D) #<generator object _iglob at 0x7f577d57c308>
# create a dictionary with name file as key and frequency dictionary as value, for each of the documents
valuesD = create_frequency_dictionaries(D)
# Export 0: per-document word counts — filename header, then one
# "word<tab><tab>count" line per word, blank line between documents.
for doc_name, counts in valuesD.items():
    frequencies_documents.write(f"{doc_name}\n")
    for word, count in counts.items():
        frequencies_documents.write(f"{word}\t\t{count}\n")
    frequencies_documents.write("\n")
# Repeat the frequency count for the classification (author) corpora.
path = "classifications"
C = util._yield_name(path, extension)
# build {classification file: Counter of word -> frequency}
valuesC = create_frequency_dictionaries(C)
# Export 1: per-author word counts in the same "word<tab><tab>count" layout.
for class_name, counts in valuesC.items():
    frequencies_classifications.write(f"{class_name}\n")
    for word, count in counts.items():
        frequencies_classifications.write(f"{word}\t\t{count}\n")
    frequencies_classifications.write("\n")
# Score every document against every classification (author) corpus using
# multinomial naive Bayes in log space.
compare = {}
# for each document in prediction folder
for d in valuesD:
    compare[d] = {}
    # for each author corpus in classification folder
    for c in valuesC:
        # per-word log contributions for this (document, author) pair
        compare[d][c] = {}
        # for each word in the document's frequency dictionary
        for w in valuesD[d]:
            # Export 2: trace the raw and log-space values per word.
            compare_values.write(d+"\n")
            compare_values.write(w+"\n")
            compare_values.write("value word d " + str(valuesD[d][w])+"\n")
            compare_values.write("value word c " + str(valuesC[c][w])+"\n")
            compare_values.write("log word Document+1 * " + str(log(valuesD[d][w] + 1))+"\n")
            compare_values.write("log word Classification+1 * " + str(log(valuesC[c][w] + 1))+"\n")
            compare_values.write("\n")
            # Since probabilities are small, we operate in log space.
            # We add 1 to prevent computing log(0), which is undefined.
            # valuesC[c] is a Counter, so a word absent from the author's
            # corpus contributes log(0 + 1) == 0.
            compare[d][c][w] = log(valuesD[d][w] + 1) + log(valuesC[c][w] + 1)
        # collapse the per-word dict into a single score for this author
        compare[d][c] = sum(compare[d][c].values())
# Export 3: write the score table only after every (document, author) pair
# has been summed. Previously this dump ran inside the loops, so it mixed
# un-summed per-word dictionaries with finished scores — the "issues in
# export number 3". (The unused name/score locals were removed as well.)
for v, i in compare.items():
    compare_dictionary.write(v+"\n")
    compare_dictionary.write("======================\n")
    for a, b in i.items():
        compare_dictionary.write(a+"\n")
        compare_dictionary.write(str(b))
        compare_dictionary.write("\n")
    compare_dictionary.write("\n")
# Export 4: per-document block with a "author<tab>score" line for every
# classification file, blank line between documents.
for doc_path, author_scores in compare.items():
    final_results.write(doc_path + "\n")
    final_results.write("======================\n")
    for author_path, score in author_scores.items():
        final_results.write(f"{author_path}\t{score}\n")
    final_results.write("\n")
# returns highest value of compare[d][c] as predicted author
# #@staticmethod
# def get_classifications(compare, verbose=False):
# '''
# Return dictionary with predicted classifications.
# Parameters
# ----------
# compare : dict
# Classification likelihoods obtained from Unigram.compare().
# Returns
# -------
# prediction : dict
# Predicted classification for each document.
# '''
# Predict each document's author as the classification with the highest
# accumulated log score, appending "Document: Author" lines to export 4.
prediction = {}
for d in compare:
    prediction[d] = util._max(compare[d])
    final_results.write(f'{util._name(d)}: {util._name(prediction[d])}\n')
# release every output file handle
frequencies_documents.close()
frequencies_classifications.close()
compare_values.close()
compare_dictionary.close()
final_results.close()
\ No newline at end of file
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment