Commit 773f5ef4 authored by ana's avatar ana

adding naive bayes exercise: sentiment analysis model using bag-of-words and nltk

parent 5af05e84
# following this tutorial: https://pythonprogramming.net/text-classification-nltk-tutorial/
# NLTK corpus movie_reviews data set has the reviews, and they are labeled already as positive or negative.
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
# Build one (word_list, label) pair per review. The corpus is already
# word-tokenized and every file ID belongs to a category (pos or neg), so we
# walk each category, then each file ID inside it, and keep the review's
# words together with that category label.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        review_words = list(movie_reviews.words(fileid))
        documents.append((review_words, category))

# Shuffle before any train/test split. Left in corpus order, we would likely
# train on all of the negatives plus some positives and then test only
# against positives.
random.shuffle(documents)
# Example of one document: a tuple whose first element is the list of words
# and whose second element is the "pos" or "neg" label.
#print(documents[1])
# print(len(documents))
# Count how often each lowercased word occurs across the whole corpus.
# FreqDist consumes the word stream directly and yields a word -> count
# mapping, from which the most common words can be looked up.
all_words = nltk.FreqDist(token.lower() for token in movie_reviews.words())
# return 15 most common words
#print(all_words.most_common(15))
# return the number of occurrences of one particular word
#print(all_words["stupid"])
#### see in how many positively labeled documents 'stupid' appears
# Create word features from the 3,000 most common words.
# Slicing all_words.keys() does not do this: the keys are not ordered by
# frequency, so the first 3,000 keys are just the first 3,000 distinct words
# encountered. most_common() returns (word, count) pairs sorted by count.
# NOTE(review): a classifier pickled while the old key-slice feature list was
# in use should be retrained so its feature names match this vocabulary.
word_features = [word for word, _count in all_words.most_common(3000)]
#print('word features:', word_features)
# Function that builds the feature dict for one document: for every word in
# word_features, record True if the word occurs in the document, else False.
def find_features(document, vocabulary=None):
    """Map each vocabulary word to whether it occurs in ``document``.

    Args:
        document: An iterable of word tokens, or a raw string. A raw string
            is lowercased and split on whitespace first; otherwise ``set()``
            would break it into single characters and no multi-character
            word could ever match.
        vocabulary: Iterable of words to use as feature names. Defaults to
            the module-level ``word_features`` list.

    Returns:
        A dict with one ``word: bool`` entry per vocabulary word.
    """
    if isinstance(document, str):
        # The vocabulary is built from lowercased tokens, so normalise case
        # before splitting the text into words.
        document = document.lower().split()
    if vocabulary is None:
        vocabulary = word_features
    # A set gives constant-time membership tests for the lookups below.
    words = set(document)
    return {word: (word in words) for word in vocabulary}
# #print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))
# Pair every review's feature dict with its pos/neg label.
featuresets = []
for review_words, label in documents:
    featuresets.append((find_features(review_words), label))
# #print('featuresets:', featuresets)
# # split in train & test data
# # set that we'll train our classifier with
# training_set = featuresets[:1800]
# # set that we'll test against.
# testing_set = featuresets[1800:]
# # define classifier & train
# classifier = nltk.NaiveBayesClassifier.train(training_set)
# # test
# print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
# # most valuable words
# classifier.show_most_informative_features(15)
# # save classifier
# save_classifier = open("naivebayes.pickle","wb")
# pickle.dump(classifier, save_classifier)
# save_classifier.close()
# Load the previously saved classifier. The context manager closes the file
# even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load a naivebayes.pickle that comes from a trusted source.
with open("naivebayes.pickle", "rb") as classifier_f:
    classifier = pickle.load(classifier_f)

sentence = "This film is absolutely one of marvels best movies, but that does not mean that it is a child friendly film . This film has things ranging from violent stabbings to (at times) quite a sad story . There are many over the top fight scenes in this film with many stabbings but these are done without great detail , but still show realistic blood wounds . There is a lot of blood in this film but all of it is dried so nothing to shocking with the blood is shown . The fight scenes can also be very violent with lots of bashing in them and lots of dried blood shown and the baddie in one scene repeatedly kicks the hero even when he's given up , but this scene is very short and no blood is shown in this part . There are many deaths by guns in this film including a scene where a man shoots through his girlfriend to get to his enemy and then repeatedly shoots him , but again no great detail is shown . There's a part where someone's neck is cut , but no detail at all is shown and it is very quick . In another scene someone takes a bullet and quite a detailed wound is shown . The film has quite a lot of swear words in it , but it's only the s word and a middle finger is shown at one point."

# find_features builds a set from its argument, so it needs word tokens:
# a raw string would be reduced to its individual characters. Lowercase the
# text (the word features are lowercased) and split it into word and
# punctuation tokens before extracting features.
sentence_words = nltk.tokenize.wordpunct_tokenize(sentence.lower())
print(classifier.classify(find_features(sentence_words)))
classifier.show_most_informative_features(15)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment