# PyLing meeting, Feb 8 2017
# Following this tutorial on scikit-learn.org: http://scikit-learn.org/dev/tutorial/text_analytics/working_with_text_data.html
# Adapted to use the NLTK movie_reviews corpus instead of the tutorial's dataset.
# Also, check out the documentation on dataset loading: http://scikit-learn.org/stable/datasets/
import sklearn
from sklearn.datasets import load_files
# Path to the NLTK movie_reviews corpus on local disk.
# NOTE(review): Windows-specific absolute path — adjust for your machine.
moviedir = r'D:\Lab\nltk_data\corpora\movie_reviews'
# Load all files under moviedir as training data; shuffle=True mixes the
# class folders so a later train/test split is not ordered by label.
movie_train = load_files(moviedir, shuffle=True)
# Number of documents loaded.
len(movie_train.data)
# target names ("classes") are automatically generated from subfolder names
movie_train.target_names
# First file seems to be about a Schwarzenegger movie; peek at first 500 chars.
movie_train.data[0][:500]
# first file is in "neg" folder
movie_train.filenames[0]
# first file is a negative review and is mapped to 0 index 'neg' in target_names
movie_train.target[0]
# import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
# Turn off pretty printing of Jupyter notebook... it generates long lines.
# %pprint is an IPython line magic and is a SyntaxError in a plain .py file,
# so it is commented out here; run it manually inside Jupyter/IPython.
# %pprint
import nltk
# Three toy sentences used to demonstrate vectorization.
sents = ['A rose is a rose is a rose is a rose.',
'Oh, what a fine day it is.',
"It ain't over till it's over, I tell you!!"]
# Initialize a CountVectorizer to use NLTK's tokenizer instead of its
# default one (which ignores punctuation and stopwords).
# Minimum document frequency set to 1, i.e. keep every token.
foovec = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)
# Fit the vocabulary on sents and turn them into a sparse matrix of
# per-document word frequency counts.
sents_counts = foovec.fit_transform(sents)
# foovec now contains a vocab dictionary which maps unique words to indexes
foovec.vocabulary_
# sents_counts has a dimension of 3 (document count) by 19 (# of unique words)
sents_counts.shape
# this matrix is small enough to view in full!
sents_counts.toarray()
# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
# Learn IDF weights from the toy counts and rescale them in one step.
sents_tfidf = tfidf_transformer.fit_transform(sents_counts)
# Inspect the TF-IDF values:
# raw counts have been normalized against document length, and
# terms that are found across many docs are weighted down.
sents_tfidf.toarray()
# initialize movie_vec object, and then turn the movie train data into a count matrix
movie_vec = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize) # use all 25K words. 82.2% acc.
# movie_vec = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features = 3000) # use top 3000 words only. 78.5% acc.
movie_counts = movie_vec.fit_transform(movie_train.data)
# 'screen' is found in the corpus, mapped to index 19637
movie_vec.vocabulary_.get('screen')
# Likewise, Mr. Steven Seagal is present...
movie_vec.vocabulary_.get('seagal')
# huge dimensions! 2,000 documents, 25K unique terms.
movie_counts.shape
# Convert raw frequency counts into TF-IDF values.
# NOTE(review): rebinding tfidf_transformer here — the instance fitted on the
# toy sentences above is discarded.
tfidf_transformer = TfidfTransformer()
movie_tfidf = tfidf_transformer.fit_transform(movie_counts)
# Same dimensions, now with tf-idf values instead of raw frequency counts
movie_tfidf.shape
# Now ready to build a classifier.
# We will use Multinomial Naive Bayes as our model.
from sklearn.naive_bayes import MultinomialNB
# Import accuracy_score explicitly: `import sklearn` alone does not load the
# sklearn.metrics submodule, so `sklearn.metrics.accuracy_score` can raise
# AttributeError depending on which other submodules happen to be imported.
from sklearn.metrics import accuracy_score, confusion_matrix

# Split data into training and test sets (80/20, fixed seed for repeatability).
# from sklearn.cross_validation import train_test_split # deprecated in 0.18
from sklearn.model_selection import train_test_split
docs_train, docs_test, y_train, y_test = train_test_split(
    movie_tfidf, movie_train.target, test_size = 0.20, random_state = 12)

# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB().fit(docs_train, y_train)

# Predict the test set results, report accuracy
y_pred = clf.predict(docs_test)
accuracy_score(y_test, y_pred)

# Making the confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)
cm
# Tiny, hand-written (and fake) movie reviews to sanity-check the classifier.
reviews_new = ['This movie was excellent', 'Absolute joy ride',
'Steven Seagal was terrible', 'Steven Seagal shined through.',
'This was certainly a movie', 'Two thumbs up', 'I fell asleep halfway through',
"We can't wait for the sequel!!", '!', '?', 'I cannot recommend this highly enough',
'instant classic.', 'Steven Seagal was amazing. His performance was Oscar-worthy.']

# Reuse the fitted vectorizer and TF-IDF transformer — transform only,
# never fit, so the new texts land in the training feature space.
reviews_new_counts = movie_vec.transform(reviews_new)
reviews_new_tfidf = tfidf_transformer.transform(reviews_new_counts)

# Have the classifier predict a class for each review, then report
# each text alongside its predicted class name.
pred = clf.predict(reviews_new_tfidf)
for text, label in zip(reviews_new, pred):
    print('%r => %s' % (text, movie_train.target_names[label]))
# Mr. Seagal simply cannot win!