>>> from nltk.corpus import movie_reviews
>>> movie_reviews.words()
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
>>> movie_reviews.words()[:50]
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch']
>>> movie_reviews.words()[-50:]
['with', 'interesting', 'stories', 'set', 'in', 'interesting', 'worlds', '.', 'but', 'neither', 'film', 'really', 'felt', 'like', 'it', 'capitalized', 'on', 'all', 'the', 'great', 'ideas', ';', 'neither', 'film', '"', 'clicked', '"', 'and', 'became', 'an', 'instant', 'classic', '.', 'nevertheless', ',', 'i', 'look', 'forward', 'to', 'niccol', "'", 's', 'next', 'film', ',', 'whatever', 'it', 'may', 'be', '.']
>>> len(movie_reviews.words())
1583820

#### Pretty big corpus at 1.6 million words


>>> len(movie_reviews.fileids())
2000
>>> movie_reviews.fileids()[:10]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']
>>> movie_reviews.fileids()[-10:]
['pos/cv990_11591.txt', 'pos/cv991_18645.txt', 'pos/cv992_11962.txt', 'pos/cv993_29737.txt', 'pos/cv994_12270.txt', 'pos/cv995_21821.txt', 'pos/cv996_11592.txt', 'pos/cv997_5046.txt', 'pos/cv998_14111.txt', 'pos/cv999_13106.txt']

#### 2,000 total files (=reviews)


>>> movie_reviews.raw('pos/cv990_11591.txt')[:1000]
'the relaxed dude rides a roller coaster \nthe big lebowski a film review by michael redman copyright 1998 by michael redman \nthe most surreal situations are ordinary everyday life as viewed by an outsider . \nwhen those observers are joel and ethan coen , the surreal becomes bizarre . \nwhen the life is that of jeff " the dude " leboswki , the bizarre falls over the edge into the world of " what\'sgoingonaroundhere " . \nthe marvelous sound of " the stranger " ( sam elliot ) \'s voice-over introduces the film . \nat least it does until he forgets what he was going to say and gives up . \nthe dude ( jeff bridges ) is described as the " laziest man in los angeles , possibly the world " , although he\'s not so much slothful as he is relaxed . \nspending the last 30 years with a roach clip in one hand and a white russian in the other , he doesn\'t have much of a life , but he\'s having a good time . \nwhen asked what he does for fun , he responds " bowl , drive around and the occasional acid flashback '

#### This positive review sounds fairly positive indeed.
#### Text is already in lower case.


>>> movie_reviews.categories()
['neg', 'pos']
>>> len(movie_reviews.fileids('neg'))
1000
>>> len(movie_reviews.fileids('pos'))
1000

#### Half positive, and half negative!



>>> documents = [(list(movie_reviews.words(fileid)), category)
...              for category in movie_reviews.categories()
...              for fileid in movie_reviews.fileids(category)]

#### Building documents list through nested list comprehension.
#### It is for-looping through the list of categories (a short one of just ['neg', 'pos']),
#### and then for-looping through all file IDs belonging to the category, and then finally
#### creating a tuple of (review tokens, category) which populates the documents list.


>>> import random
>>> random.shuffle(documents)

#### Shuffle the document list to mix up negative and positive reviews


>>> len(documents)
2000
>>> documents[0]
(['the', 'rapid', '-', 'fire', 'formula', 'that', 'worked', 'so', 'well', 'in', 'airplane', '!', ',', 'the', '"', 'police', 'squad', '!', '"', 'television', 'series', ',', 'top', 'secret', '!', ',', 'three', 'naked', 'gun', 'films', ',', 'and', 'two', 'hot', 'shots', '!', 'movies', 'has', 'finally', 'reached', 'a', 'desperate', 'dead', '-', 'end', 'with', 'spy', 'hard', '.', 'even', 'ezio', 'gregio', "'", 's', 'the', 'silence', 'of', 'the', 'hams', 'is', 'arguably', 'funnier', 'than', 'this', 'over', '-', 'extended', 'spy', '-', 'and', 'action', '-', 'movie', 'spoof', '.', 'leslie', 'nielsen', 'stars', 'as', 'secret', 'agent', 'wd', '-', '40', ',', 'who', 'returns', 'from', 'retirement', 'to', 'battle', 'his', 'old', 'nemesis', ',', 'general', 'rancor', '(', 'a', 'cackling', 'andy', 'griffith', ')', '.', 'the', 'jokes', 'fly', 'in', 'every', 'direction', 'and', 'with', 'hardly', 'a', 'hint', 'of', 'restraint', ',', 'timing', ',', 'or', 'tact', '.', 'most', 'the', 'movie', 'is', 'comprised', 'of', 'recycled', 'airplane', 'and', 'naked', 'gun', 'gags', ',', 'recreated', 'movie', 'sequences', ',', 'and', 'soggy', 'star', 'cameos', '.', 'yeah', ',', 'maybe', 'we', '*', 'do', '*', 'need', 'ray', 'charles', 'driving', 'an', 'l', '.', 'a', '.', 'bus', 'bound', 'for', 'a', 'speed', 'bump', ',', 'but', 'did', 'mr', '.', 't', ',', 'hulk', 'hogan', ',', 'and', 'dr', '.', 'joyce', 'brothers', 'also', 'have', 'to', 'appear', 'in', 'the', 'same', 'movie', '?', 'leslie', 'nielsen', 'plays', 'it', 'straight', 'with', 'his', 'usual', 'dopey', 'flair', '.', 'there', 'is', 'something', 'oddly', 'inspiring', 'about', 'the', 'sight', 'of', 'nielsen', 'wearing', 'a', 'nun', "'", 's', 'habit', ',', 'even', 'the', 'resulting', 'sister', 'act', 'spoof', 'is', 'silly', '.', 'as', 'are', 'the', 'bits', 'directly', 'lifted', 'from', 'home', 'alone', ',', 'pulp', 'fiction', ',', 'true', 'lies', ',', 'etc', '.', 'etc', '.', '(', 'adolescent', 'males', 'might', 'enjoy', 'this', 'mess', ',', 
'tho', '.', 'the', 'butt', 'shots', ',', 'breast', 'peeks', ',', 'penis', 'pokes', ',', 'flatulence', 'gags', ',', 'and', 'related', 'innuendo', 'are', 'right', 'up', 'beavis', 'and', 'butthead', "'", 's', 'alley', '.', ')', 'beyond', 'the', 'hilarious', 'title', 'sequence', 'with', '"', 'weird', '"', 'al', 'yankovich', 'performing', 'the', 'theme', 'song', ',', 'spy', 'hard', 'is', 'barely', 'the', 'stock', 'that', 'it', "'", 's', 'printed', 'on', '.', 'my', 'recommendation', ':', 'duck', 'in', 'while', 'you', "'", 're', 'waiting', 'for', 'another', 'movie', 'to', 'start', '.', 'stay', 'long', 'enough', 'to', 'see', 'the', 'camera', 'dart', 'inside', 'of', 'weird', 'al', "'", 's', 'nostril', 'and', 'then', 'leave', '.', 'you', 'won', "'", 't', 'miss', 'a', 'thing', '.'], 'neg')

#### That's the very first review in the documents list, which is negative.
#### Because of shuffling, you will likely see a different review.
#### Note the tuple structure: ([w1,w2,...], 'neg')


>>> all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

>>> word_features = list(all_words)[:2000]                        # keys are pre-sorted by frequency
>>> word_features = [w for (w,f) in all_words.most_common(2000)]  # if on older (<3.5.1) nltk, use .most_common

>>> word_features[:100]
[',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life']
>>> word_features[-100:]
['jail', 'deals', 'cheesy', 'court', 'beach', 'austin', 'model', 'outstanding', 'substance', 'nudity', 'slapstick', 'joan', 'reveal', 'placed', 'check', 'beast', 'hurt', 'bloody', 'acts', 'fame', 'meeting', 'nuclear', '1996', 'strength', 'center', 'funniest', 'standing', 'damon', 'clich', 'position', 'desire', 'driven', 'seat', 'stock', 'wondering', 'realizes', 'dealing', 'taste', 'routine', 'comparison', 'cinematographer', 'seconds', 'singing', 'gangster', 'responsible', 'football', 'remarkable', 'hunting', 'adams', 'fly', 'suspects', 'treat', 'hopes', 'heaven', 'myers', 'struggle', 'costumes', 'beat', 'happening', 'skills', 'ugly', 'figures', 'thoroughly', 'ill', 'surprises', 'player', 'rival', 'guard', 'anthony', 'strike', 'community', 'streets', 'hopkins', 'ended', 'originally', 'sarah', 'creative', 'characterization', 'thankfully', 'growing', 'sharp', 'williamson', 'eccentric', 'explained', 'hey', 'claire', 'steal', 'inevitable', 'joel', 'core', 'weren', 'sorry', 'built', 'anne', 'breaking', 'villains', 'critic', 'lets', 'visit', 'followed']

#### Top 2,000 frequent words.
#### Top 100 look familiar, bottom 100 not so much.



>>> def document_features(document):
...     document_words = set(document)       # set of word types
...     features = {}
...     for word in word_features:                  # do this for top 2000 words
...         inornot = word in document_words        # returns True/False
...         features['contains({})'.format(word)] = inornot
...     return features

#### A function that turns a tokenized document into a feature dictionary
#### of 2,000 boolean 'contains(word)' entries.


>>> document_features(movie_reviews.words('pos/cv957_8737.txt'))
{'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'contains(all)': True, 'contains(?)': False,
...
 'contains(joel)': False, 'contains(core)': False, 'contains(weren)': False, 'contains(sorry)': False, 'contains(built)': False, 'contains(anne)': False, 'contains(breaking)': False, 'contains(villains)': False, 'contains(critic)': False, 'contains(lets)': False, 'contains(visit)': False, 'contains(followed)': False}

#### Clipped the result in the middle, 2,000 total features for this particular review.
#### Each indicates whether or not a top word is contained in the document.


>>> featuresets = [(document_features(d), c) for (d,c) in documents]

#### Turn each document ([w1,w2,w3,...], pos/neg) into its features while keeping the category
####    each item in featuresets is a tuple: ({features}, pos/neg)


>>> test_set = featuresets[:100]          # setting aside first 100 for testing
>>> train_set = featuresets[100:]         # rest 1900 used for training
>>> classifier = nltk.NaiveBayesClassifier.train(train_set)

#### partition into testing and training sets,
#### then train a Naive Bayes classifier.


>>> print(nltk.classify.accuracy(classifier, test_set))
0.83
>>> classifier.show_most_informative_features(10)
Most Informative Features
   contains(outstanding) = True              pos : neg    =     14.0 : 1.0
         contains(mulan) = True              pos : neg    =      8.3 : 1.0
        contains(seagal) = True              neg : pos    =      7.8 : 1.0
   contains(wonderfully) = True              pos : neg    =      6.5 : 1.0
         contains(flynt) = True              pos : neg    =      5.6 : 1.0
         contains(damon) = True              pos : neg    =      5.6 : 1.0
        contains(poorly) = True              neg : pos    =      5.5 : 1.0
        contains(wasted) = True              neg : pos    =      5.4 : 1.0
          contains(lame) = True              neg : pos    =      5.2 : 1.0
         contains(awful) = True              neg : pos    =      5.2 : 1.0

#### Accuracy of 0.83. Not bad I suppose!
#### Top 10 most informative features. Pretty telling.


>>> myreview = """Mr. Matt Damon was outstanding, fantastic, excellent, wonderfully
subtle, superb, terrific, and memorable in his portrayal of Mulan."""

>>> myreview_toks = nltk.word_tokenize(myreview.lower())
>>> myreview_toks
['mr.', 'matt', 'damon', 'was', 'outstanding', ',', 'fantastic', ',', 'excellent', ',', 'wonderfully', 'subtle', ',', 'superb', ',', 'terrific', ',', 'and', 'memorable', 'in', 'his', 'portrayal', 'of', 'mulan', '.']
>>> myreview_feats = document_features(myreview_toks)
>>> classifier.classify(myreview_feats)
'pos'
>>> classifier.prob_classify(myreview_feats).prob('pos')
0.916761678831542
>>> classifier.prob_classify(myreview_feats).prob('neg')
0.0832383211684841

#### Matt Damon and a whole lot of positive sounding words.
#### Classified 'pos', 92% probability for 'pos'.


>>> myreview = """Mr. Steven Seagal was outstanding, fantastic, excellent, wonderfully
subtle, superb, terrific, and memorable in his portrayal of Mulan."""

>>> myreview_toks = nltk.word_tokenize(myreview.lower())
>>> myreview_feats = document_features(myreview_toks)
>>> classifier.classify(myreview_feats)
'neg'
>>> classifier.prob_classify(myreview_feats).prob('pos')
0.0921833172550908
>>> classifier.prob_classify(myreview_feats).prob('neg')
0.9078166827449369

#### Steven Seagal against a whole lot of positive sounding words.
#### Classified 'neg' this time, with only 9% probability for 'pos'.



>>> myreview = "Mr. Matt Damon was outstanding, fantastic."
>>> myreview_toks = nltk.word_tokenize(myreview.lower())
>>> myreview_feats = document_features(myreview_toks)
>>> classifier.classify(myreview_feats)
'neg'
>>> classifier.prob_classify(myreview_feats).prob('pos')
1.0146531638398488e-05
>>> classifier.prob_classify(myreview_feats).prob('neg')
0.9999898534683809
>>>

#### OK, so this was surprising. The classifier is dead sure this review is negative.
#### Why: with so few tokens, nearly all 2,000 contains(...) features come out False,
#### and the *absence* of common words is itself evidence the Naive Bayes model
#### weighs -- here it overwhelms the two positive words and pushes toward 'neg'.