>>> from nltk.corpus import movie_reviews >>> movie_reviews.words() ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...] >>> movie_reviews.words()[:50] ['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party', ',', 'drink', 'and', 'then', 'drive', '.', 'they', 'get', 'into', 'an', 'accident', '.', 'one', 'of', 'the', 'guys', 'dies', ',', 'but', 'his', 'girlfriend', 'continues', 'to', 'see', 'him', 'in', 'her', 'life', ',', 'and', 'has', 'nightmares', '.', 'what', "'", 's', 'the', 'deal', '?', 'watch'] >>> movie_reviews.words()[-50:] ['with', 'interesting', 'stories', 'set', 'in', 'interesting', 'worlds', '.', 'but', 'neither', 'film', 'really', 'felt', 'like', 'it', 'capitalized', 'on', 'all', 'the', 'great', 'ideas', ';', 'neither', 'film', '"', 'clicked', '"', 'and', 'became', 'an', 'instant', 'classic', '.', 'nevertheless', ',', 'i', 'look', 'forward', 'to', 'niccol', "'", 's', 'next', 'film', ',', 'whatever', 'it', 'may', 'be', '.'] >>> len(movie_reviews.words()) 1583820 #### Pretty big corpus at 1.6 million words >>> len(movie_reviews.fileids()) 2000 >>> movie_reviews.fileids()[:10] ['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt'] >>> movie_reviews.fileids()[-10:] ['pos/cv990_11591.txt', 'pos/cv991_18645.txt', 'pos/cv992_11962.txt', 'pos/cv993_29737.txt', 'pos/cv994_12270.txt', 'pos/cv995_21821.txt', 'pos/cv996_11592.txt', 'pos/cv997_5046.txt', 'pos/cv998_14111.txt', 'pos/cv999_13106.txt'] #### 2,000 total files (=reviews) >>> movie_reviews.raw('pos/cv990_11591.txt')[:1000] 'the relaxed dude rides a roller coaster \nthe big lebowski a film review by michael redman copyright 1998 by michael redman \nthe most surreal situations are ordinary everyday life as viewed by an outsider . \nwhen those observers are joel and ethan coen , the surreal becomes bizarre . 
\nwhen the life is that of jeff " the dude " leboswki , the bizarre falls over the edge into the world of " what\'sgoingonaroundhere " . \nthe marvelous sound of " the stranger " ( sam elliot ) \'s voice-over introduces the film . \nat least it does until he forgets what he was going to say and gives up . \nthe dude ( jeff bridges ) is described as the " laziest man in los angeles , possibly the world " , although he\'s not so much slothful as he is relaxed . \nspending the last 30 years with a roach clip in one hand and a white russian in the other , he doesn\'t have much of a life , but he\'s having a good time . \nwhen asked what he does for fun , he responds " bowl , drive around and the occasional acid flashback ' #### This positive review sounds fairly positive indeed. #### Text is already in lower case. >>> movie_reviews.categories() ['neg', 'pos'] >>> len(movie_reviews.fileids('neg')) 1000 >>> len(movie_reviews.fileids('pos')) 1000 #### Half positive, and half negative! >>> documents = [(list(movie_reviews.words(fileid)), category) ... for category in movie_reviews.categories() ... for fileid in movie_reviews.fileids(category)] #### Building documents list through nested list comprehension. #### It is for-looping through the list of categories (a short one of just ['neg', 'pos']), #### and then for-looping through all file IDs belonging to the category, and then finally #### creating a tuple of (review tokens, category) which populates the documents list. 
>>> import random >>> random.shuffle(documents) #### Shuffle the document list, mix up negative and positive reviews >>> len(documents) 2000 >>> documents[0] (['the', 'rapid', '-', 'fire', 'formula', 'that', 'worked', 'so', 'well', 'in', 'airplane', '!', ',', 'the', '"', 'police', 'squad', '!', '"', 'television', 'series', ',', 'top', 'secret', '!', ',', 'three', 'naked', 'gun', 'films', ',', 'and', 'two', 'hot', 'shots', '!', 'movies', 'has', 'finally', 'reached', 'a', 'desperate', 'dead', '-', 'end', 'with', 'spy', 'hard', '.', 'even', 'ezio', 'gregio', "'", 's', 'the', 'silence', 'of', 'the', 'hams', 'is', 'arguably', 'funnier', 'than', 'this', 'over', '-', 'extended', 'spy', '-', 'and', 'action', '-', 'movie', 'spoof', '.', 'leslie', 'nielsen', 'stars', 'as', 'secret', 'agent', 'wd', '-', '40', ',', 'who', 'returns', 'from', 'retirement', 'to', 'battle', 'his', 'old', 'nemesis', ',', 'general', 'rancor', '(', 'a', 'cackling', 'andy', 'griffith', ')', '.', 'the', 'jokes', 'fly', 'in', 'every', 'direction', 'and', 'with', 'hardly', 'a', 'hint', 'of', 'restraint', ',', 'timing', ',', 'or', 'tact', '.', 'most', 'the', 'movie', 'is', 'comprised', 'of', 'recycled', 'airplane', 'and', 'naked', 'gun', 'gags', ',', 'recreated', 'movie', 'sequences', ',', 'and', 'soggy', 'star', 'cameos', '.', 'yeah', ',', 'maybe', 'we', '*', 'do', '*', 'need', 'ray', 'charles', 'driving', 'an', 'l', '.', 'a', '.', 'bus', 'bound', 'for', 'a', 'speed', 'bump', ',', 'but', 'did', 'mr', '.', 't', ',', 'hulk', 'hogan', ',', 'and', 'dr', '.', 'joyce', 'brothers', 'also', 'have', 'to', 'appear', 'in', 'the', 'same', 'movie', '?', 'leslie', 'nielsen', 'plays', 'it', 'straight', 'with', 'his', 'usual', 'dopey', 'flair', '.', 'there', 'is', 'something', 'oddly', 'inspiring', 'about', 'the', 'sight', 'of', 'nielsen', 'wearing', 'a', 'nun', "'", 's', 'habit', ',', 'even', 'the', 'resulting', 'sister', 'act', 'spoof', 'is', 'silly', '.', 'as', 'are', 'the', 'bits', 'directly', 'lifted', 'from', 
'home', 'alone', ',', 'pulp', 'fiction', ',', 'true', 'lies', ',', 'etc', '.', 'etc', '.', '(', 'adolescent', 'males', 'might', 'enjoy', 'this', 'mess', ',', 'tho', '.', 'the', 'butt', 'shots', ',', 'breast', 'peeks', ',', 'penis', 'pokes', ',', 'flatulence', 'gags', ',', 'and', 'related', 'innuendo', 'are', 'right', 'up', 'beavis', 'and', 'butthead', "'", 's', 'alley', '.', ')', 'beyond', 'the', 'hilarious', 'title', 'sequence', 'with', '"', 'weird', '"', 'al', 'yankovich', 'performing', 'the', 'theme', 'song', ',', 'spy', 'hard', 'is', 'barely', 'the', 'stock', 'that', 'it', "'", 's', 'printed', 'on', '.', 'my', 'recommendation', ':', 'duck', 'in', 'while', 'you', "'", 're', 'waiting', 'for', 'another', 'movie', 'to', 'start', '.', 'stay', 'long', 'enough', 'to', 'see', 'the', 'camera', 'dart', 'inside', 'of', 'weird', 'al', "'", 's', 'nostril', 'and', 'then', 'leave', '.', 'you', 'won', "'", 't', 'miss', 'a', 'thing', '.'], 'neg') #### That's the very first review in the documents list, which is negative. #### Because of shuffling, you will likely see a different review. 
#### Note the tuple structure: ([w1,w2,...], 'neg') >>> import nltk #### The bare name nltk is needed for nltk.FreqDist; importing only movie_reviews from nltk.corpus does not bind it. >>> all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words()) >>> word_features = list(all_words)[:2000] # keys are pre-sorted by frequency >>> word_features = [w for (w,f) in all_words.most_common(2000)] # if on older (<3.5.1) nltk, use .most_common >>> word_features[:100] [',', 'the', '.', 'a', 'and', 'of', 'to', "'", 'is', 'in', 's', '"', 'it', 'that', '-', ')', '(', 'as', 'with', 'for', 'his', 'this', 'film', 'i', 'he', 'but', 'on', 'are', 't', 'by', 'be', 'one', 'movie', 'an', 'who', 'not', 'you', 'from', 'at', 'was', 'have', 'they', 'has', 'her', 'all', '?', 'there', 'like', 'so', 'out', 'about', 'up', 'more', 'what', 'when', 'which', 'or', 'she', 'their', ':', 'some', 'just', 'can', 'if', 'we', 'him', 'into', 'even', 'only', 'than', 'no', 'good', 'time', 'most', 'its', 'will', 'story', 'would', 'been', 'much', 'character', 'also', 'get', 'other', 'do', 'two', 'well', 'them', 'very', 'characters', ';', 'first', '--', 'after', 'see', '!', 'way', 'because', 'make', 'life'] >>> word_features[-100:] ['jail', 'deals', 'cheesy', 'court', 'beach', 'austin', 'model', 'outstanding', 'substance', 'nudity', 'slapstick', 'joan', 'reveal', 'placed', 'check', 'beast', 'hurt', 'bloody', 'acts', 'fame', 'meeting', 'nuclear', '1996', 'strength', 'center', 'funniest', 'standing', 'damon', 'clich', 'position', 'desire', 'driven', 'seat', 'stock', 'wondering', 'realizes', 'dealing', 'taste', 'routine', 'comparison', 'cinematographer', 'seconds', 'singing', 'gangster', 'responsible', 'football', 'remarkable', 'hunting', 'adams', 'fly', 'suspects', 'treat', 'hopes', 'heaven', 'myers', 'struggle', 'costumes', 'beat', 'happening', 'skills', 'ugly', 'figures', 'thoroughly', 'ill', 'surprises', 'player', 'rival', 'guard', 'anthony', 'strike', 'community', 'streets', 'hopkins', 'ended', 'originally', 'sarah', 'creative', 'characterization', 'thankfully', 'growing', 'sharp', 'williamson', 'eccentric', 'explained', 'hey', 
'claire', 'steal', 'inevitable', 'joel', 'core', 'weren', 'sorry', 'built', 'anne', 'breaking', 'villains', 'critic', 'lets', 'visit', 'followed'] #### Top 2,000 frequent words. #### Top 100 look familiar, bottom 100 not so much. >>> def document_features(document): ... document_words = set(document) # set of word types (unique tokens) in this document ... features = {} ... for word in word_features: # one feature per word in the top-2,000 word_features list ... inornot = word in document_words # True if the word occurs in this document, else False ... features['contains({})'.format(word)] = inornot ... return features #### A function that turns a tokenized document into a feature dictionary: #### one 'contains(word)' key per word in word_features, with a True/False value. >>> document_features(movie_reviews.words('pos/cv957_8737.txt')) {'contains(,)': True, 'contains(the)': True, 'contains(.)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': True, 'contains(to)': True, "contains(')": True, 'contains(is)': True, 'contains(in)': True, 'contains(s)': True, 'contains(")': True, 'contains(it)': True, 'contains(that)': True, 'contains(-)': True, 'contains())': True, 'contains(()': True, 'contains(as)': True, 'contains(with)': True, 'contains(for)': True, 'contains(his)': True, 'contains(this)': True, 'contains(film)': False, 'contains(i)': False, 'contains(he)': True, 'contains(but)': True, 'contains(on)': True, 'contains(are)': True, 'contains(t)': False, 'contains(by)': True, 'contains(be)': True, 'contains(one)': True, 'contains(movie)': True, 'contains(an)': True, 'contains(who)': True, 'contains(not)': True, 'contains(you)': True, 'contains(from)': True, 'contains(at)': False, 'contains(was)': False, 'contains(have)': True, 'contains(they)': True, 'contains(has)': True, 'contains(her)': False, 'contains(all)': True, 'contains(?)': False, ... 
'contains(joel)': False, 'contains(core)': False, 'contains(weren)': False, 'contains(sorry)': False, 'contains(built)': False, 'contains(anne)': False, 'contains(breaking)': False, 'contains(villains)': False, 'contains(critic)': False, 'contains(lets)': False, 'contains(visit)': False, 'contains(followed)': False} #### Clipped the result in the middle, 2,000 total features for this particular review. #### Each indicates whether or not a top word is contained in the document. >>> featuresets = [(document_features(d), c) for (d,c) in documents] #### Turn each document ([w1,w2,w3,...], pos/neg) into its features while keeping the category #### each item in featuresets is a tuple: ({features}, pos/neg) >>> test_set = featuresets[:100] # setting aside first 100 for testing >>> train_set = featuresets[100:] # rest 1900 used for training >>> classifier = nltk.NaiveBayesClassifier.train(train_set) #### partition into testing and training sets, #### then train a Naive Bayes classifier. >>> print(nltk.classify.accuracy(classifier, test_set)) 0.83 >>> classifier.show_most_informative_features(10) Most Informative Features contains(outstanding) = True pos : neg = 14.0 : 1.0 contains(mulan) = True pos : neg = 8.3 : 1.0 contains(seagal) = True neg : pos = 7.8 : 1.0 contains(wonderfully) = True pos : neg = 6.5 : 1.0 contains(flynt) = True pos : neg = 5.6 : 1.0 contains(damon) = True pos : neg = 5.6 : 1.0 contains(poorly) = True neg : pos = 5.5 : 1.0 contains(wasted) = True neg : pos = 5.4 : 1.0 contains(lame) = True neg : pos = 5.2 : 1.0 contains(awful) = True neg : pos = 5.2 : 1.0 #### Accuracy of 0.83. Not bad I suppose! #### Top 10 most informative features. Pretty telling. >>> myreview = """Mr. 
Matt Damon was outstanding, fantastic, excellent, wonderfully subtle, superb, terrific, and memorable in his portrayal of Mulan.""" >>> myreview_toks = nltk.word_tokenize(myreview.lower()) >>> myreview_toks ['mr.', 'matt', 'damon', 'was', 'outstanding', ',', 'fantastic', ',', 'excellent', ',', 'wonderfully', 'subtle', ',', 'superb', ',', 'terrific', ',', 'and', 'memorable', 'in', 'his', 'portrayal', 'of', 'mulan', '.'] >>> myreview_feats = document_features(myreview_toks) >>> classifier.classify(myreview_feats) 'pos' >>> classifier.prob_classify(myreview_feats).prob('pos') 0.916761678831542 >>> classifier.prob_classify(myreview_feats).prob('neg') 0.0832383211684841 #### Matt Damon and a whole lot of positive sounding words. #### Classified 'pos', 92% probability for 'pos'. >>> myreview = """Mr. Steven Seagal was outstanding, fantastic, excellent, wonderfully subtle, superb, terrific, and memorable in his portrayal of Mulan.""" >>> myreview_toks = nltk.word_tokenize(myreview.lower()) >>> myreview_feats = document_features(myreview_toks) >>> classifier.classify(myreview_feats) 'neg' >>> classifier.prob_classify(myreview_feats).prob('pos') 0.0921833172550908 >>> classifier.prob_classify(myreview_feats).prob('neg') 0.9078166827449369 #### Steven Seagal against a whole lot of positive sounding words. #### Classified 'neg' this time, with only 9% probability for 'pos'. >>> myreview = "Mr. Matt Damon was outstanding, fantastic." >>> myreview_toks = nltk.word_tokenize(myreview.lower()) >>> myreview_feats = document_features(myreview_toks) >>> classifier.classify(myreview_feats) 'neg' >>> classifier.prob_classify(myreview_feats).prob('pos') 1.0146531638398488e-05 >>> classifier.prob_classify(myreview_feats).prob('neg') 0.9999898534683809 >>> #### OK, so this was surprising. The classifier is dead sure this review is negative.