Python 3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)] on win32 Type "help", "copyright", "credits" or "license()" for more information. >>> import nltk >>> br_tw = nltk.corpus.brown.tagged_words(categories='mystery') >>> br_tw[:10] [('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS')] # ---------------------------- Lowercasing tokens >>> [(w,t) for (w,t) in br_tw][:5] [('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')] >>> [w for (w,t) in br_tw][:5] ['There', 'were', 'thirty-eight', 'patients', 'on'] >>> [(w.lower(),t) for (w,t) in br_tw][:5] [('there', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')] >>> br_tw_lower = [(w.lower(),t) for (w,t) in br_tw] >>> br_tw_lower[:10] [('there', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('i', 'PPSS')] # ----------------------------- POS frequencies of word types >>> br_tw_cfd = nltk.ConditionalFreqDist(br_tw_lower) >>> br_tw_cfd['so'] FreqDist({'RB': 48, 'QL': 44, 'CS': 34}) >>> br_tw_cfd['so'].freq('RB') 0.38095238095238093 >>> br_tw_cfd['so'].freq('CS') 0.2698412698412698 >>> br_tw_cfd['question'] FreqDist({'NN': 5, 'VB': 3}) # NOTE: br_tw_post_cfd below is a forward reference -- it is built later, in the "conditional frequency of following POS" section; shown here to preview the tags that follow 'question' when used as a verb >>> br_tw_post_cfd['question/VB'] FreqDist({',': 1, 'CC': 1, 'AT': 1}) >>> br_tw_cfd['like'] FreqDist({'CS': 103, 'VB': 19, 'IN': 14, 'JJ': 3}) # ----------------------------- Looking up tags through nltk.help # If error, download: nltk.download('tagsets_json') >>> nltk.help.brown_tagset('IN') IN: preposition of in for by considering to on among at through with under into regarding than since despite according per before toward against as after during including between without except upon out over ... 
>>> nltk.help.brown_tagset('VB') VB: verb, base: uninflected present, imperative or infinitive investigate find act follow inure achieve reduce take remedy re-set distribute realize disable feel receive continue place protect eliminate elaborate work permit run enter force ... >>> br_ts = nltk.corpus.brown.tagged_sents(categories='mystery') >>> br_ts[0] [('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')] >>> br_ts[1] [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')] >>> [s for s in br_ts if ('like', 'JJ') in s ] [[('smarter', 'JJR'), (',', ','), ('and', 'CC'), ('wear', 'VB'), ('different', 'JJ'), ('kinds', 'NNS'), ('of', 'IN'), ('clothes', 'NNS'), ('--', '--'), ("she'd", 'PPS+MD'), ('be', 'BE'), ('Katharine', 'NP'), ('Ross', 'NP'), (',', ','), ('just', 'RB'), ('what', 'WDT'), ('that', 'WPS'), ('sounded', 'VBD'), ('like', 'JJ'), ('.', '.')], [('``', '``'), ("Can't", 'MD*'), ('you', 'PPSS'), ('possibly', 'RB'), ('imagine', 'VB'), ('what', 'WDT'), ('life', 'NN'), ('is', 'BEZ'), ('going', 'VBG'), ('to', 'TO'), ('be', 'BE'), ('like', 'JJ'), (',', ','), ('here', 'RB'), ("''", "''"), ('?', '.'), ('?', '.')], [('That', 'DT'), (',', ','), ('incidentally', 'RB'), (',', ','), ('might', 'MD'), ('give', 'VB'), ('you', 'PPO'), ('some', 'DTI'), ('idea', 'NN'), ('of', 'IN'), ('what', 'WDT'), ('Felix', 'NP'), ('was', 'BEDZ'), ('like', 'JJ'), ('.', '.')]] >>> [s for s in br_ts if ('like', 'JJ') in s ][0] [('smarter', 'JJR'), (',', ','), ('and', 'CC'), ('wear', 'VB'), ('different', 'JJ'), ('kinds', 'NNS'), ('of', 
'IN'), ('clothes', 'NNS'), ('--', '--'), ("she'd", 'PPS+MD'), ('be', 'BE'), ('Katharine', 'NP'), ('Ross', 'NP'), (',', ','), ('just', 'RB'), ('what', 'WDT'), ('that', 'WPS'), ('sounded', 'VBD'), ('like', 'JJ'), ('.', '.')] >>> [s for s in br_ts if ('like', 'JJ') in s ][1] [('``', '``'), ("Can't", 'MD*'), ('you', 'PPSS'), ('possibly', 'RB'), ('imagine', 'VB'), ('what', 'WDT'), ('life', 'NN'), ('is', 'BEZ'), ('going', 'VBG'), ('to', 'TO'), ('be', 'BE'), ('like', 'JJ'), (',', ','), ('here', 'RB'), ("''", "''"), ('?', '.'), ('?', '.')] # ----------------------------- building trigrams >>> br_3grams = list(nltk.ngrams(br_tw_lower, 3)) >>> br_3grams[0] (('there', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD')) >>> br_3grams[1] (('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS')) >>> br_3grams[2] (('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')) >>> [x[1] for x in br_3grams][0] ('were', 'BED') >>> [x[1][0] for x in br_3grams][0] 'were' # --------------------------- conditional frequency of preceding POS >>> [(w2+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams][0] ('wereBED', 'EX') >>> [(w2+"/"+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams][0] ('were/BED', 'EX') >>> [(w2+"/"+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams][:5] [('were/BED', 'EX'), ('thirty-eight/CD', 'BED'), ('patients/NNS', 'CD'), ('on/IN', 'NNS'), ('the/AT', 'IN')] >>> br_tw_pre = [(w2+"/"+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams] >>> br_tw_pre[:10] [('were/BED', 'EX'), ('thirty-eight/CD', 'BED'), ('patients/NNS', 'CD'), ('on/IN', 'NNS'), ('the/AT', 'IN'), ('bus/NN', 'AT'), ('the/AT', 'NN'), ('morning/NN', 'AT'), ('i/PPSS', 'NN'), ('left/VBD', 'PPSS')] >>> br_tw_pre[-10:] [('in/IN', ','), ('case/NN', 'IN'), (',/,', 'NN'), ('i/PPSS', ','), ('brought/VBD', 'PPSS'), ('the/AT', 'VBD'), ('money/NN', 'AT'), ('with/IN', 'NN'), ('me/PPO', 'IN'), ("''/''", 'PPO')] >>> br_tw_pre_cfd = nltk.ConditionalFreqDist(br_tw_pre) >>> br_tw_pre_cfd['so/RB'] FreqDist({'CC': 10, 
'.': 10, 'VB': 4, ',': 4, 'DO': 3, '``': 3, 'VBZ': 2, 'VBG': 2, 'BEDZ*': 2, '--': 1, ...}) >>> br_tw_pre_cfd['so/RB'].most_common(5) [('CC', 10), ('.', 10), ('VB', 4), (',', 4), ('DO', 3)] >>> br_tw_pre_cfd['so/RB'].freq('CC') 0.20833333333333334 >>> br_tw_pre_cfd['so/CS'] FreqDist({'.': 7, ',': 6, 'NN': 5, 'NNS': 3, 'QLP': 2, 'PPO': 2, 'VBD': 2, '--': 2, 'RB': 1, 'CC': 1, ...}) >>> br_tw_pre_cfd['so/QL'] FreqDist({'IN': 5, 'BEDZ': 5, ',': 4, 'RB': 3, 'NN': 3, 'QL': 3, 'VBN': 2, 'VB': 2, 'PPO': 2, 'BEN': 1, ...}) # --------------------------- conditional frequency of following POS >>> br_tw_post = [(w2+"/"+t2, t3) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams] >>> br_tw_post_cfd = nltk.ConditionalFreqDist(br_tw_post) >>> br_tw_post_cfd['so/QL'] FreqDist({'JJ': 18, 'RB': 14, 'AP': 7, 'QL': 3, 'VBN': 2}) # ---------------------------- Now onto Penn Treebank >>> from nltk.corpus import treebank >>> treebank.words() ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', ...] >>> treebank.words()[:20] ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken'] >>> treebank.tagged_words()[:20] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.'), ('Mr.', 'NNP'), ('Vinken', 'NNP')] >>> treebank.tagged_sents() [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), 
('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...] >>> treebank.tagged_sents()[0] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')] >>> len(treebank.tagged_words()) 100676 >>> len(treebank.tagged_sents()) 3914 # ---------------------------- Syntactic trees >>> treebank.parsed_sents()[0] Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]) >>> treebank.parsed_sents()[0].pprint() (S (NP-SBJ (NP (NNP Pierre) (NNP Vinken)) (, ,) (ADJP (NP (CD 61) (NNS years)) (JJ old)) (, ,)) (VP (MD will) (VP (VB join) (NP (DT the) (NN board)) (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director))) (NP-TMP (NNP Nov.) (CD 29)))) (. .)) >>> treebank.parsed_sents()[0].draw() # --------------------------- Syntactic trees are RECURSIVE! 
# Now back to tagged words: >>> treebank.tagged_words()[:10] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT')] >>> treebank_cfd = nltk.ConditionalFreqDist(treebank.tagged_words()) >>> treebank_cfd['question'] FreqDist({'NN': 12, 'VBP': 1, 'VB': 1}) >>> nltk.help.upenn_tagset('VBP') VBP: verb, present tense, not 3rd person singular predominate wrap resort sue twist spill cure lengthen brush terminate appear tend stray glisten obtain comprise detest tease attract emphasize mold postpone sever return wag ... >>> nltk.help.upenn_tagset('VB') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... >>> nltk.help.upenn_tagset('V.*') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... VBD: verb, past tense dipped pleaded swiped regummed soaked tidied convened halted registered cushioned exacted snubbed strode aimed adopted belied figgered speculated wore appreciated contemplated ... VBG: verb, present participle or gerund telegraphing stirring focusing angering judging stalling lactating hankerin' alleging veering capping approaching traveling besieging encrypting interrupting erasing wincing ... VBN: verb, past participle multihulled dilapidated aerosolized chaired languished panelized used experimented flourished imitated reunifed factored condensed sheared unsettled primed dubbed desired ... VBP: verb, present tense, not 3rd person singular predominate wrap resort sue twist spill cure lengthen brush terminate appear tend stray glisten obtain comprise detest tease attract emphasize mold postpone sever return wag ... 
VBZ: verb, present tense, 3rd person singular bases reconstructs marks mixes displeases seals carps weaves snatches slumps stretches authorizes smolders pictures emerges stockpiles seduces fizzes uses bolsters slaps speaks pleads ... >>> treebank_cfd['so'] FreqDist({'RB': 55, 'IN': 16}) # only RB and IN? >>> nltk.help.upenn_tagset('RB') RB: adverb occasionally unabatingly maddeningly adventurously professedly stirringly prominently technologically magisterially predominately swiftly fiscally pitilessly ... >>> nltk.help.upenn_tagset('IN') IN: preposition or conjunction, subordinating astride among uppon whether out inside pro despite on by throughout below within for towards near behind atop around if like until below next into if beside ... # --------------------------- Treebank treats CS usage of 'so' as part of its preposition (IN) use # and its QL (qualifier) use as RB (adverb) broadly >>> treebank_cfd['will'] FreqDist({'MD': 280, 'NN': 1}) >>> treebank_cfd['share'] FreqDist({'NN': 116, 'VB': 3}) # --------------------------- 'share' is overwhelmingly a noun?? Digging deeper... >>> print(treebank.readme()) [ PENN TREEBANK SAMPLE ] http://www.cis.upenn.edu/~treebank/home.html This is a ~5% fragment of Penn Treebank, (C) LDC 1995. It is made available under fair use for the purposes of illustrating NLTK tools for tokenizing, tagging, chunking and parsing. This data is for non-commercial use only. Contents: raw, tagged, parsed and combined data from Wall Street Journal for 1650 sentences (99 treebank files wsj_0001 .. wsj_0099). For details about each of the four types, please see the other README files included in the treebank sample directory. Examples of the four types are shown below: ### The rest of readme clipped. Wall Street Journal it is! 
# --------------------------- More tag ambiguity >>> treebank_cfd['fly'] FreqDist({'VB': 1, 'VBP': 1}) >>> treebank_cfd['like'] FreqDist({'IN': 49, 'VB': 8, 'VBP': 4, 'JJ': 1}) # --------------------------- Penn Treebank tagset definitions >>> nltk.help.upenn_tagset('VB') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... >>> nltk.help.upenn_tagset('V.*') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... VBD: verb, past tense dipped pleaded swiped regummed soaked tidied convened halted registered cushioned exacted snubbed strode aimed adopted belied figgered speculated wore appreciated contemplated ... VBG: verb, present participle or gerund telegraphing stirring focusing angering judging stalling lactating hankerin' alleging veering capping approaching traveling besieging encrypting interrupting erasing wincing ... VBN: verb, past participle multihulled dilapidated aerosolized chaired languished panelized used experimented flourished imitated reunifed factored condensed sheared unsettled primed dubbed desired ... VBP: verb, present tense, not 3rd person singular predominate wrap resort sue twist spill cure lengthen brush terminate appear tend stray glisten obtain comprise detest tease attract emphasize mold postpone sever return wag ... VBZ: verb, present tense, 3rd person singular bases reconstructs marks mixes displeases seals carps weaves snatches slumps stretches authorizes smolders pictures emerges stockpiles seduces fizzes uses bolsters slaps speaks pleads ... # --------------------------- Let's try NLTK's built-in POS tagger # warning: not that great >>> sent = "Colorless green ideas sleep furiously." 
>>> nltk.word_tokenize(sent) ['Colorless', 'green', 'ideas', 'sleep', 'furiously', '.'] >>> chom = nltk.word_tokenize(sent) >>> chom ['Colorless', 'green', 'ideas', 'sleep', 'furiously', '.'] >>> nltk.pos_tag(chom) LookupError: ********************************************************************** Resource averaged_perceptron_tagger_eng not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('averaged_perceptron_tagger_eng') For more information see: https://www.nltk.org/data.html >>> nltk.download('averaged_perceptron_tagger_eng') # download tagger model first [nltk_data] Downloading package averaged_perceptron_tagger_eng to [nltk_data] C:\Users\Jane Eyre\AppData\Roaming\nltk_data... [nltk_data] Unzipping taggers\averaged_perceptron_tagger_eng.zip. True >>> nltk.pos_tag(chom) [('Colorless', 'NNP'), ('green', 'JJ'), ('ideas', 'NNS'), ('sleep', 'VBP'), ('furiously', 'RB'), ('.', '.')] >>> nltk.help.upenn_tagset('NNP') NNP: noun, proper, singular Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA Shannon A.K.C. Meltex Liverpool ... >>> nltk.pos_tag('January was a cold month'.split()) [('January', 'NNP'), ('was', 'VBD'), ('a', 'DT'), ('cold', 'JJ'), ('month', 'NN')] >>> nltk.pos_tag('January was a cold month .'.split()) [('January', 'NNP'), ('was', 'VBD'), ('a', 'DT'), ('cold', 'JJ'), ('month', 'NN'), ('.', '.')] >>> nltk.pos_tag('Yesterday was a cold day .'.split()) [('Yesterday', 'NN'), ('was', 'VBD'), ('a', 'DT'), ('cold', 'JJ'), ('day', 'NN'), ('.', '.')]