Python 3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)] on win32 Type "help", "copyright", "credits" or "license()" for more information. >>> import nltk >>> br_tw = nltk.corpus.brown.tagged_words(categories='mystery') >>> br_tw[:10] [('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS')] # ---------------------------- Lowercasing tokens >>> [(w,t) for (w,t) in br_tw][:5] [('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')] >>> [w for (w,t) in br_tw][:5] ['There', 'were', 'thirty-eight', 'patients', 'on'] >>> [(w.lower(),t) for (w,t) in br_tw][:5] [('there', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')] >>> br_tw_lower = [(w.lower(),t) for (w,t) in br_tw] >>> br_tw_lower[:10] [('there', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('i', 'PPSS')] # ----------------------------- POS frequencies of word types >>> br_tw_cfd = nltk.ConditionalFreqDist(br_tw_lower) >>> br_tw_cfd['so'] FreqDist({'RB': 48, 'QL': 44, 'CS': 34}) >>> br_tw_cfd['so'].freq('RB') 0.38095238095238093 >>> br_tw_cfd['so'].freq('CS') 0.2698412698412698 >>> br_tw_cfd['question'] FreqDist({'NN': 5, 'VB': 3}) # NOTE: br_tw_post_cfd below is a forward reference -- it is built later, in the "conditional frequency of following POS" section; shown here to preview the tags that follow 'question' when used as a verb >>> br_tw_post_cfd['question/VB'] FreqDist({',': 1, 'CC': 1, 'AT': 1}) >>> br_tw_cfd['like'] FreqDist({'CS': 103, 'VB': 19, 'IN': 14, 'JJ': 3}) # ----------------------------- Looking up tags through nltk.help # If error, download: nltk.download('tagsets_json') >>> nltk.help.brown_tagset('IN') IN: preposition of in for by considering to on among at through with under into regarding than since despite according per before toward against as after during including between without except upon out over ... 
>>> nltk.help.brown_tagset('VB') VB: verb, base: uninflected present, imperative or infinitive investigate find act follow inure achieve reduce take remedy re-set distribute realize disable feel receive continue place protect eliminate elaborate work permit run enter force ... >>> br_ts = nltk.corpus.brown.tagged_sents(categories='mystery') >>> br_ts[0] [('There', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN'), ('the', 'AT'), ('bus', 'NN'), ('the', 'AT'), ('morning', 'NN'), ('I', 'PPSS'), ('left', 'VBD'), ('for', 'IN'), ('Hanover', 'NP'), (',', ','), ('most', 'AP'), ('of', 'IN'), ('them', 'PPO'), ('disturbed', 'VBN'), ('and', 'CC'), ('hallucinating', 'VBG'), ('.', '.')] >>> br_ts[1] [('An', 'AT'), ('interne', 'NN'), (',', ','), ('a', 'AT'), ('nurse', 'NN'), ('and', 'CC'), ('two', 'CD'), ('attendants', 'NNS'), ('were', 'BED'), ('in', 'IN'), ('charge', 'NN'), ('of', 'IN'), ('us', 'PPO'), ('.', '.')] >>> [s for s in br_ts if ('like', 'JJ') in s ] [[('smarter', 'JJR'), (',', ','), ('and', 'CC'), ('wear', 'VB'), ('different', 'JJ'), ('kinds', 'NNS'), ('of', 'IN'), ('clothes', 'NNS'), ('--', '--'), ("she'd", 'PPS+MD'), ('be', 'BE'), ('Katharine', 'NP'), ('Ross', 'NP'), (',', ','), ('just', 'RB'), ('what', 'WDT'), ('that', 'WPS'), ('sounded', 'VBD'), ('like', 'JJ'), ('.', '.')], [('``', '``'), ("Can't", 'MD*'), ('you', 'PPSS'), ('possibly', 'RB'), ('imagine', 'VB'), ('what', 'WDT'), ('life', 'NN'), ('is', 'BEZ'), ('going', 'VBG'), ('to', 'TO'), ('be', 'BE'), ('like', 'JJ'), (',', ','), ('here', 'RB'), ("''", "''"), ('?', '.'), ('?', '.')], [('That', 'DT'), (',', ','), ('incidentally', 'RB'), (',', ','), ('might', 'MD'), ('give', 'VB'), ('you', 'PPO'), ('some', 'DTI'), ('idea', 'NN'), ('of', 'IN'), ('what', 'WDT'), ('Felix', 'NP'), ('was', 'BEDZ'), ('like', 'JJ'), ('.', '.')]] >>> [s for s in br_ts if ('like', 'JJ') in s ][0] [('smarter', 'JJR'), (',', ','), ('and', 'CC'), ('wear', 'VB'), ('different', 'JJ'), ('kinds', 'NNS'), ('of', 
'IN'), ('clothes', 'NNS'), ('--', '--'), ("she'd", 'PPS+MD'), ('be', 'BE'), ('Katharine', 'NP'), ('Ross', 'NP'), (',', ','), ('just', 'RB'), ('what', 'WDT'), ('that', 'WPS'), ('sounded', 'VBD'), ('like', 'JJ'), ('.', '.')] >>> [s for s in br_ts if ('like', 'JJ') in s ][1] [('``', '``'), ("Can't", 'MD*'), ('you', 'PPSS'), ('possibly', 'RB'), ('imagine', 'VB'), ('what', 'WDT'), ('life', 'NN'), ('is', 'BEZ'), ('going', 'VBG'), ('to', 'TO'), ('be', 'BE'), ('like', 'JJ'), (',', ','), ('here', 'RB'), ("''", "''"), ('?', '.'), ('?', '.')] # ----------------------------- building trigrams >>> br_3grams = list(nltk.ngrams(br_tw_lower, 3)) >>> br_3grams[0] (('there', 'EX'), ('were', 'BED'), ('thirty-eight', 'CD')) >>> br_3grams[1] (('were', 'BED'), ('thirty-eight', 'CD'), ('patients', 'NNS')) >>> br_3grams[2] (('thirty-eight', 'CD'), ('patients', 'NNS'), ('on', 'IN')) >>> [x[1] for x in br_3grams][0] ('were', 'BED') >>> [x[1][0] for x in br_3grams][0] 'were' # --------------------------- conditional frequency of preceding POS >>> [(w2+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams][0] ('wereBED', 'EX') >>> [(w2+"/"+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams][0] ('were/BED', 'EX') >>> [(w2+"/"+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams][:5] [('were/BED', 'EX'), ('thirty-eight/CD', 'BED'), ('patients/NNS', 'CD'), ('on/IN', 'NNS'), ('the/AT', 'IN')] >>> br_tw_pre = [(w2+"/"+t2, t1) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams] >>> br_tw_pre[:10] [('were/BED', 'EX'), ('thirty-eight/CD', 'BED'), ('patients/NNS', 'CD'), ('on/IN', 'NNS'), ('the/AT', 'IN'), ('bus/NN', 'AT'), ('the/AT', 'NN'), ('morning/NN', 'AT'), ('i/PPSS', 'NN'), ('left/VBD', 'PPSS')] >>> br_tw_pre[-10:] [('in/IN', ','), ('case/NN', 'IN'), (',/,', 'NN'), ('i/PPSS', ','), ('brought/VBD', 'PPSS'), ('the/AT', 'VBD'), ('money/NN', 'AT'), ('with/IN', 'NN'), ('me/PPO', 'IN'), ("''/''", 'PPO')] >>> br_tw_pre_cfd = nltk.ConditionalFreqDist(br_tw_pre) >>> br_tw_pre_cfd['so/RB'] FreqDist({'CC': 10, 
'.': 10, 'VB': 4, ',': 4, 'DO': 3, '``': 3, 'VBZ': 2, 'VBG': 2, 'BEDZ*': 2, '--': 1, ...}) >>> br_tw_pre_cfd['so/RB'].most_common(5) [('CC', 10), ('.', 10), ('VB', 4), (',', 4), ('DO', 3)] >>> br_tw_pre_cfd['so/RB'].freq('CC') 0.20833333333333334 >>> br_tw_pre_cfd['so/CS'] FreqDist({'.': 7, ',': 6, 'NN': 5, 'NNS': 3, 'QLP': 2, 'PPO': 2, 'VBD': 2, '--': 2, 'RB': 1, 'CC': 1, ...}) >>> br_tw_pre_cfd['so/QL'] FreqDist({'IN': 5, 'BEDZ': 5, ',': 4, 'RB': 3, 'NN': 3, 'QL': 3, 'VBN': 2, 'VB': 2, 'PPO': 2, 'BEN': 1, ...}) # --------------------------- conditional frequency of following POS >>> br_tw_post = [(w2+"/"+t2, t3) for ((w1,t1), (w2,t2), (w3,t3)) in br_3grams] >>> br_tw_post_cfd = nltk.ConditionalFreqDist(br_tw_post) >>> br_tw_post_cfd['so/QL'] FreqDist({'JJ': 18, 'RB': 14, 'AP': 7, 'QL': 3, 'VBN': 2}) # ---------------------------- Now onto Penn Treebank >>> from nltk.corpus import treebank >>> treebank.words() ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', ...] >>> treebank.words()[:20] ['Pierre', 'Vinken', ',', '61', 'years', 'old', ',', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'Nov.', '29', '.', 'Mr.', 'Vinken'] >>> treebank.tagged_words()[:20] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.'), ('Mr.', 'NNP'), ('Vinken', 'NNP')] >>> treebank.tagged_sents() [[('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')], [('Mr.', 'NNP'), ('Vinken', 'NNP'), ('is', 'VBZ'), ('chairman', 'NN'), ('of', 'IN'), ('Elsevier', 'NNP'), ('N.V.', 'NNP'), (',', ','), ('the', 'DT'), 
('Dutch', 'NNP'), ('publishing', 'VBG'), ('group', 'NN'), ('.', '.')], ...] >>> treebank.tagged_sents()[0] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT'), ('board', 'NN'), ('as', 'IN'), ('a', 'DT'), ('nonexecutive', 'JJ'), ('director', 'NN'), ('Nov.', 'NNP'), ('29', 'CD'), ('.', '.')] >>> len(treebank.tagged_words()) 100676 >>> len(treebank.tagged_sents()) 3914 # ---------------------------- Syntactic trees >>> treebank.parsed_sents()[0] Tree('S', [Tree('NP-SBJ', [Tree('NP', [Tree('NNP', ['Pierre']), Tree('NNP', ['Vinken'])]), Tree(',', [',']), Tree('ADJP', [Tree('NP', [Tree('CD', ['61']), Tree('NNS', ['years'])]), Tree('JJ', ['old'])]), Tree(',', [','])]), Tree('VP', [Tree('MD', ['will']), Tree('VP', [Tree('VB', ['join']), Tree('NP', [Tree('DT', ['the']), Tree('NN', ['board'])]), Tree('PP-CLR', [Tree('IN', ['as']), Tree('NP', [Tree('DT', ['a']), Tree('JJ', ['nonexecutive']), Tree('NN', ['director'])])]), Tree('NP-TMP', [Tree('NNP', ['Nov.']), Tree('CD', ['29'])])])]), Tree('.', ['.'])]) >>> treebank.parsed_sents()[0].pprint() (S (NP-SBJ (NP (NNP Pierre) (NNP Vinken)) (, ,) (ADJP (NP (CD 61) (NNS years)) (JJ old)) (, ,)) (VP (MD will) (VP (VB join) (NP (DT the) (NN board)) (PP-CLR (IN as) (NP (DT a) (JJ nonexecutive) (NN director))) (NP-TMP (NNP Nov.) (CD 29)))) (. .)) >>> treebank.parsed_sents()[0].draw() # --------------------------- Syntactic trees are RECURSIVE! 
# Now back to tagged words: >>> treebank.tagged_words()[:10] [('Pierre', 'NNP'), ('Vinken', 'NNP'), (',', ','), ('61', 'CD'), ('years', 'NNS'), ('old', 'JJ'), (',', ','), ('will', 'MD'), ('join', 'VB'), ('the', 'DT')] >>> treebank_cfd = nltk.ConditionalFreqDist(treebank.tagged_words()) >>> treebank_cfd['question'] FreqDist({'NN': 12, 'VBP': 1, 'VB': 1}) >>> nltk.help.upenn_tagset('VBP') VBP: verb, present tense, not 3rd person singular predominate wrap resort sue twist spill cure lengthen brush terminate appear tend stray glisten obtain comprise detest tease attract emphasize mold postpone sever return wag ... >>> nltk.help.upenn_tagset('VB') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... >>> nltk.help.upenn_tagset('V.*') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... VBD: verb, past tense dipped pleaded swiped regummed soaked tidied convened halted registered cushioned exacted snubbed strode aimed adopted belied figgered speculated wore appreciated contemplated ... VBG: verb, present participle or gerund telegraphing stirring focusing angering judging stalling lactating hankerin' alleging veering capping approaching traveling besieging encrypting interrupting erasing wincing ... VBN: verb, past participle multihulled dilapidated aerosolized chaired languished panelized used experimented flourished imitated reunifed factored condensed sheared unsettled primed dubbed desired ... VBP: verb, present tense, not 3rd person singular predominate wrap resort sue twist spill cure lengthen brush terminate appear tend stray glisten obtain comprise detest tease attract emphasize mold postpone sever return wag ... 
VBZ: verb, present tense, 3rd person singular bases reconstructs marks mixes displeases seals carps weaves snatches slumps stretches authorizes smolders pictures emerges stockpiles seduces fizzes uses bolsters slaps speaks pleads ... >>> treebank_cfd['so'] FreqDist({'RB': 55, 'IN': 16}) # only RB and IN? >>> nltk.help.upenn_tagset('RB') RB: adverb occasionally unabatingly maddeningly adventurously professedly stirringly prominently technologically magisterially predominately swiftly fiscally pitilessly ... >>> nltk.help.upenn_tagset('IN') IN: preposition or conjunction, subordinating astride among uppon whether out inside pro despite on by throughout below within for towards near behind atop around if like until below next into if beside ... # --------------------------- Treebank treats CS usage of 'so' as part of its preposition (IN) use # and its QL (qualifier) use as RB (adverb) broadly >>> treebank_cfd['will'] FreqDist({'MD': 280, 'NN': 1}) >>> treebank_cfd['share'] FreqDist({'NN': 116, 'VB': 3}) # --------------------------- 'share' is overwhelmingly a noun?? Digging deeper... >>> print(treebank.readme()) [ PENN TREEBANK SAMPLE ] http://www.cis.upenn.edu/~treebank/home.html This is a ~5% fragment of Penn Treebank, (C) LDC 1995. It is made available under fair use for the purposes of illustrating NLTK tools for tokenizing, tagging, chunking and parsing. This data is for non-commercial use only. Contents: raw, tagged, parsed and combined data from Wall Street Journal for 1650 sentences (99 treebank files wsj_0001 .. wsj_0099). For details about each of the four types, please see the other README files included in the treebank sample directory. Examples of the four types are shown below: ### The rest of readme clipped. Wall Street Journal it is! 
# --------------------------- More tag ambiguity >>> treebank_cfd['fly'] FreqDist({'VB': 1, 'VBP': 1}) >>> treebank_cfd['like'] FreqDist({'IN': 49, 'VB': 8, 'VBP': 4, 'JJ': 1}) # --------------------------- Penn Treebank tagset definitions >>> nltk.help.upenn_tagset('VB') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... >>> nltk.help.upenn_tagset('V.*') VB: verb, base form ask assemble assess assign assume atone attention avoid bake balkanize bank begin behold believe bend benefit bevel beware bless boil bomb boost brace break bring broil brush build ... VBD: verb, past tense dipped pleaded swiped regummed soaked tidied convened halted registered cushioned exacted snubbed strode aimed adopted belied figgered speculated wore appreciated contemplated ... VBG: verb, present participle or gerund telegraphing stirring focusing angering judging stalling lactating hankerin' alleging veering capping approaching traveling besieging encrypting interrupting erasing wincing ... VBN: verb, past participle multihulled dilapidated aerosolized chaired languished panelized used experimented flourished imitated reunifed factored condensed sheared unsettled primed dubbed desired ... VBP: verb, present tense, not 3rd person singular predominate wrap resort sue twist spill cure lengthen brush terminate appear tend stray glisten obtain comprise detest tease attract emphasize mold postpone sever return wag ... VBZ: verb, present tense, 3rd person singular bases reconstructs marks mixes displeases seals carps weaves snatches slumps stretches authorizes smolders pictures emerges stockpiles seduces fizzes uses bolsters slaps speaks pleads ... # --------------------------- Let's try NLTK's built-in POS tagger # warning: not that great >>> sent = "Colorless green ideas sleep furiously." 
>>> nltk.word_tokenize(sent) ['Colorless', 'green', 'ideas', 'sleep', 'furiously', '.'] >>> chom = nltk.word_tokenize(sent) >>> chom ['Colorless', 'green', 'ideas', 'sleep', 'furiously', '.'] >>> nltk.pos_tag(chom) LookupError: ********************************************************************** Resource averaged_perceptron_tagger_eng not found. Please use the NLTK Downloader to obtain the resource: >>> import nltk >>> nltk.download('averaged_perceptron_tagger_eng') For more information see: https://www.nltk.org/data.html >>> nltk.download('averaged_perceptron_tagger_eng') # download tagger model first [nltk_data] Downloading package averaged_perceptron_tagger_eng to [nltk_data] C:\Users\Jane Eyre\AppData\Roaming\nltk_data... [nltk_data] Unzipping taggers\averaged_perceptron_tagger_eng.zip. True >>> nltk.pos_tag(chom) [('Colorless', 'NNP'), ('green', 'JJ'), ('ideas', 'NNS'), ('sleep', 'VBP'), ('furiously', 'RB'), ('.', '.')] >>> nltk.help.upenn_tagset('NNP') NNP: noun, proper, singular Motown Venneboerger Czestochwa Ranzer Conchita Trumplane Christos Oceanside Escobar Kreisler Sawyer Cougar Yvette Ervin ODI Darryl CTCA Shannon A.K.C. Meltex Liverpool ... >>> nltk.pos_tag('January was a cold month'.split()) [('January', 'NNP'), ('was', 'VBD'), ('a', 'DT'), ('cold', 'JJ'), ('month', 'NN')] >>> nltk.pos_tag('January was a cold month .'.split()) [('January', 'NNP'), ('was', 'VBD'), ('a', 'DT'), ('cold', 'JJ'), ('month', 'NN'), ('.', '.')] >>> nltk.pos_tag('Yesterday was a cold day .'.split()) [('Yesterday', 'NN'), ('was', 'VBD'), ('a', 'DT'), ('cold', 'JJ'), ('day', 'NN'), ('.', '.')]