Stanza by Stanford NLP: a New Library to Rule them All?¶
PyLing, April 9, 2025, Na-Rae Han (naraehan@pitt.edu)
- Surprise: for real-life language processing tasks, people don't use NLTK's POS tagger or syntactic parser
- We use off-the-shelf NLP suites coupled with pre-trained language models
- They commonly implement a pipeline that takes care of essential text processing steps, including tokenization, lemmatization, POS tagging, syntactic parsing, and named-entity recognition. They also tend to adopt the latest neural network architectures.
- We'll take a look at stanza, and also spacy
Stanza: A Python NLP Package for Many Human Languages¶
- By Stanford NLP Group
- Project home: https://stanfordnlp.github.io/stanza/
- GitHub repo: https://github.com/stanfordnlp/stanza
- Pre-trained language models are posted on Hugging Face.
- English model: https://huggingface.co/stanfordnlp/stanza-en
- Korean model: https://huggingface.co/stanfordnlp/stanza-ko
- All language model files are named default.zip. You can manually download this file and unzip it.
In [1]:
%pprint
Pretty printing has been turned OFF
In [2]:
# Install stanza for yourself. I've already done this, so commented out.
# This installs the package in my home directory ~/.local/lib/python3.11/site-packages
# !pip install stanza --user
In [3]:
import stanza
In [4]:
# Download the English language model via stanza.download('en')
# This creates a new folder ~/stanza_resources and downloads the model files into it
# I've already downloaded the model, so commented out:
# stanza.download('en')
In [5]:
# Build an English NLP pipeline
nlp = stanza.Pipeline('en')
# By default, it checks for a newer language model. Use download_method=None to skip the check
2025-04-10 13:22:45 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json
2025-04-10 13:22:45 INFO: Downloaded file to C:\Users\narae\stanza_resources\resources.json
2025-04-10 13:22:47 INFO: Loading these models for language: en (English):
============================================
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |
============================================
2025-04-10 13:22:47 INFO: Using device: cpu
2025-04-10 13:22:47 INFO: Loading: tokenize
C:\Program Files\Python311\Lib\site-packages\torch\_utils.py:776: UserWarning: TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly. To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()
  return self.fget.__get__(instance, owner)()
2025-04-10 13:22:47 INFO: Loading: mwt
2025-04-10 13:22:47 INFO: Loading: pos
2025-04-10 13:22:48 INFO: Loading: lemma
2025-04-10 13:22:48 INFO: Loading: constituency
2025-04-10 13:22:48 INFO: Loading: depparse
2025-04-10 13:22:48 INFO: Loading: sentiment
2025-04-10 13:22:49 INFO: Loading: ner
2025-04-10 13:22:50 INFO: Done loading processors!
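As the log message says, the update check (which needs network access) can be skipped. A minimal sketch, reusing the models already on disk:

# Skip the resources.json update check; reuse the local model files
nlp = stanza.Pipeline('en', download_method=None)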
In [6]:
doc = nlp("Barack Obama was born in Hawaii. He was elected president in 2008.")
In [7]:
# Printing out in CoNLL-U format.
# Why this works: stanza's Document implements __format__, so the "C" format
# spec tells the document to render itself as CoNLL-U.
print("{:C}".format(doc))
# text = Barack Obama was born in Hawaii.
# sent_id = 0
# constituency = (ROOT (S (NP (NNP Barack) (NNP Obama)) (VP (VBD was) (VP (VBN born) (PP (IN in) (NP (NNP Hawaii))))) (. .)))
# sentiment = 1
1	Barack	Barack	PROPN	NNP	Number=Sing	4	nsubj:pass	_	start_char=0|end_char=6|ner=B-PERSON
2	Obama	Obama	PROPN	NNP	Number=Sing	1	flat	_	start_char=7|end_char=12|ner=E-PERSON
3	was	be	AUX	VBD	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	4	aux:pass	_	start_char=13|end_char=16|ner=O
4	born	bear	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	_	start_char=17|end_char=21|ner=O
5	in	in	ADP	IN	_	6	case	_	start_char=22|end_char=24|ner=O
6	Hawaii	Hawaii	PROPN	NNP	Number=Sing	4	obl	_	start_char=25|end_char=31|ner=S-GPE|SpaceAfter=No
7	.	.	PUNCT	.	_	4	punct	_	start_char=31|end_char=32|ner=O

# text = He was elected president in 2008.
# sent_id = 1
# constituency = (ROOT (S (NP (PRP He)) (VP (VBD was) (VP (VBN elected) (S (NP (NN president))) (PP (IN in) (NP (CD 2008))))) (. .)))
# sentiment = 1
1	He	he	PRON	PRP	Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Prs	3	nsubj:pass	_	start_char=33|end_char=35|ner=O
2	was	be	AUX	VBD	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	3	aux:pass	_	start_char=36|end_char=39|ner=O
3	elected	elect	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root	_	start_char=40|end_char=47|ner=O
4	president	president	NOUN	NN	Number=Sing	3	xcomp	_	start_char=48|end_char=57|ner=O
5	in	in	ADP	IN	_	6	case	_	start_char=58|end_char=60|ner=O
6	2008	2008	NUM	CD	NumForm=Digit|NumType=Card	3	obl	_	start_char=61|end_char=65|ner=S-DATE|SpaceAfter=No
7	.	.	PUNCT	.	_	3	punct	_	start_char=65|end_char=66|ner=O|SpaceAfter=No
In [8]:
print("{:C}".format(nlp("NLP is my favorite thing in the world. You have thousand years to live.")))
# text = NLP is my favorite thing in the world.
# sent_id = 0
# constituency = (ROOT (S (NP (NNP NLP)) (VP (VBZ is) (NP (NP (PRP$ my) (JJ favorite) (NN thing)) (PP (IN in) (NP (DT the) (NN world))))) (. .)))
# sentiment = 2
1	NLP	NLP	PROPN	NNP	Number=Sing	5	nsubj	_	start_char=0|end_char=3|ner=O
2	is	be	AUX	VBZ	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	5	cop	_	start_char=4|end_char=6|ner=O
3	my	my	PRON	PRP$	Case=Gen|Number=Sing|Person=1|Poss=Yes|PronType=Prs	5	nmod:poss	_	start_char=7|end_char=9|ner=O
4	favorite	favorite	ADJ	JJ	Degree=Pos	5	amod	_	start_char=10|end_char=18|ner=O
5	thing	thing	NOUN	NN	Number=Sing	0	root	_	start_char=19|end_char=24|ner=O
6	in	in	ADP	IN	_	8	case	_	start_char=25|end_char=27|ner=O
7	the	the	DET	DT	Definite=Def|PronType=Art	8	det	_	start_char=28|end_char=31|ner=O
8	world	world	NOUN	NN	Number=Sing	5	nmod	_	start_char=32|end_char=37|ner=O|SpaceAfter=No
9	.	.	PUNCT	.	_	5	punct	_	start_char=37|end_char=38|ner=O

# text = You have thousand years to live.
# sent_id = 1
# constituency = (ROOT (S (NP (PRP You)) (VP (VBP have) (NP (NP (CD thousand) (NNS years)) (SBAR (S (VP (TO to) (VP (VB live))))))) (. .)))
# sentiment = 1
1	You	you	PRON	PRP	Case=Nom|Person=2|PronType=Prs	2	nsubj	_	start_char=39|end_char=42|ner=O
2	have	have	VERB	VBP	Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin	0	root	_	start_char=43|end_char=47|ner=O
3	thousand	thousand	NUM	CD	NumForm=Word|NumType=Card	4	nummod	_	start_char=48|end_char=56|ner=B-DATE
4	years	year	NOUN	NNS	Number=Plur	2	obj	_	start_char=57|end_char=62|ner=E-DATE
5	to	to	PART	TO	_	6	mark	_	start_char=63|end_char=65|ner=O
6	live	live	VERB	VB	VerbForm=Inf	4	acl	_	start_char=66|end_char=70|ner=O|SpaceAfter=No
7	.	.	PUNCT	.	_	2	punct	_	start_char=70|end_char=71|ner=O|SpaceAfter=No
In [9]:
# What can you do with a document object?
dir(doc)
Out[9]:
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_attach_coref_mentions', '_coref', '_count_words', '_ents', '_lang', '_num_tokens', '_num_words', '_process_sentences', '_sentences', '_text', 'add_property', 'build_ents', 'coref', 'entities', 'ents', 'from_serialized', 'get', 'get_mwt_expansions', 'iter_tokens', 'iter_words', 'lang', 'mark_whitespace', 'num_tokens', 'num_words', 'reindex_sentences', 'sentence_comments', 'sentences', 'set', 'set_mwt_expansions', 'sort_features', 'text', 'to_dict', 'to_serialized']
In [10]:
# What can you do with a sentence object?
dir(doc.sentences[0])
Out[10]:
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_comments', '_constituency', '_dependencies', '_doc', '_doc_id', '_empty_words', '_enhanced_dependencies', '_ents', '_index', '_process_tokens', '_sent_id', '_sentiment', '_text', '_tokens', '_words', 'add_comment', 'add_property', 'build_dependencies', 'build_ents', 'build_fake_dependencies', 'comments', 'constituency', 'dependencies', 'dependencies_string', 'doc', 'doc_id', 'empty_words', 'entities', 'ents', 'has_enhanced_dependencies', 'id', 'index', 'print_dependencies', 'print_tokens', 'print_words', 'rebuild_dependencies', 'sent_id', 'sentiment', 'text', 'to_dict', 'tokens', 'tokens_string', 'words', 'words_string']
Tips for finding your way around a new library and its data objects¶
- Use dir() and help(), which reveal the methods available for a particular data object
- Also... look up the documentation! Stanza's reference page on its data objects: https://stanfordnlp.github.io/stanza/data_objects.html
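One more dir() trick (plain Python, nothing stanza-specific): filter out the underscore names to see just the public API:

# Show only the public attributes/methods of the document object
[m for m in dir(doc) if not m.startswith('_')]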
In [11]:
doc.sentences[0].text
Out[11]:
'Barack Obama was born in Hawaii.'
In [12]:
doc.sentences[0].constituency
Out[12]:
(ROOT (S (NP (NNP Barack) (NNP Obama)) (VP (VBD was) (VP (VBN born) (PP (IN in) (NP (NNP Hawaii))))) (. .)))
In [13]:
dir(doc.sentences[0].constituency)
Out[13]:
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'add_property', 'all_leaves_are_preterminals', 'children', 'count_unary_depth', 'depth', 'get_common_words', 'get_compound_constituents', 'get_constituent_counts', 'get_rare_words', 'get_root_labels', 'get_unique_constituent_labels', 'get_unique_tags', 'get_unique_words', 'is_leaf', 'is_preterminal', 'label', 'leaf_labels', 'pretty_print', 'prune_none', 'remap_constituent_labels', 'remap_words', 'replace_tags', 'replace_words', 'reverse', 'simplify_labels', 'visit_preorder', 'write_treebank', 'yield_preterminals']
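The listing above includes a pretty_print method; presumably (untested here) it can be called with no arguments to print a readable rendering of the tree:

doc.sentences[0].constituency.pretty_print()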
In [14]:
doc.sentences[0].constituency.__str__()
Out[14]:
'(ROOT (S (NP (NNP Barack) (NNP Obama)) (VP (VBD was) (VP (VBN born) (PP (IN in) (NP (NNP Hawaii))))) (. .)))'
In [15]:
import nltk
tree1 = nltk.Tree.fromstring(doc.sentences[0].constituency.__str__())
tree1
Out[15]:
- JNB users: in-line tree drawing won't work unless you have the svgling package installed.
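If you can't install svgling, NLTK can still render the tree as plain text:

# ASCII tree drawing; needs no extra packages
tree1.pretty_print()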
In [16]:
doc.sentences[0].print_dependencies()
('Barack', 4, 'nsubj:pass')
('Obama', 1, 'flat')
('was', 4, 'aux:pass')
('born', 0, 'root')
('in', 6, 'case')
('Hawaii', 4, 'obl')
('.', 4, 'punct')
In [17]:
# visualizing dependency trees!
from stanza.utils.visualization.dependency_visualization import visualize_doc
visualize_doc(doc, 'en')
Note: When this JNB is uploaded to GitHub, the dependency arc labels ("nsubj:pass", "case", etc.) don't show up. Boo!
In [18]:
doc.sentences[0].words[:3] # first 3 words
Out[18]:
[{ "id": 1, "text": "Barack", "lemma": "Barack", "upos": "PROPN", "xpos": "NNP", "feats": "Number=Sing", "head": 4, "deprel": "nsubj:pass", "start_char": 0, "end_char": 6 }, { "id": 2, "text": "Obama", "lemma": "Obama", "upos": "PROPN", "xpos": "NNP", "feats": "Number=Sing", "head": 1, "deprel": "flat", "start_char": 7, "end_char": 12 }, { "id": 3, "text": "was", "lemma": "be", "upos": "AUX", "xpos": "VBD", "feats": "Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin", "head": 4, "deprel": "aux:pass", "start_char": 13, "end_char": 16 }]
In [19]:
for w in doc.sentences[0].words:
    print(w.id, w.text, w.lemma, w.upos, w.xpos, w.feats, w.head, w.deprel, sep="\t")
1	Barack	Barack	PROPN	NNP	Number=Sing	4	nsubj:pass
2	Obama	Obama	PROPN	NNP	Number=Sing	1	flat
3	was	be	AUX	VBD	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	4	aux:pass
4	born	bear	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	0	root
5	in	in	ADP	IN	None	6	case
6	Hawaii	Hawaii	PROPN	NNP	Number=Sing	4	obl
7	.	.	PUNCT	.	None	4	punct
Careful: word IDs start at 1, but list indexes start at 0. That means, while switching back and forth between list index and word ID, you have to be mindful about the +1/-1 offset.
For example, "Hawaii" has word ID 6, but its list index is 5. Its head points to "born", whose ID is 4 but whose index is 3.
NER (Named-entity recognition)!¶
In [20]:
doc.entities
Out[20]:
[{ "text": "Barack Obama", "type": "PERSON", "start_char": 0, "end_char": 12 }, { "text": "Hawaii", "type": "GPE", "start_char": 25, "end_char": 31 }, { "text": "2008", "type": "DATE", "start_char": 61, "end_char": 65 }]
Let's try... Korean!¶
In [21]:
nlp_ko = stanza.Pipeline('ko')
doc_ko = nlp_ko("오늘 날씨가 아주 좋군요.")
2025-04-10 13:22:59 INFO: Checking for updates to resources.json in case models have been updated. Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json
2025-04-10 13:22:59 INFO: Downloaded file to C:\Users\narae\stanza_resources\resources.json
2025-04-10 13:23:00 INFO: Loading these models for language: ko (Korean):
==============================
| Processor | Package        |
------------------------------
| tokenize  | kaist          |
| pos       | kaist_nocharlm |
| lemma     | kaist_nocharlm |
| depparse  | kaist_nocharlm |
==============================
2025-04-10 13:23:00 INFO: Using device: cpu
2025-04-10 13:23:00 INFO: Loading: tokenize
2025-04-10 13:23:00 INFO: Loading: pos
2025-04-10 13:23:01 INFO: Loading: lemma
2025-04-10 13:23:02 INFO: Loading: depparse
2025-04-10 13:23:02 INFO: Done loading processors!
In [22]:
doc_ko.sentences[0].print_dependencies()
('오늘', 2, 'compound')
('날씨가', 4, 'nsubj')
('아주', 4, 'advmod')
('좋군요', 0, 'root')
('.', 4, 'punct')
In [23]:
# How is morphology represented?
for w in doc_ko.sentences[0].words:
    print(w.id, w.text, w.lemma, w.upos, w.xpos, w.feats, w.head, w.deprel, sep="\t")
1	오늘	오늘	NOUN	ncn	None	2	compound
2	날씨가	날씨+가	NOUN	ncn+jcs	None	4	nsubj
3	아주	아주	ADV	mag	None	4	advmod
4	좋군요	좋+군요	ADJ	paa+ef	None	0	root
5	.	.	PUNCT	sf	None	4	punct
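The kaist models pack morphological segmentation into the lemma field, with '+' joining the morphemes. A sketch of pulling the segments apart, assuming this lemma convention holds across the model's output:

# e.g. '날씨+가' -> ['날씨', '가'], '좋+군요' -> ['좋', '군요']
for w in doc_ko.sentences[0].words:
    print(w.text, w.lemma.split('+'), sep='\t')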
Back to English: Yelp Reviews, 10K only¶
In [45]:
import pandas as pd
filename = 'review_10k.json'
df = pd.read_json(filename, lines=True, encoding='utf-8')
print(df.head(5))
                review_id                 user_id             business_id  \
0  YS5GiNt7SeaofYv4Ms_WMQ  6U0PY5tSp2kiFFBG-2conA  6M747U8wDVZcRZHqIYB_iQ
1  7IfTa9YkgpUl2uxpPH3eng  8RrH2pgJJ2PPGZwYuzJnRg  DpmTxK7_GPfiyLAM6LSL7Q
2  cNmwd1HdI6o075iqqqTduQ  ZrMA70Hq6mkJT-1l0xnwPQ  Ckzuf2-coSImCKlYwKRTNw
3  I5Qlw24u74kCEtxxFAUJDg  1J4UgfT5v5rwYLCX6fCbYQ  3L-ezs0VANOtmdJDlbha1Q
4  SQgplD2UFkduTfjgdgaYqg  KHgWURbB5TNHNYu72memYg  v_1d4aNN7bjiQ7aF3S0hgA

   stars  useful  funny  cool  \
0      1       2      0     0
1      5       0      0     0
2      3       1      1     1
3      5       1      0     0
4      5       0      0     0

                                                text                date
0  Very confusing entrance. The putting in your n... 2015-11-30 20:37:04
1  Absolutely delicious vegan donuts - not just o... 2018-09-13 20:02:55
2  I had such high hopes! I will say the location... 2016-05-13 17:10:57
3  Busy when I went. Worth the wait. The best Mex... 2013-06-15 03:25:33
4  Have been wanting to check this place out for ... 2018-07-29 16:06:48
In [25]:
df = df[['stars', 'text']]
df.head()
Out[25]:
|   | stars | text |
|---|---|---|
| 0 | 1 | Very confusing entrance. The putting in your n... |
| 1 | 5 | Absolutely delicious vegan donuts - not just o... |
| 2 | 3 | I had such high hopes! I will say the location... |
| 3 | 5 | Busy when I went. Worth the wait. The best Mex... |
| 4 | 5 | Have been wanting to check this place out for ... |
In [26]:
df.text[0]
Out[26]:
'Very confusing entrance. The putting in your name line and the your table is ready line are just muddled together. Called ahead to see if they accept reservation, they don\'t, but was told to tell the hostess what game we wanted to see and they would make sure we were in that section. When we arrived the hostess, Natalie, asked our party size and then when we said we had called ahead as we were told to do but before we had a chance she cut us off with "We don\'t do reservations" we continued on with our request. When we were seated we had to ask to have the game we wanted put on a screen near us. They did so. The food was okay. A bit pricey but you are in a tourist mecca. My wife ordered a "Wedge" salad what she got was a salad with a few lettuce leaves definately no wedge. I had the Cuban Meatloaf sandwich it was alright. All in all if you want to watch a game and pay alot for a beer (6.50 a pint for domestic) this is your place.'
In [27]:
df.text[3]
Out[27]:
"Busy when I went. Worth the wait. The best Mexican food I've had in Portland by far. Food made with love and great flavor!!"
In [28]:
doc1 = nlp(df.text[3])
In [29]:
for w in doc1.iter_words():
    print(w.id, w.text, w.lemma, w.upos, w.xpos, w.feats, w.head, w.deprel, sep="\t")
1	Busy	busy	ADJ	JJ	Degree=Pos	0	root
2	when	when	ADV	WRB	PronType=Int	4	advmod
3	I	I	PRON	PRP	Case=Nom|Number=Sing|Person=1|PronType=Prs	4	nsubj
4	went	go	VERB	VBD	Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin	1	advcl
5	.	.	PUNCT	.	None	1	punct
1	Worth	worth	ADJ	JJ	Degree=Pos	0	root
2	the	the	DET	DT	Definite=Def|PronType=Art	3	det
3	wait	wait	NOUN	NN	Number=Sing	1	obj
4	.	.	PUNCT	.	None	1	punct
1	The	the	DET	DT	Definite=Def|PronType=Art	4	det
2	best	good	ADJ	JJS	Degree=Sup	4	amod
3	Mexican	Mexican	ADJ	JJ	Degree=Pos	4	amod
4	food	food	NOUN	NN	Number=Sing	0	root
5	I	I	PRON	PRP	Case=Nom|Number=Sing|Person=1|PronType=Prs	7	nsubj
6	've	have	AUX	VBP	Mood=Ind|Number=Sing|Person=1|Tense=Pres|VerbForm=Fin	7	aux
7	had	have	VERB	VBN	Tense=Past|VerbForm=Part	4	acl:relcl
8	in	in	ADP	IN	None	9	case
9	Portland	Portland	PROPN	NNP	Number=Sing	7	obl
10	by	by	ADP	IN	None	11	case
11	far	far	ADV	RB	Degree=Pos	7	obl
12	.	.	PUNCT	.	None	4	punct
1	Food	food	NOUN	NN	Number=Sing	0	root
2	made	make	VERB	VBN	Tense=Past|VerbForm=Part|Voice=Pass	1	acl
3	with	with	ADP	IN	None	4	case
4	love	love	NOUN	NN	Number=Sing	2	obl
5	and	and	CCONJ	CC	None	7	cc
6	great	great	ADJ	JJ	Degree=Pos	7	amod
7	flavor	flavor	NOUN	NN	Number=Sing	4	conj
8	!!	!!	PUNCT	.	None	1	punct
Adjective + HEAD pairs?¶
- Let's print adjectives and their heads.
- An adjective's head can be: a noun ("happy girl"), ROOT ("is happy"), or another adjective when conjoined.
In [30]:
for sent in doc1.sentences:
    print(sent.text)
    for w in sent.words:
        if w.upos=='ADJ':
            # note how headword is looked up through ID and index:
            headtext = sent.words[w.head-1].text if w.head > 0 else "ROOT"
            print(" ", w.id, w.text, w.upos, str(w.head)+":"+headtext, w.deprel, sep="\t")
    print()
Busy when I went.
 	1	Busy	ADJ	0:ROOT	root

Worth the wait.
 	1	Worth	ADJ	0:ROOT	root

The best Mexican food I've had in Portland by far.
 	2	best	ADJ	4:food	amod
 	3	Mexican	ADJ	4:food	amod

Food made with love and great flavor!!
 	6	great	ADJ	7:flavor	amod
Sentiment analysis!¶
In [31]:
for i, sent in enumerate(doc1.sentences):
    print(sent.text)
    print("%d -> %d" % (i, sent.sentiment))
Busy when I went.
0 -> 1
Worth the wait.
1 -> 1
The best Mexican food I've had in Portland by far.
2 -> 2
Food made with love and great flavor!!
3 -> 2
In [32]:
senti_scores = [sent.sentiment for sent in doc1.sentences]
print(senti_scores) # 1s and 2s
import numpy as np
np.mean(senti_scores) # this was a 5-star review
[1, 1, 2, 2]
Out[32]:
1.5
In [33]:
doc2 = nlp(df.text[0])
for i, sent in enumerate(doc2.sentences):
    print(sent.text)
    print("%d -> %d" % (i, sent.sentiment))
senti_scores2 = [sent.sentiment for sent in doc2.sentences]
np.mean(senti_scores2) # this was a 1-star review
Very confusing entrance.
0 -> 0
The putting in your name line and the your table is ready line are just muddled together.
1 -> 0
Called ahead to see if they accept reservation, they don't, but was told to tell the hostess what game we wanted to see and they would make sure we were in that section.
2 -> 1
When we arrived the hostess, Natalie, asked our party size and then when we said we had called ahead as we were told to do but before we had a chance she cut us off with "We don't do reservations" we continued on with our request.
3 -> 0
When we were seated we had to ask to have the game we wanted put on a screen near us.
4 -> 1
They did so.
5 -> 1
The food was okay.
6 -> 1
A bit pricey but you are in a tourist mecca.
7 -> 1
My wife ordered a "Wedge" salad what she got was a salad with a few lettuce leaves definately no wedge.
8 -> 1
I had the Cuban Meatloaf sandwich it was alright.
9 -> 1
All in all if you want to watch a game and pay alot for a beer (6.50 a pint for domestic) this is your place.
10 -> 1
Out[33]:
0.7272727272727273
What adjectives and adverbs are common in Yelp reviews?¶
- Let's process all 10K reviews and find out!
- This will take a long time. We'll use the tqdm library to keep an eye on the progress.
In [ ]:
# progress bar!
from tqdm import tqdm
tqdm.pandas(desc='DataFrame Operation')
review_docs = df.text.progress_apply(nlp)
## Well -- tqdm reports 2-5 sec per review, estimates 7-12 hours of total processing time. YIKES!
## So I had to abort this run.
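One way to cut the per-review cost (a sketch, not timed in this session): build a slimmer pipeline with only the processors you actually need, e.g., just tokenization and POS tagging:

# Leaner English pipeline: no parsing, NER, or sentiment
nlp_small = stanza.Pipeline('en', processors='tokenize,mwt,pos', download_method=None)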
In [48]:
# So let's just do... 500 reviews. How long does it take?
%time review_docs = df[:500].text.apply(nlp)
len(review_docs)
CPU times: total: 2h 20min 27s
Wall time: 25min 10s
Out[48]:
500
In [49]:
dir(review_docs[0])
Out[49]:
['__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_attach_coref_mentions', '_coref', '_count_words', '_ents', '_lang', '_num_tokens', '_num_words', '_process_sentences', '_sentences', '_text', 'add_property', 'build_ents', 'coref', 'entities', 'ents', 'from_serialized', 'get', 'get_mwt_expansions', 'iter_tokens', 'iter_words', 'lang', 'mark_whitespace', 'num_tokens', 'num_words', 'reindex_sentences', 'sentence_comments', 'sentences', 'set', 'set_mwt_expansions', 'sort_features', 'text', 'to_dict', 'to_serialized']
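After a 25-minute run, it's worth saving the processed documents so they don't have to be recomputed. A sketch with pickle (the filename is made up; the dir() listing above also shows stanza's own to_serialized/from_serialized methods as an alternative):

import pickle
# Save the 500 processed Document objects to disk
with open('review_docs_500.pkl', 'wb') as f:
    pickle.dump(list(review_docs), f)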
In [50]:
# (Condition, Outcome) format for every word in our favorite review
[(word.upos, word.text) for word in review_docs[3].iter_words()]
Out[50]:
[('ADJ', 'Busy'), ('ADV', 'when'), ('PRON', 'I'), ('VERB', 'went'), ('PUNCT', '.'), ('ADJ', 'Worth'), ('DET', 'the'), ('NOUN', 'wait'), ('PUNCT', '.'), ('DET', 'The'), ('ADJ', 'best'), ('ADJ', 'Mexican'), ('NOUN', 'food'), ('PRON', 'I'), ('AUX', "'ve"), ('VERB', 'had'), ('ADP', 'in'), ('PROPN', 'Portland'), ('ADP', 'by'), ('ADV', 'far'), ('PUNCT', '.'), ('NOUN', 'Food'), ('VERB', 'made'), ('ADP', 'with'), ('NOUN', 'love'), ('CCONJ', 'and'), ('ADJ', 'great'), ('NOUN', 'flavor'), ('PUNCT', '!!')]
In [51]:
# For all 500 review docs. Nested list comprehension that flattens:
pos_word = [(word.upos, word.text.lower()) for mydoc in review_docs for word in mydoc.iter_words()]
pos_word[:10]
Out[51]:
[('ADV', 'very'), ('ADJ', 'confusing'), ('NOUN', 'entrance'), ('PUNCT', '.'), ('DET', 'the'), ('VERB', 'putting'), ('ADP', 'in'), ('PRON', 'your'), ('NOUN', 'name'), ('NOUN', 'line')]
In [52]:
len(pos_word)
Out[52]:
66238
In [53]:
pos_word_CFD = nltk.ConditionalFreqDist(pos_word)
pos_word_CFD['ADJ'].most_common(50)
Out[53]:
[('good', 266), ('great', 161), ('other', 92), ('best', 92), ('nice', 88), ('more', 76), ('first', 69), ('new', 67), ('little', 67), ('delicious', 63), ('friendly', 63), ('amazing', 61), ('hot', 49), ('bad', 47), ('sure', 46), ('few', 45), ('same', 45), ('better', 44), ('last', 44), ('small', 41), ('many', 40), ('clean', 39), ('tasty', 38), ('different', 38), ('fresh', 38), ('perfect', 37), ('full', 36), ('worth', 36), ('only', 35), ('next', 34), ('big', 32), ('sweet', 31), ('free', 30), ('happy', 30), ('most', 29), ('awesome', 29), ('long', 28), ('wonderful', 27), ('old', 26), ('busy', 25), ('cool', 25), ('high', 24), ('hard', 24), ('else', 24), ('much', 23), ('worst', 23), ('less', 23), ('large', 23), ('decent', 23), ('several', 22)]
In [54]:
pos_word_CFD['ADJ'].freq('good')
Out[54]:
0.050236071765816806
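freq() is just the word's count divided by the total number of ADJ tokens; the same number computed by hand:

fd = pos_word_CFD['ADJ']
fd['good'] / fd.N()   # 266 / total ADJ count, about 0.0502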
In [55]:
pos_word_CFD['ADV'].most_common(50)
Out[55]:
[('so', 244), ('very', 218), ('when', 179), ('just', 175), ('here', 164), ('really', 124), ('back', 113), ('also', 99), ('never', 87), ('even', 84), ('how', 84), ('then', 83), ('there', 83), ('too', 79), ('always', 77), ('only', 77), ('again', 71), ('well', 67), ('as', 63), ('now', 58), ('still', 58), ('definitely', 57), ('ever', 51), ('pretty', 45), ('however', 42), ('about', 38), ('where', 36), ('highly', 32), ('in', 31), ('away', 30), ('super', 29), ('almost', 28), ('much', 27), ('actually', 26), ('more', 25), ('over', 25), ('maybe', 24), ('later', 23), ('probably', 22), ('once', 22), ('finally', 22), ('right', 21), ('quite', 21), ('all', 21), ('why', 20), ('though', 20), ('inside', 19), ('usually', 19), ('first', 19), ('soon', 19)]
Caution: Using Pre-Built NLP Suites¶
- These tools popularized NLP among general CS, data-science, and "analytics" crowds.
- Those users tend to simply plug their text into these suites and use the output for their goals... What they often don't do:
- try to understand how the tools are built: what algorithms are used, and what language data the models were trained on
- pay attention to the linguistic representations at the various levels: POS, dependency syntax, etc.
- critically evaluate where the models are successful and where they are not!
We're trained linguists; we can and should do better.¶
- If you incorporate these tools into your project (you should!), make sure to engage with them: evaluate the output with critical eyes, and consider whether they are a good fit for your particular language data. How accurate are they? Where do they succeed, and where do they fall short? This is important!