Word2Vec through gensim¶

4/10/2018 PyLing meeting

Tutorial at: https://machinelearningmastery.com/develop-word-embeddings-python-gensim/
The code below is a near-exact copy of everything in this tutorial. See the tutorial page for detailed explanation of what's going on.

Also, read SLP3 chapters:
- Ch.15 Vector semantics https://web.stanford.edu/~jurafsky/slp3/15.pdf
- Ch.16 Semantics with dense vectors https://web.stanford.edu/~jurafsky/slp3/16.pdf

Setting up your python environment on CRC¶

ssh into h2p.crc.pitt.edu
Your default python is 2.7.5. Confirm by: python --version
See which versions are available on system: module spider python
Switch over to the appropriate version of python: module load python/anaconda3.5-4.2.0
Confirm you are now working with this version of python: python --version
gensim package is not installed system-wide. You should install your own copy: pip intall gensim --user

Using Jupyter Notebook on CRC¶

Point your browser to https://hub.crc.pitt.edu/
Log in with your Pitt ID (will probably have to 2-factor-authenticate)
For this demo session, use "Host process" as job profile (less prone to network overload)
When you are running a real job, you should choose "SMP - 1 core, 1 hour"!!!
When done using Jupyter Notebook, make sure to properly close your session by logging out.

# Turns on/off pretty printing 
%pprint

# Every returned Out[] is displayed, not just the last one. 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Pretty printing has been turned OFF

# Import Word2Vec from gensim. 
# gensim is not installed system-wide. Install for yourself: pip install gensim --user
from gensim.models import Word2Vec

from nltk import word_tokenize

mary = """Mary had a little lamb,
His fleece was white as snow,
And everywhere that Mary went,
The lamb was sure to go.
"Why does the lamb love Mary so?"
The eager children cry.
"Why, Mary loves the lamb, you know."
The teacher did reply."""

sentences = [word_tokenize(s) for s in mary.lower().split('\n')]
sentences

[['mary', 'had', 'a', 'little', 'lamb', ','], ['his', 'fleece', 'was', 'white', 'as', 'snow', ','], ['and', 'everywhere', 'that', 'mary', 'went', ','], ['the', 'lamb', 'was', 'sure', 'to', 'go', '.'], ['``', 'why', 'does', 'the', 'lamb', 'love', 'mary', 'so', '?', "''"], ['the', 'eager', 'children', 'cry', '.'], ['``', 'why', ',', 'mary', 'loves', 'the', 'lamb', ',', 'you', 'know', '.', "''"], ['the', 'teacher', 'did', 'reply', '.']]

# train model
model = Word2Vec(sentences, min_count=1)

# summarize the loaded model
print(model)

Word2Vec(vocab=37, size=100, alpha=0.025)

# summarize vocabulary
words = list(model.wv.vocab)
print(words)

['mary', 'white', '.', 'so', 'everywhere', 'to', 'went', '?', 'that', 'know', 'was', 'eager', 'does', 'sure', 'a', 'little', 'you', 'go', 'the', 'why', "''", 'cry', 'as', 'snow', '``', 'teacher', 'reply', 'fleece', 'and', 'children', 'did', 'love', 'loves', 'had', 'his', ',', 'lamb']

# access vector for one word
#print(model['sentence'])
print(model['lamb'])

[-2.7182766e-03  1.2487710e-03 -4.4074864e-03  3.5265326e-03
  1.7131231e-03 -2.8178061e-03  2.9640337e-03 -2.4741637e-03
  2.6946011e-04 -1.5621432e-03  1.7189173e-04 -1.4304175e-03
 -1.0281261e-03  1.5905446e-03  7.1188854e-04 -3.9464920e-03
  2.5527547e-03 -1.0956245e-03  7.3364319e-04  2.5594078e-03
 -9.1431820e-04  2.2941572e-03  3.5932153e-03  1.8624350e-03
  2.4553102e-03 -2.8646537e-03 -2.4643880e-03  1.9218427e-03
 -3.8764605e-03  3.4359689e-03 -2.4164815e-03  1.7021263e-03
 -3.5764112e-03  1.4196016e-03 -2.0195674e-03  4.0827030e-03
 -3.9471732e-03 -6.1231584e-04  3.9051475e-03 -3.8997584e-03
 -1.0480320e-03  2.0413217e-03 -4.5277039e-03  3.7661460e-03
 -2.5259005e-03 -1.4360002e-03  7.3607016e-04  3.8408018e-03
  3.9864983e-03 -6.8123976e-04 -3.5251860e-04  4.0145023e-03
  3.1279428e-03  3.9720386e-03  9.3507761e-04 -2.3743813e-03
 -2.3502845e-03  3.5284371e-03 -5.6150433e-04  3.0950115e-03
  3.9208033e-03  2.0687412e-03 -4.6105627e-03 -4.9310355e-03
  3.1625992e-03  1.9219007e-03  1.3885073e-03 -4.1954550e-03
  3.5644420e-03 -7.0061404e-05  3.6340770e-03 -3.7480018e-03
  7.0168247e-04 -4.0180283e-03  2.3534100e-03 -1.8334251e-03
 -2.7314085e-03  2.3767257e-03  2.6325590e-03  2.2375344e-03
  3.1389827e-03 -3.8831038e-04 -4.8180486e-04 -2.2694494e-03
 -1.0046981e-03 -2.2774746e-03  1.5747132e-03  4.3728095e-03
  2.2946699e-03 -3.6577368e-03  3.3423168e-04  1.2059795e-03
  6.9353776e-04 -3.6424596e-03  4.3872455e-03 -3.7329900e-03
 -2.2044044e-03  7.2465651e-04 -1.1315111e-03 -2.8985806e-03]

/ihome/crc/install/python/anaconda3.5-4.2.0/lib/python3.5/site-packages/ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  This is separate from the ipykernel package so we can avoid doing imports until

# save model
model.save('model.bin')

# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

Word2Vec(vocab=37, size=100, alpha=0.025)

dir(model)

['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__ignoreds', '__init__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__numpys', '__recursive_saveloads', '__reduce__', '__reduce_ex__', '__repr__', '__scipys', '__setattr__', '__sizeof__', '__slotnames__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_check_training_sanity', '_clear_post_train', '_do_train_job', '_get_job_params', '_get_thread_working_mem', '_job_producer', '_load_specials', '_log_epoch_end', '_log_epoch_progress', '_log_progress', '_log_train_end', '_minimize_model', '_raw_word_count', '_save_specials', '_set_train_params', '_smart_save', '_train_epoch', '_update_job_params', '_worker_loop', 'accuracy', 'alpha', 'batch_words', 'build_vocab', 'build_vocab_from_freq', 'callbacks', 'cbow_mean', 'clear_sims', 'compute_loss', 'corpus_count', 'cum_table', 'delete_temporary_training_data', 'doesnt_match', 'epochs', 'estimate_memory', 'evaluate_word_pairs', 'get_latest_training_loss', 'hashfxn', 'hs', 'init_sims', 'intersect_word2vec_format', 'iter', 'layer1_size', 'load', 'load_word2vec_format', 'log_accuracy', 'min_alpha', 'min_alpha_yet_reached', 'min_count', 'model_trimmed_post_training', 'most_similar', 'most_similar_cosmul', 'n_similarity', 'negative', 'predict_output_word', 'random', 'reset_from', 'running_training_loss', 'sample', 'save', 'save_word2vec_format', 'score', 'sg', 'similar_by_vector', 'similar_by_word', 'similarity', 'syn0_lockf', 'syn1', 'syn1neg', 'total_train_time', 'train', 'train_count', 'trainables', 'vector_size', 'vocabulary', 'window', 'wmdistance', 'workers', 'wv']

model.wv.vocab

{'mary': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35860>, 'white': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35dd8>, '.': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc358d0>, 'so': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35f98>, 'everywhere': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35908>, 'to': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35978>, 'went': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc359b0>, '?': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35a90>, 'that': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35a58>, 'know': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc3b048>, 'was': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc359e8>, 'eager': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35b00>, 'does': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35b38>, 'sure': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35b70>, 'a': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35ba8>, 'little': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35c18>, 'you': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35c50>, 'go': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35c88>, 'the': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35fd0>, 'why': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35cf8>, "''": <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35d30>, 'cry': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc3b080>, 'as': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35d68>, 'snow': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35da0>, '``': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35898>, 'teacher': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35a20>, 'reply': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35940>, 'fleece': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35e80>, 'and': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35eb8>, 'children': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35ef0>, 'did': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35be0>, 'love': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35f60>, 'loves': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35e10>, 'had': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35cc0>, 'his': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35e48>, ',': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35f28>, 'lamb': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35ac8>}

Visualization through PCA (principal component analysis)¶

from sklearn.decomposition import PCA
from matplotlib import pyplot

# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)

for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()

/ihome/crc/install/python/anaconda3.5-4.2.0/lib/python3.5/site-packages/ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).

<matplotlib.collections.PathCollection object at 0x7f41b8c432e8>

<matplotlib.text.Annotation object at 0x7f41b8c551d0>

<matplotlib.text.Annotation object at 0x7f41b8c43438>

<matplotlib.text.Annotation object at 0x7f41b8c43e80>

<matplotlib.text.Annotation object at 0x7f41b8c55400>

<matplotlib.text.Annotation object at 0x7f41b8c55588>

<matplotlib.text.Annotation object at 0x7f41b8c55710>

<matplotlib.text.Annotation object at 0x7f41b8c55898>

<matplotlib.text.Annotation object at 0x7f41b8c55a20>

Load Google's pre-trained word2vec embedding¶

Good news! I have already downloaded the model file and saved it in our shared_data folder on h2p.

from gensim.models import KeyedVectors

filename = '/ihome/pyling/shared_data/word_vectors/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
# works!

[('queen', 0.7118192315101624)]

result = model.most_similar(positive=['woman', 'actor'], negative=['man'], topn=1)
print(result)
# works again!

[('actress', 0.860262393951416)]

result = model.most_similar(positive=['dog', 'kitten'], negative=['baby'], topn=1)
print(result)
# Hmm. Not sure about this one. 100 billion words of news data might not feature enough kittens and puppies.

[('dogs', 0.7231072187423706)]

Load Stanford's pre-trained GloVe embedding¶

Again, the model files are already downloaded in the shared_data folder.

# GloVe file format must be first convertd to Word2Vec format
# Something that needs to be done ONCE: you guys don't need to do this! 

from gensim.scripts.glove2word2vec import glove2word2vec

vdir = '/ihome/pyling/shared_data/word_vectors/'
glove_input_file = vdir + 'glove.6B.100d.txt'
word2vec_output_file = vdir + 'glove.6B.100d.txt.word2vec'

glove2word2vec(glove_input_file, word2vec_output_file)

(400000, 100)

from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = vdir + 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)

[('queen', 0.7698541283607483)]

result = model.most_similar(positive=['dog', 'kitten'], negative=['baby'], topn=1)
print(result)
# 6 billion words of Wikipedia --> more kittens and puppies possibly.

[('puppy', 0.6481586694717407)]

result = model.most_similar(positive=['pittsburgh', 'eagles'], negative=['philadelphia'], topn=5)
print(result)
# Beautiful!

[('steelers', 0.7623487710952759), ('vikings', 0.7566234469413757), ('falcons', 0.7381411790847778), ('panthers', 0.7375524640083313), ('broncos', 0.735093355178833)]

result = model.most_similar(positive=['turtle', 'snail'], negative=['shell'], topn=5)
print(result)
# Hmm...

[('wildflower', 0.6302021741867065), ('turtles', 0.5460652112960815), ('cichlid', 0.5293101668357849), ('songbird', 0.5284878611564636), ('tortoises', 0.5175418257713318)]

result = model.most_similar(positive=['cow', 'pork'], negative=['pig'], topn=5)
print(result)
# Yayy!

[('beef', 0.7819944620132446), ('meat', 0.689876914024353), ('chicken', 0.5895066857337952), ('poultry', 0.5763587951660156), ('meats', 0.5711966753005981)]

result = model.most_similar(positive=['michelle', 'barack'], negative=['bush'], topn=5)
print(result)
# Huh! Is it because Laura Bush is still a Bush? 
# But George is not a very distinctive first name, and we can't do George W.

[('sasha', 0.6619373559951782), ('sarah', 0.6275115609169006), ('julia', 0.6156177520751953), ('malia', 0.6076964139938354), ('jessica', 0.605925440788269)]