Word2Vec through gensim

4/10/2018 PyLing meeting

Setting up your python environment on CRC

  • ssh into h2p.crc.pitt.edu
  • Your default python is 2.7.5. Confirm by: python --version
  • See which versions are available on system: module spider python
  • Switch over to the appropriate version of python: module load python/anaconda3.5-4.2.0
  • Confirm you are now working with this version of python: python --version
  • The gensim package is not installed system-wide. Install your own copy: pip install gensim --user (see the quick check below)
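
Once gensim is installed, a quick sanity check from within Python (a minimal sketch; your version number may differ):

# Confirm that gensim is importable and see which version was installed
import gensim
print(gensim.__version__)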

Using Jupyter Notebook on CRC

  • Point your browser to https://hub.crc.pitt.edu/
  • Log in with your Pitt ID (will probably have to 2-factor-authenticate)
  • For this demo session, use "Host process" as job profile (less prone to network overload)
  • When you are running a real job, you should choose "SMP - 1 core, 1 hour"!!!
  • When done using Jupyter Notebook, make sure to properly close your session by logging out.
In [1]:
# Turns on/off pretty printing 
%pprint

# Every returned Out[] is displayed, not just the last one. 
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Pretty printing has been turned OFF
In [2]:
# Import Word2Vec from gensim. 
# gensim is not installed system-wide. Install for yourself: pip install gensim --user
from gensim.models import Word2Vec
In [4]:
from nltk import word_tokenize

mary = """Mary had a little lamb,
His fleece was white as snow,
And everywhere that Mary went,
The lamb was sure to go.
"Why does the lamb love Mary so?"
The eager children cry.
"Why, Mary loves the lamb, you know."
The teacher did reply."""

sentences = [word_tokenize(s) for s in mary.lower().split('\n')]
sentences
Out[4]:
[['mary', 'had', 'a', 'little', 'lamb', ','], ['his', 'fleece', 'was', 'white', 'as', 'snow', ','], ['and', 'everywhere', 'that', 'mary', 'went', ','], ['the', 'lamb', 'was', 'sure', 'to', 'go', '.'], ['``', 'why', 'does', 'the', 'lamb', 'love', 'mary', 'so', '?', "''"], ['the', 'eager', 'children', 'cry', '.'], ['``', 'why', ',', 'mary', 'loves', 'the', 'lamb', ',', 'you', 'know', '.', "''"], ['the', 'teacher', 'did', 'reply', '.']]
In [5]:
# train model
model = Word2Vec(sentences, min_count=1)

# summarize the loaded model
print(model)
Word2Vec(vocab=37, size=100, alpha=0.025)
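
The call above keeps gensim's defaults except for min_count. As a rough sketch of the knobs you might tune on a real corpus (the values below are illustrative assumptions, not what was used here):

# Illustrative only: common Word2Vec hyperparameters in gensim 3.x
model_sg = Word2Vec(sentences,
                    size=100,     # dimensionality of the word vectors
                    window=5,     # context words considered on each side of the target
                    min_count=1,  # drop words rarer than this (default is 5)
                    sg=1,         # 1 = skip-gram; 0 = CBOW (the default)
                    workers=2)    # number of training threads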
In [6]:
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
['mary', 'white', '.', 'so', 'everywhere', 'to', 'went', '?', 'that', 'know', 'was', 'eager', 'does', 'sure', 'a', 'little', 'you', 'go', 'the', 'why', "''", 'cry', 'as', 'snow', '``', 'teacher', 'reply', 'fleece', 'and', 'children', 'did', 'love', 'loves', 'had', 'his', ',', 'lamb']
In [7]:
# access the vector for one word
# (indexing the model directly still works but is deprecated -- see the warning below)
print(model['lamb'])
[-2.7182766e-03  1.2487710e-03 -4.4074864e-03  3.5265326e-03
  1.7131231e-03 -2.8178061e-03  2.9640337e-03 -2.4741637e-03
  2.6946011e-04 -1.5621432e-03  1.7189173e-04 -1.4304175e-03
 -1.0281261e-03  1.5905446e-03  7.1188854e-04 -3.9464920e-03
  2.5527547e-03 -1.0956245e-03  7.3364319e-04  2.5594078e-03
 -9.1431820e-04  2.2941572e-03  3.5932153e-03  1.8624350e-03
  2.4553102e-03 -2.8646537e-03 -2.4643880e-03  1.9218427e-03
 -3.8764605e-03  3.4359689e-03 -2.4164815e-03  1.7021263e-03
 -3.5764112e-03  1.4196016e-03 -2.0195674e-03  4.0827030e-03
 -3.9471732e-03 -6.1231584e-04  3.9051475e-03 -3.8997584e-03
 -1.0480320e-03  2.0413217e-03 -4.5277039e-03  3.7661460e-03
 -2.5259005e-03 -1.4360002e-03  7.3607016e-04  3.8408018e-03
  3.9864983e-03 -6.8123976e-04 -3.5251860e-04  4.0145023e-03
  3.1279428e-03  3.9720386e-03  9.3507761e-04 -2.3743813e-03
 -2.3502845e-03  3.5284371e-03 -5.6150433e-04  3.0950115e-03
  3.9208033e-03  2.0687412e-03 -4.6105627e-03 -4.9310355e-03
  3.1625992e-03  1.9219007e-03  1.3885073e-03 -4.1954550e-03
  3.5644420e-03 -7.0061404e-05  3.6340770e-03 -3.7480018e-03
  7.0168247e-04 -4.0180283e-03  2.3534100e-03 -1.8334251e-03
 -2.7314085e-03  2.3767257e-03  2.6325590e-03  2.2375344e-03
  3.1389827e-03 -3.8831038e-04 -4.8180486e-04 -2.2694494e-03
 -1.0046981e-03 -2.2774746e-03  1.5747132e-03  4.3728095e-03
  2.2946699e-03 -3.6577368e-03  3.3423168e-04  1.2059795e-03
  6.9353776e-04 -3.6424596e-03  4.3872455e-03 -3.7329900e-03
 -2.2044044e-03  7.2465651e-04 -1.1315111e-03 -2.8985806e-03]
/ihome/crc/install/python/anaconda3.5-4.2.0/lib/python3.5/site-packages/ipykernel_launcher.py:3: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  This is separate from the ipykernel package so we can avoid doing imports until
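
The warning points at the preferred access path: look the word up on the model's wv attribute instead. A minimal sketch:

# Preferred, non-deprecated lookup of the same vector
vec = model.wv['lamb']
print(vec.shape)   # (100,) -- one 100-dimensional vector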
In [8]:
# save model
model.save('model.bin')
In [9]:
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)
Word2Vec(vocab=37, size=100, alpha=0.025)
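
model.save() / Word2Vec.load() round-trips the full model, including training state. If you only need the vectors, you can also export them in the plain word2vec format (a sketch; the filename 'vectors.txt' is just an example):

from gensim.models import KeyedVectors

# Export only the trained vectors (no training state) ...
model.wv.save_word2vec_format('vectors.txt', binary=False)

# ... and load them back as KeyedVectors
vectors = KeyedVectors.load_word2vec_format('vectors.txt', binary=False)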
In [10]:
dir(model)
Out[10]:
['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__gt__', '__hash__', '__ignoreds', '__init__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__numpys', '__recursive_saveloads', '__reduce__', '__reduce_ex__', '__repr__', '__scipys', '__setattr__', '__sizeof__', '__slotnames__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_check_training_sanity', '_clear_post_train', '_do_train_job', '_get_job_params', '_get_thread_working_mem', '_job_producer', '_load_specials', '_log_epoch_end', '_log_epoch_progress', '_log_progress', '_log_train_end', '_minimize_model', '_raw_word_count', '_save_specials', '_set_train_params', '_smart_save', '_train_epoch', '_update_job_params', '_worker_loop', 'accuracy', 'alpha', 'batch_words', 'build_vocab', 'build_vocab_from_freq', 'callbacks', 'cbow_mean', 'clear_sims', 'compute_loss', 'corpus_count', 'cum_table', 'delete_temporary_training_data', 'doesnt_match', 'epochs', 'estimate_memory', 'evaluate_word_pairs', 'get_latest_training_loss', 'hashfxn', 'hs', 'init_sims', 'intersect_word2vec_format', 'iter', 'layer1_size', 'load', 'load_word2vec_format', 'log_accuracy', 'min_alpha', 'min_alpha_yet_reached', 'min_count', 'model_trimmed_post_training', 'most_similar', 'most_similar_cosmul', 'n_similarity', 'negative', 'predict_output_word', 'random', 'reset_from', 'running_training_loss', 'sample', 'save', 'save_word2vec_format', 'score', 'sg', 'similar_by_vector', 'similar_by_word', 'similarity', 'syn0_lockf', 'syn1', 'syn1neg', 'total_train_time', 'train', 'train_count', 'trainables', 'vector_size', 'vocabulary', 'window', 'wmdistance', 'workers', 'wv']
In [11]:
model.wv.vocab
Out[11]:
{'mary': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35860>, 'white': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35dd8>, '.': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc358d0>, 'so': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35f98>, 'everywhere': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35908>, 'to': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35978>, 'went': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc359b0>, '?': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35a90>, 'that': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35a58>, 'know': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc3b048>, 'was': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc359e8>, 'eager': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35b00>, 'does': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35b38>, 'sure': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35b70>, 'a': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35ba8>, 'little': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35c18>, 'you': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35c50>, 'go': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35c88>, 'the': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35fd0>, 'why': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35cf8>, "''": <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35d30>, 'cry': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc3b080>, 'as': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35d68>, 'snow': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35da0>, '``': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35898>, 'teacher': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35a20>, 'reply': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35940>, 'fleece': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35e80>, 'and': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35eb8>, 'children': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35ef0>, 'did': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35be0>, 'love': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35f60>, 'loves': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35e10>, 'had': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35cc0>, 'his': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35e48>, ',': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35f28>, 'lamb': <gensim.models.keyedvectors.Vocab object at 0x7f41bbc35ac8>}
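
Each Vocab object stores bookkeeping for its word, such as its corpus frequency and its row in the embedding matrix. A quick sketch:

# Inspect the bookkeeping gensim keeps for a single vocabulary item
entry = model.wv.vocab['lamb']
print(entry.count)   # how many times 'lamb' occurred in the training data
print(entry.index)   # which row of the vector matrix belongs to 'lamb'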

Visualization through PCA (principal component analysis)

In [12]:
from sklearn.decomposition import PCA
from matplotlib import pyplot
In [13]:
# fit a 2d PCA model to the vectors
# (again, indexing the model directly is deprecated; model.wv[...] is preferred)
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)

# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)

for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
/ihome/crc/install/python/anaconda3.5-4.2.0/lib/python3.5/site-packages/ipykernel_launcher.py:2: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).
  
Out[13]:
[Scatter plot: the 2-D PCA projection of all 37 word vectors, with each vocabulary word annotated at its projected position. Because ast_node_interactivity is set to "all", the cell also echoed the PathCollection and every Annotation object returned by pyplot.]
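
If those echoed objects are distracting, discarding the return values (or ending each line with a semicolon) keeps them out of the output. A minimal sketch, reusing the words and result variables from the cell above:

# Same annotation loop, but the return values are discarded so they are not echoed
for i, word in enumerate(words):
    _ = pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()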

Load Google's pre-trained word2vec embedding

  • Good news! I have already downloaded the model file and saved it in our shared_data folder on h2p.
In [14]:
from gensim.models import KeyedVectors

filename = '/ihome/pyling/shared_data/word_vectors/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)

# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
# works! 
[('queen', 0.7118192315101624)]
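
The GoogleNews file holds roughly 3 million 300-dimensional vectors, so loading it takes a while and several GB of RAM. If memory is tight, load_word2vec_format accepts a limit argument (a sketch, reusing filename from above; 200000 is an arbitrary cutoff):

# Read only the first 200,000 vectors; the file is roughly frequency-sorted,
# so these are mostly common words (an assumption-laden shortcut to save memory)
small_model = KeyedVectors.load_word2vec_format(filename, binary=True, limit=200000)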
In [15]:
result = model.most_similar(positive=['woman', 'actor'], negative=['man'], topn=1)
print(result)
# works again!
[('actress', 0.860262393951416)]
In [16]:
result = model.most_similar(positive=['dog', 'kitten'], negative=['baby'], topn=1)
print(result)
# Hmm. Not sure about this one. 100 billion words of news data might not feature enough kittens and puppies.
[('dogs', 0.7231072187423706)]
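
most_similar is not the only query KeyedVectors supports: similarity gives the cosine similarity of a word pair, and doesnt_match picks the odd one out of a list. A quick sketch:

# Cosine similarity between two words
print(model.similarity('woman', 'man'))

# Which word does not belong with the others? (likely 'cereal')
print(model.doesnt_match(['breakfast', 'cereal', 'dinner', 'lunch']))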

Load Stanford's pre-trained GloVe embedding

  • Again, the model files are already downloaded in the shared_data folder.
In [17]:
# The GloVe file format must first be converted to word2vec format.
# This only needs to be done ONCE -- you don't need to run it yourselves!

from gensim.scripts.glove2word2vec import glove2word2vec

vdir = '/ihome/pyling/shared_data/word_vectors/'
glove_input_file = vdir + 'glove.6B.100d.txt'
word2vec_output_file = vdir + 'glove.6B.100d.txt.word2vec'

glove2word2vec(glove_input_file, word2vec_output_file)
Out[17]:
(400000, 100)
In [18]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = vdir + 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)

# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
[('queen', 0.7698541283607483)]
In [19]:
result = model.most_similar(positive=['dog', 'kitten'], negative=['baby'], topn=1)
print(result)
# 6 billion words of Wikipedia + Gigaword news --> possibly more kittens and puppies.
[('puppy', 0.6481586694717407)]
In [20]:
result = model.most_similar(positive=['pittsburgh', 'eagles'], negative=['philadelphia'], topn=5)
print(result)
# Beautiful! 
[('steelers', 0.7623487710952759), ('vikings', 0.7566234469413757), ('falcons', 0.7381411790847778), ('panthers', 0.7375524640083313), ('broncos', 0.735093355178833)]
In [21]:
result = model.most_similar(positive=['turtle', 'snail'], negative=['shell'], topn=5)
print(result)
# Hmm...
[('wildflower', 0.6302021741867065), ('turtles', 0.5460652112960815), ('cichlid', 0.5293101668357849), ('songbird', 0.5284878611564636), ('tortoises', 0.5175418257713318)]
In [22]:
result = model.most_similar(positive=['cow', 'pork'], negative=['pig'], topn=5)
print(result)
# Yayy! 
[('beef', 0.7819944620132446), ('meat', 0.689876914024353), ('chicken', 0.5895066857337952), ('poultry', 0.5763587951660156), ('meats', 0.5711966753005981)]
In [23]:
result = model.most_similar(positive=['michelle', 'barack'], negative=['bush'], topn=5)
print(result)
# Huh! Is it because Laura Bush is still a Bush? 
# But George is not a very distinctive first name, and we can't do George W. 
[('sasha', 0.6619373559951782), ('sarah', 0.6275115609169006), ('julia', 0.6156177520751953), ('malia', 0.6076964139938354), ('jessica', 0.605925440788269)]
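
If you find yourself repeating the positive/negative pattern, a tiny helper (hypothetical, not part of gensim) keeps the "a is to b as c is to ?" reading straight:

# Hypothetical convenience wrapper around the analogy pattern used above
def analogy(a, b, c, topn=1):
    """Words completing: a is to b as c is to ?"""
    return model.most_similar(positive=[b, c], negative=[a], topn=topn)

analogy('man', 'king', 'woman')   # should land on something like 'queen'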