4/10/2018 PyLing meeting
On h2p.crc.pitt.edu:
python --version                        # system default Python
module spider python                    # list the Python modules available on h2p
module load python/anaconda3.5-4.2.0    # load the Anaconda-based Python 3.5
python --version                        # confirm the Anaconda Python is now active
pip install gensim --user
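A quick way to confirm that the user-level install is visible (my addition, not part of the session):
python -c "import gensim; print(gensim.__version__)"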
# Turns on/off pretty printing
%pprint
# Every returned Out[] is displayed, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
# Import Word2Vec from gensim.
# gensim is not installed system-wide. Install for yourself: pip install gensim --user
from gensim.models import Word2Vec
from nltk import word_tokenize
mary = """Mary had a little lamb,
His fleece was white as snow,
And everywhere that Mary went,
The lamb was sure to go.
"Why does the lamb love Mary so?"
The eager children cry.
"Why, Mary loves the lamb, you know."
The teacher did reply."""
sentences = [word_tokenize(s) for s in mary.lower().split('\n')]
sentences
# train model
model = Word2Vec(sentences, min_count=1)
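Spelled out with its main hyperparameters (gensim 3.x parameter names and defaults, as far as I know), the one-liner above is equivalent to:
model = Word2Vec(sentences,
                 size=100,      # dimensionality of the word vectors
                 window=5,      # context window on each side of the target word
                 min_count=1,   # keep every word, even ones occurring only once
                 sg=0)          # 0 = CBOW (the default), 1 = skip-gram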
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)
# access vector for one word
#print(model['sentence'])
print(model['lamb'])
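As a quick sanity check (my addition), the vector should be a 100-dimensional numpy array, since size was left at its default:
print(model['lamb'].shape)   # expect (100,)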
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)
dir(model)
model.wv.vocab
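The wv object also supports similarity queries; a couple of examples (a sketch of mine; on a toy corpus this small the numbers are essentially noise):
model.wv.most_similar('lamb', topn=3)
model.wv.similarity('mary', 'lamb')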
from sklearn.decomposition import PCA
from matplotlib import pyplot
# fit a 2d PCA model to the vectors
X = model[model.wv.vocab]
pca = PCA(n_components=2)
result = pca.fit_transform(X)
# create a scatter plot of the projection
pyplot.scatter(result[:, 0], result[:, 1])
words = list(model.wv.vocab)
for i, word in enumerate(words):
    pyplot.annotate(word, xy=(result[i, 0], result[i, 1]))
pyplot.show()
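If no plot window appears (e.g. when running over SSH without X forwarding), one option is to save the figure to a file instead of calling show() ('mary_pca.png' is just a placeholder name):
pyplot.savefig('mary_pca.png', dpi=150)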
The pre-trained Google News vectors are in the shared_data folder on h2p.
from gensim.models import KeyedVectors
filename = '/ihome/pyling/shared_data/word_vectors/GoogleNews-vectors-negative300.bin'
model = KeyedVectors.load_word2vec_format(filename, binary=True)
# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
# works!
result = model.most_similar(positive=['woman', 'actor'], negative=['man'], topn=1)
print(result)
# works again!
result = model.most_similar(positive=['dog', 'kitten'], negative=['baby'], topn=1)
print(result)
# Hmm. Not sure about this one. 100 billion words of news data might not feature enough kittens and puppies.
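A couple of other KeyedVectors queries worth trying on the Google News model (hypothetical follow-ups, not run in the meeting):
print(model.similarity('puppy', 'kitten'))                      # cosine similarity between two words
print(model.doesnt_match(['puppy', 'kitten', 'lamb', 'sofa']))  # pick the odd one out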
The GloVe vectors are also in the shared_data folder.
# The GloVe file format must first be converted to Word2Vec format
# Something that needs to be done ONCE: you guys don't need to do this!
from gensim.scripts.glove2word2vec import glove2word2vec
vdir = '/ihome/pyling/shared_data/word_vectors/'
glove_input_file = vdir + 'glove.6B.100d.txt'
word2vec_output_file = vdir + 'glove.6B.100d.txt.word2vec'
glove2word2vec(glove_input_file, word2vec_output_file)
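If I recall the API correctly, glove2word2vec returns (vocabulary size, dimensionality), so capturing the return value gives a quick sanity check:
counts = glove2word2vec(glove_input_file, word2vec_output_file)
print(counts)   # should be (400000, 100) for glove.6B.100d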
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = vdir + 'glove.6B.100d.txt.word2vec'
model = KeyedVectors.load_word2vec_format(filename, binary=False)
# calculate: (king - man) + woman = ?
result = model.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)
result = model.most_similar(positive=['dog', 'kitten'], negative=['baby'], topn=1)
print(result)
# 6 billion words of Wikipedia + Gigaword --> possibly more kittens and puppies.
result = model.most_similar(positive=['pittsburgh', 'eagles'], negative=['philadelphia'], topn=5)
print(result)
# Beautiful!
result = model.most_similar(positive=['turtle', 'snail'], negative=['shell'], topn=5)
print(result)
# Hmm...
result = model.most_similar(positive=['cow', 'pork'], negative=['pig'], topn=5)
print(result)
# Yayy!
result = model.most_similar(positive=['michelle', 'barack'], negative=['bush'], topn=5)
print(result)
# Huh! Is it because Laura Bush is still a Bush?
# But George is not a very distinctive first name, and we can't do George W.