Word Vectors Demo¶
gensim's documentation of Word2Vec: https://radimrehurek.com/gensim/models/word2vec.html
You need to install two libraries: sklearn and gensim.
- WARNING: gensim may take a long time to install (it took me 25 minutes).
Four tasks:
- TF-IDF
- Train Word2Vec models
- Take 1: with a toy document set
- Take 2: with the Bible
- Pre-trained Google Word2Vec
And: Stanford GloVe (Homework 9)
(1) First stop: TF-IDF¶
In [1]:
import nltk
import logging # for printing out detailed log info
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
# Three tiny "documents"
docs = ['A rose is a rose is a rose is a rose',
'Oh, what a fine day it is.',
"A day ain't over till it's truly over."]
In [3]:
# Initialize a CountVectorizer to use NLTK's tokenizer instead of its
# default one (which drops punctuation and single-character tokens).
# Minimum document frequency set to 1.
docs_vzer = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)
In [4]:
# .fit_transform does two things:
# (1) fit: learns the vocabulary of docs, mapping each unique word to a column index
# (2) transform: builds and returns the count-vectorized (document-term) matrix of docs
docs_counts = docs_vzer.fit_transform(docs)
C:\Program Files\Python311\Lib\site-packages\sklearn\feature_extraction\text.py:525: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'
  warnings.warn(
In [5]:
# docs_vzer now contains a vocabulary dictionary, which maps each unique word to a column index
# the indexes follow the alphabetical order of the words
print(docs_vzer.vocabulary_)
{'a': 3, 'rose': 12, 'is': 7, 'oh': 10, ',': 1, 'what': 15, 'fine': 6, 'day': 5, 'it': 8, '.': 2, 'ai': 4, "n't": 9, 'over': 11, 'till': 13, "'s": 0, 'truly': 14}
In [6]:
# docs_counts has a dimension of 3 (document count) by 16 (# of unique words)
docs_counts.shape
Out[6]:
(3, 16)
In [7]:
# this one is small enough to view in full!
docs_counts.toarray()
# first doc has same count of 4 for 'a' and 'rose'
Out[7]:
array([[0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0]], dtype=int64)
In [8]:
# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values
from sklearn.feature_extraction.text import TfidfTransformer
docs_tfmer = TfidfTransformer() # initialize with default setting
# Again, fit and transform
docs_tfidf = docs_tfmer.fit_transform(docs_counts)
In [9]:
# TF-IDF values
# raw counts have been reweighted by IDF and length-normalized
docvecs = docs_tfidf.toarray()
print(docvecs)
[[0.         0.         0.         0.45646196 0.         0.
  0.         0.44083341 0.         0.         0.         0.
  0.77285728 0.         0.         0.        ]
 [0.         0.3874216  0.29464411 0.22881744 0.         0.29464411
  0.3874216  0.29464411 0.29464411 0.         0.3874216  0.
  0.         0.         0.         0.3874216 ]
 [0.30036632 0.         0.22843633 0.17740119 0.30036632 0.22843633
  0.         0.         0.22843633 0.30036632 0.         0.60073264
  0.         0.30036632 0.30036632 0.        ]]
In [10]:
# first document: a vector over 16 words
# 'a' and 'rose' have the same raw count of 4, but their tf-idf values are now 0.4565 vs. 0.7729
# 'a' is found in every doc, so it is weighted down; 'rose' appears only here, so it is weighted up
docvecs[0]
Out[10]:
array([0. , 0. , 0. , 0.45646196, 0. , 0. , 0. , 0.44083341, 0. , 0. , 0. , 0. , 0.77285728, 0. , 0. , 0. ])
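Where do these numbers come from? With sklearn's defaults (smooth_idf=True, norm='l2'), each entry is tf * (ln((1+n)/(1+df)) + 1), and then the whole document vector is length-normalized. A minimal sketch reproducing the first document's three nonzero values by hand:
import numpy as np

n = 3                              # number of documents
tf = {'a': 4, 'is': 3, 'rose': 4}  # raw counts in document 1
df = {'a': 3, 'is': 2, 'rose': 1}  # document frequencies across all docs

# sklearn's smoothed idf, then L2 normalization over the document vector
# (the other 13 entries of docvecs[0] are zero, so they don't affect the norm)
vals = np.array([tf[w] * (np.log((1 + n) / (1 + df[w])) + 1) for w in tf])
print(vals / np.linalg.norm(vals))  # [0.4565 0.4408 0.7729] -- matches docvecs[0]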
In [11]:
wordvecs = docvecs.transpose() # swap rows and columns --> we get word vectors
print(wordvecs[0]) # vector for "'s"
print(wordvecs[3]) # vector for "a"
print(wordvecs[5]) # vector for "day"
print(wordvecs[12]) # vector for "rose"
[0.         0.         0.30036632]
[0.45646196 0.22881744 0.17740119]
[0.         0.29464411 0.22843633]
[0.77285728 0.         0.        ]
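Even these tiny 3-dimensional word vectors support a similarity measure. A quick sketch with sklearn's cosine_similarity: 'a' and 'day' co-occur in two documents, while 'rose' and 'day' share none.
from sklearn.metrics.pairwise import cosine_similarity

print(cosine_similarity([wordvecs[3]], [wordvecs[5]]))   # 'a' vs. 'day': nonzero overlap
print(cosine_similarity([wordvecs[12]], [wordvecs[5]]))  # 'rose' vs. 'day': 0, no shared docs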
So that was the TF-IDF representation of 3 tiny documents with not a whole lot of vocabulary.
Let's go bigger. How about using the Japanese/Bulgarian EFL essay corpus? (A sketch of an answer follows the questions below.)
- 60 total essays.
- What will the resulting TF-IDF document vectors look like?
- What will the resulting TF-IDF word vectors look like?
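As a sketch of an answer, TfidfVectorizer collapses the CountVectorizer + TfidfTransformer steps above into one. The essay files and their location here are hypothetical:
import glob
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

# Hypothetical layout: one essay per .txt file under efl_essays/
essays = [open(f, encoding='utf-8').read() for f in sorted(glob.glob('efl_essays/*.txt'))]
essay_vzer = TfidfVectorizer(tokenizer=nltk.word_tokenize)
essay_tfidf = essay_vzer.fit_transform(essays)
print(essay_tfidf.shape)    # (60, V): 60 document vectors, V = vocabulary size
print(essay_tfidf.T.shape)  # (V, 60): the word vectors are now 60-dimensional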
Let's go WAY bigger. How about using all of Wikipedia?
(2) Training Word2Vec¶
TAKE 1: With the toy document set¶
In [12]:
from gensim.models import word2vec
In [13]:
docs_tok = [nltk.word_tokenize(d) for d in docs]
docs_tok
Out[13]:
[['A', 'rose', 'is', 'a', 'rose', 'is', 'a', 'rose', 'is', 'a', 'rose'],
 ['Oh', ',', 'what', 'a', 'fine', 'day', 'it', 'is', '.'],
 ['A', 'day', 'ai', "n't", 'over', 'till', 'it', "'s", 'truly', 'over', '.']]
In [14]:
# train model
mini_model = word2vec.Word2Vec(docs_tok, min_count=1)
2023-11-28 12:23:21,888 : INFO : collecting all words and their counts
2023-11-28 12:23:21,888 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-11-28 12:23:21,888 : INFO : collected 17 word types from a corpus of 31 raw words and 3 sentences
2023-11-28 12:23:21,888 : INFO : Creating a fresh vocabulary
2023-11-28 12:23:21,896 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 17 unique words (100.00% of original 17, drops 0)', 'datetime': '2023-11-28T12:23:21.896779', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2023-11-28 12:23:21,896 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 31 word corpus (100.00% of original 31, drops 0)', 'datetime': '2023-11-28T12:23:21.896779', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2023-11-28 12:23:21,896 : INFO : deleting the raw counts dictionary of 17 items
2023-11-28 12:23:21,896 : INFO : sample=0.001 downsamples 17 most-common words
2023-11-28 12:23:21,900 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 4.413012489047725 word corpus (14.2%% of prior 31)', 'datetime': '2023-11-28T12:23:21.900490', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2023-11-28 12:23:21,900 : INFO : estimated required memory for 17 words and 100 dimensions: 22100 bytes
2023-11-28 12:23:21,900 : INFO : resetting layer weights
2023-11-28 12:23:21,905 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-11-28T12:23:21.905397', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'build_vocab'}
2023-11-28 12:23:21,905 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 17 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-11-28T12:23:21.905397', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'}
2023-11-28 12:23:21,905 : INFO : EPOCH 0: training on 31 raw words (4 effective words) took 0.0s, 188680 effective words/s
2023-11-28 12:23:21,913 : INFO : EPOCH 1: training on 31 raw words (4 effective words) took 0.0s, 2929 effective words/s
2023-11-28 12:23:21,921 : INFO : EPOCH 2: training on 31 raw words (4 effective words) took 0.0s, 3067 effective words/s
2023-11-28 12:23:21,930 : INFO : EPOCH 3: training on 31 raw words (4 effective words) took 0.0s, 3754 effective words/s
2023-11-28 12:23:21,930 : INFO : EPOCH 4: training on 31 raw words (9 effective words) took 0.0s, 12048 effective words/s
2023-11-28 12:23:21,937 : INFO : Word2Vec lifecycle event {'msg': 'training on 155 raw words (25 effective words) took 0.0s, 806 effective words/s', 'datetime': '2023-11-28T12:23:21.937991', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'}
2023-11-28 12:23:21,937 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=17, vector_size=100, alpha=0.025>', 'datetime': '2023-11-28T12:23:21.937991', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
In [15]:
# summarize the trained model
print(mini_model)
# 17 instead of 16 this time, because the words were not lowercased: we have both 'A' and 'a'
Word2Vec<vocab=17, vector_size=100, alpha=0.025>
In [16]:
# We are done training and no longer need the full model.
# Keep just the keyed-vectors portion:
mini_vecs = mini_model.wv
print(mini_vecs)
KeyedVectors<vector_size=100, 17 keys>
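If you want the vectors to outlive this session, KeyedVectors objects can be saved to disk and loaded back later. The filename below is arbitrary:
from gensim.models import KeyedVectors

mini_vecs.save('mini_vecs.kv')                 # arbitrary filename
mini_vecs = KeyedVectors.load('mini_vecs.kv')  # reload in a later session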
In [17]:
mini_vecs.key_to_index # was mini_vecs.vocab in older versions of gensim
Out[17]:
{'rose': 0, 'is': 1, 'a': 2, 'day': 3, '.': 4, 'it': 5, 'A': 6, 'over': 7, 'ai': 8, "'s": 9, 'fine': 10, 'what': 11, ',': 12, 'Oh': 13, "n't": 14, 'till': 15, 'truly': 16}
In [18]:
# access vector for one word ("key")
print(mini_vecs['rose'])
[-5.3797441e-04 2.4075869e-04 5.1029641e-03 9.0120677e-03 -9.3044490e-03 -7.1215159e-03 6.4558829e-03 8.9769447e-03 -5.0182356e-03 -3.7625555e-03 7.3807626e-03 -1.5371976e-03 -4.5407922e-03 6.5546203e-03 -4.8616990e-03 -1.8173642e-03 2.8746992e-03 9.9171989e-04 -8.2833348e-03 -9.4515709e-03 7.3106615e-03 5.0709778e-03 6.7568589e-03 7.6090707e-04 6.3470635e-03 -3.4070630e-03 -9.4971270e-04 5.7732332e-03 -7.5201895e-03 -3.9395704e-03 -7.5130383e-03 -9.3387265e-04 9.5407292e-03 -7.3158443e-03 -2.3359577e-03 -1.9348158e-03 8.0818711e-03 -5.9314668e-03 4.0662704e-05 -4.7505233e-03 -9.6012177e-03 5.0050383e-03 -8.7624080e-03 -4.3866122e-03 -3.4230405e-05 -2.9366091e-04 -7.6627424e-03 9.6120695e-03 4.9826493e-03 9.2321523e-03 -8.1581566e-03 4.4947332e-03 -4.1322676e-03 8.2931347e-04 8.4962072e-03 -4.4621569e-03 4.5215371e-03 -6.7874175e-03 -3.5449690e-03 9.4009005e-03 -1.5776678e-03 3.2041219e-04 -4.1401107e-03 -7.6840241e-03 -1.5067038e-03 2.4691985e-03 -8.8701793e-04 5.5375691e-03 -2.7432081e-03 2.2624675e-03 5.4515186e-03 8.3482778e-03 -1.4516616e-03 -9.2089754e-03 4.3747914e-03 5.7662081e-04 7.4411295e-03 -8.1201800e-04 -2.6377554e-03 -8.7560518e-03 -8.6015020e-04 2.8275226e-03 5.4008025e-03 7.0539895e-03 -5.7043689e-03 1.8602248e-03 6.0854140e-03 -4.8012817e-03 -3.1025098e-03 6.8018893e-03 1.6322032e-03 1.9226487e-04 3.4709922e-03 2.1629187e-04 9.6195461e-03 5.0600581e-03 -8.9163603e-03 -7.0376825e-03 9.0307347e-04 6.3963365e-03]
In [ ]:
help(word2vec.Word2Vec)
# Default dimension size is set to 100.
# Also see: https://radimrehurek.com/gensim/models/word2vec.html
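For reference, here is the same training call with the main hyperparameters spelled out. Every value shown is gensim 4.x's default except min_count (the default of 5 would throw away this entire toy vocabulary):
mini_model = word2vec.Word2Vec(
    docs_tok,
    vector_size=100,  # dimensionality of the word vectors
    window=5,         # max distance between target and context word
    min_count=1,      # keep every word; the default is 5
    sg=0,             # 0 = CBOW, 1 = skip-gram
    workers=3,        # training threads
    epochs=5,         # passes over the corpus
)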
TAKE 2: With a much larger corpus -- the Bible¶
In [19]:
bible_sents = nltk.corpus.gutenberg.sents('bible-kjv.txt')
len(bible_sents)
Out[19]:
30103
In [20]:
print(bible_sents[10])
print(bible_sents[100])
['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']
['And', 'the', 'LORD', 'set', 'a', 'mark', 'upon', 'Cain', ',', 'lest', 'any', 'finding', 'him', 'should', 'kill', 'him', '.']
In [21]:
print(bible_sents[513])
print(bible_sents[17343])
['And', 'he', 'said', ',', 'Behold', 'the', 'fire', 'and', 'the', 'wood', ':', 'but', 'where', 'is', 'the', 'lamb', 'for', 'a', 'burnt', 'offering', '?']
['11', ':', '6', 'The', 'wolf', 'also', 'shall', 'dwell', 'with', 'the', 'lamb', ',', 'and', 'the', 'leopard', 'shall', 'lie', 'down', 'with', 'the', 'kid', ';', 'and', 'the', 'calf', 'and', 'the', 'young', 'lion', 'and', 'the', 'fatling', 'together', ';', 'and', 'a', 'little', 'child', 'shall', 'lead', 'them', '.']
In [22]:
bible_model = word2vec.Word2Vec(bible_sents)
2023-11-28 12:23:33,676 : INFO : collecting all words and their counts
2023-11-28 12:23:33,685 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-11-28 12:23:34,073 : INFO : PROGRESS: at sentence #10000, processed 371152 words, keeping 7561 word types
2023-11-28 12:23:34,322 : INFO : PROGRESS: at sentence #20000, processed 676558 words, keeping 11062 word types
2023-11-28 12:23:34,576 : INFO : PROGRESS: at sentence #30000, processed 1007009 words, keeping 13753 word types
2023-11-28 12:23:34,586 : INFO : collected 13769 word types from a corpus of 1010654 raw words and 30103 sentences
2023-11-28 12:23:34,586 : INFO : Creating a fresh vocabulary
2023-11-28 12:23:34,594 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 5752 unique words (41.78% of original 13769, drops 8017)', 'datetime': '2023-11-28T12:23:34.594311', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2023-11-28 12:23:34,594 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 996644 word corpus (98.61% of original 1010654, drops 14010)', 'datetime': '2023-11-28T12:23:34.594311', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2023-11-28 12:23:34,614 : INFO : deleting the raw counts dictionary of 13769 items
2023-11-28 12:23:34,622 : INFO : sample=0.001 downsamples 52 most-common words
2023-11-28 12:23:34,622 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 649095.9526113214 word corpus (65.1%% of prior 996644)', 'datetime': '2023-11-28T12:23:34.622087', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'}
2023-11-28 12:23:34,653 : INFO : estimated required memory for 5752 words and 100 dimensions: 7477600 bytes
2023-11-28 12:23:34,653 : INFO : resetting layer weights
2023-11-28 12:23:34,653 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2023-11-28T12:23:34.653114', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'build_vocab'}
2023-11-28 12:23:34,653 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 5752 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-11-28T12:23:34.653114', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'}
2023-11-28 12:23:35,668 : INFO : EPOCH 0 - PROGRESS: at 86.61% examples, 557475 words/s, in_qsize 0, out_qsize 0
2023-11-28 12:23:35,828 : INFO : EPOCH 0: training on 1010654 raw words (648902 effective words) took 1.2s, 554044 effective words/s
2023-11-28 12:23:36,838 : INFO : EPOCH 1 - PROGRESS: at 79.62% examples, 522207 words/s, in_qsize 0, out_qsize 0
2023-11-28 12:23:37,071 : INFO : EPOCH 1: training on 1010654 raw words (648993 effective words) took 1.2s, 524206 effective words/s
2023-11-28 12:23:38,084 : INFO : EPOCH 2 - PROGRESS: at 76.20% examples, 499096 words/s, in_qsize 0, out_qsize 0
2023-11-28 12:23:38,370 : INFO : EPOCH 2: training on 1010654 raw words (649205 effective words) took 1.3s, 501894 effective words/s
2023-11-28 12:23:39,372 : INFO : EPOCH 3 - PROGRESS: at 82.93% examples, 540807 words/s, in_qsize 0, out_qsize 0
2023-11-28 12:23:39,555 : INFO : EPOCH 3: training on 1010654 raw words (649169 effective words) took 1.2s, 544713 effective words/s
2023-11-28 12:23:40,574 : INFO : EPOCH 4 - PROGRESS: at 82.93% examples, 538038 words/s, in_qsize 0, out_qsize 0
2023-11-28 12:23:40,806 : INFO : EPOCH 4: training on 1010654 raw words (649202 effective words) took 1.2s, 525254 effective words/s
2023-11-28 12:23:40,806 : INFO : Word2Vec lifecycle event {'msg': 'training on 5053270 raw words (3245471 effective words) took 6.2s, 527717 effective words/s', 'datetime': '2023-11-28T12:23:40.806422', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'}
2023-11-28 12:23:40,806 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=5752, vector_size=100, alpha=0.025>', 'datetime': '2023-11-28T12:23:40.806422', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
In [23]:
# Pick out keyed vectors
bible_vecs = bible_model.wv
In [24]:
print(bible_vecs['lamb'])
[-7.00635836e-02 4.05096635e-02 2.95159400e-01 -4.91856486e-02 -3.38857442e-01 -2.29398459e-01 3.71439040e-01 4.17077005e-01 -4.01853383e-01 5.94529249e-02 -5.29930294e-01 -9.72019657e-02 1.72399402e-01 5.63210249e-01 -1.28097877e-01 -7.59028420e-02 -1.79430395e-01 -9.05304551e-02 -3.81401598e-01 -1.00703847e+00 3.38075906e-02 -1.14725277e-01 1.02132604e-01 2.68458545e-01 -5.39158225e-01 5.68492673e-02 2.23907322e-01 4.21043262e-02 -4.40284699e-01 2.42261961e-01 9.99067351e-02 2.48873025e-01 -2.30774730e-01 4.66387540e-01 6.70948327e-02 3.04142803e-01 2.41476908e-01 -4.19585973e-01 -2.70496666e-01 -2.42669299e-01 4.03969511e-02 -7.47187257e-01 -3.28801751e-01 1.25438616e-01 5.95528126e-01 1.03915155e-01 -6.10071003e-01 -7.40895867e-01 8.15759659e-01 4.75409985e-01 -2.13121340e-01 -8.51389587e-01 8.95072222e-02 6.11951232e-01 -4.59947407e-01 2.87509561e-01 3.76528800e-01 -4.55405712e-01 2.90560797e-02 -5.88639140e-01 6.55464292e-01 -5.49936473e-01 2.75467783e-01 -5.57546139e-01 -4.80590463e-01 -9.04060751e-02 1.40674431e-02 8.08346197e-02 -1.77837417e-01 9.17950451e-01 2.09508076e-01 1.88855320e-01 3.04937899e-01 1.63934991e-01 -1.31506562e-01 3.34697843e-01 5.37287354e-01 6.28400803e-01 1.29761621e-01 1.86562259e-02 2.66118139e-01 -7.46859014e-02 -2.50587612e-01 2.18556032e-01 4.97369438e-05 2.34910876e-01 -2.17708394e-01 -3.51757854e-01 2.84377396e-01 3.16568196e-01 1.00279404e-02 -4.40801859e-01 2.11197481e-01 -1.85262449e-02 -3.50682810e-02 -2.27475360e-01 -2.05028072e-01 -8.16606432e-02 6.37785017e-01 5.23087740e-01]
In [25]:
print(bible_vecs['sheep'])
[-0.15134561 0.15653346 0.21003102 0.11891221 0.06592 -0.44888723 0.08198521 1.1223835 -0.3480728 0.21484146 0.0119535 -0.42220116 0.7865827 0.08962237 -0.0731517 -0.13000903 0.00298682 -0.3818355 -0.83721215 -0.92923594 -0.6251577 0.34843442 -0.37313855 0.15511043 0.06790883 -0.38533157 0.51379836 0.06757652 -0.4571999 -0.25004193 -0.04477658 0.01512047 -0.49214077 0.34959394 -0.3667245 0.32210892 0.26314598 -0.83734375 -0.04317997 -0.26383698 -0.16378659 -0.9096472 -0.28071997 -0.13471466 0.23833212 -0.05624822 -0.5596525 -0.57566446 0.29384032 0.51392406 -0.16048548 -1.0746393 -0.03338281 0.270174 -0.31434295 -0.32556394 0.11038484 0.07219376 0.5911439 -0.41281778 0.60885215 -0.96604466 0.10324751 0.34851766 -0.35562414 -0.18586996 0.0749181 -0.37322223 -0.3213393 0.56612265 0.01776537 0.15019342 0.5997282 -0.12487037 -0.49490416 0.10552547 0.76722145 0.15520047 -0.24188824 0.4096786 0.7115978 -0.33964825 -0.00147848 0.12519753 0.06997982 0.12562689 -0.5761415 -0.17640032 -0.11557043 0.61810756 -0.40038097 0.20978236 0.63704383 0.31625324 0.42526734 -0.13810174 0.10468238 -0.2092095 0.59541404 0.593011 ]
In [26]:
print(bible_vecs['river'])
[-0.7991187 0.0547945 -0.25844377 0.53282183 0.8907928 -0.9121868 -0.30714798 0.1392317 0.39000973 1.348982 -0.6581973 -0.67339075 0.6020554 0.58929086 0.0760425 0.48714337 -0.34760523 -0.45455563 -0.3425872 -1.3248395 0.573813 -0.35888192 0.28048936 0.02747101 0.4559574 -0.37848637 0.64515734 0.28503042 -0.22541472 0.19499087 0.4593173 0.252736 0.30604067 0.73402107 -0.9172078 0.01816918 0.46188322 -0.6577364 0.45899245 0.11373993 0.6190553 -0.6718291 -0.16995002 0.3283813 0.85491264 0.27873066 0.2599089 -1.0545988 1.0972326 0.48221523 0.7897181 -0.20102945 0.4424315 1.0445846 -0.11807676 0.4091144 -0.04547477 0.60997987 -0.18970652 -0.5772177 0.33646765 -0.15093593 -0.8492036 -0.08777131 0.09648323 -0.30128804 -0.455444 -0.35237554 -0.18738697 1.5013138 -0.3065533 -0.33802494 0.17043643 -0.12752444 -0.7335599 -0.21247375 -0.6190231 -0.17229798 0.53767234 0.4674001 0.3745742 -0.4996367 -0.79263127 -0.673205 -0.97034514 0.1750142 -0.04968132 0.00856777 0.40272358 0.3621048 0.00772519 0.20361975 0.05539655 0.08415575 0.6639748 0.1770144 0.45545444 -0.22388709 0.42465806 0.33846435]
In [27]:
bible_vecs.similarity('lamb', 'sheep')
Out[27]:
0.6658394
In [28]:
bible_vecs.similarity('lamb', 'river')
Out[28]:
0.41190776
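.similarity() is just the cosine similarity between the two word vectors, which we can verify by hand:
import numpy as np

v1, v2 = bible_vecs['lamb'], bible_vecs['sheep']
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))
# should match bible_vecs.similarity('lamb', 'sheep') above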
In [29]:
bible_vecs.most_similar("lamb")
Out[29]:
[('ram', 0.9503517746925354),
 ('bullock', 0.9212687015533447),
 ('blemish', 0.8831152319908142),
 ('piece', 0.8731105327606201),
 ('cubit', 0.8731083273887634),
 ('cake', 0.8654652237892151),
 ('ephah', 0.8587759137153625),
 ('astonishment', 0.8411375880241394),
 ('bath', 0.8370754718780518),
 ('lambs', 0.8358587026596069)]
In [30]:
bible_vecs.most_similar("God")
Out[30]:
[('Father', 0.7418670058250427),
 ('faith', 0.7173503041267395),
 ('Lord', 0.7166937589645386),
 ('Christ', 0.7017422914505005),
 ('Spirit', 0.6938731074333191),
 ('truth', 0.688569188117981),
 ('hosts', 0.6738213896751404),
 ('LORD', 0.6706422567367554),
 ('Holy', 0.6656263470649719),
 ('salvation', 0.6628267168998718)]
In [31]:
bible_vecs.most_similar("Jesus")
Out[31]:
[('Moses', 0.7075117230415344),
 ('David', 0.686830461025238),
 ('Paul', 0.6523290872573853),
 ('John', 0.6438380479812622),
 ('Samuel', 0.637751579284668),
 ('Saul', 0.6263633370399475),
 ('Peter', 0.6215471029281616),
 ('prophet', 0.594984769821167),
 ('he', 0.5948426723480225),
 ('Joshua', 0.591395378112793)]
In [32]:
bible_vecs.most_similar("sin")
Out[32]:
[('trespass', 0.8845173716545105),
 ('sacrifice', 0.7582433223724365),
 ('ever', 0.7581177353858948),
 ('statute', 0.7533267140388489),
 ('just', 0.7402607798576355),
 ('meat', 0.73576420545578),
 ('offering', 0.7214492559432983),
 ('witness', 0.7205300331115723),
 ('poor', 0.7042511105537415),
 ('atonement', 0.6866053342819214)]
In [33]:
bible_vecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)
Out[33]:
[('queen', 0.6234248876571655),
 ('prophet', 0.6038112044334412),
 ('daughter', 0.5769397020339966),
 ('Mary', 0.5767046213150024),
 ('Esther', 0.5761806964874268)]
In [34]:
bible_vecs.most_similar(positive=['son', 'child'], negative=['man'], topn=5)
Out[34]:
[('sister', 0.7488148212432861),
 ('daughter', 0.7290869355201721),
 ('mother', 0.7151821255683899),
 ('Eleazar', 0.7086402177810669),
 ('Mary', 0.7078005075454712)]
In [35]:
%pprint
dir(bible_vecs)
Pretty printing has been turned OFF
Out[35]:
['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_log_evaluate_word_analogies', '_save_specials', '_smart_save', '_upconvert_old_d2vkv', '_upconvert_old_vocab', 'add_lifecycle_event', 'add_vector', 'add_vectors', 'allocate_vecattrs', 'closer_than', 'cosine_similarities', 'distance', 'distances', 'doesnt_match', 'evaluate_word_analogies', 'evaluate_word_pairs', 'expandos', 'fill_norms', 'get_index', 'get_mean_vector', 'get_normed_vectors', 'get_vecattr', 'get_vector', 'has_index_for', 'index2entity', 'index2word', 'index_to_key', 'init_sims', 'intersect_word2vec_format', 'key_to_index', 'load', 'load_word2vec_format', 'log_accuracy', 'log_evaluate_word_pairs', 'mapfile_path', 'most_similar', 'most_similar_cosmul', 'most_similar_to_given', 'n_similarity', 'next_index', 'norms', 'rank', 'rank_by_centrality', 'relative_cosine_similarity', 'resize_vectors', 'save', 'save_word2vec_format', 'set_vecattr', 'similar_by_key', 'similar_by_vector', 'similar_by_word', 'similarity', 'similarity_unseen_docs', 'sort_by_descending_frequency', 'unit_normalize_all', 'vector_size', 'vectors', 'vectors_for_all', 'vectors_lockf', 'vectors_norm', 'vocab', 'wmdistance', 'word_vec', 'words_closer_than']
In [ ]:
help(bible_vecs.doesnt_match)
In [36]:
bible_vecs.doesnt_match(['eat', 'bread', 'wine', 'smite'])
Out[36]:
'smite'
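Under the hood, .doesnt_match() essentially picks the word whose unit vector has the lowest cosine similarity to the mean of all the words' unit vectors. A rough sketch of that logic:
import numpy as np

words = ['eat', 'bread', 'wine', 'smite']
unit = np.array([bible_vecs.get_vector(w, norm=True) for w in words])
mean = unit.mean(axis=0)
sims = unit @ (mean / np.linalg.norm(mean))  # cosine of each unit vector with the mean
print(words[int(sims.argmin())])             # 'smite'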
(3) Using Google's pre-trained Word2Vec models¶
- Let's try Google's pre-trained model, downloaded from here: https://code.google.com/archive/p/word2vec/
- The GoogleNews-vectors-negative300.bin.gz file is 1.5GB gzipped, 3.39GB (!!!) unzipped.
In [37]:
from gensim.models import KeyedVectors
filename = 'd:/Lab/word_vectors/GoogleNews-vectors-negative300.bin'
goog_vecs = KeyedVectors.load_word2vec_format(filename, binary=True)
# Read in directly as keyed vectors
2023-11-28 12:24:23,594 : INFO : loading projection weights from d:/Lab/word_vectors/GoogleNews-vectors-negative300.bin
2023-11-28 12:24:39,644 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from d:/Lab/word_vectors/GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2023-11-28T12:24:39.644981', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'load_word2vec_format'}
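If 3.39GB of vectors is more than your machine can comfortably hold, load_word2vec_format accepts a limit parameter. The file is (roughly) sorted by frequency, so keeping the top slice retains the most useful words:
# Load only the first 500,000 of the 3 million vectors
goog_vecs_small = KeyedVectors.load_word2vec_format(filename, binary=True, limit=500000)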
In [38]:
print(type(bible_model)) # full model
print(type(bible_vecs)) # keyed vectors
print(type(goog_vecs)) # keyed vectors
<class 'gensim.models.word2vec.Word2Vec'>
<class 'gensim.models.keyedvectors.KeyedVectors'>
<class 'gensim.models.keyedvectors.KeyedVectors'>
In [39]:
goog_vecs.most_similar('pretty')
Out[39]:
[('pretty_darn', 0.8196645379066467),
 ('very', 0.7625877857208252),
 ('awfully', 0.7502423524856567),
 ('quite', 0.7455620765686035),
 ('fairly', 0.7299395799636841),
 ('pretty_darned', 0.7151399850845337),
 ('unbelievably', 0.7105132341384888),
 ('really', 0.6961446404457092),
 ('incredibly', 0.6918126940727234),
 ('Pretty', 0.6728280186653137)]
In [40]:
goog_vecs.most_similar('lamb')
Out[40]:
[('lambs', 0.735481858253479),
 ('lamb_chops', 0.6806010007858276),
 ('sheep', 0.6539289355278015),
 ('loin', 0.6495908498764038),
 ('meat', 0.6480672955513),
 ('veal', 0.6419938206672668),
 ('hogget', 0.6389446258544922),
 ('mutton', 0.6381454467773438),
 ('roasted_lamb', 0.6320940852165222),
 ('suckling_pig', 0.6315929889678955)]
In [41]:
# calculate: (king + woman) - man = ?
goog_vecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)
Out[41]:
[('queen', 0.7118193507194519),
 ('monarch', 0.6189674139022827),
 ('princess', 0.5902431011199951),
 ('crown_prince', 0.5499460697174072),
 ('prince', 0.5377321839332581)]
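The same analogy can be computed by hand: add and subtract the unit-normalized vectors and search near the result. Unlike most_similar(), this sketch does not filter out the three input words themselves:
target = (goog_vecs.get_vector('king', norm=True)
          + goog_vecs.get_vector('woman', norm=True)
          - goog_vecs.get_vector('man', norm=True))
print(goog_vecs.similar_by_vector(target, topn=5))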
In [42]:
goog_vecs.most_similar(positive=['woman', 'actor'], negative=['man'], topn=5)
Out[42]:
[('actress', 0.8602624535560608),
 ('actresses', 0.6596670150756836),
 ('thesp', 0.6290916800498962),
 ('Actress', 0.6165294647216797),
 ('actress_Rachel_Weisz', 0.5997323989868164)]
In [43]:
goog_vecs.similarity('woman', 'man')
Out[43]:
0.76640123
In [44]:
goog_vecs.similarity('woman', 'tiger')
Out[44]:
0.18899678
In [45]:
goog_vecs.similarity('woman', 'telephone')
Out[45]:
0.121303424
In [46]:
print(goog_vecs.similarity('coffee', 'tea'))
print(goog_vecs.similarity('coffee', 'sushi'))
0.5635292
0.37408432
In [47]:
print(goog_vecs.similarity('couch', 'sofa'))
print(goog_vecs.similarity('couch', 'desk'))
0.8309179
0.40990508
In [48]:
print(goog_vecs.similarity('teacher', 'professor'))
print(goog_vecs.similarity('teacher', 'singer'))
print(goog_vecs.similarity('teacher', 'spoon'))
print(goog_vecs.similarity('teacher', 'platypus'))
0.39003685
0.21822901
0.10365092
0.027399706
In [49]:
print(goog_vecs.similarity('lecturer', 'professor'))
0.8011324
In [50]:
goog_vecs.similarity('husband', 'wife')
Out[50]:
0.8294167
In [51]:
goog_vecs.similarity('alligator', 'gator')
Out[51]:
0.8633207
In [52]:
goog_vecs.similarity('America', 'USA')
Out[52]:
0.5007448
In [53]:
goog_vecs.similarity('soda', 'pop')
Out[53]:
0.21825212
In [54]:
goog_vecs.doesnt_match(['teacher', 'professor', 'singer', 'cook'])
Out[54]:
'cook'
In [55]:
goog_vecs['lamb']
# vector has 300 dimensions
Out[55]:
array([-8.00781250e-02, 1.56250000e-01, -3.53515625e-01, 2.21679688e-01, 2.36328125e-01, 1.69921875e-01, 7.95898438e-02, -1.26953125e-02, -4.78515625e-02, 4.66796875e-01, -6.39648438e-02, -2.71484375e-01, -1.92382812e-01, -6.34765625e-02, -2.32421875e-01, 8.30078125e-03, -4.37500000e-01, 5.54199219e-02, -3.22265625e-01, -2.73437500e-01, 4.10156250e-02, -6.49414062e-02, 2.33398438e-01, 1.68945312e-01, -1.19140625e-01, -1.01562500e-01, -1.20605469e-01, 1.54296875e-01, 1.13769531e-01, -1.84570312e-01, -1.80664062e-01, 2.73437500e-01, -8.42285156e-03, -4.66918945e-03, -1.95312500e-01, 1.33789062e-01, 7.03125000e-02, -5.37109375e-02, 1.28906250e-01, 1.36718750e-01, -6.29882812e-02, -4.10156250e-02, -7.22656250e-02, 5.34667969e-02, 6.93359375e-02, -2.48046875e-01, -5.34667969e-02, 2.55859375e-01, 1.55273438e-01, 1.13769531e-01, -2.73437500e-01, -9.42382812e-02, -2.04101562e-01, 1.17187500e-01, 3.12500000e-02, -2.27539062e-01, -1.43554688e-01, 1.41601562e-01, -5.78613281e-02, -1.39770508e-02, -3.63281250e-01, 1.83868408e-03, -2.67578125e-01, -2.87109375e-01, 2.89062500e-01, -4.19921875e-01, -2.67333984e-02, -3.47656250e-01, 4.66918945e-03, -7.47070312e-02, 2.48046875e-01, -4.19921875e-02, -2.08007812e-01, 1.61132812e-02, 1.17675781e-01, -2.16796875e-01, -3.14453125e-01, -7.86132812e-02, -1.61132812e-02, -1.94335938e-01, 6.34765625e-02, -1.45507812e-01, -1.83105469e-02, -1.20117188e-01, -2.06054688e-01, -7.66754150e-04, -2.50000000e-01, 3.73046875e-01, -2.49023438e-02, -9.70458984e-03, -1.25000000e-01, 1.08032227e-02, -2.50000000e-01, -2.92968750e-01, 3.88183594e-02, -5.03906250e-01, 1.36718750e-01, -2.63671875e-02, 1.68945312e-01, -7.61718750e-01, -1.34765625e-01, -1.70898438e-01, -2.77099609e-02, -1.25000000e-01, 1.87988281e-02, -3.10546875e-01, -1.94335938e-01, 1.40625000e-01, 6.64062500e-02, -2.33154297e-02, -2.10937500e-01, -1.91650391e-02, 2.05078125e-01, 2.98828125e-01, -9.96093750e-02, -8.78906250e-02, -3.16406250e-01, -2.63671875e-01, -2.63671875e-01, -6.05468750e-02, 8.64257812e-02, 4.12597656e-02, 1.83593750e-01, -3.47656250e-01, -1.55273438e-01, 1.96533203e-02, -4.37011719e-02, -1.17187500e-01, -1.20117188e-01, 2.21679688e-01, -3.26171875e-01, 1.81640625e-01, -4.73632812e-02, 2.24609375e-01, 3.78417969e-02, 1.75781250e-01, -3.02734375e-02, 1.59179688e-01, 8.15429688e-02, 2.55859375e-01, 1.62109375e-01, -3.35937500e-01, 1.63085938e-01, -8.64257812e-02, -1.73828125e-01, 1.07910156e-01, -1.66015625e-01, -1.57928467e-03, -4.16015625e-01, 1.95312500e-01, 7.47070312e-02, 4.41894531e-02, 6.68945312e-02, 2.02148438e-01, 2.58789062e-02, -1.94091797e-02, -3.33984375e-01, -1.08886719e-01, -3.75000000e-01, -5.15136719e-02, -4.31640625e-01, 2.09960938e-01, 4.78515625e-01, -2.39257812e-02, 3.27148438e-02, 1.21582031e-01, 2.50000000e-01, 6.22558594e-02, 4.56542969e-02, 2.46093750e-01, -2.30468750e-01, 2.83203125e-02, -2.80761719e-02, -5.78613281e-02, -3.16406250e-01, 2.72216797e-02, 4.60937500e-01, 1.31835938e-01, -5.54199219e-02, 2.30468750e-01, -2.57812500e-01, -2.23632812e-01, -1.51824951e-03, 2.17285156e-02, 1.43554688e-01, 4.36401367e-03, -1.38671875e-01, 6.39648438e-02, -1.72851562e-01, 4.62890625e-01, -8.49609375e-02, 1.71875000e-01, -9.66796875e-02, 2.39257812e-02, 1.99218750e-01, 3.80859375e-01, 1.33789062e-01, 7.17773438e-02, -3.95507812e-02, -1.30859375e-01, -2.03125000e-01, -2.69531250e-01, 5.82885742e-03, -2.50000000e-01, -4.29687500e-02, -1.87500000e-01, -5.41992188e-02, -2.13623047e-02, 4.49218750e-01, -1.18164062e-01, -3.20312500e-01, 3.63769531e-02, -6.12792969e-02, -2.00195312e-01, 
-7.76367188e-02, -6.68945312e-02, 1.43554688e-01, -1.47460938e-01, 2.59765625e-01, -3.95507812e-02, 2.45117188e-01, 2.06298828e-02, 1.44531250e-01, -5.72204590e-04, 1.35742188e-01, -1.79687500e-01, 2.32421875e-01, 1.99218750e-01, 6.44531250e-02, 2.12890625e-01, -1.54296875e-01, -1.50390625e-01, -6.03027344e-02, 5.31250000e-01, -2.14843750e-01, -1.99218750e-01, 2.61718750e-01, 1.52343750e-01, -1.44531250e-01, 2.89062500e-01, 2.21679688e-01, 4.54101562e-02, 1.88476562e-01, 1.68945312e-01, 2.07519531e-02, -6.25000000e-01, 1.63085938e-01, 1.92382812e-01, -9.37500000e-02, 6.09375000e-01, -1.19628906e-01, -8.93554688e-02, -6.44531250e-02, -1.77734375e-01, 1.32812500e-01, -2.63671875e-02, 1.80664062e-01, 1.69921875e-01, -5.83496094e-02, -2.59765625e-01, 1.69921875e-01, 2.27539062e-01, -6.54296875e-02, -2.18505859e-02, 7.56835938e-02, -3.92578125e-01, -1.56250000e-01, 1.41601562e-01, -7.81250000e-02, -1.29882812e-01, -1.84570312e-01, 6.17675781e-02, -9.81445312e-02, 7.56835938e-02, 3.18359375e-01, -4.62890625e-01, 8.39843750e-02, 1.37695312e-01, 2.07519531e-03, 3.10546875e-01, 4.68750000e-02, 1.26953125e-01, 1.21093750e-01, -8.49609375e-02, -7.03125000e-02, 1.69921875e-01, 3.00292969e-02, 3.53515625e-01, -7.91015625e-02, 3.65234375e-01, -2.26562500e-01, -2.79296875e-01, -1.18164062e-01, 4.31640625e-01, 1.95312500e-01, 9.03320312e-02, 3.35937500e-01, 1.59912109e-02, 2.11914062e-01, 2.33398438e-01], dtype=float32)
(4) Stanford's Pre-trained GloVe¶
In [ ]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'd:/Lab/word_vectors/glove/glove.6B.100d.txt' # load .txt file directly
glove_vecs = KeyedVectors.load_word2vec_format(filename, binary=False, no_header=True)
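The no_header option requires gensim 4.0 or later. On older versions, first convert the GloVe file into word2vec format (which just prepends a header line) with the bundled script:
from gensim.scripts.glove2word2vec import glove2word2vec

glove2word2vec('glove.6B.100d.txt', 'glove.6B.100d.w2v.txt')  # writes the header line
glove_vecs = KeyedVectors.load_word2vec_format('glove.6B.100d.w2v.txt', binary=False)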
In [ ]:
# calculate: (king - man) + woman = ?
result = glove_vecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=1)
print(result)