Word Vectors Demo¶
gensim's documentation of Word2Vec: https://radimrehurek.com/gensim/models/word2vec.html
You need to install two libraries: sklearn and gensim.
- WARNING: gensim may take a long time to install (it took me 25 minutes).
Four tasks:
- TF-IDF
- Train Word2Vec models
- Take 1: with a toy document set
- Take 2: with the Bible
- Pre-trained Google Word2Vec
And: a quick start on Stanford GloVe (Homework 9)
(1) First stop: TF-IDF¶
In [1]:
import nltk
import logging # for printing out detailed log info
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
In [2]:
# Three tiny "documents"
docs = ['A rose is a rose is a rose is a rose',
'Oh, what a fine day it is.',
"A day ain't over till it's truly over."]
In [3]:
# Initialize a CountVectorizer to use NLTK's tokenizer instead of its
# default one (which ignores punctuation and single-character tokens).
# Minimum document frequency set to 1.
docs_vzer = CountVectorizer(min_df=1, tokenizer=nltk.word_tokenize)
In [4]:
# .fit_transform does two things:
# (1) fit: adapts docs_vzer to the supplied text data (learns the vocabulary and assigns each word type an index)
# (2) transform: creates and returns a count-vectorized output of docs
docs_counts = docs_vzer.fit_transform(docs)
C:\Program Files\Python311\Lib\site-packages\sklearn\feature_extraction\text.py:525: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None' warnings.warn(
In [5]:
# docs_vzer now contains a vocabulary dictionary which maps word types (all lowercased) to indexes
# indexes are assigned in the keys' alphabetically sorted order
print(docs_vzer.vocabulary_)
{'a': 3, 'rose': 12, 'is': 7, 'oh': 10, ',': 1, 'what': 15, 'fine': 6, 'day': 5, 'it': 8, '.': 2, 'ai': 4, "n't": 9, 'over': 11, 'till': 13, "'s": 0, 'truly': 14}
In [6]:
# docs_counts has a dimension of 3 (document count) by 16 (# of unique words)
docs_counts.shape
Out[6]:
(3, 16)
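Note: docs_counts is stored as a SciPy sparse matrix, which is why .toarray() is used in the next cell to view it in full. A quick check, added here as a sketch (not part of the original run):
In [ ]:
# docs_counts is a SciPy sparse matrix; .toarray() expands it into a regular dense NumPy array
print(type(docs_counts))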
In [7]:
# this one is small enough to view in full!
docs_counts.toarray()
# the first doc has the same count of 4 for 'a' and 'rose'
Out[7]:
array([[0, 0, 0, 4, 0, 0, 0, 3, 0, 0, 0, 0, 4, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1],
       [1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0]], dtype=int64)
In [8]:
# Convert raw frequency counts into TF-IDF (Term Frequency -- Inverse Document Frequency) values
from sklearn.feature_extraction.text import TfidfTransformer
docs_tfmer = TfidfTransformer() # initialize with default setting
# Again, fit and transform
docs_tfidf = docs_tfmer.fit_transform(docs_counts)
In [9]:
# TF-IDF values
# raw counts have been normalized
docvecs = docs_tfidf.toarray()
print(docvecs)
[[0.         0.         0.         0.45646196 0.         0.         0.         0.44083341 0.         0.         0.         0.         0.77285728 0.         0.         0.        ]
 [0.         0.3874216  0.29464411 0.22881744 0.         0.29464411 0.3874216  0.29464411 0.29464411 0.         0.3874216  0.         0.         0.         0.         0.3874216 ]
 [0.30036632 0.         0.22843633 0.17740119 0.30036632 0.22843633 0.         0.         0.22843633 0.30036632 0.         0.60073264 0.         0.30036632 0.30036632 0.        ]]
In [10]:
# first document: a vector over 16 word types
# 'a' and 'rose' have the same raw count of 4, but their tf-idf values are now 0.4565 vs. 0.7729
# 'a' is found in every doc, so it is weighted down; 'rose' occurs only here, so it is weighted up
docvecs[0]
Out[10]:
array([0. , 0. , 0. , 0.45646196, 0. , 0. , 0. , 0.44083341, 0. , 0. , 0. , 0. , 0.77285728, 0. , 0. , 0. ])
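Where do these numbers come from? As a sketch (assuming TfidfTransformer's defaults, smooth_idf=True and norm='l2'), the values for doc 0 can be reproduced by hand: idf(t) = ln((1 + n_docs)/(1 + df(t))) + 1, multiplied by the raw count, then L2-normalized over the row.
In [ ]:
import numpy as np
# Hand-compute doc 0's tf-idf values under TfidfTransformer's default settings
# (smooth_idf=True, norm='l2'); should reproduce ~0.4565, ~0.4408, ~0.7729
n_docs = 3
counts = {'a': 4, 'is': 3, 'rose': 4}   # raw counts in doc 0
df     = {'a': 3, 'is': 2, 'rose': 1}   # how many docs contain each word
raw  = {w: c * (np.log((1 + n_docs) / (1 + df[w])) + 1) for w, c in counts.items()}
norm = np.sqrt(sum(v**2 for v in raw.values()))   # doc 0 has no other nonzero entries
for w in counts:
    print(w, raw[w] / norm)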
In [11]:
wordvecs = docvecs.transpose() # swap rows and columns --> we get word vectors
print(wordvecs[0]) # vector for "'s"
print(wordvecs[3]) # vector for "a"
print(wordvecs[5]) # vector for "day"
print(wordvecs[12]) # vector for "rose"
[0.         0.         0.30036632]
[0.45646196 0.22881744 0.17740119]
[0.         0.29464411 0.22843633]
[0.77285728 0.         0.        ]
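These transposed rows are (tiny) word vectors over the 3 documents, so word-to-word similarity can already be measured. A quick sketch using sklearn's cosine_similarity, with indexes taken from the vocabulary printed earlier:
In [ ]:
from sklearn.metrics.pairwise import cosine_similarity
# 'day' (index 5) and 'it' (index 8) occur in exactly the same two documents,
# so their TF-IDF word vectors are identical here (cosine = 1.0);
# 'rose' (index 12) shares no document with 'day' (cosine = 0.0)
print(cosine_similarity([wordvecs[5]], [wordvecs[8]]))
print(cosine_similarity([wordvecs[5]], [wordvecs[12]]))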
So that was the TF-IDF representation of 3 tiny documents with not a whole lot of vocabulary.
Let's go bigger. How about using the Japanese/Bulgarian EFL essay corpus?
- 60 total essays.
- What will the resulting TF-IDF document vectors look like?
- What will the resulting TF-IDF word vectors look like?
Let's go WAY bigger. How about using all of Wikipedia?
(2) Training Word2Vec¶
TAKE 1: With the toy document set¶
In [12]:
from gensim.models import word2vec
In [13]:
docs_tok = [nltk.word_tokenize(d) for d in docs]
docs_tok
Out[13]:
[['A', 'rose', 'is', 'a', 'rose', 'is', 'a', 'rose', 'is', 'a', 'rose'],
 ['Oh', ',', 'what', 'a', 'fine', 'day', 'it', 'is', '.'],
 ['A', 'day', 'ai', "n't", 'over', 'till', 'it', "'s", 'truly', 'over', '.']]
In [14]:
# train model
mini_model = word2vec.Word2Vec(docs_tok, min_count=1)
2024-12-05 10:57:43,660 : INFO : collecting all words and their counts 2024-12-05 10:57:43,661 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types 2024-12-05 10:57:43,661 : INFO : collected 17 word types from a corpus of 31 raw words and 3 sentences 2024-12-05 10:57:43,661 : INFO : Creating a fresh vocabulary 2024-12-05 10:57:43,662 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 retains 17 unique words (100.00% of original 17, drops 0)', 'datetime': '2024-12-05T10:57:43.662959', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'} 2024-12-05 10:57:43,662 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=1 leaves 31 word corpus (100.00% of original 31, drops 0)', 'datetime': '2024-12-05T10:57:43.662959', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'} 2024-12-05 10:57:43,663 : INFO : deleting the raw counts dictionary of 17 items 2024-12-05 10:57:43,663 : INFO : sample=0.001 downsamples 17 most-common words 2024-12-05 10:57:43,663 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 4.413012489047725 word corpus (14.2%% of prior 31)', 'datetime': '2024-12-05T10:57:43.663955', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'} 2024-12-05 10:57:43,664 : INFO : estimated required memory for 17 words and 100 dimensions: 22100 bytes 2024-12-05 10:57:43,665 : INFO : resetting layer weights 2024-12-05 10:57:43,665 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2024-12-05T10:57:43.665949', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'build_vocab'} 2024-12-05 10:57:43,666 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 17 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2024-12-05T10:57:43.666946', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'} 2024-12-05 10:57:43,669 : INFO : EPOCH 0: training on 31 raw words (4 effective words) took 0.0s, 4811 effective words/s 2024-12-05 10:57:43,673 : INFO : EPOCH 1: training on 31 raw words (4 effective words) took 0.0s, 7603 effective words/s 2024-12-05 10:57:43,675 : INFO : EPOCH 2: training on 31 raw words (4 effective words) took 0.0s, 10053 effective words/s 2024-12-05 10:57:43,678 : INFO : EPOCH 3: training on 31 raw words (4 effective words) took 0.0s, 7664 effective words/s 2024-12-05 10:57:43,680 : INFO : EPOCH 4: training on 31 raw words (9 effective words) took 0.0s, 23911 effective words/s 2024-12-05 10:57:43,681 : INFO : Word2Vec lifecycle event {'msg': 'training on 155 raw words (25 effective words) took 0.0s, 1665 effective words/s', 'datetime': '2024-12-05T10:57:43.681950', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'} 2024-12-05 10:57:43,681 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=17, 
vector_size=100, alpha=0.025>', 'datetime': '2024-12-05T10:57:43.681950', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
In [15]:
# summarize the trained model
print(mini_model)
# 17 instead of 16, because words are not lowercased. We have both 'A' and 'a'
Word2Vec<vocab=17, vector_size=100, alpha=0.025>
In [16]:
# We are done training and no longer need the full model.
# Keep just the keyed-vector portion (the word vectors):
mini_vecs = mini_model.wv
print(mini_vecs)
KeyedVectors<vector_size=100, 17 keys>
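The keyed vectors can also be written out to disk and loaded back later without re-training. A minimal sketch (the file name 'mini.kv' here is made up):
In [ ]:
from gensim.models import KeyedVectors
mini_vecs.save('mini.kv')                 # save just the word vectors
reloaded = KeyedVectors.load('mini.kv')   # ...and load them back later
print(reloaded['rose'][:5])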
In [17]:
mini_vecs.key_to_index # mini_vecs.vocab in older versions of gensim
Out[17]:
{'rose': 0, 'is': 1, 'a': 2, 'day': 3, '.': 4, 'it': 5, 'A': 6, 'over': 7, 'ai': 8, "'s": 9, 'fine': 10, 'what': 11, ',': 12, 'Oh': 13, "n't": 14, 'till': 15, 'truly': 16}
In [18]:
# access vector for one word ("key")
print(mini_vecs['rose'])
[-5.3797441e-04 2.4075869e-04 5.1029641e-03 9.0120677e-03 -9.3044490e-03 -7.1215159e-03 6.4558829e-03 8.9769447e-03 -5.0182356e-03 -3.7625555e-03 7.3807626e-03 -1.5371976e-03 -4.5407922e-03 6.5546203e-03 -4.8616990e-03 -1.8173642e-03 2.8746992e-03 9.9171989e-04 -8.2833348e-03 -9.4515709e-03 7.3106615e-03 5.0709778e-03 6.7568589e-03 7.6090707e-04 6.3470635e-03 -3.4070630e-03 -9.4971270e-04 5.7732332e-03 -7.5201895e-03 -3.9395704e-03 -7.5130383e-03 -9.3387265e-04 9.5407292e-03 -7.3158443e-03 -2.3359577e-03 -1.9348158e-03 8.0818711e-03 -5.9314668e-03 4.0662704e-05 -4.7505233e-03 -9.6012177e-03 5.0050383e-03 -8.7624080e-03 -4.3866122e-03 -3.4230405e-05 -2.9366091e-04 -7.6627424e-03 9.6120695e-03 4.9826493e-03 9.2321523e-03 -8.1581566e-03 4.4947332e-03 -4.1322676e-03 8.2931347e-04 8.4962072e-03 -4.4621569e-03 4.5215371e-03 -6.7874175e-03 -3.5449690e-03 9.4009005e-03 -1.5776678e-03 3.2041219e-04 -4.1401107e-03 -7.6840241e-03 -1.5067038e-03 2.4691985e-03 -8.8701793e-04 5.5375691e-03 -2.7432081e-03 2.2624675e-03 5.4515186e-03 8.3482778e-03 -1.4516616e-03 -9.2089754e-03 4.3747914e-03 5.7662081e-04 7.4411295e-03 -8.1201800e-04 -2.6377554e-03 -8.7560518e-03 -8.6015020e-04 2.8275226e-03 5.4008025e-03 7.0539895e-03 -5.7043689e-03 1.8602248e-03 6.0854140e-03 -4.8012817e-03 -3.1025098e-03 6.8018893e-03 1.6322032e-03 1.9226487e-04 3.4709922e-03 2.1629187e-04 9.6195461e-03 5.0600581e-03 -8.9163603e-03 -7.0376825e-03 9.0307347e-04 6.3963365e-03]
In [ ]:
help(word2vec.Word2Vec)
# Default dimension size is set to 100.
# Also see: https://radimrehurek.com/gensim/models/word2vec.html
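For reference, here is the same training call with the main hyperparameters spelled out instead of left to their defaults. This is just a sketch: the values shown are the gensim 4.x defaults, except min_count=1, which matches the toy run above.
In [ ]:
mini_model2 = word2vec.Word2Vec(
    docs_tok,
    vector_size=100,   # dimensionality of the word vectors
    window=5,          # context window size
    min_count=1,       # keep every word, even hapaxes (default is 5)
    sg=0,              # 0 = CBOW (default); 1 = skip-gram
    epochs=5,          # passes over the corpus
    workers=3,         # worker threads
)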
TAKE 2: With a much larger corpus -- the Bible¶
In [19]:
bible_sents = nltk.corpus.gutenberg.sents('bible-kjv.txt')
len(bible_sents) # 30K sentences as our "documents"!
Out[19]:
30103
In [20]:
print(bible_sents[10])
print(bible_sents[100])
['1', ':', '6', 'And', 'God', 'said', ',', 'Let', 'there', 'be', 'a', 'firmament', 'in', 'the', 'midst', 'of', 'the', 'waters', ',', 'and', 'let', 'it', 'divide', 'the', 'waters', 'from', 'the', 'waters', '.']
['And', 'the', 'LORD', 'set', 'a', 'mark', 'upon', 'Cain', ',', 'lest', 'any', 'finding', 'him', 'should', 'kill', 'him', '.']
In [21]:
print(bible_sents[513]) # sentence has 'lamb'
print(bible_sents[17343]) # also has 'lamb'
['And', 'he', 'said', ',', 'Behold', 'the', 'fire', 'and', 'the', 'wood', ':', 'but', 'where', 'is', 'the', 'lamb', 'for', 'a', 'burnt', 'offering', '?']
['11', ':', '6', 'The', 'wolf', 'also', 'shall', 'dwell', 'with', 'the', 'lamb', ',', 'and', 'the', 'leopard', 'shall', 'lie', 'down', 'with', 'the', 'kid', ';', 'and', 'the', 'calf', 'and', 'the', 'young', 'lion', 'and', 'the', 'fatling', 'together', ';', 'and', 'a', 'little', 'child', 'shall', 'lead', 'them', '.']
In [22]:
bible_model = word2vec.Word2Vec(bible_sents)
2024-12-05 10:57:59,537 : INFO : collecting all words and their counts 2024-12-05 10:57:59,538 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types 2024-12-05 10:57:59,794 : INFO : PROGRESS: at sentence #10000, processed 371152 words, keeping 7561 word types 2024-12-05 10:57:59,998 : INFO : PROGRESS: at sentence #20000, processed 676558 words, keeping 11062 word types 2024-12-05 10:58:00,217 : INFO : PROGRESS: at sentence #30000, processed 1007009 words, keeping 13753 word types 2024-12-05 10:58:00,220 : INFO : collected 13769 word types from a corpus of 1010654 raw words and 30103 sentences 2024-12-05 10:58:00,220 : INFO : Creating a fresh vocabulary 2024-12-05 10:58:00,233 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 retains 5752 unique words (41.78% of original 13769, drops 8017)', 'datetime': '2024-12-05T10:58:00.233463', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'} 2024-12-05 10:58:00,234 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=5 leaves 996644 word corpus (98.61% of original 1010654, drops 14010)', 'datetime': '2024-12-05T10:58:00.234463', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'} 2024-12-05 10:58:00,248 : INFO : deleting the raw counts dictionary of 13769 items 2024-12-05 10:58:00,249 : INFO : sample=0.001 downsamples 52 most-common words 2024-12-05 10:58:00,250 : INFO : Word2Vec lifecycle event {'msg': 'downsampling leaves estimated 649095.9526113214 word corpus (65.1%% of prior 996644)', 'datetime': '2024-12-05T10:58:00.250157', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'prepare_vocab'} 2024-12-05 10:58:00,270 : INFO : estimated required memory for 5752 words and 100 dimensions: 7477600 bytes 2024-12-05 10:58:00,270 : INFO : resetting layer weights 2024-12-05 10:58:00,272 : INFO : Word2Vec lifecycle event {'update': False, 'trim_rule': 'None', 'datetime': '2024-12-05T10:58:00.272824', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'build_vocab'} 2024-12-05 10:58:00,273 : INFO : Word2Vec lifecycle event {'msg': 'training model with 3 workers on 5752 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2024-12-05T10:58:00.273835', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'} 2024-12-05 10:58:01,079 : INFO : EPOCH 0: training on 1010654 raw words (648902 effective words) took 0.8s, 807581 effective words/s 2024-12-05 10:58:01,880 : INFO : EPOCH 1: training on 1010654 raw words (648993 effective words) took 0.8s, 811554 effective words/s 2024-12-05 10:58:02,699 : INFO : EPOCH 2: training on 1010654 raw words (649205 effective words) took 0.8s, 795488 effective words/s 2024-12-05 10:58:03,496 : INFO : EPOCH 3: training on 1010654 raw words (649169 effective words) took 0.8s, 815735 effective words/s 2024-12-05 10:58:04,303 : INFO : EPOCH 4: training on 1010654 raw words (649202 effective words) took 0.8s, 806833 effective words/s 2024-12-05 
10:58:04,303 : INFO : Word2Vec lifecycle event {'msg': 'training on 5053270 raw words (3245471 effective words) took 4.0s, 805368 effective words/s', 'datetime': '2024-12-05T10:58:04.303748', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'train'} 2024-12-05 10:58:04,304 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=5752, vector_size=100, alpha=0.025>', 'datetime': '2024-12-05T10:58:04.304744', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'created'}
In [23]:
# Pick out keyed vectors
bible_vecs = bible_model.wv
In [24]:
print(bible_vecs['lamb'])
[-0.10634738 0.07013955 0.28733644 -0.03180248 -0.34145147 -0.22917746 0.37695035 0.4134516 -0.398545 0.05705708 -0.52578664 -0.10034814 0.17303905 0.5607802 -0.13259286 -0.07207344 -0.17577547 -0.08655512 -0.38237628 -0.999483 0.0393151 -0.11636327 0.09401457 0.25797477 -0.54819214 0.05596621 0.23513308 0.04007082 -0.44384488 0.24243113 0.0999286 0.24432923 -0.242564 0.47746024 0.06445713 0.29433477 0.24038267 -0.42000455 -0.26865244 -0.24441129 0.03042711 -0.7408437 -0.3260684 0.12681141 0.5956361 0.10835534 -0.59796226 -0.7385897 0.8027435 0.47354844 -0.222462 -0.84866244 0.08423816 0.61617374 -0.4652733 0.2823688 0.38724604 -0.45745382 0.02486628 -0.5870118 0.660567 -0.54681104 0.27277708 -0.55128664 -0.47669154 -0.08236694 0.01554628 0.07702988 -0.18164802 0.92311263 0.21868078 0.19037795 0.30324566 0.16400489 -0.14126886 0.34147033 0.5474734 0.62799126 0.13236098 0.01842692 0.2560787 -0.06881244 -0.26141122 0.22643377 0.00843488 0.23079674 -0.21653903 -0.34137565 0.2898391 0.32442418 0.01270593 -0.44295827 0.2137825 -0.01400178 -0.03946253 -0.2256962 -0.20415539 -0.08401126 0.64631706 0.5232199 ]
In [25]:
print(bible_vecs['sheep'])
[-0.2297877 0.2036469 0.19464867 0.16319872 0.06127961 -0.43554363 0.08098715 1.1056687 -0.34262478 0.2102613 0.01875907 -0.43063396 0.78162605 0.08542504 -0.07994304 -0.1195242 0.00747277 -0.36893326 -0.83466536 -0.9109421 -0.6201786 0.34743896 -0.38787806 0.1414066 0.06339884 -0.38560513 0.5268461 0.06546553 -0.4592055 -0.24673338 -0.05086117 0.00941551 -0.5002393 0.35288486 -0.37269977 0.31233364 0.26048312 -0.84954476 -0.04850565 -0.26870817 -0.17426145 -0.9056026 -0.27878875 -0.12315323 0.24142604 -0.05150128 -0.55264086 -0.57861334 0.28375182 0.51491034 -0.16883372 -1.0742437 -0.04991964 0.27253643 -0.3181179 -0.3331096 0.13306338 0.07162504 0.5891257 -0.40356547 0.61807406 -0.9595074 0.09876535 0.35463488 -0.3495619 -0.16906491 0.07802353 -0.3746161 -0.3194823 0.5783521 0.03302351 0.15600176 0.5968358 -0.12684272 -0.510524 0.11009375 0.7860386 0.15067787 -0.24348032 0.4102621 0.70493233 -0.32522455 -0.01758084 0.14102592 0.07729356 0.1318195 -0.5684477 -0.1683233 -0.11584987 0.61984867 -0.40427503 0.20648558 0.6346487 0.32663298 0.41244543 -0.14438705 0.10656042 -0.2014572 0.5916777 0.58216006]
In [26]:
print(bible_vecs['river'])
[-0.803186 0.05728564 -0.25052947 0.510781 0.883464 -0.9163292 -0.30907023 0.12017395 0.39660722 1.3516456 -0.65804625 -0.67124605 0.6047137 0.5759813 0.07302264 0.48044798 -0.3474569 -0.4648582 -0.3463599 -1.3152326 0.5716384 -0.35729864 0.2731516 0.02749742 0.45280886 -0.3752376 0.6458204 0.27593732 -0.23080002 0.19378936 0.47138965 0.2517596 0.29852024 0.7373379 -0.92030525 0.0072279 0.47383073 -0.66676706 0.45995203 0.11301974 0.61604357 -0.68161905 -0.17147414 0.32079962 0.8555968 0.28203884 0.26346374 -1.0537992 1.0887138 0.47607797 0.80205435 -0.19044387 0.45197937 1.0381436 -0.12608872 0.4043016 -0.03853792 0.6124315 -0.19902287 -0.5701562 0.33267036 -0.1464021 -0.8482341 -0.08018691 0.08890762 -0.30734926 -0.46185735 -0.34542397 -0.19072144 1.5000404 -0.30130774 -0.347349 0.16986652 -0.12443657 -0.73923147 -0.21784624 -0.61884135 -0.16633089 0.5391998 0.46523046 0.36622116 -0.50390095 -0.7953144 -0.67246085 -0.9689107 0.17528953 -0.04914942 0.01220812 0.4072517 0.37079617 0.01105633 0.19800784 0.05737998 0.08736799 0.6640961 0.17697614 0.4670399 -0.23113215 0.42930782 0.34181923]
In [27]:
bible_vecs.similarity('lamb', 'sheep')
Out[27]:
0.66571385
In [28]:
bible_vecs.similarity('lamb', 'river')
Out[28]:
0.4109837
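.similarity() is just the cosine similarity between the two word vectors. A quick sketch verifying this with NumPy (should match the 'lamb'/'sheep' value above):
In [ ]:
import numpy as np
# cosine similarity computed by hand: dot product of the two vectors over the product of their norms
v1, v2 = bible_vecs['lamb'], bible_vecs['sheep']
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))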
In [29]:
bible_vecs.most_similar("lamb")
Out[29]:
[('ram', 0.9504672288894653), ('bullock', 0.9212200045585632), ('blemish', 0.8830822110176086), ('cubit', 0.8733336329460144), ('piece', 0.8727827072143555), ('cake', 0.8658444881439209), ('ephah', 0.8589160442352295), ('astonishment', 0.8409969806671143), ('bath', 0.8369395732879639), ('lambs', 0.8364915251731873)]
In [30]:
bible_vecs.most_similar("God")
Out[30]:
[('Father', 0.7418409585952759), ('faith', 0.7175395488739014), ('Lord', 0.716336190700531), ('Christ', 0.7024569511413574), ('Spirit', 0.6941484212875366), ('truth', 0.6884136199951172), ('hosts', 0.6738569140434265), ('LORD', 0.6706632971763611), ('Holy', 0.6654433608055115), ('salvation', 0.6634987592697144)]
In [31]:
bible_vecs.most_similar("Jesus")
Out[31]:
[('Moses', 0.7085294723510742), ('David', 0.6862773299217224), ('Paul', 0.6524249911308289), ('John', 0.6425464749336243), ('Samuel', 0.6373059153556824), ('Saul', 0.625990629196167), ('Peter', 0.6211229562759399), ('prophet', 0.5959038138389587), ('he', 0.5945872068405151), ('Joshua', 0.5915449857711792)]
In [32]:
bible_vecs.most_similar("sin")
Out[32]:
[('trespass', 0.8844916820526123), ('ever', 0.7594647407531738), ('sacrifice', 0.7584801316261292), ('statute', 0.7532597184181213), ('just', 0.7389859557151794), ('meat', 0.7368711233139038), ('offering', 0.7210184335708618), ('witness', 0.7206799387931824), ('poor', 0.7042708992958069), ('atonement', 0.6876780986785889)]
In [33]:
bible_vecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)
Out[33]:
[('queen', 0.6219961047172546), ('prophet', 0.604142427444458), ('Mary', 0.5770933032035828), ('daughter', 0.576759397983551), ('Esther', 0.5758907198905945)]
In [34]:
bible_vecs.most_similar(positive=['son', 'child'], negative=['man'], topn=5)
Out[34]:
[('sister', 0.7481355667114258), ('daughter', 0.7285842299461365), ('mother', 0.7145549058914185), ('Eleazar', 0.7080657482147217), ('Mary', 0.7075715661048889)]
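Under the hood, these analogy queries do vector arithmetic on unit-normalized vectors: most_similar(positive=['woman', 'king'], negative=['man']) is roughly "king - man + woman". A rough sketch of the same idea done by hand; results can differ slightly, since most_similar() also filters out the query words themselves:
In [ ]:
# king - man + woman, then look up the vocabulary words nearest to the resulting vector
v = bible_vecs['king'] - bible_vecs['man'] + bible_vecs['woman']
print(bible_vecs.similar_by_vector(v, topn=5))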
In [35]:
%pprint
dir(bible_vecs)
Pretty printing has been turned OFF
Out[35]:
['__class__', '__contains__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setitem__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_adapt_by_suffix', '_load_specials', '_log_evaluate_word_analogies', '_save_specials', '_smart_save', '_upconvert_old_d2vkv', '_upconvert_old_vocab', 'add_lifecycle_event', 'add_vector', 'add_vectors', 'allocate_vecattrs', 'closer_than', 'cosine_similarities', 'distance', 'distances', 'doesnt_match', 'evaluate_word_analogies', 'evaluate_word_pairs', 'expandos', 'fill_norms', 'get_index', 'get_mean_vector', 'get_normed_vectors', 'get_vecattr', 'get_vector', 'has_index_for', 'index2entity', 'index2word', 'index_to_key', 'init_sims', 'intersect_word2vec_format', 'key_to_index', 'load', 'load_word2vec_format', 'log_accuracy', 'log_evaluate_word_pairs', 'mapfile_path', 'most_similar', 'most_similar_cosmul', 'most_similar_to_given', 'n_similarity', 'next_index', 'norms', 'rank', 'rank_by_centrality', 'relative_cosine_similarity', 'resize_vectors', 'save', 'save_word2vec_format', 'set_vecattr', 'similar_by_key', 'similar_by_vector', 'similar_by_word', 'similarity', 'similarity_unseen_docs', 'sort_by_descending_frequency', 'unit_normalize_all', 'vector_size', 'vectors', 'vectors_for_all', 'vectors_lockf', 'vectors_norm', 'vocab', 'wmdistance', 'word_vec', 'words_closer_than']
In [36]:
help(bible_vecs.doesnt_match)
Help on method doesnt_match in module gensim.models.keyedvectors:

doesnt_match(words) method of gensim.models.keyedvectors.KeyedVectors instance
    Which key from the given list doesn't go with the others?

    Parameters
    ----------
    words : list of str
        List of keys.

    Returns
    -------
    str
        The key further away from the mean of all keys.
In [37]:
bible_vecs.doesnt_match(['eat', 'bread', 'wine', 'smite'])
Out[37]:
'smite'
(3) Using Google's pre-trained Word2Vec models¶
- Let's try Google's pre-trained model, downloaded from here: https://code.google.com/archive/p/word2vec/
- The GoogleNews-vectors-negative300.bin.gz file is 1.5GB gzipped, 3.39GB (!!!) unzipped.
In [38]:
from gensim.models import KeyedVectors
filename = 'd:/Lab/word_vectors/GoogleNews-vectors-negative300.bin'
goog_vecs = KeyedVectors.load_word2vec_format(filename, binary=True)
# Read in directly as keyed vectors
2024-12-05 10:59:45,490 : INFO : loading projection weights from d:/Lab/word_vectors/GoogleNews-vectors-negative300.bin 2024-12-05 10:59:58,745 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (3000000, 300) matrix of type float32 from d:/Lab/word_vectors/GoogleNews-vectors-negative300.bin', 'binary': True, 'encoding': 'utf8', 'datetime': '2024-12-05T10:59:58.745481', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'load_word2vec_format'}
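As an alternative to downloading and unzipping the .bin file yourself, gensim's downloader API can fetch the same pre-trained vectors (a large download, cached under ~/gensim-data). Not run here, just a sketch:
In [ ]:
import gensim.downloader as api
# returns a KeyedVectors object, equivalent to loading the .bin file above
goog_vecs = api.load('word2vec-google-news-300')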
In [39]:
print(type(bible_model)) # full model
print(type(bible_vecs)) # keyed vectors
print(type(goog_vecs)) # keyed vectors
<class 'gensim.models.word2vec.Word2Vec'>
<class 'gensim.models.keyedvectors.KeyedVectors'>
<class 'gensim.models.keyedvectors.KeyedVectors'>
In [40]:
goog_vecs.most_similar('pretty')
Out[40]:
[('pretty_darn', 0.8196645379066467), ('very', 0.7625877857208252), ('awfully', 0.7502423524856567), ('quite', 0.7455620765686035), ('fairly', 0.7299395799636841), ('pretty_darned', 0.7151399850845337), ('unbelievably', 0.7105132341384888), ('really', 0.6961446404457092), ('incredibly', 0.6918126940727234), ('Pretty', 0.6728280186653137)]
In [41]:
goog_vecs.most_similar('lamb')
Out[41]:
[('lambs', 0.735481858253479), ('lamb_chops', 0.6806010007858276), ('sheep', 0.6539289355278015), ('loin', 0.6495908498764038), ('meat', 0.6480672955513), ('veal', 0.6419938206672668), ('hogget', 0.6389446258544922), ('mutton', 0.6381454467773438), ('roasted_lamb', 0.6320940852165222), ('suckling_pig', 0.6315929889678955)]
In [42]:
# calculate: (king + woman) - man = ?
goog_vecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)
Out[42]:
[('queen', 0.7118193507194519), ('monarch', 0.6189674139022827), ('princess', 0.5902431011199951), ('crown_prince', 0.5499460697174072), ('prince', 0.5377321839332581)]
In [43]:
goog_vecs.most_similar(positive=['woman', 'actor'], negative=['man'], topn=5)
Out[43]:
[('actress', 0.8602624535560608), ('actresses', 0.6596670150756836), ('thesp', 0.6290916800498962), ('Actress', 0.6165294647216797), ('actress_Rachel_Weisz', 0.5997323989868164)]
In [44]:
goog_vecs.similarity('woman', 'man')
Out[44]:
0.76640123
In [45]:
goog_vecs.similarity('woman', 'tiger')
Out[45]:
0.18899678
In [46]:
goog_vecs.similarity('woman', 'telephone')
Out[46]:
0.121303424
In [47]:
print(goog_vecs.similarity('coffee', 'tea'))
print(goog_vecs.similarity('coffee', 'sushi'))
0.5635292
0.37408432
In [48]:
print(goog_vecs.similarity('couch', 'sofa'))
print(goog_vecs.similarity('couch', 'desk'))
0.8309179
0.40990508
In [49]:
print(goog_vecs.similarity('teacher', 'professor'))
print(goog_vecs.similarity('teacher', 'singer'))
print(goog_vecs.similarity('teacher', 'spoon'))
print(goog_vecs.similarity('teacher', 'platypus'))
0.39003685
0.21822901
0.10365092
0.027399706
In [50]:
print(goog_vecs.similarity('lecturer', 'professor'))
0.8011324
In [51]:
goog_vecs.similarity('husband', 'wife')
Out[51]:
0.8294167
In [52]:
goog_vecs.similarity('alligator', 'gator')
Out[52]:
0.8633207
In [53]:
goog_vecs.similarity('woodchuck', 'groundhog')
Out[53]:
0.6868968
In [54]:
goog_vecs.doesnt_match(['teacher', 'professor', 'mentor', 'cook'])
Out[54]:
'cook'
In [55]:
goog_vecs['lamb']
# vector has 300 dimensions
Out[55]:
array([-8.00781250e-02, 1.56250000e-01, -3.53515625e-01, 2.21679688e-01, 2.36328125e-01, 1.69921875e-01, 7.95898438e-02, -1.26953125e-02, -4.78515625e-02, 4.66796875e-01, -6.39648438e-02, -2.71484375e-01, -1.92382812e-01, -6.34765625e-02, -2.32421875e-01, 8.30078125e-03, -4.37500000e-01, 5.54199219e-02, -3.22265625e-01, -2.73437500e-01, 4.10156250e-02, -6.49414062e-02, 2.33398438e-01, 1.68945312e-01, -1.19140625e-01, -1.01562500e-01, -1.20605469e-01, 1.54296875e-01, 1.13769531e-01, -1.84570312e-01, -1.80664062e-01, 2.73437500e-01, -8.42285156e-03, -4.66918945e-03, -1.95312500e-01, 1.33789062e-01, 7.03125000e-02, -5.37109375e-02, 1.28906250e-01, 1.36718750e-01, -6.29882812e-02, -4.10156250e-02, -7.22656250e-02, 5.34667969e-02, 6.93359375e-02, -2.48046875e-01, -5.34667969e-02, 2.55859375e-01, 1.55273438e-01, 1.13769531e-01, -2.73437500e-01, -9.42382812e-02, -2.04101562e-01, 1.17187500e-01, 3.12500000e-02, -2.27539062e-01, -1.43554688e-01, 1.41601562e-01, -5.78613281e-02, -1.39770508e-02, -3.63281250e-01, 1.83868408e-03, -2.67578125e-01, -2.87109375e-01, 2.89062500e-01, -4.19921875e-01, -2.67333984e-02, -3.47656250e-01, 4.66918945e-03, -7.47070312e-02, 2.48046875e-01, -4.19921875e-02, -2.08007812e-01, 1.61132812e-02, 1.17675781e-01, -2.16796875e-01, -3.14453125e-01, -7.86132812e-02, -1.61132812e-02, -1.94335938e-01, 6.34765625e-02, -1.45507812e-01, -1.83105469e-02, -1.20117188e-01, -2.06054688e-01, -7.66754150e-04, -2.50000000e-01, 3.73046875e-01, -2.49023438e-02, -9.70458984e-03, -1.25000000e-01, 1.08032227e-02, -2.50000000e-01, -2.92968750e-01, 3.88183594e-02, -5.03906250e-01, 1.36718750e-01, -2.63671875e-02, 1.68945312e-01, -7.61718750e-01, -1.34765625e-01, -1.70898438e-01, -2.77099609e-02, -1.25000000e-01, 1.87988281e-02, -3.10546875e-01, -1.94335938e-01, 1.40625000e-01, 6.64062500e-02, -2.33154297e-02, -2.10937500e-01, -1.91650391e-02, 2.05078125e-01, 2.98828125e-01, -9.96093750e-02, -8.78906250e-02, -3.16406250e-01, -2.63671875e-01, -2.63671875e-01, -6.05468750e-02, 8.64257812e-02, 4.12597656e-02, 1.83593750e-01, -3.47656250e-01, -1.55273438e-01, 1.96533203e-02, -4.37011719e-02, -1.17187500e-01, -1.20117188e-01, 2.21679688e-01, -3.26171875e-01, 1.81640625e-01, -4.73632812e-02, 2.24609375e-01, 3.78417969e-02, 1.75781250e-01, -3.02734375e-02, 1.59179688e-01, 8.15429688e-02, 2.55859375e-01, 1.62109375e-01, -3.35937500e-01, 1.63085938e-01, -8.64257812e-02, -1.73828125e-01, 1.07910156e-01, -1.66015625e-01, -1.57928467e-03, -4.16015625e-01, 1.95312500e-01, 7.47070312e-02, 4.41894531e-02, 6.68945312e-02, 2.02148438e-01, 2.58789062e-02, -1.94091797e-02, -3.33984375e-01, -1.08886719e-01, -3.75000000e-01, -5.15136719e-02, -4.31640625e-01, 2.09960938e-01, 4.78515625e-01, -2.39257812e-02, 3.27148438e-02, 1.21582031e-01, 2.50000000e-01, 6.22558594e-02, 4.56542969e-02, 2.46093750e-01, -2.30468750e-01, 2.83203125e-02, -2.80761719e-02, -5.78613281e-02, -3.16406250e-01, 2.72216797e-02, 4.60937500e-01, 1.31835938e-01, -5.54199219e-02, 2.30468750e-01, -2.57812500e-01, -2.23632812e-01, -1.51824951e-03, 2.17285156e-02, 1.43554688e-01, 4.36401367e-03, -1.38671875e-01, 6.39648438e-02, -1.72851562e-01, 4.62890625e-01, -8.49609375e-02, 1.71875000e-01, -9.66796875e-02, 2.39257812e-02, 1.99218750e-01, 3.80859375e-01, 1.33789062e-01, 7.17773438e-02, -3.95507812e-02, -1.30859375e-01, -2.03125000e-01, -2.69531250e-01, 5.82885742e-03, -2.50000000e-01, -4.29687500e-02, -1.87500000e-01, -5.41992188e-02, -2.13623047e-02, 4.49218750e-01, -1.18164062e-01, -3.20312500e-01, 3.63769531e-02, -6.12792969e-02, -2.00195312e-01, 
-7.76367188e-02, -6.68945312e-02, 1.43554688e-01, -1.47460938e-01, 2.59765625e-01, -3.95507812e-02, 2.45117188e-01, 2.06298828e-02, 1.44531250e-01, -5.72204590e-04, 1.35742188e-01, -1.79687500e-01, 2.32421875e-01, 1.99218750e-01, 6.44531250e-02, 2.12890625e-01, -1.54296875e-01, -1.50390625e-01, -6.03027344e-02, 5.31250000e-01, -2.14843750e-01, -1.99218750e-01, 2.61718750e-01, 1.52343750e-01, -1.44531250e-01, 2.89062500e-01, 2.21679688e-01, 4.54101562e-02, 1.88476562e-01, 1.68945312e-01, 2.07519531e-02, -6.25000000e-01, 1.63085938e-01, 1.92382812e-01, -9.37500000e-02, 6.09375000e-01, -1.19628906e-01, -8.93554688e-02, -6.44531250e-02, -1.77734375e-01, 1.32812500e-01, -2.63671875e-02, 1.80664062e-01, 1.69921875e-01, -5.83496094e-02, -2.59765625e-01, 1.69921875e-01, 2.27539062e-01, -6.54296875e-02, -2.18505859e-02, 7.56835938e-02, -3.92578125e-01, -1.56250000e-01, 1.41601562e-01, -7.81250000e-02, -1.29882812e-01, -1.84570312e-01, 6.17675781e-02, -9.81445312e-02, 7.56835938e-02, 3.18359375e-01, -4.62890625e-01, 8.39843750e-02, 1.37695312e-01, 2.07519531e-03, 3.10546875e-01, 4.68750000e-02, 1.26953125e-01, 1.21093750e-01, -8.49609375e-02, -7.03125000e-02, 1.69921875e-01, 3.00292969e-02, 3.53515625e-01, -7.91015625e-02, 3.65234375e-01, -2.26562500e-01, -2.79296875e-01, -1.18164062e-01, 4.31640625e-01, 1.95312500e-01, 9.03320312e-02, 3.35937500e-01, 1.59912109e-02, 2.11914062e-01, 2.33398438e-01], dtype=float32)
(4) Stanford's Pre-trained GloVe¶
- GloVe project home: https://nlp.stanford.edu/projects/glove/
- Old tutorial of mine from 2018: https://sites.pitt.edu/~naraehan/presentation/word2vec-try.html#Load-Stanford's-pre-trained-GloVe-embedding
- gensim's file-loading routine has since been updated; the new routine is shown below.
In [56]:
from gensim.models import KeyedVectors
# load the Stanford GloVe model
filename = 'd:/Lab/word_vectors/glove/glove.6B.100d.txt' # load .txt file directly
glove_vecs = KeyedVectors.load_word2vec_format(filename, binary=False, no_header=True)
2024-12-05 11:00:33,968 : INFO : loading projection weights from d:/Lab/word_vectors/glove/glove.6B.100d.txt 2024-12-05 11:00:50,104 : INFO : KeyedVectors lifecycle event {'msg': 'loaded (400000, 100) matrix of type float32 from d:/Lab/word_vectors/glove/glove.6B.100d.txt', 'binary': False, 'encoding': 'utf8', 'datetime': '2024-12-05T11:00:50.104247', 'gensim': '4.3.2', 'python': '3.11.4 (tags/v3.11.4:d2340ef, Jun 7 2023, 05:45:37) [MSC v.1934 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19045-SP0', 'event': 'load_word2vec_format'}
In [57]:
# calculate: (king - man) + woman = ?
result = glove_vecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=5)
print(result)
[('queen', 0.7698540687561035), ('monarch', 0.6843381524085999), ('throne', 0.6755736470222473), ('daughter', 0.6594556570053101), ('princess', 0.6520534157752991)]
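Pre-packaged GloVe vectors are also available through gensim's downloader API, which skips the manual file handling above. A sketch (model name taken from the gensim-data catalog; not run here):
In [ ]:
import gensim.downloader as api
# 100-dimensional GloVe vectors trained on Wikipedia + Gigaword, fetched and cached automatically
glove_vecs = api.load('glove-wiki-gigaword-100')
print(glove_vecs.most_similar(positive=['woman', 'king'], negative=['man'], topn=5))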
In [ ]: