Nerding out with IPA: HEARDLE vs. CMU Pronouncing Dictionary¶

  • PyLing, September 17, 2025
  • Na-Rae Han (naraehan@pitt.edu)

We all know and love WORDLE, but have you heard of, ahem, HEARDLE (https://joshuamandel.com/heardle/)?


It's an IPA-based "phonemic version" of the English word game Wordle, and it has a way of tickling our linguist brains. HEARDLE is based on the famous CMU Pronouncing Dictionary. Let's explore the two! We'll address such burning questions as:

  • Is Heardle harder than Wordle? Why? How do we prove it?
  • What would make an excellent "opener" word? For Wordle, many folks swear by adieu. Can we find something similar?
  • CCVCC, CVCVC, VCCVC, ... which CV patterns are most common?

Trying out HEARDLE¶

  • Head to https://joshuamandel.com/heardle/ and try out today's puzzle
  • Is it harder than WORDLE? In what way?
  • Alphabet pool: how big is it for HEARDLE?
  • How big is the target word pool?

Exploring the CMU Pronouncing Dictionary¶

  • Part of NLTK's corpus resource: nltk.corpus.cmudict

Q1: How is it structured? How many entries?¶

In [1]:
from nltk.corpus import cmudict
prondict = cmudict.dict()
In [2]:
prondict['pumpkin']
Out[2]:
[['P', 'AH1', 'M', 'P', 'K', 'IH0', 'N'], ['P', 'AH1', 'M', 'K', 'IH0', 'N']]
In [3]:
len(prondict)
Out[3]:
123455
In [4]:
'linguist' in prondict
Out[4]:
True
In [5]:
prondict['often']
Out[5]:
[['AO1', 'F', 'AH0', 'N'], ['AO1', 'F', 'T', 'AH0', 'N']]
In [6]:
prondict['accent']
Out[6]:
[['AH0', 'K', 'S', 'EH1', 'N', 'T'], ['AE1', 'K', 'S', 'EH2', 'N', 'T']]
In [7]:
pronlist = cmudict.entries()  # as a list
len(pronlist)
Out[7]:
133737
In [8]:
prondict['read']    # Nya: homographic words? 
Out[8]:
[['R', 'EH1', 'D'], ['R', 'IY1', 'D']]
In [9]:
pronlist[469:475]
Out[9]:
[('accelerometers',
  ['AE0', 'K', 'S', 'EH2', 'L', 'ER0', 'AA1', 'M', 'AH0', 'T', 'ER0', 'Z']),
 ('accent', ['AH0', 'K', 'S', 'EH1', 'N', 'T']),
 ('accent', ['AE1', 'K', 'S', 'EH2', 'N', 'T']),
 ('accented', ['AE1', 'K', 'S', 'EH0', 'N', 'T', 'IH0', 'D']),
 ('accenting', ['AE1', 'K', 'S', 'EH0', 'N', 'T', 'IH0', 'NG']),
 ('accents', ['AE1', 'K', 'S', 'EH0', 'N', 'T', 'S'])]
In [10]:
prondict['calculator']  # Jenna's word 
Out[10]:
[['K', 'AE1', 'L', 'K', 'Y', 'AH0', 'L', 'EY2', 'T', 'ER0']]

Q2: How long is the longest word? How many sounds?¶

In [11]:
for x in sorted([(len(pron), w, pron) for (w, pron) in pronlist], reverse=True)[:5]:
    print(x)
(32, 'supercalifragilisticexpealidoshus', ['S', 'UW2', 'P', 'ER0', 'K', 'AE2', 'L', 'AH0', 'F', 'R', 'AE1', 'JH', 'AH0', 'L', 'IH2', 'S', 'T', 'IH0', 'K', 'EH2', 'K', 'S', 'P', 'IY0', 'AE2', 'L', 'AH0', 'D', 'OW1', 'SH', 'AH0', 'S'])
(28, 'antidisestablishmentarianism', ['AE2', 'N', 'T', 'AY0', 'D', 'IH0', 'S', 'AH0', 'S', 'T', 'AE2', 'B', 'L', 'IH0', 'SH', 'M', 'AH0', 'N', 'T', 'EH1', 'R', 'IY0', 'AH0', 'N', 'IH0', 'Z', 'AH0', 'M'])
(20, 'deinstitutionalization', ['D', 'IY0', 'IH2', 'N', 'S', 'T', 'IH0', 'T', 'UW2', 'SH', 'AH0', 'N', 'AH0', 'L', 'AH0', 'Z', 'EY1', 'SH', 'AH0', 'N'])
(19, 'supercalifragilistic', ['S', 'UW2', 'P', 'ER0', 'K', 'AE2', 'L', 'AH0', 'F', 'R', 'AE1', 'JH', 'AH0', 'L', 'IH2', 'S', 'T', 'IH0', 'K'])
(19, 'extraterritoriality', ['EH2', 'K', 'S', 'T', 'R', 'AH0', 'T', 'EH2', 'R', 'AH0', 'T', 'AO2', 'R', 'IY0', 'AE1', 'L', 'AH0', 'T', 'IY0'])

Q3: How many words end in /ʒ/?¶

In [12]:
[(w,pron) for (w,pron) in pronlist if pron[-1] == 'ZH']
Out[12]:
[('arbitrage', ['AA1', 'R', 'B', 'IH0', 'T', 'R', 'AA2', 'ZH']),
 ('barrage', ['B', 'ER0', 'AA1', 'ZH']),
 ('beige', ['B', 'EY1', 'ZH']),
 ('bruges', ['B', 'R', 'UW1', 'ZH']),
 ('camouflage', ['K', 'AE1', 'M', 'AH0', 'F', 'L', 'AA2', 'ZH']),
 ('collage', ['K', 'AH0', 'L', 'AA1', 'ZH']),
 ('concierge', ['K', 'AA2', 'N', 'S', 'IY0', 'EH1', 'R', 'ZH']),
 ('corsage', ['K', 'AO0', 'R', 'S', 'AA1', 'ZH']),
 ('cortege', ['K', 'AO0', 'R', 'T', 'EH1', 'ZH']),
 ('dhiraj', ['D', 'IH2', 'R', 'AA1', 'ZH']),
 ('dressage', ['D', 'R', 'EH0', 'S', 'AA1', 'ZH']),
 ('entourage', ['AA2', 'N', 'T', 'UH0', 'R', 'AA1', 'ZH']),
 ('entourage', ['AA2', 'N', 'T', 'ER0', 'AA1', 'ZH']),
 ('garage', ['G', 'ER0', 'AA1', 'ZH']),
 ('limoges', ['L', 'AH0', 'M', 'OW1', 'ZH']),
 ('massage', ['M', 'AH0', 'S', 'AA1', 'ZH']),
 ('mirage', ['M', 'ER0', 'AA1', 'ZH']),
 ('montage', ['M', 'AA0', 'N', 'T', 'AA1', 'ZH']),
 ('prestige', ['P', 'R', 'EH0', 'S', 'T', 'IY1', 'ZH']),
 ('raj', ['R', 'AA1', 'ZH']),
 ('rouge', ['R', 'UW1', 'ZH']),
 ('sabotage', ['S', 'AE1', 'B', 'AH0', 'T', 'AA2', 'ZH']),
 ('taj', ['T', 'AA1', 'ZH']),
 ('thivierge', ['TH', 'IH0', 'V', 'Y', 'EH1', 'R', 'ZH'])]
In [13]:
len(_)
Out[13]:
24

Q4: How many 5-phone words (aka "Heardle words")?¶

In [14]:
cmuphone5 = [(w,pron) for (w,pron) in pronlist if len(pron)==5]
len(cmuphone5)
Out[14]:
25821
In [15]:
cmuletter5 = [(w,pron) for (w,pron) in pronlist if len(w)==5]
len(cmuletter5)
Out[15]:
15469

Q5: And is that larger or smaller than 5-letter words (aka "Wordle words")?¶

  • Compare with: nltk.corpus.words.words('en')
In [16]:
import nltk
enwords = nltk.corpus.words.words('en')
len(enwords)
Out[16]:
235886
In [17]:
%pprint
enwords[:100]
Pretty printing has been turned OFF
Out[17]:
['A', 'a', 'aa', 'aal', 'aalii', 'aam', 'Aani', 'aardvark', 'aardwolf', 'Aaron', 'Aaronic', 'Aaronical', 'Aaronite', 'Aaronitic', 'Aaru', 'Ab', 'aba', 'Ababdeh', 'Ababua', 'abac', 'abaca', 'abacate', 'abacay', 'abacinate', 'abacination', 'abaciscus', 'abacist', 'aback', 'abactinal', 'abactinally', 'abaction', 'abactor', 'abaculus', 'abacus', 'Abadite', 'abaff', 'abaft', 'abaisance', 'abaiser', 'abaissed', 'abalienate', 'abalienation', 'abalone', 'Abama', 'abampere', 'abandon', 'abandonable', 'abandoned', 'abandonedly', 'abandonee', 'abandoner', 'abandonment', 'Abanic', 'Abantes', 'abaptiston', 'Abarambo', 'Abaris', 'abarthrosis', 'abarticular', 'abarticulation', 'abas', 'abase', 'abased', 'abasedly', 'abasedness', 'abasement', 'abaser', 'Abasgi', 'abash', 'abashed', 'abashedly', 'abashedness', 'abashless', 'abashlessly', 'abashment', 'abasia', 'abasic', 'abask', 'Abassin', 'abastardize', 'abatable', 'abate', 'abatement', 'abater', 'abatis', 'abatised', 'abaton', 'abator', 'abattoir', 'Abatua', 'abature', 'abave', 'abaxial', 'abaxile', 'abaze', 'abb', 'Abba', 'abbacomes', 'abbacy', 'Abbadide']
In [18]:
letter5 = [w for w in enwords if len(w) == 5]
len(letter5)
Out[18]:
10230
In [19]:
letter5[:100]
Out[19]:
['aalii', 'Aaron', 'abaca', 'aback', 'abaff', 'abaft', 'Abama', 'abase', 'abash', 'abask', 'abate', 'abave', 'abaze', 'abbas', 'abbey', 'Abbie', 'abbot', 'abdal', 'abdat', 'abeam', 'abear', 'abele', 'abhor', 'abide', 'abidi', 'Abies', 'abilo', 'abkar', 'abler', 'ablow', 'abmho', 'Abner', 'abnet', 'abode', 'abody', 'abohm', 'aboil', 'aboma', 'aboon', 'abord', 'abort', 'about', 'above', 'Abram', 'abret', 'abrim', 'abrin', 'Abrus', 'absit', 'abuna', 'abura', 'abuse', 'Abuta', 'abuzz', 'abwab', 'abysm', 'abyss', 'acana', 'acapu', 'acara', 'acari', 'acate', 'accoy', 'acedy', 'acerb', 'achar', 'Achen', 'acher', 'achor', 'acier', 'acker', 'ackey', 'aclys', 'acmic', 'acock', 'acoin', 'acold', 'Acoma', 'acoma', 'acone', 'acorn', 'Acrab', 'acred', 'acrid', 'Acroa', 'acron', 'Acrux', 'acryl', 'actin', 'acton', 'actor', 'Acuan', 'acute', 'adage', 'Adapa', 'adapt', 'adati', 'adawe', 'adawn', 'adays']

Q6: ARPABET is confusing. Can we see some real IPA?¶

  • Mapping found in Na-Rae's "text samples" file: https://sites.pitt.edu/~naraehan/python3/text-samples.txt
In [20]:
arpa_map = {'IY0':'i', 'IH0':'ɪ', 'EH0':'ɛ', 'AE0':'æ', 'AA0':'ɑ', 'AO0':'ɔ', 'AH0':'ʌ/ə', 'UH0':'ʊ',
          'UW0':'u', 'ER0':'ɝ/ɚ', 'AY0':'aɪ', 'EY0':'eɪ', 'AW0':'aʊ', 'OW0':'oʊ', 'OY0':'ɔɪ',
          'IY1':'i', 'IH1':'ɪ', 'EH1':'ɛ', 'AE1':'æ', 'AA1':'ɑ', 'AO1':'ɔ', 'AH1':'ʌ/ə', 'UH1':'ʊ',
          'UW1':'u', 'ER1':'ɝ/ɚ', 'AY1':'aɪ', 'EY1':'eɪ', 'AW1':'aʊ', 'OW1':'oʊ', 'OY1':'ɔɪ',
          'IY2':'i', 'IH2':'ɪ', 'EH2':'ɛ', 'AE2':'æ', 'AA2':'ɑ', 'AO2':'ɔ', 'AH2':'ʌ/ə', 'UH2':'ʊ',
          'UW2':'u', 'ER2':'ɝ/ɚ', 'AY2':'aɪ', 'EY2':'eɪ', 'AW2':'aʊ', 'OW2':'oʊ', 'OY2':'ɔɪ',
          'P':'p', 'B':'b', 'T':'t', 'D':'d', 'K':'k', 'G':'g', 'M':'m', 'N':'n', 'NG':'ŋ',
          'F':'f', 'V':'v', 'TH':'θ', 'DH':'ð', 'S':'s', 'Z':'z', 'SH':'ʃ', 'ZH':'ʒ',
          'HH':'h', 'CH':'tʃ', 'JH':'dʒ', 'W':'w', 'R':'ɹ', 'Y':'j', 'L':'l'}

def ipa_fy(phones):
    "Converts CMU arpabet list to IPA string. Ignores stress."
    return ' '.join([arpa_map[p] for p in phones])

ipa_fy(['AE1', 'NG', 'K', 'SH', 'AH0', 'S'])
Out[20]:
'æ ŋ k ʃ ʌ/ə s'
In [21]:
prondict['anxious']
Out[21]:
[['AE1', 'NG', 'K', 'SH', 'AH0', 'S'], ['AE1', 'NG', 'SH', 'AH0', 'S']]
In [22]:
for pron in prondict['anxious']:
    print(ipa_fy(pron))
æ ŋ k ʃ ʌ/ə s
æ ŋ ʃ ʌ/ə s
In [23]:
ipa_fy(prondict['aphasia'][0])  # Ben's word
Out[23]:
'ʌ/ə f eɪ ʒ ʌ/ə'

Dissecting HEARDLE¶

  • Let's examine HEARDLE's source code. Can you find the two data files derived from the CMU Pronouncing Dictionary?
    • One for all words, of every length
    • Another for a smaller set of "target" words: 5 segments long and less obscure. These can be HEARDLE answers.
In [24]:
import pandas as pd
allwords_df = pd.read_json('https://rawcdn.githack.com/jmandel/heardle/453f0c8feb0d1755788a5a7c8d0bd16baf8be130/words.json')
In [25]:
allwords_df[100:110]
Out[25]:
         word  variant                      phonemes  stress
100      ABAD        0                [AH, B, AA, D]       2
101   ABADAKA        0     [AH, B, AE, D, AH, K, AH]       2
102     ABADI        0            [AH, B, AE, D, IY]       2
103    ABADIE        0            [AH, B, AE, D, IY]       2
104     ABAIR        0                [AH, B, EH, R]       2
105   ABALKIN        0      [AH, B, AA, L, K, IH, N]       2
106   ABALONE        0     [AE, B, AH, L, OW, N, IY]       4
107  ABALONES        0  [AE, B, AH, L, OW, N, IY, Z]       4
108    ABALOS        0         [AA, B, AA, L, OW, Z]       2
109   ABANDON        0      [AH, B, AE, N, D, AH, N]       2
In [26]:
allwords_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128456 entries, 0 to 128455
Data columns (total 4 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   word      128456 non-null  object
 1   variant   128456 non-null  int64 
 2   phonemes  128456 non-null  object
 3   stress    128456 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 3.9+ MB
In [27]:
allwords_df[allwords_df.phonemes.map(len)==5]
Out[27]:
           word  variant            phonemes  stress
37      (PARENS        0   [P, ER, EH, N, Z]       2
44      )PARENS        0   [P, ER, EH, N, Z]       2
50      -HYPHEN        0  [HH, AY, F, AH, N]       1
58          3-D        0  [TH, R, IY, D, IY]       2
59           3D        0  [TH, R, IY, D, IY]       2
...         ...      ...                 ...     ...
128432    ZYCAD        0   [Z, IH, K, AE, D]       1
128437   ZYGOTE        0   [Z, AY, G, OW, T]       1
128439    ZYLKA        0   [Z, IH, L, K, AH]       1
128441    ZYMAN        0   [Z, AY, M, AH, N]       1
128442    ZYNDA        0   [Z, IH, N, D, AH]       1

26017 rows × 4 columns

In [28]:
targetwords_df = pd.read_json('https://joshuamandel.com/heardle/words-to-target.json')
targetwords_df.head()
Out[28]:
      word  variant            phonemes  stress         p
0  abashed        0  [AH, B, AE, SH, T]       2  0.941176
1    abhor        0  [AE, B, HH, AO, R]       3  0.885856
2   ablate        0   [AH, B, L, EY, T]       3  0.550000
3   ablaze        0   [AH, B, L, EY, Z]       3  0.983562
4   abloom        0   [AH, B, L, UW, M]       3  0.705234
In [29]:
targetwords_df.tail()
Out[29]:
         word  variant            phonemes  stress         p
4110  zipping        0  [Z, IH, P, IH, NG]       1  0.988180
4111   zircon        0   [Z, ER, K, AH, N]       1  0.677966
4112   zombie        0   [Z, AA, M, B, IY]       1  0.997669
4113   zoning        0  [Z, OW, N, IH, NG]       1  0.958525
4114   zygote        0   [Z, AY, G, OW, T]       1  0.884804
In [30]:
targetwords = list(zip(targetwords_df.word, targetwords_df.phonemes))
targetwords[-5:]
Out[30]:
[('zipping', ['Z', 'IH', 'P', 'IH', 'NG']), ('zircon', ['Z', 'ER', 'K', 'AH', 'N']), ('zombie', ['Z', 'AA', 'M', 'B', 'IY']), ('zoning', ['Z', 'OW', 'N', 'IH', 'NG']), ('zygote', ['Z', 'AY', 'G', 'OW', 'T'])]
In [31]:
len(targetwords)
Out[31]:
4115

4115 target words. Let's get up close!¶

Q1: What sort of words are in there?¶

In [32]:
[w for (w,pron) in targetwords][:100]
Out[32]:
['abashed', 'abhor', 'ablate', 'ablaze', 'abloom', 'aboard', 'abort', 'abound', 'abrade', 'abridge', 'abroad', 'abscess', 'absurd', 'acacia', 'accede', 'access', 'acclaim', 'accord', 'accost', 'account', 'accursed', 'accuse', 'achiever', 'acorn', 'acquire', 'acquit', 'acrid', 'across', 'acting', 'action', 'active', 'acute', 'adapt', 'addled', 'adduct', 'adept', 'adhere', 'adjust', 'admin', 'admire', 'admit', 'adobe', 'adopt', 'adorn', 'adroit', 'adverb', 'advert', 'aerial', 'aesthete', 'affect', 'afford', 'afghan', 'afield', 'aflame', 'afloat', 'afraid', 'afresh', 'ageless', 'agent', 'aggress', 'aggrieve', 'aghast', 'agony', 'agreed', 'ahold', 'aimless', 'airbag', 'airbase', 'airboat', 'aircrew', 'airfare', 'airflow', 'airfoil', 'airhead', 'airless', 'airline', 'airlock', 'airmail', 'airman', 'airpower', 'airship', 'airtight', 'airtime', 'airwave', 'alarm', 'album', 'alcove', 'alehouse', 'alias', 'alibi', 'alien', 'alleged', 'allele', 'allergy', 'alleyway', 'almond', 'aloft', 'aloha', 'alpine', 'alright']
In [33]:
[w for (w,pron) in targetwords][2000:2100]
Out[33]:
['lineup', 'lingo', 'lining', 'linked', 'linker', 'links', 'lipid', 'listen', 'lithic', 'little', 'lively', 'liven', 'livery', 'livid', 'living', 'lizard', 'loading', 'loathing', 'local', 'locale', 'locate', 'locket', 'locking', 'lockout', 'lockup', 'locus', 'lodging', 'lofty', 'logging', 'logic', 'login', 'logos', 'lonely', 'longbow', 'longer', 'longing', 'longish', 'looking', 'lookout', 'lookup', 'looming', 'looping', 'loosely', 'loosen', 'loosing', 'lopping', 'losing', 'lotion', 'lottery', 'lotus', 'loudly', 'lovely', 'loving', 'lowdown', 'lowering', 'loyally', 'lucid', 'luddite', 'lumber', 'lumen', 'lumpy', 'lunged', 'lupine', 'lupus', 'lurid', 'luscious', 'lushly', 'luster', 'lusty', 'lycra', 'lynx', 'lyric', 'lysine', 'machine', 'macro', 'madam', 'madden', 'madding', 'madly', 'mafia', 'maggot', 'magic', 'magma', 'magna', 'magpie', 'magus', 'maiden', 'mailing', 'mainly', 'maitre', 'makeup', 'making', 'malaise', 'malign', 'mallard', 'mallet', 'mamba', 'mambo', 'mammal', 'mammary']
In [34]:
[w for (w,pron) in targetwords][-10:]
Out[34]:
['youtube', 'zealous', 'zebra', 'zenith', 'zippered', 'zipping', 'zircon', 'zombie', 'zoning', 'zygote']
In [35]:
prondict['zenith']
Out[35]:
[['Z', 'IY1', 'N', 'AH0', 'TH'], ['Z', 'IY1', 'N', 'IH0', 'TH']]

Q2: Can we see some glorious IPA?¶

  • We gotta extend our arpa_map to include vowels without stress marking
In [36]:
arpa_map_nostress = {'IY':'i', 'IH':'ɪ', 'EH':'ɛ', 'AE':'æ', 'AA':'ɑ', 'AO':'ɔ', 'AH':'ʌ/ə', 'UH':'ʊ',
          'UW':'u', 'ER':'ɝ/ɚ', 'AY':'aɪ', 'EY':'eɪ', 'AW':'aʊ', 'OW':'oʊ', 'OY':'ɔɪ'}
arpa_map = arpa_map_nostress | arpa_map
In [37]:
[(w,pron) for (w,pron) in targetwords][-10:]
Out[37]:
[('youtube', ['Y', 'UW', 'T', 'UW', 'B']), ('zealous', ['Z', 'EH', 'L', 'AH', 'S']), ('zebra', ['Z', 'IY', 'B', 'R', 'AH']), ('zenith', ['Z', 'IY', 'N', 'IH', 'TH']), ('zippered', ['Z', 'IH', 'P', 'ER', 'D']), ('zipping', ['Z', 'IH', 'P', 'IH', 'NG']), ('zircon', ['Z', 'ER', 'K', 'AH', 'N']), ('zombie', ['Z', 'AA', 'M', 'B', 'IY']), ('zoning', ['Z', 'OW', 'N', 'IH', 'NG']), ('zygote', ['Z', 'AY', 'G', 'OW', 'T'])]
In [38]:
[(w,ipa_fy(pron)) for (w,pron) in targetwords][-10:]
Out[38]:
[('youtube', 'j u t u b'), ('zealous', 'z ɛ l ʌ/ə s'), ('zebra', 'z i b ɹ ʌ/ə'), ('zenith', 'z i n ɪ θ'), ('zippered', 'z ɪ p ɝ/ɚ d'), ('zipping', 'z ɪ p ɪ ŋ'), ('zircon', 'z ɝ/ɚ k ʌ/ə n'), ('zombie', 'z ɑ m b i'), ('zoning', 'z oʊ n ɪ ŋ'), ('zygote', 'z aɪ g oʊ t')]

Q3: Which sounds are most frequent? Which are the rarest?¶

In [39]:
prons_list = [pron for (w,pron) in targetwords]
prons_list[:5]
Out[39]:
[['AH', 'B', 'AE', 'SH', 'T'], ['AE', 'B', 'HH', 'AO', 'R'], ['AH', 'B', 'L', 'EY', 'T'], ['AH', 'B', 'L', 'EY', 'Z'], ['AH', 'B', 'L', 'UW', 'M']]
In [40]:
prons_flat = [p for pron in prons_list for p in pron]
prons_flat[:20]
Out[40]:
['AH', 'B', 'AE', 'SH', 'T', 'AE', 'B', 'HH', 'AO', 'R', 'AH', 'B', 'L', 'EY', 'T', 'AH', 'B', 'L', 'EY', 'Z']
In [41]:
prons_flat_ipa = [arpa_map[p] for p in prons_flat]
prons_flat_ipa[:20]
Out[41]:
['ʌ/ə', 'b', 'æ', 'ʃ', 't', 'æ', 'b', 'h', 'ɔ', 'ɹ', 'ʌ/ə', 'b', 'l', 'eɪ', 't', 'ʌ/ə', 'b', 'l', 'eɪ', 'z']
In [42]:
len(prons_flat_ipa)    # 5 * 4115 
Out[42]:
20575
In [43]:
import nltk
phone_fd = nltk.FreqDist(prons_flat_ipa)
In [44]:
len(phone_fd)
Out[44]:
39
In [45]:
phone_fd['h']
Out[45]:
173
In [46]:
phone_fd['ʌ/ə']
Out[46]:
1381
In [47]:
phone_fd.freq('ʌ/ə')
Out[47]:
0.06712029161603889
In [48]:
phone_fd.most_common()
Out[48]:
[('ʌ/ə', 1381), ('ɪ', 1304), ('l', 1263), ('t', 1247), ('s', 1161), ('ɹ', 1109), ('n', 1062), ('ɝ/ɚ', 1012), ('k', 991), ('i', 945), ('d', 880), ('p', 710), ('m', 582), ('ŋ', 574), ('b', 560), ('æ', 555), ('ɛ', 496), ('eɪ', 451), ('ɑ', 410), ('f', 385), ('oʊ', 368), ('aɪ', 349), ('g', 332), ('ɔ', 275), ('v', 263), ('w', 242), ('ʃ', 233), ('z', 230), ('u', 224), ('h', 173), ('dʒ', 171), ('tʃ', 165), ('aʊ', 144), ('θ', 89), ('ʊ', 68), ('j', 64), ('ɔɪ', 61), ('ð', 34), ('ʒ', 12)]

Q4: What's the most common initial sound? The final sound?¶

In [49]:
# only 1st sounds
phone1 = [arpa_map[p[0]] for p in prons_list]
phone1[:5], phone1[-5:]
Out[49]:
(['ʌ/ə', 'æ', 'ʌ/ə', 'ʌ/ə', 'ʌ/ə'], ['z', 'z', 'z', 'z', 'z'])
In [50]:
phone1_fd = nltk.FreqDist(phone1)
phone1_fd.most_common()
Out[50]:
[('s', 553), ('k', 352), ('b', 326), ('p', 277), ('ɹ', 243), ('f', 218), ('t', 201), ('d', 196), ('m', 183), ('ʌ/ə', 180), ('g', 166), ('l', 166), ('h', 139), ('w', 108), ('ɪ', 91), ('ʃ', 89), ('n', 84), ('æ', 72), ('ɛ', 69), ('v', 57), ('tʃ', 47), ('dʒ', 44), ('ɑ', 40), ('θ', 37), ('aʊ', 33), ('ɔ', 28), ('oʊ', 22), ('j', 19), ('aɪ', 17), ('eɪ', 16), ('ɝ/ɚ', 13), ('z', 10), ('i', 9), ('ð', 6), ('ɔɪ', 2), ('ʒ', 1), ('u', 1)]
In [51]:
phone2 = [arpa_map[p[1]] for p in prons_list]
phone2_fd = nltk.FreqDist(phone2)
phone2_fd.most_common()
Out[51]:
[('ɪ', 382), ('ɹ', 371), ('æ', 301), ('ʌ/ə', 290), ('ɛ', 251), ('ɑ', 235), ('l', 231), ('i', 226), ('eɪ', 194), ('n', 168), ('t', 161), ('ɝ/ɚ', 155), ('aɪ', 150), ('oʊ', 144), ('ɔ', 137), ('k', 122), ('p', 100), ('u', 78), ('w', 61), ('m', 58), ('ʊ', 47), ('s', 36), ('aʊ', 35), ('d', 28), ('ɔɪ', 26), ('b', 23), ('v', 21), ('g', 19), ('f', 17), ('j', 14), ('ŋ', 10), ('θ', 9), ('dʒ', 7), ('ʃ', 5), ('tʃ', 1), ('h', 1), ('z', 1)]
In [52]:
phone3 = [arpa_map[p[2]] for p in prons_list]
phone3_fd = nltk.FreqDist(phone3)
phone3_fd.most_common()
Out[52]:
[('ɹ', 331), ('l', 325), ('n', 315), ('t', 262), ('k', 240), ('s', 227), ('d', 194), ('m', 176), ('p', 146), ('ʌ/ə', 145), ('ɪ', 137), ('b', 133), ('æ', 130), ('v', 110), ('ɛ', 103), ('g', 100), ('i', 98), ('f', 86), ('z', 85), ('eɪ', 73), ('ɑ', 73), ('ŋ', 64), ('ɔ', 59), ('ʃ', 57), ('u', 57), ('w', 48), ('oʊ', 47), ('aɪ', 47), ('ɝ/ɚ', 45), ('tʃ', 41), ('dʒ', 39), ('h', 29), ('aʊ', 26), ('θ', 20), ('ð', 18), ('j', 9), ('ɔɪ', 9), ('ʊ', 6), ('ʒ', 5)]
In [53]:
phone4 = [arpa_map[p[3]] for p in prons_list]
phone4_fd = nltk.FreqDist(phone4)
phone4_fd.most_common()
Out[53]:
[('ɪ', 694), ('ʌ/ə', 683), ('ɝ/ɚ', 209), ('l', 205), ('t', 202), ('n', 184), ('k', 143), ('i', 137), ('s', 137), ('aɪ', 113), ('ɹ', 112), ('p', 109), ('eɪ', 108), ('d', 106), ('m', 96), ('oʊ', 73), ('ɛ', 72), ('b', 61), ('u', 59), ('ɑ', 57), ('æ', 52), ('tʃ', 49), ('ɔ', 47), ('aʊ', 46), ('v', 43), ('f', 40), ('g', 40), ('dʒ', 37), ('ŋ', 36), ('z', 33), ('ʃ', 27), ('w', 25), ('j', 22), ('ɔɪ', 16), ('ʊ', 14), ('ð', 10), ('θ', 10), ('h', 4), ('ʒ', 4)]
In [54]:
phone5 = [arpa_map[p[4]] for p in prons_list]
phone5_fd = nltk.FreqDist(phone5)
phone5_fd.most_common()
Out[54]:
[('ɝ/ɚ', 590), ('i', 475), ('ŋ', 464), ('t', 421), ('d', 356), ('l', 336), ('n', 311), ('s', 208), ('k', 134), ('z', 101), ('ʌ/ə', 83), ('oʊ', 82), ('p', 78), ('m', 69), ('eɪ', 60), ('ʃ', 55), ('ɹ', 52), ('dʒ', 44), ('v', 32), ('u', 29), ('tʃ', 27), ('f', 24), ('aɪ', 22), ('b', 17), ('θ', 13), ('ɔɪ', 8), ('g', 7), ('ɑ', 5), ('aʊ', 4), ('ɔ', 4), ('ʒ', 2), ('ʊ', 1), ('ɛ', 1)]

Q5: What's the most common CV pattern? Could it be... CVCVC? Perhaps VCCVC?¶

In [55]:
import re
def CV_fy(arpa_pron):
    cv_list = []
    for p in arpa_pron:
        if re.match(r'[AEIOU]', p): cv_list.append('V')
        else: cv_list.append('C')
    return ' '.join(cv_list)

CV_fy(['AH', 'B', 'AE', 'SH', 'T'])
Out[55]:
'V C V C C'
In [56]:
prons_list_cv = [CV_fy(pron) for pron in prons_list]
prons_list_cv[-5:]
Out[56]:
['C V C V C', 'C V C V C', 'C V C C V', 'C V C V C', 'C V C V C']
In [57]:
targetwords[-5:]
Out[57]:
[('zipping', ['Z', 'IH', 'P', 'IH', 'NG']), ('zircon', ['Z', 'ER', 'K', 'AH', 'N']), ('zombie', ['Z', 'AA', 'M', 'B', 'IY']), ('zoning', ['Z', 'OW', 'N', 'IH', 'NG']), ('zygote', ['Z', 'AY', 'G', 'OW', 'T'])]
In [58]:
cv_fd = nltk.FreqDist(prons_list_cv)
cv_fd.most_common()
Out[58]:
[('C V C V C', 1735), ('C V C C V', 670), ('C C V C V', 418), ('V C C V C', 349), ('C C V C C', 334), ('C V C V V', 111), ('V C V C C', 93), ('C C C V C', 73), ('V C V C V', 66), ('C V C C C', 57), ('C C V V C', 50), ('C V V C V', 40), ('V C C C V', 34), ('V C V V C', 23), ('C V V C C', 20), ('V C C V V', 16), ('V V C V C', 9), ('C V V V C', 7), ('C C V V V', 4), ('C C C V V', 3), ('V V C C V', 2), ('V C C C C', 1)]

Very curious! What is this lone VCCCC word? And VVCCV?

Before we can search/filter, let's update our targetwords DataFrame with IPA strings and CV patterns.

In [59]:
targetwords_df['ipa'] = targetwords_df.phonemes.map(ipa_fy)
targetwords_df.head()
Out[59]:
      word  variant            phonemes  stress         p           ipa
0  abashed        0  [AH, B, AE, SH, T]       2  0.941176   ʌ/ə b æ ʃ t
1    abhor        0  [AE, B, HH, AO, R]       3  0.885856     æ b h ɔ ɹ
2   ablate        0   [AH, B, L, EY, T]       3  0.550000  ʌ/ə b l eɪ t
3   ablaze        0   [AH, B, L, EY, Z]       3  0.983562  ʌ/ə b l eɪ z
4   abloom        0   [AH, B, L, UW, M]       3  0.705234   ʌ/ə b l u m
In [60]:
targetwords_df['CV'] = prons_list_cv
targetwords_df.head()
Out[60]:
      word  variant            phonemes  stress         p           ipa         CV
0  abashed        0  [AH, B, AE, SH, T]       2  0.941176   ʌ/ə b æ ʃ t  V C V C C
1    abhor        0  [AE, B, HH, AO, R]       3  0.885856     æ b h ɔ ɹ  V C C V C
2   ablate        0   [AH, B, L, EY, T]       3  0.550000  ʌ/ə b l eɪ t  V C C V C
3   ablaze        0   [AH, B, L, EY, Z]       3  0.983562  ʌ/ə b l eɪ z  V C C V C
4   abloom        0   [AH, B, L, UW, M]       3  0.705234   ʌ/ə b l u m  V C C V C
In [61]:
targetwords_df[targetwords_df.CV=='V C C C C']
Out[61]:
      word  variant           phonemes  stress         p        ipa         CV
123  angst        0  [AA, NG, K, S, T]       0  0.966667  ɑ ŋ k s t  V C C C C
In [62]:
prondict['angst']  # Go, Ben! You guessed it! 
Out[62]:
[['AA1', 'NG', 'K', 'S', 'T']]
In [63]:
targetwords_df[targetwords_df.CV=='V V C C V']
Out[63]:
         word  variant             phonemes  stress         p              ipa         CV
142     aorta        0   [EY, AO, R, T, AH]       1  0.929730     eɪ ɔ ɹ t ʌ/ə  V V C C V
184  arranger        0  [ER, EY, N, JH, ER]       1  0.850356  ɝ/ɚ eɪ n dʒ ɝ/ɚ  V V C C V
In [64]:
targetwords_df[targetwords_df.CV=='C C C V V']
Out[64]:
         word  variant           phonemes  stress         p           ipa         CV
3060   screwy        0  [S, K, R, UW, IY]       3  0.950249     s k ɹ u i  C C C V V
3236   skewer        0  [S, K, Y, UW, ER]       3  0.988067   s k j u ɝ/ɚ  C C C V V
3402  sprayer        0  [S, P, R, EY, ER]       3  0.964557  s p ɹ eɪ ɝ/ɚ  C C C V V

Q6: What is the BEST opening word? (How do we operationalize this?)¶

  • We could try to maximize "hits".

Take 1: unigram probability -- each word is a "bag of 5 sounds"

  • Basically, a word with 5 distinct sounds, each ranking very high in the overall frequency distribution (phone_fd)
  • The top 5 were: ('ʌ/ə', 1381), ('ɪ', 1304), ('l', 1263), ('t', 1247), ('s', 1161). Any word made out of these five would be it! 's ɪ t ʌ/ə l'? 't ʌ/ə l ɪ s'? 's t ɪ l ʌ/ə'? Let's hunt for candidates below.
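We can go hunting for such words directly. A minimal sketch (top5 below is simply the set of the five most frequent IPA symbols from phone_fd; it keeps target words whose entire sound inventory is a subset of those five):

top5 = {'ʌ/ə', 'ɪ', 'l', 't', 's'}
# target words drawing only on the top-5 sounds (repeats allowed)
[w for (w, pron) in targetwords if {arpa_map[p] for p in pron} <= top5]

Words like 'little' and 'subtle' show up, though note that they repeat phones.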
In [65]:
def get_bag_probs(pron):
    ipas = [arpa_map[p] for p in pron]
    probs = [phone_fd.freq(x) for x in ipas]
    return probs

get_bag_probs(['S', 'IY', 'T', 'IH', 'D'])    # seated
Out[65]:
[0.056427703523693806, 0.04592952612393682, 0.06060753341433779, 0.0633778857837181, 0.042770352369380316]
In [66]:
import math
math.prod(_)
Out[66]:
4.2578614537093817e-07
In [67]:
get_bag_probs(['D', 'IH', 'S', 'IY', 'T'])   # deceit, same 5 sounds
Out[67]:
[0.042770352369380316, 0.0633778857837181, 0.056427703523693806, 0.04592952612393682, 0.06060753341433779]
In [68]:
math.prod(_)   # same probability
Out[68]:
4.2578614537093817e-07
In [69]:
# Dan's pick! 
prondict['until'], ipa_fy(prondict['until'][0])
Out[69]:
([['AH0', 'N', 'T', 'IH1', 'L']], 'ʌ/ə n t ɪ l')
In [70]:
get_bag_probs(prondict['until'][0])
Out[70]:
[0.06712029161603889, 0.05161603888213852, 0.06060753341433779, 0.0633778857837181, 0.06138517618469016]
In [71]:
math.prod(_)   # very high! 
Out[71]:
8.168952510477823e-07
In [72]:
targetwords_df['bagprob'] = targetwords_df.phonemes.map(lambda x: math.prod(get_bag_probs(x)))
targetwords_df.head(5)
Out[72]:
      word  variant            phonemes  stress         p           ipa         CV       bagprob
0  abashed        0  [AH, B, AE, SH, T]       2  0.941176   ʌ/ə b æ ʃ t  V C V C C  3.382189e-08
1    abhor        0  [AE, B, HH, AO, R]       3  0.885856     æ b h ɔ ɹ  V C C V C  4.447256e-09
2   ablate        0   [AH, B, L, EY, T]       3  0.550000  ʌ/ə b l eɪ t  V C C V C  1.489803e-07
3   ablaze        0   [AH, B, L, EY, Z]       3  0.983562  ʌ/ə b l eɪ z  V C C V C  2.747832e-08
4   abloom        0   [AH, B, L, UW, M]       3  0.705234   ʌ/ə b l u m  V C C V C  3.453479e-08
In [73]:
targetwords_df.sort_values(by=['bagprob'], ascending=False)
Out[73]:
         word  variant              phonemes  stress         p            ipa         CV       bagprob
2009   little        0     [L, IH, T, AH, L]       1  0.997429    l ɪ t ʌ/ə l  C V C V C  9.715054e-07
3732   tittle        0     [T, IH, T, AH, L]       1  0.691932    t ɪ t ʌ/ə l  C V C V C  9.591981e-07
3539   subtle        0     [S, AH, T, AH, L]       1  0.990123  s ʌ/ə t ʌ/ə l  C V C V C  9.457801e-07
3824   tussle        0     [T, AH, S, AH, L]       1  0.963054  t ʌ/ə s ʌ/ə l  C V C V C  9.457801e-07
3841    ultra        0     [AH, L, T, R, AH]       0  0.987685  ʌ/ə l t ɹ ʌ/ə  V C C C V  9.034196e-07
...       ...      ...                   ...     ...       ...            ...        ...           ...
1425    fused        0      [F, Y, UW, Z, D]       2  0.982587      f j u z d  C C V C C  3.029703e-10
537    bureau        0     [B, Y, UH, R, OW]       2  0.990123     b j ʊ ɹ oʊ  C C V C V  2.697473e-10
1427   future        0    [F, Y, UW, CH, ER]       2  0.994845   f j u tʃ ɝ/ɚ  C C V C V  2.499505e-10
1717  hosiery        0  [HH, OW, ZH, ER, IY]       1  0.957346   h oʊ ʒ ɝ/ɚ i  C V C V V  1.981474e-10
473   boyhood        0    [B, OY, HH, UH, D]       1  0.976064     b ɔɪ h ʊ d  C V C V C  9.590833e-11

4115 rows × 8 columns

In [74]:
targetwords_df.sort_values(by=['bagprob'], ascending=False).head(10)
Out[74]:
        word  variant           phonemes  stress         p            ipa         CV       bagprob
2009  little        0  [L, IH, T, AH, L]       1  0.997429    l ɪ t ʌ/ə l  C V C V C  9.715054e-07
3732  tittle        0  [T, IH, T, AH, L]       1  0.691932    t ɪ t ʌ/ə l  C V C V C  9.591981e-07
3539  subtle        0  [S, AH, T, AH, L]       1  0.990123  s ʌ/ə t ʌ/ə l  C V C V C  9.457801e-07
3824  tussle        0  [T, AH, S, AH, L]       1  0.963054  t ʌ/ə s ʌ/ə l  C V C V C  9.457801e-07
3841   ultra        0  [AH, L, T, R, AH]       0  0.987685  ʌ/ə l t ɹ ʌ/ə  V C C C V  9.034196e-07
3812  tunnel        0  [T, AH, N, AH, L]       1  0.997416  t ʌ/ə n ʌ/ə l  C V C V C  8.651322e-07
2972  rustle        0  [R, AH, S, AH, L]       1  0.988858  ɹ ʌ/ə s ʌ/ə l  C V C V C  8.411148e-07
206   assist        0  [AH, S, IH, S, T]       2  0.994667    ʌ/ə s ɪ s t  V C V C C  8.209240e-07
3886   until        0  [AH, N, T, IH, L]       3  0.989691    ʌ/ə n t ɪ l  V C C V C  8.168953e-07
2337  occult        0  [AH, K, AH, L, T]       2  0.958025  ʌ/ə k ʌ/ə l t  V C V C C  8.072938e-07

Yep, 'until' is the top word without duplicate phones!
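We can double-check that with a filter. A quick sketch: keep only target words whose five phones map to five distinct IPA symbols, then re-sort by bag probability:

distinct = targetwords_df.phonemes.map(lambda ps: len({arpa_map[p] for p in ps}) == 5)
targetwords_df[distinct].sort_values(by=['bagprob'], ascending=False).head(3)   # 'until' comes out on top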

Take 2: take position into account (positional probability)

In [75]:
# 's' count and probability as the first phone. High! 
phone1_fd['s'], phone1_fd.freq('s')
Out[75]:
(553, 0.13438639125151883)
In [76]:
# 's' as the 2nd phone. Much less likely. 
phone2_fd['s'], phone2_fd.freq('s')
Out[76]:
(36, 0.008748481166464156)
In [77]:
def get_slot_probs(pron):
    ipas = [arpa_map[p] for p in pron]
    slot_probs = [phone1_fd.freq(ipas[0]), phone2_fd.freq(ipas[1]), phone3_fd.freq(ipas[2]), 
                  phone4_fd.freq(ipas[3]), phone5_fd.freq(ipas[4])]
    return slot_probs

get_slot_probs(['S', 'IY', 'T', 'IH', 'D'])
Out[77]:
[0.13438639125151883, 0.05492102065613609, 0.06366950182260024, 0.16865127582017012, 0.0865127582017011]
In [78]:
math.prod(_)
Out[78]:
6.856383994933627e-06
In [79]:
get_slot_probs(['D', 'IH', 'S', 'IY', 'T'])
Out[79]:
[0.04763061968408262, 0.0928311057108141, 0.0551640340218712, 0.03329283110571082, 0.10230862697448359]
In [80]:
math.prod(_)  # 'deceit' positional probability, much lower than 'seated'
Out[80]:
8.308043402913229e-07
In [81]:
targetwords_df['positprob'] = targetwords_df.phonemes.map(lambda x: math.prod(get_slot_probs(x)))
targetwords_df.tail()
Out[81]:
         word  variant            phonemes  stress         p            ipa         CV       bagprob     positprob
4110  zipping        0  [Z, IH, P, IH, NG]       1  0.988180      z ɪ p ɪ ŋ  C V C V C  4.322689e-08  1.522105e-07
4111   zircon        0   [Z, ER, K, AH, N]       1  0.677966  z ɝ/ɚ k ʌ/ə n  C V C V C  9.174892e-08  6.696916e-08
4112   zombie        0   [Z, AA, M, B, IY]       1  0.997669      z ɑ m b i  C V C C V  7.876899e-09  1.015675e-08
4113   zoning        0  [Z, OW, N, IH, NG]       1  0.958525     z oʊ n ɪ ŋ  C V C V C  1.824696e-08  1.237945e-07
4114   zygote        0   [Z, AY, G, OW, T]       1  0.884804    z aɪ g oʊ t  C V C V C  3.316702e-09  3.907032e-09
In [82]:
targetwords_df.sort_values(by=['positprob'], ascending=False)
Out[82]:
         word  variant             phonemes  stress         p            ipa         CV       bagprob     positprob
3077  searing        0   [S, IH, R, IH, NG]       1  0.976077      s ɪ ɹ ɪ ŋ  C V C V C  3.408251e-07  1.908292e-05
3228  sitting        0   [S, IH, T, IH, NG]       1  0.988479      s ɪ t ɪ ŋ  C V C V C  3.832362e-07  1.510491e-05
3595    synod        0    [S, IH, N, AH, D]       1  0.578554    s ɪ n ʌ/ə d  C V C V C  5.299214e-07  1.371262e-05
3098  selling        0   [S, EH, L, IH, NG]       1  0.997792      s ɛ l ɪ ŋ  C V C V C  1.476412e-07  1.231149e-05
2986    salad        0    [S, AE, L, AH, D]       1  1.000000    s æ l ʌ/ə d  C V C V C  2.682290e-07  1.114799e-05
...       ...      ...                  ...     ...       ...            ...        ...           ...           ...
2420   overdo        0   [OW, V, ER, D, UW]       0  0.948052   oʊ v ɝ/ɚ d u  V C V C V  5.236192e-09  5.416380e-11
2421  overdue        0   [OW, V, ER, D, UW]       0  1.000000   oʊ v ɝ/ɚ d u  V C V C V  5.236192e-09  5.416380e-11
97      aloha        0  [AH, L, OW, HH, AA]       2  0.932292   ʌ/ə l oʊ h ɑ  V C V C V  1.234740e-08  3.312555e-11
2419  overbuy        0   [OW, V, ER, B, AY]       0  0.803714  oʊ v ɝ/ɚ b aɪ  V C V C V  5.191565e-09  2.364601e-11
140    anyhow        0  [EH, N, IY, HH, AW]       0  0.963636     ɛ n i h aʊ  V C V C V  3.363159e-09  1.540477e-11

4115 rows × 9 columns

Excluding 'searing' and 'sitting', which have duplicate phones, the top 3 are 'synod', 'selling', and 'salad'.

  • I don't like 'synod'. First off, I don't even know the word! Secondly, I want to save 'ɪ' for a 2nd or 3rd guess word ending in 'ɪ ŋ'.
  • 's ɛ l ɪ ŋ' is also not ideal. Its bag probability is much lower than that of 'salad': the 'ɪ ŋ' ending boosts the positional probability, but 'ŋ' has much lower probability elsewhere. That means 'ɪ ŋ' will either produce an exact-slot hit (green) or be wasted, i.e., it won't offer up "somewhere in there" hits (yellow). This is why '-ing' words work better as the 2nd or even 3rd guess.
  • So, 'salad' it is! It's a fun word too.
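As a sanity check, the same distinct-phones filter idea from before, applied to positional probability (a sketch), reproduces this ranking:

distinct = targetwords_df.phonemes.map(lambda ps: len({arpa_map[p] for p in ps}) == 5)
targetwords_df[distinct].sort_values(by=['positprob'], ascending=False).head(3)   # -> 'synod', 'selling', 'salad'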

Q7: One opener isn't quite enough against 39 (!!) speech sounds. Can we come up with a set of three words?¶

In [83]:
# code that Na-Rae didn't transfer from Python shell session...
# Step 1: lock 'salad' as WORD1 
# Step 2: find WORD2 - no overlapping phones with 'salad', with highest positional probability
# Step 3: find WORD3 - no overlap with 'salad' and WORD2 
#   (... not strictly going by top probs, applied some favoritism...) 
# the three words I chose: 'salad', 'caring', and 'painter'.  
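A minimal sketch of what that greedy search could look like (pure greedy, no favoritism applied, so it won't necessarily land on the exact same three words):

def pick_openers(df, n=3):
    # Greedily take the highest-positprob word whose 5 phones are all
    # distinct and don't overlap with the phones already covered.
    covered, picks = set(), []
    for _, row in df.sort_values(by=['positprob'], ascending=False).iterrows():
        phones = {arpa_map[p] for p in row.phonemes}
        if len(phones) == 5 and not (phones & covered):
            picks.append(row.word)
            covered |= phones
        if len(picks) == n:
            break
    return picks

pick_openers(targetwords_df)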
In [84]:
targetwords_df[targetwords_df.word.isin({'salad', 'painter', 'caring'})]
Out[84]:
         word  variant            phonemes  stress         p           ipa         CV       bagprob  positprob
606    caring        0  [K, EH, R, IH, NG]       1  0.994490     k ɛ ɹ ɪ ŋ  C V C V C  1.106566e-07   0.000008
2440  painter        0   [P, EY, N, T, ER]       1  0.995098  p eɪ n t ɝ/ɚ  C V C C V  1.163877e-07   0.000002
2986    salad        0   [S, AE, L, AH, D]       1  1.000000   s æ l ʌ/ə d  C V C V C  2.682290e-07   0.000011

With these, my actual HEARDLE gameplay goes like this:

  • Start with 'salad'
  • Then I try 'painter'
  • If either word has produced a green ("exact position") hit in slot 4 or 5, 'caring' as the next word becomes pretty pointless, since I already know 'ɪ ŋ' will be a miss and 'ŋ' has low probability outside of slot 5.
  • At this point, I go for a different 3rd guess, such as 'marquee' [m ɑ ɹ k i].
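A quick check that 'marquee' really brings five fresh sounds relative to 'salad' and 'painter' (a sketch, assuming 'marquee' has a CMUdict entry):

opener12 = {arpa_map[p] for p in prondict['salad'][0] + prondict['painter'][0]}
marquee = {arpa_map[p] for p in prondict['marquee'][0]}   # assuming 'marquee' is in prondict
marquee & opener12   # empty set: no overlap with the first two openers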

Q8: So, the three openers. How much coverage do their 15 phonemes provide? What % of target words get zero hits? What % get all 5 phones covered?¶

In [85]:
prondict['salad'], prondict['painter'], prondict['caring']
Out[85]:
([['S', 'AE1', 'L', 'AH0', 'D']], [['P', 'EY1', 'N', 'T', 'ER0'], ['P', 'EY1', 'N', 'ER0']], [['K', 'EH1', 'R', 'IH0', 'NG']])
In [86]:
opener_phones = set(prondict['salad'][0] + prondict['painter'][0] + prondict['caring'][0])
opener_phones
Out[86]:
{'S', 'P', 'EH1', 'L', 'ER0', 'AE1', 'D', 'IH0', 'AH0', 'N', 'EY1', 'T', 'R', 'K', 'NG'}
In [87]:
opener_phones = {arpa_map[p] for p in opener_phones}
opener_phones
Out[87]:
{'t', 'k', 'p', 'ɪ', 'ʌ/ə', 'd', 'ŋ', 'ɝ/ɚ', 'eɪ', 'ɛ', 'l', 'n', 's', 'ɹ', 'æ'}
In [88]:
len(opener_phones)
Out[88]:
15
In [89]:
targetwords[:20]
Out[89]:
[('abashed', ['AH', 'B', 'AE', 'SH', 'T']), ('abhor', ['AE', 'B', 'HH', 'AO', 'R']), ('ablate', ['AH', 'B', 'L', 'EY', 'T']), ('ablaze', ['AH', 'B', 'L', 'EY', 'Z']), ('abloom', ['AH', 'B', 'L', 'UW', 'M']), ('aboard', ['AH', 'B', 'AO', 'R', 'D']), ('abort', ['AH', 'B', 'AO', 'R', 'T']), ('abound', ['AH', 'B', 'AW', 'N', 'D']), ('abrade', ['AE', 'B', 'R', 'EY', 'D']), ('abridge', ['AH', 'B', 'R', 'IH', 'JH']), ('abroad', ['AH', 'B', 'R', 'AO', 'D']), ('abscess', ['AE', 'B', 'S', 'EH', 'S']), ('absurd', ['AH', 'B', 'S', 'ER', 'D']), ('acacia', ['AH', 'K', 'EY', 'SH', 'AH']), ('accede', ['AE', 'K', 'S', 'IY', 'D']), ('access', ['AE', 'K', 'S', 'EH', 'S']), ('acclaim', ['AH', 'K', 'L', 'EY', 'M']), ('accord', ['AH', 'K', 'AO', 'R', 'D']), ('accost', ['AH', 'K', 'AO', 'S', 'T']), ('account', ['AH', 'K', 'AW', 'N', 'T'])]
In [90]:
targetwords_nohit = [(w,pron) for (w,pron) in targetwords if all([arpa_map[p] not in opener_phones for p in pron])]
len(targetwords_nohit)
Out[90]:
4
In [91]:
targetwords_nohit   #!!!
Out[91]:
[('beehive', ['B', 'IY', 'HH', 'AY', 'V']), ('mambo', ['M', 'AA', 'M', 'B', 'OW']), ('wiseguy', ['W', 'AY', 'Z', 'G', 'AY']), ('zombie', ['Z', 'AA', 'M', 'B', 'IY'])]
In [92]:
targetwords_hitcount = [(w,pron, len([p for p in pron if arpa_map[p] in opener_phones])) for (w,pron) in targetwords]
targetwords_hitcount[:5]
Out[92]:
[('abashed', ['AH', 'B', 'AE', 'SH', 'T'], 3), ('abhor', ['AE', 'B', 'HH', 'AO', 'R'], 2), ('ablate', ['AH', 'B', 'L', 'EY', 'T'], 4), ('ablaze', ['AH', 'B', 'L', 'EY', 'Z'], 3), ('abloom', ['AH', 'B', 'L', 'UW', 'M'], 2)]
In [93]:
nltk.FreqDist([count for (w,pron,count) in targetwords_hitcount])
Out[93]:
FreqDist({4: 1527, 3: 1398, 5: 543, 2: 536, 1: 107, 0: 4})
  • Only 4 (!!!) target words with zero hits. Out of 4115 words. Wow.
  • Over half of target words get 4-5 hits! Pretty good.
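To answer Q8's percentage question directly, a quick follow-up on the distribution above:

hit_fd = nltk.FreqDist([count for (w, pron, count) in targetwords_hitcount])
for hits in sorted(hit_fd):
    print(hits, 'hits:', f'{hit_fd.freq(hits):.1%}')   # 0 hits: ~0.1%; 4 or 5 hits: ~50% combined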