# For HW2: https://sites.pitt.edu/~naraehan/ling1330/hw2.html
# YOUR NAME
# DATE
#
# Template script: it builds token lists, frequency distributions and
# bigram data for two corpora (the Bible and three Austen novels), pickles
# the bigram conditional frequency distributions, and prints answers to
# questions [E1]-[E6]. Sections marked "EDIT THE CODE BELOW" / "YOUR CODE
# BELOW" still hold placeholder values.

# ------------------------------------------------------------------------
# PREPARATION
print("...Importing and reading corpus files...")
# ------------------------------------------------------------------------

# [A] Import all libraries here.
# EDIT THE CODE BELOW.
import pickle

# [B] Open the text files for the two corpora, read in the contents.
# EDIT THE CODE BELOW.
# The literals below are stand-ins for the real file contents.
b_txt = 'This is the bible text...'
a_etxt = "This is Austen's Emma..."
a_ptxt = "Austen's Persuasion..."
a_stxt = "Sense and Sensibility..."
a_txt = a_etxt + a_ptxt + a_stxt    # all three texts in one string!

# ------------------------------------------------------------------------
# BUILDING DATA OBJECTS
print("...Building data objects...")
# ------------------------------------------------------------------------

# [C1] Build two token lists, then lowercase them through list comprehension.
# EDIT THE CODE BELOW.
b_toks = []
a_toks = []

# [C2] Build word frequency distributions.
# EDIT THE CODE BELOW.
# NOTE: while these remain the placeholder string "foo", len() on them is 3,
# so the [E1] output below reports 3 "types".
b_tokfd = "foo"
a_tokfd = "foo"

# [C3] Build word bigrams. You should cast them as *lists*.
# EDIT THE CODE BELOW.
b_bigrams = []
a_bigrams = []

# [C4] Build bigram frequency distributions.
# EDIT THE CODE BELOW.
b_bigramfd = "foo"
a_bigramfd = "foo"

# [C5] Build conditional frequency distributions of the two bigram lists.
# EDIT THE CODE BELOW.
b_bigramcfd = "foo"
a_bigramcfd = "foo"

# ------------------------------------------------------------------------
# PICKLE DATA
print("...Pickling bigram cfd objects...")
# ------------------------------------------------------------------------

# [D] Pickle the two bigram conditional frequency distributions.
#     File names: bible_bigramcfd.pkl, austen_bigramcfd.pkl
# YOUR CODE BELOW.


# ------------------------------------------------------------------------
# EXPLORE AND ANSWER QUESTIONS
# TRY THINGS OUT IN SHELL FIRST!
print("...Now answering questions on the two corpora.")
# ------------------------------------------------------------------------

# [E1] Token and type counts, observation
# BELOW IS A COMPLETE ANSWER, TO GIVE YOU A SENSE OF OUTPUT FORMAT
print("[E1] token and type counts")    # question heading
print("The Bible has", len(b_toks), "tokens and", len(b_tokfd), "types.")
print("The Austen corpus has", len(a_toks), "tokens and", len(a_tokfd), "types.")
observ_e1 = """OBSERVATION: Even though the Austen corpus consists of 3 novels, the bible corpus is over twice in size."""
print(observ_e1)
print()    # Blank line for readability

# [E2] Top unigram frequency
# YOUR CODE BELOW.


# [E3] Top bigram frequency
# YOUR CODE BELOW.


# [E4] 'so' unigram frequency
# YOUR CODE BELOW.


# [E5] 'so'-initial bigrams and their frequency
# YOUR CODE BELOW.


# [E6] conditional frequency of words following 'so'
# YOUR CODE BELOW.