# For HW2: https://sites.pitt.edu/~naraehan/ling1330/hw2.html
# YOUR NAME
# DATE

# ------------------------------------------------------------------------
#                                                              PREPARATION
print("...Importing and reading corpus files...")
# ------------------------------------------------------------------------

# [A] Every library import for this script belongs in this spot.
# EDIT THE CODE BELOW.
import pickle

# [B] Open the text files of both corpora and read their contents in.
# The string literals below are placeholders waiting to be replaced.
# EDIT THE CODE BELOW.

b_txt = "This is the bible text..."

a_etxt = "This is Austen's Emma..."
a_ptxt = "Austen's Persuasion..."
a_stxt = "Sense and Sensibility..."
# The three Austen texts are glued end to end (no separator) into one string.
a_txt = "".join((a_etxt, a_ptxt, a_stxt))

# ------------------------------------------------------------------------
#                                                    BUILDING DATA OBJECTS
print("...Building data objects...")
# ------------------------------------------------------------------------

# [C1] Produce one token list per corpus, then lowercase every token
#      with a list comprehension.
# EDIT THE CODE BELOW.
b_toks, a_toks = [], []     # two separate empty lists, not one shared list

# [C2] Word frequency distribution for each corpus.
# EDIT THE CODE BELOW.
b_tokfd = a_tokfd = "foo"   # placeholder value

# [C3] Word bigrams for each corpus. Cast the results to *lists*.
# EDIT THE CODE BELOW.
b_bigrams, a_bigrams = [], []     # again two distinct list objects

# [C4] Bigram frequency distribution for each corpus.
# EDIT THE CODE BELOW.
b_bigramfd = a_bigramfd = "foo"   # placeholder value

# [C5] Conditional frequency distribution over each of the two bigram lists.
# EDIT THE CODE BELOW.
b_bigramcfd = a_bigramcfd = "foo"   # placeholder value


# ------------------------------------------------------------------------
#                                                              PICKLE DATA
print("...Pickling bigram cfd objects...")
# ------------------------------------------------------------------------

# [D] Pickle the two bigram conditional frequency distributions.
# File names: bible_bigramcfd.pkl, austen_bigramcfd.pkl
# NOTE: as written, this section only prints the progress message above;
# no file is created until code is added below.
# NOTE(review): presumably the objects to save are b_bigramcfd and
# a_bigramcfd from step [C5] -- confirm. Pickle output files must be
# opened in binary mode (e.g. "wb").
# YOUR CODE BELOW. 


# ------------------------------------------------------------------------
#                                             EXPLORE AND ANSWER QUESTIONS
#                                           TRY THINGS OUT IN SHELL FIRST!
print("...Now answering questions on the two corpora.")
# ------------------------------------------------------------------------

# [E1] Token and type counts, observation
# THIS ANSWER IS ALREADY COMPLETE; USE IT AS A MODEL FOR THE OUTPUT FORMAT
print("[E1] token and type counts")    # question heading
# Token count = length of the token list; type count = length of the word fd.
print(f"The Bible has {len(b_toks)} tokens and {len(b_tokfd)} types.")
print(f"The Austen corpus has {len(a_toks)} tokens and {len(a_tokfd)} types.")

# Two-line observation; the second line starts with a single space.
observ_e1 = (
    "OBSERVATION: Even though the Austen corpus consists of 3 novels, \n"
    " the bible corpus is over twice in size."
)
print(observ_e1, end="\n\n")    # extra newline = blank line for readability

# [E2] Top unigram frequency
# YOUR CODE BELOW. 


# [E3] Top bigram frequency 
# YOUR CODE BELOW. 


# [E4] 'so' unigram frequency
# YOUR CODE BELOW. 


# [E5] 'so'-initial bigrams and their frequency 
# YOUR CODE BELOW. 


# [E6] conditional frequency of words following 'so'
# YOUR CODE BELOW.