# HW#4
# Comparative Analysis of Two Corpora:
# EFL Writing by Bulgarian and Japanese Students
# Part B: Process two learner corpora

import textproc, glob

# [STEP 1] Unpickle your Brown Vocabulary Rank as vrank. 
# YOUR CODE BELOW:


# [STEP 2a] Glob the file names for the two corpora.
# EDIT THE TWO LINES BELOW:
bu_files = []   # List of file names for Bulgarian essays (empty placeholder until edited)
ja_files = []   # List of file names for Japanese essays (empty placeholder until edited)


# [STEP 2b] Iterating through the file names, build the two tokenized corpora.  
# Token accumulators for the two corpora; each starts as its own empty list.
bu_toks, ja_toks = [], []
# YOUR CODE BELOW:



# [STEP 3] Calculate and print out the average sentence length for each.
# YOUR CODE BELOW: 



# [STEP 4a] notfoundrank is the rank you should assign to words that are not
# seen in the Brown Corpus, i.e., words not in vrank.
# CHANGE THE VALUE BASED ON YOUR ANSWER TO Q5.
notfoundrank = 100000000000000  # placeholder value; replace per your answer to Q5


# [STEP 4b] Transform the two token lists into lists of vocabulary ranks.
# YOUR CODE BELOW:
# Vocabulary-rank lists for the two corpora; each starts as its own empty list.
bu_ranks, ja_ranks = [], []


# [STEP 5] Calculate and print out the average vocabulary ranks for the
# two sets of essays.
# YOUR CODE BELOW: