# Lecture Notes on 29 Nov 2017
"""Word statistics for a plain-text book.

Reads a text file, counts how often each lowercase a-z word occurs, and
prints the total word count, the number of unique words, their ratio, and
the words grouped by frequency (most frequent first).
"""

from collections import Counter, defaultdict


def filter_string(st):
    """Return *st* with every character outside 'a'..'z' replaced by a space.

    The result always begins with one extra space (kept from the original
    implementation), so it is one character longer than *st*. Uppercase
    letters are blanked out like punctuation; main() lowercases each line
    before calling this function.
    """
    # Build the string with a single join rather than += per character.
    return ' ' + ''.join(ch if 'a' <= ch <= 'z' else ' ' for ch in st)


def _count_words(lines):
    """Return a Counter mapping each word found in *lines* to its count."""
    counts = Counter()
    for line in lines:
        cleaned = filter_string(line.strip().lower())
        counts.update(cleaned.split())
    return counts


def _group_by_frequency(word_counts):
    """Return a dict mapping each count to the sorted list of words with it."""
    groups = defaultdict(list)
    for word, freq in word_counts.items():
        groups[freq].append(word)
    return {freq: sorted(words) for freq, words in groups.items()}


def _print_word_counts(word_counts):
    """Print every word with its count, one per line, in alphabetical order."""
    for word in sorted(word_counts):
        print(word + " : " + str(word_counts[word]))


def main(path="./hard_times.txt", *, show_word_counts=False):
    """Print word statistics for the text file at *path*.

    Args:
        path: File to analyse. Defaults to "./hard_times.txt".
        show_word_counts: When true, also print the alphabetical
            "word : count" listing (disabled by default).

    Raises:
        OSError: If *path* cannot be opened.
    """
    # The context manager closes the file even if processing fails.
    # Undecodable bytes become U+FFFD, which filter_string turns into a
    # space, so a stray non-UTF-8 byte cannot abort the run.
    with open(path, "r", encoding="utf-8", errors="replace") as book:
        word_counts = _count_words(book)

    total_words = sum(word_counts.values())
    num_unique_words = len(word_counts)

    print('Total words used = ', total_words)
    print('Number of unique words = ', num_unique_words)

    # An empty (or letter-free) book has no words; avoid dividing by zero.
    word_ratio = num_unique_words / total_words if total_words else 0.0
    print('Word ratio = ', word_ratio)

    if show_word_counts:
        _print_word_counts(word_counts)

    # Print the distribution, highest frequency first.
    freq_groups = _group_by_frequency(word_counts)
    for freq in sorted(freq_groups, reverse=True):
        print(str(freq) + " : " + str(freq_groups[freq]))


if __name__ == "__main__":
    main()