Lecture Notes on 29 Nov 2017

def filter_string (st):
  s = ' '
  for ch in st:
    if (ch >= 'a' and ch <= 'z'):
      s += ch
    else:
      s += ' '
  return s

def main():
  # open the book
  book = open ("./hard_times.txt", "r")

  # create an empty set for unique words
  word_set = set()

  # create a dictionary for word frequency
  word_dict = {}

  # track the total number of words
  total_words = 0

  for line in book:
    line = line.strip()
    line = line.lower()
    line = filter_string (line)
    
    # split the line into words
    word_list = line.split()
    
    # add each word to the set and the dictionary
    for word in word_list:
      word_set.add (word)
      total_words += 1

      # add words to the dictionary
      if word in word_dict:
        word_dict[word] = word_dict[word] + 1
      else:
        word_dict[word] = 1

  # close the file
  book.close()

  # print the total number of words used
  print ('Total words used = ', total_words)

  num_unique_words = len (word_set)
  print ('Number of unique words = ', num_unique_words)

  word_ratio = num_unique_words / total_words
  print ('Word ratio = ', word_ratio)

  '''
  # print the word frequencies
  all_words = list (word_dict.keys())
  all_words.sort()
  for word in all_words:
    print (word + " : " + str (word_dict[word]))
  '''

  # get the distribution according to frequency
  freq_dict = {}
  for word in word_dict:
    freq = word_dict[word]
    if freq in freq_dict:
      (freq_dict[freq]).append (word)
    else:
      new_list = []
      new_list.append (word)
      freq_dict[freq] = new_list

  # print according to frequency
  all_freq = list (freq_dict.keys())
  all_freq.sort()
  all_freq.reverse()

  for freq in all_freq:
    list_word = list (freq_dict[freq])
    list_word.sort()
    print (str(freq) + " : " + str (list_word))
main()