Intro to CS: Problem Set 3


Finished the problem set in about an hour. It was a good exercise. Here is my solution along with test case results:

My solution:

# Purpose: Check for similarity between two texts by comparing different kinds of word statistics.
import string
import math


### DO NOT MODIFY THIS FUNCTION
def load_file(filename):
    """
    Args:
        filename: string, name of file to read
    Returns:
        string, contains file contents
    """
    # print("Loading file %s" % filename)
    inFile = open(filename, 'r')
    line = inFile.read().strip()
    for char in string.punctuation:
        line = line.replace(char, "")
    inFile.close()
    return line.lower()


### Problem 0: Prep Data ###
def text_to_list(input_text):
    """
    Args:
        input_text: string representation of text from file.
                    assume the string is made of lowercase characters
    Returns:
        list representation of input_text, where each word is a different element in the list
    """
    return input_text.split()


### Problem 1: Get Frequency ###
def get_frequencies(input_iterable):
    """
    Args:
        input_iterable: a string or a list of strings, all are made of lowercase characters
    Returns:
        dictionary that maps string:int where each string
        is a letter or word in input_iterable and the corresponding int
        is the frequency of the letter or word in input_iterable
    Note: 
        You can assume that the only kinds of white space in the text documents we provide will be new lines or space(s) between words (i.e. there are no tabs)
    """
    output = {}
    for s in input_iterable:
        if s in output:
            output[s] += 1
        else:
            output[s] = 1
    return output


### Problem 2: Letter Frequencies ###
def get_letter_frequencies(word):
    """
    Args:
        word: word as a string
    Returns:
        dictionary that maps string:int where each string
        is a letter in word and the corresponding int
        is the frequency of the letter in word
    """
    return get_frequencies(word)


### Problem 3: Similarity ###
def calculate_similarity_score(freq_dict1, freq_dict2):
    """
    The keys of dict1 and dict2 are all lowercase,
    you will NOT need to worry about case sensitivity.

    Args:
        freq_dict1: frequency dictionary of letters of word1 or words of text1
        freq_dict2: frequency dictionary of letters of word2 or words of text2
    Returns:
        float, a number between 0 and 1, inclusive
        representing how similar the words/texts are to each other

        The difference in words/text frequencies = DIFF sums words
        from these three scenarios:
        * If an element occurs in dict1 and dict2 then
          get the difference in frequencies
        * If an element occurs only in dict1 then take the
          frequency from dict1
        * If an element occurs only in dict2 then take the
          frequency from dict2
         The total frequencies = ALL is calculated by summing
         all frequencies in both dict1 and dict2.
        Return 1-(DIFF/ALL) rounded to 2 decimal places
    """
    unique_words = []
    for key in freq_dict1:
        if key not in unique_words:
            unique_words.append(key)
    for key in freq_dict2:
        if key not in unique_words:
            unique_words.append(key)

    sum_freq_diffs = 0
    for u in unique_words:
        countL1 = 0
        countL2 = 0
        if u in freq_dict1:
            countL1 = freq_dict1[u]
        if u in freq_dict2:
            countL2 = freq_dict2[u]
        sum_freq_diffs += abs(countL1 - countL2)

    sum_freq_totals = sum(freq_dict1.values()) + sum(freq_dict2.values())

    similarity = 1 - (sum_freq_diffs / sum_freq_totals)

    return round(similarity, 2)



### Problem 4: Most Frequent Word(s) ###
def get_most_frequent_words(freq_dict1, freq_dict2):
    """
    The keys of dict1 and dict2 are all lowercase,
    you will NOT need to worry about case sensitivity.

    Args:
        freq_dict1: frequency dictionary for one text
        freq_dict2: frequency dictionary for another text
    Returns:
        list of the most frequent word(s) in the input dictionaries

    The most frequent word:
        * is based on the combined word frequencies across both dictionaries.
          If a word occurs in both dictionaries, consider the sum the
          freqencies as the combined word frequency.
        * need not be in both dictionaries, i.e it can be exclusively in
          dict1, dict2, or shared by dict1 and dict2.
    If multiple words are tied (i.e. share the same highest frequency),
    return an alphabetically ordered list of all these words.
    """
    combined_freqs = {}
    for word, freq in freq_dict1.items():
        if word not in combined_freqs:
            combined_freqs[word] = freq
        else:
            combined_freqs[word] += freq

    for word, freq in freq_dict2.items():
        if word not in combined_freqs:
            combined_freqs[word] = freq
        else:
            combined_freqs[word] += freq

    max_freq = max(combined_freqs.values())

    output = []
    for word, freq in combined_freqs.items():
        if freq == max_freq:
            output.append(word)

    output.sort()
    return output


### Problem 5: Finding TF-IDF ###
def get_tf(file_path):
    """
    Args:
        file_path: name of file in the form of a string
    Returns:
        a dictionary mapping each word to its TF

    * TF is calculatd as TF(i) = (number times word *i* appears
        in the document) / (total number of words in the document)
    * Think about how we can use get_frequencies from earlier
    """
    file_text = load_file(file_path)
    text_words = text_to_list(file_text)
    total_words = len(text_words)

    word_freqs = get_frequencies(text_words)

    tf = {}
    for word, freq in word_freqs.items():
        tf[word] = freq / total_words

    return tf


def get_idf(file_paths):
    """
    Args:
        file_paths: list of names of files, where each file name is a string
    Returns:
       a dictionary mapping each word to its IDF

    * IDF is calculated as IDF(i) = log_10(total number of documents / number of
    documents with word *i* in it), where log_10 is log base 10 and can be called
    with math.log10()

    """
    total_num_docs = len(file_paths)

    word_doc_freqs = {}

    for fp in file_paths:
        file_word_freqs = get_frequencies(text_to_list(load_file(fp)))
        for word in file_word_freqs:
            if word not in word_doc_freqs:
                word_doc_freqs[word] = 1
            else:
                word_doc_freqs[word] += 1

    idf = {}
    for word, doc_freq in word_doc_freqs.items():
        idf[word] = math.log10(total_num_docs / doc_freq)

    return idf

def get_tfidf(tf_file_path, idf_file_paths):
    """
    Args:
        tf_file_path: name of file in the form of a string (used to calculate TF)
        idf_file_paths: list of names of files, where each file name is a string
        (used to calculate IDF)
    Returns:
       a sorted list of tuples (in increasing TF-IDF score), where each tuple is
       of the form (word, TF-IDF). In case of words with the same TF-IDF, the
       words should be sorted in increasing alphabetical order.

    * TF-IDF(i) = TF(i) * IDF(i)
    """
    tf = get_tf(tf_file_path)
    idf = get_idf(idf_file_paths)

    text_words = text_to_list(load_file(tf_file_path))
    unique_words = []
    for tw in text_words:
        if tw not in unique_words:
            unique_words.append(tw)

    tfidf = []
    for word in unique_words:
        if word not in tfidf:
            tfidf.append((word, tf[word] * idf[word]))

    tfidf.sort(key=lambda x: (x[1], x[0]))
    return tfidf


if __name__ == "__main__":
    ###############################################################
    ## Uncomment the following lines to test your implementation ##
    ###############################################################

    # Tests Problem 0: Prep Data
    test_directory = "tests/student_tests/"
    hello_world, hello_friend = load_file(test_directory + 'hello_world.txt'), load_file(test_directory + 'hello_friends.txt')
    world, friend = text_to_list(hello_world), text_to_list(hello_friend)
    print(world)      # should print ['hello', 'world', 'hello']
    print(friend)     # should print ['hello', 'friends']

    # Tests Problem 1: Get Frequencies
    test_directory = "tests/student_tests/"
    hello_world, hello_friend = load_file(test_directory + 'hello_world.txt'), load_file(test_directory + 'hello_friends.txt')
    world, friend = text_to_list(hello_world), text_to_list(hello_friend)
    world_word_freq = get_frequencies(world)
    friend_word_freq = get_frequencies(friend)
    print(world_word_freq)    # should print {'hello': 2, 'world': 1}
    print(friend_word_freq)   # should print {'hello': 1, 'friends': 1}

    # Tests Problem 2: Get Letter Frequencies
    freq1 = get_letter_frequencies('hello')
    freq2 = get_letter_frequencies('that')
    print(freq1)      #  should print {'h': 1, 'e': 1, 'l': 2, 'o': 1}
    print(freq2)      #  should print {'t': 2, 'h': 1, 'a': 1}

    # Tests Problem 3: Similarity
    test_directory = "tests/student_tests/"
    hello_world, hello_friend = load_file(test_directory + 'hello_world.txt'), load_file(test_directory + 'hello_friends.txt')
    world, friend = text_to_list(hello_world), text_to_list(hello_friend)
    world_word_freq = get_frequencies(world)
    friend_word_freq = get_frequencies(friend)
    word1_freq = get_letter_frequencies('toes')
    word2_freq = get_letter_frequencies('that')
    word3_freq = get_frequencies('nah')
    word_similarity1 = calculate_similarity_score(word1_freq, word1_freq)
    word_similarity2 = calculate_similarity_score(word1_freq, word2_freq)
    word_similarity3 = calculate_similarity_score(word1_freq, word3_freq)
    word_similarity4 = calculate_similarity_score(world_word_freq, friend_word_freq)
    print(word_similarity1)       # should print 1.0
    print(word_similarity2)       # should print 0.25
    print(word_similarity3)       # should print 0.0
    print(word_similarity4)       # should print 0.4

    ## Tests Problem 4: Most Frequent Word(s)
    freq_dict1, freq_dict2 = {"hello": 5, "world": 1}, {"hello": 1, "world": 5}
    most_frequent = get_most_frequent_words(freq_dict1, freq_dict2)
    print(most_frequent)      # should print ["hello", "world"]

    # Tests Problem 5: Find TF-IDF
    tf_text_file = 'tests/student_tests/hello_world.txt'
    idf_text_files = ['tests/student_tests/hello_world.txt', 'tests/student_tests/hello_friends.txt']
    tf = get_tf(tf_text_file)
    idf = get_idf(idf_text_files)
    tf_idf = get_tfidf(tf_text_file, idf_text_files)
    print(tf)     # should print {'hello': 0.6666666666666666, 'world': 0.3333333333333333}
    print(idf)    # should print {'hello': 0.0, 'world': 0.3010299956639812, 'friends': 0.3010299956639812}
    print(tf_idf) # should print [('hello', 0.0), ('world', 0.10034333188799373)]

Test results:

test_prep_1a (__main__.TestPrepData) ... ok
test_prep_all_whitespace (__main__.TestPrepData) ... ok
test_prep_hello_world (__main__.TestPrepData) ... ok
test_frequency_1a (__main__.TestWordFrequency) ... ok
test_frequency_1b (__main__.TestWordFrequency) ... ok
test_frequency_hello_world (__main__.TestWordFrequency) ... ok
test_frequency_word_hello (__main__.TestWordFrequency) ... ok
test_letter_frequency_1 (__main__.TestLetterFrequency) ... ok
test_letter_frequency_4 (__main__.TestLetterFrequency) ... ok
test_words1 (__main__.TestGetFrequentWords) ... ok
test_words2 (__main__.TestGetFrequentWords) ... ok
test_similarity_letters_1 (__main__.TestSimilarity) ... ok
test_similarity_letters_2 (__main__.TestSimilarity) ... ok
test_similarity_words1 (__main__.TestSimilarity) ... ok
test_similarity_words2 (__main__.TestSimilarity) ... ok
test_similarity_words3 (__main__.TestSimilarity) ... ok
test_similarity_words4 (__main__.TestSimilarity) ... ok
test_idf1 (__main__.TestTFIDF) ... ok
test_idf2 (__main__.TestTFIDF) ... ok
test_tf1 (__main__.TestTFIDF) ... ok
test_tf2 (__main__.TestTFIDF) ... ok
test_tfidf_big (__main__.TestTFIDF) ... ok
test_tfidf_small (__main__.TestTFIDF) ... ok

----------------------------------------------------------------------
Ran 23 tests in 0.003s

OK


Problem Set 3 Unit Test Results:
All correct!
Points: 5/5