Intro to CS: Problem Set 3
I finished the problem set in about an hour. It was a good exercise. Here is my solution along with the test case results:
My solution:
# Purpose: Check for similarity between two texts by comparing different kinds of word statistics.
import string
import math
### DO NOT MODIFY THIS FUNCTION
def load_file(filename):
    """
    Read a text file, remove all punctuation, and lowercase the contents.

    Args:
        filename: string, name of file to read
    Returns:
        string, contains file contents stripped of surrounding whitespace
        and ASCII punctuation, converted to lowercase
    """
    # print("Loading file %s" % filename)
    inFile = open(filename, 'r')
    line = inFile.read().strip()
    # Delete every ASCII punctuation character so only words and
    # whitespace remain before the text is lowercased.
    for char in string.punctuation:
        line = line.replace(char, "")
    inFile.close()
    return line.lower()
### Problem 0: Prep Data ###
def text_to_list(input_text):
    """
    Break a text string into its individual words.

    Args:
        input_text: string representation of text from file.
            assume the string is made of lowercase characters
    Returns:
        list representation of input_text, where each word is a
        different element in the list
    """
    # str.split with no argument splits on any run of whitespace
    # (spaces and newlines) and drops empty strings.
    words = input_text.split()
    return words
### Problem 1: Get Frequency ###
def get_frequencies(input_iterable):
    """
    Count occurrences of each element of an iterable.

    Args:
        input_iterable: a string or a list of strings, all made of
            lowercase characters
    Returns:
        dictionary that maps string:int where each string is a letter
        or word in input_iterable and the corresponding int is its
        frequency in input_iterable
    Note:
        You can assume that the only kinds of white space in the text
        documents we provide will be new lines or space(s) between
        words (i.e. there are no tabs)
    """
    counts = {}
    for item in input_iterable:
        # dict.get supplies 0 the first time an element is seen.
        counts[item] = counts.get(item, 0) + 1
    return counts
### Problem 2: Letter Frequencies ###
def get_letter_frequencies(word):
    """
    Count how many times each letter occurs in a word.

    Args:
        word: word as a string
    Returns:
        dictionary that maps string:int where each string is a letter
        in word and the corresponding int is the frequency of that
        letter in word
    """
    letter_counts = {}
    for letter in word:
        if letter not in letter_counts:
            letter_counts[letter] = 0
        letter_counts[letter] += 1
    return letter_counts
### Problem 3: Similarity ###
def calculate_similarity_score(freq_dict1, freq_dict2):
    """
    Compute a similarity score between two frequency dictionaries.

    The keys of dict1 and dict2 are all lowercase,
    you will NOT need to worry about case sensitivity.

    Args:
        freq_dict1: frequency dictionary of letters of word1 or words of text1
        freq_dict2: frequency dictionary of letters of word2 or words of text2
    Returns:
        float, a number between 0 and 1, inclusive
        representing how similar the words/texts are to each other

        The difference in words/text frequencies = DIFF sums words
        from these three scenarios:
        * If an element occurs in dict1 and dict2 then
          get the difference in frequencies
        * If an element occurs only in dict1 then take the
          frequency from dict1
        * If an element occurs only in dict2 then take the
          frequency from dict2
        The total frequencies = ALL is calculated by summing
        all frequencies in both dict1 and dict2.
        Return 1-(DIFF/ALL) rounded to 2 decimal places
    """
    # Set union collects each key once; the original built this list with
    # repeated `not in list` scans, which is O(n^2) in the number of keys.
    all_keys = set(freq_dict1) | set(freq_dict2)
    # Missing keys contribute a frequency of 0, which covers all three
    # DIFF scenarios in one expression.
    diff = sum(abs(freq_dict1.get(key, 0) - freq_dict2.get(key, 0))
               for key in all_keys)
    total = sum(freq_dict1.values()) + sum(freq_dict2.values())
    if total == 0:
        # Two empty frequency dictionaries are identical; this also avoids
        # the ZeroDivisionError the original raised on empty inputs.
        return 1.0
    return round(1 - diff / total, 2)
### Problem 4: Most Frequent Word(s) ###
def get_most_frequent_words(freq_dict1, freq_dict2):
    """
    Find the word(s) with the highest combined frequency.

    The keys of dict1 and dict2 are all lowercase,
    you will NOT need to worry about case sensitivity.

    Args:
        freq_dict1: frequency dictionary for one text
        freq_dict2: frequency dictionary for another text
    Returns:
        list of the most frequent word(s) in the input dictionaries

        The most frequent word:
        * is based on the combined word frequencies across both
          dictionaries. If a word occurs in both dictionaries, consider
          the sum of the frequencies as the combined word frequency.
        * need not be in both dictionaries, i.e it can be exclusively in
          dict1, dict2, or shared by dict1 and dict2.
        If multiple words are tied (i.e. share the same highest frequency),
        return an alphabetically ordered list of all these words.
    """
    # Merge the two dictionaries, summing frequencies of shared words.
    combined = dict(freq_dict1)
    for word, freq in freq_dict2.items():
        combined[word] = combined.get(word, 0) + freq
    if not combined:
        # Both inputs empty: no words at all. The original raised a
        # ValueError here because max() got an empty sequence.
        return []
    highest = max(combined.values())
    # sorted() gives the alphabetical ordering required on ties.
    return sorted(word for word, freq in combined.items() if freq == highest)
### Problem 5: Finding TF-IDF ###
def get_tf(file_path):
    """
    Compute the term frequency of every word in one document.

    Args:
        file_path: name of file in the form of a string
    Returns:
        a dictionary mapping each word to its TF
        * TF is calculated as TF(i) = (number times word *i* appears
          in the document) / (total number of words in the document)
    """
    words = text_to_list(load_file(file_path))
    num_words = len(words)
    # Each word's count over the document length gives its TF.
    return {word: count / num_words
            for word, count in get_frequencies(words).items()}
def get_idf(file_paths):
    """
    Compute the inverse document frequency of every word in a corpus.

    Args:
        file_paths: list of names of files, where each file name is a string
    Returns:
        a dictionary mapping each word to its IDF
        * IDF is calculated as IDF(i) = log_10(total number of documents /
          number of documents with word *i* in it), where log_10 is log
          base 10 and can be called with math.log10()
    """
    num_docs = len(file_paths)
    doc_counts = {}
    for path in file_paths:
        # dict.fromkeys deduplicates while keeping first-occurrence order,
        # so each word counts once per document it appears in.
        for word in dict.fromkeys(text_to_list(load_file(path))):
            doc_counts[word] = doc_counts.get(word, 0) + 1
    return {word: math.log10(num_docs / count)
            for word, count in doc_counts.items()}
def get_tfidf(tf_file_path, idf_file_paths):
    """
    Combine TF and IDF scores for every word of one document.

    Args:
        tf_file_path: name of file in the form of a string (used to
            calculate TF)
        idf_file_paths: list of names of files, where each file name is a
            string (used to calculate IDF)
    Returns:
        a sorted list of tuples (in increasing TF-IDF score), where each
        tuple is of the form (word, TF-IDF). In case of words with the
        same TF-IDF, the words should be sorted in increasing
        alphabetical order.
        * TF-IDF(i) = TF(i) * IDF(i)
    """
    tf = get_tf(tf_file_path)
    idf = get_idf(idf_file_paths)
    # tf's keys are already the unique words of the document, so there is
    # no need to re-read the file and deduplicate a second time. (The
    # original also guarded with `if word not in tfidf`, which compared a
    # string against the (word, score) tuples in the list — always true,
    # i.e. a dead check.)
    tfidf = [(word, tf[word] * idf[word]) for word in tf]
    # Sort by increasing score, breaking ties alphabetically.
    tfidf.sort(key=lambda pair: (pair[1], pair[0]))
    return tfidf
if __name__ == "__main__":
    ###############################################################
    ## Uncomment the following lines to test your implementation ##
    ###############################################################
    # Tests Problem 0: Prep Data
    test_directory = "tests/student_tests/"
    hello_world, hello_friend = load_file(test_directory + 'hello_world.txt'), load_file(test_directory + 'hello_friends.txt')
    world, friend = text_to_list(hello_world), text_to_list(hello_friend)
    print(world)  # should print ['hello', 'world', 'hello']
    print(friend)  # should print ['hello', 'friends']

    # Tests Problem 1: Get Frequencies
    test_directory = "tests/student_tests/"
    hello_world, hello_friend = load_file(test_directory + 'hello_world.txt'), load_file(test_directory + 'hello_friends.txt')
    world, friend = text_to_list(hello_world), text_to_list(hello_friend)
    world_word_freq = get_frequencies(world)
    friend_word_freq = get_frequencies(friend)
    print(world_word_freq)  # should print {'hello': 2, 'world': 1}
    print(friend_word_freq)  # should print {'hello': 1, 'friends': 1}

    # Tests Problem 2: Get Letter Frequencies
    freq1 = get_letter_frequencies('hello')
    freq2 = get_letter_frequencies('that')
    print(freq1)  # should print {'h': 1, 'e': 1, 'l': 2, 'o': 1}
    print(freq2)  # should print {'t': 2, 'h': 1, 'a': 1}

    # Tests Problem 3: Similarity
    test_directory = "tests/student_tests/"
    hello_world, hello_friend = load_file(test_directory + 'hello_world.txt'), load_file(test_directory + 'hello_friends.txt')
    world, friend = text_to_list(hello_world), text_to_list(hello_friend)
    world_word_freq = get_frequencies(world)
    friend_word_freq = get_frequencies(friend)
    word1_freq = get_letter_frequencies('toes')
    word2_freq = get_letter_frequencies('that')
    # Note: a plain string is iterated per character by get_frequencies.
    word3_freq = get_frequencies('nah')
    word_similarity1 = calculate_similarity_score(word1_freq, word1_freq)
    word_similarity2 = calculate_similarity_score(word1_freq, word2_freq)
    word_similarity3 = calculate_similarity_score(word1_freq, word3_freq)
    word_similarity4 = calculate_similarity_score(world_word_freq, friend_word_freq)
    print(word_similarity1)  # should print 1.0
    print(word_similarity2)  # should print 0.25
    print(word_similarity3)  # should print 0.0
    print(word_similarity4)  # should print 0.4

    ## Tests Problem 4: Most Frequent Word(s)
    freq_dict1, freq_dict2 = {"hello": 5, "world": 1}, {"hello": 1, "world": 5}
    most_frequent = get_most_frequent_words(freq_dict1, freq_dict2)
    print(most_frequent)  # should print ["hello", "world"]

    # Tests Problem 5: Find TF-IDF
    tf_text_file = 'tests/student_tests/hello_world.txt'
    idf_text_files = ['tests/student_tests/hello_world.txt', 'tests/student_tests/hello_friends.txt']
    tf = get_tf(tf_text_file)
    idf = get_idf(idf_text_files)
    tf_idf = get_tfidf(tf_text_file, idf_text_files)
    print(tf)  # should print {'hello': 0.6666666666666666, 'world': 0.3333333333333333}
    print(idf)  # should print {'hello': 0.0, 'world': 0.3010299956639812, 'friends': 0.3010299956639812}
    print(tf_idf)  # should print [('hello', 0.0), ('world', 0.10034333188799373)]
Test results:
test_prep_1a (__main__.TestPrepData) ... ok
test_prep_all_whitespace (__main__.TestPrepData) ... ok
test_prep_hello_world (__main__.TestPrepData) ... ok
test_frequency_1a (__main__.TestWordFrequency) ... ok
test_frequency_1b (__main__.TestWordFrequency) ... ok
test_frequency_hello_world (__main__.TestWordFrequency) ... ok
test_frequency_word_hello (__main__.TestWordFrequency) ... ok
test_letter_frequency_1 (__main__.TestLetterFrequency) ... ok
test_letter_frequency_4 (__main__.TestLetterFrequency) ... ok
test_words1 (__main__.TestGetFrequentWords) ... ok
test_words2 (__main__.TestGetFrequentWords) ... ok
test_similarity_letters_1 (__main__.TestSimilarity) ... ok
test_similarity_letters_2 (__main__.TestSimilarity) ... ok
test_similarity_words1 (__main__.TestSimilarity) ... ok
test_similarity_words2 (__main__.TestSimilarity) ... ok
test_similarity_words3 (__main__.TestSimilarity) ... ok
test_similarity_words4 (__main__.TestSimilarity) ... ok
test_idf1 (__main__.TestTFIDF) ... ok
test_idf2 (__main__.TestTFIDF) ... ok
test_tf1 (__main__.TestTFIDF) ... ok
test_tf2 (__main__.TestTFIDF) ... ok
test_tfidf_big (__main__.TestTFIDF) ... ok
test_tfidf_small (__main__.TestTFIDF) ... ok
----------------------------------------------------------------------
Ran 23 tests in 0.003s
OK
Problem Set 3 Unit Test Results:
All correct!
Points: 5/5