diff --git a/sentiment.py b/sentiment.py index 96a9c9595fe6b2f807e6c3528bbc4ebfed2fced9..bbe24da2912a907c17c316a578c5a80e2564dee5 100644 --- a/sentiment.py +++ b/sentiment.py @@ -2,6 +2,8 @@ import math import os from enum import Enum from collections import Counter + + class MenuOption(Enum): SHOW_REVIEWS = 'Show reviews' CHECK_TOKEN = 'Check if a token is present' @@ -12,13 +14,14 @@ class MenuOption(Enum): SHOW_ADJUSTED_SENTENCE_STATISTICS = 'Show the statistics for a sentence with stop words ignored' EXIT = 'Exit the program' + def main(): recall = True menu_options = tuple(MenuOption) reviews = [] review_tokens_set = set() review_tokens_list = [] - positive_reviews_list,neutral_reviews_list, negative_reviews_list = [], [], [] + positive_reviews_list, neutral_reviews_list, negative_reviews_list = [], [], [] positive_reviews_set, neutral_reviews_set, negative_reviews_set = set(), set(), set() read_file(reviews) get_tokens(reviews, review_tokens_set, review_tokens_list, @@ -52,9 +55,12 @@ def main(): elif input_number == 6: save_stop_words(negative_reviews_list, neutral_reviews_list, positive_reviews_list) elif input_number == 7: - adjusted_sentence_statistics(negative_reviews_list, neutral_reviews_list, positive_reviews_list, stop_words) + adjusted_sentence_statistics(negative_reviews_list, neutral_reviews_list, positive_reviews_list, + stop_words) elif input_number == 8: exit() + + def read_file(reviews): file_path = os.path.join(os.path.dirname(__file__), "sentiment.txt") # Adjust if needed try: @@ -62,9 +68,11 @@ def read_file(reviews): for line in file: reviews.append(line.strip("\t\n")) except FileNotFoundError: - raise FileNotFoundError(f"Error - No file found at {file_path}. Ensure sentiment.txt is in the correct directory.") + raise FileNotFoundError( + f"Error - No file found at {file_path}. Ensure sentiment.txt is in the correct directory.") + -#gets list of stop words +# gets list of stop words def get_stop_words(): stop_words = set() try: @@ -75,7 +83,8 @@ def get_stop_words(): print("Warning: Stop words file not found. Using an empty set.") return stop_words -#finds the statistics for a series of inputs, ignoring stop words + +# finds the statistics for a series of inputs, ignoring stop words def adjusted_sentence_statistics(negative_reviews, neutral_reviews, positive_reviews, stop_words): input_tokens = str(input("Enter a sentence as space-seperated tokens: ")).lower().split() nonstop_tokens = [token for token in input_tokens if token not in stop_words] @@ -104,13 +113,16 @@ def adjusted_sentence_statistics(negative_reviews, neutral_reviews, positive_rev unknown_counter += 1 if valid_tokens > 0: avg_score = total_score / valid_tokens - print(f"\nThe sentence has {stop_word_count} stop-word token(s), and it has {total_negative_reviews} negative, {total_neutral_reviews} neutral, {total_positive_reviews} positive, and {unknown_counter}" + print( + f"\nThe sentence has {stop_word_count} stop-word token(s), and it has {total_negative_reviews} negative, {total_neutral_reviews} neutral, {total_positive_reviews} positive, and {unknown_counter}" f" unknown non-stop-word token(s).\nThe sentence has an average tf-idf score of {avg_score}.") else: - print(f"\nThe sentence contains only {stop_word_count} stop-word token(s) and {unknown_counter} unknown non-stop-word token(s).\n" - f"Therefore, its average tf-idf score is undefined.") + print( + f"\nThe sentence contains only {stop_word_count} stop-word token(s) and {unknown_counter} unknown non-stop-word token(s).\n" + f"Therefore, its average tf-idf score is undefined.") + -#saves stop words to a file +# saves stop words to a file def save_stop_words(negative_reviews_list, neutral_reviews_list, positive_reviews_list): positive_reviews = Counter(positive_reviews_list) neutral_reviews = Counter(neutral_reviews_list) @@ -138,7 +150,8 @@ def save_stop_words(negative_reviews_list, neutral_reviews_list, positive_review print(f'\nStop word list saved to "output.txt".') -#finds the statistics for a series of inputs + +# finds the statistics for a series of inputs def sentence_statistics(positive_reviews, neutral_reviews, negative_reviews): input_tokens = str(input("Enter a sentence as space-seperated tokens: ")).lower().split() positive_counter, neutral_counter, negative_counter, unknown_counter = 0, 0, 0, 0 @@ -146,8 +159,11 @@ def sentence_statistics(positive_reviews, neutral_reviews, negative_reviews): score_con = "" for token in input_tokens: if token in positive_reviews or token in neutral_reviews or token in negative_reviews: - negative_counter, neutral_counter, positive_counter, score, score_con = score_compute(token, 0, negative_reviews, 0, - neutral_reviews, 0, positive_reviews, score_con) + negative_counter, neutral_counter, positive_counter, score, score_con = score_compute(token, 0, + negative_reviews, 0, + neutral_reviews, 0, + positive_reviews, + score_con) if score_con == "positive": total_positive_reviews += 1 elif score_con == "neutral": @@ -163,27 +179,34 @@ def sentence_statistics(positive_reviews, neutral_reviews, negative_reviews): avg_score = total_score / valid_tokens if valid_tokens > 0 else 0 if unknown_counter != len(input_tokens): - print(f"\nThe sentence has {total_negative_reviews} negative, {total_neutral_reviews} neutral, {total_positive_reviews} positive, and {unknown_counter}" - f" unknown token(s).\nThe sentence has an average tf-idf score of {avg_score}.") + print( + f"\nThe sentence has {total_negative_reviews} negative, {total_neutral_reviews} neutral, {total_positive_reviews} positive, and {unknown_counter}" + f" unknown token(s).\nThe sentence has an average tf-idf score of {avg_score}.") else: print("\nThe sentence contains only unknown tokens; therefore, its average tf-idf score is undefined.") -#finds the statistics of a token + +# finds the statistics of a token def token_statistics(positive_reviews, neutral_reviews, negative_reviews, review_tokens_set): input_token = str(input("Enter a token: ")).lower() - positive_counter,neutral_counter,negative_counter,score = 0,0,0,0 + positive_counter, neutral_counter, negative_counter, score = 0, 0, 0, 0 score_con = "" negative_counter, neutral_counter, positive_counter, score, score_con = score_compute(input_token, negative_counter, - negative_reviews, neutral_counter, neutral_reviews, - positive_counter, positive_reviews, score_con) + negative_reviews, + neutral_counter, + neutral_reviews, + positive_counter, + positive_reviews, score_con) if input_token in review_tokens_set: - print(f"The token {input_token} has {negative_counter} negative, {neutral_counter} neutral, and {positive_counter} " - f"positive appearance(s) in the training data.\n" - f"The token {input_token} is classified as {score_con} because it has a tf-idf score of {score}.") + print( + f"The token {input_token} has {negative_counter} negative, {neutral_counter} neutral, and {positive_counter} " + f"positive appearance(s) in the training data.\n" + f"The token {input_token} is classified as {score_con} because it has a tf-idf score of {score}.") else: print(f"\nThe token \"{input_token}\" does not appear in the training data.") -#checks the score of the token + +# checks the score of the token def score_compute(input_token, negative_counter, negative_reviews, neutral_counter, neutral_reviews, positive_counter, positive_reviews, score_con): for token in positive_reviews: @@ -205,7 +228,8 @@ def score_compute(input_token, negative_counter, negative_reviews, neutral_count score_con = "positive" return negative_counter, neutral_counter, positive_counter, score, score_con -#checks the amount of times a token appears in the test data + +# checks the amount of times a token appears in the test data def word_frequencies(review_tokens_list): token_amount = 0 input_token = str(input("Enter a token: ")).lower() @@ -214,15 +238,19 @@ def word_frequencies(review_tokens_list): token_amount += 1 print(f"\nThe training data contains {token_amount} appearance(s) of the token \"{input_token}\".") -#checks if a token is present in the test data + +# checks if a token is present in the test data def check_token(review_token_set): input_token = str(input("Enter a token: ")).lower() if input_token in review_token_set: - print(f"\nThe token \"{input_token}\" is 1 of the " + str(len(review_token_set)) + " unique tokens in the training data") + print(f"\nThe token \"{input_token}\" is 1 of the " + str( + len(review_token_set)) + " unique tokens in the training data") else: - print(f"\nThe token \"{input_token}\" is not 1 of the " + str(len(review_token_set)) + " unique tokens in the training data") + print(f"\nThe token \"{input_token}\" is not 1 of the " + str( + len(review_token_set)) + " unique tokens in the training data") -#prints requested reviews by user + +# prints requested reviews by user def show_reviews(reviews): start_recall, end_recall = True, True start_number, end_number, review_number = 0, 0, 0 @@ -239,23 +267,24 @@ def show_reviews(reviews): start_recall = False while end_recall: try: - end_number = int(input("Enter an end review number from " + str(start_number+1) + " to 8529: ")) + end_number = int(input("Enter an end review number from " + str(start_number + 1) + " to 8529: ")) except ValueError: - print("\nInvalid input - please input number between " + str(start_number+1) + " and 8529\n") + print("\nInvalid input - please input number between " + str(start_number + 1) + " and 8529\n") else: if end_number <= start_number or end_number > 8529: - print("\nInvalid input - please input number between " + str(start_number+1) + " and 8529\n") - else : + print("\nInvalid input - please input number between " + str(start_number + 1) + " and 8529\n") + else: end_recall = False i = start_number while i <= end_number: - requested_reviews.append(reviews[i-1]) - print(f"Review # {i} : " + str(reviews[i-1])) + requested_reviews.append(reviews[i - 1]) + print(f"Review # {i} : " + str(reviews[i - 1])) i += 1 -#creates starting lists + +# creates starting lists def get_tokens(reviews, review_tokens_set, review_tokens_list, - positive_reviews,neutral_reviews,negative_reviews, + positive_reviews, neutral_reviews, negative_reviews, positive_reviews_set, neutral_reviews_set, negative_reviews_set): for review in reviews: token = review[2:].strip().split() @@ -271,7 +300,8 @@ def get_tokens(reviews, review_tokens_set, review_tokens_list, negative_reviews += token negative_reviews_set.update(token) -#reads the txt file and stores the lines in list + +# reads the txt file and stores the lines in list def read_file(reviews): try: with open("sentiment.txt", "r") as file: @@ -281,5 +311,6 @@ def read_file(reviews): print("Error - No file found") print("\n".join(reviews)) + if __name__ == '__main__': - main() \ No newline at end of file + main()