diff --git a/.vscode/.ropeproject/config.py b/.vscode/.ropeproject/config.py new file mode 100644 index 0000000000000000000000000000000000000000..dee2d1ae9a6be9cf0248130b6c6b9e2668052079 --- /dev/null +++ b/.vscode/.ropeproject/config.py @@ -0,0 +1,114 @@ +# The default ``config.py`` +# flake8: noqa + + +def set_prefs(prefs): + """This function is called before opening the project""" + + # Specify which files and folders to ignore in the project. + # Changes to ignored resources are not added to the history and + # VCSs. Also they are not returned in `Project.get_files()`. + # Note that ``?`` and ``*`` match all characters but slashes. + # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc' + # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc' + # '.svn': matches 'pkg/.svn' and all of its children + # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o' + # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o' + prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject', + '.hg', '.svn', '_svn', '.git', '.tox'] + + # Specifies which files should be considered python files. It is + # useful when you have scripts inside your project. Only files + # ending with ``.py`` are considered to be python files by + # default. + # prefs['python_files'] = ['*.py'] + + # Custom source folders: By default rope searches the project + # for finding source folders (folders that should be searched + # for finding modules). You can add paths to that list. Note + # that rope guesses project source folders correctly most of the + # time; use this if you have any problems. + # The folders should be relative to project root and use '/' for + # separating folders regardless of the platform rope is running on. + # 'src/my_source_folder' for instance. + # prefs.add('source_folders', 'src') + + # You can extend python path for looking up modules + # prefs.add('python_path', '~/python/') + + # Should rope save object information or not. + prefs['save_objectdb'] = True + prefs['compress_objectdb'] = False + + # If `True`, rope analyzes each module when it is being saved. + prefs['automatic_soa'] = True + # The depth of calls to follow in static object analysis + prefs['soa_followed_calls'] = 0 + + # If `False` when running modules or unit tests "dynamic object + # analysis" is turned off. This makes them much faster. + prefs['perform_doa'] = True + + # Rope can check the validity of its object DB when running. + prefs['validate_objectdb'] = True + + # How many undos to hold? + prefs['max_history_items'] = 32 + + # Shows whether to save history across sessions. + prefs['save_history'] = True + prefs['compress_history'] = False + + # Set the number spaces used for indenting. According to + # :PEP:`8`, it is best to use 4 spaces. Since most of rope's + # unit-tests use 4 spaces it is more reliable, too. + prefs['indent_size'] = 4 + + # Builtin and c-extension modules that are allowed to be imported + # and inspected by rope. + prefs['extension_modules'] = [] + + # Add all standard c-extensions to extension_modules list. + prefs['import_dynload_stdmods'] = True + + # If `True` modules with syntax errors are considered to be empty. + # The default value is `False`; When `False` syntax errors raise + # `rope.base.exceptions.ModuleSyntaxError` exception. + prefs['ignore_syntax_errors'] = False + + # If `True`, rope ignores unresolvable imports. Otherwise, they + # appear in the importing namespace. + prefs['ignore_bad_imports'] = False + + # If `True`, rope will insert new module imports as + # `from <package> import <module>` by default. + prefs['prefer_module_from_imports'] = False + + # If `True`, rope will transform a comma list of imports into + # multiple separate import statements when organizing + # imports. + prefs['split_imports'] = False + + # If `True`, rope will remove all top-level import statements and + # reinsert them at the top of the module when making changes. + prefs['pull_imports_to_top'] = True + + # If `True`, rope will sort imports alphabetically by module name instead + # of alphabetically by import statement, with from imports after normal + # imports. + prefs['sort_imports_alphabetically'] = False + + # Location of implementation of + # rope.base.oi.type_hinting.interfaces.ITypeHintingFactory In general + # case, you don't have to change this value, unless you're an rope expert. + # Change this value to inject you own implementations of interfaces + # listed in module rope.base.oi.type_hinting.providers.interfaces + # For example, you can add you own providers for Django Models, or disable + # the search type-hinting in a class hierarchy, etc. + prefs['type_hinting_factory'] = ( + 'rope.base.oi.type_hinting.factory.default_type_hinting_factory') + + +def project_opened(project): + """This function is called after opening the project""" + # Do whatever you like here! diff --git a/sentiment.py b/sentiment.py index 48a568b8880e30edcd046daad549f3561990af1b..a301e023c62fca68d018dd03c96c9c3edfc6ed5a 100644 --- a/sentiment.py +++ b/sentiment.py @@ -1,5 +1,7 @@ from enum import Enum, auto +from collections import Counter import math +import unittest class MenuOption(Enum): @@ -32,16 +34,12 @@ class Review(): return Sentiment.NEGATIVE def get_tokens(self): - return self.review_text.strip().lower().split()[1:] + return self.review_text.lower().strip('.').split()[1:] def token_frequency(self, token): return self.get_tokens().count(token) -def main(): - menu_input() - - def get_data(): try: with open("sentiment.txt") as file: @@ -50,6 +48,11 @@ def get_data(): print("File cannot be found") +class GetDataTest(unittest.TestCase): + def test(self): + self.assertEqual(len(get_data()), 16444) + + def get_reviews(data): reviews = [] for review in data: @@ -58,49 +61,32 @@ def get_reviews(data): return reviews -def get_tokens(data): - return "".join(data).strip().split() +def get_all_tokens(data): + return "".join(data).replace(".", "").strip().split() -def check_token(data): - tokens = frozenset(get_tokens(data)) +def check_token(data, token): + tokens = frozenset(get_all_tokens(data)) - token = input("Enter a token: ").lower() if token in tokens: - print(f"The token \"{token}\" appears in the training data.") + return True else: - print(f"The token \"{token}\" does not appear in the training data.") - - -def check_document_frequency(data): - tokens = get_tokens(data) - print(len(tokens)) - token = input("Enter a token: ").lower() - print( - f"The token \"{token}\" appears {tokens.count(token)} out of {len(tokens)} time(s) in the training data.") + return False -def show_token_stats(data): - print(f"Review num: {len(data)}") - reviews = get_reviews(data) - tokens = get_tokens(data) - token = input("Enter a token: ").lower() +def calculate_tf_idf(reviews, token): + # positive_count, positive_token_total, maxp, negative_count, negative_token_total, maxn positive_reviews = [] positive_tokens = [] - positive_num = 0 - highest_positive_num = 0 + positive_count = 0 negative_reviews = [] negative_tokens = [] - negative_num = 0 - highest_negative_num = 0 + negative_count = 0 neutral_reviews = [] - neutral_num = 0 - all_num = tokens.count(token) - # maxp = 0 - + neutral_count = 0 -# Separating Positive and Negative reviews and tokens into lists + # Separating Positive and Negative reviews and tokens into lists for review in reviews: if(review.get_sentiment_rating() == Sentiment.POSITIVE): positive_reviews.append(review) @@ -113,41 +99,150 @@ def show_token_stats(data): elif(review.get_sentiment_rating() == Sentiment.NEUTRAL): neutral_reviews.append(review) - -# Getting total number of Positive and Negative tokens + # Getting total number of Positive and Negative tokens for review in negative_reviews: - negative_num += review.get_tokens().count(token) + negative_count += review.get_tokens().count(token) for review in neutral_reviews: - neutral_num += review.get_tokens().count(token) + neutral_count += review.get_tokens().count(token) for review in positive_reviews: - positive_num += review.get_tokens().count(token) + positive_count += review.get_tokens().count(token) + + maxp = Counter(negative_tokens).most_common(1)[0][1] + maxn = Counter(positive_tokens).most_common(1)[0][1] + + return (((10 + 10 * (positive_count/maxp)) * math.log(len(positive_tokens)/maxp)) - + ((10 + 10 * (negative_count/maxn)) * math.log(len(negative_tokens)/maxn))), negative_count, positive_count, neutral_count + +# Option: 1 + + +def show_reviews(data): + review_objects = get_reviews(data) + start_selection = "1" + end_selection = "" + + while True: + + start_selection = int( + input(f"Enter a review number from {start_selection} to {len(review_objects)}: ")) + + if start_selection > len(review_objects) or start_selection <= 0 or start_selection == None: + + print("Please enter a valid, in-range number.") + start_selection = "1" + continue + + else: + while True: + + end_selection = int( + input(f"Enter a review number from {start_selection} to {len(review_objects)}: ")) + + if end_selection > len(review_objects) or end_selection < start_selection or end_selection == None: + print("Please enter a valid, in-range number.") + continue + else: -# Get the greatest number of times that any token appears in positive comments - # for test_token in frozenset(get_tokens(data)): - # for review in positive_reviews: - # if review.get_tokens().count(test_token) > highest_positive_num: - # highest_positive_num = review.get_tokens().count(test_token) + for review_num in range(start_selection-1, end_selection): -# Get the greatest number of times that any token appears in negative comments - # for test_token in frozenset(get_tokens(data)): - # for review in negative_reviews: - # if review.get_tokens().count(test_token) > highest_negative_num: - # highest_negative_num = review.get_tokens().count(test_token) + print( + f'Review #{review_num + 1}: {review_objects[review_num].review_text}') - # print(positive_num) - # print(highest_positive_num) - # print(negative_num) - # print(highest_negative_num) + break + break - score = (((10 + 10 * (positive_num/highest_positive_num)) * - math.log(len(positive_tokens)/highest_positive_num)) - ((10 + 10 * (negative_num/highest_negative_num)) * math.log(len(negative_tokens)/highest_negative_num))) - print(f"The token \"{token}\" has {negative_num} negative, {neutral_num} neutral, and {positive_num} positive appearance(s) in the training data.") +# Option: 3 +def check_document_frequency(data): + tokens = get_all_tokens(data) + print(len(tokens)) + token = input("Enter a token: ").lower() print( - f"The token \"{token}\" has a differential tf-idf score of {score} and is classified as ...") + f"The token \"{token}\" appears {tokens.count(token)} out of {len(tokens)} time(s) in the training data.") + + +class TestCheckDocumentFrequency(self): + def test(self): + self.assertEqual(check_document_frequency(self), ) + +# Option: 4 + + +def show_token_stats(data): + reviews = get_reviews(data) + input_token = input("Enter a token: ").lower() + + if check_token(data, input_token): + score, negative_count, positive_count, neutral_count = calculate_tf_idf( + reviews, input_token) + + print( + f"The token \"{input_token}\" has {negative_count} negative, {neutral_count} neutral, and {positive_count} positive appearance(s) in the training data.") + + if score > 0: + print( + f"The token \"{input_token}\" has a differential tf-idf score of {score} and is classified as positive") + else: + print( + f"The token \"{input_token}\" has a differential tf-idf score of {score} and is classified as negative") + else: + print( + f"The token \"{input_token}\" does not appear in the training data.") + + +# Option: 5 +def show_sentence_stats(data): + reviews = get_reviews(data) + + input_tokens = input( + "Enter a sentence as space-separated tokens: ").split() + + total_scores = 0 + total_negative = 0 + total_positive = 0 + total_neutral = 0 + total_unknown = 0 + + for token in input_tokens: + + if check_token(data, token): + score, negative_count, positive_count, neutral_count = calculate_tf_idf( + reviews, token) + total_scores += score + if score > .1: + total_positive += 1 + elif score < -.1: + total_negative += 1 + else: + total_neutral += 1 + else: + total_unknown += 1 + print( + f"The sentence has {total_unknown} unkown token(s): \"{token}\" Therefore, the sentence's average tf-idf score is undefined.") + return + + print( + f'The sentence has {total_negative} negative, {total_neutral} neutral, and {total_positive} positive token(s).') + print( + f'The sentence has an average tf-idf score of {total_scores/len(input_tokens)}') + + +# option: 6 +def save_stop_word_list(data): + unique_tokens = frozenset(get_all_tokens(data)) + all_tokens = get_all_tokens(data) + stop_words = [] + + for token in unique_tokens: + if all_tokens.count(token)/len(all_tokens) > 0.2: + print(token) + stop_words.append(token) + + with open("sentiment.txt") as file: + file.write(",".join(sorted(stop_words))) def handle_selection(selection, data): @@ -155,17 +250,26 @@ def handle_selection(selection, data): show_reviews(data) elif selection == MenuOption.CHECK_TOKEN: - check_token(data) + token = input("Enter a token: ").lower() + if(check_token(data, token)): + print(f"The token \"{token}\" appears in the training data.") + else: + print( + f"The token \"{token}\" does not appear in the training data.") elif selection == MenuOption.SHOW_DOCUMENT_FREQUENCY: check_document_frequency(data) elif selection == MenuOption.SHOW_TOKEN_STATISTICS: show_token_stats(data) - pass + elif selection == MenuOption.SHOW_SENTENCE_STATISTICS: - pass + show_sentence_stats(data) + elif selection == MenuOption.SAVE_STOP_WORD_LIST: + save_stop_word_list(data) + pass + elif selection == MenuOption.SHOW_ADJUSTED_SENTENCE_STATISTICS: pass @@ -189,41 +293,8 @@ def menu_input(): print("Please enter a valid, in range number.") -def show_reviews(data): - review_objects = get_reviews(data) - start_selection = "1" - end_selection = "" - - while True: - - start_selection = int( - input(f"Enter a review number from {start_selection} to {len(review_objects)}: ")) - - if start_selection > len(review_objects) or start_selection <= 0 or start_selection == None: - - print("Please enter a valid, in-range number.") - start_selection = "1" - continue - - else: - while True: - - end_selection = int( - input(f"Enter a review number from {start_selection} to {len(review_objects)}: ")) - - if end_selection > len(review_objects) or end_selection < start_selection or end_selection == None: - print("Please enter a valid, in-range number.") - continue - - else: - - for review_num in range(start_selection-1, end_selection): - - print( - f'Review #{review_num + 1}: {review_objects[review_num].review_text}') - - break - break +def main(): + menu_input() if __name__ == '__main__':