Commit 08ae39f9 authored by noahcostello5's avatar noahcostello5
Browse files

Added show_sentence_stats and save_stop_word_list functionality

parent a7bd62ad
# The default ``config.py``
# flake8: noqa
def set_prefs(prefs):
    """Configure rope project preferences; called before the project opens."""
    # Specify which files and folders to ignore in the project.
    # Changes to ignored resources are not added to the history and
    # VCSs. Also they are not returned in `Project.get_files()`.
    # Note that ``?`` and ``*`` match all characters but slashes.
    # '*.pyc': matches 'test.pyc' and 'pkg/test.pyc'
    # 'mod*.pyc': matches 'test/mod1.pyc' but not 'mod/1.pyc'
    # '.svn': matches 'pkg/.svn' and all of its children
    # 'build/*.o': matches 'build/lib.o' but not 'build/sub/lib.o'
    # 'build//*.o': matches 'build/lib.o' and 'build/sub/lib.o'
    prefs['ignored_resources'] = ['*.pyc', '*~', '.ropeproject',
                                  '.hg', '.svn', '_svn', '.git', '.tox']
    # Specifies which files should be considered python files. It is
    # useful when you have scripts inside your project. Only files
    # ending with ``.py`` are considered to be python files by
    # default.
    # prefs['python_files'] = ['*.py']
    # Custom source folders: By default rope searches the project
    # for finding source folders (folders that should be searched
    # for finding modules). You can add paths to that list. Note
    # that rope guesses project source folders correctly most of the
    # time; use this if you have any problems.
    # The folders should be relative to project root and use '/' for
    # separating folders regardless of the platform rope is running on.
    # 'src/my_source_folder' for instance.
    # prefs.add('source_folders', 'src')
    # You can extend python path for looking up modules
    # prefs.add('python_path', '~/python/')
    # Should rope save object information or not.
    prefs['save_objectdb'] = True
    prefs['compress_objectdb'] = False
    # If `True`, rope analyzes each module when it is being saved.
    prefs['automatic_soa'] = True
    # The depth of calls to follow in static object analysis
    prefs['soa_followed_calls'] = 0
    # If `False` when running modules or unit tests "dynamic object
    # analysis" is turned off. This makes them much faster.
    prefs['perform_doa'] = True
    # Rope can check the validity of its object DB when running.
    prefs['validate_objectdb'] = True
    # How many undos to hold?
    prefs['max_history_items'] = 32
    # Shows whether to save history across sessions.
    prefs['save_history'] = True
    prefs['compress_history'] = False
    # Set the number spaces used for indenting. According to
    # :PEP:`8`, it is best to use 4 spaces. Since most of rope's
    # unit-tests use 4 spaces it is more reliable, too.
    prefs['indent_size'] = 4
    # Builtin and c-extension modules that are allowed to be imported
    # and inspected by rope.
    prefs['extension_modules'] = []
    # Add all standard c-extensions to extension_modules list.
    prefs['import_dynload_stdmods'] = True
    # If `True` modules with syntax errors are considered to be empty.
    # The default value is `False`; When `False` syntax errors raise
    # `rope.base.exceptions.ModuleSyntaxError` exception.
    prefs['ignore_syntax_errors'] = False
    # If `True`, rope ignores unresolvable imports. Otherwise, they
    # appear in the importing namespace.
    prefs['ignore_bad_imports'] = False
    # If `True`, rope will insert new module imports as
    # `from <package> import <module>` by default.
    prefs['prefer_module_from_imports'] = False
    # If `True`, rope will transform a comma list of imports into
    # multiple separate import statements when organizing
    # imports.
    prefs['split_imports'] = False
    # If `True`, rope will remove all top-level import statements and
    # reinsert them at the top of the module when making changes.
    prefs['pull_imports_to_top'] = True
    # If `True`, rope will sort imports alphabetically by module name instead
    # of alphabetically by import statement, with from imports after normal
    # imports.
    prefs['sort_imports_alphabetically'] = False
    # Location of implementation of
    # rope.base.oi.type_hinting.interfaces.ITypeHintingFactory In general
    # case, you don't have to change this value, unless you're an rope expert.
    # Change this value to inject you own implementations of interfaces
    # listed in module rope.base.oi.type_hinting.providers.interfaces
    # For example, you can add you own providers for Django Models, or disable
    # the search type-hinting in a class hierarchy, etc.
    prefs['type_hinting_factory'] = (
        'rope.base.oi.type_hinting.factory.default_type_hinting_factory')
def project_opened(project):
    """Hook called by rope after the project has been opened.

    Default implementation does nothing; add per-project startup
    actions here if needed.
    """
    # Do whatever you like here!
from enum import Enum, auto
from collections import Counter
import math
import unittest
class MenuOption(Enum):
......@@ -32,16 +34,12 @@ class Review():
return Sentiment.NEGATIVE
def get_tokens(self):
    """Tokenize this review's text for sentiment analysis.

    Lowercases the text, strips leading/trailing '.' characters from the
    whole string (note: not per-token), splits on whitespace, and drops
    the first token (the numeric rating prefix).

    Fix: the diff left two return statements merged into one body, making
    the second (the commit's intended new version) unreachable; keep only
    the new implementation.
    """
    return self.review_text.lower().strip('.').split()[1:]
def token_frequency(self, token):
    """Return how many times *token* occurs among this review's tokens."""
    occurrences = 0
    for candidate in self.get_tokens():
        if candidate == token:
            occurrences += 1
    return occurrences
def main():
    """Program entry point: run the interactive menu loop."""
    menu_input()
def get_data():
try:
with open("sentiment.txt") as file:
......@@ -50,6 +48,11 @@ def get_data():
print("File cannot be found")
class GetDataTest(unittest.TestCase):
    """Sanity-check that the training data file loads fully."""

    def test(self):
        # 16444 is the expected number of lines in sentiment.txt.
        self.assertEqual(len(get_data()), 16444)
def get_reviews(data):
reviews = []
for review in data:
......@@ -58,49 +61,32 @@ def get_reviews(data):
return reviews
def get_tokens(data):
    """Whitespace-tokenize the concatenated training-data lines."""
    combined = "".join(data)
    # str.split() with no argument already ignores leading/trailing
    # whitespace, so no explicit strip is needed.
    return combined.split()
def get_all_tokens(data):
    """Tokenize the concatenated training data with all periods removed."""
    text = "".join(data)
    cleaned = text.replace(".", "")
    return cleaned.split()
def check_token(data, token):
    """Return True if *token* occurs anywhere in the training data.

    Fix: the diff merged the old ``check_token(data)`` (which prompted and
    printed) with the new ``check_token(data, token)`` into one broken body
    with a duplicate def and stray statements. Keep the new version: a pure
    membership check — callers (handle_selection, show_token_stats,
    show_sentence_stats) do their own prompting and printing.
    """
    # frozenset gives O(1) membership tests over the unique tokens.
    tokens = frozenset(get_all_tokens(data))
    return token in tokens
def show_token_stats(data):
print(f"Review num: {len(data)}")
reviews = get_reviews(data)
tokens = get_tokens(data)
token = input("Enter a token: ").lower()
def calculate_tf_idf(reviews, token):
# positive_count, positive_token_total, maxp, negative_count, negative_token_total, maxn
positive_reviews = []
positive_tokens = []
positive_num = 0
highest_positive_num = 0
positive_count = 0
negative_reviews = []
negative_tokens = []
negative_num = 0
highest_negative_num = 0
negative_count = 0
neutral_reviews = []
neutral_num = 0
all_num = tokens.count(token)
# maxp = 0
neutral_count = 0
# Separating Positive and Negative reviews and tokens into lists
# Separating Positive and Negative reviews and tokens into lists
for review in reviews:
if(review.get_sentiment_rating() == Sentiment.POSITIVE):
positive_reviews.append(review)
......@@ -113,41 +99,150 @@ def show_token_stats(data):
elif(review.get_sentiment_rating() == Sentiment.NEUTRAL):
neutral_reviews.append(review)
# Getting total number of Positive and Negative tokens
# Getting total number of Positive and Negative tokens
for review in negative_reviews:
negative_num += review.get_tokens().count(token)
negative_count += review.get_tokens().count(token)
for review in neutral_reviews:
neutral_num += review.get_tokens().count(token)
neutral_count += review.get_tokens().count(token)
for review in positive_reviews:
positive_num += review.get_tokens().count(token)
positive_count += review.get_tokens().count(token)
maxp = Counter(negative_tokens).most_common(1)[0][1]
maxn = Counter(positive_tokens).most_common(1)[0][1]
return (((10 + 10 * (positive_count/maxp)) * math.log(len(positive_tokens)/maxp)) -
((10 + 10 * (negative_count/maxn)) * math.log(len(negative_tokens)/maxn))), negative_count, positive_count, neutral_count
# Option: 1
def show_reviews(data):
    """Prompt for a start and end review number and print that range.

    Re-prompts until both numbers are valid and in range. Fixes over the
    diff-garbled original: removes interleaved dead commented-out code and
    stray statements referencing undefined names, handles non-numeric input
    (ValueError) instead of crashing, and drops the always-false ``== None``
    checks on ``int(...)`` results.
    """
    review_objects = get_reviews(data)
    total = len(review_objects)
    # First prompt: starting review number (1-based, inclusive).
    while True:
        try:
            start_selection = int(
                input(f"Enter a review number from 1 to {total}: "))
        except ValueError:
            print("Please enter a valid, in-range number.")
            continue
        if start_selection < 1 or start_selection > total:
            print("Please enter a valid, in-range number.")
            continue
        break
    # Second prompt: ending review number (must be >= start).
    while True:
        try:
            end_selection = int(
                input(f"Enter a review number from {start_selection} to {total}: "))
        except ValueError:
            print("Please enter a valid, in-range number.")
            continue
        if end_selection < start_selection or end_selection > total:
            print("Please enter a valid, in-range number.")
            continue
        break
    for review_num in range(start_selection - 1, end_selection):
        print(
            f'Review #{review_num + 1}: {review_objects[review_num].review_text}')
# Option: 3
def check_document_frequency(data):
    """Prompt for a token and report its frequency over all training tokens.

    Fix: the diff left a stray f-string line from the removed version of
    show_token_stats inside the print call; keep only the frequency message.
    """
    tokens = get_all_tokens(data)
    # NOTE(review): looks like a leftover debug print of the token total —
    # confirm before removing, since it changes the program's output.
    print(len(tokens))
    token = input("Enter a token: ").lower()
    print(
        f"The token \"{token}\" appears {tokens.count(token)} out of {len(tokens)} time(s) in the training data.")
class TestCheckDocumentFrequency(unittest.TestCase):
    """Smoke-test check_document_frequency.

    Fixes over the original: the class inherited the undefined name ``self``
    (NameError at class creation) and called ``assertEqual`` with only one
    argument. The function prints its report and returns None, so assert
    exactly that. Note: running this test requires stdin input, mirroring
    the interactive design of check_document_frequency.
    """

    def test(self):
        self.assertIsNone(check_document_frequency(get_data()))
# Option: 4
def show_token_stats(data):
    """Prompt for a token; print its sentiment counts and tf-idf classification."""
    reviews = get_reviews(data)
    token = input("Enter a token: ").lower()
    # Guard clause: unknown tokens get a message and nothing else.
    if not check_token(data, token):
        print(
            f"The token \"{token}\" does not appear in the training data.")
        return
    score, negative_count, positive_count, neutral_count = calculate_tf_idf(
        reviews, token)
    print(
        f"The token \"{token}\" has {negative_count} negative, {neutral_count} neutral, and {positive_count} positive appearance(s) in the training data.")
    # Positive differential score => classified positive, otherwise negative.
    label = "positive" if score > 0 else "negative"
    print(
        f"The token \"{token}\" has a differential tf-idf score of {score} and is classified as {label}")
# Option: 5
def show_sentence_stats(data):
    """Prompt for a sentence; report per-token sentiment and the average tf-idf score.

    Each known token is scored with calculate_tf_idf: scores above 0.1 count
    as positive, below -0.1 as negative, otherwise neutral. On the first
    unknown token the average is undefined, so the function reports that and
    stops. Fixes: "unkown" typo in the user-facing message; empty input no
    longer raises ZeroDivisionError in the average.
    """
    reviews = get_reviews(data)
    input_tokens = input(
        "Enter a sentence as space-separated tokens: ").split()
    if not input_tokens:
        # Nothing to score; avoid dividing by zero below.
        return
    total_scores = 0
    total_negative = 0
    total_positive = 0
    total_neutral = 0
    total_unknown = 0
    for token in input_tokens:
        if check_token(data, token):
            score, negative_count, positive_count, neutral_count = calculate_tf_idf(
                reviews, token)
            total_scores += score
            if score > .1:
                total_positive += 1
            elif score < -.1:
                total_negative += 1
            else:
                total_neutral += 1
        else:
            total_unknown += 1
            print(
                f"The sentence has {total_unknown} unknown token(s): \"{token}\" Therefore, the sentence's average tf-idf score is undefined.")
            return
    print(
        f'The sentence has {total_negative} negative, {total_neutral} neutral, and {total_positive} positive token(s).')
    print(
        f'The sentence has an average tf-idf score of {total_scores/len(input_tokens)}')
# option: 6
def save_stop_word_list(data):
    """Find stop words (tokens making up >20% of all tokens) and save them.

    Writes the sorted stop words comma-separated to "stop_words.txt".

    Fixes: the original opened "sentiment.txt" — the training data file —
    in the default read mode and then called write(), which raises
    io.UnsupportedOperation (and with mode "w" would have truncated the
    data set); write to a dedicated stop-word file instead. Also replaces
    the O(n^2) ``list.count`` per unique token with a single Counter pass.
    """
    all_tokens = get_all_tokens(data)
    if not all_tokens:
        # No tokens: no stop words and no division by zero.
        stop_words = []
    else:
        counts = Counter(all_tokens)
        total = len(all_tokens)
        stop_words = []
        for token, count in counts.items():
            if count / total > 0.2:
                print(token)
                stop_words.append(token)
    with open("stop_words.txt", "w") as file:
        file.write(",".join(sorted(stop_words)))
def handle_selection(selection, data):
......@@ -155,17 +250,26 @@ def handle_selection(selection, data):
show_reviews(data)
elif selection == MenuOption.CHECK_TOKEN:
check_token(data)
token = input("Enter a token: ").lower()
if(check_token(data, token)):
print(f"The token \"{token}\" appears in the training data.")
else:
print(
f"The token \"{token}\" does not appear in the training data.")
elif selection == MenuOption.SHOW_DOCUMENT_FREQUENCY:
check_document_frequency(data)
elif selection == MenuOption.SHOW_TOKEN_STATISTICS:
show_token_stats(data)
pass
elif selection == MenuOption.SHOW_SENTENCE_STATISTICS:
pass
show_sentence_stats(data)
elif selection == MenuOption.SAVE_STOP_WORD_LIST:
save_stop_word_list(data)
pass
elif selection == MenuOption.SHOW_ADJUSTED_SENTENCE_STATISTICS:
pass
......@@ -189,41 +293,8 @@ def menu_input():
print("Please enter a valid, in range number.")
def show_reviews(data):
    """Prompt for a start and end review number and print that range.

    Re-prompts until both numbers are valid and in range. Fixes: handles
    non-numeric input (ValueError from int(input(...))) instead of crashing,
    drops the always-false ``== None`` checks on ``int(...)`` results, and
    replaces the string sentinel ``start_selection = "1"`` with a plain
    constant in the first prompt.
    """
    review_objects = get_reviews(data)
    total = len(review_objects)
    # First prompt: starting review number (1-based, inclusive).
    while True:
        try:
            start_selection = int(
                input(f"Enter a review number from 1 to {total}: "))
        except ValueError:
            print("Please enter a valid, in-range number.")
            continue
        if start_selection < 1 or start_selection > total:
            print("Please enter a valid, in-range number.")
            continue
        break
    # Second prompt: ending review number (must be >= start).
    while True:
        try:
            end_selection = int(
                input(f"Enter a review number from {start_selection} to {total}: "))
        except ValueError:
            print("Please enter a valid, in-range number.")
            continue
        if end_selection < start_selection or end_selection > total:
            print("Please enter a valid, in-range number.")
            continue
        break
    for review_num in range(start_selection - 1, end_selection):
        print(
            f'Review #{review_num + 1}: {review_objects[review_num].review_text}')
def main():
    """Program entry point: run the interactive menu loop."""
    menu_input()
if __name__ == '__main__':
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment