From b07ec1f9b5f6f3a9c1a02f2252cc5e3bc5905937 Mon Sep 17 00:00:00 2001 From: Zeynep Hakguder <zhakguder@cse.unl.edu> Date: Thu, 24 May 2018 14:21:14 -0500 Subject: [PATCH] added learning curve --- ProgrammingAssignment1-Solution.ipynb | 33 ---------------- ProgrammingAssignment1.ipynb | 54 ++++++++++++++++++++++++--- model.ipynb | 11 ++++-- 3 files changed, 57 insertions(+), 41 deletions(-) diff --git a/ProgrammingAssignment1-Solution.ipynb b/ProgrammingAssignment1-Solution.ipynb index 0960e42..6025c61 100644 --- a/ProgrammingAssignment1-Solution.ipynb +++ b/ProgrammingAssignment1-Solution.ipynb @@ -179,39 +179,6 @@ "Now that we have the true labels and the predicted ones from our model, we can build a confusion matrix and see how accurate our model is. Implement the \"conf_matrix\" function that takes as input an array of true labels ($true$) and an array of predicted labels ($pred$). It should output a numpy.ndarray. " ] }, - { - "cell_type": "code", - "execution_count": 287, - "metadata": {}, - "outputs": [], - "source": [ - "def conf_matrix(true_l, pred, threshold):\n", - " tp = tn = fp = fn = 0\n", - " \n", - " for i in range(len(true_l)):\n", - " tmp = -1\n", - " \n", - " if pred[i] > threshold:\n", - " tmp = 1\n", - " if tmp == true_l[i]:\n", - " \n", - " if true_l[i] == 1:\n", - " tp += 1\n", - " else:\n", - " tn += 1\n", - " else:\n", - " if true_l[i] == 1:\n", - " fn += 1\n", - " else:\n", - " fp += 1\n", - " \n", - " return np.array([tp,tn, fp, fn])\n", - " \n", - " \n", - " # returns the confusion matrix as numpy.ndarray\n", - " #raise NotImplementedError" - ] - }, { "cell_type": "code", "execution_count": 289, diff --git a/ProgrammingAssignment1.ipynb b/ProgrammingAssignment1.ipynb index 3dc81dc..a28af7c 100644 --- a/ProgrammingAssignment1.ipynb +++ b/ProgrammingAssignment1.ipynb @@ -178,25 +178,69 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Evaluate your model on the test data and report your accuracy. Also, calculate and report the confidence interval on the generalization error estimate." + "Evaluate your model on the test data and report your **accuracy**. Also, calculate and report the confidence interval on the generalization **error** estimate." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": {}, - "outputs": [], + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'my_model' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m<ipython-input-3-e365162558f6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfinal_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmy_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmy_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest_indices\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mthreshold\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m# Calculate accuracy and generalization error with confidence interval here. 
For now, We will consider a data point as predicted in the positive class if more than 0.5 of its k-neighbors are positive.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'my_model' is not defined"
+     ]
+    }
+   ],
    "source": [
     "final_labels = my_model.predict(my_model.test_indices)\n",
-    "# Calculate accuracy and generalization error with confidence interval here."
+    "\n",
+    "# Calculate accuracy and generalization error with confidence interval here. \n",
+    "# For now, we will consider a data point as predicted in the positive class if more than 0.5 \n",
+    "# of its k-neighbors are positive.\n",
+    "threshold = 0.5"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# TODO: learning curve \n",
+    "### Plotting a learning curve\n",
+    "\n",
+    "A learning curve shows how error changes as the training set size increases. For more information, see [learning curves](https://www.dataquest.io/blog/learning-curves-machine-learning/).\n",
+    "We'll plot the error values for training and validation data while varying the size of the training set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_sizes = np.arange(0, my_model.train_size + 1, 100)\n",
+    "\n",
+    "# Calculate error for each entry in training_sizes\n",
+    "# for training and validation sets and populate\n",
+    "# error_train and error_val arrays. Each entry in these arrays\n",
+    "# should correspond to each entry in training_sizes.\n",
     "\n",
+    "plt.plot(training_sizes, error_train, 'r', label = 'training_error')\n",
+    "plt.plot(training_sizes, error_val, 'g', label = 'validation_error')\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Computing the confusion matrix for $k = 10$\n",
     "Now that we have the true labels and the predicted ones from our model, we can build a confusion matrix and see how accurate our model is. Implement the \"conf_matrix\" function (in model.ipynb) that takes as input an array of true labels ($true$) and an array of predicted labels ($pred$). It should output a numpy.ndarray. You do not need to change the value of the threshold parameter yet."
    ]
   },
diff --git a/model.ipynb b/model.ipynb
index fa80a4f..34e04c4 100644
--- a/model.ipynb
+++ b/model.ipynb
@@ -82,7 +82,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -95,11 +95,16 @@
     "    def __init__(self, preprocessor_f, partition_f, **kwargs):\n",
     "        \n",
     "        self.features, self.labels = preprocessor_f(kwargs['file_path'])\n",
-    "        self.size = len(self.labels) # number of examples in dataset\n",
+    "        self.size = len(self.labels) # number of examples in dataset \n",
     "        self.feat_dim = self.features.shape[1] # number of features\n",
+    "        \n",
     "        self.val_indices, self.test_indices = partition_f(self.size, kwargs['p'], kwargs['v'])\n",
+    "        self.val_size = len(self.val_indices)\n",
+    "        self.test_size = len(self.test_indices)\n",
+    "        \n",
     "        self.train_indices = np.delete(np.arange(self.size), np.append(self.test_indices, self.val_indices), 0)\n",
-    "        \n",
+    "        self.train_size = len(self.train_indices)\n",
+    "        \n",
     "    def fit(self):\n",
     "        raise NotImplementedError\n",
     "        \n",
-- GitLab
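Note on the accuracy / confidence-interval cell added above: the patch leaves the actual computation to the student, so the following is only a minimal sketch of one way it could look. It assumes true labels are +1/-1 (as the `conf_matrix` solution removed in this commit suggests) and that `my_model.predict` returns positive-class fractions to be thresholded at 0.5; the helper name `error_with_confidence` and the 95% normal-approximation constant `z = 1.96` are illustrative, not part of the assignment.

```python
import numpy as np

def error_with_confidence(true_labels, predicted_scores, threshold=0.5, z=1.96):
    """Accuracy, error, and a normal-approximation confidence interval on the error.

    Assumes true_labels are +1/-1 and predicted_scores are positive-class
    fractions that are thresholded at `threshold`.
    """
    true_labels = np.asarray(true_labels)
    predictions = np.where(np.asarray(predicted_scores) > threshold, 1, -1)
    accuracy = np.mean(predictions == true_labels)
    error = 1.0 - accuracy
    # Binomial standard error of the test-set error estimate
    half_width = z * np.sqrt(error * (1.0 - error) / len(true_labels))
    return accuracy, error, (error - half_width, error + half_width)

# Hypothetical usage, mirroring the notebook cell:
# acc, err, (low, high) = error_with_confidence(my_model.labels[my_model.test_indices],
#                                               my_model.predict(my_model.test_indices))
```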
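For the learning-curve cell, one possible sketch of how `error_train` and `error_val` could be populated, assuming the `Model` object from model.ipynb exposes `labels`, `train_indices`, `val_indices`, `train_size`, `fit()` and `predict()` as in this patch, and that `predict` uses whatever `train_indices` currently holds; `classification_error` and `learning_curve` are illustrative helper names, not part of the assignment skeleton.

```python
import numpy as np
import matplotlib.pyplot as plt

def classification_error(model, indices, threshold=0.5):
    """Fraction of points in `indices` whose thresholded prediction disagrees with the true label."""
    predictions = np.where(model.predict(indices) > threshold, 1, -1)
    return np.mean(predictions != model.labels[indices])

def learning_curve(model, step=100):
    """Training and validation error as the training set grows in increments of `step`."""
    full_train_indices = np.copy(model.train_indices)    # keep the complete training partition
    sizes = np.arange(step, model.train_size + 1, step)  # start at `step` to avoid an empty training set
    error_train, error_val = [], []
    for n in sizes:
        model.train_indices = full_train_indices[:n]     # use only the first n training examples
        model.fit()                                      # for kNN this may simply record the training set
        error_train.append(classification_error(model, model.train_indices))
        error_val.append(classification_error(model, model.val_indices))
    model.train_indices = full_train_indices             # restore the full training partition
    return sizes, error_train, error_val

# Hypothetical usage, matching the plotting code in the notebook cell:
# training_sizes, error_train, error_val = learning_curve(my_model)
# plt.plot(training_sizes, error_train, 'r', label='training_error')
# plt.plot(training_sizes, error_val, 'g', label='validation_error')
# plt.legend(); plt.show()
```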
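Finally, for the `conf_matrix` exercise, a vectorized sketch that mirrors the `[tp, tn, fp, fn]` ordering of the solution cell removed from ProgrammingAssignment1-Solution.ipynb in this commit; it assumes the same +1/-1 label convention and thresholded positive-class fractions in `pred`.

```python
import numpy as np

def conf_matrix(true_l, pred, threshold=0.5):
    """Return np.array([tp, tn, fp, fn]) given +1/-1 true labels and
    positive-class fractions in `pred` thresholded at `threshold`."""
    true_l = np.asarray(true_l)
    predicted = np.where(np.asarray(pred) > threshold, 1, -1)
    tp = np.sum((predicted == 1) & (true_l == 1))
    tn = np.sum((predicted == -1) & (true_l == -1))
    fp = np.sum((predicted == 1) & (true_l == -1))
    fn = np.sum((predicted == -1) & (true_l == 1))
    return np.array([tp, tn, fp, fn])
```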