From b07ec1f9b5f6f3a9c1a02f2252cc5e3bc5905937 Mon Sep 17 00:00:00 2001
From: Zeynep Hakguder <zhakguder@cse.unl.edu>
Date: Thu, 24 May 2018 14:21:14 -0500
Subject: [PATCH] added learning curve

---
 ProgrammingAssignment1-Solution.ipynb | 33 ----------------
 ProgrammingAssignment1.ipynb          | 54 ++++++++++++++++++++++++---
 model.ipynb                           | 11 ++++--
 3 files changed, 57 insertions(+), 41 deletions(-)

diff --git a/ProgrammingAssignment1-Solution.ipynb b/ProgrammingAssignment1-Solution.ipynb
index 0960e42..6025c61 100644
--- a/ProgrammingAssignment1-Solution.ipynb
+++ b/ProgrammingAssignment1-Solution.ipynb
@@ -179,39 +179,6 @@
     "Now that we have the true labels and the predicted ones from our model, we can build a confusion matrix and see how accurate our model is. Implement the \"conf_matrix\" function that takes as input an array of true labels ($true$) and an array of predicted labels ($pred$). It should output a numpy.ndarray. "
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 287,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def conf_matrix(true_l, pred, threshold):\n",
-    "    tp = tn = fp = fn = 0\n",
-    "    \n",
-    "    for i in range(len(true_l)):\n",
-    "        tmp = -1\n",
-    "        \n",
-    "        if pred[i] > threshold:\n",
-    "            tmp = 1\n",
-    "        if tmp == true_l[i]:\n",
-    "        \n",
-    "            if true_l[i] == 1:\n",
-    "                tp += 1\n",
-    "            else:\n",
-    "                tn += 1\n",
-    "        else:\n",
-    "            if true_l[i] == 1:\n",
-    "                fn += 1\n",
-    "            else:\n",
-    "                fp += 1\n",
-    "    \n",
-    "    return np.array([tp,tn, fp, fn])\n",
-    "    \n",
-    "    \n",
-    "    # returns the confusion matrix as numpy.ndarray\n",
-    "    #raise NotImplementedError"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": 289,
diff --git a/ProgrammingAssignment1.ipynb b/ProgrammingAssignment1.ipynb
index 3dc81dc..a28af7c 100644
--- a/ProgrammingAssignment1.ipynb
+++ b/ProgrammingAssignment1.ipynb
@@ -178,25 +178,69 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Evaluate your model on the test data and report your accuracy. Also, calculate and report the confidence interval on the generalization error estimate."
+    "Evaluate your model on the test data and report your **accuracy**. Also, calculate and report the confidence interval on the generalization **error** estimate."
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'my_model' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-3-e365162558f6>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfinal_labels\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmy_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmy_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtest_indices\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mthreshold\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0.5\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0;31m# Calculate accuracy and generalization error with confidence interval here. For now, We will consider a data point as predicted in the positive class if more than 0.5 of its k-neighbors are positive.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'my_model' is not defined"
+     ]
+    }
+   ],
    "source": [
     "final_labels = my_model.predict(my_model.test_indices)\n",
-    "# Calculate accuracy and generalization error with confidence interval here."
+    "\n",
+    "# Calculate accuracy and generalization error with confidence interval here. \n",
+    "# For now, We will consider a data point as predicted in the positive class if more than 0.5 \n",
+    "# of its k-neighbors are positive.\n",
+    "threshold = 0.5"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# TODO: learning curve \n",
+    " ### Plotting a learning curve\n",
+    " \n",
+    "A learning curve shows how error changes as the training set size increases. For more information, see [learning curves](https://www.dataquest.io/blog/learning-curves-machine-learning/).\n",
+    "We'll plot the error values for training and validation data while varying the size of the training set."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_sizes = np.xrange(0, my_model.train_size + 1, 100)\n",
+    "\n",
+    "# Calculate error for each entry in training_sizes\n",
+    "# for training and validation sets and populate\n",
+    "# error_train and error_val arrays. Each entry in these arrays\n",
+    "# should correspond to each entry in training_sizes.\n",
     "\n",
+    "plt.plot(training_sizes, error_train, 'r', label = 'training_error')\n",
+    "plt.plot(training_sizes, error_val, 'g', label = 'validation_error')\n",
+    "plt.legend()\n",
+    "plt.show()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Computing the confusion matrix for $k = 10$\n",
     "Now that we have the true labels and the predicted ones from our model, we can build a confusion matrix and see how accurate our model is. Implement the \"conf_matrix\" function (in model.ipynb) that takes as input an array of true labels ($true$) and an array of predicted labels ($pred$). It should output a numpy.ndarray. You do not need to change the value of the threshold parameter yet."
    ]
   },
diff --git a/model.ipynb b/model.ipynb
index fa80a4f..34e04c4 100644
--- a/model.ipynb
+++ b/model.ipynb
@@ -82,7 +82,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 1,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -95,11 +95,16 @@
     "    def __init__(self, preprocessor_f, partition_f, **kwargs):\n",
     "        \n",
     "        self.features, self.labels = preprocessor_f(kwargs['file_path'])\n",
-    "        self.size = len(self.labels) # number of examples in dataset\n",
+    "        self.size = len(self.labels) # number of examples in dataset       \n",
     "        self.feat_dim = self.features.shape[1] # number of features\n",
+    "        \n",
     "        self.val_indices, self.test_indices = partition_f(self.size, kwargs['p'], kwargs['v'])\n",
+    "        self.val_size = len(self.val_indices)\n",
+    "        self.test_size = len(self.test_indices)\n",
+    "        \n",
     "        self.train_indices = np.delete(np.arange(self.size), np.append(self.test_indices, self.val_indices), 0)\n",
-    "            \n",
+    "        self.train_size = len(self.train_indices)\n",
+    "        \n",
     "    def fit(self):\n",
     "        raise NotImplementedError\n",
     "    \n",
-- 
GitLab