diff --git a/ProgrammingAssignment_0/GettingFamiliar_solution.ipynb b/ProgrammingAssignment_0/GettingFamiliar_solution.ipynb deleted file mode 100644 index 001ab5a51b5bbe33989c981a809d1ed92488382d..0000000000000000000000000000000000000000 --- a/ProgrammingAssignment_0/GettingFamiliar_solution.ipynb +++ /dev/null @@ -1,472 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# JUPYTER NOTEBOOK TIPS\n", - "\n", - "Each rectangular box is called a cell. \n", - "* Ctrl+ENTER evaluates the current cell; if it contains Python code, it runs the code, if it contains Markdown, it returns rendered text.\n", - "* Alt+ENTER evaluates the current cell and adds a new cell below it.\n", - "* If you click to the left of a cell, you'll notice the frame changes color to blue. You can erase a cell by hitting 'dd' (that's two \"d\"s in a row) when the frame is blue." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# GRADING\n", - "\n", - "You will be graded on parts that are marked with **\\#TODO** comments. Read the comments in the code to make sure you don't miss any.\n", - "\n", - "### Mandatory for 478 & 878:\n", - "\n", - "| | Tasks | 478 | 878 |\n", - "|---|----------------------------|-----|-----|\n", - "| 1 | Implement `preprocess` | 10 | 5 |\n", - "| 2 | Implement `partition` | 10 | 5 |\n", - "| 3 | Putting the model together | 5 | 5 |\n", - "\n", - "### Mandatory for 878, bonus for 478\n", - "\n", - "| | Tasks | 478 | 878 |\n", - "|---|---------------------------------------|-----|-----|\n", - "|4 | Implement `normalization` | 5 | 10 |\n", - "\n", - "\n", - "Points are broken down further below in Rubric sections. The **first** score is for 478, the **second** is for 878 students. There are a total of 25 points in this assignment and an extra 5 bonus points for 478 students."
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Supervised Learning Model Skeleton\n", - "\n", - "We'll use this skeleton for implementing different supervised learning algorithms. For this first assignment, we'll read and partition the [\"madelon\" dataset](http://archive.ics.uci.edu/ml/datasets/madelon). Features and labels for the first two examples are listed below. Please complete \"preprocess\" and \"partition\" functions. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll use numpy library for this assignment. Please do not import any other libraries." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "# import necessary libraries\n", - "import numpy as np" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The 500 features in the \"madelon\" dataset have integer values:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "../data/madelon.data\r\n", - " 1-) 485 477 537 479 452 471 491 476 475 473 455 500 456 507 478 491 447 422 480 482 515 482 464 484 477 496 509 491 459 482 483 505 508 458 509 517 479 487 473 472 474 531 485 508 517 489 507 515 440 465 550 532 450 483 460 469 507 485 479 458 516 480 460 479 648 480 561 481 474 474 544 484 490 451 494 480 486 459 521 500 466 457 494 492 488 497 477 461 473 464 476 471 481 507 474 500 481 536 464 501 479 480 483 462 470 181 510 470 431 482 496 481 469 539 491 482 481 476 533 495 474 485 479 495 465 541 493 488 452 481 491 501 477 479 503 529 540 504 482 463 477 530 508 488 488 474 479 506 478 511 501 474 483 575 478 482 461 480 543 415 527 477 487 486 511 474 477 482 476 516 466 492 561 479 472 457 497 475 452 491 477 454 461 472 481 490 526 490 459 478 461 516 511 544 519 487 485 475 477 476 478 470 493 581 484 476 521 474 492 459 487 504 464 485 478 465 603 
475 481 491 555 424 528 511 384 525 459 478 477 539 479 508 471 517 482 518 473 478 506 476 507 434 466 480 547 518 516 476 492 454 463 497 477 531 472 495 532 496 492 480 480 479 517 470 470 500 468 477 486 553 490 499 450 469 466 479 476 401 491 551 477 517 492 475 537 516 472 451 484 471 469 523 496 482 458 487 477 457 458 493 458 517 478 482 474 517 482 488 490 485 440 455 464 531 483 467 494 488 414 491 494 497 501 476 481 485 478 476 491 492 523 492 476 464 496 473 658 507 628 484 468 448 502 618 438 486 496 535 452 497 490 485 504 477 481 473 517 476 479 483 482 458 464 466 473 482 497 479 497 495 489 483 500 490 479 471 468 496 419 513 475 471 514 479 480 486 480 477 494 454 480 539 477 441 482 461 484 510 475 485 480 474 474 442 477 502 402 478 504 476 484 475 488 486 524 506 480 451 512 498 478 485 495 476 496 485 496 485 486 482 505 528 496 533 504 512 474 646 526 485 541 487 568 492 467 479 483 479 546 476 457 463 517 471 482 630 481 494 440 509 507 512 496 488 462 498 480 511 500 437 537 470 515 476 467 401 485 499 495 490 508 463 487 531 515 476 482 463 467 479 477 481 477 485 511 485 481 479 475 496 \r\n", - " 2-) 483 458 460 487 587 475 526 479 485 469 434 483 465 503 472 478 469 518 495 491 478 530 462 494 549 469 516 487 475 486 478 514 542 406 469 452 483 498 480 476 474 504 478 493 472 461 521 521 499 458 466 519 487 485 489 485 551 516 435 487 525 481 529 486 488 513 415 463 481 481 491 504 496 433 475 416 481 482 493 536 483 416 553 460 554 447 477 499 470 527 476 480 507 522 474 485 478 479 468 397 482 469 477 476 553 431 489 447 535 487 488 557 485 515 484 497 479 494 436 470 477 468 480 587 503 429 496 502 473 485 522 484 481 486 519 455 442 499 470 483 508 510 481 494 483 473 481 510 480 447 538 497 475 404 479 519 486 492 520 519 500 482 486 487 533 487 476 480 475 459 470 522 489 477 447 519 484 472 458 510 529 539 456 478 490 509 481 524 530 478 495 507 459 467 494 470 480 491 476 503 485 475 508 488 495 477 507 482 447 482 483 455 485 
474 478 579 540 484 508 480 492 517 490 547 510 465 495 477 475 497 477 442 489 507 466 504 493 471 478 467 530 551 476 470 575 477 510 486 473 504 451 450 477 506 480 506 575 502 486 489 485 479 488 524 465 516 443 503 517 498 482 467 454 407 484 479 475 498 514 492 477 435 491 475 503 480 506 512 482 477 504 527 454 483 458 473 484 542 469 459 462 503 477 492 469 467 475 483 491 464 466 475 477 502 483 506 474 494 469 524 483 434 488 463 495 483 468 481 493 489 538 469 477 480 460 495 469 469 528 544 497 497 462 478 494 481 493 461 482 483 471 422 493 511 471 497 523 476 462 453 471 502 475 536 481 389 491 464 500 553 467 497 489 486 490 540 487 488 526 477 480 462 523 483 488 475 485 479 492 452 479 441 475 442 476 475 484 500 570 482 481 428 477 456 477 546 502 477 516 467 512 469 498 501 503 539 493 505 543 556 486 483 514 476 457 507 475 448 479 481 486 500 489 442 509 479 500 517 489 488 494 496 463 460 472 478 457 487 420 463 484 474 459 311 479 582 480 495 538 487 537 488 485 483 500 487 476 526 449 363 466 478 465 479 482 549 470 506 481 494 492 448 492 447 598 507 478 483 492 485 463 478 487 338 513 486 483 492 510 517 \r\n" - ] - } - ], - "source": [ - "! echo '../data/madelon.data'; head -n 2 ../data/madelon.data | nl -s '-) '" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Labels are either positive (1) or negative (-1):" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "../data/madelon.labels\r\n", - " 1-) -1\r\n", - " 2-) -1\r\n" - ] - } - ], - "source": [ - "! echo '../data/madelon.labels'; head -n 2 ../data/madelon.labels | nl -s '-) '" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 1: Implement `preprocess`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This step is for reading the dataset and for extracting features and labels. 
The \"preprocess\" function should return an $n \\times d$ \"features\" array, and an $n \\times 1$ \"labels\" array, where $n$ is the number of examples and $d$ is the number of features in the dataset. In cases where there is a big difference between the scales of features, we want to normalize the features to have values in the same range [0,1]. Since this is not the case with this dataset, we will not do normalization." - ] - }, - { - "cell_type": "code", - "execution_count": 60, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess(feature_file, label_file):\n", - " '''\n", - " Args:\n", - " feature_file: str \n", - " file containing features\n", - " label_file: str\n", - " file containing labels\n", - " Returns:\n", - " features: ndarray\n", - " nxd features\n", - " labels: ndarray\n", - " nx1 labels\n", - " '''\n", - " # You might find np.genfromtxt useful for reading in the file. Be careful with the file delimiter, \n", - " # e.g. for comma-separated files use delimiter=',' argument.\n", - " \n", - " #TODO\n", - " \n", - " # read in features and labels\n", - " features = np.genfromtxt(feature_file)\n", - " labels = np.genfromtxt(label_file)\n", - " \n", - " return features, labels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* Correct features size +5, +2.5\n", - "* Correct labels size +5, +2.5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test `preprocess`" - ] - }, - { - "cell_type": "code", - "execution_count": 61, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Shape of features: (2000, 500)\n", - "Shape of labels: (2000,)\n" - ] - } - ], - "source": [ - "features, labels = preprocess(feature_file = '../data/madelon.data', label_file = '../data/madelon.labels')\n", - "# TODO: Output the dimension of both features and labels.\n", - "print('Shape of features: {}'.format(features.shape))\n", - "print('Shape of 
labels: {}'.format(labels.shape))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 2: Implement `partition`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, you'll need to split your dataset into training, validation and test sets. The \"partition\" function should take as input the size of the whole dataset and randomly sample a proportion $t$ of the dataset indices for test partition and a proportion of $v$ for validation partition. The remaining will be used as indices for training data. For example, to keep 30% of the examples as test and 10% as validation, set $t=0.3$ and $v=0.1$. You should choose these values according to the size of the data available to you. The \"partition\" function should return indices of the test, validation and training sets, in that order. These will be used to index into the whole dataset." - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "metadata": {}, - "outputs": [], - "source": [ - "def partition(size, t, v = 0):\n", - " '''\n", - " Args:\n", - " size: int\n", - " number of examples in the whole dataset\n", - " t: float\n", - " proportion kept for test\n", - " v: float\n", - " proportion kept for validation\n", - " Returns:\n", - " test_indices: ndarray\n", - " 1D array containing test set indices\n", - " val_indices: ndarray\n", - " 1D array containing validation set indices\n", - " train_indices: ndarray\n", - " 1D array containing training set indices\n", - " '''\n", - " \n", - " # np.random.permutation might come in handy.
Do not sample with replacement!\n", - " # Be sure not to use the same indices in test and validation sets!\n", - " \n", - " # use the first np.ceil(size*t) for test, \n", - " # the following np.ceil(size*v) for validation set.\n", - " \n", - " # TODO\n", - " \n", - " # number of test and validation examples\n", - " t_size = int(np.ceil(size*t))\n", - " v_size = int(np.ceil(size*v))\n", - "\n", - " # shuffle the indices\n", - " permuted = np.random.permutation(size)\n", - " \n", - " # spare the first t_size for test\n", - " test_indices = permuted[:t_size]\n", - " # and the next v_size for validation\n", - " val_indices = permuted[t_size:t_size+v_size]\n", - " train_indices = np.delete(np.arange(size), np.append(test_indices, val_indices), 0)\n", - " \n", - " return test_indices, val_indices, train_indices" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* Correct length of test indices +5, +2.5\n", - "* Correct length of validation indices +5, +2.5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test `partition`" - ] - }, - { - "cell_type": "code", - "execution_count": 63, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Test size: 600, validation size: 200, training size: 1200\n" - ] - } - ], - "source": [ - "# TODO\n", - "# Pass the correct size argument (number of examples in the whole dataset)\n", - "test_indices, val_indices, train_indices = partition(size = features.shape[0], t = 0.3, v = 0.1)\n", - "\n", - "# Output the length of both test and validation indices.\n", - "print('Test size: {}, validation size: {}, training size: {}'.format(test_indices.shape[0], val_indices.shape[0], train_indices.shape[0]))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 3: Putting things together" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The model definition is given
below. We'll extend this class for different supervised classification algorithms. Specifically, we'll implement \"fit\" and \"predict\" methods for these algorithms. For this assignment, you are not asked to implement these methods. Run the cells below and make sure each piece of code fits together and works as expected." - ] - }, - { - "cell_type": "code", - "execution_count": 64, - "metadata": {}, - "outputs": [], - "source": [ - "class Model:\n", - " \n", - " def fit(self, training_features, training_labels):\n", - " print('There are {} data points in training partition with {} features.'.format(\n", - " training_features.shape[0], training_features.shape[1]))\n", - " return\n", - " \n", - " def predict(self, test_points):\n", - " raise NotImplementedError" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* Correct training size +5, +5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test `Model`" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Initialize the model and call fit method with the training features and labels." 
- ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "There are 1200 data points in training partition with 500 features.\n" - ] - } - ], - "source": [ - "my_model = Model()\n", - "# obtain features and labels from files\n", - "features, labels = preprocess('../data/madelon.data', '../data/madelon.labels')\n", - "# partition the data set\n", - "test_indices, val_indices, train_indices = partition(features.shape[0], 0.3, 0.1)\n", - "# pass the training features and labels to the fit method\n", - "my_model.fit(features[train_indices], labels[train_indices])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 4: Normalization" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Implement the `normalization` function such that the output features take values in the range [0, 1]. Check that the values of the features are in [0, 1]." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* Correct range for feature values +5, +10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test Normalization" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO\n", - "# args is a placeholder for the parameters of the function\n", - "# Args and Returns are as in \"preprocess\"\n", - "def normalization(raw_features):\n", - " \n", - " # initialize an empty ndarray with the shape of raw_features\n", - " dims = raw_features.shape\n", - " features = np.empty(dims)\n", - " \n", - " # divide each column by the max value in it\n", - " for col in range(dims[1]):\n", - " col_values = raw_features[:, col]\n", - " max_val = max(col_values)\n", - " features[:, col] = col_values/max_val \n", - " \n", - " return features" - ] - }, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, -
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Min value: 0.0, max value: 1.0\n" - ] - } - ], - "source": [ - "# TODO\n", - "\n", - "features = normalization(features)\n", - "# Check that the range of each feature in the training set is in range [0, 1]\n", - "\n", - "print('Min value: {}, max value: {}'.format(features.min(), features.max()))" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/ProgrammingAssignment_1/ProgrammingAssignment1.ipynb b/ProgrammingAssignment_1/ProgrammingAssignment1.ipynb deleted file mode 100644 index bb66aac852c3c3513fb12f3d4f7f5bab028b22b5..0000000000000000000000000000000000000000 --- a/ProgrammingAssignment_1/ProgrammingAssignment1.ipynb +++ /dev/null @@ -1,485 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# *k*-Nearest Neighbor\n", - "\n", - "We'll implement *k*-Nearest Neighbor (*k*-NN) algorithm for this assignment. You can use data available in machine learning repositories such as [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php) or a dataset related to your research. Your dataset should \n", - "* have labels (suited for classification)\n", - "* ideally have between 1,000 - 5,000 examples\n", - "\n", - "A skeleton of a general supervised learning model is provided in \"model.ipynb\". The functions that will be implemented there will be indicated in this notebook. 
\n", - "\n", - "### Assignment Goals:\n", - "In this assignment, we will:\n", - "* implement 'Euclidean' and 'Manhattan' distance metrics \n", - "* use the validation dataset to find a good value for *k*\n", - "* evaluate our model with respect to performance measures:\n", - " * accuracy, generalization error and ROC curve\n", - "* try to assess if *k*-NN is suitable for the dataset you used\n", - "\n", - "### Note:\n", - "\n", - "You are not required to follow this exact template. You can change what parameters your functions take or partition the tasks across functions differently. However, make sure there are outputs and implementation for items listed in the rubric for each task. Also, indicate in code with comments which task you are attempting." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# GRADING\n", - "\n", - "You will be graded on parts that are marked with **\\#TODO** comments. Read the comments in the code to make sure you don't miss any.\n", - "\n", - "### Mandatory for 478 & 878:\n", - "\n", - "| | Tasks | 478 | 878 |\n", - "|---|----------------------------|-----|-----|\n", - "| 1 | Implement `distance` | 10 | 10 |\n", - "| 2 | Implement `k-NN` methods | 25 | 20 |\n", - "| 3 | Model evaluation | 25 | 20 |\n", - "| 4 | Learning curve | 20 | 20 |\n", - "| 6 | ROC curve analysis | 20 | 20 |\n", - "\n", - "### Mandatory for 878, bonus for 478\n", - "\n", - "| | Tasks | 478 | 878 |\n", - "|---|----------------|-----|-----|\n", - "| 5 | Optimizing *k* | 10 | 10 |\n", - "\n", - "### Bonus for 478/878\n", - "\n", - "| | Tasks | 478 | 878 |\n", - "|---|----------------|-----|-----|\n", - "| 7 | Assess suitability of *k*-NN | 10 | 10 |\n", - "\n", - "Points are broken down further below in Rubric sections. The **first** score is for 478, the **second** is for 878 students. There are a total of 100 points in this assignment and extra 20 bonus points for 478 students and 10 bonus points for 878 students." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can use numpy for array operations and matplotlib for plotting for this assignment. Please do not add other libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Following code makes the Model class and relevant functions available from model.ipynb." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%run 'model.ipynb'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 1: Implement `distance` function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Choice of distance metric plays an important role in the performance of *k*-NN. Let's start with implementing a distance method in the \"distance\" function in **model.ipynb**. It should take two data points and the name of the metric and return a scalar value." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* Euclidean +5, +5\n", - "* Manhattan +5, +5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Test `distance`" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "x = np.array(range(100))\n", - "y = np.array(range(100, 200))\n", - "dist_euclidean = distance(x, y, 'Euclidean')\n", - "dist_manhattan = distance(x, y, 'Manhattan')\n", - "print('Euclidean distance: {}, Manhattan distance: {}'.format(dist_euclidean, dist_manhattan))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 2: Implement $k$-NN Class Methods" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can start implementing our *k*-NN classifier. 
*k*-NN class inherits Model class. Use the \"distance\" function you defined above. \"fit\" method takes *k* as an argument. \"predict\" takes as input an *mxd* array containing *d*-dimensional *m* feature vectors for examples and outputs the predicted class and the ratio of positive examples in *k* nearest neighbors." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* correct implementation of fit method +5, +5\n", - "* correct implementation of predict method +20, +15" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class kNN(Model):\n", - " '''\n", - " Inherits Model class. Implements the k-NN algorithm for classification.\n", - " '''\n", - " \n", - " def fit(self, training_features, training_labels, classes, k, distance_f,**kwargs):\n", - " '''\n", - " Fit the model. This is pretty straightforward for k-NN.\n", - " Args:\n", - " training_features: ndarray\n", - " training_labels: ndarray\n", - " classes: ndarray\n", - " 1D array containing unique classes in the dataset\n", - " k: int\n", - " distance_f: function\n", - " kwargs: dict\n", - " Contains keyword arguments that will be passed to distance_f\n", - " '''\n", - " # TODO\n", - " # set self.train_features, self.train_labels, self.classes, self.k, self.distance_f, self.distance_metric\n", - " \n", - " raise NotImplementedError\n", - "\n", - " return\n", - " \n", - " \n", - " def predict(self, test_features):\n", - " '''\n", - " Args:\n", - " test_features: ndarray\n", - " mxd array containing features for the points to be predicted\n", - " Returns: \n", - " ndarray\n", - " '''\n", - " raise NotImplementedError\n", - " \n", - " pred = []\n", - " # TODO\n", - " \n", - " # for each point in test_features\n", - " # use your implementation of distance function\n", - " # distance_f(..., distance_metric)\n", - " # to find the labels of k-nearest neighbors. 
\n", - "\n", - " # you'll need proportion of the dominant class\n", - " # in k nearest neighbors\n", - " \n", - " return np.array(pred)\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 3: Build and Evaluate the Model" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* Reasonable accuracy values +10, +5\n", - "* Reasonable confidence intervals on the error estimate +10, +10\n", - "* Reasonable confusion matrix +5, +5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Preprocess the data files and partition the data." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# initialize the model\n", - "my_model = kNN()\n", - "# obtain features and labels from files\n", - "features, labels = preprocess(feature_file=..., label_file=...)\n", - "# get class names (unique entries in labels)\n", - "classes = np.unique(labels)\n", - "# partition the data set\n", - "test_indices, val_indices, train_indices = partition(size=..., t = 0.3, v = 0.1)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Assign a value to *k* and fit the *k*-NN model." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# pass the training features and labels to the fit method\n", - "kwargs_f = {'metric': 'Euclidean'}\n", - "my_model.fit(training_features=..., training_labels=..., classes=classes, k=10, distance_f=..., **kwargs_f)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Computing the confusion matrix for *k* = 10\n", - "Now that we have the true labels and the predicted ones from our model, we can build a confusion matrix and see how accurate our model is. Implement the \"conf_matrix\" function (in model.ipynb) that takes as input an array of true labels (*true*) and an array of predicted labels (*pred*).
It should output a numpy.ndarray. You do not need to change the value of the threshold parameter yet." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO\n", - "\n", - "# get model predictions\n", - "pred_ratios = my_model.predict(features[test_indices])\n", - "\n", - "# For now, we will consider a data point as predicted in a class if more than 0.5 \n", - "# of its k-neighbors are in that class.\n", - "threshold = 0.5\n", - "# convert predicted ratios to predicted labels\n", - "pred_labels = None\n", - "\n", - "# show the distribution of predicted and true labels in a confusion matrix\n", - "confusion = conf_matrix(...)\n", - "confusion" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Evaluate your model on the test data and report your **accuracy**. Also, calculate and report the 95% confidence interval on the generalization **error** estimate." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO\n", - "# Calculate and report accuracy and generalization error with confidence interval here. Show your work in this cell.\n", - "\n", - "print('Accuracy: {}'.format(accuracy))\n", - "print('Confidence interval: {}-{}'.format(lower_bound, upper_bound))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - " ## TASK 4: Plotting a learning curve\n", - " \n", - "A learning curve shows how error changes as the training set size increases. For more information, see [learning curves](https://www.dataquest.io/blog/learning-curves-machine-learning/).\n", - "We'll plot the error values for training and validation data while varying the size of the training set. Report a good size for training set for which there is a good balance between bias and variance." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* Correct training error calculation for different training set sizes +8, +8\n", - "* Correct validation error calculation for different training set sizes +8, +8\n", - "* Reasonable learning curve +4, +4" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# train using 10%, 20%, 30%, ..., 100% of training data\n", - "training_proportions = np.arange(0.10, 1.01, 0.10)\n", - "train_size = len(train_indices)\n", - "training_sizes = np.ceil(train_size*training_proportions).astype(int)\n", - "\n", - "# TODO\n", - "error_train = []\n", - "error_val = []\n", - "\n", - "# For each size in training_sizes\n", - "for size in training_sizes:\n", - " # fit the model using \"size\" data points\n", - " # Calculate error for training and validation sets\n", - " # populate error_train and error_val arrays. \n", - " # Each entry in these arrays\n", - " # should correspond to each entry in training_sizes.\n", - " raise NotImplementedError\n", - "\n", - "# plot the learning curve\n", - "plt.plot(training_sizes, error_train, 'r', label = 'training_error')\n", - "plt.plot(training_sizes, error_val, 'g', label = 'validation_error')\n", - "plt.legend()\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 5: Determining *k*" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Rubric:\n", - "* Accuracies reported with various *k* values +5, +5\n", - "* Confusion matrices shown for various *k* values +5, +5" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can use the validation set to come up with a *k* value that results in better performance in terms of accuracy.\n", - "\n", - "Below calculate the accuracies for different values of *k* using the validation set. Report a good *k* value and use it in the analyses that follow this section.
Report confusion matrix for the new value of *k*." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO\n", - "\n", - "# Change values of k. \n", - "# Calculate accuracies for the validation set.\n", - "# Report a good k value.\n", - "# Calculate the confusion matrix for new k." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 6: ROC curve analysis\n", - "* Correct implementation +20, +20" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "ROC curves are a good way to visualize sensitivity vs. 1-specificity for varying cut off points. Now, implement, in *model.ipynb*, a \"ROC\" function. \"ROC\" takes a list containing different threshold values to try and returns two arrays; one where each entry is the sensitivity at a given threshold and the other where entries are 1-specificities." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use the *k* value you found above, if you completed TASK 5, else use *k* = 10 to plot the ROC curve for values between 0.1 and 1.0." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO\n", - "# ROC curve\n", - "roc_sens, roc_spec_ = ROC(true_labels=..., preds=..., np.arange(0.1, 1.0, 0.1))\n", - "plt.plot(roc_sens, roc_spec_)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 7: Assess suitability of *k*-NN to your dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Use this cell to write about your understanding of why *k*-NN performed well if it did or why not if it didn't. What properties of the dataset could have affected the performance of the algorithm?" 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/ProgrammingAssignment_1/model.ipynb b/ProgrammingAssignment_1/model.ipynb deleted file mode 100644 index a35f7529fb6c5cb4b89733f4f10152a79fca0c59..0000000000000000000000000000000000000000 --- a/ProgrammingAssignment_1/model.ipynb +++ /dev/null @@ -1,250 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# JUPYTER NOTEBOOK TIPS\n", - "\n", - "Each rectangular box is called a cell. \n", - "* Ctrl+ENTER evaluates the current cell; if it contains Python code, it runs the code, if it contains Markdown, it returns rendered text.\n", - "* Alt+ENTER evaluates the current cell and adds a new cell below it.\n", - "* If you click to the left of a cell, you'll notice the frame changes color to blue. You can erase a cell by hitting 'dd' (that's two \"d\"s in a row) when the frame is blue." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Supervised Learning Model Skeleton\n", - "\n", - "We'll use this skeleton for implementing different supervised learning algorithms." 
class Model:
    """Skeleton base class for the supervised learning models.

    Both methods only raise NotImplementedError; concrete models are expected
    to override them.
    """

    def fit(self):
        raise NotImplementedError

    def predict(self, test_points):
        raise NotImplementedError


def preprocess(feature_file, label_file):
    '''
    Read features and labels from two text files with np.genfromtxt.

    Args:
        feature_file: str
            file containing features
        label_file: str
            file containing labels
    Returns:
        features: ndarray
            nxd features
        labels: ndarray
            nx1 labels
    '''

    # read in features and labels
    features = np.genfromtxt(feature_file)
    labels = np.genfromtxt(label_file)

    return features, labels


def partition(size, t, v = 0):
    '''
    Randomly split the indices 0..size-1 into test, validation and training sets.

    Args:
        size: int
            number of examples in the whole dataset
        t: float
            proportion kept for test
        v: float
            proportion kept for validation
    Returns:
        test_indices: ndarray
            1D array containing test set indices
        val_indices: ndarray
            1D array containing validation set indices
        train_indices: ndarray
            1D array containing every index that is in neither of the above
    '''

    # number of test and validation examples
    # (built-in int: the np.int alias no longer exists in current NumPy)
    t_size = int(np.ceil(size * t))
    v_size = int(np.ceil(size * v))

    # shuffle the indices
    permuted = np.random.permutation(size)

    # spare the first t_size for test
    test_indices = permuted[:t_size]
    # and the next v_size for validation; the slice starts right at t_size so
    # that no shuffled index is skipped between the two held-out sets
    val_indices = permuted[t_size:t_size + v_size]

    # everything that was not held out is training data
    held_out = np.append(test_indices, val_indices)
    train_indices = np.delete(np.arange(size), held_out, 0)

    return test_indices, val_indices, train_indices
"cell_type": "markdown", - "metadata": {}, - "source": [ - "## TASK 1: Implement `distance` function" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"distance\" function will be used in calculating cost of *k*-NN. It should take two data points and the name of the metric and return a scalar value." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "#TODO: Programming Assignment 1\n", - "def distance(x, y, metric):\n", - " '''\n", - " Args:\n", - " x: ndarray \n", - " 1D array containing coordinates for a point\n", - " y: ndarray\n", - " 1D array containing coordinates for a point\n", - " metric: str\n", - " Euclidean, Manhattan \n", - " Returns:\n", - " dist: float\n", - " '''\n", - " if metric == 'Euclidean':\n", - " raise NotImplementedError\n", - " elif metric == 'Manhattan':\n", - " raise NotImplementedError\n", - " else:\n", - " raise ValueError('{} is not a valid metric.'.format(metric))\n", - " return dist # scalar distance btw x and y" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## General supervised learning performance related functions " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Implement the \"conf_matrix\" function that takes as input an array of true labels (*true*) and an array of predicted labels (*pred*). It should output a numpy.ndarray." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO: Programming Assignment 1\n", - "\n", - "def conf_matrix(true, pred, n_classes):\n", - " '''\n", - " Args: \n", - " true: ndarray\n", - " nx1 array of true labels for test set\n", - " pred: ndarray \n", - " nx1 array of predicted labels for test set\n", - " n_classes: int\n", - " Returns:\n", - " result: ndarray\n", - " n_classes x n_classes array confusion matrix\n", - " '''\n", - " raise NotImplementedError\n", - " result = np.ndarray([n_classes, n_classes])\n", - " \n", - " \n", - " # returns the confusion matrix as numpy.ndarray\n", - " return result" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "ROC curves are a good way to visualize sensitivity vs. 1-specificity for varying cut off points. \"ROC\" takes a list containing different *threshold* parameter values to try and returns two arrays; one where each entry is the sensitivity at a given threshold and the other where entries are 1-specificities." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# TODO: Programming Assignment 1\n", - "\n", - "def ROC(true_labels, preds, value_list):\n", - " '''\n", - " Args:\n", - " true_labels: ndarray\n", - " 1D array containing true labels\n", - " preds: ndarray\n", - " 1D array containing thresholded value (e.g. 
proportion of neighbors in kNN)\n", - " value_list: ndarray\n", - " 1D array containing different threshold values\n", - " Returns:\n", - " sens: ndarray\n", - " 1D array containing sensitivities\n", - " spec_: ndarray\n", - " 1D array containing 1-specifities\n", - " '''\n", - " \n", - " # calculate sensitivity, 1-specificity\n", - " # return two arrays\n", - " \n", - " raise NotImplementedError\n", - " \n", - " return sens, spec_" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.4" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/ProgrammingAssignment_2/ProgrammingAssignment2.ipynb b/ProgrammingAssignment_2/ProgrammingAssignment2.ipynb deleted file mode 100644 index b92ee64c0185b3f5f3da3fd2c7d984f4e2581d72..0000000000000000000000000000000000000000 --- a/ProgrammingAssignment_2/ProgrammingAssignment2.ipynb +++ /dev/null @@ -1,451 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Linear Regression & Naive Bayes\n", - "\n", - "We'll implement linear regression & Naive Bayes algorithms for this assignment. Please modify the \"preprocess\" in this notebook and \"partition\" method in \"model.ipynb\" to suit your datasets for this assignment. In the linear regression part of this assignment, we have a small dataset available to us. We won't have examples to spare for validation set, instead we'll use cross-validation to tune hyperparameters. 
In our Naive Bayes implementation, we will not use validation set or crossvalidation.\n", - "\n", - "### Assignment Goals:\n", - "In this assignment, we will:\n", - "* implement linear regression\n", - " * use gradient descent for optimization\n", - " * use residuals to decide if we need a polynomial model\n", - " * change our model to quadratic/cubic regression and use cross-validation to find the \"best\" polynomial degree\n", - " * implement regularization techniques\n", - " * $l_1$/$l_2$ regularization\n", - " * use cross-validation to find a good regularization parameter $\\lambda$\n", - " \n", - "* implement Naive Bayes\n", - " * address sparse data problem with **pseudocounts** (**$m$-estimate**)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "You can use numpy for array operations and matplotlib for plotting for this assignment. Please do not add other libraries." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import numpy as np\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Following code makes the Model class and relevant functions available from \"model.ipynb\"." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%run 'model.ipynb'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll implement the \"preprocess\" function and \"kfold\" function for $k$-fold cross-validation in \"model.ipynb\". 5 and 10 are commonly used values for $k$. You can use either one of them." 
- ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess(file_path):\n", - " '''\n", - " file_path: where to read the dataset from\n", - " Returns:\n", - " features: ndarray\n", - " nxd array containing `float` feature values\n", - " labels: ndarray\n", - " 1D array containing `float` label\n", - " '''\n", - " # You might find np.genfromtxt useful for reading in the file. Be careful with the file delimiter, \n", - " # e.g. for comma-separated files use delimiter=',' argument.\n", - " \n", - " raise NotImplementedError\n", - "\n", - " \n", - " return features, labels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We'll need to use mean squared error (mse) for linear regression. Next, implement \"mse\" function that takes predicted and true y values, and returns the \"mse\" between them." - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def mse(y_pred, y_true):\n", - " '''\n", - " Args:\n", - " y_hat: ndarray \n", - " 1D array containing data with `float` type. Values predicted by our method\n", - " y_true: ndarray\n", - " 1D array containing data with `float` type. True y values\n", - " Returns:\n", - " cost: float\n", - " A single value. Mean squared error between y_pred and y_true.\n", - " \n", - " '''\n", - " raise NotImplementedError\n", - "\n", - " return cost\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can define our linear_regression model class now. Implement the \"fit\" and \"predict\" methods. Keep the default values for now, later we'll change the $polynomial\\_degree$. 
class linear_regression(Model):
    """Linear regression skeleton trained fold by fold.

    `Model` and `kfold` are not defined in this notebook; they are expected to
    come from "model.ipynb". The attributes `train_indices`, `train_features`
    and `train_labels` are read here but never set here.
    NOTE(review): presumably the base `Model.__init__` provides them - confirm.
    """

    def __init__(self, preprocessor_f, partition_f, **kwargs):
        super().__init__(preprocessor_f, partition_f, **kwargs)
        # `k_fold` is passed by the caller as a keyword, so it arrives in
        # kwargs; it is not a local name.
        if kwargs.get('k_fold', False):
            self.data_dict = kfold(self.train_indices, k = kwargs['k'])
        # counter for train fold
        self.i = 0
        # counter for test fold
        self.j = 0

    # You can disregard polynomial_degree and regularizer in your first pass
    def fit(self, learning_rate = 0.001, epochs = 1000, regularizer=None, polynomial_degree=1, **kwargs):
        """Fit on the training fold selected by `self.i`, then advance `self.i`.

        The optimisation itself is left as an exercise: both branches still
        raise NotImplementedError.
        """

        train_features = self.train_features[self.data_dict[self.i]]
        train_labels = self.train_labels[self.data_dict[self.i]]

        #initialize theta_cur randomly

        # for each epoch
        # compute model predictions for training examples
        y_hat = None

        if regularizer is None:

            # use mse function to find the cost
            cost = None
            # calculate gradients wrt theta
            grad_theta = None
            # update theta
            theta_curr = None
            raise NotImplementedError

        else:
            # take regularization into account
            raise NotImplementedError

        # update the model parameters to be used in predict method
        self.theta = theta_curr
        # increment counter for next fold
        self.i += 1

    def predict(self, indices):
        """Predict for the fold selected by `self.j`, then advance `self.j`.

        `indices` is accepted but not used by the skeleton as written. The
        prediction itself is left as an exercise (raises NotImplementedError).
        """

        # obtain test features for current fold

        test_features = self.train_features[self.data_dict[self.j]]
        raise NotImplementedError

        # increment counter for next fold
        self.j += 1
        return y_hat
[], - "source": [ - "# populate the keyword arguments dictionary kwargs\n", - "# p: proportion for test data\n", - "# k: parameter for k-fold crossvalidation\n", - "kwargs = {'p': 0.3, 'v': 0.1, 'file_path': 'madelon', 'k': 1}\n", - "# initialize the model\n", - "my_model = linear_regression(preprocessor_f=preprocess, partition_f=partition, k_fold=True, **kwargs)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# use fit_kwargs to pass arguments to regularization function\n", - "# fit_kwargs is empty for now since we are not applying \n", - "# regularization yet\n", - "fit_kwargs = {}\n", - "my_model.fit(**fit_kwargs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Residuals are the differences between the predicted value $y_{hat}$ and the true value $y$ for each example. Predict $y_{hat}$ for the validation set." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "y_hat_val = my_model.predict(my_model.features[my_model.val_indices])\n", - "residuals = my_model.labels[my_model.val_indices] - y_hat_val\n", - "plt.plot(residuals)\n", - "plt.show()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If the data is better suited for quadratic/cubic regression, regions of positive and negative residuals will alternate in the plot. Regardless, modify fit\" and \"predict\" in the class definition to raise the feature values to $polynomial\\_degree$. You can directly make the modification in the above definition, do not repeat. Use the validation set to find the degree of polynomial that results in lowest _mse_." 
kwargs = {'p': 0.3, 'file_path': 'madelon', 'k': 5}
# initialize the model
my_model = linear_regression(preprocessor_f=preprocess, partition_f=partition, k_fold=True, **kwargs)

fit_kwargs = {}

# calculate mse for each of linear model, quadratic and cubic models
# and append to mses_for_models

mses_for_models = []

# number of folds; also the divisor for the per-degree average
n_folds = kwargs['k']

for i in range(1,4):
    k_fold_mse = 0
    for k in range(n_folds):
        my_model.fit(polynomial_degree = i ,**fit_kwargs)
        # predict() selects the fold through its own counter, so no fold
        # argument is passed.
        # NOTE(review): fit()/predict() advance their fold counters on every
        # call and nothing resets them between degrees - confirm kfold's
        # return value supports 3 * n_folds lookups, or reset the counters.
        pred = my_model.predict(my_model.features[my_model.val_indices])
        k_fold_mse += mse(pred, my_model.labels[my_model.val_indices])
    # average over the folds for this polynomial degree
    mses_for_models.append(k_fold_mse/n_folds)


def regularization(weights, method):
    '''
    Compute the regularization term for the given weights.

    The l1 and l2 branches are left as an exercise and still raise
    NotImplementedError.

    Args:
        weights: ndarray
            1D array with `float` entries
        method: str
            "l1" or "l2"
    Returns:
        value: float
            A single value. Regularization term that will be used in cost function in fit.
    Raises:
        ValueError: if `method` is neither "l1" nor "l2"
    '''
    if method == "l1":
        value = None
        raise NotImplementedError
    elif method == "l2":
        value = None
        raise NotImplementedError
    else:
        # without this branch an unknown method reaches `return value`
        # with `value` unbound
        raise ValueError('{} is not a valid regularization method.'.format(method))
    return value
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Naive Bayes Spam Classifier\n", - "\n", - "This part is independent of the above part. We will use the Enron spam/ham dataset. You will need to decompress the provided \"enron.tar.gz\" folder. The two subfolders contain spam and ham emails.\n", - "\n", - "The features for Naive Bayes algorithm will be word counts. Number of features will be equal to the unique words seen in the whole dataset. The \"preprocess\" function will be more involved this time. You'll need to remove pucntuation marks (you may find string.punctuation useful), tokenize text to words (remember to lowercase all) and count the number of words." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess_bayes(folder_path):\n", - " '''\n", - " Args:\n", - " folder_path: str\n", - " Where to read the dataset from.\n", - " Returns:\n", - " features: ndarray\n", - " nxd array with n emails, d words. features_ij is the count of word_j in email_i\n", - " labels: ndarray\n", - " 1D array of labels (1: spam, 0: ham)\n", - " '''\n", - " # remove punctutaion marks\n", - " # tokenize, lowercase\n", - " # count number of words in each email\n", - " \n", - " raise NotImplementedError\n", - "\n", - " \n", - " return features, labels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Implement the \"fit\" and \"predict\" methods for Naive Bayes. Use $m$-estimate to address missing attribute values (also called **Laplace smoothing** when $m$ = 1). In general, $m$ values should be small. We'll use $m$ = 1." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "class naive_bayes(Model):\n", - " def __init__(self, preprocessor_f, partition_f, **kwargs):\n", - " super().__init__(preprocessor_f, partition_f, **kwargs)\n", - " \n", - " def fit(self, m, **kwargs):\n", - " \n", - " self.ham_word_counts = np.zeros(self.feat_dim)\n", - " self.spam_word_counts = np.zeros(self.feat_dim)\n", - " \n", - " # find class prior probabilities\n", - " self.ham_prior = None\n", - " self.spam_prior = None\n", - " # find the number of words(counting repeats) summed across all emails in a class\n", - " n = None\n", - " # find the number of each word summed across all emails in a class\n", - " # populate self.ham_word_counts and self.spam_word_counts\n", - " \n", - " # find the likelihood of a word_i in each class\n", - " # 1D ndarray\n", - " self.ham_likelihood = None\n", - " self.spam_likelihood = None\n", - " \n", - " \n", - " def predict(self, indices):\n", - " '''\n", - " Returns:\n", - " preds: ndarray\n", - " 1D binary array containing predicted labels\n", - " '''\n", - " raise NotImplementedError\n", - " \n", - " return preds\n", - " " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can fit our model and see how accurately it predicts spam emails now. We won't use a validation set or crossvalidation this time." 
# populate the keyword arguments dictionary kwargs
# p: proportion for test data
kwargs = {'p': 0.3, 'file_path': 'enron'}
# initialize the model: this section evaluates the Naive Bayes classifier,
# so build a naive_bayes instance (not linear_regression)
my_model = naive_bayes(preprocessor_f=preprocess_bayes, partition_f=partition, **kwargs)

# fit before predicting; m = 1 is the m-estimate value chosen in the text above
my_model.fit(m=1)


# We can use the "conf_matrix" function we defined before to see how error is distributed.
preds = my_model.predict(my_model.test_indices)
# compare predictions against the true *labels* of the test examples
# NOTE(review): the conf_matrix definitions visible in the model notebooks take
# a third argument (n_classes / threshold) - confirm which signature this
# assignment's model.ipynb exposes and pass it here.
tp,tn, fp, fn = conf_matrix(true = my_model.labels[my_model.test_indices], pred = preds)
The \"preprocess\" function should return an $n \\times d$ features array, and an $n \\times 1$ labels array, where $n$ is the number of examples and $d$ is the number of features in the dataset." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def preprocess(file_path):\n", - " '''\n", - " file_path: where to read the dataset from\n", - " returns nxd features, nx1 labels\n", - " '''\n", - " # You might find np.genfromtxt useful for reading in the file. Be careful with the file delimiter, \n", - " # e.g. for comma-separated files use delimiter=',' argument.\n", - " \n", - " #raise NotImplementedError\n", - " feature_path = file_path + '.data'\n", - " label_path = file_path + '.labels'\n", - " features = np.genfromtxt(feature_path)\n", - " labels = np.genfromtxt(label_path)\n", - " #features = data[:, 1:]\n", - " #labels = data[:, 0]\n", - " \n", - "####################\n", - " \n", - " return features, labels" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, you'll need to split your dataset into training and validation and test sets. The \"split\" function should take as input the size of the whole dataset and randomly sample a proportion $p$ of the dataset as test partition and a proportion of $v$ as validation partition. The remaining will be used as training data. For example, to keep 30% of the examples as test and %10 as validation, set $p=0.3$ and $v=0.1$. You should choose these values according to the size of the data available to you. The \"split\" function should return indices of the training, validation and test sets. These will be used to index into the whole training set." 
def partition(size, p, v, seed):
    '''
    Randomly pick disjoint test and validation index sets from 0..size-1.

    size: number of examples in the whole dataset
    p: proportion kept for test
    v: proportion kept for validation
    seed: value handed to np.random.seed before shuffling, so that the same
          seed reproduces the same split

    returns two 1d arrays: one keeping validation set indices, the other
    keeping test set indices (in that order)
    '''

    # built-in int: the np.int alias no longer exists in current NumPy
    p_size = int(np.ceil(size * p))
    v_size = int(np.ceil(size * v))

    np.random.seed(seed)
    permuted = np.random.permutation(np.arange(size))

    # first p_size shuffled indices are the test set ...
    test_indices = permuted[:p_size]
    # ... and exactly the next v_size are the validation set (no index is
    # skipped between the two, and the slice has v_size entries)
    val_indices = permuted[p_size:p_size + v_size]

    return val_indices, test_indices


class Model:
    # set the preprocessing function, partition_function
    # use kwargs to pass arguments to preprocessor_f and partition_f
    # kwargs is a dictionary and should contain p, v, seed and file_path
    # e.g. {'p': 0.3, 'v': 0.1, 'seed': 0, 'file_path': some_path}

    def __init__(self, preprocessor_f, partition_f, distance_f=None, **kwargs):

        self.features, self.labels = preprocessor_f(kwargs['file_path'])
        self.size = len(self.labels)
        self.val_indices, self.test_indices = partition_f(self.size, kwargs['p'], kwargs['v'], kwargs['seed'])
        # training set = every index that is in neither held-out set
        self.training_indices = np.delete(np.arange(self.size), np.append(self.test_indices, self.val_indices), 0)

    def fit(self):
        raise NotImplementedError

    def predict(self):
        raise NotImplementedError


def conf_matrix(true_l, pred, threshold):
    '''
    Count binary classification outcomes at a given threshold.

    true_l: true labels; a label equal to 1 is treated as positive, anything
            else as negative
    pred: scores; a score strictly greater than `threshold` is predicted as 1,
          otherwise as -1
    threshold: cut-off applied to `pred`

    returns a numpy.ndarray [tp, tn, fp, fn]
    '''
    tp = tn = fp = fn = 0

    for i in range(len(true_l)):
        predicted_label = 1 if pred[i] > threshold else -1
        is_positive = true_l[i] == 1

        if predicted_label == true_l[i]:
            if is_positive:
                tp += 1
            else:
                tn += 1
        else:
            if is_positive:
                fn += 1
            else:
                fp += 1

    return np.array([tp, tn, fp, fn])