{"cells":[{"cell_type":"markdown","metadata":{"id":"79rWt21wd_5Z"},"source":["The decision tree algorithm can be used for both classification and regression, and it has the advantage of not assuming a linear model. Decision trees are usually easy to represent visually, which makes it easy to understand how the model actually works."]},{"cell_type":"markdown","metadata":{"id":"e2ipLHcwd_5Z"},"source":["### Geometric Intuition\n","![image](https://docs.microsoft.com/en-us/azure/machine-learning/studio/media/algorithm-choice/image5.png)"]},{"cell_type":"code","execution_count":1,"metadata":{"id":"eGym5xlHd_5c","executionInfo":{"status":"ok","timestamp":1711469960227,"user_tz":240,"elapsed":2289,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[],"source":["# import necessary packages\n","import pandas as pd\n","import numpy as np\n","from sklearn.metrics import accuracy_score\n","from sklearn.model_selection import train_test_split\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn import tree\n","from sklearn import datasets"]},{"cell_type":"code","execution_count":3,"metadata":{"id":"_gf8r9z8d_5f","outputId":"8ae8e7db-bfeb-4d98-9988-ef1877e95513","colab":{"base_uri":"https://localhost:8080/","height":270},"executionInfo":{"status":"ok","timestamp":1711470614179,"user_tz":240,"elapsed":521,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":["         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \\\n","0    842302         M        17.99         10.38          122.80     1001.0   \n","1    842517         M        20.57         17.77          132.90     1326.0   \n","2  84300903         M        19.69         21.25          130.00     1203.0   \n","3  84348301         M        11.42         20.38           77.58      386.1   \n","4  84358402         M       
 20.29         14.34          135.10     1297.0   \n","\n","   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \\\n","0          0.11840           0.27760          0.3001              0.14710   \n","1          0.08474           0.07864          0.0869              0.07017   \n","2          0.10960           0.15990          0.1974              0.12790   \n","3          0.14250           0.28390          0.2414              0.10520   \n","4          0.10030           0.13280          0.1980              0.10430   \n","\n","   ...  texture_worst  perimeter_worst  area_worst  smoothness_worst  \\\n","0  ...          17.33           184.60      2019.0            0.1622   \n","1  ...          23.41           158.80      1956.0            0.1238   \n","2  ...          25.53           152.50      1709.0            0.1444   \n","3  ...          26.50            98.87       567.7            0.2098   \n","4  ...          16.67           152.20      1575.0            0.1374   \n","\n","   compactness_worst  concavity_worst  concave points_worst  symmetry_worst  \\\n","0             0.6656           0.7119                0.2654          0.4601   \n","1             0.1866           0.2416                0.1860          0.2750   \n","2             0.4245           0.4504                0.2430          0.3613   \n","3             0.8663           0.6869                0.2575          0.6638   \n","4             0.2050           0.4000                0.1625          0.2364   \n","\n","   fractal_dimension_worst  Unnamed: 32  \n","0                  0.11890          NaN  \n","1                  0.08902          NaN  \n","2                  0.08758          NaN  \n","3                  0.17300          NaN  \n","4                  0.07678          NaN  \n","\n","[5 rows x 33 columns]"],"text/html":["\n","  <div id=\"df-c7d47a00-10b8-4e52-a993-c332ef5ce664\" class=\"colab-df-container\">\n","    <div>\n","<style scoped>\n","    .dataframe tbody tr 
th:only-of-type {\n","        vertical-align: middle;\n","    }\n","\n","    .dataframe tbody tr th {\n","        vertical-align: top;\n","    }\n","\n","    .dataframe thead th {\n","        text-align: right;\n","    }\n","</style>\n","<table border=\"1\" class=\"dataframe\">\n","  <thead>\n","    <tr style=\"text-align: right;\">\n","      <th></th>\n","      <th>id</th>\n","      <th>diagnosis</th>\n","      <th>radius_mean</th>\n","      <th>texture_mean</th>\n","      <th>perimeter_mean</th>\n","      <th>area_mean</th>\n","      <th>smoothness_mean</th>\n","      <th>compactness_mean</th>\n","      <th>concavity_mean</th>\n","      <th>concave points_mean</th>\n","      <th>...</th>\n","      <th>texture_worst</th>\n","      <th>perimeter_worst</th>\n","      <th>area_worst</th>\n","      <th>smoothness_worst</th>\n","      <th>compactness_worst</th>\n","      <th>concavity_worst</th>\n","      <th>concave points_worst</th>\n","      <th>symmetry_worst</th>\n","      <th>fractal_dimension_worst</th>\n","      <th>Unnamed: 32</th>\n","    </tr>\n","  </thead>\n","  <tbody>\n","    <tr>\n","      <th>0</th>\n","      <td>842302</td>\n","      <td>M</td>\n","      <td>17.99</td>\n","      <td>10.38</td>\n","      <td>122.80</td>\n","      <td>1001.0</td>\n","      <td>0.11840</td>\n","      <td>0.27760</td>\n","      <td>0.3001</td>\n","      <td>0.14710</td>\n","      <td>...</td>\n","      <td>17.33</td>\n","      <td>184.60</td>\n","      <td>2019.0</td>\n","      <td>0.1622</td>\n","      <td>0.6656</td>\n","      <td>0.7119</td>\n","      <td>0.2654</td>\n","      <td>0.4601</td>\n","      <td>0.11890</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>1</th>\n","      <td>842517</td>\n","      <td>M</td>\n","      <td>20.57</td>\n","      <td>17.77</td>\n","      <td>132.90</td>\n","      <td>1326.0</td>\n","      <td>0.08474</td>\n","      <td>0.07864</td>\n","      <td>0.0869</td>\n","      <td>0.07017</td>\n","      <td>...</td>\n","   
   <td>23.41</td>\n","      <td>158.80</td>\n","      <td>1956.0</td>\n","      <td>0.1238</td>\n","      <td>0.1866</td>\n","      <td>0.2416</td>\n","      <td>0.1860</td>\n","      <td>0.2750</td>\n","      <td>0.08902</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>2</th>\n","      <td>84300903</td>\n","      <td>M</td>\n","      <td>19.69</td>\n","      <td>21.25</td>\n","      <td>130.00</td>\n","      <td>1203.0</td>\n","      <td>0.10960</td>\n","      <td>0.15990</td>\n","      <td>0.1974</td>\n","      <td>0.12790</td>\n","      <td>...</td>\n","      <td>25.53</td>\n","      <td>152.50</td>\n","      <td>1709.0</td>\n","      <td>0.1444</td>\n","      <td>0.4245</td>\n","      <td>0.4504</td>\n","      <td>0.2430</td>\n","      <td>0.3613</td>\n","      <td>0.08758</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>3</th>\n","      <td>84348301</td>\n","      <td>M</td>\n","      <td>11.42</td>\n","      <td>20.38</td>\n","      <td>77.58</td>\n","      <td>386.1</td>\n","      <td>0.14250</td>\n","      <td>0.28390</td>\n","      <td>0.2414</td>\n","      <td>0.10520</td>\n","      <td>...</td>\n","      <td>26.50</td>\n","      <td>98.87</td>\n","      <td>567.7</td>\n","      <td>0.2098</td>\n","      <td>0.8663</td>\n","      <td>0.6869</td>\n","      <td>0.2575</td>\n","      <td>0.6638</td>\n","      <td>0.17300</td>\n","      <td>NaN</td>\n","    </tr>\n","    <tr>\n","      <th>4</th>\n","      <td>84358402</td>\n","      <td>M</td>\n","      <td>20.29</td>\n","      <td>14.34</td>\n","      <td>135.10</td>\n","      <td>1297.0</td>\n","      <td>0.10030</td>\n","      <td>0.13280</td>\n","      <td>0.1980</td>\n","      <td>0.10430</td>\n","      <td>...</td>\n","      <td>16.67</td>\n","      <td>152.20</td>\n","      <td>1575.0</td>\n","      <td>0.1374</td>\n","      <td>0.2050</td>\n","      <td>0.4000</td>\n","      <td>0.1625</td>\n","      <td>0.2364</td>\n","      <td>0.07678</td>\n","      
<td>NaN</td>\n","    </tr>\n","  </tbody>\n","</table>\n","<p>5 rows × 33 columns</p>\n","</div>\n","    <div class=\"colab-df-buttons\">\n","\n","  <div class=\"colab-df-container\">\n","    <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-c7d47a00-10b8-4e52-a993-c332ef5ce664')\"\n","            title=\"Convert this dataframe to an interactive table.\"\n","            style=\"display:none;\">\n","\n","  <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n","    <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n","  </svg>\n","    </button>\n","\n","  <style>\n","    .colab-df-container {\n","      display:flex;\n","      gap: 12px;\n","    }\n","\n","    .colab-df-convert {\n","      background-color: #E8F0FE;\n","      border: none;\n","      border-radius: 50%;\n","      cursor: pointer;\n","      display: none;\n","      fill: #1967D2;\n","      height: 32px;\n","      padding: 0 0 0 0;\n","      width: 32px;\n","    }\n","\n","    .colab-df-convert:hover {\n","      background-color: #E2EBFA;\n","      box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);\n","      fill: #174EA6;\n","    }\n","\n","    .colab-df-buttons div {\n","      margin-bottom: 4px;\n","    }\n","\n","    [theme=dark] .colab-df-convert {\n","      background-color: #3B4455;\n","      fill: #D2E3FC;\n","    }\n","\n","    [theme=dark] .colab-df-convert:hover {\n","      background-color: #434B5C;\n","      box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);\n","      filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));\n","      fill: #FFFFFF;\n","    }\n","  </style>\n","\n","    <script>\n","      const buttonEl =\n","        document.querySelector('#df-c7d47a00-10b8-4e52-a993-c332ef5ce664 button.colab-df-convert');\n","      
buttonEl.style.display =\n","        google.colab.kernel.accessAllowed ? 'block' : 'none';\n","\n","      async function convertToInteractive(key) {\n","        const element = document.querySelector('#df-c7d47a00-10b8-4e52-a993-c332ef5ce664');\n","        const dataTable =\n","          await google.colab.kernel.invokeFunction('convertToInteractive',\n","                                                    [key], {});\n","        if (!dataTable) return;\n","\n","        const docLinkHtml = 'Like what you see? Visit the ' +\n","          '<a target=\"_blank\" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'\n","          + ' to learn more about interactive tables.';\n","        element.innerHTML = '';\n","        dataTable['output_type'] = 'display_data';\n","        await google.colab.output.renderOutput(dataTable, element);\n","        const docLink = document.createElement('div');\n","        docLink.innerHTML = docLinkHtml;\n","        element.appendChild(docLink);\n","      }\n","    </script>\n","  </div>\n","\n","\n","<div id=\"df-ac8e9060-065d-4607-a7b3-a86acd1947cf\">\n","  <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-ac8e9060-065d-4607-a7b3-a86acd1947cf')\"\n","            title=\"Suggest charts\"\n","            style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n","     width=\"24px\">\n","    <g>\n","        <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n","    </g>\n","</svg>\n","  </button>\n","\n","<style>\n","  .colab-df-quickchart {\n","      --bg-color: #E8F0FE;\n","      --fill-color: #1967D2;\n","      --hover-bg-color: #E2EBFA;\n","      --hover-fill-color: #174EA6;\n","      --disabled-fill-color: #AAA;\n","      --disabled-bg-color: #DDD;\n","  }\n","\n","  [theme=dark] .colab-df-quickchart {\n","      --bg-color: #3B4455;\n","      
--fill-color: #D2E3FC;\n","      --hover-bg-color: #434B5C;\n","      --hover-fill-color: #FFFFFF;\n","      --disabled-bg-color: #3B4455;\n","      --disabled-fill-color: #666;\n","  }\n","\n","  .colab-df-quickchart {\n","    background-color: var(--bg-color);\n","    border: none;\n","    border-radius: 50%;\n","    cursor: pointer;\n","    display: none;\n","    fill: var(--fill-color);\n","    height: 32px;\n","    padding: 0;\n","    width: 32px;\n","  }\n","\n","  .colab-df-quickchart:hover {\n","    background-color: var(--hover-bg-color);\n","    box-shadow: 0 1px 2px rgba(60, 64, 67, 0.3), 0 1px 3px 1px rgba(60, 64, 67, 0.15);\n","    fill: var(--button-hover-fill-color);\n","  }\n","\n","  .colab-df-quickchart-complete:disabled,\n","  .colab-df-quickchart-complete:disabled:hover {\n","    background-color: var(--disabled-bg-color);\n","    fill: var(--disabled-fill-color);\n","    box-shadow: none;\n","  }\n","\n","  .colab-df-spinner {\n","    border: 2px solid var(--fill-color);\n","    border-color: transparent;\n","    border-bottom-color: var(--fill-color);\n","    animation:\n","      spin 1s steps(1) infinite;\n","  }\n","\n","  @keyframes spin {\n","    0% {\n","      border-color: transparent;\n","      border-bottom-color: var(--fill-color);\n","      border-left-color: var(--fill-color);\n","    }\n","    20% {\n","      border-color: transparent;\n","      border-left-color: var(--fill-color);\n","      border-top-color: var(--fill-color);\n","    }\n","    30% {\n","      border-color: transparent;\n","      border-left-color: var(--fill-color);\n","      border-top-color: var(--fill-color);\n","      border-right-color: var(--fill-color);\n","    }\n","    40% {\n","      border-color: transparent;\n","      border-right-color: var(--fill-color);\n","      border-top-color: var(--fill-color);\n","    }\n","    60% {\n","      border-color: transparent;\n","      border-right-color: var(--fill-color);\n","    }\n","    80% {\n","      
border-color: transparent;\n","      border-right-color: var(--fill-color);\n","      border-bottom-color: var(--fill-color);\n","    }\n","    90% {\n","      border-color: transparent;\n","      border-bottom-color: var(--fill-color);\n","    }\n","  }\n","</style>\n","\n","  <script>\n","    async function quickchart(key) {\n","      const quickchartButtonEl =\n","        document.querySelector('#' + key + ' button');\n","      quickchartButtonEl.disabled = true;  // To prevent multiple clicks.\n","      quickchartButtonEl.classList.add('colab-df-spinner');\n","      try {\n","        const charts = await google.colab.kernel.invokeFunction(\n","            'suggestCharts', [key], {});\n","      } catch (error) {\n","        console.error('Error during call to suggestCharts:', error);\n","      }\n","      quickchartButtonEl.classList.remove('colab-df-spinner');\n","      quickchartButtonEl.classList.add('colab-df-quickchart-complete');\n","    }\n","    (() => {\n","      let quickchartButtonEl =\n","        document.querySelector('#df-ac8e9060-065d-4607-a7b3-a86acd1947cf button');\n","      quickchartButtonEl.style.display =\n","        google.colab.kernel.accessAllowed ? 
'block' : 'none';\n","    })();\n","  </script>\n","</div>\n","    </div>\n","  </div>\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"df"}},"metadata":{},"execution_count":3}],"source":["# Load the dataset (path is relative to the notebook's working directory)\n","df = pd.read_csv('lecture7example.csv')\n","# Features: every column except the row id, the label, and 'Unnamed: 32'\n","X=df.drop(['id', 'diagnosis', 'Unnamed: 32'], axis=1)\n","# Target: the diagnosis label\n","Y=df['diagnosis']\n","df.head()"]},{"cell_type":"markdown","metadata":{"id":"JOYI2tFu4ZKE"},"source":["# Decision Tree with Ordinary Train Test Split"]},{"cell_type":"code","source":["# NOTE(review): no other cell reads from /content/drive (the CSV is loaded by relative path) - confirm this mount is still needed\n","from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2d75MqTcnUDq","executionInfo":{"status":"ok","timestamp":1711470807875,"user_tz":240,"elapsed":18867,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}},"outputId":"5da55a8e-4fa7-4f07-8abe-d2246017546b"},"execution_count":4,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"code","execution_count":5,"metadata":{"id":"xJUqYe9md_5h","executionInfo":{"status":"ok","timestamp":1711470809961,"user_tz":240,"elapsed":189,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[],"source":["# Hold out 33% of the rows for testing; random_state makes the split repeatable\n","X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1998)"]},{"cell_type":"code","execution_count":6,"metadata":{"id":"7xSPG4FId_5j","outputId":"76a7b909-2c84-4c06-8b9a-0c284fb677ac","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1711470821496,"user_tz":240,"elapsed":300,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Train Accuracy:  0.994750656167979\n","Test Accuracy:  0.9095744680851063\n"]}],"source":["\n","# Creates the Decision Tree Classifier.\n","# random_state is fixed so that fitting on the same data always builds the same tree\n","# and the reported accuracies can be reproduced on a fresh run.\n","model=tree.DecisionTreeClassifier(max_depth=5, random_state=1998)\n","\n","# Train the model on the training split\n","model.fit(X_train, Y_train)\n","\n","#TODO: Calculate the 
training and testing accuracy\n","dtree_predict_train = model.predict(X_train)\n","dtree_predict_test = model.predict(X_test)\n","\n","print(\"Train Accuracy: \", accuracy_score(Y_train, dtree_predict_train))\n","print(\"Test Accuracy: \", accuracy_score(Y_test, dtree_predict_test) )\n"]},{"cell_type":"markdown","metadata":{"id":"zBHK8Coy4ZKH"},"source":["# Decision Tree with K-Fold Cross Validation"]},{"cell_type":"code","execution_count":7,"metadata":{"id":"s8p7ER1rd_5o","outputId":"2eca4b0c-951e-4983-d6f8-92d90adfcf48","scrolled":true,"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1711470838420,"user_tz":240,"elapsed":530,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Test Accuracy:  0.8771929824561403\n","Test Accuracy:  0.9210526315789473\n","Test Accuracy:  0.9473684210526315\n","Test Accuracy:  0.9385964912280702\n","Test Accuracy:  0.9026548672566371\n","Decision Tree Model performance using k-fold CV: 0.9173730787144854\n"]}],"source":["from sklearn.model_selection import KFold\n","\n","incX = X\n","incY = Y\n","\n","# 5 folds; KFold does not shuffle by default, so each fold is a contiguous block of rows\n","kf = KFold(n_splits=5)\n","\n","scorelist = []\n","\n","# Fold-local names keep the hold-out split (X_train, X_test, Y_train, Y_test)\n","# from the earlier train_test_split cell intact for the cells that follow.\n","for train_index, test_index in kf.split(incX):\n","  X_fold_train = incX.iloc[train_index]\n","  Y_fold_train = incY.iloc[train_index]\n","  X_fold_test = incX.iloc[test_index]\n","  Y_fold_test = incY.iloc[test_index]\n","\n","  # Refit on this fold's training rows, then score on the rows it did not see\n","  model.fit(X_fold_train, Y_fold_train)\n","  dtree_pred_test = model.predict(X_fold_test)\n","  fold_accuracy = accuracy_score(Y_fold_test, dtree_pred_test)\n","  print(\"Test Accuracy: \", fold_accuracy)\n","  scorelist.append(fold_accuracy)\n","\n","print(\"Decision Tree Model performance using k-fold CV: \" + str(np.mean(scorelist)))\n"]},{"cell_type":"markdown","metadata":{"id":"wheZI5Dad_5q"},"source":["# Logistic Regression 
Classifier"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"I33OQQAU4ZKK","outputId":"35f6affc-2a1e-432c-cee1-dfbf83fd2957","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1698267486920,"user_tz":240,"elapsed":167,"user":{"displayName":"Mericel Tao","userId":"04864522569170138846"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Train Accuracy:  0.958005249343832\n","Test Accuracy:  0.9521276595744681\n"]},{"output_type":"stream","name":"stderr","text":["/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py:458: ConvergenceWarning: lbfgs failed to converge (status=1):\n","STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n","\n","Increase the number of iterations (max_iter) or scale the data as shown in:\n","    https://scikit-learn.org/stable/modules/preprocessing.html\n","Please also refer to the documentation for alternative solver options:\n","    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n","  n_iter_i = _check_optimize_result(\n"]}],"source":["from sklearn.linear_model import LogisticRegression\n","\n","# Recreate the hold-out split so this cell does not depend on whichever\n","# X_train/X_test/Y_train/Y_test an earlier cell assigned last.\n","X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=1998)\n","\n","# Raise the iteration cap: with the default limit lbfgs stops before converging\n","# on these unscaled features and emits a ConvergenceWarning.\n","logreg = LogisticRegression(max_iter=10000)\n","\n","logreg.fit(X_train,Y_train)\n","\n","# Training and testing accuracy on the hold-out split\n","logreg_pred_train = logreg.predict(X_train)\n","logreg_pred_test = logreg.predict(X_test)\n","print(\"Train Accuracy: \", accuracy_score(Y_train, logreg_pred_train))\n","print(\"Test Accuracy: \", accuracy_score(Y_test, logreg_pred_test))"]},{"cell_type":"code","source":[],"metadata":{"id":"cpL0oyDIoNgR"},"execution_count":null,"outputs":[]}],"metadata":{"colab":{"provenance":[]},"kernelspec":{"display_name":"Python 3 
(ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.3"},"notebookId":"^EG=G=gSDql5SN"},"nbformat":4,"nbformat_minor":0}