{"cells":[{"cell_type":"markdown","metadata":{"id":"79rWt21wd_5Z"},"source":["The decision tree algorithm can be used for both classification and regression, and it has the advantage of not assuming a linear model. Decision trees are usually easy to represent visually, which makes it easy to understand how the model actually works."]},{"cell_type":"markdown","metadata":{"id":"e2ipLHcwd_5Z"},"source":["### Geometric Intuition\n","![image](https://docs.microsoft.com/en-us/azure/machine-learning/studio/media/algorithm-choice/image5.png)"]},{"cell_type":"code","execution_count":1,"metadata":{"id":"eGym5xlHd_5c","executionInfo":{"status":"ok","timestamp":1711469960227,"user_tz":240,"elapsed":2289,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[],"source":["# import necessary packages\n","import pandas as pd\n","import numpy as np\n","from sklearn.metrics import accuracy_score\n","from sklearn.model_selection import train_test_split\n","from sklearn.tree import DecisionTreeClassifier\n","from sklearn.neighbors import KNeighborsClassifier\n","from sklearn import tree\n","from sklearn import datasets"]},{"cell_type":"code","execution_count":3,"metadata":{"id":"_gf8r9z8d_5f","outputId":"8ae8e7db-bfeb-4d98-9988-ef1877e95513","colab":{"base_uri":"https://localhost:8080/","height":270},"executionInfo":{"status":"ok","timestamp":1711470614179,"user_tz":240,"elapsed":521,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[{"output_type":"execute_result","data":{"text/plain":[" id diagnosis radius_mean texture_mean perimeter_mean area_mean \\\n","0 842302 M 17.99 10.38 122.80 1001.0 \n","1 842517 M 20.57 17.77 132.90 1326.0 \n","2 84300903 M 19.69 21.25 130.00 1203.0 \n","3 84348301 M 11.42 20.38 77.58 386.1 \n","4 84358402 M 20.29 14.34 135.10 1297.0 \n","\n"," smoothness_mean compactness_mean concavity_mean concave points_mean \\\n","0 0.11840 0.27760 0.3001 0.14710 \n","1 0.08474 0.07864 0.0869 0.07017 \n","2 
0.10960 0.15990 0.1974 0.12790 \n","3 0.14250 0.28390 0.2414 0.10520 \n","4 0.10030 0.13280 0.1980 0.10430 \n","\n"," ... texture_worst perimeter_worst area_worst smoothness_worst \\\n","0 ... 17.33 184.60 2019.0 0.1622 \n","1 ... 23.41 158.80 1956.0 0.1238 \n","2 ... 25.53 152.50 1709.0 0.1444 \n","3 ... 26.50 98.87 567.7 0.2098 \n","4 ... 16.67 152.20 1575.0 0.1374 \n","\n"," compactness_worst concavity_worst concave points_worst symmetry_worst \\\n","0 0.6656 0.7119 0.2654 0.4601 \n","1 0.1866 0.2416 0.1860 0.2750 \n","2 0.4245 0.4504 0.2430 0.3613 \n","3 0.8663 0.6869 0.2575 0.6638 \n","4 0.2050 0.4000 0.1625 0.2364 \n","\n"," fractal_dimension_worst Unnamed: 32 \n","0 0.11890 NaN \n","1 0.08902 NaN \n","2 0.08758 NaN \n","3 0.17300 NaN \n","4 0.07678 NaN \n","\n","[5 rows x 33 columns]"],"text/html":["\n","
\n","
\n","\n","\n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n"," \n","
iddiagnosisradius_meantexture_meanperimeter_meanarea_meansmoothness_meancompactness_meanconcavity_meanconcave points_mean...texture_worstperimeter_worstarea_worstsmoothness_worstcompactness_worstconcavity_worstconcave points_worstsymmetry_worstfractal_dimension_worstUnnamed: 32
0842302M17.9910.38122.801001.00.118400.277600.30010.14710...17.33184.602019.00.16220.66560.71190.26540.46010.11890NaN
1842517M20.5717.77132.901326.00.084740.078640.08690.07017...23.41158.801956.00.12380.18660.24160.18600.27500.08902NaN
284300903M19.6921.25130.001203.00.109600.159900.19740.12790...25.53152.501709.00.14440.42450.45040.24300.36130.08758NaN
384348301M11.4220.3877.58386.10.142500.283900.24140.10520...26.5098.87567.70.20980.86630.68690.25750.66380.17300NaN
484358402M20.2914.34135.101297.00.100300.132800.19800.10430...16.67152.201575.00.13740.20500.40000.16250.23640.07678NaN
\n","

5 rows × 33 columns

\n","
\n","
\n","\n","
\n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","
\n"," \n","\n","\n","\n"," \n","
\n","
\n","
\n"],"application/vnd.google.colaboratory.intrinsic+json":{"type":"dataframe","variable_name":"df"}},"metadata":{},"execution_count":3}],"source":["# Load the dataset, then separate the feature matrix X from the label column Y.\n","df = pd.read_csv('lecture7example.csv')\n","# 'diagnosis' is the label; 'id' and 'Unnamed: 32' (NaN in every previewed row)\n","# are dropped so they are not used as features.\n","X = df.drop(columns=['id', 'diagnosis', 'Unnamed: 32'])\n","Y = df['diagnosis']\n","df.head()"]},{"cell_type":"markdown","metadata":{"id":"JOYI2tFu4ZKE"},"source":["# Decision Tree with Ordinary Train Test Split"]},{"cell_type":"code","source":["# Mount Google Drive (requires the Colab runtime). The CSV above is read from a\n","# relative path, so no cell in this notebook reads from the mount point.\n","from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"2d75MqTcnUDq","executionInfo":{"status":"ok","timestamp":1711470807875,"user_tz":240,"elapsed":18867,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}},"outputId":"5da55a8e-4fa7-4f07-8abe-d2246017546b"},"execution_count":4,"outputs":[{"output_type":"stream","name":"stdout","text":["Mounted at /content/drive\n"]}]},{"cell_type":"code","execution_count":5,"metadata":{"id":"xJUqYe9md_5h","executionInfo":{"status":"ok","timestamp":1711470809961,"user_tz":240,"elapsed":189,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[],"source":["# One seed shared by the split and the tree so a full re-run gives the same numbers.\n","RANDOM_STATE = 1998\n","\n","# Hold out 33% of the rows for testing.\n","X_train, X_test, Y_train, Y_test = train_test_split(\n","    X, Y, test_size=0.33, random_state=RANDOM_STATE\n",")"]},{"cell_type":"code","execution_count":6,"metadata":{"id":"7xSPG4FId_5j","outputId":"76a7b909-2c84-4c06-8b9a-0c284fb677ac","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1711470821496,"user_tz":240,"elapsed":300,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Train Accuracy: 0.994750656167979\n","Test Accuracy: 0.9095744680851063\n"]}],"source":["# Create the decision tree classifier. It is seeded because scikit-learn breaks\n","# ties between equally good splits at random, so an unseeded tree can report\n","# different accuracies on every run.\n","model = tree.DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE)\n","\n","# Train the model on the training split only\n","model.fit(X_train, Y_train)\n","\n","# Calculate the training and testing accuracy\n","dtree_predict_train = 
model.predict(X_train)\n","dtree_predict_test = model.predict(X_test)\n","\n","print(\"Train Accuracy: \", accuracy_score(Y_train, dtree_predict_train))\n","print(\"Test Accuracy: \", accuracy_score(Y_test, dtree_predict_test))\n"]},{"cell_type":"markdown","metadata":{"id":"zBHK8Coy4ZKH"},"source":["# Decision Tree with K-Fold Cross Validation"]},{"cell_type":"code","execution_count":7,"metadata":{"id":"s8p7ER1rd_5o","outputId":"2eca4b0c-951e-4983-d6f8-92d90adfcf48","scrolled":true,"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1711470838420,"user_tz":240,"elapsed":530,"user":{"displayName":"Audrey Wang","userId":"01727017967557536978"}}},"outputs":[{"output_type":"stream","name":"stdout","text":["Test Accuracy: 0.8771929824561403\n","Test Accuracy: 0.9210526315789473\n","Test Accuracy: 0.9473684210526315\n","Test Accuracy: 0.9385964912280702\n","Test Accuracy: 0.9026548672566371\n","Decision Tree Model performance using k-fold CV: 0.9173730787144854\n"]}],"source":["from sklearn.base import clone\n","from sklearn.model_selection import KFold\n","\n","incX = X\n","incY = Y\n","\n","# Split the full dataset into 5 consecutive folds\n","kf = KFold(n_splits=5)\n","\n","scorelist = []\n","\n","# Fold-local names are used on purpose: assigning to X_train / Y_train / X_test /\n","# Y_test here would overwrite the earlier train/test split, and any later cell\n","# reading those names would silently get the last fold instead.\n","for train_index, test_index in kf.split(incX):\n","    X_fold_train = incX.iloc[train_index]\n","    Y_fold_train = incY.iloc[train_index]\n","    X_fold_test = incX.iloc[test_index]\n","    Y_fold_test = incY.iloc[test_index]\n","\n","    # Fit an unfitted copy with the same hyperparameters so that `model` stays\n","    # fitted on the train/test split rather than on the last fold.\n","    fold_model = clone(model)\n","    fold_model.fit(X_fold_train, Y_fold_train)\n","    dtree_pred_test = fold_model.predict(X_fold_test)\n","\n","    fold_accuracy = accuracy_score(Y_fold_test, dtree_pred_test)\n","    print(\"Test Accuracy: \", fold_accuracy)\n","    scorelist.append(fold_accuracy)\n","\n","print(\"Decision Tree Model performance using k-fold CV: \" + str(np.mean(scorelist)))\n"]},{"cell_type":"markdown","metadata":{"id":"wheZI5Dad_5q"},"source":["# Logistic Regression 
Classifier"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"I33OQQAU4ZKK","outputId":"35f6affc-2a1e-432c-cee1-dfbf83fd2957","colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1698267486920,"user_tz":240,"elapsed":167,"user":{"displayName":"Mericel Tao","userId":"04864522569170138846"}}},"outputs":[],"source":["from sklearn.linear_model import LogisticRegression\n","from sklearn.pipeline import make_pipeline\n","from sklearn.preprocessing import StandardScaler\n","\n","# The features live on very different scales (area_mean is in the hundreds to\n","# thousands, smoothness_mean is around 0.1). Fitting a bare LogisticRegression()\n","# on them stopped at the lbfgs iteration limit with a ConvergenceWarning, so the\n","# inputs are standardized first and the iteration cap is raised.\n","# Because the scaler is part of the pipeline, it is fit on the training rows only.\n","logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))\n","\n","logreg.fit(X_train, Y_train)\n","\n","# Calculate the training and testing accuracy\n","logreg_pred_train = logreg.predict(X_train)\n","logreg_pred_test = logreg.predict(X_test)\n","print(\"Train Accuracy: \", accuracy_score(Y_train, logreg_pred_train))\n","print(\"Test Accuracy: \", accuracy_score(Y_test, logreg_pred_test))"]},{"cell_type":"code","source":[],"metadata":{"id":"cpL0oyDIoNgR"},"execution_count":null,"outputs":[]}],"metadata":{"colab":{"provenance":[]},"kernelspec":{"display_name":"Python 3 
(ipykernel)","language":"python","name":"python3"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.10.3"},"notebookId":"^EG=G=gSDql5SN"},"nbformat":4,"nbformat_minor":0}