diff --git a/data_science/ensemble/randomforest.ipynb b/data_science/ensemble/randomforest.ipynb new file mode 100755 index 0000000..662d955 --- /dev/null +++ b/data_science/ensemble/randomforest.ipynb @@ -0,0 +1,203 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import datasets\n", + "from sklearn import tree\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import cross_val_score\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load MNIST dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "mnist = datasets.load_digits()\n", + "features, labels = mnist.data, mnist.target" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Cross Validation" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "def cross_validation(classifier,features, labels):\n", + " cv_scores = []\n", + "\n", + " for i in range(10):\n", + " scores = cross_val_score(classifier, features, labels, cv=10, scoring='accuracy')\n", + " cv_scores.append(scores.mean())\n", + " \n", + " return cv_scores" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "dt_cv_scores = cross_validation(tree.DecisionTreeClassifier(), features, labels)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "rf_cv_scores = cross_validation(RandomForestClassifier(), features, labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Random Forest VS Decision Tree visualization" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + 
"source": [ + "cv_list = [ \n", + " ['random_forest',rf_cv_scores],\n", + " ['decision_tree',dt_cv_scores],\n", + " ]\n", + "df = pd.DataFrame.from_items(cv_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "\n", + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "df.plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Decision Tree Accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.8343173330831328" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(dt_cv_scores)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Random Forest Accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.9223850187122359" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "np.mean(rf_cv_scores)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_science/ensemble/voting.ipynb b/data_science/ensemble/voting.ipynb new file mode 100755 index 0000000..e42e8d3 --- /dev/null +++ b/data_science/ensemble/voting.ipynb @@ -0,0 +1,279 @@ +{ + "cells": [ + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Voting\n", + "Voting is based on the idea that classifiers can complement each other: \n", + "aggregating the individual classifiers' predictions yields a better prediction." + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn import datasets\n", + "from sklearn import tree\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.svm import SVC\n", + "from sklearn.ensemble import VotingClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# load mnist dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "mnist = datasets.load_digits()\n", + "features, labels = mnist.data, mnist.target\n", + "X_train,X_test,y_train,y_test=train_test_split(features,labels,test_size=0.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# single classifiers accuracy on mnist\n", + "Build a decision tree, a kNN and an SVM classifier and check their accuracy on the MNIST data."
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "dtree = tree.DecisionTreeClassifier(\n", + " criterion=\"gini\", max_depth=8, max_features=32,random_state=35)\n", + "\n", + "dtree = dtree.fit(X_train, y_train)\n", + "dtree_predicted = dtree.predict(X_test)\n", + "\n", + "knn = KNeighborsClassifier(n_neighbors=299).fit(X_train, y_train)\n", + "knn_predicted = knn.predict(X_test)\n", + "\n", + "svm = SVC(C=0.1, gamma=0.003,\n", + " probability=True,random_state=35).fit(X_train, y_train)\n", + "svm_predicted = svm.predict(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[accuarcy]\n", + "d-tree: 0.7972222222222223\n", + "knn : 0.8416666666666667\n", + "svm : 0.85\n" + ] + } + ], + "source": [ + "print(\"[accuarcy]\")\n", + "print(\"d-tree: \",accuracy_score(y_test, dtree_predicted))\n", + "print(\"knn : \",accuracy_score(y_test, knn_predicted))\n", + "print(\"svm : \",accuracy_score(y_test, svm_predicted))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can easily do soft voting or hard voting using sklearn's voting classifier. \n", + "If you want to implement soft voting from scratch, you can use predict_proba as shown below. \n", + "Below is an example of the SVM's predicted probabilities (digits 0 to 9) for two MNIST samples."
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[9.95557918e-01 3.42018637e-04 4.57700824e-04 4.19160266e-04\n", + " 4.21146304e-04 7.99436984e-04 4.11439277e-04 6.08753549e-04\n", + " 4.33211441e-04 5.49214707e-04]\n", + " [2.86586264e-03 4.17512273e-03 4.28013091e-03 4.14650212e-03\n", + " 9.27814553e-01 2.24791840e-02 3.06764221e-03 9.50855980e-03\n", + " 1.51437526e-02 6.51868962e-03]]\n" + ] + } + ], + "source": [ + "svm_proba = svm.predict_proba(X_test)\n", + "print(svm_proba[0:2])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# hard voting\n", + "Hard voting is a simple majority vote: it collects each classifier's prediction and takes the most-voted class." + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/wikiml/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + }, + { + "data": { + "text/plain": [ + "0.9083333333333333" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "voting_clf = VotingClassifier(estimators=[\n", + " ('decision_tree', dtree), ('knn', knn), ('svm', svm)], \n", + " weights=[1,1,1], voting='hard').fit(X_train, y_train)\n", + "hard_voting_predicted = voting_clf.predict(X_test)\n", + "accuracy_score(y_test, hard_voting_predicted)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# soft voting\n", + "Soft voting takes each classifier's predict_proba output, sums the probabilities per class, and predicts the class with the highest total probability."
+ ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/anaconda3/envs/wikiml/lib/python3.6/site-packages/sklearn/preprocessing/label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.\n", + " if diff:\n" + ] + }, + { + "data": { + "text/plain": [ + "0.9138888888888889" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "voting_clf = VotingClassifier(estimators=[\n", + " ('decision_tree', dtree), ('knn', knn), ('svm', svm)], \n", + " weights=[1,1,1], voting='soft').fit(X_train, y_train)\n", + "soft_voting_predicted = voting_clf.predict(X_test)\n", + "accuracy_score(y_test, soft_voting_predicted)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Visualization\n", + "We can visualize the accuracies to check whether the voting results are more stable or better than the single-model accuracies. \n", + "It is hard to say which voting scheme is better, but we can confirm that the classifiers complement each other \n", + "and that the voting results are better in this example."
+ ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXcAAAD8CAYAAACMwORRAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4wLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvqOYd8AAAEepJREFUeJzt3XvQHXV9x/H3h2BEES8lqVUghiqoqVaoGbwgikpbwAo4oEK1LQ6V6QVtvc3QwTIWrVXROrViK7SKYpWLiqYYDZWKUK2YIBdJMDQTUFLaMSpSURGRb//YjZwcT/Kc58l58iQ/3q+ZzLOX39n97e5vP2fP75zdpKqQJLVll7mugCRp8gx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoN2nasVL1iwoBYvXjxXq5ekndLVV1/9napaOFW5OQv3xYsXs2rVqrlavSTtlJJ8c5xydstIUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KD5uwOVUmajsWnfmauqzAxt7ztBbO+DsNd2om0EnDbI9zu7+yWkaQGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDfHCYdiqtPDgLfHiWZpdX7pLUIK/cd0KtXL165SrNHq/cJalBhrskNchwl6QGGe6S1CDDXZIaNFa4Jzk8ydok65KcOmL+oiRfSHJNkuuTHDn5qkqSxjVluCeZB5wFHAEsAU5IsmSo2BuBC6vqQOB44H2TrqgkaXzjXLkfBKyrqvVVdTdwPnD0UJkCHtoPPwy4bXJVlCRN1zg3Me0F3DowvgF42lCZNwGXJnkVsDtw2ERqJ0makXHCPSOm1dD4CcC5VfWuJM8AzkvypKq6d7MFJScDJwMsWrRoJvUF2rlDE7xLU9LsGKdbZgOwz8D43vxit8tJwIUAVfWfwG7AguEFVdXZVbW0qpYuXLhwZjWWJE1pnHBfCeyXZN8k8+m+MF02VOZbwPMBkjyRLtw3TrKikqTxTRnuVXUPcAqwAriR7lcxq5OckeSovtjrgFcmuQ74GHBiVQ133UiStpOxngpZVcuB5UPTTh8YXgMcPNmqSZJmyjtUJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktSgscI9yeFJ1iZZl+TULZR5SZI1SVYn+ehkqylJmo5dpyqQZB5wFvCbwAZgZZJlVbVmoMx+wF8AB1fV7Ul+ebYqLEma2jhX7gcB66pqfVXdDZwPHD1U5pXAWVV1O0BVfXuy1ZQkTcc44b4XcOvA+IZ+2qD9gf2TfCnJV5IcPqkKSpKmb8puGSAjptWI5ewHHArsDVyZ5ElV9f3NFpScDJwMsGjRomlXVpI0nnGu3DcA+wyM7w3cNqLMp6vqp1V1M7CWLuw3U1VnV9XSqlq6cOHCmdZZkjSFccJ9JbBfkn2TzAeOB5YNlfkU8FyAJAvoumnWT7KikqTxTRnuVXUPcAqwArgRuLCqVic5I8lRfbEVwHeTrAG+ALyhqr47W5WWJG3dOH3uVNVyYPnQtNMHhgt4bf9PkjTHvENVkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQY
a7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQWOFe5LDk6xNsi7JqVspd1ySSrJ0clWUJE3XlOGeZB5wFnAEsAQ4IcmSEeX2AF4NXDXpSkqSpmecK/eDgHVVtb6q7gbOB44eUe7NwDuAuyZYP0nSDIwT7nsBtw6Mb+in/VySA4F9quqSrS0oyclJViVZtXHjxmlXVpI0nnHCPSOm1c9nJrsA7wZeN9WCqursqlpaVUsXLlw4fi0lSdMyTrhvAPYZGN8buG1gfA/gScDlSW4Bng4s80tVSZo744T7SmC/JPsmmQ8cDyzbNLOq7qiqBVW1uKoWA18BjqqqVbNSY0nSlKYM96q6BzgFWAHcCFxYVauTnJHkqNmuoCRp+nYdp1BVLQeWD007fQtlD932akmStoV3qEpSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQWOFe5LDk6xNsi7JqSPmvzbJmiTXJ7ksyWMmX1VJ0rimDPck84CzgCOAJcAJSZYMFbsGWFpVvw58HHjHpCsqSRrfOFfuBwHrqmp9Vd0NnA8cPVigqr5QVT/qR78C7D3ZakqSpmOccN8LuHVgfEM/bUtOAj47akaSk5OsSrJq48aN49dSkjQt44R7RkyrkQWTlwNLgTNHza+qs6tqaVUtXbhw4fi1lCRNy65jlNkA7DMwvjdw23ChJIcBpwHPqaqfTKZ6kqSZGOfKfSWwX5J9k8wHjgeWDRZIciDwfuCoqvr25KspSZqOKcO9qu4BTgFWADcCF1bV6iRnJDmqL3Ym8BDgoiTXJlm2hcVJkraDcbplqKrlwPKhaacPDB824XpJkraBd6hKUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUGGuyQ1yHCXpAYZ7pLUIMNdkhpkuEtSgwx3SWqQ4S5JDTLcJalBhrskNchwl6QGGe6S1CDDXZIaZLhLUoMMd0lqkOEuSQ0y3CWpQYa7JDXIcJekBhnuktQgw12SGmS4S1KDDHdJapDhLkkNMtwlqUFjhXuSw5OsTbIuyakj5j8wyQX9/KuSLJ50RSVJ45sy3JPMA84CjgCWACckWTJU7CTg9qp6HPBu4O2TrqgkaXzjXLkfBKyrqvVVdTdwPnD0UJmjgQ/1wx8Hnp8kk6umJGk6xgn3vYBbB8Y39NNGlqmqe4A7gD0nUUFJ0vTtOkaZUVfgNYMyJDkZOLkfvTPJ2jHWP5cWAN+ZzRVkx+3Acttn2f15++/P2w7bvP2PGafQOOG+AdhnYHxv4LYtlNmQZFfgYcD3hhdUVWcDZ49TsR1BklVVtXSu6zEX3Pb757bD/Xv7W9r2cbplVgL7Jdk3yXzgeGDZUJllwB/0w8cB/15Vv3DlLknaPqa8cq+qe5KcAqwA5gEfqKrVSc4AVlXVMuCfgfOSrKO7Yj9+NistSdq6cbplqKrlwPKhaacPDN8FvHiyVdsh7DRdSLPAbb//uj9vfzPbHntPJKk9Pn5Akhq004R7kjclef0MX/vlKeYvT/LwmdVss+UcM+Lu3R1SksVJbpjremjrZuM4JbklyYJtXMbDk/zJwPijk3x822s3GUkOSbI6ybVJnpjkdye03AOSHDkwftSoR7LsCHaacN8WVfXMKeYfWVXfn8CqjqF7RMMv6H8iKm1Xs9juHg78PNyr6raqOm6W1j
UTLwPeWVUHAI8EJhLuwAHAz8O9qpZV1dsmtOzJqqod9h9wGrAW+DzwMeD1/fTHAp8DrgauBJ7QT38kcDFwXf/vmf30O/u/jwKuAK4FbgAO6affAizoh1/bz7sB+PN+2mLgRuAcYDVwKfCgobo+k+6XQjf3y38scDnwVuCLwOuAhcAn6H5euhI4uH/t7sAH+mnXAEdvh327GLihH/7Vfr1vAD7Z79v/At4xUP5O4K/7/foV4JFz3T5muN27A5/pt+MGup/wXjgw/1DgXwe2+e19O/s83aM4LgfWA0dtp/puse0Br+zbzHV9u3pwP/1c4G+BLwDvortb/NL+GL8f+Oam9j6wnj8eOt4nAn+/lXPifODHfVs/c6g9nbiVdnQScFO/H88B3jvD4/bSfvrz++36en8OPRD4Q+47F/+lb6939HV9zdByLwCOHBg/FzgW2A34YL/ca4DnAvOBbwEb+2W9tN/W9w689j3Al/s2clw/fRfgff3xu4TuxynHzXrbmeuTbSsH86n9jn0w8FBgHfeF+2XAfv3w0+h+V7/pQG1qfPOAh206Sfu/rwNOG5i/Rz98C92daZvWuTvwkP5gHNg33HuAA/ryFwIvH1HncwcPWt+A3zcw/lHgWf3wIuDGfvitm5ZHd0V0E7D7LO/fxf1J8vi+8R7QN9T1dDeh7UYXAvv05Qt4YT/8DuCNc91GZrjdxwLnDIw/rD9hd+/H/2HgWBRwRD98MV1APgB4CnDtdqrvFtsesOdAubcArxpoh5cA8/rx9wCn98Mv6LdrONwX0j1DatP4Z4FnTXFO3DDcnvrhke0IeDTdufZL/X68kvHDfdRx243usSf799M+zH3n/7ncF66HApdsYbkvAj7UD8/vl/cguqz4YD/9CX0b2Y2BMB/Y1sFwv4guzJds2p909/4s76f/CnA72yHcd+RumUOAi6vqR1X1f/Q3TiV5CN1V8kVJrqW7EnlU/5rn0Z2cVNXPquqOoWWuBF6R5E3Ak6vqB0Pzn9Wv84dVdSfd1cch/bybq+rafvhqusY8jgsGhg8D3tvXexnw0CR7AL8FnNpPv5yuES0ac/nbYiHwabqw2LRtl1XVHdX9vHUN993qfDddYMD0tn9H83XgsCRvT3JI30Y+B7yw78J4Ad0+gW6bPzfwui9W1U/74cXbsc5bantPSnJlkq/TdUP82sBrLqqqn/XDzwY+AlBVn6ELl81U1UZgfZKnJ9mT7k3/S2z9nNiaUe3oILp9+L1+P1405vbD6OP2eLp9c1Nf5kP9tk7HZ4HnJXkg3ZNvr6iqH9Nt93kAVfUNujeo/cdY3qeq6t6qWkPXk0C/rIv66f9L94lq1u3o/cCjfqe5C/D96vrSprewqiuSPJvuBD4vyZlV9eGBIlt7kuVPBoZ/RvfuPo4fDgzvAjyjbzz3rbR7guaxVbW9n7VzB92VysF0V2Twi9u5qY38tPrLkKHpO5WquinJU+n6Tf8myaV0b8B/SvdRfuXAm/7gNt9Lv2+q6t7t/B3KltreucAxVXVdkhPprlA3GWx3MPpcGnYB8BLgG3SBXtvwdNdR7WjGT4rdwnEbvlN+Jsu9K8nlwG/TdbN8rJ81ie3O0N/take+cr8CeFGSB/VXty8E6K/ib07yYuiCMclT+tdcRtd3SJJ5SR46uMAkjwG+XVXn0N1V+xsj1nlMkgcn2Z3uI9uV06jzD4A9tjL/UuCUgfpseoNaAbxq04mU5MBprHNb3E33JfDvT+rXBDu6JI8GflRVHwHeSdcGLu//vpLNP2nt6PYA/ifJA+iu3Lfkik3zkxwBPGIL5T5J1x5O4L79sKVzYqq2PspXgeckeUT/5njsuC/cwnH7BrA4yeP6Yr9H9/3WsKnqej7wCrpPJCv6aYP7bH+6T9Jrx1jWKP8BHJtklySPZPM34Vmzw4Z7VX2NroFdS/dl0WDIvgw4Kcl1dFecm54v/2fAc/uPqVez+cdU6HbqtUmuoWtYfzdinefSNcKrgH
+qqmumUe3zgTckuSbJY0fMfzWwNMn1SdYAf9RPfzNdH+T1/c/e3jyNdW6Tqvoh8DvAa+j6MVv3ZOCrfRfYacBb+u6LS+g+ll+ytRfvYP6Srp3+G13QbclfAc9O8jW6LsBvjSpUVbfTd6FU1Vf7aSPPiar6LvClJDckOXOcylbVf9N9v3QV3RfUa+g+PY5j1HG7iy6UL+rP+XuBfxzx2uuBe5Jcl+Q1I+ZfSted8/nq/s8K6L4Andcv9wLgxKr6CV2XypL+J5YvHbPun6B7uOINdN3IVzH+ds+Yd6hK2m6SPKSq7uyv3C+me1bVxXNdr9k2sN170r1RHtz3v8+anbLfVNJO601JDqP70cClwKfmuD7byyX9jZLzgTfPdrCDV+6S1KQdts9dkjRzhrskNchwl6QGGe6S1CDDXZIaZLhLUoP+H47Jp0tra/pcAAAAAElFTkSuQmCC\n", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "%matplotlib inline\n", + "\n", + "x = np.arange(5)\n", + "plt.bar(x, height= [accuracy_score(y_test, dtree_predicted),\n", + " accuracy_score(y_test, knn_predicted),\n", + " accuracy_score(y_test, svm_predicted),\n", + " accuracy_score(y_test, hard_voting_predicted),\n", + " accuracy_score(y_test, soft_voting_predicted)])\n", + "plt.xticks(x, ['decision tree','knn','svm','hard voting','soft voting']);" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_science/nlp/word2vec_gensim.ipynb b/data_science/nlp/word2vec_gensim.ipynb new file mode 100644 index 0000000..5024407 --- /dev/null +++ b/data_science/nlp/word2vec_gensim.ipynb @@ -0,0 +1,229 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# pretrained Word2Vec download" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2020-06-03 23:13:08-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", + "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.16.238\n", + "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.16.238|:443... 
connected.\n", + "HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable\n", + "\n", + " The file is already fully retrieved; nothing to do.\n", + "\n" + ] + } + ], + "source": [ + "!wget -P . -c \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\"" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "import gensim" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# load pretrained word2vec\n", + "model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('pal', 0.7476358413696289),\n", + " ('friends', 0.7098034620285034),\n", + " ('buddy', 0.6972494125366211),\n", + " ('dear_friend', 0.6960037350654602),\n", + " ('acquaintance', 0.6843010187149048)]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# similar words\n", + "model.most_similar(positive=['friend'], topn=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[('queen', 0.7118192911148071)]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# king + woman - man = queen\n", + "model.most_similar(positive=['king', 'woman'], negative=['man'], topn=1)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "300" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Word2Vec vector dimension\n", + "len(model['friend'])" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "array([ 0.07080078, -0.21386719, 0.15332031, 0.09423828, -0.03442383,\n", + " 0.43359375, -0.16503906, -0.05786133, 0.17578125, -0.08203125,\n", + " 0.24511719, -0.19335938, -0.0255127 , -0.09619141, -0.125 ,\n", + " 0.02575684, 0.16796875, -0.03759766, 0.09472656, -0.04760742,\n", + " 0.20605469, 0.31835938, 0.15917969, -0.17089844, 0.09033203,\n", + " -0.1640625 , -0.15234375, 0.3125 , 0.06298828, -0.24902344,\n", + " 0.15625 , -0.04516602, -0.12890625, -0.00686646, -0.02160645,\n", + " 0.14453125, 0.2734375 , 0.12695312, 0.10742188, 0.11376953,\n", + " 0.14355469, -0.00173187, 0.22851562, -0.03515625, 0.17089844,\n", + " 0.04516602, -0.07958984, -0.08886719, -0.01342773, -0.09667969,\n", + " -0.12597656, 0.10595703, 0.15332031, -0.03808594, 0.02246094,\n", + " 0.01428223, -0.03295898, 0.20703125, -0.03417969, 0.02233887,\n", + " 0.00244141, 0.13476562, -0.01403809, 0.13378906, 0.0201416 ,\n", + " 0.14746094, 0.00759888, -0.18652344, 0.16113281, 0.109375 ,\n", + " 0.14355469, 0.01623535, 0.01867676, 0.09179688, -0.33789062,\n", + " 0.19335938, -0.29101562, -0.00860596, 0.10644531, 0.359375 ,\n", + " 0.25585938, -0.03320312, 0.15625 , -0.24316406, -0.06738281,\n", + " 0.09033203, -0.125 , 0.21777344, -0.02380371, -0.06445312,\n", + " -0.14355469, 0.05664062, -0.12597656, 0.02172852, 0.03833008,\n", + " -0.17578125, -0.08349609, 0.21386719, -0.01855469, -0.23535156,\n", + " -0.14746094, -0.16113281, -0.03125 , -0.10107422, 0.07080078,\n", + " 0.01135254, -0.04370117, 0.07666016, 0.16503906, 0.04541016,\n", + " -0.13867188, 0.13085938, 0.13378906, -0.14453125, 0.12792969,\n", + " -0.06787109, -0.04296875, -0.03369141, 0.10302734, 0.22949219,\n", + " 0.14160156, -0.01153564, -0.00086212, -0.10449219, -0.03710938,\n", + " 0.01928711, 0.16699219, -0.06079102, 0.09814453, 0.0703125 ,\n", + " -0.39648438, -0.23242188, -0.04077148, 0.09570312, -0.0546875 ,\n", + " -0.09814453, 0.09082031, 0.03588867, 0.09228516, 0.3125 ,\n", + " 0.10595703, 0.18847656, 
-0.11230469, 0.00842285, 0.08935547,\n", + " 0.04663086, -0.25 , -0.03369141, 0.03808594, -0.03710938,\n", + " 0.42773438, 0.10839844, -0.01391602, -0.01965332, -0.04296875,\n", + " -0.11035156, 0.0390625 , 0.04541016, -0.20019531, -0.14355469,\n", + " -0.14257812, 0.03662109, 0.25 , 0.3671875 , -0.12304688,\n", + " -0.0859375 , 0.24902344, -0.21582031, 0.02648926, 0.17871094,\n", + " 0.29296875, 0.21582031, 0.1015625 , 0.00167084, -0.07177734,\n", + " 0.03686523, 0.22851562, -0.125 , 0.17285156, 0.22265625,\n", + " 0.21191406, 0.03686523, 0.09570312, -0.00344849, 0.13183594,\n", + " -0.23925781, 0.00576782, 0.27148438, 0.10400391, 0.0098877 ,\n", + " -0.24511719, 0.21777344, -0.03027344, 0.23046875, 0.11816406,\n", + " 0.1640625 , -0.00109863, 0.00349426, -0.02197266, -0.09179688,\n", + " -0.10351562, 0.06933594, -0.13476562, -0.06201172, 0.14355469,\n", + " -0.10888672, -0.11328125, 0.2109375 , -0.10839844, -0.18261719,\n", + " -0.06689453, -0.265625 , -0.13378906, -0.04296875, -0.17773438,\n", + " 0.00689697, -0.00982666, -0.00640869, -0.12792969, 0.08203125,\n", + " -0.01367188, 0.02734375, 0.12597656, -0.00772095, -0.04614258,\n", + " -0.12255859, 0.16210938, 0.28320312, 0.04296875, -0.05175781,\n", + " -0.16210938, 0.14648438, -0.18359375, -0.24511719, 0.22167969,\n", + " 0.0546875 , -0.10302734, -0.07763672, -0.33984375, -0.05908203,\n", + " -0.0022583 , -0.11962891, -0.3046875 , 0.02233887, 0.02941895,\n", + " 0.37695312, -0.01721191, -0.05932617, 0.30273438, -0.13574219,\n", + " 0.14746094, 0.17089844, 0.16015625, 0.21484375, 0.01013184,\n", + " 0.06738281, -0.12109375, -0.12304688, -0.20117188, 0.02880859,\n", + " -0.00662231, -0.20410156, 0.02001953, -0.15136719, 0.16699219,\n", + " 0.14160156, -0.02331543, 0.14550781, -0.13476562, 0.04785156,\n", + " 0.14160156, 0.03808594, -0.12109375, 0.02770996, -0.0123291 ,\n", + " -0.20410156, -0.06445312, 0.06079102, -0.07519531, -0.28125 ,\n", + " 0.18261719, -0.25390625, -0.0456543 , 0.14160156, -0.0546875 ,\n", 
+ " -0.01477051, -0.38085938, 0.14355469, 0.12255859, 0.14941406,\n", + " -0.03320312, 0.19433594, -0.34375 , -0.24902344, -0.00331116,\n", + " -0.05639648, -0.00079727, -0.21679688, -0.01977539, 0.10644531],\n", + " dtype=float32)" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# print word2vec\n", + "model['friend']" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/data_science/svm/svm.ipynb b/data_science/svm/svm.ipynb new file mode 100755 index 0000000..db0733a --- /dev/null +++ b/data_science/svm/svm.ipynb @@ -0,0 +1,401 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 79, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from sklearn.datasets import load_iris\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.model_selection import GridSearchCV\n", + "from sklearn.metrics import classification_report\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.svm import SVC" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Load dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 80, + "metadata": {}, + "outputs": [], + "source": [ + "# load iris data\n", + "dataset = load_iris()\n", + "\n", + "# use 80% as train data, 20% as test data\n", + "X_train,X_test,y_train,y_test=train_test_split(dataset.data,dataset.target,test_size=0.2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Find best hyperparamters\n", + "RBF kernel SVM has two parameters.\n", + "1. 
C (cost): The C parameter trades off correct classification of training examples against maximization of the decision function’s margin. For larger values of C, a smaller margin will be accepted if the decision function is better at classifying all training points correctly. \n", + "\n", + "2. gamma: the gamma parameter defines how far the influence of a single training example reaches, with low values meaning ‘far’ and high values meaning ‘close’. The gamma parameters can be seen as the inverse of the radius of influence of samples selected by the model as support vectors.\n", + "\n", + "reference:\n", + "http://scikit-learn.org/stable/auto_examples/svm/plot_rbf_parameters.html" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Grid Search\n", + "find best hyperparameter using grid search." + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "metadata": {}, + "outputs": [], + "source": [ + "def svc_param_selection(X, y, nfolds):\n", + " svm_parameters = [\n", + " {'kernel': ['rbf'],\n", + " 'gamma': [0.00001,0.0001, 0.001, 0.01, 0.1, 1],\n", + " 'C': [0.01, 0.1, 1, 10, 100, 1000]\n", + " }\n", + " ]\n", + " \n", + " clf = GridSearchCV(SVC(), svm_parameters, cv=10)\n", + " clf.fit(X_train, y_train)\n", + " print(clf.best_params_)\n", + " \n", + " return clf" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}\n" + ] + } + ], + "source": [ + "clf = svc_param_selection(X_train, y_train, 10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " 0 1.00 1.00 1.00 7\n", + " 1 1.00 1.00 1.00 13\n", + " 2 1.00 1.00 1.00 10\n", + "\n", + "avg / total 
1.00 1.00 1.00 30\n", + "\n", + "\n", + "accuracy : 1.0\n" + ] + } + ], + "source": [ + "y_true, y_pred = y_test, clf.predict(X_test)\n", + "\n", + "print(classification_report(y_true, y_pred))\n", + "print()\n", + "print(\"accuracy : \"+ str(accuracy_score(y_true, y_pred)) )" + ] + }, + { + "cell_type": "code", + "execution_count": 84, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ground_truthprediction
011
111
222
300
411
522
622
722
811
900
1011
1100
1200
1311
1411
1522
1611
1722
1811
1922
2000
2111
2200
2300
2411
2511
2622
2711
2822
2922
\n", + "
" + ], + "text/plain": [ + " ground_truth prediction\n", + "0 1 1\n", + "1 1 1\n", + "2 2 2\n", + "3 0 0\n", + "4 1 1\n", + "5 2 2\n", + "6 2 2\n", + "7 2 2\n", + "8 1 1\n", + "9 0 0\n", + "10 1 1\n", + "11 0 0\n", + "12 0 0\n", + "13 1 1\n", + "14 1 1\n", + "15 2 2\n", + "16 1 1\n", + "17 2 2\n", + "18 1 1\n", + "19 2 2\n", + "20 0 0\n", + "21 1 1\n", + "22 0 0\n", + "23 0 0\n", + "24 1 1\n", + "25 1 1\n", + "26 2 2\n", + "27 1 1\n", + "28 2 2\n", + "29 2 2" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Visualize true value with prediction value in pandas dataframe.\n", + "comparison = pd.DataFrame({'prediction':y_pred, 'ground_truth':y_true}) \n", + "comparison" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}