|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "Unsupervised Learning<br>\n", |
| 8 | + "There is no output (target) column and no supervisor<br>\n", |
| 9 | + "Apriori is the first unsupervised learning algorithm we use<br>\n", |
| 10 | + "It is used to find associations between products that are bought together<br>\n", |
| 11 | + "First it collects all products and stores the transactions in a tabular True/False format<br>\n", |
| 12 | + "For this dataset the table gets one extra column that comes from the empty (NaN) cells<br>\n", |
| 13 | + "We have to delete that column" |
| 14 | + ] |
| 15 | + }, |
| 16 | + { |
| 17 | + "cell_type": "markdown", |
| 18 | + "metadata": {}, |
| 19 | + "source": [ |
| 20 | + "Importing pandas for reading the csv file, the transaction encoder and the Apriori algorithm" |
| 21 | + ] |
| 22 | + }, |
| 23 | + { |
| 24 | + "cell_type": "code", |
| 25 | + "execution_count": 11, |
| 26 | + "metadata": {}, |
| 27 | + "outputs": [], |
| 28 | + "source": [ |
| 29 | + "import pandas as pd\n", |
| 30 | + "# For getting T,F values of dataset\n", |
| 31 | + "from mlxtend.preprocessing import TransactionEncoder\n", |
| 32 | + "# Importing Algorithm\n", |
| 33 | + "from mlxtend.frequent_patterns import apriori" |
| 34 | + ] |
| 35 | + }, |
| 36 | + { |
| 37 | + "cell_type": "markdown", |
| 38 | + "metadata": {}, |
| 39 | + "source": [ |
| 40 | + "We have various categories of products and we need to find the associations between them<br>\n", |
| 41 | + "The dataset contains string data<br>\n", |
| 42 | + "Empty cells would be read as NaN and cause an error later, so we pass `na_filter=False` to keep them as empty strings" |
| 43 | + ] |
| 44 | + }, |
| 45 | + { |
| 46 | + "cell_type": "code", |
| 47 | + "execution_count": 12, |
| 48 | + "metadata": {}, |
| 49 | + "outputs": [ |
| 50 | + { |
| 51 | + "name": "stdout", |
| 52 | + "output_type": "stream", |
| 53 | + "text": [ |
| 54 | + "Dataset is:\n", |
| 55 | + " shrimp almonds avocado vegetables mix \\\n", |
| 56 | + "0 burgers meatballs eggs \n", |
| 57 | + "1 chutney \n", |
| 58 | + "2 turkey avocado \n", |
| 59 | + "3 mineral water milk energy bar whole wheat rice \n", |
| 60 | + "4 low fat yogurt \n", |
| 61 | + "... ... ... ... ... \n", |
| 62 | + "7495 butter light mayo fresh bread \n", |
| 63 | + "7496 burgers frozen vegetables eggs french fries \n", |
| 64 | + "7497 chicken \n", |
| 65 | + "7498 escalope green tea \n", |
| 66 | + "7499 eggs frozen smoothie yogurt cake low fat yogurt \n", |
| 67 | + "\n", |
| 68 | + " green grapes whole weat flour yams cottage cheese energy drink \\\n", |
| 69 | + "0 \n", |
| 70 | + "1 \n", |
| 71 | + "2 \n", |
| 72 | + "3 green tea \n", |
| 73 | + "4 \n", |
| 74 | + "... ... ... ... ... ... \n", |
| 75 | + "7495 \n", |
| 76 | + "7496 magazines green tea \n", |
| 77 | + "7497 \n", |
| 78 | + "7498 \n", |
| 79 | + "7499 \n", |
| 80 | + "\n", |
| 81 | + " tomato juice low fat yogurt green tea honey salad mineral water salmon \\\n", |
| 82 | + "0 \n", |
| 83 | + "1 \n", |
| 84 | + "2 \n", |
| 85 | + "3 \n", |
| 86 | + "4 \n", |
| 87 | + "... ... ... ... ... ... ... ... \n", |
| 88 | + "7495 \n", |
| 89 | + "7496 \n", |
| 90 | + "7497 \n", |
| 91 | + "7498 \n", |
| 92 | + "7499 \n", |
| 93 | + "\n", |
| 94 | + " antioxydant juice frozen smoothie spinach olive oil \n", |
| 95 | + "0 \n", |
| 96 | + "1 \n", |
| 97 | + "2 \n", |
| 98 | + "3 \n", |
| 99 | + "4 \n", |
| 100 | + "... ... ... ... ... \n", |
| 101 | + "7495 \n", |
| 102 | + "7496 \n", |
| 103 | + "7497 \n", |
| 104 | + "7498 \n", |
| 105 | + "7499 \n", |
| 106 | + "\n", |
| 107 | + "[7500 rows x 20 columns]\n", |
| 108 | + "\n", |
| 109 | + "Dataset after converting to numpy array is:\n", |
| 110 | + " [['burgers' 'meatballs' 'eggs' ... '' '' '']\n", |
| 111 | + " ['chutney' '' '' ... '' '' '']\n", |
| 112 | + " ['turkey' 'avocado' '' ... '' '' '']\n", |
| 113 | + " ...\n", |
| 114 | + " ['chicken' '' '' ... '' '' '']\n", |
| 115 | + " ['escalope' 'green tea' '' ... '' '' '']\n", |
| 116 | + " ['eggs' 'frozen smoothie' 'yogurt cake' ... '' '' '']]\n" |
| 117 | + ] |
| 118 | + } |
| 119 | + ], |
| 120 | + "source": [ |
| 121 | + "data=pd.read_csv(\"Market_Basket_Optimisation.csv\",na_filter=False)\n", |
| 122 | + "print(\"Dataset is:\\n\",data)\n", |
| 123 | + "# We need to give dataframe in a tabular format with T and F\n", |
| 124 | + "# Converting to numpy array\n", |
| 125 | + "data=data.to_numpy()\n", |
| 126 | + "print(\"\\nDataset after converting to numpy array is:\\n\",data)" |
| 127 | + ] |
| 128 | + }, |
| 129 | + { |
| 130 | + "cell_type": "markdown", |
| 131 | + "metadata": {}, |
| 132 | + "source": [ |
| 133 | + "Encoding every transaction as True/False values, one column per distinct item" |
| 134 | + ] |
| 135 | + }, |
| 136 | + { |
| 137 | + "cell_type": "code", |
| 138 | + "execution_count": 13, |
| 139 | + "metadata": {}, |
| 140 | + "outputs": [ |
| 141 | + { |
| 142 | + "name": "stdout", |
| 143 | + "output_type": "stream", |
| 144 | + "text": [ |
| 145 | + "Converted Array is:\n", |
| 146 | + " [[ True False False ... False False False]\n", |
| 147 | + " [ True False False ... False False False]\n", |
| 148 | + " [ True False False ... False False False]\n", |
| 149 | + " ...\n", |
| 150 | + " [ True False False ... False False False]\n", |
| 151 | + " [ True False False ... False False False]\n", |
| 152 | + " [ True False False ... False True False]]\n", |
| 153 | + "\n", |
| 154 | + "Name of columns are:\n", |
| 155 | + " ['', ' asparagus', 'almonds', 'antioxydant juice', 'asparagus', 'avocado', 'babies food', 'bacon', 'barbecue sauce', 'black tea', 'blueberries', 'body spray', 'bramble', 'brownies', 'bug spray', 'burger sauce', 'burgers', 'butter', 'cake', 'candy bars', 'carrots', 'cauliflower', 'cereals', 'champagne', 'chicken', 'chili', 'chocolate', 'chocolate bread', 'chutney', 'cider', 'clothes accessories', 'cookies', 'cooking oil', 'corn', 'cottage cheese', 'cream', 'dessert wine', 'eggplant', 'eggs', 'energy bar', 'energy drink', 'escalope', 'extra dark chocolate', 'flax seed', 'french fries', 'french wine', 'fresh bread', 'fresh tuna', 'fromage blanc', 'frozen smoothie', 'frozen vegetables', 'gluten free bar', 'grated cheese', 'green beans', 'green grapes', 'green tea', 'ground beef', 'gums', 'ham', 'hand protein bar', 'herb & pepper', 'honey', 'hot dogs', 'ketchup', 'light cream', 'light mayo', 'low fat yogurt', 'magazines', 'mashed potato', 'mayonnaise', 'meatballs', 'melons', 'milk', 'mineral water', 'mint', 'mint green tea', 'muffins', 'mushroom cream sauce', 'napkins', 'nonfat milk', 'oatmeal', 'oil', 'olive oil', 'pancakes', 'parmesan cheese', 'pasta', 'pepper', 'pet food', 'pickles', 'protein bar', 'red wine', 'rice', 'salad', 'salmon', 'salt', 'sandwich', 'shallot', 'shampoo', 'shrimp', 'soda', 'soup', 'spaghetti', 'sparkling water', 'spinach', 'strawberries', 'strong cheese', 'tea', 'tomato juice', 'tomato sauce', 'tomatoes', 'toothpaste', 'turkey', 'vegetables mix', 'water spray', 'white wine', 'whole weat flour', 'whole wheat pasta', 'whole wheat rice', 'yams', 'yogurt cake', 'zucchini']\n" |
| 156 | + ] |
| 157 | + } |
| 158 | + ], |
| 159 | + "source": [ |
| 160 | + "tran_enc=TransactionEncoder()\n", |
| 161 | + "# Doesn't mention name of columns(By Default)\n", |
| 162 | + "array=tran_enc.fit(data).transform(data)\n", |
| 163 | + "print(\"Converted Array is:\\n\",array)\n", |
| 164 | + "# Seeing the name of columns\n", |
| 165 | + "columns=tran_enc.columns_\n", |
| 166 | + "print(\"\\nName of columns are:\\n\",columns)" |
| 167 | + ] |
| 168 | + }, |
| 169 | + { |
| 170 | + "cell_type": "markdown", |
| 171 | + "metadata": {}, |
| 172 | + "source": [ |
| 173 | + "Now we will create a dataframe with values and column names<br>\n", |
| 174 | + "The first column (named `''`) only marks the empty cells of shorter transactions and is not a product, so we drop it" |
| 175 | + ] |
| 176 | + }, |
| 177 | + { |
| 178 | + "cell_type": "code", |
| 179 | + "execution_count": 14, |
| 180 | + "metadata": {}, |
| 181 | + "outputs": [ |
| 182 | + { |
| 183 | + "name": "stdout", |
| 184 | + "output_type": "stream", |
| 185 | + "text": [ |
| 186 | + "Dataframe is:\n", |
| 187 | + " asparagus almonds antioxydant juice asparagus avocado babies food \\\n", |
| 188 | + "0 False False False False False False \n", |
| 189 | + "1 False False False False False False \n", |
| 190 | + "2 False False False False True False \n", |
| 191 | + "3 False False False False False False \n", |
| 192 | + "4 False False False False False False \n", |
| 193 | + "... ... ... ... ... ... ... \n", |
| 194 | + "7495 False False False False False False \n", |
| 195 | + "7496 False False False False False False \n", |
| 196 | + "7497 False False False False False False \n", |
| 197 | + "7498 False False False False False False \n", |
| 198 | + "7499 False False False False False False \n", |
| 199 | + "\n", |
| 200 | + " bacon barbecue sauce black tea blueberries ... turkey \\\n", |
| 201 | + "0 False False False False ... False \n", |
| 202 | + "1 False False False False ... False \n", |
| 203 | + "2 False False False False ... True \n", |
| 204 | + "3 False False False False ... False \n", |
| 205 | + "4 False False False False ... False \n", |
| 206 | + "... ... ... ... ... ... ... \n", |
| 207 | + "7495 False False False False ... False \n", |
| 208 | + "7496 False False False False ... False \n", |
| 209 | + "7497 False False False False ... False \n", |
| 210 | + "7498 False False False False ... False \n", |
| 211 | + "7499 False False False False ... False \n", |
| 212 | + "\n", |
| 213 | + " vegetables mix water spray white wine whole weat flour \\\n", |
| 214 | + "0 False False False False \n", |
| 215 | + "1 False False False False \n", |
| 216 | + "2 False False False False \n", |
| 217 | + "3 False False False False \n", |
| 218 | + "4 False False False False \n", |
| 219 | + "... ... ... ... ... \n", |
| 220 | + "7495 False False False False \n", |
| 221 | + "7496 False False False False \n", |
| 222 | + "7497 False False False False \n", |
| 223 | + "7498 False False False False \n", |
| 224 | + "7499 False False False False \n", |
| 225 | + "\n", |
| 226 | + " whole wheat pasta whole wheat rice yams yogurt cake zucchini \n", |
| 227 | + "0 False False False False False \n", |
| 228 | + "1 False False False False False \n", |
| 229 | + "2 False False False False False \n", |
| 230 | + "3 False True False False False \n", |
| 231 | + "4 False False False False False \n", |
| 232 | + "... ... ... ... ... ... \n", |
| 233 | + "7495 False False False False False \n", |
| 234 | + "7496 False False False False False \n", |
| 235 | + "7497 False False False False False \n", |
| 236 | + "7498 False False False False False \n", |
| 237 | + "7499 False False False True False \n", |
| 238 | + "\n", |
| 239 | + "[7500 rows x 120 columns]\n" |
| 240 | + ] |
| 241 | + } |
| 242 | + ], |
| 243 | + "source": [ |
| 244 | + "df=pd.DataFrame(array,columns=columns).drop([''],axis=1)\n", |
| 245 | + "print(\"Dataframe is:\\n\",df)" |
| 246 | + ] |
| 247 | + }, |
| 248 | + { |
| 249 | + "cell_type": "markdown", |
| 250 | + "metadata": {}, |
| 251 | + "source": [ |
| 252 | + "Calculating Support values<br>\n", |
| 253 | + "Part of algorithm<br>\n", |
| 254 | + "Passing arguments to algorithm<br>\n", |
| 255 | + "min_support must be greater than 0<br>\n", |
| 256 | + "After setting this value to 0.05 we get associations between multiple products; the cells below use 0.01<br>\n", |
| 257 | + "By default `use_colnames` (the parameter that labels itemsets with product names) is False<br>\n", |
| 258 | + "We explicitly need to set it to True to get the product names" |
| 259 | + ] |
| 260 | + }, |
| 261 | + { |
| 262 | + "cell_type": "code", |
| 263 | + "execution_count": 15, |
| 264 | + "metadata": {}, |
| 265 | + "outputs": [ |
| 266 | + { |
| 267 | + "name": "stdout", |
| 268 | + "output_type": "stream", |
| 269 | + "text": [ |
| 270 | + "After applying algorithm, dataset is:\n", |
| 271 | + " support itemsets\n", |
| 272 | + "0 0.020267 (almonds)\n", |
| 273 | + "1 0.033200 (avocado)\n", |
| 274 | + "2 0.010800 (barbecue sauce)\n", |
| 275 | + "3 0.014267 (black tea)\n", |
| 276 | + "4 0.011467 (body spray)\n", |
| 277 | + ".. ... ...\n", |
| 278 | + "254 0.011067 (mineral water, milk, ground beef)\n", |
| 279 | + "255 0.017067 (mineral water, ground beef, spaghetti)\n", |
| 280 | + "256 0.015733 (mineral water, milk, spaghetti)\n", |
| 281 | + "257 0.010267 (mineral water, olive oil, spaghetti)\n", |
| 282 | + "258 0.011467 (mineral water, pancakes, spaghetti)\n", |
| 283 | + "\n", |
| 284 | + "[259 rows x 2 columns]\n" |
| 285 | + ] |
| 286 | + } |
| 287 | + ], |
| 288 | + "source": [ |
| 289 | + "print(\"After applying algorithm, dataset is:\\n\",apriori(df,min_support=0.01,use_colnames=True))" |
| 290 | + ] |
| 291 | + }, |
| 292 | + { |
| 293 | + "cell_type": "markdown", |
| 294 | + "metadata": {}, |
| 295 | + "source": [ |
| 296 | + "With min_support=0.04 we get only 5 rows<br>\n", |
| 297 | + "With min_support=0.01 we get many more itemsets (259 rows)" |
| 298 | + ] |
| 299 | + }, |
| 300 | + { |
| 301 | + "cell_type": "code", |
| 302 | + "execution_count": 16, |
| 303 | + "metadata": {}, |
| 304 | + "outputs": [ |
| 305 | + { |
| 306 | + "name": "stdout", |
| 307 | + "output_type": "stream", |
| 308 | + "text": [ |
| 309 | + "Support values are:\n", |
| 310 | + " support itemsets Length\n", |
| 311 | + "0 0.020267 (almonds) 1\n", |
| 312 | + "1 0.033200 (avocado) 1\n", |
| 313 | + "2 0.010800 (barbecue sauce) 1\n", |
| 314 | + "3 0.014267 (black tea) 1\n", |
| 315 | + "4 0.011467 (body spray) 1\n", |
| 316 | + ".. ... ... ...\n", |
| 317 | + "254 0.011067 (mineral water, milk, ground beef) 3\n", |
| 318 | + "255 0.017067 (mineral water, ground beef, spaghetti) 3\n", |
| 319 | + "256 0.015733 (mineral water, milk, spaghetti) 3\n", |
| 320 | + "257 0.010267 (mineral water, olive oil, spaghetti) 3\n", |
| 321 | + "258 0.011467 (mineral water, pancakes, spaghetti) 3\n", |
| 322 | + "\n", |
| 323 | + "[259 rows x 3 columns]\n" |
| 324 | + ] |
| 325 | + } |
| 326 | + ], |
| 327 | + "source": [ |
| 328 | + "support_df=apriori(df,min_support=0.01,use_colnames=True)\n", |
| 329 | + "# Getting number of elements in dataframe\n", |
| 330 | + "support_df[\"Length\"]=support_df['itemsets'].apply(lambda x:len(x))\n", |
| 331 | + "print(\"Support values are:\\n\",support_df)" |
| 332 | + ] |
| 333 | + }, |
| 334 | + { |
| 335 | + "cell_type": "markdown", |
| 336 | + "metadata": {}, |
| 337 | + "source": [ |
| 338 | + "Selecting the itemsets that contain at least 2 products and have a support value greater than 0.01<br>\n", |
| 339 | + "This is the final result of the algorithm, as its main purpose is to find associations" |
| 340 | + ] |
| 341 | + }, |
| 342 | + { |
| 343 | + "cell_type": "code", |
| 344 | + "execution_count": 17, |
| 345 | + "metadata": {}, |
| 346 | + "outputs": [ |
| 347 | + { |
| 348 | + "name": "stdout", |
| 349 | + "output_type": "stream", |
| 350 | + "text": [ |
| 351 | + "Rows which are having length greater than 2 and support value greater than 0.01 are:\n", |
| 352 | + "\n", |
| 353 | + " support itemsets Length\n", |
| 354 | + "75 0.011467 (mineral water, avocado) 2\n", |
| 355 | + "76 0.011467 (burgers, cake) 2\n", |
| 356 | + "77 0.017067 (burgers, chocolate) 2\n", |
| 357 | + "78 0.028800 (burgers, eggs) 2\n", |
| 358 | + "79 0.022000 (burgers, french fries) 2\n", |
| 359 | + ".. ... ... ...\n", |
| 360 | + "254 0.011067 (mineral water, milk, ground beef) 3\n", |
| 361 | + "255 0.017067 (mineral water, ground beef, spaghetti) 3\n", |
| 362 | + "256 0.015733 (mineral water, milk, spaghetti) 3\n", |
| 363 | + "257 0.010267 (mineral water, olive oil, spaghetti) 3\n", |
| 364 | + "258 0.011467 (mineral water, pancakes, spaghetti) 3\n", |
| 365 | + "\n", |
| 366 | + "[182 rows x 3 columns]\n" |
| 367 | + ] |
| 368 | + } |
| 369 | + ], |
| 370 | + "source": [ |
| 371 | + "print(\"Rows which are having length greater than 2 and support value greater than 0.01 are:\\n\")\n", |
| 372 | + "print(support_df[(support_df['Length']>=2) & (support_df['support']>0.01)])" |
| 373 | + ] |
| 374 | + } |
| 375 | + ], |
| 376 | + "metadata": { |
| 377 | + "kernelspec": { |
| 378 | + "display_name": "Python 3 (ipykernel)", |
| 379 | + "language": "python", |
| 380 | + "name": "python3" |
| 381 | + }, |
| 382 | + "language_info": { |
| 383 | + "codemirror_mode": { |
| 384 | + "name": "ipython", |
| 385 | + "version": 3 |
| 386 | + }, |
| 387 | + "file_extension": ".py", |
| 388 | + "mimetype": "text/x-python", |
| 389 | + "name": "python", |
| 390 | + "nbconvert_exporter": "python", |
| 391 | + "pygments_lexer": "ipython3", |
| 392 | + "version": "3.9.12" |
| 393 | + } |
| 394 | + }, |
| 395 | + "nbformat": 4, |
| 396 | + "nbformat_minor": 2 |
| 397 | +} |
0 commit comments