madhurimarawat
diff --git a/‎Unsupervised Learning/ML_Apriori-Algorithm.ipynb
Lines changed: 397 additions & 0 deletions b/‎Unsupervised Learning/ML_Apriori-Algorithm.ipynb
Lines changed: 397 additions & 0 deletions
@@ -0,0 +1,397 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Unsupervised Learning<br>\n",
+    "No Output or supervisor present<br>\n",
+    "This is first Unsupervised Learning algorithm<br>\n",
+    "Used to find association<br>\n",
+    "First it will check for all products and then store in a tabular format<br>\n",
+    "It will have one more nan column for this dataset<br>\n",
+    "We have to delete nan data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reading csv file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "# For getting T,F values of dataset\n",
+    "from mlxtend.preprocessing import TransactionEncoder\n",
+    "# Importing Algorithm\n",
+    "from mlxtend.frequent_patterns import apriori"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "We have various category and we need to find the association<br>\n",
+    "Dataset contains string data<br>\n",
+    "Error because of nan value, so we set filter parameter as false"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataset is:\n",
+      "               shrimp            almonds      avocado    vegetables mix  \\\n",
+      "0            burgers          meatballs         eggs                     \n",
+      "1            chutney                                                     \n",
+      "2             turkey            avocado                                  \n",
+      "3      mineral water               milk   energy bar  whole wheat rice   \n",
+      "4     low fat yogurt                                                     \n",
+      "...              ...                ...          ...               ...   \n",
+      "7495          butter         light mayo  fresh bread                     \n",
+      "7496         burgers  frozen vegetables         eggs      french fries   \n",
+      "7497         chicken                                                     \n",
+      "7498        escalope          green tea                                  \n",
+      "7499            eggs    frozen smoothie  yogurt cake    low fat yogurt   \n",
+      "\n",
+      "     green grapes whole weat flour yams cottage cheese energy drink  \\\n",
+      "0                                                                     \n",
+      "1                                                                     \n",
+      "2                                                                     \n",
+      "3       green tea                                                     \n",
+      "4                                                                     \n",
+      "...           ...              ...  ...            ...          ...   \n",
+      "7495                                                                  \n",
+      "7496    magazines        green tea                                    \n",
+      "7497                                                                  \n",
+      "7498                                                                  \n",
+      "7499                                                                  \n",
+      "\n",
+      "     tomato juice low fat yogurt green tea honey salad mineral water salmon  \\\n",
+      "0                                                                             \n",
+      "1                                                                             \n",
+      "2                                                                             \n",
+      "3                                                                             \n",
+      "4                                                                             \n",
+      "...           ...            ...       ...   ...   ...           ...    ...   \n",
+      "7495                                                                          \n",
+      "7496                                                                          \n",
+      "7497                                                                          \n",
+      "7498                                                                          \n",
+      "7499                                                                          \n",
+      "\n",
+      "     antioxydant juice frozen smoothie spinach olive oil  \n",
+      "0                                                         \n",
+      "1                                                         \n",
+      "2                                                         \n",
+      "3                                                         \n",
+      "4                                                         \n",
+      "...                ...             ...     ...       ...  \n",
+      "7495                                                      \n",
+      "7496                                                      \n",
+      "7497                                                      \n",
+      "7498                                                      \n",
+      "7499                                                      \n",
+      "\n",
+      "[7500 rows x 20 columns]\n",
+      "\n",
+      "Dataset after converting to numpy array is:\n",
+      " [['burgers' 'meatballs' 'eggs' ... '' '' '']\n",
+      " ['chutney' '' '' ... '' '' '']\n",
+      " ['turkey' 'avocado' '' ... '' '' '']\n",
+      " ...\n",
+      " ['chicken' '' '' ... '' '' '']\n",
+      " ['escalope' 'green tea' '' ... '' '' '']\n",
+      " ['eggs' 'frozen smoothie' 'yogurt cake' ... '' '' '']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "data=pd.read_csv(\"Market_Basket_Optimisation.csv\",na_filter=False)\n",
+    "print(\"Dataset is:\\n\",data)\n",
+    "# We need to give dataframe in a tabular format with T and F\n",
+    "# Converting to numpy array\n",
+    "data=data.to_numpy()\n",
+    "print(\"\\nDataset after converting to numpy array is:\\n\",data)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For getting T,F values of dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Converted Array is:\n",
+      " [[ True False False ... False False False]\n",
+      " [ True False False ... False False False]\n",
+      " [ True False False ... False False False]\n",
+      " ...\n",
+      " [ True False False ... False False False]\n",
+      " [ True False False ... False False False]\n",
+      " [ True False False ... False  True False]]\n",
+      "\n",
+      "Name of columns are:\n",
+      " ['', ' asparagus', 'almonds', 'antioxydant juice', 'asparagus', 'avocado', 'babies food', 'bacon', 'barbecue sauce', 'black tea', 'blueberries', 'body spray', 'bramble', 'brownies', 'bug spray', 'burger sauce', 'burgers', 'butter', 'cake', 'candy bars', 'carrots', 'cauliflower', 'cereals', 'champagne', 'chicken', 'chili', 'chocolate', 'chocolate bread', 'chutney', 'cider', 'clothes accessories', 'cookies', 'cooking oil', 'corn', 'cottage cheese', 'cream', 'dessert wine', 'eggplant', 'eggs', 'energy bar', 'energy drink', 'escalope', 'extra dark chocolate', 'flax seed', 'french fries', 'french wine', 'fresh bread', 'fresh tuna', 'fromage blanc', 'frozen smoothie', 'frozen vegetables', 'gluten free bar', 'grated cheese', 'green beans', 'green grapes', 'green tea', 'ground beef', 'gums', 'ham', 'hand protein bar', 'herb & pepper', 'honey', 'hot dogs', 'ketchup', 'light cream', 'light mayo', 'low fat yogurt', 'magazines', 'mashed potato', 'mayonnaise', 'meatballs', 'melons', 'milk', 'mineral water', 'mint', 'mint green tea', 'muffins', 'mushroom cream sauce', 'napkins', 'nonfat milk', 'oatmeal', 'oil', 'olive oil', 'pancakes', 'parmesan cheese', 'pasta', 'pepper', 'pet food', 'pickles', 'protein bar', 'red wine', 'rice', 'salad', 'salmon', 'salt', 'sandwich', 'shallot', 'shampoo', 'shrimp', 'soda', 'soup', 'spaghetti', 'sparkling water', 'spinach', 'strawberries', 'strong cheese', 'tea', 'tomato juice', 'tomato sauce', 'tomatoes', 'toothpaste', 'turkey', 'vegetables mix', 'water spray', 'white wine', 'whole weat flour', 'whole wheat pasta', 'whole wheat rice', 'yams', 'yogurt cake', 'zucchini']\n"
+     ]
+    }
+   ],
+   "source": [
+    "tran_enc=TransactionEncoder()\n",
+    "# Doesn't mention name of columns(By Default)\n",
+    "array=tran_enc.fit(data).transform(data)\n",
+    "print(\"Converted Array is:\\n\",array)\n",
+    "# Seeing the name of columns\n",
+    "columns=tran_enc.columns_\n",
+    "print(\"\\nName of columns are:\\n\",columns)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we will create a dataframe with values and column names<br>\n",
+    "First column is useless so we drop it (Only used for index values)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dataframe is:\n",
+      "        asparagus  almonds  antioxydant juice  asparagus  avocado  babies food  \\\n",
+      "0          False    False              False      False    False        False   \n",
+      "1          False    False              False      False    False        False   \n",
+      "2          False    False              False      False     True        False   \n",
+      "3          False    False              False      False    False        False   \n",
+      "4          False    False              False      False    False        False   \n",
+      "...          ...      ...                ...        ...      ...          ...   \n",
+      "7495       False    False              False      False    False        False   \n",
+      "7496       False    False              False      False    False        False   \n",
+      "7497       False    False              False      False    False        False   \n",
+      "7498       False    False              False      False    False        False   \n",
+      "7499       False    False              False      False    False        False   \n",
+      "\n",
+      "      bacon  barbecue sauce  black tea  blueberries  ...  turkey  \\\n",
+      "0     False           False      False        False  ...   False   \n",
+      "1     False           False      False        False  ...   False   \n",
+      "2     False           False      False        False  ...    True   \n",
+      "3     False           False      False        False  ...   False   \n",
+      "4     False           False      False        False  ...   False   \n",
+      "...     ...             ...        ...          ...  ...     ...   \n",
+      "7495  False           False      False        False  ...   False   \n",
+      "7496  False           False      False        False  ...   False   \n",
+      "7497  False           False      False        False  ...   False   \n",
+      "7498  False           False      False        False  ...   False   \n",
+      "7499  False           False      False        False  ...   False   \n",
+      "\n",
+      "      vegetables mix  water spray  white wine  whole weat flour  \\\n",
+      "0              False        False       False             False   \n",
+      "1              False        False       False             False   \n",
+      "2              False        False       False             False   \n",
+      "3              False        False       False             False   \n",
+      "4              False        False       False             False   \n",
+      "...              ...          ...         ...               ...   \n",
+      "7495           False        False       False             False   \n",
+      "7496           False        False       False             False   \n",
+      "7497           False        False       False             False   \n",
+      "7498           False        False       False             False   \n",
+      "7499           False        False       False             False   \n",
+      "\n",
+      "      whole wheat pasta  whole wheat rice   yams  yogurt cake  zucchini  \n",
+      "0                 False             False  False        False     False  \n",
+      "1                 False             False  False        False     False  \n",
+      "2                 False             False  False        False     False  \n",
+      "3                 False              True  False        False     False  \n",
+      "4                 False             False  False        False     False  \n",
+      "...                 ...               ...    ...          ...       ...  \n",
+      "7495              False             False  False        False     False  \n",
+      "7496              False             False  False        False     False  \n",
+      "7497              False             False  False        False     False  \n",
+      "7498              False             False  False        False     False  \n",
+      "7499              False             False  False         True     False  \n",
+      "\n",
+      "[7500 rows x 120 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "df=pd.DataFrame(array,columns=columns).drop([''],axis=1)\n",
+    "print(\"Dataframe is:\\n\",df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Calculating Support values<br>\n",
+    "Part of algorithm<br>\n",
+    "Passing arguments to algorithm<br>\n",
+    "Value of min_support is greater than 0<br>\n",
+    "After setting this value to 0.05, we will get association of multiple columns<br>\n",
+    "By default use_colnames(Parameter for giving name to columns) value is False<br>\n",
+    "We explicitly needs to set that to True to get column names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "After applying algorithm, dataset is:\n",
+      "       support                                 itemsets\n",
+      "0    0.020267                                (almonds)\n",
+      "1    0.033200                                (avocado)\n",
+      "2    0.010800                         (barbecue sauce)\n",
+      "3    0.014267                              (black tea)\n",
+      "4    0.011467                             (body spray)\n",
+      "..        ...                                      ...\n",
+      "254  0.011067       (mineral water, milk, ground beef)\n",
+      "255  0.017067  (mineral water, ground beef, spaghetti)\n",
+      "256  0.015733         (mineral water, milk, spaghetti)\n",
+      "257  0.010267    (mineral water, olive oil, spaghetti)\n",
+      "258  0.011467     (mineral water, pancakes, spaghetti)\n",
+      "\n",
+      "[259 rows x 2 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"After applying algorithm, dataset is:\\n\",apriori(df,min_support=0.01,use_colnames=True))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "In case of 0.04 we are getting 5 rows<br>\n",
+    "In case of 0.01 we are getting many values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Support values are:\n",
+      "       support                                 itemsets  Length\n",
+      "0    0.020267                                (almonds)       1\n",
+      "1    0.033200                                (avocado)       1\n",
+      "2    0.010800                         (barbecue sauce)       1\n",
+      "3    0.014267                              (black tea)       1\n",
+      "4    0.011467                             (body spray)       1\n",
+      "..        ...                                      ...     ...\n",
+      "254  0.011067       (mineral water, milk, ground beef)       3\n",
+      "255  0.017067  (mineral water, ground beef, spaghetti)       3\n",
+      "256  0.015733         (mineral water, milk, spaghetti)       3\n",
+      "257  0.010267    (mineral water, olive oil, spaghetti)       3\n",
+      "258  0.011467     (mineral water, pancakes, spaghetti)       3\n",
+      "\n",
+      "[259 rows x 3 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "support_df=apriori(df,min_support=0.01,use_colnames=True)\n",
+    "# Getting number of elements in dataframe\n",
+    "support_df[\"Length\"]=support_df['itemsets'].apply(lambda x:len(x))\n",
+    "print(\"Support values are:\\n\",support_df)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Checking which rows are having length greater than 2 and support value greater than 0.01<br>\n",
+    "This is the final result for this algorithm as its main purpose is to find association"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Rows which are having length greater than 2 and support value greater than 0.01 are:\n",
+      "\n",
+      "      support                                 itemsets  Length\n",
+      "75   0.011467                 (mineral water, avocado)       2\n",
+      "76   0.011467                          (burgers, cake)       2\n",
+      "77   0.017067                     (burgers, chocolate)       2\n",
+      "78   0.028800                          (burgers, eggs)       2\n",
+      "79   0.022000                  (burgers, french fries)       2\n",
+      "..        ...                                      ...     ...\n",
+      "254  0.011067       (mineral water, milk, ground beef)       3\n",
+      "255  0.017067  (mineral water, ground beef, spaghetti)       3\n",
+      "256  0.015733         (mineral water, milk, spaghetti)       3\n",
+      "257  0.010267    (mineral water, olive oil, spaghetti)       3\n",
+      "258  0.011467     (mineral water, pancakes, spaghetti)       3\n",
+      "\n",
+      "[182 rows x 3 columns]\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(\"Rows which are having length greater than 2 and support value greater than 0.01 are:\\n\")\n",
+    "print(support_df[(support_df['Length']>=2) & (support_df['support']>0.01)])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}