From 7cedd51ffb75490765494d22f993646bd694740f Mon Sep 17 00:00:00 2001 From: Lisa Broadhead <37422388+lisabroadhead@users.noreply.github.com> Date: Mon, 6 Jun 2022 15:38:09 -0500 Subject: [PATCH 1/2] Created using Colaboratory --- olympian.ipynb | 902 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 902 insertions(+) create mode 100644 olympian.ipynb diff --git a/olympian.ipynb b/olympian.ipynb new file mode 100644 index 0000000..fec0af3 --- /dev/null +++ b/olympian.ipynb @@ -0,0 +1,902 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "name": "olympian.ipynb", + "provenance": [], + "collapsed_sections": [], + "mount_file_id": "1BnM2nF0qZYP7dc9ciyIDoKcMjMLsh-8M", + "authorship_tag": "ABX9TyOTuJS8S5ykRqDGRYe66Bak", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# Find average height per individual olympian per year exercise (Core)" + ], + "metadata": { + "id": "TgcY0EcYnNrE" + } + }, + { + "cell_type": "markdown", + "source": [ + "## 1. Load this dataset and explore it! Answer the following questions:" + ], + "metadata": { + "id": "8Wmk4yyanVAG" + } + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "import numpy as np" + ], + "metadata": { + "id": "Bgnsj0JOnW2s" + }, + "execution_count": 102, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "file = \"/content/drive/MyDrive/Colab Notebooks/coding_dojo/files/athleteEventsNoPersonal.csv\"\n", + "df = pd.read_csv(file)" + ], + "metadata": { + "id": "uRrzNXwSnd8D" + }, + "execution_count": 103, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.head()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 206 + }, + "id": "kOoXnWyTn9zt", + "outputId": "666c6c01-e501-4489-d56d-28faa3a11e95" + }, + "execution_count": 105, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " ID Age Height Team NOC Games Year Season City \\\n", + "0 5 21.0 185.0 Netherlands NED 1988 Winter 1988 Winter Calgary \n", + "1 5 21.0 185.0 Netherlands NED 1988 Winter 1988 Winter Calgary \n", + "2 5 25.0 185.0 Netherlands NED 1992 Winter 1992 Winter Albertville \n", + "3 5 25.0 185.0 Netherlands NED 1992 Winter 1992 Winter Albertville \n", + "4 5 27.0 185.0 Netherlands NED 1994 Winter 1994 Winter Lillehammer \n", + "\n", + " Sport Event Medal \n", + "0 Speed Skating Speed Skating Women's 500 metres NaN \n", + "1 Speed Skating Speed Skating Women's 1,000 metres NaN \n", + "2 Speed Skating Speed Skating Women's 500 metres NaN \n", + "3 Speed Skating Speed Skating Women's 1,000 metres NaN \n", + "4 Speed Skating Speed Skating Women's 500 metres NaN " + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IDAgeHeightTeamNOCGamesYearSeasonCitySportEventMedal
0521.0185.0NetherlandsNED1988 Winter1988WinterCalgarySpeed SkatingSpeed Skating Women's 500 metresNaN
1521.0185.0NetherlandsNED1988 Winter1988WinterCalgarySpeed SkatingSpeed Skating Women's 1,000 metresNaN
2525.0185.0NetherlandsNED1992 Winter1992WinterAlbertvilleSpeed SkatingSpeed Skating Women's 500 metresNaN
3525.0185.0NetherlandsNED1992 Winter1992WinterAlbertvilleSpeed SkatingSpeed Skating Women's 1,000 metresNaN
4527.0185.0NetherlandsNED1994 Winter1994WinterLillehammerSpeed SkatingSpeed Skating Women's 500 metresNaN
\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 105 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### a. How many rows?" + ], + "metadata": { + "id": "NHDoLsh-naiT" + } + }, + { + "cell_type": "code", + "source": [ + "print(f\"{df.shape[0]} : rows\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "-RmJqFbindHG", + "outputId": "ab196244-fabc-47e4-b04c-90ab3393d5f3" + }, + "execution_count": 106, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "40616 : rows\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### a. How many columns?" + ], + "metadata": { + "id": "AdHEZddqoHZE" + } + }, + { + "cell_type": "code", + "source": [ + "print(f\"{df.shape[1]} : columns\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "01JwkvpcoI1f", + "outputId": "17f8bdde-4a76-449c-8be4-fe4d3d3510f4" + }, + "execution_count": 107, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "12 : columns\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### c. Which columns have missing values? (Why do think there are there so many null values in the Medals column?)" + ], + "metadata": { + "id": "621nGurCoOLS" + } + }, + { + "cell_type": "code", + "source": [ + "df.isna().sum()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "1rTNIYU8oRVl", + "outputId": "012ac4fa-2466-4ce6-dd0e-df3ccad82a5d" + }, + "execution_count": 108, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "ID 0\n", + "Age 1473\n", + "Height 9001\n", + "Team 0\n", + "NOC 0\n", + "Games 0\n", + "Year 0\n", + "Season 0\n", + "City 0\n", + "Sport 0\n", + "Event 0\n", + "Medal 34699\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 108 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# Missing values in the medals because there are only 3 medals given out and there are more athletes to medals. Not everyone will get a medal" + ], + "metadata": { + "id": "88eF9sOZocYi" + }, + "execution_count": 109, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### d. How many entries correspond to the city of London?" + ], + "metadata": { + "id": "AtRfXHPaojJ6" + } + }, + { + "cell_type": "code", + "source": [ + "london = df['City'] == \"London\"\n", + "print(f\"{london.sum()} : london entries\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pmr0cqluolD7", + "outputId": "33a7c62d-7b5d-4288-daa3-2f3d846e4c66" + }, + "execution_count": 110, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "3370 : london entries\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + " ### e. What age is the youngest athlete in our sample data? Hint: use min()" + ], + "metadata": { + "id": "T_PIpjQlpWKg" + } + }, + { + "cell_type": "code", + "source": [ + "print(f\"{df['Age'].min()} youngest olympic athlete\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "rirUZ4c3pXRv", + "outputId": "9c8eb601-6771-451f-8ad4-762adc885a2c" + }, + "execution_count": 111, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "11.0 youngest olympic athlete\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 2. Create filters to find out:" + ], + "metadata": { + "id": "Oet_zYgQobsm" + } + }, + { + "cell_type": "markdown", + "source": [ + "### a. How many athletes who participated in the Sport \"Tug-Of-War\" were from Team \"Sweden\" in our sample dataset?" + ], + "metadata": { + "id": "N7sAzCm5pwdR" + } + }, + { + "cell_type": "code", + "source": [ + "df.info()" + ], + "metadata": { + "id": "ulRJ93drp2yT" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "tug = df['Sport'] == \"Tug-Of-War\"\n", + "country = df['Team'] == \"Sweden\"\n", + "total = df[tug & country]\n", + "\n", + "print(f\"{total.shape[0]} : Swedish Tug-Of-War athletes\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "A-XBIcgLpz1P", + "outputId": "eef47704-0261-46bd-a955-bdfbef7686f6" + }, + "execution_count": 113, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "6 : Swedish Tug-Of-War athletes\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "# tug.isna().sum()\n", + "# country.isna().sum()" + ], + "metadata": { + "id": "zSS-sahIqezG" + }, + "execution_count": 114, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### b. How many Gold medals were awarded to athletes who were over 40 years old?" + ], + "metadata": { + "id": "gielhduHqnPe" + } + }, + { + "cell_type": "code", + "source": [ + "gold = df['Medal'] == \"Gold\"\n", + "forty = df['Age'] > 40\n", + "total = df[gold & forty]\n", + "\n", + "print(f\"{total.shape[0]} athletes over 40 and won gold\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cIBJgXjzqyUV", + "outputId": "3977e274-3280-49cc-fa66-ec0205a7ff6b" + }, + "execution_count": 115, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "66 athletes over 40 and won gold\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 3. Determine the average height if:" + ], + "metadata": { + "id": "KQvXpE0srClp" + } + }, + { + "cell_type": "markdown", + "source": [ + "### a. All rows are included." + ], + "metadata": { + "id": "LAD9A4CGrGJt" + } + }, + { + "cell_type": "code", + "source": [ + "df.isna().sum()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "65ETAWT9rHpB", + "outputId": "cad9f06b-6d97-4ef9-d44a-fc978addf3c9" + }, + "execution_count": 116, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "ID 0\n", + "Age 1473\n", + "Height 9001\n", + "Team 0\n", + "NOC 0\n", + "Games 0\n", + "Year 0\n", + "Season 0\n", + "City 0\n", + "Sport 0\n", + "Event 0\n", + "Medal 34699\n", + "dtype: int64" + ] + }, + "metadata": {}, + "execution_count": 116 + } + ] + }, + { + "cell_type": "code", + "source": [ + "df['Height'].fillna(value=df['Height'].mean(), inplace=True)" + ], + "metadata": { + "id": "PIu07JtJrlmu" + }, + "execution_count": 117, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.info().sum()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 508 + }, + "id": "ekeejLD9s6I5", + "outputId": "a388962d-b4ec-47fe-bbe6-482d25f0f2d6" + }, + "execution_count": 118, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "\n", + "RangeIndex: 40616 entries, 0 to 40615\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 ID 40616 non-null int64 \n", + " 1 Age 39143 non-null float64\n", + " 2 Height 40616 non-null float64\n", + " 3 Team 40616 non-null object \n", + " 4 NOC 40616 non-null object \n", + " 5 Games 40616 non-null object \n", + " 6 Year 40616 non-null int64 \n", + " 7 Season 40616 non-null object \n", + " 8 City 40616 non-null object \n", + " 9 Sport 40616 non-null object \n", + " 10 Event 40616 non-null object \n", + " 11 Medal 5917 non-null object \n", + "dtypes: float64(2), int64(2), object(8)\n", + "memory usage: 3.7+ MB\n" + ] + }, + { + "output_type": "error", + "ename": "AttributeError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'sum'" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "print(f\"{round(df['Height'].mean(), 2)} : average height\")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "GZvlSKHUs9pI", + "outputId": "98278521-673b-41ea-c6cb-14bdd0cd6323" + }, + "execution_count": 119, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "175.38 : average height\n" + ] + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### b. All rows are included but grouped by Event." + ], + "metadata": { + "id": "8jItbMYWtO4t" + } + }, + { + "cell_type": "code", + "source": [ + "df.groupby(['Event'])['Height'].mean()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "2xFAO2nAtSyp", + "outputId": "7c18c1ae-f5b2-412d-91fa-56d3019ac24e" + }, + "execution_count": 120, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Event\n", + "Alpine Skiing Men's Combined 177.744508\n", + "Alpine Skiing Men's Downhill 177.422101\n", + "Alpine Skiing Men's Giant Slalom 176.334801\n", + "Alpine Skiing Men's Slalom 176.161591\n", + "Alpine Skiing Men's Super G 179.193301\n", + " ... \n", + "Wrestling Women's Flyweight, Freestyle 158.100000\n", + "Wrestling Women's Heavyweight, Freestyle 173.444444\n", + "Wrestling Women's Light-Heavyweight, Freestyle 170.000000\n", + "Wrestling Women's Lightweight, Freestyle 162.666667\n", + "Wrestling Women's Middleweight, Freestyle 164.533333\n", + "Name: Height, Length: 726, dtype: float64" + ] + }, + "metadata": {}, + "execution_count": 120 + } + ] + }, + { + "cell_type": "markdown", + "source": [ + "### c. Bonus: Fin average height of all the athletes, but don't want to include any athlete more than once. \n", + "\n", + "Hint: We learned how to drop duplicates if EVERY column was identical\n", + "\n", + "Hint: Now we want to drop a row any time just the ID is repeated. To do this, you can use a subset of your data as an argument in the drop_duplicates function. You can also tell Python which of the duplicates to keep by using the keep argument." + ], + "metadata": { + "id": "MwZIpOlmtrVk" + } + }, + { + "cell_type": "code", + "source": [ + "df.duplicated().sum()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "zDAEIiF-t9dd", + "outputId": "d90572b4-16ff-456f-e5b3-88d5ac5fb7ce" + }, + "execution_count": 121, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "207" + ] + }, + "metadata": {}, + "execution_count": 121 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# drops the identical rows\n", + "new_df = df.drop_duplicates()" + ], + "metadata": { + "id": "9RdX8TfduBRP" + }, + "execution_count": 122, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.duplicated().sum()" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pQHyua7zuEBl", + "outputId": "c80cbcc9-80aa-418e-8d8f-c0bb0a2aba64" + }, + "execution_count": 123, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "207" + ] + }, + "metadata": {}, + "execution_count": 123 + } + ] + }, + { + "cell_type": "code", + "source": [ + "# drops identical ids\n", + "new_df = df.drop_duplicates(subset=['ID'])" + ], + "metadata": { + "id": "SapWRK1cuKl1" + }, + "execution_count": 124, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "df.shape, new_df.shape" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "dsrJ5jZOurZI", + "outputId": "dccdb2dd-14f6-4e4c-a648-fa79e064278c" + }, + "execution_count": 126, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "((40616, 12), (20336, 12))" + ] + }, + "metadata": {}, + "execution_count": 126 + } + ] + } + ] +} \ No newline at end of file From 21f3078ae11a8a392295d0ad7add4446d9507d26 Mon Sep 17 00:00:00 2001 From: Lisa Broadhead <37422388+lisabroadhead@users.noreply.github.com> Date: Mon, 6 Jun 2022 15:38:33 -0500 Subject: [PATCH 2/2] Delete olympian.ipynb --- olympian.ipynb | 902 ------------------------------------------------- 1 file changed, 902 deletions(-) delete mode 100644 olympian.ipynb diff --git a/olympian.ipynb b/olympian.ipynb deleted file mode 100644 index fec0af3..0000000 --- a/olympian.ipynb +++ /dev/null @@ -1,902 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "name": "olympian.ipynb", - "provenance": [], - "collapsed_sections": [], - "mount_file_id": "1BnM2nF0qZYP7dc9ciyIDoKcMjMLsh-8M", - "authorship_tag": "ABX9TyOTuJS8S5ykRqDGRYe66Bak", - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "markdown", - "source": [ - "# Find average height per individual olympian per year exercise (Core)" - ], - "metadata": { - "id": "TgcY0EcYnNrE" - } - }, - { - "cell_type": "markdown", - "source": [ - "## 1. Load this dataset and explore it! Answer the following questions:" - ], - "metadata": { - "id": "8Wmk4yyanVAG" - } - }, - { - "cell_type": "code", - "source": [ - "import pandas as pd\n", - "import numpy as np" - ], - "metadata": { - "id": "Bgnsj0JOnW2s" - }, - "execution_count": 102, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "file = \"/content/drive/MyDrive/Colab Notebooks/coding_dojo/files/athleteEventsNoPersonal.csv\"\n", - "df = pd.read_csv(file)" - ], - "metadata": { - "id": "uRrzNXwSnd8D" - }, - "execution_count": 103, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "df.head()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 206 - }, - "id": "kOoXnWyTn9zt", - "outputId": "666c6c01-e501-4489-d56d-28faa3a11e95" - }, - "execution_count": 105, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " ID Age Height Team NOC Games Year Season City \\\n", - "0 5 21.0 185.0 Netherlands NED 1988 Winter 1988 Winter Calgary \n", - "1 5 21.0 185.0 Netherlands NED 1988 Winter 1988 Winter Calgary \n", - "2 5 25.0 185.0 Netherlands NED 1992 Winter 1992 Winter Albertville \n", - "3 5 25.0 185.0 Netherlands NED 1992 Winter 1992 Winter Albertville \n", - "4 5 27.0 185.0 Netherlands NED 1994 Winter 1994 Winter Lillehammer \n", - "\n", - " Sport Event Medal \n", - "0 Speed Skating Speed Skating Women's 500 metres NaN \n", - "1 Speed Skating Speed Skating Women's 1,000 metres NaN \n", - "2 Speed Skating Speed Skating Women's 500 metres NaN \n", - "3 Speed Skating Speed Skating Women's 1,000 metres NaN \n", - "4 Speed Skating Speed Skating Women's 500 metres NaN " - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
IDAgeHeightTeamNOCGamesYearSeasonCitySportEventMedal
0521.0185.0NetherlandsNED1988 Winter1988WinterCalgarySpeed SkatingSpeed Skating Women's 500 metresNaN
1521.0185.0NetherlandsNED1988 Winter1988WinterCalgarySpeed SkatingSpeed Skating Women's 1,000 metresNaN
2525.0185.0NetherlandsNED1992 Winter1992WinterAlbertvilleSpeed SkatingSpeed Skating Women's 500 metresNaN
3525.0185.0NetherlandsNED1992 Winter1992WinterAlbertvilleSpeed SkatingSpeed Skating Women's 1,000 metresNaN
4527.0185.0NetherlandsNED1994 Winter1994WinterLillehammerSpeed SkatingSpeed Skating Women's 500 metresNaN
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 105 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### a. How many rows?" - ], - "metadata": { - "id": "NHDoLsh-naiT" - } - }, - { - "cell_type": "code", - "source": [ - "print(f\"{df.shape[0]} : rows\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "-RmJqFbindHG", - "outputId": "ab196244-fabc-47e4-b04c-90ab3393d5f3" - }, - "execution_count": 106, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "40616 : rows\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### a. How many columns?" - ], - "metadata": { - "id": "AdHEZddqoHZE" - } - }, - { - "cell_type": "code", - "source": [ - "print(f\"{df.shape[1]} : columns\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "01JwkvpcoI1f", - "outputId": "17f8bdde-4a76-449c-8be4-fe4d3d3510f4" - }, - "execution_count": 107, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "12 : columns\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### c. Which columns have missing values? (Why do think there are there so many null values in the Medals column?)" - ], - "metadata": { - "id": "621nGurCoOLS" - } - }, - { - "cell_type": "code", - "source": [ - "df.isna().sum()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "1rTNIYU8oRVl", - "outputId": "012ac4fa-2466-4ce6-dd0e-df3ccad82a5d" - }, - "execution_count": 108, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "ID 0\n", - "Age 1473\n", - "Height 9001\n", - "Team 0\n", - "NOC 0\n", - "Games 0\n", - "Year 0\n", - "Season 0\n", - "City 0\n", - "Sport 0\n", - "Event 0\n", - "Medal 34699\n", - "dtype: int64" - ] - }, - "metadata": {}, - "execution_count": 108 - } - ] - }, - { - "cell_type": "code", - "source": [ - "# Missing values in the medals because there are only 3 medals given out and there are more athletes to medals. Not everyone will get a medal" - ], - "metadata": { - "id": "88eF9sOZocYi" - }, - "execution_count": 109, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### d. How many entries correspond to the city of London?" - ], - "metadata": { - "id": "AtRfXHPaojJ6" - } - }, - { - "cell_type": "code", - "source": [ - "london = df['City'] == \"London\"\n", - "print(f\"{london.sum()} : london entries\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pmr0cqluolD7", - "outputId": "33a7c62d-7b5d-4288-daa3-2f3d846e4c66" - }, - "execution_count": 110, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "3370 : london entries\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - " ### e. What age is the youngest athlete in our sample data? Hint: use min()" - ], - "metadata": { - "id": "T_PIpjQlpWKg" - } - }, - { - "cell_type": "code", - "source": [ - "print(f\"{df['Age'].min()} youngest olympic athlete\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "rirUZ4c3pXRv", - "outputId": "9c8eb601-6771-451f-8ad4-762adc885a2c" - }, - "execution_count": 111, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "11.0 youngest olympic athlete\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## 2. Create filters to find out:" - ], - "metadata": { - "id": "Oet_zYgQobsm" - } - }, - { - "cell_type": "markdown", - "source": [ - "### a. How many athletes who participated in the Sport \"Tug-Of-War\" were from Team \"Sweden\" in our sample dataset?" - ], - "metadata": { - "id": "N7sAzCm5pwdR" - } - }, - { - "cell_type": "code", - "source": [ - "df.info()" - ], - "metadata": { - "id": "ulRJ93drp2yT" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "tug = df['Sport'] == \"Tug-Of-War\"\n", - "country = df['Team'] == \"Sweden\"\n", - "total = df[tug & country]\n", - "\n", - "print(f\"{total.shape[0]} : Swedish Tug-Of-War athletes\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "A-XBIcgLpz1P", - "outputId": "eef47704-0261-46bd-a955-bdfbef7686f6" - }, - "execution_count": 113, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "6 : Swedish Tug-Of-War athletes\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# tug.isna().sum()\n", - "# country.isna().sum()" - ], - "metadata": { - "id": "zSS-sahIqezG" - }, - "execution_count": 114, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "### b. How many Gold medals were awarded to athletes who were over 40 years old?" - ], - "metadata": { - "id": "gielhduHqnPe" - } - }, - { - "cell_type": "code", - "source": [ - "gold = df['Medal'] == \"Gold\"\n", - "forty = df['Age'] > 40\n", - "total = df[gold & forty]\n", - "\n", - "print(f\"{total.shape[0]} athletes over 40 and won gold\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "cIBJgXjzqyUV", - "outputId": "3977e274-3280-49cc-fa66-ec0205a7ff6b" - }, - "execution_count": 115, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "66 athletes over 40 and won gold\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## 3. Determine the average height if:" - ], - "metadata": { - "id": "KQvXpE0srClp" - } - }, - { - "cell_type": "markdown", - "source": [ - "### a. All rows are included." - ], - "metadata": { - "id": "LAD9A4CGrGJt" - } - }, - { - "cell_type": "code", - "source": [ - "df.isna().sum()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "65ETAWT9rHpB", - "outputId": "cad9f06b-6d97-4ef9-d44a-fc978addf3c9" - }, - "execution_count": 116, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "ID 0\n", - "Age 1473\n", - "Height 9001\n", - "Team 0\n", - "NOC 0\n", - "Games 0\n", - "Year 0\n", - "Season 0\n", - "City 0\n", - "Sport 0\n", - "Event 0\n", - "Medal 34699\n", - "dtype: int64" - ] - }, - "metadata": {}, - "execution_count": 116 - } - ] - }, - { - "cell_type": "code", - "source": [ - "df['Height'].fillna(value=df['Height'].mean(), inplace=True)" - ], - "metadata": { - "id": "PIu07JtJrlmu" - }, - "execution_count": 117, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "df.info().sum()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 508 - }, - "id": "ekeejLD9s6I5", - "outputId": "a388962d-b4ec-47fe-bbe6-482d25f0f2d6" - }, - "execution_count": 118, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "RangeIndex: 40616 entries, 0 to 40615\n", - "Data columns (total 12 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 ID 40616 non-null int64 \n", - " 1 Age 39143 non-null float64\n", - " 2 Height 40616 non-null float64\n", - " 3 Team 40616 non-null object \n", - " 4 NOC 40616 non-null object \n", - " 5 Games 40616 non-null object \n", - " 6 Year 40616 non-null int64 \n", - " 7 Season 40616 non-null object \n", - " 8 City 40616 non-null object \n", - " 9 Sport 40616 non-null object \n", - " 10 Event 40616 non-null object \n", - " 11 Medal 5917 non-null object \n", - "dtypes: float64(2), int64(2), object(8)\n", - "memory usage: 3.7+ MB\n" - ] - }, - { - "output_type": "error", - "ename": "AttributeError", - "evalue": "ignored", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'sum'" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "print(f\"{round(df['Height'].mean(), 2)} : average height\")" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "GZvlSKHUs9pI", - "outputId": "98278521-673b-41ea-c6cb-14bdd0cd6323" - }, - "execution_count": 119, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "175.38 : average height\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### b. All rows are included but grouped by Event." - ], - "metadata": { - "id": "8jItbMYWtO4t" - } - }, - { - "cell_type": "code", - "source": [ - "df.groupby(['Event'])['Height'].mean()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2xFAO2nAtSyp", - "outputId": "7c18c1ae-f5b2-412d-91fa-56d3019ac24e" - }, - "execution_count": 120, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "Event\n", - "Alpine Skiing Men's Combined 177.744508\n", - "Alpine Skiing Men's Downhill 177.422101\n", - "Alpine Skiing Men's Giant Slalom 176.334801\n", - "Alpine Skiing Men's Slalom 176.161591\n", - "Alpine Skiing Men's Super G 179.193301\n", - " ... \n", - "Wrestling Women's Flyweight, Freestyle 158.100000\n", - "Wrestling Women's Heavyweight, Freestyle 173.444444\n", - "Wrestling Women's Light-Heavyweight, Freestyle 170.000000\n", - "Wrestling Women's Lightweight, Freestyle 162.666667\n", - "Wrestling Women's Middleweight, Freestyle 164.533333\n", - "Name: Height, Length: 726, dtype: float64" - ] - }, - "metadata": {}, - "execution_count": 120 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "### c. Bonus: Fin average height of all the athletes, but don't want to include any athlete more than once. \n", - "\n", - "Hint: We learned how to drop duplicates if EVERY column was identical\n", - "\n", - "Hint: Now we want to drop a row any time just the ID is repeated. To do this, you can use a subset of your data as an argument in the drop_duplicates function. You can also tell Python which of the duplicates to keep by using the keep argument." - ], - "metadata": { - "id": "MwZIpOlmtrVk" - } - }, - { - "cell_type": "code", - "source": [ - "df.duplicated().sum()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "zDAEIiF-t9dd", - "outputId": "d90572b4-16ff-456f-e5b3-88d5ac5fb7ce" - }, - "execution_count": 121, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "207" - ] - }, - "metadata": {}, - "execution_count": 121 - } - ] - }, - { - "cell_type": "code", - "source": [ - "# drops the identical rows\n", - "new_df = df.drop_duplicates()" - ], - "metadata": { - "id": "9RdX8TfduBRP" - }, - "execution_count": 122, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "df.duplicated().sum()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "pQHyua7zuEBl", - "outputId": "c80cbcc9-80aa-418e-8d8f-c0bb0a2aba64" - }, - "execution_count": 123, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "207" - ] - }, - "metadata": {}, - "execution_count": 123 - } - ] - }, - { - "cell_type": "code", - "source": [ - "# drops identical ids\n", - "new_df = df.drop_duplicates(subset=['ID'])" - ], - "metadata": { - "id": "SapWRK1cuKl1" - }, - "execution_count": 124, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "df.shape, new_df.shape" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "dsrJ5jZOurZI", - "outputId": "dccdb2dd-14f6-4e4c-a648-fa79e064278c" - }, - "execution_count": 126, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "((40616, 12), (20336, 12))" - ] - }, - "metadata": {}, - "execution_count": 126 - } - ] - } - ] -} \ No newline at end of file