From 7cedd51ffb75490765494d22f993646bd694740f Mon Sep 17 00:00:00 2001
From: Lisa Broadhead <37422388+lisabroadhead@users.noreply.github.com>
Date: Mon, 6 Jun 2022 15:38:09 -0500
Subject: [PATCH 1/2] Created using Colaboratory
---
olympian.ipynb | 902 +++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 902 insertions(+)
create mode 100644 olympian.ipynb
diff --git a/olympian.ipynb b/olympian.ipynb
new file mode 100644
index 0000000..fec0af3
--- /dev/null
+++ b/olympian.ipynb
@@ -0,0 +1,902 @@
+{
+ "nbformat": 4,
+ "nbformat_minor": 0,
+ "metadata": {
+ "colab": {
+ "name": "olympian.ipynb",
+ "provenance": [],
+ "collapsed_sections": [],
+ "mount_file_id": "1BnM2nF0qZYP7dc9ciyIDoKcMjMLsh-8M",
+ "authorship_tag": "ABX9TyOTuJS8S5ykRqDGRYe66Bak",
+ "include_colab_link": true
+ },
+ "kernelspec": {
+ "name": "python3",
+ "display_name": "Python 3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "view-in-github",
+ "colab_type": "text"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "# Find average height per individual olympian per year exercise (Core)"
+ ],
+ "metadata": {
+ "id": "TgcY0EcYnNrE"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 1. Load this dataset and explore it! Answer the following questions:"
+ ],
+ "metadata": {
+ "id": "8Wmk4yyanVAG"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np"
+ ],
+ "metadata": {
+ "id": "Bgnsj0JOnW2s"
+ },
+ "execution_count": 102,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "file = \"/content/drive/MyDrive/Colab Notebooks/coding_dojo/files/athleteEventsNoPersonal.csv\"\n",
+ "df = pd.read_csv(file)"
+ ],
+ "metadata": {
+ "id": "uRrzNXwSnd8D"
+ },
+ "execution_count": 103,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.head()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 206
+ },
+ "id": "kOoXnWyTn9zt",
+ "outputId": "666c6c01-e501-4489-d56d-28faa3a11e95"
+ },
+ "execution_count": 105,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ " ID Age Height Team NOC Games Year Season City \\\n",
+ "0 5 21.0 185.0 Netherlands NED 1988 Winter 1988 Winter Calgary \n",
+ "1 5 21.0 185.0 Netherlands NED 1988 Winter 1988 Winter Calgary \n",
+ "2 5 25.0 185.0 Netherlands NED 1992 Winter 1992 Winter Albertville \n",
+ "3 5 25.0 185.0 Netherlands NED 1992 Winter 1992 Winter Albertville \n",
+ "4 5 27.0 185.0 Netherlands NED 1994 Winter 1994 Winter Lillehammer \n",
+ "\n",
+ " Sport Event Medal \n",
+ "0 Speed Skating Speed Skating Women's 500 metres NaN \n",
+ "1 Speed Skating Speed Skating Women's 1,000 metres NaN \n",
+ "2 Speed Skating Speed Skating Women's 500 metres NaN \n",
+ "3 Speed Skating Speed Skating Women's 1,000 metres NaN \n",
+ "4 Speed Skating Speed Skating Women's 500 metres NaN "
+ ],
+ "text/html": [
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ID | \n",
+ " Age | \n",
+ " Height | \n",
+ " Team | \n",
+ " NOC | \n",
+ " Games | \n",
+ " Year | \n",
+ " Season | \n",
+ " City | \n",
+ " Sport | \n",
+ " Event | \n",
+ " Medal | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " | 0 | \n",
+ " 5 | \n",
+ " 21.0 | \n",
+ " 185.0 | \n",
+ " Netherlands | \n",
+ " NED | \n",
+ " 1988 Winter | \n",
+ " 1988 | \n",
+ " Winter | \n",
+ " Calgary | \n",
+ " Speed Skating | \n",
+ " Speed Skating Women's 500 metres | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 1 | \n",
+ " 5 | \n",
+ " 21.0 | \n",
+ " 185.0 | \n",
+ " Netherlands | \n",
+ " NED | \n",
+ " 1988 Winter | \n",
+ " 1988 | \n",
+ " Winter | \n",
+ " Calgary | \n",
+ " Speed Skating | \n",
+ " Speed Skating Women's 1,000 metres | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 2 | \n",
+ " 5 | \n",
+ " 25.0 | \n",
+ " 185.0 | \n",
+ " Netherlands | \n",
+ " NED | \n",
+ " 1992 Winter | \n",
+ " 1992 | \n",
+ " Winter | \n",
+ " Albertville | \n",
+ " Speed Skating | \n",
+ " Speed Skating Women's 500 metres | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 3 | \n",
+ " 5 | \n",
+ " 25.0 | \n",
+ " 185.0 | \n",
+ " Netherlands | \n",
+ " NED | \n",
+ " 1992 Winter | \n",
+ " 1992 | \n",
+ " Winter | \n",
+ " Albertville | \n",
+ " Speed Skating | \n",
+ " Speed Skating Women's 1,000 metres | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " | 4 | \n",
+ " 5 | \n",
+ " 27.0 | \n",
+ " 185.0 | \n",
+ " Netherlands | \n",
+ " NED | \n",
+ " 1994 Winter | \n",
+ " 1994 | \n",
+ " Winter | \n",
+ " Lillehammer | \n",
+ " Speed Skating | \n",
+ " Speed Skating Women's 500 metres | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ "\n",
+ " \n",
+ "
\n",
+ "
\n",
+ " "
+ ]
+ },
+ "metadata": {},
+ "execution_count": 105
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### a. How many rows?"
+ ],
+ "metadata": {
+ "id": "NHDoLsh-naiT"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(f\"{df.shape[0]} : rows\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "-RmJqFbindHG",
+ "outputId": "ab196244-fabc-47e4-b04c-90ab3393d5f3"
+ },
+ "execution_count": 106,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "40616 : rows\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### b. How many columns?"
+ ],
+ "metadata": {
+ "id": "AdHEZddqoHZE"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(f\"{df.shape[1]} : columns\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "01JwkvpcoI1f",
+ "outputId": "17f8bdde-4a76-449c-8be4-fe4d3d3510f4"
+ },
+ "execution_count": 107,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "12 : columns\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### c. Which columns have missing values? (Why do you think there are so many null values in the Medal column?)"
+ ],
+ "metadata": {
+ "id": "621nGurCoOLS"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.isna().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "1rTNIYU8oRVl",
+ "outputId": "012ac4fa-2466-4ce6-dd0e-df3ccad82a5d"
+ },
+ "execution_count": 108,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "ID 0\n",
+ "Age 1473\n",
+ "Height 9001\n",
+ "Team 0\n",
+ "NOC 0\n",
+ "Games 0\n",
+ "Year 0\n",
+ "Season 0\n",
+ "City 0\n",
+ "Sport 0\n",
+ "Event 0\n",
+ "Medal 34699\n",
+ "dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 108
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# Medal has many missing values because only 3 medals are given out per event and there are far more athletes than medals, so not everyone gets a medal."
+ ],
+ "metadata": {
+ "id": "88eF9sOZocYi"
+ },
+ "execution_count": 109,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### d. How many entries correspond to the city of London?"
+ ],
+ "metadata": {
+ "id": "AtRfXHPaojJ6"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "london = df['City'] == \"London\"\n",
+ "print(f\"{london.sum()} : london entries\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "pmr0cqluolD7",
+ "outputId": "33a7c62d-7b5d-4288-daa3-2f3d846e4c66"
+ },
+ "execution_count": 110,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "3370 : london entries\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### e. What age is the youngest athlete in our sample data? Hint: use min()"
+ ],
+ "metadata": {
+ "id": "T_PIpjQlpWKg"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(f\"{df['Age'].min()} youngest olympic athlete\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "rirUZ4c3pXRv",
+ "outputId": "9c8eb601-6771-451f-8ad4-762adc885a2c"
+ },
+ "execution_count": 111,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "11.0 youngest olympic athlete\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 2. Create filters to find out:"
+ ],
+ "metadata": {
+ "id": "Oet_zYgQobsm"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### a. How many athletes who participated in the Sport \"Tug-Of-War\" were from Team \"Sweden\" in our sample dataset?"
+ ],
+ "metadata": {
+ "id": "N7sAzCm5pwdR"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.info()"
+ ],
+ "metadata": {
+ "id": "ulRJ93drp2yT"
+ },
+ "execution_count": null,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "tug = df['Sport'] == \"Tug-Of-War\"\n",
+ "country = df['Team'] == \"Sweden\"\n",
+ "total = df[tug & country]\n",
+ "\n",
+ "print(f\"{total.shape[0]} : Swedish Tug-Of-War athletes\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "A-XBIcgLpz1P",
+ "outputId": "eef47704-0261-46bd-a955-bdfbef7686f6"
+ },
+ "execution_count": 113,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "6 : Swedish Tug-Of-War athletes\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# tug.isna().sum()\n",
+ "# country.isna().sum()"
+ ],
+ "metadata": {
+ "id": "zSS-sahIqezG"
+ },
+ "execution_count": 114,
+ "outputs": []
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### b. How many Gold medals were awarded to athletes who were over 40 years old?"
+ ],
+ "metadata": {
+ "id": "gielhduHqnPe"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "gold = df['Medal'] == \"Gold\"\n",
+ "forty = df['Age'] > 40\n",
+ "total = df[gold & forty]\n",
+ "\n",
+ "print(f\"{total.shape[0]} athletes over 40 and won gold\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "cIBJgXjzqyUV",
+ "outputId": "3977e274-3280-49cc-fa66-ec0205a7ff6b"
+ },
+ "execution_count": 115,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "66 athletes over 40 and won gold\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "## 3. Determine the average height if:"
+ ],
+ "metadata": {
+ "id": "KQvXpE0srClp"
+ }
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### a. All rows are included."
+ ],
+ "metadata": {
+ "id": "LAD9A4CGrGJt"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.isna().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "65ETAWT9rHpB",
+ "outputId": "cad9f06b-6d97-4ef9-d44a-fc978addf3c9"
+ },
+ "execution_count": 116,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "ID 0\n",
+ "Age 1473\n",
+ "Height 9001\n",
+ "Team 0\n",
+ "NOC 0\n",
+ "Games 0\n",
+ "Year 0\n",
+ "Season 0\n",
+ "City 0\n",
+ "Sport 0\n",
+ "Event 0\n",
+ "Medal 34699\n",
+ "dtype: int64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 116
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df['Height'].fillna(value=df['Height'].mean(), inplace=True)"
+ ],
+ "metadata": {
+ "id": "PIu07JtJrlmu"
+ },
+ "execution_count": 117,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.info().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 508
+ },
+ "id": "ekeejLD9s6I5",
+ "outputId": "a388962d-b4ec-47fe-bbe6-482d25f0f2d6"
+ },
+ "execution_count": 118,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "\n",
+ "RangeIndex: 40616 entries, 0 to 40615\n",
+ "Data columns (total 12 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 ID 40616 non-null int64 \n",
+ " 1 Age 39143 non-null float64\n",
+ " 2 Height 40616 non-null float64\n",
+ " 3 Team 40616 non-null object \n",
+ " 4 NOC 40616 non-null object \n",
+ " 5 Games 40616 non-null object \n",
+ " 6 Year 40616 non-null int64 \n",
+ " 7 Season 40616 non-null object \n",
+ " 8 City 40616 non-null object \n",
+ " 9 Sport 40616 non-null object \n",
+ " 10 Event 40616 non-null object \n",
+ " 11 Medal 5917 non-null object \n",
+ "dtypes: float64(2), int64(2), object(8)\n",
+ "memory usage: 3.7+ MB\n"
+ ]
+ },
+ {
+ "output_type": "error",
+ "ename": "AttributeError",
+ "evalue": "ignored",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'sum'"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "print(f\"{round(df['Height'].mean(), 2)} : average height\")"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "GZvlSKHUs9pI",
+ "outputId": "98278521-673b-41ea-c6cb-14bdd0cd6323"
+ },
+ "execution_count": 119,
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "175.38 : average height\n"
+ ]
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### b. All rows are included but grouped by Event."
+ ],
+ "metadata": {
+ "id": "8jItbMYWtO4t"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.groupby(['Event'])['Height'].mean()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "2xFAO2nAtSyp",
+ "outputId": "7c18c1ae-f5b2-412d-91fa-56d3019ac24e"
+ },
+ "execution_count": 120,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "Event\n",
+ "Alpine Skiing Men's Combined 177.744508\n",
+ "Alpine Skiing Men's Downhill 177.422101\n",
+ "Alpine Skiing Men's Giant Slalom 176.334801\n",
+ "Alpine Skiing Men's Slalom 176.161591\n",
+ "Alpine Skiing Men's Super G 179.193301\n",
+ " ... \n",
+ "Wrestling Women's Flyweight, Freestyle 158.100000\n",
+ "Wrestling Women's Heavyweight, Freestyle 173.444444\n",
+ "Wrestling Women's Light-Heavyweight, Freestyle 170.000000\n",
+ "Wrestling Women's Lightweight, Freestyle 162.666667\n",
+ "Wrestling Women's Middleweight, Freestyle 164.533333\n",
+ "Name: Height, Length: 726, dtype: float64"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 120
+ }
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "source": [
+ "### c. Bonus: Find the average height of all the athletes, but don't include any athlete more than once. \n",
+ "\n",
+ "Hint: We learned how to drop duplicates if EVERY column was identical\n",
+ "\n",
+ "Hint: Now we want to drop a row any time just the ID is repeated. To do this, you can use a subset of your data as an argument in the drop_duplicates function. You can also tell Python which of the duplicates to keep by using the keep argument."
+ ],
+ "metadata": {
+ "id": "MwZIpOlmtrVk"
+ }
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.duplicated().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "zDAEIiF-t9dd",
+ "outputId": "d90572b4-16ff-456f-e5b3-88d5ac5fb7ce"
+ },
+ "execution_count": 121,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "207"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 121
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# drops the identical rows\n",
+ "new_df = df.drop_duplicates()"
+ ],
+ "metadata": {
+ "id": "9RdX8TfduBRP"
+ },
+ "execution_count": 122,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.duplicated().sum()"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "pQHyua7zuEBl",
+ "outputId": "c80cbcc9-80aa-418e-8d8f-c0bb0a2aba64"
+ },
+ "execution_count": 123,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "207"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 123
+ }
+ ]
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "# drops identical ids\n",
+ "new_df = df.drop_duplicates(subset=['ID'])"
+ ],
+ "metadata": {
+ "id": "SapWRK1cuKl1"
+ },
+ "execution_count": 124,
+ "outputs": []
+ },
+ {
+ "cell_type": "code",
+ "source": [
+ "df.shape, new_df.shape"
+ ],
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "dsrJ5jZOurZI",
+ "outputId": "dccdb2dd-14f6-4e4c-a648-fa79e064278c"
+ },
+ "execution_count": 126,
+ "outputs": [
+ {
+ "output_type": "execute_result",
+ "data": {
+ "text/plain": [
+ "((40616, 12), (20336, 12))"
+ ]
+ },
+ "metadata": {},
+ "execution_count": 126
+ }
+ ]
+ }
+ ]
+}
\ No newline at end of file
From 21f3078ae11a8a392295d0ad7add4446d9507d26 Mon Sep 17 00:00:00 2001
From: Lisa Broadhead <37422388+lisabroadhead@users.noreply.github.com>
Date: Mon, 6 Jun 2022 15:38:33 -0500
Subject: [PATCH 2/2] Delete olympian.ipynb
---
olympian.ipynb | 902 -------------------------------------------------
1 file changed, 902 deletions(-)
delete mode 100644 olympian.ipynb
diff --git a/olympian.ipynb b/olympian.ipynb
deleted file mode 100644
index fec0af3..0000000
--- a/olympian.ipynb
+++ /dev/null
@@ -1,902 +0,0 @@
-{
- "nbformat": 4,
- "nbformat_minor": 0,
- "metadata": {
- "colab": {
- "name": "olympian.ipynb",
- "provenance": [],
- "collapsed_sections": [],
- "mount_file_id": "1BnM2nF0qZYP7dc9ciyIDoKcMjMLsh-8M",
- "authorship_tag": "ABX9TyOTuJS8S5ykRqDGRYe66Bak",
- "include_colab_link": true
- },
- "kernelspec": {
- "name": "python3",
- "display_name": "Python 3"
- },
- "language_info": {
- "name": "python"
- }
- },
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "view-in-github",
- "colab_type": "text"
- },
- "source": [
- "
"
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "# Find average height per individual olympian per year exercise (Core)"
- ],
- "metadata": {
- "id": "TgcY0EcYnNrE"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "## 1. Load this dataset and explore it! Answer the following questions:"
- ],
- "metadata": {
- "id": "8Wmk4yyanVAG"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "import pandas as pd\n",
- "import numpy as np"
- ],
- "metadata": {
- "id": "Bgnsj0JOnW2s"
- },
- "execution_count": 102,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "file = \"/content/drive/MyDrive/Colab Notebooks/coding_dojo/files/athleteEventsNoPersonal.csv\"\n",
- "df = pd.read_csv(file)"
- ],
- "metadata": {
- "id": "uRrzNXwSnd8D"
- },
- "execution_count": 103,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "df.head()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 206
- },
- "id": "kOoXnWyTn9zt",
- "outputId": "666c6c01-e501-4489-d56d-28faa3a11e95"
- },
- "execution_count": 105,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- " ID Age Height Team NOC Games Year Season City \\\n",
- "0 5 21.0 185.0 Netherlands NED 1988 Winter 1988 Winter Calgary \n",
- "1 5 21.0 185.0 Netherlands NED 1988 Winter 1988 Winter Calgary \n",
- "2 5 25.0 185.0 Netherlands NED 1992 Winter 1992 Winter Albertville \n",
- "3 5 25.0 185.0 Netherlands NED 1992 Winter 1992 Winter Albertville \n",
- "4 5 27.0 185.0 Netherlands NED 1994 Winter 1994 Winter Lillehammer \n",
- "\n",
- " Sport Event Medal \n",
- "0 Speed Skating Speed Skating Women's 500 metres NaN \n",
- "1 Speed Skating Speed Skating Women's 1,000 metres NaN \n",
- "2 Speed Skating Speed Skating Women's 500 metres NaN \n",
- "3 Speed Skating Speed Skating Women's 1,000 metres NaN \n",
- "4 Speed Skating Speed Skating Women's 500 metres NaN "
- ],
- "text/html": [
- "\n",
- " \n",
- "
\n",
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " ID | \n",
- " Age | \n",
- " Height | \n",
- " Team | \n",
- " NOC | \n",
- " Games | \n",
- " Year | \n",
- " Season | \n",
- " City | \n",
- " Sport | \n",
- " Event | \n",
- " Medal | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 0 | \n",
- " 5 | \n",
- " 21.0 | \n",
- " 185.0 | \n",
- " Netherlands | \n",
- " NED | \n",
- " 1988 Winter | \n",
- " 1988 | \n",
- " Winter | \n",
- " Calgary | \n",
- " Speed Skating | \n",
- " Speed Skating Women's 500 metres | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 1 | \n",
- " 5 | \n",
- " 21.0 | \n",
- " 185.0 | \n",
- " Netherlands | \n",
- " NED | \n",
- " 1988 Winter | \n",
- " 1988 | \n",
- " Winter | \n",
- " Calgary | \n",
- " Speed Skating | \n",
- " Speed Skating Women's 1,000 metres | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 2 | \n",
- " 5 | \n",
- " 25.0 | \n",
- " 185.0 | \n",
- " Netherlands | \n",
- " NED | \n",
- " 1992 Winter | \n",
- " 1992 | \n",
- " Winter | \n",
- " Albertville | \n",
- " Speed Skating | \n",
- " Speed Skating Women's 500 metres | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 3 | \n",
- " 5 | \n",
- " 25.0 | \n",
- " 185.0 | \n",
- " Netherlands | \n",
- " NED | \n",
- " 1992 Winter | \n",
- " 1992 | \n",
- " Winter | \n",
- " Albertville | \n",
- " Speed Skating | \n",
- " Speed Skating Women's 1,000 metres | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " | 4 | \n",
- " 5 | \n",
- " 27.0 | \n",
- " 185.0 | \n",
- " Netherlands | \n",
- " NED | \n",
- " 1994 Winter | \n",
- " 1994 | \n",
- " Winter | \n",
- " Lillehammer | \n",
- " Speed Skating | \n",
- " Speed Skating Women's 500 metres | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
\n",
- "
\n",
- " \n",
- " \n",
- "\n",
- " \n",
- "
\n",
- "
\n",
- " "
- ]
- },
- "metadata": {},
- "execution_count": 105
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### a. How many rows?"
- ],
- "metadata": {
- "id": "NHDoLsh-naiT"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "print(f\"{df.shape[0]} : rows\")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "-RmJqFbindHG",
- "outputId": "ab196244-fabc-47e4-b04c-90ab3393d5f3"
- },
- "execution_count": 106,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "40616 : rows\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### b. How many columns?"
- ],
- "metadata": {
- "id": "AdHEZddqoHZE"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "print(f\"{df.shape[1]} : columns\")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "01JwkvpcoI1f",
- "outputId": "17f8bdde-4a76-449c-8be4-fe4d3d3510f4"
- },
- "execution_count": 107,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "12 : columns\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### c. Which columns have missing values? (Why do you think there are so many null values in the Medal column?)"
- ],
- "metadata": {
- "id": "621nGurCoOLS"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "df.isna().sum()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "1rTNIYU8oRVl",
- "outputId": "012ac4fa-2466-4ce6-dd0e-df3ccad82a5d"
- },
- "execution_count": 108,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "ID 0\n",
- "Age 1473\n",
- "Height 9001\n",
- "Team 0\n",
- "NOC 0\n",
- "Games 0\n",
- "Year 0\n",
- "Season 0\n",
- "City 0\n",
- "Sport 0\n",
- "Event 0\n",
- "Medal 34699\n",
- "dtype: int64"
- ]
- },
- "metadata": {},
- "execution_count": 108
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# Medal has many missing values because only 3 medals are given out per event and there are far more athletes than medals, so not everyone gets a medal."
- ],
- "metadata": {
- "id": "88eF9sOZocYi"
- },
- "execution_count": 109,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "### d. How many entries correspond to the city of London?"
- ],
- "metadata": {
- "id": "AtRfXHPaojJ6"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "london = df['City'] == \"London\"\n",
- "print(f\"{london.sum()} : london entries\")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "pmr0cqluolD7",
- "outputId": "33a7c62d-7b5d-4288-daa3-2f3d846e4c66"
- },
- "execution_count": 110,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "3370 : london entries\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### e. What age is the youngest athlete in our sample data? Hint: use min()"
- ],
- "metadata": {
- "id": "T_PIpjQlpWKg"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "print(f\"{df['Age'].min()} youngest olympic athlete\")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "rirUZ4c3pXRv",
- "outputId": "9c8eb601-6771-451f-8ad4-762adc885a2c"
- },
- "execution_count": 111,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "11.0 youngest olympic athlete\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "## 2. Create filters to find out:"
- ],
- "metadata": {
- "id": "Oet_zYgQobsm"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "### a. How many athletes who participated in the Sport \"Tug-Of-War\" were from Team \"Sweden\" in our sample dataset?"
- ],
- "metadata": {
- "id": "N7sAzCm5pwdR"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "df.info()"
- ],
- "metadata": {
- "id": "ulRJ93drp2yT"
- },
- "execution_count": null,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "tug = df['Sport'] == \"Tug-Of-War\"\n",
- "country = df['Team'] == \"Sweden\"\n",
- "total = df[tug & country]\n",
- "\n",
- "print(f\"{total.shape[0]} : Swedish Tug-Of-War athletes\")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "A-XBIcgLpz1P",
- "outputId": "eef47704-0261-46bd-a955-bdfbef7686f6"
- },
- "execution_count": 113,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "6 : Swedish Tug-Of-War athletes\n"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# tug.isna().sum()\n",
- "# country.isna().sum()"
- ],
- "metadata": {
- "id": "zSS-sahIqezG"
- },
- "execution_count": 114,
- "outputs": []
- },
- {
- "cell_type": "markdown",
- "source": [
- "### b. How many Gold medals were awarded to athletes who were over 40 years old?"
- ],
- "metadata": {
- "id": "gielhduHqnPe"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "gold = df['Medal'] == \"Gold\"\n",
- "forty = df['Age'] > 40\n",
- "total = df[gold & forty]\n",
- "\n",
- "print(f\"{total.shape[0]} athletes over 40 and won gold\")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "cIBJgXjzqyUV",
- "outputId": "3977e274-3280-49cc-fa66-ec0205a7ff6b"
- },
- "execution_count": 115,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "66 athletes over 40 and won gold\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "## 3. Determine the average height if:"
- ],
- "metadata": {
- "id": "KQvXpE0srClp"
- }
- },
- {
- "cell_type": "markdown",
- "source": [
- "### a. All rows are included."
- ],
- "metadata": {
- "id": "LAD9A4CGrGJt"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "df.isna().sum()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "65ETAWT9rHpB",
- "outputId": "cad9f06b-6d97-4ef9-d44a-fc978addf3c9"
- },
- "execution_count": 116,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "ID 0\n",
- "Age 1473\n",
- "Height 9001\n",
- "Team 0\n",
- "NOC 0\n",
- "Games 0\n",
- "Year 0\n",
- "Season 0\n",
- "City 0\n",
- "Sport 0\n",
- "Event 0\n",
- "Medal 34699\n",
- "dtype: int64"
- ]
- },
- "metadata": {},
- "execution_count": 116
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "df['Height'].fillna(value=df['Height'].mean(), inplace=True)"
- ],
- "metadata": {
- "id": "PIu07JtJrlmu"
- },
- "execution_count": 117,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "df.info().sum()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/",
- "height": 508
- },
- "id": "ekeejLD9s6I5",
- "outputId": "a388962d-b4ec-47fe-bbe6-482d25f0f2d6"
- },
- "execution_count": 118,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "\n",
- "RangeIndex: 40616 entries, 0 to 40615\n",
- "Data columns (total 12 columns):\n",
- " # Column Non-Null Count Dtype \n",
- "--- ------ -------------- ----- \n",
- " 0 ID 40616 non-null int64 \n",
- " 1 Age 39143 non-null float64\n",
- " 2 Height 40616 non-null float64\n",
- " 3 Team 40616 non-null object \n",
- " 4 NOC 40616 non-null object \n",
- " 5 Games 40616 non-null object \n",
- " 6 Year 40616 non-null int64 \n",
- " 7 Season 40616 non-null object \n",
- " 8 City 40616 non-null object \n",
- " 9 Sport 40616 non-null object \n",
- " 10 Event 40616 non-null object \n",
- " 11 Medal 5917 non-null object \n",
- "dtypes: float64(2), int64(2), object(8)\n",
- "memory usage: 3.7+ MB\n"
- ]
- },
- {
- "output_type": "error",
- "ename": "AttributeError",
- "evalue": "ignored",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;31mAttributeError\u001b[0m: 'NoneType' object has no attribute 'sum'"
- ]
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "print(f\"{round(df['Height'].mean(), 2)} : average height\")"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "GZvlSKHUs9pI",
- "outputId": "98278521-673b-41ea-c6cb-14bdd0cd6323"
- },
- "execution_count": 119,
- "outputs": [
- {
- "output_type": "stream",
- "name": "stdout",
- "text": [
- "175.38 : average height\n"
- ]
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### b. All rows are included but grouped by Event."
- ],
- "metadata": {
- "id": "8jItbMYWtO4t"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "df.groupby(['Event'])['Height'].mean()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "2xFAO2nAtSyp",
- "outputId": "7c18c1ae-f5b2-412d-91fa-56d3019ac24e"
- },
- "execution_count": 120,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "Event\n",
- "Alpine Skiing Men's Combined 177.744508\n",
- "Alpine Skiing Men's Downhill 177.422101\n",
- "Alpine Skiing Men's Giant Slalom 176.334801\n",
- "Alpine Skiing Men's Slalom 176.161591\n",
- "Alpine Skiing Men's Super G 179.193301\n",
- " ... \n",
- "Wrestling Women's Flyweight, Freestyle 158.100000\n",
- "Wrestling Women's Heavyweight, Freestyle 173.444444\n",
- "Wrestling Women's Light-Heavyweight, Freestyle 170.000000\n",
- "Wrestling Women's Lightweight, Freestyle 162.666667\n",
- "Wrestling Women's Middleweight, Freestyle 164.533333\n",
- "Name: Height, Length: 726, dtype: float64"
- ]
- },
- "metadata": {},
- "execution_count": 120
- }
- ]
- },
- {
- "cell_type": "markdown",
- "source": [
- "### c. Bonus: Find the average height of all the athletes, but don't include any athlete more than once. \n",
- "\n",
- "Hint: We learned how to drop duplicates if EVERY column was identical\n",
- "\n",
- "Hint: Now we want to drop a row any time just the ID is repeated. To do this, you can use a subset of your data as an argument in the drop_duplicates function. You can also tell Python which of the duplicates to keep by using the keep argument."
- ],
- "metadata": {
- "id": "MwZIpOlmtrVk"
- }
- },
- {
- "cell_type": "code",
- "source": [
- "df.duplicated().sum()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "zDAEIiF-t9dd",
- "outputId": "d90572b4-16ff-456f-e5b3-88d5ac5fb7ce"
- },
- "execution_count": 121,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "207"
- ]
- },
- "metadata": {},
- "execution_count": 121
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# drops the identical rows\n",
- "new_df = df.drop_duplicates()"
- ],
- "metadata": {
- "id": "9RdX8TfduBRP"
- },
- "execution_count": 122,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "df.duplicated().sum()"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "pQHyua7zuEBl",
- "outputId": "c80cbcc9-80aa-418e-8d8f-c0bb0a2aba64"
- },
- "execution_count": 123,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "207"
- ]
- },
- "metadata": {},
- "execution_count": 123
- }
- ]
- },
- {
- "cell_type": "code",
- "source": [
- "# drops identical ids\n",
- "new_df = df.drop_duplicates(subset=['ID'])"
- ],
- "metadata": {
- "id": "SapWRK1cuKl1"
- },
- "execution_count": 124,
- "outputs": []
- },
- {
- "cell_type": "code",
- "source": [
- "df.shape, new_df.shape"
- ],
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "dsrJ5jZOurZI",
- "outputId": "dccdb2dd-14f6-4e4c-a648-fa79e064278c"
- },
- "execution_count": 126,
- "outputs": [
- {
- "output_type": "execute_result",
- "data": {
- "text/plain": [
- "((40616, 12), (20336, 12))"
- ]
- },
- "metadata": {},
- "execution_count": 126
- }
- ]
- }
- ]
-}
\ No newline at end of file