Skip to content

Commit c32e5f9

Browse files
authored
Update Explore BikeShare.py after first review
1 parent 7103427 commit c32e5f9

File tree

1 file changed

+149
-75
lines changed

1 file changed

+149
-75
lines changed

Project Explore BikeShare dataset/Explore BikeShare.py

Lines changed: 149 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,40 @@
11
#####
2-
##### Sondos Aabed Explores the Bokeshare Dataset
2+
##### Sondos Aabed Explores the Bikeshare Dataset
33
#####
44

55
### Importing the necessary libraries
66
import time
7+
import traceback # I used this to trace back the error catched
78
import pandas as pd
8-
import numpy as np
9+
# import numpy as np I didnit actually use np
910

1011
#### this is the csv files dictionary
1112
CITY_DATA = { 'chicago': 'chicago.csv',
1213
'new york city': 'new_york_city.csv',
1314
'washington': 'washington.csv' }
1415

16+
## This function takes the user's input and validates it, asking again until the entry is valid
17+
def entery_validation(input_message, valid_inputs, invalid_messgae):
    """
    Prompt the user repeatedly until one of the accepted values is entered.

    The answer is compared in lowercase and with surrounding whitespace
    removed, so an entry such as "  Chicago " is accepted when 'chicago'
    is among the valid inputs.

    Args:
        (str) input_message - the message displayed to ask the user for input
        (list) valid_inputs - the accepted entries; they must be lowercase to
                              match, because the answer is lowercased first
        (str) invalid_messgae - the message displayed when the entry is rejected
    Returns:
        (str) input_value - the normalized (stripped, lowercase) valid entry
    """
    while True:
        # Normalize before validating so stray spaces or capital letters
        # do not cause an otherwise valid answer to be rejected.
        input_value = input("\n" + input_message + "\n").strip().lower()
        if input_value in valid_inputs:
            return input_value
        print(invalid_messgae)
37+
1538
#### in this method get the filters inputted by the user
1639
def get_filters():
1740
"""
@@ -25,36 +48,29 @@ def get_filters():
2548
print('\nHello! Let\'s explore some US bikeshare data!')
2649
#####
2750
# In those cases an invalid input is handled by asking the user to try again until it's true input
51+
# because this is a redundunt code It was suggested to create anpther method that takes a message and returns the input if valid
2852
####
53+
54+
""" City input """
55+
city_input_message = "Which City would like to explore? All, Chicago, New york city, Or Washington?"
56+
city_invalid_message = "Try to enter another city that is either: Chicago, New york city, Or Washington "
57+
city_valid_enteries = ('all','new york city', 'chicago', 'washington')
2958
# get user input for city (chicago, new york city, washington).
30-
while True:
31-
city= input("\n Which City would like to explore? All, Chicago, New york city, Or Washington?\n")
32-
city=city.lower()
33-
if city not in ('all', 'new york city', 'chicago','washington'):
34-
print("Try to enter another city that is either: Chicago, New york city, Or Washington ")
35-
continue
36-
else:
37-
break
59+
city = entery_validation(city_input_message, city_valid_enteries,city_invalid_message)
3860

61+
""" Month input """
62+
month_input_message = "In which of the months you want to explore? is it (all, january, february, ... , june)"
63+
month_invalid_message = "Try to enter the month again, it wasn't a valid month!"
64+
month_valid_enteries = ('all','january','february','march','april','may','june','july','august','september','october','november','december')
3965
# get user input for month (all, january, february, ... , june)
40-
while True:
41-
month = input("\n In which of the months you want to explore? is it (all, january, february, ... , june)\n")
42-
month = month.lower()
43-
if month not in ('all','january','february','march','april','may','june','july','august','september','october','november','december'):
44-
print("Try to enter the month again, it wasn't a valid month!")
45-
continue
46-
else:
47-
break
66+
month = entery_validation(month_input_message, month_valid_enteries, month_invalid_message)
4867

68+
""" Day input """
69+
day_input_messgae = "What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?"
70+
day_inavlid_message = "You entered a not valid day, try again"
71+
day_valid_enteries = ('sunday','monday','all','tuesday','wednesday','thursday','friday','saturday')
4972
# get user input for day of week (all, monday, tuesday, ... sunday)
50-
while True:
51-
day = input("\n What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?\n")
52-
day = day.lower()
53-
if day not in ('sunday','monday','all','tuesday','wednesday','thursday','friday','saturday'):
54-
print("You entered a not valid day, try again")
55-
continue
56-
else:
57-
break
73+
day = entery_validation(day_input_messgae, day_valid_enteries, day_inavlid_message)
5874

5975
print('-'*40)
6076
return city, month, day
@@ -73,10 +89,10 @@ def load_data(city, month, day):
7389
"""
7490
# read the csv file using read_csv pandas based on the user input of cit
7591
# I have decided to add the option all because why not exploring all of them together giving a broader view
76-
if city not in ('all'):
92+
if city != 'all':
7793
df = pd.read_csv(CITY_DATA[city])
7894
else:
79-
# for all dataframes if the user choses all combine them
95+
# for all dataframes if the user choses all concate them
8096
dfs = []
8197
for city, path in CITY_DATA.items(all):
8298
dfC = pd.read_csv(path)
@@ -89,13 +105,29 @@ def load_data(city, month, day):
89105
## This function cleans the data
90106
## cleaning the data included handling missing data
91107
# also handle the high cardinality of dates
92-
def clean_data(df):
93-
df = handle_dates(df)
108+
def clean_data(df, city):
    """
    Prepare a raw trip dataframe for analysis.

    Args:
        (pandas dataframe) df - raw trip data, possibly with missing values
                                and date columns not yet converted
        (str) city - the selected city; forwarded to handle_dates because,
                     per the author's notes, the Washington data lacks
                     some columns
    Returns:
        (pandas dataframe) df - the frame produced by handle_dates and then
                                handle_missing
    """
    # Dates are processed first, then the missing values are filled in.
    dated = handle_dates(df, city)
    return handle_missing(dated)
96119

97120
# this method I created to handle the missing data
98121
def handle_missing(df):
122+
# when I have created the method display data I have notived that there
123+
# is a missing coulmn name so I searched for it stands for on kaggle
124+
# and it makes since that this is the bike ID, I think in this case
125+
# the bike ID is irrelvant so I made the decision to drop it
126+
# althought a possible query comes to mind what if there is a frequent bike ID for example
127+
# in this project scope it is decided to drop it then
128+
# print(df.columns) it is at index 0
129+
df.drop(df.columns[0], axis = 1, inplace=True)
130+
99131
# I chose to fill them with Unknown
100132
print('We have {} missing enteries'.format(df.isnull().sum().sum()) )
101133
# fill Nan values using fillna method
@@ -104,13 +136,13 @@ def handle_missing(df):
104136
return df
105137

106138
## This function handles the date columns
107-
def handle_dates(df):
139+
def handle_dates(df, city):
108140
"""
109141
Handle the dates as their datatypes using to_datetime pandas
110142
"""
143+
# convert to the proper data type
111144
df['Start Time'] = pd.to_datetime(df['Start Time'])
112145
df['End Time'] = pd.to_datetime(df['End Time'])
113-
df['Birth Year'] = pd.to_datetime(df['Birth Year'])
114146

115147
## this coulmn has high cardinality so I better create new coulmns that I can filter by
116148
# Like the day of the week and the month and the year and the time
@@ -124,18 +156,43 @@ def handle_dates(df):
124156
df['end_year'] = df['End Time'].dt.strftime('%Y')
125157
df['end_time'] = df['End Time'].dt.strftime('%X')
126158

127-
# we have also the coulmn of Birth year
128-
# df['Birth Year'] = pd.to_datetime(df['Birth Year'], format='%Y')
129-
# this is not working for users stats
130-
# I have decided to handle this one as integer to get the min and max values
131-
df['Birth Year'] = pd.to_numeric(df['Birth Year'],errors='coerce' , downcast='integer')
159+
if city in ('new york city', 'chicago'):
160+
df['Birth Year'] = pd.to_datetime(df['Birth Year'])
161+
# we have also the coulmn of Birth year
162+
# df['Birth Year'] = pd.to_datetime(df['Birth Year'], format='%Y')
163+
# this is not working for users stats
164+
# I have decided to handle this one as integer to get the min and max values
165+
df['Birth Year'] = pd.to_numeric(df['Birth Year'],errors='coerce' , downcast='integer')
132166

133167
# dropped them after I handeld them
134168
df.drop('Start Time', axis=1, inplace=True)
135169
df.drop('End Time', axis=1, inplace=True)
136170

137171
return df
138172

173+
# In this function I ask the user if they want to see 5 of the rows
174+
# The rows are sliced with the pandas iloc indexer, five at a time
175+
def display_data(df):
    """
    Interactively print the trip data five rows at a time.

    The user is first asked whether they want to see the raw rows. While
    the answer is 'yes' the next five rows are printed with df.iloc. Paging
    stops as soon as the answer is anything else, or when every row has
    already been shown (instead of printing empty frames forever).

    Args:
        (pandas dataframe) df - the data whose rows are displayed
    Returns:
        None
    """
    rows_per_page = 5
    total_rows = len(df)
    start_location = 0

    # Answers are stripped and lowercased so " Yes " also counts as yes.
    view_data = input('\nWould you like to view 5 rows of individual trip data? Enter yes or no\n').strip().lower()

    while view_data == 'yes':
        # Nothing left to show: tell the user and stop without re-prompting.
        if start_location >= total_rows:
            print('No more rows to display.')
            break

        print(df.iloc[start_location:start_location + rows_per_page])
        # Move the window forward to the next page of rows.
        start_location += rows_per_page

        # Only ask again when there is actually another page to show.
        if start_location < total_rows:
            view_data = input("Do you want to proceed showing the next 5 rows?\n").strip().lower()
195+
139196
# this method get the time travel frequent times
140197
# to get that I used the mode built-in method
141198
def time_stats(df):
@@ -198,7 +255,7 @@ def trip_duration_stats(df):
198255

199256
# In this method I get some statics about the users
200257
# Using
201-
def user_stats(df):
258+
def user_stats(df, city):
202259
"""Displays statistics on bikeshare users."""
203260

204261
print('\nCalculating User Stats...\n')
@@ -208,51 +265,68 @@ def user_stats(df):
208265
print('In this city, we have diffrent types of users as follows: ')
209266
print(df['User Type'].value_counts())
210267

211-
# counts users based on gender
212-
print('The total count of each gender is as follow: ')
213-
print('Females:', df['Gender'].value_counts().get("Female", 0))
214-
print('Males:', df['Gender'].value_counts().get("Male", 0))
215-
print('Unknown:', df['Gender'].value_counts().get("Unknown", 0))
268+
# this condition because the washington csv doens't include gender and year birth coulmns
269+
if city in ('new york city', 'chicago'):
270+
# counts users based on gender
271+
print('The total count of each gender is as follow: ')
272+
print('Females:', df['Gender'].value_counts().get("Female", 0))
273+
print('Males:', df['Gender'].value_counts().get("Male", 0))
274+
print('Unknown:', df['Gender'].value_counts().get("Unknown", 0))
216275

217-
# So because I don't want to include the unknown value of these I will use a filter on the dataset
218-
# earliest year of birth
219-
print('The earliest year of birth is: ', df['Birth Year'].min())
276+
# So because I don't want to include the unknown value of these I will use a filter on the dataset
277+
# earliest year of birth
278+
print('The earliest year of birth is: ', df['Birth Year'].min())
220279

221-
# Something doesn't add up here because it first displays to me the (unknown) so because I used it to fill the missing data
222-
# I am thinking to impute the missing birth year with the mode of it
223-
# but this will effect the time since I already imputed why impute twice
224-
# so what can I do ?
280+
# Something doesn't add up here because it first displays to me the (unknown) so because I used it to fill the missing data
281+
# I am thinking to impute the missing birth year with the mode of it
282+
# but this will effect the time since I already imputed why impute twice
283+
# so what can I do ?
225284

226-
# most recent of birth
227-
print('The most recent year of birth is: ', df['Birth Year'].max())
285+
# most recent of birth
286+
print('The most recent year of birth is: ', df['Birth Year'].max())
228287

229-
# most common year of birth
230-
print('The most common year of birth is: ', df['Birth Year'].mode()[0])
288+
# most common year of birth
289+
print('The most common year of birth is: ', df['Birth Year'].mode()[0])
231290

232291
print("\nThis took %s seconds." % (time.time() - start_time))
233292
print('-'*40)
234293

235294
def main():
    """
    Run the interactive bikeshare exploration session.

    Each round asks for the filters, loads and cleans the matching data,
    offers to print the raw rows and then shows the statistics. The rounds
    repeat while the user answers 'yes' to the restart question. Any
    exception ends the session after being reported together with its
    traceback.
    """
    try:
        keep_running = True
        while keep_running:
            # gets the filters
            city, month, day = get_filters()

            # load the dataset
            df = load_data(city, month, day)

            # clean the dataset; the city is passed along because, per the
            # author's notes, the Washington data has no Gender and
            # Birth Year columns
            df = clean_data(df, city)

            # ask the user if they want to print the data
            display_data(df)

            # display different statistics of the dataset
            time_stats(df)
            station_stats(df)
            trip_duration_stats(df)
            user_stats(df, city)

            # anything other than 'yes' ends the session
            answer = str(input('\nWould you like to restart? Enter yes or no.\n'))
            keep_running = answer.lower() == 'yes'
    except Exception as err:
        # Any exception that occurs is printed and traced
        print("The program encountered an error: ",
              type(err).__name__, " : ", err)
        traceback.print_exc()
256330

257331
############################
258332

@@ -261,4 +335,4 @@ def main():
261335

262336
############################
263337
if __name__ == "__main__":
264-
main()
338+
main()

0 commit comments

Comments
 (0)