1
1
#####
2
- ##### Sondos Aabed Explores the Bokeshare Dataset
2
+ ##### Sondos Aabed Explores the Bikeshare Dataset
3
3
#####
4
4
5
5
### Importing the necessary libraries
6
6
import time
7
+ import traceback # I used this to trace back the error catched
7
8
import pandas as pd
8
- import numpy as np
9
+ # import numpy as np  -- I didn't actually use np
9
10
10
11
# Maps each supported city name (lower-case, as compared against the user's
# input) to the CSV file that holds its trips.
CITY_DATA = {
    'chicago': 'chicago.csv',
    'new york city': 'new_york_city.csv',
    'washington': 'washington.csv',
}
14
15
16
# Shared prompt helper: keeps asking until the user types an accepted value.
def entery_validation(input_message, valid_inputs, invalid_messgae):
    """
    Prompt the user repeatedly until a valid entry is typed.

    The typed text is stripped of surrounding whitespace and lower-cased
    before it is compared, so the entries in ``valid_inputs`` must be
    lower-case for a match to be possible.

    Args:
        (str) input_message - the message displayed to ask the user for input
        (list or tuple) valid_inputs - the lower-case entries that are accepted
        (str) invalid_messgae - the message displayed when an entry is rejected
    Returns:
        (str) input_value - the accepted entry, stripped and lower-cased
    """
    while True:
        # input() already returns a str; strip so that " Chicago " is accepted.
        input_value = input("\n " + input_message + "\n ").strip().lower()
        if input_value in valid_inputs:
            return input_value
        print(invalid_messgae)
37
+
15
38
#### in this method get the filters inputted by the user
16
39
def get_filters ():
17
40
"""
@@ -25,36 +48,29 @@ def get_filters():
25
48
print ('\n Hello! Let\' s explore some US bikeshare data!' )
26
49
#####
27
50
# In those cases an invalid input is handled by asking the user to try again until it's true input
51
+ # because this code was redundant, it was suggested to create another method that takes a message and returns the input if it is valid
28
52
####
53
+
54
+ """ City input """
55
+ city_input_message = "Which City would like to explore? All, Chicago, New york city, Or Washington?"
56
+ city_invalid_message = "Try to enter another city that is either: Chicago, New york city, Or Washington "
57
+ city_valid_enteries = ('all' ,'new york city' , 'chicago' , 'washington' )
29
58
# get user input for city (chicago, new york city, washington).
30
- while True :
31
- city = input ("\n Which City would like to explore? All, Chicago, New york city, Or Washington?\n " )
32
- city = city .lower ()
33
- if city not in ('all' , 'new york city' , 'chicago' ,'washington' ):
34
- print ("Try to enter another city that is either: Chicago, New york city, Or Washington " )
35
- continue
36
- else :
37
- break
59
+ city = entery_validation (city_input_message , city_valid_enteries ,city_invalid_message )
38
60
61
+ """ Month input """
62
+ month_input_message = "In which of the months you want to explore? is it (all, january, february, ... , june)"
63
+ month_invalid_message = "Try to enter the month again, it wasn't a valid month!"
64
+ month_valid_enteries = ('all' ,'january' ,'february' ,'march' ,'april' ,'may' ,'june' ,'july' ,'august' ,'september' ,'october' ,'november' ,'december' )
39
65
# get user input for month (all, january, february, ... , june)
40
- while True :
41
- month = input ("\n In which of the months you want to explore? is it (all, january, february, ... , june)\n " )
42
- month = month .lower ()
43
- if month not in ('all' ,'january' ,'february' ,'march' ,'april' ,'may' ,'june' ,'july' ,'august' ,'september' ,'october' ,'november' ,'december' ):
44
- print ("Try to enter the month again, it wasn't a valid month!" )
45
- continue
46
- else :
47
- break
66
+ month = entery_validation (month_input_message , month_valid_enteries , month_invalid_message )
48
67
68
+ """ Day input """
69
+ day_input_messgae = "What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?"
70
+ day_inavlid_message = "You entered a not valid day, try again"
71
+ day_valid_enteries = ('sunday' ,'monday' ,'all' ,'tuesday' ,'wednesday' ,'thursday' ,'friday' ,'saturday' )
49
72
# get user input for day of week (all, monday, tuesday, ... sunday)
50
- while True :
51
- day = input ("\n What about the day you are looking for? is it (all, monday, tuesday, ... sunday)?\n " )
52
- day = day .lower ()
53
- if day not in ('sunday' ,'monday' ,'all' ,'tuesday' ,'wednesday' ,'thursday' ,'friday' ,'saturday' ):
54
- print ("You entered a not valid day, try again" )
55
- continue
56
- else :
57
- break
73
+ day = entery_validation (day_input_messgae , day_valid_enteries , day_inavlid_message )
58
74
59
75
print ('-' * 40 )
60
76
return city , month , day
@@ -73,10 +89,10 @@ def load_data(city, month, day):
73
89
"""
74
90
# read the csv file using read_csv pandas based on the user input of cit
75
91
# I have decided to add the option all because why not exploring all of them together giving a broader view
76
- if city not in ( 'all' ) :
92
+ if city != 'all' :
77
93
df = pd .read_csv (CITY_DATA [city ])
78
94
else :
79
- # for all dataframes if the user choses all combine them
95
+ # if the user chooses all, concatenate all the dataframes
80
96
dfs = []
81
97
for city , path in CITY_DATA .items (all ):
82
98
dfC = pd .read_csv (path )
@@ -89,13 +105,29 @@ def load_data(city, month, day):
89
105
# This method cleans the loaded data: it first splits the high-cardinality
# date columns into parts that can be filtered on, then handles missing data.
def clean_data(df, city):
    """
    Clean the loaded data by handling its dates and its missing entries.

    Args:
        (pandas dataframe) df - the loaded data, possibly with missing values
            and columns that do not yet have their proper datatypes
        (str) city - the city chosen by the user; it is forwarded to
            handle_dates because, per this file's comments, some columns
            do not exist in the washington data
    Returns:
        (pandas dataframe) df - the dataframe returned by handle_missing,
            after handle_dates has processed it
    """
    df = handle_dates(df, city)
    return handle_missing(df)
96
119
97
120
# this method I created to handle the missing data
98
121
def handle_missing (df ):
122
+ # when I created the display_data method I noticed that there
123
+ # is a column with a missing name, so I searched for what it stands for on Kaggle
124
+ # and it makes sense that this is the bike ID; I think in this case
125
+ # the bike ID is irrelevant, so I made the decision to drop it,
126
+ # although a possible query comes to mind: what if there is a frequent bike ID, for example?
127
+ # within this project's scope it was decided to drop it
128
+ # print(df.columns) shows it is at index 0
129
+ df .drop (df .columns [0 ], axis = 1 , inplace = True )
130
+
99
131
# I chose to fill them with Unknown
100
132
print ('We have {} missing enteries' .format (df .isnull ().sum ().sum ()) )
101
133
# fill Nan values using fillna method
@@ -104,13 +136,13 @@ def handle_missing(df):
104
136
return df
105
137
106
138
## this method I created to handle the dates
107
- def handle_dates (df ):
139
+ def handle_dates (df , city ):
108
140
"""
109
141
Handle the dates as their datatypes using to_datetime pandas
110
142
"""
143
+ # convert to the proper data type
111
144
df ['Start Time' ] = pd .to_datetime (df ['Start Time' ])
112
145
df ['End Time' ] = pd .to_datetime (df ['End Time' ])
113
- df ['Birth Year' ] = pd .to_datetime (df ['Birth Year' ])
114
146
115
147
## this coulmn has high cardinality so I better create new coulmns that I can filter by
116
148
# Like the day of the week and the month and the year and the time
@@ -124,18 +156,43 @@ def handle_dates(df):
124
156
df ['end_year' ] = df ['End Time' ].dt .strftime ('%Y' )
125
157
df ['end_time' ] = df ['End Time' ].dt .strftime ('%X' )
126
158
127
- # we have also the coulmn of Birth year
128
- # df['Birth Year'] = pd.to_datetime(df['Birth Year'], format='%Y')
129
- # this is not working for users stats
130
- # I have decided to handle this one as integer to get the min and max values
131
- df ['Birth Year' ] = pd .to_numeric (df ['Birth Year' ],errors = 'coerce' , downcast = 'integer' )
159
+ if city in ('new york city' , 'chicago' ):
160
+ df ['Birth Year' ] = pd .to_datetime (df ['Birth Year' ])
161
+ # we have also the coulmn of Birth year
162
+ # df['Birth Year'] = pd.to_datetime(df['Birth Year'], format='%Y')
163
+ # this is not working for users stats
164
+ # I have decided to handle this one as integer to get the min and max values
165
+ df ['Birth Year' ] = pd .to_numeric (df ['Birth Year' ],errors = 'coerce' , downcast = 'integer' )
132
166
133
167
# dropped them after I handeld them
134
168
df .drop ('Start Time' , axis = 1 , inplace = True )
135
169
df .drop ('End Time' , axis = 1 , inplace = True )
136
170
137
171
return df
138
172
173
+ # In this function I ask the user if they want to see 5 of the rows
174
+ # I use the head method build in by pandas to do that
175
+ def display_data (df ):
176
+ view_data = input ('\n Would you like to view 5 rows of individual trip data? Enter yes or no\n ' ).lower ()
177
+ start_locaction = 0
178
+
179
+ # I actually will famalrize myself with df.iloc, I like the suggestion, the idea that I went for here that also came to my mind is
180
+ # using the head function with its parameter
181
+
182
+ while view_data == 'yes' :
183
+ # while the usr wish to print print
184
+ # print(df.head(start_locaction))
185
+
186
+ # So I started this solution but It doesn't actually perform this functionality
187
+ # it prints from the first
188
+ # So I will go for the suggested way hhhhhh
189
+
190
+ #using iloc
191
+ print (df .iloc [start_locaction :start_locaction + 5 ])
192
+ # change the start location of the head print
193
+ start_locaction = start_locaction + 5
194
+ view_data = input ("Do you want to proceed showing the next 5 rows?\n " ).lower ()
195
+
139
196
# this method get the time travel frequent times
140
197
# to get that I used the mode built-in method
141
198
def time_stats (df ):
@@ -198,7 +255,7 @@ def trip_duration_stats(df):
198
255
199
256
# In this method I get some statistics about the users
200
257
# Using
201
- def user_stats (df ):
258
+ def user_stats (df , city ):
202
259
"""Displays statistics on bikeshare users."""
203
260
204
261
print ('\n Calculating User Stats...\n ' )
@@ -208,51 +265,68 @@ def user_stats(df):
208
265
print ('In this city, we have diffrent types of users as follows: ' )
209
266
print (df ['User Type' ].value_counts ())
210
267
211
- # counts users based on gender
212
- print ('The total count of each gender is as follow: ' )
213
- print ('Females:' , df ['Gender' ].value_counts ().get ("Female" , 0 ))
214
- print ('Males:' , df ['Gender' ].value_counts ().get ("Male" , 0 ))
215
- print ('Unknown:' , df ['Gender' ].value_counts ().get ("Unknown" , 0 ))
268
+ # this condition because the washington csv doens't include gender and year birth coulmns
269
+ if city in ('new york city' , 'chicago' ):
270
+ # counts users based on gender
271
+ print ('The total count of each gender is as follow: ' )
272
+ print ('Females:' , df ['Gender' ].value_counts ().get ("Female" , 0 ))
273
+ print ('Males:' , df ['Gender' ].value_counts ().get ("Male" , 0 ))
274
+ print ('Unknown:' , df ['Gender' ].value_counts ().get ("Unknown" , 0 ))
216
275
217
- # So because I don't want to include the unknown value of these I will use a filter on the dataset
218
- # earliest year of birth
219
- print ('The earliest year of birth is: ' , df ['Birth Year' ].min ())
276
+ # So because I don't want to include the unknown value of these I will use a filter on the dataset
277
+ # earliest year of birth
278
+ print ('The earliest year of birth is: ' , df ['Birth Year' ].min ())
220
279
221
- # Something doesn't add up here because it first displays to me the (unknown) so because I used it to fill the missing data
222
- # I am thinking to impute the missing birth year with the mode of it
223
- # but this will effect the time since I already imputed why impute twice
224
- # so what can I do ?
280
+ # Something doesn't add up here because it first displays to me the (unknown) so because I used it to fill the missing data
281
+ # I am thinking to impute the missing birth year with the mode of it
282
+ # but this will effect the time since I already imputed why impute twice
283
+ # so what can I do ?
225
284
226
- # most recent of birth
227
- print ('The most recent year of birth is: ' , df ['Birth Year' ].max ())
285
+ # most recent of birth
286
+ print ('The most recent year of birth is: ' , df ['Birth Year' ].max ())
228
287
229
- # most common year of birth
230
- print ('The most common year of birth is: ' , df ['Birth Year' ].mode ()[0 ])
288
+ # most common year of birth
289
+ print ('The most common year of birth is: ' , df ['Birth Year' ].mode ()[0 ])
231
290
232
291
print ("\n This took %s seconds." % (time .time () - start_time ))
233
292
print ('-' * 40 )
234
293
235
294
def main():
    """
    Run the interactive exploration loop.

    Each round asks for the filters, loads and cleans the data, offers to
    print the raw rows, and then prints the time, station, trip-duration and
    user statistics. The loop repeats while the user answers "yes" to the
    restart question. Because the try block wraps the whole loop, any
    exception is reported with its traceback and ends the program.
    """
    try:
        while True:
            # gets the filters
            city, month, day = get_filters()

            # load the dataset
            df = load_data(city, month, day)

            # clean the dataset; the city is passed because, per this file's
            # comments, the Gender and Birth Year columns don't exist for washington
            df = clean_data(df, city)

            # ask the user if they want to print the data
            display_data(df)

            # display different statistics of the dataset
            time_stats(df)
            station_stats(df)
            trip_duration_stats(df)
            user_stats(df, city)

            # any answer other than "yes" halts the program
            restart = input('\n Would you like to restart? Enter yes or no.\n ')
            if restart.strip().lower() != 'yes':
                break
    # top-level boundary: report any failure with its traceback
    except Exception as e:
        print("The program encountered an error: ",
              type(e).__name__, " : ", e)
        traceback.print_exc()
256
330
257
331
############################
258
332
@@ -261,4 +335,4 @@ def main():
261
335
262
336
############################
263
337
# Run the interactive program only when the file is executed as a script.
if __name__ == "__main__":
    main()
0 commit comments