|
4 | 4 | "cell_type": "markdown",
|
5 | 5 | "metadata": {
|
6 | 6 | "application/vnd.databricks.v1+cell": {
|
7 |
| - "cellMetadata": {}, |
| 7 | + "cellMetadata": { |
| 8 | + "byteLimit": 2048000, |
| 9 | + "rowLimit": 10000 |
| 10 | + }, |
8 | 11 | "inputWidgets": {},
|
9 | 12 | "nuid": "f6ac9266-658c-4507-8a37-ecc81334b06d",
|
10 | 13 | "showTitle": false,
|
|
21 | 24 | "execution_count": 0,
|
22 | 25 | "metadata": {
|
23 | 26 | "application/vnd.databricks.v1+cell": {
|
24 |
| - "cellMetadata": {}, |
| 27 | + "cellMetadata": { |
| 28 | + "byteLimit": 2048000, |
| 29 | + "rowLimit": 10000 |
| 30 | + }, |
25 | 31 | "inputWidgets": {},
|
26 | 32 | "nuid": "d6194993-9312-4b46-8365-d6a73d8cb0b0",
|
27 | 33 | "showTitle": false,
|
|
40 | 46 | "cell_type": "markdown",
|
41 | 47 | "metadata": {
|
42 | 48 | "application/vnd.databricks.v1+cell": {
|
43 |
| - "cellMetadata": {}, |
| 49 | + "cellMetadata": { |
| 50 | + "byteLimit": 2048000, |
| 51 | + "rowLimit": 10000 |
| 52 | + }, |
44 | 53 | "inputWidgets": {},
|
45 | 54 | "nuid": "23f6a680-8f01-4976-83fa-88103e49c6b8",
|
46 | 55 | "showTitle": false,
|
|
176 | 185 | "execution_count": 0,
|
177 | 186 | "metadata": {
|
178 | 187 | "application/vnd.databricks.v1+cell": {
|
179 |
| - "cellMetadata": {}, |
| 188 | + "cellMetadata": { |
| 189 | + "byteLimit": 2048000, |
| 190 | + "rowLimit": 10000 |
| 191 | + }, |
180 | 192 | "inputWidgets": {},
|
181 | 193 | "nuid": "a762f84d-f622-4403-b695-4066e5bdbaf0",
|
182 | 194 | "showTitle": false,
|
183 | 195 | "tableResultSettingsMap": {},
|
184 | 196 | "title": ""
|
185 | 197 | }
|
186 | 198 | },
|
187 |
| - "outputs": [], |
| 199 | + "outputs": [ |
| 200 | + { |
| 201 | + "output_type": "stream", |
| 202 | + "name": "stdout", |
| 203 | + "output_type": "stream", |
| 204 | + "text": [ |
| 205 | + "+-------+---------+-------------+-------------+\n|ride_id|driver_id|ride_distance|ride_duration|\n+-------+---------+-------------+-------------+\n| 10| 10| 63| 38|\n| 13| 10| 73| 96|\n| 7| 8| 100| 28|\n| 17| 7| 119| 68|\n| 20| 1| 121| 92|\n| 5| 7| 42| 101|\n| 2| 4| 6| 38|\n| 11| 8| 37| 43|\n| 15| 8| 108| 82|\n| 12| 8| 38| 34|\n| 14| 1| 90| 74|\n+-------+---------+-------------+-------------+\n\n+-------+-------+------------+\n|ride_id|user_id|requested_at|\n+-------+-------+------------+\n| 10| 63| 2020-03-04|\n| 13| 52| 2020-06-22|\n| 7| 69| 2020-07-16|\n| 17| 70| 2020-08-25|\n| 20| 81| 2020-11-02|\n| 5| 57| 2020-11-09|\n| 2| 42| 2020-12-09|\n| 11| 68| 2021-01-11|\n| 15| 32| 2021-01-17|\n| 12| 11| 2021-01-19|\n| 14| 18| 2021-01-27|\n+-------+-------+------------+\n\n" |
| 206 | + ] |
| 207 | + } |
| 208 | + ], |
188 | 209 | "source": [
|
189 | 210 | "accepted_rides_data_1651 = [\n",
|
190 | 211 | " (10, 10, 63, 38), (13, 10, 73, 96), (7, 8, 100, 28),\n",
|
|
208 | 229 | "rides_df_1651 = spark.createDataFrame(data_rides_1651, columns_rides_1651)\n",
|
209 | 230 | "rides_df_1651.show()"
|
210 | 231 | ]
|
| 232 | + }, |
| 233 | + { |
| 234 | + "cell_type": "code", |
| 235 | + "execution_count": 0, |
| 236 | + "metadata": { |
| 237 | + "application/vnd.databricks.v1+cell": { |
| 238 | + "cellMetadata": { |
| 239 | + "byteLimit": 2048000, |
| 240 | + "rowLimit": 10000 |
| 241 | + }, |
| 242 | + "inputWidgets": {}, |
| 243 | + "nuid": "ca7fc839-d4ba-4e80-8d94-08762bdb684b", |
| 244 | + "showTitle": false, |
| 245 | + "tableResultSettingsMap": {}, |
| 246 | + "title": "" |
| 247 | + } |
| 248 | + }, |
| 249 | + "outputs": [], |
| 250 | + "source": [ |
| 251 | + "rides_2020_df_1651 = rides_df_1651\\\n", |
| 252 | + " .withColumn( \"requested_at\",\n", |
| 253 | + " coalesce(\n", |
| 254 | + " to_date(\"requested_at\", \"yyyy-MM-dd\"),\n", |
| 255 | + " to_date(\"requested_at\", \"yyyy-M-d\")\n", |
| 256 | + " )\n", |
| 257 | + " )\\\n", |
| 258 | + " .filter(year(\"requested_at\") == 2020)\\\n", |
| 259 | + " .withColumn(\"month\", month(\"requested_at\"))\\\n", |
| 260 | + " .select(\"ride_id\", \"month\")" |
| 261 | + ] |
| 262 | + }, |
| 263 | + { |
| 264 | + "cell_type": "code", |
| 265 | + "execution_count": 0, |
| 266 | + "metadata": { |
| 267 | + "application/vnd.databricks.v1+cell": { |
| 268 | + "cellMetadata": { |
| 269 | + "byteLimit": 2048000, |
| 270 | + "rowLimit": 10000 |
| 271 | + }, |
| 272 | + "inputWidgets": {}, |
| 273 | + "nuid": "a214a11d-f742-49dd-8c82-ce7ef7e307d8", |
| 274 | + "showTitle": false, |
| 275 | + "tableResultSettingsMap": {}, |
| 276 | + "title": "" |
| 277 | + } |
| 278 | + }, |
| 279 | + "outputs": [], |
| 280 | + "source": [ |
| 281 | + "accepted_2020_df_1651 = accepted_rides_df_1651\\\n", |
| 282 | + " .join(rides_2020_df_1651, on=\"ride_id\", how=\"inner\")" |
| 283 | + ] |
| 284 | + }, |
| 285 | + { |
| 286 | + "cell_type": "code", |
| 287 | + "execution_count": 0, |
| 288 | + "metadata": { |
| 289 | + "application/vnd.databricks.v1+cell": { |
| 290 | + "cellMetadata": { |
| 291 | + "byteLimit": 2048000, |
| 292 | + "rowLimit": 10000 |
| 293 | + }, |
| 294 | + "inputWidgets": {}, |
| 295 | + "nuid": "263f63ba-91b7-4be2-b472-ede18b99f80f", |
| 296 | + "showTitle": false, |
| 297 | + "tableResultSettingsMap": {}, |
| 298 | + "title": "" |
| 299 | + } |
| 300 | + }, |
| 301 | + "outputs": [], |
| 302 | + "source": [ |
| 303 | + "monthly_totals_df_1651 = accepted_2020_df_1651\\\n", |
| 304 | + " .groupBy(\"month\")\\\n", |
| 305 | + " .agg(\n", |
| 306 | + " sum(\"ride_distance\").alias(\"total_distance\"),\n", |
| 307 | + " sum(\"ride_duration\").alias(\"total_duration\")\n", |
| 308 | + " )" |
| 309 | + ] |
| 310 | + }, |
| 311 | + { |
| 312 | + "cell_type": "code", |
| 313 | + "execution_count": 0, |
| 314 | + "metadata": { |
| 315 | + "application/vnd.databricks.v1+cell": { |
| 316 | + "cellMetadata": { |
| 317 | + "byteLimit": 2048000, |
| 318 | + "rowLimit": 10000 |
| 319 | + }, |
| 320 | + "inputWidgets": {}, |
| 321 | + "nuid": "af94514f-f262-42ca-a280-be2a8ad3fd02", |
| 322 | + "showTitle": false, |
| 323 | + "tableResultSettingsMap": {}, |
| 324 | + "title": "" |
| 325 | + } |
| 326 | + }, |
| 327 | + "outputs": [], |
| 328 | + "source": [ |
| 329 | + "months_df_1651 = spark.createDataFrame([(m,) for m in range(1, 13)], [\"month\"])\n", |
| 330 | + "\n", |
| 331 | + "monthly_full_df_1651 = months_df_1651\\\n", |
| 332 | + " .join(monthly_totals_df_1651, on=\"month\", how=\"left\")\\\n", |
| 333 | + " .na.fill({\"total_distance\": 0, \"total_duration\": 0}).orderBy(\"month\")\n" |
| 334 | + ] |
| 335 | + }, |
| 336 | + { |
| 337 | + "cell_type": "code", |
| 338 | + "execution_count": 0, |
| 339 | + "metadata": { |
| 340 | + "application/vnd.databricks.v1+cell": { |
| 341 | + "cellMetadata": { |
| 342 | + "byteLimit": 2048000, |
| 343 | + "rowLimit": 10000 |
| 344 | + }, |
| 345 | + "inputWidgets": {}, |
| 346 | + "nuid": "cb979d51-7086-4db3-96bf-6ff5a624a61b", |
| 347 | + "showTitle": false, |
| 348 | + "tableResultSettingsMap": {}, |
| 349 | + "title": "" |
| 350 | + } |
| 351 | + }, |
| 352 | + "outputs": [], |
| 353 | + "source": [ |
| 354 | + "windowSpec = Window.orderBy(\"month\").rowsBetween(0, 2)" |
| 355 | + ] |
| 356 | + }, |
| 357 | + { |
| 358 | + "cell_type": "code", |
| 359 | + "execution_count": 0, |
| 360 | + "metadata": { |
| 361 | + "application/vnd.databricks.v1+cell": { |
| 362 | + "cellMetadata": { |
| 363 | + "byteLimit": 2048000, |
| 364 | + "rowLimit": 10000 |
| 365 | + }, |
| 366 | + "inputWidgets": {}, |
| 367 | + "nuid": "66ec2974-5f8b-4fa3-9260-53e68c288bdc", |
| 368 | + "showTitle": false, |
| 369 | + "tableResultSettingsMap": {}, |
| 370 | + "title": "" |
| 371 | + } |
| 372 | + }, |
| 373 | + "outputs": [ |
| 374 | + { |
| 375 | + "output_type": "stream", |
| 376 | + "name": "stderr", |
| 377 | + "output_type": "stream", |
| 378 | + "text": [ |
| 379 | + "/databricks/python/lib/python3.11/site-packages/pyspark/sql/connect/expressions.py:1017: UserWarning: WARN WindowExpression: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.\n warnings.warn(\n" |
| 380 | + ] |
| 381 | + }, |
| 382 | + { |
| 383 | + "output_type": "stream", |
| 384 | + "name": "stdout", |
| 385 | + "output_type": "stream", |
| 386 | + "text": [ |
| 387 | + "+-----+---------------------+---------------------+\n|month|average_ride_distance|average_ride_duration|\n+-----+---------------------+---------------------+\n| 1| 21.0| 12.67|\n| 2| 21.0| 12.67|\n| 3| 21.0| 12.67|\n| 4| 24.33| 32.0|\n| 5| 57.67| 41.33|\n| 6| 97.33| 64.0|\n| 7| 73.0| 32.0|\n| 8| 39.67| 22.67|\n| 9| 54.33| 64.33|\n| 10| 56.33| 77.0|\n+-----+---------------------+---------------------+\n\n" |
| 388 | + ] |
| 389 | + } |
| 390 | + ], |
| 391 | + "source": [ |
| 392 | + "monthly_full_df_1651\\\n", |
| 393 | + " .withColumn(\"sum_dist_3m\", sum(\"total_distance\").over(windowSpec))\\\n", |
| 394 | + " .withColumn(\"sum_dur_3m\", sum(\"total_duration\").over(windowSpec))\\\n", |
| 395 | + " .filter(F.col(\"month\") <= 10)\\\n", |
| 396 | + " .select(\n", |
| 397 | + " col(\"month\"),\n", |
| 398 | + " round(col(\"sum_dist_3m\") / lit(3), 2).alias(\"average_ride_distance\"),\n", |
| 399 | + " round(col(\"sum_dur_3m\") / lit(3), 2).alias(\"average_ride_duration\")\n", |
| 400 | + " )\\\n", |
| 401 | + " .orderBy(\"month\").show()" |
| 402 | + ] |
211 | 403 | }
|
212 | 404 | ],
|
213 | 405 | "metadata": {
|
214 | 406 | "application/vnd.databricks.v1+notebook": {
|
215 |
| - "computePreferences": null, |
| 407 | + "computePreferences": { |
| 408 | + "hardware": { |
| 409 | + "accelerator": null, |
| 410 | + "gpuPoolId": null, |
| 411 | + "memory": null |
| 412 | + } |
| 413 | + }, |
216 | 414 | "dashboards": [],
|
217 | 415 | "environmentMetadata": {
|
218 | 416 | "base_environment": "",
|
219 |
| - "environment_version": "1" |
| 417 | + "environment_version": "2" |
220 | 418 | },
|
221 | 419 | "inputWidgetPreferences": null,
|
222 | 420 | "language": "python",
|
|
0 commit comments