Skip to content
This repository was archived by the owner on Feb 3, 2021. It is now read-only.

Commit 18b74e4

Browse files
authored
Feature: Spark retry docker pull (#672)
* retry docker pulls
* change order of pool, job, storage creation to reduce conflicts
* add error message on docker-compose curl failure
1 parent 9e32b4b commit 18b74e4

File tree

3 files changed

+19
-10
lines changed

3 files changed

+19
-10
lines changed

aztk/client/cluster/helpers/create.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,12 +22,9 @@ def create_pool_and_job_and_table(
2222
:param VmImageModel: the type of image to provision for the cluster
2323
:param wait: wait until the cluster is ready
2424
"""
25-
# update storage with the necessary values
25+
# save cluster configuration in storage
2626
core_cluster_operations.get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)
2727

28-
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
29-
core_cluster_operations.create_task_table(cluster_conf.cluster_id)
30-
3128
# reuse pool_id as job_id
3229
pool_id = cluster_conf.cluster_id
3330
job_id = cluster_conf.cluster_id
@@ -71,4 +68,8 @@ def create_pool_and_job_and_table(
7168
# Add job to batch
7269
core_cluster_operations.batch_client.job.add(job)
7370

71+
# create storage task table
72+
if cluster_conf.scheduling_target != models.SchedulingTarget.Any:
73+
core_cluster_operations.create_task_table(cluster_conf.cluster_id)
74+
7475
return helpers.get_cluster(cluster_conf.cluster_id, core_cluster_operations.batch_client)

aztk/client/job/helpers/submit.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,6 @@ def submit_job(
2929
core_job_operations.get_cluster_data(job_configuration.id).save_cluster_config(
3030
job_configuration.to_cluster_config())
3131

32-
if job_configuration.scheduling_target != models.SchedulingTarget.Any:
33-
core_job_operations.create_task_table(job_configuration.id)
34-
3532
# get a verified node agent sku
3633
sku_to_use, image_ref_to_use = helpers.select_latest_verified_vm_image_with_node_agent_sku(
3734
vm_image_model.publisher, vm_image_model.offer, vm_image_model.sku, core_job_operations.batch_client)
@@ -84,4 +81,7 @@ def submit_job(
8481

8582
core_job_operations.batch_client.job_schedule.add(setup)
8683

84+
if job_configuration.scheduling_target != models.SchedulingTarget.Any:
85+
core_job_operations.create_task_table(job_configuration.id)
86+
8787
return core_job_operations.batch_client.job_schedule.get(job_schedule_id=job_configuration.id)

aztk/node_scripts/setup_host.sh

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,11 @@ install_prerequisites () {
4242

4343
install_docker_compose () {
4444
echo "Installing Docker-Compose"
45-
for i in {1..5}; do
46-
sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose && break || sleep 2;
45+
url=https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m`
46+
for i in {1..5}; do
47+
sudo curl -L $url -o /usr/local/bin/docker-compose && break ||
48+
echo "ERROR: failed to download docker-compose ... retrying in $($i**2) seconds" &&
49+
sleep $i**2;
4750
done
4851
sudo chmod +x /usr/local/bin/docker-compose
4952
echo "Finished installing Docker-Compose"
@@ -59,7 +62,12 @@ pull_docker_container () {
5962
docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD
6063
fi
6164

62-
docker pull $docker_repo_name
65+
66+
for i in {1..5}; do
67+
docker pull $docker_repo_name && break ||
68+
echo "ERROR: docker pull $docker_repo_name failed ... retrying after $($i**2) seconds" &&
69+
sleep $i**2;
70+
done
6371
echo "Finished pulling $docker_repo_name"
6472
}
6573

0 commit comments

Comments
 (0)