diff --git a/.vscode/settings.json b/.vscode/settings.json index 43ff63b3..790874c0 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -14,5 +14,7 @@ "python.formatting.provider": "yapf", "python.formatting.yapfArgs": [ "--style=.style.yapf" - ] + ], + "python.venvPath": "${workspaceFolder}/ENV", + "python.pythonPath": "${workspaceFolder}\\ENV\\Scripts\\python.exe" } diff --git a/aztk/error.py b/aztk/error.py index 7fc2cfdc..6cb6a6c8 100644 --- a/aztk/error.py +++ b/aztk/error.py @@ -6,7 +6,7 @@ class AztkError(Exception): - def __init__(self, message: str = None): + def __init__(self, message: str=None): super().__init__(message) class ClusterNotReadyError(AztkError): diff --git a/aztk/internal/__init__.py b/aztk/internal/__init__.py index 74745aec..ff79d5ac 100644 --- a/aztk/internal/__init__.py +++ b/aztk/internal/__init__.py @@ -3,3 +3,4 @@ """ from .configuration_base import * +from .docker_cmd import * diff --git a/aztk/internal/cluster_data/node_data.py b/aztk/internal/cluster_data/node_data.py index 6975d07b..47641ab3 100644 --- a/aztk/internal/cluster_data/node_data.py +++ b/aztk/internal/cluster_data/node_data.py @@ -1,11 +1,10 @@ import fnmatch import io -import json import os -import yaml import zipfile from pathlib import Path from typing import List +import yaml from aztk.spark import models from aztk.utils import constants, file_utils, secure_utils from aztk.error import InvalidCustomScriptError @@ -147,7 +146,8 @@ def _add_plugins(self): execute='{0}/{1}'.format(plugin.name, plugin.execute), args=plugin.args, env=plugin.env, - runOn=plugin.run_on.value, + target=plugin.target.value, + target_role=plugin.target_role.value, )) self.zipf.writestr(os.path.join('plugins', 'plugins-manifest.yaml'), yaml.dump(data)) diff --git a/aztk/internal/docker_cmd.py b/aztk/internal/docker_cmd.py new file mode 100644 index 00000000..7dc75e1e --- /dev/null +++ b/aztk/internal/docker_cmd.py @@ -0,0 +1,38 @@ +import os +from aztk.utils.command_builder 
import CommandBuilder + +class DockerCmd: + """ + Class helping to write a docker command + """ + + def __init__(self, name: str, docker_repo: str, cmd: str, gpu_enabled=False): + if gpu_enabled: + self.cmd = CommandBuilder('nvidia-docker run') + else: + self.cmd = CommandBuilder('docker run') + self.cmd.add_option('--net', 'host') + self.cmd.add_option('--name', name) + self.cmd.add_argument('-d') + self.cmd.add_argument(docker_repo) + self.cmd.add_argument(cmd) + + + def add_env(self, env: str, value: str): + self.cmd.add_option('-e', '{0}={1}'.format(env, value)) + + def pass_env(self, env: str): + """ + Give the value of an environment variable in the main process to the docker image + """ + self.cmd.add_option('-e', '{0}'.format(env)) + + def share_folder(self, folder: str): + self.cmd.add_option('-v', '{0}:{0}'.format(folder)) + + def open_port(self, port: int): + self.cmd.add_option('-p', '{0}:{0}'.format(port)) # Spark Master UI + + + def to_str(self): + return self.cmd.to_str() diff --git a/aztk/models/plugins/internal/plugin_manager.py b/aztk/models/plugins/internal/plugin_manager.py index acd60c7c..66180e12 100644 --- a/aztk/models/plugins/internal/plugin_manager.py +++ b/aztk/models/plugins/internal/plugin_manager.py @@ -20,6 +20,7 @@ class PluginManager: jupyterlab=plugins.JupyterLabPlugin, rstudio_server=plugins.RStudioServerPlugin, hdfs=plugins.HDFSPlugin, + simple=plugins.SimplePlugin, spark_ui_proxy=plugins.SparkUIProxyPlugin, ) diff --git a/aztk/models/plugins/plugin_configuration.py b/aztk/models/plugins/plugin_configuration.py index c32dc3a8..b5f35049 100644 --- a/aztk/models/plugins/plugin_configuration.py +++ b/aztk/models/plugins/plugin_configuration.py @@ -1,8 +1,23 @@ -import inspect -from typing import List, Union from enum import Enum -from .plugin_file import PluginFile +from typing import List, Union from aztk.internal import ConfigurationBase +from aztk.error import InvalidPluginConfigurationError +from .plugin_file import PluginFile + 
+class PluginTarget(Enum): + """ + Where this plugin should run + """ + SparkContainer = "spark-container", + Host = "host", + + +class PluginTargetRole(Enum): + Master = "master" + Worker = "worker" + All = "all-nodes" + + class PluginPort: """ @@ -12,8 +27,7 @@ class PluginPort: :param name: [Optional] name to differentiate ports if you have multiple """ - def __init__(self, internal: int, public: Union[int, bool] = False, name=None): - + def __init__(self, internal: int, public: Union[int, bool]=False, name=None): self.internal = internal self.expose_publicly = bool(public) self.public_port = None @@ -26,11 +40,6 @@ def __init__(self, internal: int, public: Union[int, bool] = False, name=None): self.name = name -class PluginRunTarget(Enum): - Master = "master" - Worker = "worker" - All = "all-nodes" - class PluginConfiguration(ConfigurationBase): @@ -45,15 +54,17 @@ class PluginConfiguration(ConfigurationBase): def __init__(self, name: str, - ports: List[PluginPort] = None, - files: List[PluginFile] = None, - execute: str = None, + ports: List[PluginPort]=None, + files: List[PluginFile]=None, + execute: str=None, args=None, env=None, - run_on: PluginRunTarget = PluginRunTarget.Master): + target_role: PluginTargetRole=PluginTargetRole.Master, + target: PluginTarget=PluginTarget.SparkContainer): self.name = name # self.docker_image = docker_image - self.run_on = run_on + self.target = target + self.target_role = target_role self.ports = ports or [] self.files = files or [] self.args = args or [] @@ -64,11 +75,18 @@ def has_arg(self, name: str): for x in self.args: if x.name == name: return True - else: - return False + return False def validate(self): self._validate_required([ "name", "execute", ]) + + if not isinstance(self.target, PluginTarget): + raise InvalidPluginConfigurationError( + "Target must be of type Plugin target but was {0}".format(self.target)) + + if not isinstance(self.target_role, PluginTargetRole): + raise InvalidPluginConfigurationError( + 
"Target role must be of type Plugin target role but was {0}".format(self.target)) diff --git a/aztk/node_scripts/core/config.py b/aztk/node_scripts/core/config.py index ee8eb9bc..4252e377 100644 --- a/aztk/node_scripts/core/config.py +++ b/aztk/node_scripts/core/config.py @@ -16,14 +16,14 @@ '/providers/[^/]+' '/[^/]+Accounts/(?P[^/]+)$') -batch_account_name = os.environ["AZ_BATCH_ACCOUNT_NAME"] -batch_account_key = os.environ["BATCH_ACCOUNT_KEY"] -batch_service_url = os.environ["BATCH_SERVICE_URL"] -tenant_id = os.environ["SP_TENANT_ID"] -client_id = os.environ["SP_CLIENT_ID"] -credential = os.environ["SP_CREDENTIAL"] -batch_resource_id = os.environ["SP_BATCH_RESOURCE_ID"] -storage_resource_id = os.environ["SP_STORAGE_RESOURCE_ID"] +batch_account_name = os.environ.get("AZ_BATCH_ACCOUNT_NAME") +batch_account_key = os.environ.get("BATCH_ACCOUNT_KEY") +batch_service_url = os.environ.get("BATCH_SERVICE_URL") +tenant_id = os.environ.get("SP_TENANT_ID") +client_id = os.environ.get("SP_CLIENT_ID") +credential = os.environ.get("SP_CREDENTIAL") +batch_resource_id = os.environ.get("SP_BATCH_RESOURCE_ID") +storage_resource_id = os.environ.get("SP_STORAGE_RESOURCE_ID") pool_id = os.environ["AZ_BATCH_POOL_ID"] node_id = os.environ["AZ_BATCH_NODE_ID"] @@ -33,9 +33,9 @@ spark_worker_ui_port = os.environ["SPARK_WORKER_UI_PORT"] spark_job_ui_port = os.environ["SPARK_JOB_UI_PORT"] -storage_account_name = os.environ["STORAGE_ACCOUNT_NAME"] -storage_account_key = os.environ["STORAGE_ACCOUNT_KEY"] -storage_account_suffix = os.environ["STORAGE_ACCOUNT_SUFFIX"] +storage_account_name = os.environ.get("STORAGE_ACCOUNT_NAME") +storage_account_key = os.environ.get("STORAGE_ACCOUNT_KEY") +storage_account_suffix = os.environ.get("STORAGE_ACCOUNT_SUFFIX") def get_blob_client() -> blob.BlockBlobService: if not storage_resource_id: diff --git a/aztk/node_scripts/docker_main.sh b/aztk/node_scripts/docker_main.sh index 9af7528e..960e9913 100644 --- a/aztk/node_scripts/docker_main.sh +++ 
b/aztk/node_scripts/docker_main.sh @@ -3,12 +3,13 @@ # This file is the entry point of the docker container. set -e +echo "Initializing spark container" # -------------------- # Setup custom scripts # -------------------- -custom_script_dir=$DOCKER_WORKING_DIR/custom-scripts -aztk_dir=$DOCKER_WORKING_DIR/aztk +custom_script_dir=$AZTK_WORKING_DIR/custom-scripts +aztk_dir=$AZTK_WORKING_DIR/aztk # ----------------------- # Preload jupyter samples @@ -28,11 +29,11 @@ done echo "Starting setup using Docker" $(pyenv root)/versions/$AZTK_PYTHON_VERSION/bin/pip install -r $(dirname $0)/requirements.txt -export PYTHONPATH=$PYTHONPATH:$DOCKER_WORKING_DIR -echo 'export PYTHONPATH=$PYTHONPATH:$DOCKER_WORKING_DIR' >> ~/.bashrc +export PYTHONPATH=$PYTHONPATH:$AZTK_WORKING_DIR +echo 'export PYTHONPATH=$PYTHONPATH:$AZTK_WORKING_DIR' >> ~/.bashrc echo "Running main.py script" -$(pyenv root)/versions/$AZTK_PYTHON_VERSION/bin/python $(dirname $0)/main.py install +$(pyenv root)/versions/$AZTK_PYTHON_VERSION/bin/python $(dirname $0)/main.py setup-spark-container # sleep to keep container running while true; do sleep 1; done diff --git a/aztk/node_scripts/install/create_user.py b/aztk/node_scripts/install/create_user.py index 5c595d45..b87f641d 100644 --- a/aztk/node_scripts/install/create_user.py +++ b/aztk/node_scripts/install/create_user.py @@ -6,11 +6,11 @@ from datetime import datetime, timezone, timedelta import yaml ''' - Creates a user if the user configuration file at $DOCKER_WORKING_DIR/user.yaml exists + Creates a user if the user configuration file at $AZTK_WORKING_DIR/user.yaml exists ''' def create_user(batch_client): - path = os.path.join(os.environ['DOCKER_WORKING_DIR'], "user.yaml") + path = os.path.join(os.environ['AZTK_WORKING_DIR'], "user.yaml") if not os.path.isfile(path): print("No user to create.") @@ -43,7 +43,7 @@ def decrypt_password(user_conf): tag = user_conf['tag'] # Read private key - with open(os.path.join(os.environ['DOCKER_WORKING_DIR'], 'id_rsa'), 
encoding='UTF-8') as f: + with open(os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa'), encoding='UTF-8') as f: private_key = RSA.import_key(f.read()) # Decrypt the session key with the public RSA key cipher_rsa = PKCS1_OAEP.new(private_key) diff --git a/aztk/node_scripts/install/install.py b/aztk/node_scripts/install/install.py index 4337778d..c18cb94b 100644 --- a/aztk/node_scripts/install/install.py +++ b/aztk/node_scripts/install/install.py @@ -1,47 +1,74 @@ import os from core import config -from install import pick_master, spark, scripts, create_user, plugins +from install import pick_master, spark, scripts, create_user, plugins, spark_container import wait_until_master_selected +from aztk.models.plugins import PluginTarget +from aztk.internal import cluster_data +def read_cluster_config(): + data = cluster_data.ClusterData(config.blob_client, config.pool_id) + cluster_config = data.read_cluster_config() + print("Got cluster config", cluster_config) + return cluster_config -def setup_node(): +def setup_host(docker_repo: str): + """ + Code to be run on the node(NOT in a container) + """ client = config.batch_client create_user.create_user(batch_client=client) - - spark.setup_conf() - - if os.environ['AZ_BATCH_NODE_IS_DEDICATED'] == "true" or os.environ['MIXED_MODE'] == "False": + if os.environ['AZ_BATCH_NODE_IS_DEDICATED'] == "true" or os.environ['AZTK_MIXED_MODE'] == "False": is_master = pick_master.find_master(client) else: is_master = False wait_until_master_selected.main() + is_worker = not is_master or os.environ["AZTK_WORKER_ON_MASTER"] master_node_id = pick_master.get_master_node_id(config.batch_client.pool.get(config.pool_id)) master_node = config.batch_client.compute_node.get(config.pool_id, master_node_id) - os.environ["MASTER_IP"] = master_node.ip_address - if is_master: - setup_as_master() - plugins.setup_plugins(is_master=True, is_worker=True) - scripts.run_custom_scripts(is_master=True, is_worker=True) - else: - setup_as_worker() - 
plugins.setup_plugins(is_master=False, is_worker=True) - scripts.run_custom_scripts(is_master=False, is_worker=True) + os.environ["AZTK_IS_MASTER"] = "1" + if is_worker: + os.environ["AZTK_IS_WORKER"] = "1" - open("/tmp/setup_complete", 'a').close() + os.environ["AZTK_MASTER_IP"] = master_node.ip_address + + cluster_conf = read_cluster_config() + + spark_container.start_spark_container( + docker_repo=docker_repo, + gpu_enabled=os.environ.get("AZTK_GPU_ENABLED") == "true", + plugins=cluster_conf.plugins, + ) + plugins.setup_plugins(target=PluginTarget.Host, is_master=is_master, is_worker=is_worker) + + +def setup_spark_container(): + """ + Code run in the main spark container + """ + is_master = os.environ["AZTK_IS_MASTER"] + is_worker = os.environ["AZTK_IS_WORKER"] + print("Setting spark container. Master: ", is_master, ", Worker: ", is_worker) + print("Copying spark setup config") + spark.setup_conf() + print("Done copying spark setup config") + + master_node_id = pick_master.get_master_node_id(config.batch_client.pool.get(config.pool_id)) + master_node = config.batch_client.compute_node.get(config.pool_id, master_node_id) -def setup_as_master(): - print("Setting up as master.") spark.setup_connection() - spark.start_spark_master() - if os.environ["WORKER_ON_MASTER"] == "True": + + if is_master: + spark.start_spark_master() + + if is_worker: spark.start_spark_worker() -def setup_as_worker(): - print("Setting up as worker.") - spark.setup_connection() - spark.start_spark_worker() + plugins.setup_plugins(target=PluginTarget.SparkContainer, is_master=is_master, is_worker=is_worker) + scripts.run_custom_scripts(is_master=is_master, is_worker=is_worker) + + open("/tmp/setup_complete", 'a').close() diff --git a/aztk/node_scripts/install/plugins.py b/aztk/node_scripts/install/plugins.py index 02dd057b..4a11fa28 100644 --- a/aztk/node_scripts/install/plugins.py +++ b/aztk/node_scripts/install/plugins.py @@ -3,25 +3,24 @@ import yaml import subprocess from pathlib import 
Path +from aztk.models.plugins import PluginTarget, PluginTargetRole -log_folder = os.path.join(os.environ['DOCKER_WORKING_DIR'], 'logs','plugins') +log_folder = os.path.join(os.environ['AZTK_WORKING_DIR'], 'logs','plugins') def _read_manifest_file(path=None): - custom_scripts = None if not os.path.isfile(path): print("Plugins manifest file doesn't exist at {0}".format(path)) else: with open(path, 'r', encoding='UTF-8') as stream: try: - custom_scripts = yaml.load(stream) + return yaml.load(stream) except json.JSONDecodeError as err: print("Error in plugins manifest: {0}".format(err)) - return custom_scripts -def setup_plugins(is_master: bool = False, is_worker: bool = False): +def setup_plugins(target: PluginTarget, is_master: bool = False, is_worker: bool = False): plugins_dir = _plugins_dir() plugins_manifest = _read_manifest_file( @@ -31,39 +30,33 @@ def setup_plugins(is_master: bool = False, is_worker: bool = False): os.makedirs(log_folder) if plugins_manifest is not None: - _setup_plugins(plugins_manifest, is_master, is_worker) + _setup_plugins(plugins_manifest, target, is_master, is_worker) def _plugins_dir(): - return os.path.join(os.environ['DOCKER_WORKING_DIR'], 'plugins') + return os.path.join(os.environ['AZTK_WORKING_DIR'], 'plugins') -def _run_on_this_node(plugin_obj=None, is_master=False, is_worker=False): - if plugin_obj['runOn'] == 'master' and is_master is True: +def _run_on_this_node(plugin_obj, target: PluginTarget, is_master, is_worker): + if plugin_obj['target'] != target.value: + print("Ignoring ", plugin_obj["execute"], " as target is for ", plugin_obj['target'], " but is currently running in ", target.value) + return False + + if plugin_obj['target_role'] == PluginTargetRole.Master.value and is_master is True: return True - if plugin_obj['runOn'] == 'worker' and is_worker is True: + if plugin_obj['target_role'] == PluginTargetRole.Worker.value and is_worker is True: return True - if plugin_obj['runOn'] == 'all-nodes': + if 
plugin_obj['target_role'] == PluginTargetRole.All.value: return True return False -def _setup_plugins(plugins_manifest, is_master=False, is_worker=False): +def _setup_plugins(plugins_manifest, target: PluginTarget, is_master, is_worker): plugins_dir = _plugins_dir() - if is_master: - os.environ["IS_MASTER"] = "1" - else: - os.environ["IS_MASTER"] = "0" - - if is_worker: - os.environ["IS_WORKER"] = "1" - else: - os.environ["IS_WORKER"] = "0" - for plugin in plugins_manifest: - if _run_on_this_node(plugin, is_master, is_worker): + if _run_on_this_node(plugin, target, is_master, is_worker): path = os.path.join(plugins_dir, plugin['execute']) _run_script(plugin.get("name"), path, plugin.get('args'), plugin.get('env')) diff --git a/aztk/node_scripts/install/scripts.py b/aztk/node_scripts/install/scripts.py index c3534824..5c1426e0 100644 --- a/aztk/node_scripts/install/scripts.py +++ b/aztk/node_scripts/install/scripts.py @@ -14,7 +14,7 @@ def _read_yaml_file(path=None): custom_scripts = yaml.load(stream) except yaml.YAMLError as err: print("Error in cluster.yaml: {0}".format(err)) - + return custom_scripts @@ -25,7 +25,7 @@ def _run_on_this_node(script_obj=None, is_master=False, is_worker=False): return True if script_obj['runOn'] == 'all-nodes': return True - + return False @@ -64,17 +64,7 @@ def _run_scripts_dir(root: str = None): def run_custom_scripts(is_master: bool = False, is_worker: bool = False): - if is_master: - os.environ["IS_MASTER"] = "1" - else: - os.environ["IS_MASTER"] = "0" - - if is_worker: - os.environ["IS_WORKER"] = "1" - else: - os.environ["IS_WORKER"] = "0" - - custom_scripts_dir = os.path.join(os.environ['DOCKER_WORKING_DIR'], 'custom-scripts') + custom_scripts_dir = os.path.join(os.environ['AZTK_WORKING_DIR'], 'custom-scripts') custom_scripts = _read_yaml_file(os.path.join(custom_scripts_dir, 'custom-scripts.yaml')) diff --git a/aztk/node_scripts/install/spark.py b/aztk/node_scripts/install/spark.py index feb375f4..1a354228 100644 --- 
a/aztk/node_scripts/install/spark.py +++ b/aztk/node_scripts/install/spark.py @@ -17,6 +17,18 @@ spark_home = "/home/spark-current" spark_conf_folder = os.path.join(spark_home, "conf") + +def setup_as_master(): + print("Setting up as master.") + setup_connection() + start_spark_master() + + +def setup_as_worker(): + print("Setting up as worker.") + setup_connection() + start_spark_worker() + def get_pool() -> batchmodels.CloudPool: return batch_client.pool.get(config.pool_id) @@ -120,8 +132,8 @@ def setup_conf(): def setup_ssh_keys(): - pub_key_path_src = os.path.join(os.environ['DOCKER_WORKING_DIR'], 'id_rsa.pub') - priv_key_path_src = os.path.join(os.environ['DOCKER_WORKING_DIR'], 'id_rsa') + pub_key_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa.pub') + priv_key_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'id_rsa') ssh_key_dest = '/root/.ssh' if not os.path.exists(ssh_key_dest): @@ -132,26 +144,26 @@ def setup_ssh_keys(): def copy_spark_env(): - spark_env_path_src = os.path.join(os.environ['DOCKER_WORKING_DIR'], 'conf/spark-env.sh') + spark_env_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/spark-env.sh') spark_env_path_dest = os.path.join(spark_home, 'conf/spark-env.sh') copyfile(spark_env_path_src, spark_env_path_dest) def copy_spark_defaults(): - spark_default_path_src = os.path.join(os.environ['DOCKER_WORKING_DIR'], 'conf/spark-defaults.conf') + spark_default_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/spark-defaults.conf') spark_default_path_dest = os.path.join(spark_home, 'conf/spark-defaults.conf') copyfile(spark_default_path_src, spark_default_path_dest) def copy_core_site(): - spark_core_site_src = os.path.join(os.environ['DOCKER_WORKING_DIR'], 'conf/core-site.xml') + spark_core_site_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'conf/core-site.xml') spark_core_site_dest = os.path.join(spark_home, 'conf/core-site.xml') copyfile(spark_core_site_src, spark_core_site_dest) def copy_jars(): 
# Copy jars to $SPARK_HOME/jars - spark_default_path_src = os.path.join(os.environ['DOCKER_WORKING_DIR'], 'jars') + spark_default_path_src = os.path.join(os.environ['AZTK_WORKING_DIR'], 'jars') spark_default_path_dest = os.path.join(spark_home, 'jars') try: diff --git a/aztk/node_scripts/install/spark_container.py b/aztk/node_scripts/install/spark_container.py new file mode 100644 index 00000000..434bb5f2 --- /dev/null +++ b/aztk/node_scripts/install/spark_container.py @@ -0,0 +1,71 @@ +import subprocess + +from aztk.internal import DockerCmd +from aztk.utils import constants + +def start_spark_container( + docker_repo: str=None, + gpu_enabled: bool=False, + file_mounts=None, + plugins=None): + + cmd = DockerCmd( + name=constants.DOCKER_SPARK_CONTAINER_NAME, + docker_repo=docker_repo, + cmd="/bin/bash /mnt/batch/tasks/startup/wd/aztk/node_scripts/docker_main.sh", + gpu_enabled=gpu_enabled) + + if file_mounts: + for mount in file_mounts: + cmd.share_folder(mount.mount_path) + cmd.share_folder('/mnt/batch/tasks') + + cmd.pass_env('AZTK_WORKING_DIR') + cmd.pass_env('AZ_BATCH_ACCOUNT_NAME') + cmd.pass_env('BATCH_ACCOUNT_KEY') + cmd.pass_env('BATCH_SERVICE_URL') + cmd.pass_env('STORAGE_ACCOUNT_NAME') + cmd.pass_env('STORAGE_ACCOUNT_KEY') + cmd.pass_env('STORAGE_ACCOUNT_SUFFIX') + + cmd.pass_env('SP_TENANT_ID') + cmd.pass_env('SP_CLIENT_ID') + cmd.pass_env('SP_CREDENTIAL') + cmd.pass_env('SP_BATCH_RESOURCE_ID') + cmd.pass_env('SP_STORAGE_RESOURCE_ID') + + cmd.pass_env('AZ_BATCH_POOL_ID') + cmd.pass_env('AZ_BATCH_NODE_ID') + cmd.pass_env('AZ_BATCH_NODE_IS_DEDICATED') + + cmd.pass_env('AZTK_WORKER_ON_MASTER') + cmd.pass_env('AZTK_MIXED_MODE') + cmd.pass_env('AZTK_IS_MASTER') + cmd.pass_env('AZTK_IS_WORKER') + cmd.pass_env('AZTK_MASTER_IP') + + cmd.pass_env('SPARK_WEB_UI_PORT') + cmd.pass_env('SPARK_WORKER_UI_PORT') + cmd.pass_env('SPARK_CONTAINER_NAME') + cmd.pass_env('SPARK_SUBMIT_LOGS_FILE') + cmd.pass_env('SPARK_JOB_UI_PORT') + + cmd.open_port(8080) # Spark Master UI + 
cmd.open_port(7077) # Spark Master + cmd.open_port(7337) # Spark Shuffle Service + cmd.open_port(4040) # Job UI + cmd.open_port(18080) # Spark History Server UI + cmd.open_port(3022) # Docker SSH + + if plugins: + for plugin in plugins: + for port in plugin.ports: + cmd.open_port(port.internal) + + print("="*60) + print(" Starting docker container") + print("-"*60) + print(cmd.to_str()) + print("="*60) + subprocess.call(['/bin/bash', '-c', 'echo Is master?: $AZTK_IS_MASTER _ $AZTK_IS_WORKER']) + subprocess.call(['/bin/bash', '-c', cmd.to_str()]) diff --git a/aztk/node_scripts/main.py b/aztk/node_scripts/main.py index afd5edaa..727b188e 100644 --- a/aztk/node_scripts/main.py +++ b/aztk/node_scripts/main.py @@ -1,4 +1,5 @@ import sys +import aztk.spark from install import install def run(): @@ -8,8 +9,10 @@ def run(): action = sys.argv[1] - if action == "install": - install.setup_node() + if action == "setup-node": + install.setup_host(sys.argv[2]) + elif action == "setup-spark-container": + install.setup_spark_container() else: print("Action not supported") diff --git a/aztk/node_scripts/requirements.txt b/aztk/node_scripts/requirements.txt index 1dc6cfc4..abbb81be 100644 --- a/aztk/node_scripts/requirements.txt +++ b/aztk/node_scripts/requirements.txt @@ -1,7 +1,7 @@ azure-batch==3.0.0 -azure-storage-blob==1.1.0 -azure-mgmt-storage==1.5.0 azure-mgmt-batch==5.0.0 +azure-mgmt-storage==1.5.0 +azure-storage-blob==1.1.0 pyyaml==3.12 pycryptodome==3.4.7 diff --git a/aztk/node_scripts/setup_node.sh b/aztk/node_scripts/setup_host.sh similarity index 67% rename from aztk/node_scripts/setup_node.sh rename to aztk/node_scripts/setup_host.sh index 7cb996ce..5e2e07f3 100644 --- a/aztk/node_scripts/setup_node.sh +++ b/aztk/node_scripts/setup_host.sh @@ -2,12 +2,13 @@ # Entry point for the start task. It will install all dependencies and start docker. 
# Usage: -# setup_node.sh [container_name] [gpu_enabled] [docker_repo] [docker_cmd] +# setup_host.sh [container_name] [docker_repo_name] + +export AZTK_WORKING_DIR=/mnt/batch/tasks/startup/wd +export PYTHONUNBUFFERED=TRUE container_name=$1 -gpu_enabled=$2 -repo_name=$3 -docker_run_cmd=$4 +docker_repo_name=$2 echo "Installing pre-reqs" apt-get -y install linux-image-extra-$(uname -r) linux-image-extra-virtual @@ -15,6 +16,7 @@ apt-get -y install apt-transport-https apt-get -y install curl apt-get -y install ca-certificates apt-get -y install software-properties-common +apt-get -y install python3-pip python-dev build-essential libssl-dev echo "Done installing pre-reqs" # Install docker @@ -30,7 +32,13 @@ if ! host $HOSTNAME ; then echo $(hostname -I | awk '{print $1}') $HOSTNAME >> /etc/hosts fi -if [ $gpu_enabled == "True" ]; then +# Install docker-compose +echo "Installing Docker-Componse" +sudo curl -L https://github.com/docker/compose/releases/download/1.19.0/docker-compose-`uname -s`-`uname -m` -o /usr/local/bin/docker-compose +sudo chmod +x /usr/local/bin/docker-compose +echo "Done installing Docker-Compose" + +if [ $AZTK_GPU_ENABLED == "True" ]; then echo "running nvidia install" sudo apt-get -y install nvidia-384 sudo apt-get -y install nvidia-modprobe @@ -47,12 +55,12 @@ else docker login $DOCKER_ENDPOINT --username $DOCKER_USERNAME --password $DOCKER_PASSWORD fi -echo "Pulling $repo_name" -(time docker pull $repo_name) 2>&1 +echo "Pulling $docker_repo_name" +(time docker pull $docker_repo_name) 2>&1 # Unzip resource files and set permissions apt-get -y install unzip -chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/docker_main.sh +chmod 777 $AZTK_WORKING_DIR/aztk/node_scripts/docker_main.sh # Check docker is running docker info > /dev/null 2>&1 @@ -67,16 +75,26 @@ if [ "$(docker ps -a -q -f name=$container_name)" ]; then docker restart $container_name else echo "Creating docker container." 
- # Start docker - eval $docker_run_cmd + + echo "Node python version:" + python3 --version + # Install python dependencies + pip3 install -r $(dirname $0)/requirements.txt + export PYTHONPATH=$PYTHONPATH:$AZTK_WORKING_DIR + + echo "Running setup python script" + python3 $(dirname $0)/main.py setup-node $docker_repo_name # wait until container is running until [ "`/usr/bin/docker inspect -f {{.State.Running}} $container_name`"=="true" ]; do sleep 0.1; done; + + # wait until container setup is complete - docker exec spark /bin/bash -c 'python $DOCKER_WORKING_DIR/aztk/node_scripts/wait_until_setup_complete.py' + echo "Waiting for spark docker container to setup." + docker exec spark /bin/bash -c 'python $AZTK_WORKING_DIR/aztk/node_scripts/wait_until_setup_complete.py' # Setup symbolic link for the docker logs docker_log=$(docker inspect --format='{{.LogPath}}' $container_name) diff --git a/aztk/spark/client.py b/aztk/spark/client.py index 55c22cd6..d706f1ea 100644 --- a/aztk/spark/client.py +++ b/aztk/spark/client.py @@ -12,7 +12,6 @@ from aztk.spark.helpers import cluster_diagnostic_helper from aztk.spark.utils import util from aztk.internal.cluster_data import NodeData -import yaml class Client(BaseClient): @@ -202,7 +201,7 @@ def submit_job(self, job_configuration): autoscale_formula = "$TargetDedicatedNodes = {0}; " \ "$TargetLowPriorityNodes = {1}".format( - job_configuration.max_dedicated_nodes, + job_configuration.max_dedicated_nodes, job_configuration.max_low_pri_nodes) job = self.__submit_job( diff --git a/aztk/spark/helpers/create_cluster.py b/aztk/spark/helpers/create_cluster.py index fdc81106..2119efaa 100644 --- a/aztk/spark/helpers/create_cluster.py +++ b/aztk/spark/helpers/create_cluster.py @@ -11,72 +11,16 @@ scope=batch_models.AutoUserScope.pool, elevation_level=batch_models.ElevationLevel.admin)) -''' -Cluster create helper methods -''' -def __docker_run_cmd(docker_repo: str = None, - gpu_enabled: bool = False, - worker_on_master: bool = True, - 
file_mounts = None, - plugins = None, - mixed_mode = False) -> str: - """ - Build the docker run command by setting up the environment variables - """ - if gpu_enabled: - cmd = CommandBuilder('nvidia-docker run') - else: - cmd = CommandBuilder('docker run') - cmd.add_option('--net', 'host') - cmd.add_option('--name', constants.DOCKER_SPARK_CONTAINER_NAME) - cmd.add_option('-v', '/mnt/batch/tasks:/mnt/batch/tasks') - - if file_mounts: - for mount in file_mounts: - cmd.add_option('-v', '{0}:{0}'.format(mount.mount_path)) - - cmd.add_option('-e', 'DOCKER_WORKING_DIR=/mnt/batch/tasks/startup/wd') - cmd.add_option('-e', 'AZ_BATCH_ACCOUNT_NAME=$AZ_BATCH_ACCOUNT_NAME') - cmd.add_option('-e', 'BATCH_ACCOUNT_KEY=$BATCH_ACCOUNT_KEY') - cmd.add_option('-e', 'BATCH_SERVICE_URL=$BATCH_SERVICE_URL') - cmd.add_option('-e', 'STORAGE_ACCOUNT_NAME=$STORAGE_ACCOUNT_NAME') - cmd.add_option('-e', 'STORAGE_ACCOUNT_KEY=$STORAGE_ACCOUNT_KEY') - cmd.add_option('-e', 'STORAGE_ACCOUNT_SUFFIX=$STORAGE_ACCOUNT_SUFFIX') - cmd.add_option('-e', 'SP_TENANT_ID=$SP_TENANT_ID') - cmd.add_option('-e', 'SP_CLIENT_ID=$SP_CLIENT_ID') - cmd.add_option('-e', 'SP_CREDENTIAL=$SP_CREDENTIAL') - cmd.add_option('-e', 'SP_BATCH_RESOURCE_ID=$SP_BATCH_RESOURCE_ID') - cmd.add_option('-e', 'SP_STORAGE_RESOURCE_ID=$SP_STORAGE_RESOURCE_ID') - cmd.add_option('-e', 'AZ_BATCH_POOL_ID=$AZ_BATCH_POOL_ID') - cmd.add_option('-e', 'AZ_BATCH_NODE_ID=$AZ_BATCH_NODE_ID') - cmd.add_option( - '-e', 'AZ_BATCH_NODE_IS_DEDICATED=$AZ_BATCH_NODE_IS_DEDICATED') +def _get_aztk_environment(worker_on_master, mixed_mode): + envs = [] + envs.append(batch_models.EnvironmentSetting(name="AZTK_MIXED_MODE", value=mixed_mode)) if worker_on_master is not None: - cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(worker_on_master)) + envs.append(batch_models.EnvironmentSetting( + name="AZTK_WORKER_ON_MASTER", value=worker_on_master)) else: - # default to True if not specified - cmd.add_option('-e', 'WORKER_ON_MASTER={}'.format(True)) - 
cmd.add_option('-e', 'MIXED_MODE={}'.format(mixed_mode)) - cmd.add_option('-e', 'SPARK_WEB_UI_PORT=$SPARK_WEB_UI_PORT') - cmd.add_option('-e', 'SPARK_WORKER_UI_PORT=$SPARK_WORKER_UI_PORT') - cmd.add_option('-e', 'SPARK_CONTAINER_NAME=$SPARK_CONTAINER_NAME') - cmd.add_option('-e', 'SPARK_SUBMIT_LOGS_FILE=$SPARK_SUBMIT_LOGS_FILE') - cmd.add_option('-e', 'SPARK_JOB_UI_PORT=$SPARK_JOB_UI_PORT') - cmd.add_option('-p', '8080:8080') # Spark Master UI - cmd.add_option('-p', '7077:7077') # Spark Master - cmd.add_option('-p', '7337:7337') # Spark Shuffle Service - cmd.add_option('-p', '4040:4040') # Job UI - cmd.add_option('-p', '18080:18080') # Spark History Server UI - cmd.add_option('-p', '3022:3022') # Docker SSH - if plugins: - for plugin in plugins: - for port in plugin.ports: - cmd.add_option('-p', '{0}:{1}'.format(port.internal, port.internal)) # Jupyter UI - - cmd.add_option('-d', docker_repo) - cmd.add_argument('/bin/bash /mnt/batch/tasks/startup/wd/aztk/node_scripts/docker_main.sh') - - return cmd.to_str() + envs.append(batch_models.EnvironmentSetting( + name="AZTK_WORKER_ON_MASTER", value=False)) + return envs def __get_docker_credentials(spark_client): creds = [] @@ -162,12 +106,11 @@ def __cluster_install_cmd(zip_resource_file: batch_models.ResourceFile, 'apt-get -y install unzip', 'unzip $AZ_BATCH_TASK_WORKING_DIR/{0}'.format( zip_resource_file.file_path), - 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_node.sh', - '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_node.sh {0} {1} {2} "{3}"'.format( + 'chmod 777 $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh', + '/bin/bash $AZ_BATCH_TASK_WORKING_DIR/aztk/node_scripts/setup_host.sh {0} {1}'.format( constants.DOCKER_SPARK_CONTAINER_NAME, - gpu_enabled, docker_repo, - __docker_run_cmd(docker_repo, gpu_enabled, worker_on_master, file_mounts, plugins, mixed_mode)), + ), ] commands = shares + setup @@ -208,7 +151,9 @@ def generate_cluster_start_task( 
name="SPARK_CONTAINER_NAME", value=spark_container_name), batch_models.EnvironmentSetting( name="SPARK_SUBMIT_LOGS_FILE", value=spark_submit_logs_file), - ] + __get_docker_credentials(spark_client) + batch_models.EnvironmentSetting( + name="AZTK_GPU_ENABLED", value=gpu_enabled), + ] + __get_docker_credentials(spark_client) + _get_aztk_environment(worker_on_master, mixed_mode) # start task command command = __cluster_install_cmd(zip_resource_file, gpu_enabled, docker_repo, plugins, worker_on_master, file_shares, mixed_mode) diff --git a/aztk/spark/helpers/job_submission.py b/aztk/spark/helpers/job_submission.py index 25ff038f..96f15766 100644 --- a/aztk/spark/helpers/job_submission.py +++ b/aztk/spark/helpers/job_submission.py @@ -21,7 +21,7 @@ def __app_cmd(): docker_exec.add_option("-e", "AZ_BATCH_JOB_ID=$AZ_BATCH_JOB_ID") docker_exec.add_argument("spark /bin/bash >> output.log 2>&1 -c \""\ "source ~/.bashrc; "\ - "python \$DOCKER_WORKING_DIR/aztk/node_scripts/job_submission.py\"") + "python \$AZTK_WORKING_DIR/aztk/node_scripts/job_submission.py\"") return docker_exec.to_str() diff --git a/aztk/spark/helpers/submit.py b/aztk/spark/helpers/submit.py index 5e7ac365..37557743 100644 --- a/aztk/spark/helpers/submit.py +++ b/aztk/spark/helpers/submit.py @@ -85,7 +85,7 @@ def generate_task(spark_client, container_id, application): task_cmd.add_argument('-c "source ~/.bashrc; '\ 'cd $AZ_BATCH_TASK_WORKING_DIR; ' \ '\$(pyenv root)/versions/\$AZTK_PYTHON_VERSION/bin/python ' \ - '\$DOCKER_WORKING_DIR/aztk/node_scripts/submit.py"') + '\$AZTK_WORKING_DIR/aztk/node_scripts/submit.py"') # Create task task = batch_models.TaskAddParameter( diff --git a/aztk/spark/models/plugins/__init__.py b/aztk/spark/models/plugins/__init__.py index 7ea2cdff..73fe3f3b 100644 --- a/aztk/spark/models/plugins/__init__.py +++ b/aztk/spark/models/plugins/__init__.py @@ -2,4 +2,5 @@ from .jupyter import * from .jupyter_lab import * from .rstudio_server import * +from .simple import * from 
.spark_ui_proxy import * diff --git a/aztk/spark/models/plugins/hdfs/configuration.py b/aztk/spark/models/plugins/hdfs/configuration.py index ddc9d190..99a73d6f 100644 --- a/aztk/spark/models/plugins/hdfs/configuration.py +++ b/aztk/spark/models/plugins/hdfs/configuration.py @@ -1,5 +1,5 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginRunTarget +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile from aztk.utils import constants @@ -38,7 +38,7 @@ def __init__(self): public=True, ), ], - run_on=PluginRunTarget.All, + target_role=PluginTargetRole.All, execute="hdfs.sh", files=[ PluginFile("hdfs.sh", os.path.join(dir_path, "hdfs.sh")), diff --git a/aztk/spark/models/plugins/hdfs/hdfs.sh b/aztk/spark/models/plugins/hdfs/hdfs.sh index b2e94f70..ba8d6c5e 100644 --- a/aztk/spark/models/plugins/hdfs/hdfs.sh +++ b/aztk/spark/models/plugins/hdfs/hdfs.sh @@ -21,14 +21,14 @@ sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' /usr/sbin/sshd # install and configure hadoop -mkdir /home/hadoop-2.8.2 -curl http://apache.claz.org/hadoop/common/hadoop-2.8.2/hadoop-2.8.2.tar.gz | tar -xz -C /home +mkdir /home/hadoop-2.8.3 +curl http://apache.claz.org/hadoop/common/hadoop-2.8.3/hadoop-2.8.3.tar.gz | tar -xz -C /home -export HADOOP_HOME=/home/hadoop-2.8.2 -echo 'export HADOOP_HOME=/home/hadoop-2.8.2' >> ~/.bashrc +export HADOOP_HOME=/home/hadoop-2.8.3 +echo 'export HADOOP_HOME=/home/hadoop-2.8.3' >> ~/.bashrc -export HADOOP_CONF_DIR=/home/hadoop-2.8.2/etc/hadoop -echo 'export HADOOP_CONF_DIR=/home/hadoop-2.8.2/etc/hadoop' >> ~/.bashrc +export HADOOP_CONF_DIR=/home/hadoop-2.8.3/etc/hadoop +echo 'export HADOOP_CONF_DIR=/home/hadoop-2.8.3/etc/hadoop' >> ~/.bashrc export PATH=$PATH:$HADOOP_HOME/bin echo 'export PATH=$PATH:$HADOOP_HOME/bin' >> ~/.bashrc @@ -41,7 +41,7 @@ echo ' fs.defaultFS - 
hdfs://'$MASTER_IP':8020 + hdfs://'$AZTK_MASTER_IP':8020 ' > $HADOOP_HOME/etc/hadoop/core-site.xml @@ -59,12 +59,12 @@ echo ' ' > $HADOOP_HOME/etc/hadoop/hdfs-site.xml # run HDFS -if [ $IS_MASTER -eq "1" ]; then +if [ "$AZTK_IS_MASTER" -eq "1" ]; then echo 'starting namenode and datanode' hdfs namenode -format $HADOOP_HOME/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode $HADOOP_HOME/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode else - echo 'starting datanode - namenode at ' $MASTER_IP ':8020' + echo 'starting datanode - namenode at ' $AZTK_MASTER_IP ':8020' $HADOOP_HOME/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode fi diff --git a/aztk/spark/models/plugins/jupyter/configuration.py b/aztk/spark/models/plugins/jupyter/configuration.py index 01f4289d..ef53f78e 100644 --- a/aztk/spark/models/plugins/jupyter/configuration.py +++ b/aztk/spark/models/plugins/jupyter/configuration.py @@ -1,5 +1,5 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginRunTarget +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile from aztk.utils import constants @@ -15,7 +15,7 @@ def __init__(self): public=True, ), ], - run_on=PluginRunTarget.All, + target_role=PluginTargetRole.All, execute="jupyter.sh", files=[ PluginFile("jupyter.sh", os.path.join(dir_path, "jupyter.sh")), diff --git a/aztk/spark/models/plugins/jupyter/jupyter.sh b/aztk/spark/models/plugins/jupyter/jupyter.sh index ab71b2fa..14e1e410 100644 --- a/aztk/spark/models/plugins/jupyter/jupyter.sh +++ b/aztk/spark/models/plugins/jupyter/jupyter.sh @@ -8,7 +8,9 @@ # - aztk/python:spark2.1.0-python3.6.2-base # - aztk/python:spark2.1.0-python3.6.2-gpu -if [ "$IS_MASTER" = "1" ]; then +echo "Is master: $AZTK_IS_MASTER" + +if [ "$AZTK_IS_MASTER" = "1" ]; then pip install jupyter --upgrade 
pip install notebook --upgrade @@ -45,7 +47,7 @@ if [ "$IS_MASTER" = "1" ]; then "env": { "SPARK_HOME": "$SPARK_HOME", "PYSPARK_PYTHON": "python", - "PYSPARK_SUBMIT_ARGS": "--master spark://$MASTER_IP:7077 pyspark-shell" + "PYSPARK_SUBMIT_ARGS": "--master spark://$AZTK_MASTER_IP:7077 pyspark-shell" } } EOF diff --git a/aztk/spark/models/plugins/jupyter_lab/configuration.py b/aztk/spark/models/plugins/jupyter_lab/configuration.py index 768384f3..cced476a 100644 --- a/aztk/spark/models/plugins/jupyter_lab/configuration.py +++ b/aztk/spark/models/plugins/jupyter_lab/configuration.py @@ -1,5 +1,5 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginRunTarget +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile from aztk.utils import constants @@ -15,7 +15,7 @@ def __init__(self): public=True, ), ], - run_on=PluginRunTarget.All, + run_on=PluginTargetRole.All, execute="jupyter_lab.sh", files=[ PluginFile("jupyter_lab.sh", os.path.join(dir_path, "jupyter_lab.sh")), diff --git a/aztk/spark/models/plugins/rstudio_server/configuration.py b/aztk/spark/models/plugins/rstudio_server/configuration.py index 02081e0c..fd6a422d 100644 --- a/aztk/spark/models/plugins/rstudio_server/configuration.py +++ b/aztk/spark/models/plugins/rstudio_server/configuration.py @@ -1,5 +1,5 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginRunTarget +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile from aztk.utils import constants dir_path = os.path.dirname(os.path.realpath(__file__)) @@ -15,7 +15,7 @@ def __init__(self, version="1.1.383"): public=True, ), ], - run_on=PluginRunTarget.Master, + target_role=PluginTargetRole.Master, execute="rstudio_server.sh", files=[ 
PluginFile("rstudio_server.sh", os.path.join(dir_path, "rstudio_server.sh")), diff --git a/aztk/spark/models/plugins/rstudio_server/rstudio_server.sh b/aztk/spark/models/plugins/rstudio_server/rstudio_server.sh index cd9c3538..c586318b 100644 --- a/aztk/spark/models/plugins/rstudio_server/rstudio_server.sh +++ b/aztk/spark/models/plugins/rstudio_server/rstudio_server.sh @@ -7,7 +7,7 @@ # - jiata/aztk-r:0.1.0-spark2.1.0-r3.4.1 # - jiata/aztk-r:0.1.0-spark1.6.3-r3.4.1 -if [ "$IS_MASTER" = "1" ]; then +if [ "$AZTK_IS_MASTER" = "1" ]; then ## Download and install Rstudio Server wget https://download2.rstudio.org/rstudio-server-$RSTUDIO_SERVER_VERSION-amd64.deb diff --git a/aztk/spark/models/plugins/simple/__init__.py b/aztk/spark/models/plugins/simple/__init__.py new file mode 100644 index 00000000..2ec26f31 --- /dev/null +++ b/aztk/spark/models/plugins/simple/__init__.py @@ -0,0 +1 @@ +from .configuration import * diff --git a/aztk/spark/models/plugins/simple/configuration.py b/aztk/spark/models/plugins/simple/configuration.py new file mode 100644 index 00000000..f0ea6622 --- /dev/null +++ b/aztk/spark/models/plugins/simple/configuration.py @@ -0,0 +1,18 @@ +import os +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole, PluginTarget +from aztk.models.plugins.plugin_file import PluginFile +from aztk.utils import constants + +dir_path = os.path.dirname(os.path.realpath(__file__)) + +class SimplePlugin(PluginConfiguration): + def __init__(self): + super().__init__( + name="simple", + target_role=PluginTargetRole.All, + target=PluginTarget.Host, + execute="simple.sh", + files=[ + PluginFile("simple.sh", os.path.join(dir_path, "simple.sh")), + ], + ) diff --git a/aztk/spark/models/plugins/simple/simple.sh b/aztk/spark/models/plugins/simple/simple.sh new file mode 100644 index 00000000..a237e07c --- /dev/null +++ b/aztk/spark/models/plugins/simple/simple.sh @@ -0,0 +1,5 @@ +#!/bin/bash + +echo "Is master: $AZTK_IS_MASTER" 
+echo "Is worker: $AZTK_IS_WORKER" +echo "Master node ip: $AZTK_MASTER_IP" diff --git a/aztk/spark/models/plugins/spark_ui_proxy/configuration.py b/aztk/spark/models/plugins/spark_ui_proxy/configuration.py index 92c726ab..ceb71e67 100644 --- a/aztk/spark/models/plugins/spark_ui_proxy/configuration.py +++ b/aztk/spark/models/plugins/spark_ui_proxy/configuration.py @@ -1,5 +1,5 @@ import os -from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginRunTarget +from aztk.models.plugins.plugin_configuration import PluginConfiguration, PluginPort, PluginTargetRole from aztk.models.plugins.plugin_file import PluginFile from aztk.utils import constants @@ -16,7 +16,7 @@ def __init__(self): public=True ) ], - run_on=PluginRunTarget.Master, + target_role=PluginTargetRole.Master, execute="spark_ui_proxy.sh", args=["localhost:8080", "9999"], files=[ diff --git a/aztk/utils/ssh.py b/aztk/utils/ssh.py index bd139683..733a7ea0 100644 --- a/aztk/utils/ssh.py +++ b/aztk/utils/ssh.py @@ -9,8 +9,6 @@ import sys from concurrent.futures import ThreadPoolExecutor -import paramiko - from . 
import helpers @@ -19,6 +17,7 @@ def connect(hostname, username=None, password=None, pkey=None): + import paramiko client = paramiko.SSHClient() diff --git a/custom-scripts/hdfs.sh b/custom-scripts/hdfs.sh index b2e94f70..b475a6db 100644 --- a/custom-scripts/hdfs.sh +++ b/custom-scripts/hdfs.sh @@ -41,7 +41,7 @@ echo ' fs.defaultFS - hdfs://'$MASTER_IP':8020 + hdfs://'$AZTK_MASTER_IP':8020 ' > $HADOOP_HOME/etc/hadoop/core-site.xml @@ -59,12 +59,12 @@ echo ' ' > $HADOOP_HOME/etc/hadoop/hdfs-site.xml # run HDFS -if [ $IS_MASTER -eq "1" ]; then +if [ "$AZTK_IS_MASTER" -eq "1" ]; then echo 'starting namenode and datanode' hdfs namenode -format $HADOOP_HOME/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start namenode $HADOOP_HOME/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode else - echo 'starting datanode - namenode at ' $MASTER_IP ':8020' + echo 'starting datanode - namenode at ' $AZTK_MASTER_IP ':8020' $HADOOP_HOME/sbin/hadoop-daemon.sh --config $HADOOP_CONF_DIR --script hdfs start datanode fi diff --git a/custom-scripts/jupyter.sh b/custom-scripts/jupyter.sh index ab71b2fa..595173e5 100644 --- a/custom-scripts/jupyter.sh +++ b/custom-scripts/jupyter.sh @@ -8,7 +8,7 @@ # - aztk/python:spark2.1.0-python3.6.2-base # - aztk/python:spark2.1.0-python3.6.2-gpu -if [ "$IS_MASTER" = "1" ]; then +if [ "$AZTK_IS_MASTER" = "1" ]; then pip install jupyter --upgrade pip install notebook --upgrade @@ -45,7 +45,7 @@ if [ "$IS_MASTER" = "1" ]; then "env": { "SPARK_HOME": "$SPARK_HOME", "PYSPARK_PYTHON": "python", - "PYSPARK_SUBMIT_ARGS": "--master spark://$MASTER_IP:7077 pyspark-shell" + "PYSPARK_SUBMIT_ARGS": "--master spark://$AZTK_MASTER_IP:7077 pyspark-shell" } } EOF diff --git a/custom-scripts/rstudio_server.sh b/custom-scripts/rstudio_server.sh index fa69b937..f50e438b 100644 --- a/custom-scripts/rstudio_server.sh +++ b/custom-scripts/rstudio_server.sh @@ -7,7 +7,7 @@ # - jiata/aztk-r:0.1.0-spark2.1.0-r3.4.1 # - 
jiata/aztk-r:0.1.0-spark1.6.3-r3.4.1 -if [ "$IS_MASTER" = "1" ]; then +if [ "$AZTK_IS_MASTER" = "1" ]; then ## Download and install Rstudio Server wget https://download2.rstudio.org/rstudio-server-$RSTUDIO_SERVER_VERSION-amd64.deb diff --git a/custom-scripts/simple.sh b/custom-scripts/simple.sh index a408ab36..871075f5 100644 --- a/custom-scripts/simple.sh +++ b/custom-scripts/simple.sh @@ -1,10 +1,12 @@ #!/bin/bash -if [ "$IS_MASTER" = "1" ]; then +# Demo plugin. Not actually meant to be used. + +if [ "$AZTK_IS_MASTER" = "1" ]; then echo "This is a custom script running on just the master!" fi -if [ "$IS_WORKER" = "1" ]; then +if [ "$AZTK_IS_WORKER" = "1" ]; then echo "This is a custom script running on just the workers!" fi diff --git a/docs/51-define-plugin.md b/docs/51-define-plugin.md new file mode 100644 index 00000000..1096209f --- /dev/null +++ b/docs/51-define-plugin.md @@ -0,0 +1,89 @@ + +# Define a custom plugin + +## Full example +```py + +from aztk.spark.models.plugins import PluginConfiguration, PluginFile,PluginPort, PluginTarget, PluginTargetRole + +cluster_config = ClusterConfiguration( + ...# Other config, + plugins=[ + PluginConfiguration( + name="my-custom-plugin", + files=[ + PluginFile("file.sh", "/my/local/path/to/file.sh"), + PluginFile("data/one.json", "/my/local/path/to/data/one.json"), + PluginFile("data/two.json", "/my/local/path/to/data/two.json"), + ], + execute="file.sh", # This must be one of the files defined in the file list and match the target path, + env=dict( + SOME_ENV_VAR="foo" + ), + args=["arg1"], # Those arguments are passed to your execute script + ports=[ + PluginPort(internal="1234"), # Internal only(For node communication for example) + PluginPort(internal="2345", public=True), # Open the port to the public(When ssh into). Used for UI for example + ], + + # Pick where you want the plugin to run + target=PluginTarget.Host, # The script will be run on the host. 
Default value is to run in the spark container + target_role=PluginTargetRole.All, # If the plugin should be run only on the master worker or all. You can use environment variables(See below to have different master/worker config) + ) + ] +) +``` + +## Parameters + +### `PluginConfiguration` +| Name | Required? | Type | Description | +|--------------|-----------|---------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `name` | required | string | Name of your plugin(This will be used for creating folder, it is recommended to have a simple letter, dash, underscore only name) | +| `files` | required | List[PluginFile\|PluginTextFile] | List of files to upload | +| `execute` | required | str | Script to execute. This script must be defined in the files above and must match its remote path | +| `args` | optional | List[str] | List of arguments to be passed to your execute scripts | +| `env` | optional | dict | List of environment variables to access in the script(This can be used to pass arguments to your script instead of args) | +| `ports` | optional | List[PluginPort] | List of ports to open if the script is running in a container. A port can also be specified as public and it will then be accessible when ssh into the master node. | +| `target` | optional | PluginTarget | Define where the execute script should be running. Potential values are `PluginTarget.SparkContainer(Default)` and `PluginTarget.Host` | +| `target_role` | optional | PluginTargetRole | If the plugin should be run only on the master worker or all. You can use environment variables(See below to have different master/worker config) | + +### `PluginFile` +| Name | Required?
| Type | Description | +|--------------|-----------|------|------------------------------------------------------------------------------| +| `target` | required | str | Where the file should be dropped relative to the plugin working directory | +| `local_path` | required | str | Path to the local file you want to upload(Could come from the plugin's parameters) | + +### `TextPluginFile` +| Name | Required? | Type | Description | +|-----------|-----------|-------------------|------------------------------------------------------------------------------| +| `target` | required | str | Where the file should be dropped relative to the plugin working directory | +| `content` | required | str \| io.StringIO | Content of the file you want to upload(Could come from the plugin's parameters) | + +### `PluginPort` +| Name | Required? | Type | Description | +|------------|-----------|------|-------------------------------------------------------| +| `internal` | required | int | Internal port to open on the docker container | +| `public` | optional | bool | If the port should be open publicly(Default: `False`) | + +## Environment variables available in the plugin + +AZTK provides a few environment variables that can be used in your plugin script + +* `AZTK_IS_MASTER`: Is the plugin running on the master node +* `AZTK_IS_WORKER`: Is a worker setup on the current node(This might also be a master if you have `worker_on_master` set to true) +* `AZTK_MASTER_IP`: Internal IP of the master + +## Debug your plugin +When your plugin is not working as expected there are a few things you can do to investigate issues + +Check the logs, you can either use the debug tool or [BatchLabs](https://github.com/Azure/BatchLabs) +Navigate to `startup/wd/logs/plugins` +![](misc/plugin-logs.png) + +* Now if you see a file named `.txt` under that folder it means that your plugin started correctly and you can check this file to see what your execute script logged.
+* If this file doesn't exist, this means the script was not run on this node. There could be multiple reasons for this: + - If you want your plugin to run on the spark container check the `startup/wd/logs/docker.log` file for information about this + - If you want your plugin to run on the host check the `startup/stdout.txt` and `startup/stderr.txt` + + The log could mention you picked the wrong target or target role for that plugin which is why this plugin is not running on this node. diff --git a/docs/misc/plugin-logs.png b/docs/misc/plugin-logs.png new file mode 100644 index 00000000..c8391afa Binary files /dev/null and b/docs/misc/plugin-logs.png differ diff --git a/pylintrc b/pylintrc index 71710d66..c8660ad2 100644 --- a/pylintrc +++ b/pylintrc @@ -66,7 +66,7 @@ confidence= # no Warning level messages displayed, use"--disable=all --enable=classes # --disable=W" # disable=print-statement,parameter-unpacking,unpacking-in-except,old-raise-syntax,backtick,import-star-module-level,apply-builtin,basestring-builtin,buffer-builtin,cmp-builtin,coerce-builtin,execfile-builtin,file-builtin,long-builtin,raw_input-builtin,reduce-builtin,standarderror-builtin,unicode-builtin,xrange-builtin,coerce-method,delslice-method,getslice-method,setslice-method,no-absolute-import,old-division,dict-iter-method,dict-view-method,next-method-called,metaclass-assignment,indexing-exception,raising-string,reload-builtin,oct-method,hex-method,nonzero-method,cmp-method,input-builtin,round-builtin,intern-builtin,unichr-builtin,map-builtin-not-iterating,zip-builtin-not-iterating,range-builtin-not-iterating,filter-builtin-not-iterating,using-cmp-argument,long-suffix,old-ne-operator,old-octal-literal,suppressed-message,useless-suppression -disable = C0111,W0401,I0011,C0103,E1101,too-few-public-methods +disable = C0111,W0401,I0011,C0103,E1101,too-few-public-methods,too-many-instance-attributes,too-many-arguments [REPORTS] diff --git a/tests/models/plugins/test_plugin_configuration.py
b/tests/models/plugins/test_plugin_configuration.py index d7ff7c63..9369f5b8 100644 --- a/tests/models/plugins/test_plugin_configuration.py +++ b/tests/models/plugins/test_plugin_configuration.py @@ -1,5 +1,7 @@ -from aztk.models.plugins import PluginConfiguration, PluginPort, PluginRunTarget +import pytest +from aztk.models.plugins import PluginConfiguration, PluginPort, PluginTarget, PluginTargetRole +from aztk.error import InvalidPluginConfigurationError def test_create_basic_plugin(): plugin = PluginConfiguration( @@ -8,7 +10,7 @@ def test_create_basic_plugin(): assert plugin.files == ["file.sh"] assert plugin.execute == "file.sh" assert plugin.args == [] - assert plugin.run_on == PluginRunTarget.Master + assert plugin.target_role == PluginTargetRole.Master def test_create_with_args(): @@ -45,3 +47,16 @@ def test_plugin_with_specified_public_port(): assert port.internal == 1234 assert port.expose_publicly == True assert port.public_port == 4321 + + +def throw_error_if_passing_invalid_target(): + with pytest.raises(InvalidPluginConfigurationError): + PluginConfiguration(name="abc", target="some") + with pytest.raises(InvalidPluginConfigurationError): + PluginConfiguration(name="abc", target=PluginTargetRole.All) + +def throw_error_if_passing_invalid_target_role(): + with pytest.raises(InvalidPluginConfigurationError): + PluginConfiguration(name="abc", target_role="some") + with pytest.raises(InvalidPluginConfigurationError): + PluginConfiguration(name="abc", target_role=PluginTarget.Host)