diff --git a/aztk/client.py b/aztk/client.py index 0276affc..584a66a5 100644 --- a/aztk/client.py +++ b/aztk/client.py @@ -229,43 +229,48 @@ def __delete_user_on_pool(self, username, pool_id, nodes): concurrent.futures.wait(futures) - def __cluster_run(self, cluster_id, container_name, command, internal): + def __cluster_run(self, cluster_id, command, internal, container_name=None): pool, nodes = self.__get_pool_details(cluster_id) nodes = [node for node in nodes] if internal: - cluster_nodes = [models.RemoteLogin(ip_address=node.ip_address, port="22") for node in nodes] + cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: - cluster_nodes = [self.__get_remote_login_settings(pool.id, node.id) for node in nodes] + cluster_nodes = [(node, self.__get_remote_login_settings(pool.id, node.id)) for node in nodes] try: ssh_key = self.__create_user_on_pool('aztk', pool.id, nodes) - asyncio.get_event_loop().run_until_complete(ssh_lib.clus_exec_command(command, - container_name, - 'aztk', - cluster_nodes, - ssh_key=ssh_key.exportKey().decode('utf-8'))) + output = asyncio.get_event_loop().run_until_complete(ssh_lib.clus_exec_command(command, + 'aztk', + cluster_nodes, + ssh_key=ssh_key.exportKey().decode('utf-8'), + container_name=container_name)) + return output except OSError as exc: raise exc finally: self.__delete_user_on_pool('aztk', pool.id, nodes) - def __cluster_copy(self, cluster_id, container_name, source_path, destination_path, internal): + def __cluster_copy(self, cluster_id, source_path, destination_path, container_name=None, internal=False, get=False): pool, nodes = self.__get_pool_details(cluster_id) nodes = [node for node in nodes] if internal: - cluster_nodes = [models.RemoteLogin(ip_address=node.ip_address, port="22") for node in nodes] + cluster_nodes = [(node, models.RemoteLogin(ip_address=node.ip_address, port="22")) for node in nodes] else: - cluster_nodes = [self.__get_remote_login_settings(pool.id, node.id) for node in nodes] + cluster_nodes = [(node, self.__get_remote_login_settings(pool.id, node.id)) for node in nodes] try: ssh_key = self.__create_user_on_pool('aztk', pool.id, nodes) - asyncio.get_event_loop().run_until_complete(ssh_lib.clus_copy(container_name=container_name, - username='aztk', - nodes=cluster_nodes, - source_path=source_path, - destination_path=destination_path, - ssh_key=ssh_key.exportKey().decode('utf-8'))) - self.__delete_user_on_pool('aztk', pool.id, nodes) + output = asyncio.get_event_loop().run_until_complete( + ssh_lib.clus_copy(container_name=container_name, + username='aztk', + nodes=cluster_nodes, + source_path=source_path, + destination_path=destination_path, + ssh_key=ssh_key.exportKey().decode('utf-8'), + get=get)) + return output except (OSError, batch_error.BatchErrorException) as exc: raise exc + finally: + self.__delete_user_on_pool('aztk', pool.id, nodes) def __submit_job(self, job_configuration, @@ -388,5 +393,8 @@ def cluster_run(self, cluster_id, command): def cluster_copy(self, cluster_id, source_path, destination_path): raise NotImplementedError() + def cluster_download(self, cluster_id, source_path, destination_path): + raise NotImplementedError() + def submit_job(self, job): raise NotImplementedError() diff --git a/aztk/spark/client.py b/aztk/spark/client.py index 9f5c1a88..55c22cd6 100644 --- a/aztk/spark/client.py +++ b/aztk/spark/client.py @@ -9,6 +9,7 @@ from aztk.spark.helpers import submit as cluster_submit_helper from aztk.spark.helpers import job_submission as 
job_submit_helper from aztk.spark.helpers import get_log as get_log_helper +from aztk.spark.helpers import cluster_diagnostic_helper from aztk.spark.utils import util from aztk.internal.cluster_data import NodeData import yaml @@ -146,15 +147,23 @@ def get_application_status(self, cluster_id: str, app_name: str): except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def cluster_run(self, cluster_id: str, command: str, internal: bool = False): + def cluster_run(self, cluster_id: str, command: str, host=False, internal: bool = False): try: - return self.__cluster_run(cluster_id, 'spark', command, internal) + return self.__cluster_run(cluster_id, command, internal, container_name='spark' if not host else None) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) - def cluster_copy(self, cluster_id: str, source_path: str, destination_path: str, internal: bool = False): + def cluster_copy(self, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False): try: - return self.__cluster_copy(cluster_id, 'spark', source_path, destination_path, internal) + container_name = None if host else 'spark' + return self.__cluster_copy(cluster_id, source_path, destination_path, container_name=container_name, get=False, internal=internal) + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) + + def cluster_download(self, cluster_id: str, source_path: str, destination_path: str, host: bool = False, internal: bool = False): + try: + container_name = None if host else 'spark' + return self.__cluster_copy(cluster_id, source_path, destination_path, container_name=container_name, get=True, internal=internal) except batch_error.BatchErrorException as e: raise error.AztkError(helpers.format_batch_exception(e)) @@ -272,3 +281,10 @@ def wait_until_job_finished(self, job_id): def wait_until_all_jobs_finished(self, jobs): for job in jobs: self.wait_until_job_finished(job) + + def run_cluster_diagnostics(self, cluster_id, output_directory): + try: + output = cluster_diagnostic_helper.run(self, cluster_id, output_directory) + return output + except batch_error.BatchErrorException as e: + raise error.AztkError(helpers.format_batch_exception(e)) diff --git a/aztk/spark/helpers/cluster_diagnostic_helper.py b/aztk/spark/helpers/cluster_diagnostic_helper.py new file mode 100644 index 00000000..d6e4a525 --- /dev/null +++ b/aztk/spark/helpers/cluster_diagnostic_helper.py @@ -0,0 +1,26 @@ +import os +from aztk.utils import ssh +from aztk.utils.command_builder import CommandBuilder +from aztk import models as aztk_models +import azure.batch.models as batch_models + +def run(spark_client, cluster_id, output_directory): + # copy debug program to each node + spark_client.cluster_copy(cluster_id, os.path.abspath("./aztk/spark/utils/debug.py"), "/tmp/debug.py", host=True) + ssh_cmd = _build_diagnostic_ssh_command() + run_output = spark_client.cluster_run(cluster_id, ssh_cmd, host=True) + local_path = os.path.join(os.path.abspath(output_directory), "debug", "debug.zip") + remote_path = "/tmp/debug.zip" + output = spark_client.cluster_download(cluster_id, remote_path, local_path, host=True) + # write run output to debug/ directory + with open(os.path.join(os.path.dirname(local_path), "debug-output.txt"), 'w', encoding="UTF-8") as f: + [f.write(line + '\n') for node_id, result in run_output for line in result] + return output + + +def 
_build_diagnostic_ssh_command(): + return "sudo rm -rf /tmp/debug.zip; "\ + "sudo apt-get install -y python3-pip; "\ + "sudo -H pip3 install --upgrade pip; "\ + "sudo -H pip3 install docker; "\ + "sudo python3 /tmp/debug.py" diff --git a/aztk/spark/utils/debug.py b/aztk/spark/utils/debug.py new file mode 100644 index 00000000..21a5e9eb --- /dev/null +++ b/aztk/spark/utils/debug.py @@ -0,0 +1,160 @@ +""" + Diagnostic program that runs on each node in the cluster + This program must be run with sudo +""" +import io +import json +import os +import socket +import tarfile +from subprocess import STDOUT, CalledProcessError, check_output +from zipfile import ZIP_DEFLATED, ZipFile + +import docker # pylint: disable=import-error + + +def main(): + zipf = create_zip_archive() + + # general node diagnostics + zipf.writestr("hostname.txt", data=get_hostname()) + zipf.writestr("df.txt", data=get_disk_free()) + + # docker container diagnostics + docker_client = docker.from_env() + for filename, data in get_docker_diagnostics(docker_client): + zipf.writestr(filename, data=data) + + zipf.close() + + +def create_zip_archive(): + zip_file_path = "/tmp/debug.zip" + return ZipFile(zip_file_path, "w", ZIP_DEFLATED) + + +def get_hostname(): + return socket.gethostname() + + +def cmd_check_output(cmd): + try: + output = check_output(cmd, shell=True, stderr=STDOUT) + except CalledProcessError as e: + return "CMD: {0}\n"\ + "returncode: {1}"\ + "output: {2}".format(e.cmd, e.returncode, e.output) + else: + return output + + +def get_disk_free(): + return cmd_check_output("df -h") + + +def get_docker_diagnostics(docker_client): + ''' + returns list of tuples (filename, data) to be written in the zip + ''' + output = [] + output.append(get_docker_images(docker_client)) + logs = get_docker_containers(docker_client) + for item in logs: + output.append(item) + + return output + + +def get_docker_images(docker_client): + output = "" + try: + images = docker_client.images.list() + for image in images: + output += json.dumps(image.attrs, sort_keys=True, indent=4) + return ("docker-images.txt", output) + except docker.errors.APIError as e: + return ("docker-images.err", e.__str__()) + + +def get_docker_containers(docker_client): + container_attrs = "" + logs = [] + try: + containers = docker_client.containers.list() + for container in containers: + container_attrs += json.dumps(container.attrs, sort_keys=True, indent=4) + # get docker container logs + logs.append((container.name + "/docker.log", container.logs())) + logs.append(get_docker_process_status(container)) + if container.name == "spark": #TODO: find a more robust way to get specific info off specific containers + logs.extend(get_container_aztk_script(container)) + logs.extend(get_spark_logs(container)) + logs.extend(get_spark_app_logs(container)) + + logs.append(("docker-containers.txt", container_attrs)) + return logs + except docker.errors.APIError as e: + return [("docker-containers.err", e.__str__())] + + +def get_docker_process_status(container): + try: + exit_code, output = container.exec_run("ps -auxw", tty=True, privileged=True) + out_file_name = container.name + "/ps_aux.txt" + if exit_code == 0: + return (out_file_name, output) + else: + return (out_file_name, "exit_code: {0}\n{1}".format(exit_code, output)) + except docker.errors.APIError as e: + return (container.name + "ps_aux.err", e.__str__()) + + +def get_container_aztk_script(container): + aztk_path = "/mnt/batch/tasks/startup/wd" + try: + stream, _ = container.get_archive(aztk_path) # second item 
is stat info + return extract_tar_in_memory(container, stream) + except docker.errors.APIError as e: + return (container.name + "/" + "aztk-scripts.err", e.__str__()) + + +def get_spark_logs(container): + spark_logs_path = "/home/spark-current/logs" + try: + stream, _ = container.get_archive(spark_logs_path) # second item is stat info + return extract_tar_in_memory(container, stream) + except docker.errors.APIError as e: + return [(container.name + "/" + "spark-logs.err", e.__str__())] + + +def get_spark_app_logs(container): + spark_app_logs_path = "/home/spark-current/work" + try: + stream, _ = container.get_archive(spark_app_logs_path) + return extract_tar_in_memory(container, stream) + except docker.errors.APIError as e: + return [(container.name + "/" + "spark-work-logs.err", e.__str__())] + + +def filter_members(members): + skip_files = ["id_rsa", "id_rsa.pub", "docker.log"] + skip_extensions = [".pyc", ".zip"] + for tarinfo in members: + if (os.path.splitext(tarinfo.name)[1] not in skip_extensions and + os.path.basename(tarinfo.name) not in skip_files): + yield tarinfo + + +def extract_tar_in_memory(container, data): + data = io.BytesIO(b''.join([item for item in data])) + tarf = tarfile.open(fileobj=data) + logs = [] + for member in filter_members(tarf): + file_bytes = tarf.extractfile(member) + if file_bytes is not None: + logs.append((container.name + "/" + member.name, b''.join(file_bytes.readlines()))) + return logs + + +if __name__ == "__main__": + main() diff --git a/aztk/utils/ssh.py b/aztk/utils/ssh.py index 9cde8381..bd139683 100644 --- a/aztk/utils/ssh.py +++ b/aztk/utils/ssh.py @@ -40,59 +40,86 @@ def connect(hostname, return client -def node_exec_command(command, container_name, username, hostname, port, ssh_key=None, password=None): +def node_exec_command(node_id, command, username, hostname, port, ssh_key=None, password=None, container_name=None): client = connect(hostname=hostname, port=port, username=username, password=password, pkey=ssh_key) - docker_exec = 'sudo docker exec 2>&1 -t {0} /bin/bash -c \'set -e; set -o pipefail; {1}; wait\''.format(container_name, command) - stdin, stdout, stderr = client.exec_command(docker_exec, get_pty=True) - [print(line.decode('utf-8')) for line in stdout.read().splitlines()] + if container_name: + cmd = 'sudo docker exec 2>&1 -t {0} /bin/bash -c \'set -e; set -o pipefail; {1}; wait\''.format(container_name, command) + else: + cmd = '/bin/bash 2>&1 -c \'set -e; set -o pipefail; {0}; wait\''.format(command) + stdin, stdout, stderr = client.exec_command(cmd, get_pty=True) + output = [line.decode('utf-8') for line in stdout.read().splitlines()] client.close() + return (node_id, output) -async def clus_exec_command(command, container_name, username, nodes, ports=None, ssh_key=None, password=None): - await asyncio.wait( - [asyncio.get_event_loop().run_in_executor(ThreadPoolExecutor(), - node_exec_command, - command, - container_name, - username, - node.ip_address, - node.port, - ssh_key, - password) for node in nodes] +async def clus_exec_command(command, username, nodes, ports=None, ssh_key=None, password=None, container_name=None): + return await asyncio.gather( + *[asyncio.get_event_loop().run_in_executor(ThreadPoolExecutor(), + node_exec_command, + node.id, + command, + username, + node_rls.ip_address, + node_rls.port, + ssh_key, + password, + container_name) for node, node_rls in nodes] ) -def node_copy(container_name, source_path, destination_path, username, hostname, port, ssh_key=None, password=None): +def 
copy_from_node(node_id, source_path, destination_path, username, hostname, port, ssh_key=None, password=None, container_name=None): client = connect(hostname=hostname, port=port, username=username, password=password, pkey=ssh_key) sftp_client = client.open_sftp() - try: - # put the file in /tmp on the host - tmp_file = '/tmp/' + os.path.basename(source_path) - sftp_client.put(source_path, tmp_file) - # move to correct destination on container - docker_command = 'sudo docker cp {0} {1}:{2}'.format(tmp_file, container_name, destination_path) - _, stdout, _ = client.exec_command(docker_command, get_pty=True) - [print(line.decode('utf-8')) for line in stdout.read().splitlines()] - # clean up - sftp_client.remove(tmp_file) + destination_path = os.path.join(os.path.dirname(destination_path), node_id, os.path.basename(destination_path)) + os.makedirs(os.path.dirname(destination_path), exist_ok=True) + with open(destination_path, 'wb') as f: #SpooledTemporaryFile instead?? + sftp_client.getfo(source_path, f) + return (node_id, True, None) + except OSError as e: + return (node_id, False, e) + finally: + sftp_client.close() + client.close() + + +def node_copy(node_id, source_path, destination_path, username, hostname, port, ssh_key=None, password=None, container_name=None): + client = connect(hostname=hostname, port=port, username=username, password=password, pkey=ssh_key) + sftp_client = client.open_sftp() + try: + if container_name: + # put the file in /tmp on the host + tmp_file = '/tmp/' + os.path.basename(source_path) + sftp_client.put(source_path, tmp_file) + # move to correct destination on container + docker_command = 'sudo docker cp {0} {1}:{2}'.format(tmp_file, container_name, destination_path) + _, stdout, _ = client.exec_command(docker_command, get_pty=True) + output = [line.decode('utf-8') for line in stdout.read().splitlines()] + # clean up + sftp_client.remove(tmp_file) + return (node_id, True, None) + else: + output = sftp_client.put(source_path, destination_path).__str__() + return (node_id, True, None) except (IOError, PermissionError) as e: - print(e) - - client.close() - + return (node_id, False, e) + finally: + sftp_client.close() + client.close() #TODO: progress bar -async def clus_copy(container_name, username, nodes, source_path, destination_path, ssh_key=None, password=None): - await asyncio.gather( + +async def clus_copy(username, nodes, source_path, destination_path, ssh_key=None, password=None, container_name=None, get=False): + return await asyncio.gather( *[asyncio.get_event_loop().run_in_executor(ThreadPoolExecutor(), - node_copy, - container_name, + copy_from_node if get else node_copy, + node.id, source_path, destination_path, username, - node.ip_address, - node.port, + node_rls.ip_address, + node_rls.port, ssh_key, - password) for node in nodes - ]) + password, + container_name) for node, node_rls in nodes] + ) diff --git a/aztk_cli/spark/endpoints/cluster/cluster.py b/aztk_cli/spark/endpoints/cluster/cluster.py index 4b42930d..b90b094b 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster.py +++ b/aztk_cli/spark/endpoints/cluster/cluster.py @@ -10,6 +10,7 @@ from . import cluster_submit from . import cluster_run from . import cluster_copy +from . 
import cluster_debug class ClusterAction: @@ -23,6 +24,7 @@ class ClusterAction: submit = "submit" run = "run" copy = "copy" + debug = "debug" def setup_parser(parser: argparse.ArgumentParser): @@ -50,6 +52,8 @@ def setup_parser(parser: argparse.ArgumentParser): ClusterAction.run, help="Run a command on all nodes in your spark cluster") copy_parser = subparsers.add_parser( ClusterAction.copy, help="Copy files to all nodes in your spark cluster") + debug_parser = subparsers.add_parser( + ClusterAction.debug, help="Debugging tool that aggregates logs and output from the cluster.") cluster_create.setup_parser(create_parser) cluster_add_user.setup_parser(add_user_parser) @@ -61,6 +65,7 @@ def setup_parser(parser: argparse.ArgumentParser): cluster_app_logs.setup_parser(app_logs_parser) cluster_run.setup_parser(run_parser) cluster_copy.setup_parser(copy_parser) + cluster_debug.setup_parser(debug_parser) def execute(args: typing.NamedTuple): @@ -76,6 +81,7 @@ def execute(args: typing.NamedTuple): actions[ClusterAction.app_logs] = cluster_app_logs.execute actions[ClusterAction.run] = cluster_run.execute actions[ClusterAction.copy] = cluster_copy.execute + actions[ClusterAction.debug] = cluster_debug.execute func = actions[args.cluster_action] func(args) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_copy.py b/aztk_cli/spark/endpoints/cluster/cluster_copy.py index 3210cbdf..8a861d82 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_copy.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_copy.py @@ -1,8 +1,9 @@ import argparse +import sys import typing import aztk.spark -from aztk_cli import config +from aztk_cli import config, utils def setup_parser(parser: argparse.ArgumentParser): @@ -22,10 +23,22 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - - spark_client.cluster_copy( - cluster_id=args.cluster_id, - source_path=args.source_path, - destination_path=args.dest_path, - internal=args.internal - ) + with utils.Spinner(): + copy_output = spark_client.cluster_copy( + cluster_id=args.cluster_id, + source_path=args.source_path, + destination_path=args.dest_path, + internal=args.internal + ) + [print_copy_result(node_id, result, err) for node_id, result, err in copy_output] + sys.exit(0 if all([result for _, result, _ in copy_output]) else 1) + + +def print_copy_result(node_id, success, err): + print("-" * (len(node_id) + 6)) + print("| ", node_id, " |") + print("-" * (len(node_id) + 6)) + if success: + print("Copy successful") + else: + print(err) diff --git a/aztk_cli/spark/endpoints/cluster/cluster_debug.py b/aztk_cli/spark/endpoints/cluster/cluster_debug.py new file mode 100644 index 00000000..7fe3d5d2 --- /dev/null +++ b/aztk_cli/spark/endpoints/cluster/cluster_debug.py @@ -0,0 +1,26 @@ +import argparse +import os +import typing +import time + +import aztk.spark +from aztk_cli import config, utils + + +def setup_parser(parser: argparse.ArgumentParser): + parser.add_argument('--id', dest='cluster_id', required=True, + help='The unique id of your spark cluster') + + parser.add_argument('--output', '-o', required=False, + help='the directory for the output folder') + + +def execute(args: typing.NamedTuple): + spark_client = aztk.spark.Client(config.load_aztk_secrets()) + timestr = time.strftime("%Y%m%d-%H%M%S") + + if not args.output: + args.output = os.path.join(os.getcwd(), "debug-{0}-{1}".format(args.cluster_id, timestr)) + with utils.Spinner(): + 
spark_client.run_cluster_diagnostics(cluster_id=args.cluster_id, output_directory=args.output) + # TODO: analyze results, display some info about status diff --git a/aztk_cli/spark/endpoints/cluster/cluster_run.py b/aztk_cli/spark/endpoints/cluster/cluster_run.py index 877eddbd..5567b0fc 100644 --- a/aztk_cli/spark/endpoints/cluster/cluster_run.py +++ b/aztk_cli/spark/endpoints/cluster/cluster_run.py @@ -19,5 +19,14 @@ def setup_parser(parser: argparse.ArgumentParser): def execute(args: typing.NamedTuple): spark_client = aztk.spark.Client(config.load_aztk_secrets()) - result = spark_client.cluster_run(args.cluster_id, args.command, args.internal) - #TODO: pretty print result + with utils.Spinner(): + results = spark_client.cluster_run(args.cluster_id, args.command, internal=args.internal) + [print_execute_result(node_id, result) for node_id, result in results] + + +def print_execute_result(node_id, result): + print("-" * (len(node_id) + 6)) + print("| ", node_id, " |") + print("-" * (len(node_id) + 6)) + for line in result: + print(line) diff --git a/docs/10-clusters.md b/docs/10-clusters.md index 3d653b4c..df0a0e98 100644 --- a/docs/10-clusters.md +++ b/docs/10-clusters.md @@ -18,7 +18,7 @@ For example, to create a cluster of 4 *Standard_A2* nodes called 'spark' you can aztk spark cluster create --id spark --vm-size standard_a2 --size 4 ``` -You can find more information on VM sizes [here.](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes) Please note that you must use the official SKU name when setting your VM size - they usually come in the form: "standard_d2_v2". +You can find more information on VM sizes [here.](https://docs.microsoft.com/en-us/azure/virtual-machines/linux/sizes) Please note that you must use the official SKU name when setting your VM size - they usually come in the form: "standard_d2_v2". _Note: The cluster id (`--id`) can only contain alphanumeric characters including hyphens and underscores, and cannot contain more than 64 characters. Each cluster **must** have a unique cluster id._ @@ -139,6 +139,27 @@ Now that you're in, you can change directory to your familiar `$SPARK_HOME` cd $SPARK_HOME ``` +### Debugging your Spark Cluster + +If your cluster is in an unknown or unusable state, you can debug it by running: + +```sh +aztk spark cluster debug --id <cluster_id> --output <output_directory> +``` + +The debug utility will pull logs from all nodes in the cluster. The utility will check for: +- free disk space +- docker image status +- docker container status +- docker container logs +- docker container process status +- aztk code & version +- spark component logs (master, worker, shuffle service, history server, etc.) from $SPARK_HOME/logs +- spark application logs from $SPARK_HOME/work + +__Please be careful sharing the output of the `debug` command as secrets and application code are present in the output.__ + + ### Interact with your Spark cluster By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI to *localhost:8080*, Spark Jobs UI to *localhost:4040*, and Spark History Server to your *locahost:18080*. This can be [configured in *.aztk/ssh.yaml*](../docs/13-configuration.md##sshyaml).
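
For reference, below is a minimal sketch of how the SDK surface introduced in this diff (`cluster_run` with `host=True` and `run_cluster_diagnostics`) could be driven from Python outside the CLI. The cluster id `"spark"`, the use of `config.load_aztk_secrets()` for credentials, and the local output directory layout are assumptions for illustration only, not part of this change:

```python
import os
import time

import aztk.spark
from aztk_cli import config

# Assumed: .aztk/secrets.yaml is configured and a cluster named "spark" already exists.
spark_client = aztk.spark.Client(config.load_aztk_secrets())
cluster_id = "spark"

# Run a command on the host OS of every node (host=True skips the 'spark' container).
run_output = spark_client.cluster_run(cluster_id, "df -h", host=True)
for node_id, lines in run_output:
    print("----- {0} -----".format(node_id))
    for line in lines:
        print(line)

# Pull a diagnostics bundle (debug.zip plus the run output) from every node
# into a timestamped local folder.
output_directory = os.path.join(
    os.getcwd(), "debug-{0}-{1}".format(cluster_id, time.strftime("%Y%m%d-%H%M%S")))
spark_client.run_cluster_diagnostics(cluster_id=cluster_id, output_directory=output_directory)
```

Each entry returned by `cluster_run` is a `(node_id, output_lines)` tuple produced by `ssh.node_exec_command` and gathered by `clus_exec_command`, which is why the loop above unpacks pairs per node.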