
Commit 2bed496

Internal: Cluster data helpers and upload_node_script into cluster_data module (#401)
1 parent c237501 commit 2bed496

15 files changed: +323 -223 lines changed

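In short: the per-cluster storage plumbing previously spread across the helpers and upload_node_scripts modules is consolidated into three classes in a new aztk.internal.cluster_data package. A condensed map of the surface introduced below, with signatures copied from the diffs (a sketch, not documentation):

    from aztk.internal.cluster_data import BlobData, ClusterData, NodeData

    # ClusterData(blob_client, cluster_id)       - one blob container per cluster
    #     .save_cluster_config(cfg) / .read_cluster_config()
    #     .upload_node_data(node_data)           -> BlobData
    # NodeData(cluster_config).add_core().done() - builds tmp/node-scripts.zip locally
    # BlobData.to_resource_file(dest=None)       -> batch ResourceFile backed by a SAS URL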

.vscode/settings.json

Lines changed: 16 additions & 13 deletions

@@ -1,15 +1,18 @@
 {
-    // "python.linting.pylintEnabled": false,
-    "search.exclude": {
-        "build/**": true,
-        "bin/**": true
-    },
-    "files.exclude": {
-        "**/__pycache__": true,
-        "*.egg-info": true,
-    },
-    "python.autoComplete.extraPaths": [
-        "${workspaceRoot}/node_scripts"
-    ],
-    "python.formatting.provider": "yapf"
+    // "python.linting.pylintEnabled": false,
+    "search.exclude": {
+        "build/**": true,
+        "bin/**": true
+    },
+    "files.exclude": {
+        "**/__pycache__": true,
+        "*.egg-info": true,
+    },
+    "python.autoComplete.extraPaths": [
+        "${workspaceRoot}/node_scripts"
+    ],
+    "python.formatting.provider": "yapf",
+    "python.formatting.yapfArgs": [
+        "--style=.style.yapf"
+    ]
 }

aztk/client.py

Lines changed: 10 additions & 2 deletions

@@ -16,7 +16,7 @@
 import azure.batch.models as batch_models
 from azure.batch.models import batch_error
 from Crypto.PublicKey import RSA
-
+from aztk.internal import cluster_data
 
 class Client:
     def __init__(self, secrets_config: models.SecretsConfiguration):
@@ -26,6 +26,14 @@ def __init__(self, secrets_config: models.SecretsConfiguration):
         self.batch_client = azure_api.make_batch_client(secrets_config)
         self.blob_client = azure_api.make_blob_client(secrets_config)
 
+    def get_cluster_config(self, cluster_id: str) -> models.ClusterConfiguration:
+        return self._get_cluster_data(cluster_id).read_cluster_config()
+
+    def _get_cluster_data(self, cluster_id: str) -> cluster_data.ClusterData:
+        """
+        Returns ClusterData object to manage data related to the given cluster id
+        """
+        return cluster_data.ClusterData(self.blob_client, cluster_id)
 
     '''
     General Batch Operations
@@ -66,7 +74,7 @@ def __create_pool_and_job(self, cluster_conf: models.ClusterConfiguration, softw
         :param VmImageModel: the type of image to provision for the cluster
         :param wait: wait until the cluster is ready
         """
-        helpers.save_cluster_config(cluster_conf, self.blob_client)
+        self._get_cluster_data(cluster_conf.cluster_id).save_cluster_config(cluster_conf)
         # reuse pool_id as job_id
         pool_id = cluster_conf.cluster_id
         job_id = cluster_conf.cluster_id
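With these helpers on the base Client, callers read a cluster's stored configuration without touching blob paths themselves. A minimal usage sketch; the secrets construction is an assumption here, since aztk normally loads it from the workspace's secrets.yaml:

    from aztk import models
    from aztk.client import Client

    # Hypothetical secrets object; real values come from secrets.yaml.
    secrets = models.SecretsConfiguration()

    client = Client(secrets)

    # Reads cluster/config.yaml from the "my-cluster" container; returns None
    # (after logging a warning) when the blob is missing or the YAML is invalid.
    config = client.get_cluster_config("my-cluster")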
aztk/internal/cluster_data/__init__.py

Lines changed: 3 additions & 0 deletions

@@ -0,0 +1,3 @@
+from .blob_data import *
+from .node_data import *
+from .cluster_data import *
aztk/internal/cluster_data/blob_data.py

Lines changed: 26 additions & 0 deletions

@@ -0,0 +1,26 @@
+import azure.batch.models as batch_models
+import datetime
+from azure.storage.blob import BlockBlobService, BlobPermissions
+
+class BlobData:
+    """
+    Object mapping to a blob entry. Can generate resource files for batch
+    """
+    def __init__(self, blob_client: BlockBlobService, container: str, blob: str):
+        self.container = container
+        self.blob = blob
+        self.dest = blob
+        self.blob_client = blob_client
+
+
+    def to_resource_file(self, dest: str = None) -> batch_models.ResourceFile:
+        sas_token = self.blob_client.generate_blob_shared_access_signature(
+            self.container,
+            self.blob,
+            permission=BlobPermissions.READ,
+            expiry=datetime.datetime.utcnow() + datetime.timedelta(days=365))
+
+        sas_url = self.blob_client.make_blob_url(
+            self.container, self.blob, sas_token=sas_token)
+
+        return batch_models.ResourceFile(file_path=dest or self.dest, blob_source=sas_url)
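BlobData separates where the bytes live (container and blob name) from where Batch should place them on the node (dest). to_resource_file signs a read-only SAS token good for a year and wraps the resulting URL in a batch ResourceFile. A sketch; the storage account values are placeholders, not from this commit:

    from azure.storage.blob import BlockBlobService
    from aztk.internal.cluster_data import BlobData

    # Placeholder credentials; aztk builds this client from secrets.yaml in practice.
    blob_client = BlockBlobService(account_name="myaccount", account_key="<key>")

    blob_data = BlobData(blob_client, container="my-cluster", blob="cluster/node-scripts.zip")

    # Batch will download the blob next to the task as plain "node-scripts.zip".
    resource_file = blob_data.to_resource_file(dest="node-scripts.zip")

Overriding dest here mirrors what ClusterData.upload_cluster_file does below: the blob sits under cluster/ in storage but lands at the task's working-directory root on the node.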
aztk/internal/cluster_data/cluster_data.py

Lines changed: 57 additions & 0 deletions

@@ -0,0 +1,57 @@
+import yaml
+import logging
+import azure.common
+from azure.storage.blob import BlockBlobService
+from .node_data import NodeData
+from .blob_data import BlobData
+
+
+class ClusterData:
+    """
+    Class handling the management of data for a cluster
+    """
+    # All data related to the cluster (config, metadata, etc.) should be under this folder
+    CLUSTER_DIR = "cluster"
+    APPLICATIONS_DIR = "applications"
+    CLUSTER_CONFIG_FILE = "config.yaml"
+
+    def __init__(self, blob_client: BlockBlobService, cluster_id: str):
+        self.blob_client = blob_client
+        self.cluster_id = cluster_id
+        self._ensure_container()
+
+    def save_cluster_config(self, cluster_config):
+        blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE
+        content = yaml.dump(cluster_config)
+        container_name = cluster_config.cluster_id
+        self.blob_client.create_blob_from_text(container_name, blob_path, content)
+
+    def read_cluster_config(self):
+        blob_path = self.CLUSTER_DIR + "/" + self.CLUSTER_CONFIG_FILE
+        try:
+            result = self.blob_client.get_blob_to_text(self.cluster_id, blob_path)
+            return yaml.load(result.content)
+        except azure.common.AzureMissingResourceHttpError:
+            logging.warn("Cluster %s doesn't have cluster configuration in storage", self.cluster_id)
+        except yaml.YAMLError:
+            logging.warn("Cluster %s contains invalid cluster configuration in blob", self.cluster_id)
+
+    def upload_file(self, blob_path: str, local_path: str) -> BlobData:
+        self.blob_client.create_blob_from_path(self.cluster_id, blob_path, local_path)
+        return BlobData(self.blob_client, self.cluster_id, blob_path)
+
+    def upload_cluster_file(self, blob_path: str, local_path: str) -> BlobData:
+        blob_data = self.upload_file(self.CLUSTER_DIR + "/" + blob_path, local_path)
+        blob_data.dest = blob_path
+        return blob_data
+
+    def upload_application_file(self, blob_path: str, local_path: str) -> BlobData:
+        blob_data = self.upload_file(self.APPLICATIONS_DIR + "/" + blob_path, local_path)
+        blob_data.dest = blob_path
+        return blob_data
+
+    def upload_node_data(self, node_data: NodeData) -> BlobData:
+        return self.upload_cluster_file("node-scripts.zip", node_data.zip_path)
+
+    def _ensure_container(self):
+        self.blob_client.create_container(self.cluster_id, fail_on_exist=False)
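ClusterData gives every cluster its own blob container (created on construction) and fixes the layout: cluster/ for metadata such as config.yaml and the node-scripts zip, applications/ for application files. A usage sketch with placeholder storage credentials, assuming ClusterConfiguration accepts this minimal form:

    from azure.storage.blob import BlockBlobService
    from aztk.spark import models
    from aztk.internal.cluster_data import ClusterData

    blob_client = BlockBlobService(account_name="myaccount", account_key="<key>")  # placeholder
    cluster_config = models.ClusterConfiguration(cluster_id="my-cluster")  # minimal; real configs set vm_size etc.

    data = ClusterData(blob_client, "my-cluster")   # ensures the container exists

    data.save_cluster_config(cluster_config)        # writes cluster/config.yaml
    restored = data.read_cluster_config()           # yaml round-trip, or None plus a warning

    # Stored at cluster/node-scripts.zip, but dest stays "node-scripts.zip" so the
    # generated ResourceFile downloads to the task's working-directory root.
    blob = data.upload_cluster_file("node-scripts.zip", "/tmp/node-scripts.zip")

One asymmetry worth noticing: save_cluster_config takes the container name from cluster_config.cluster_id, while the read and upload paths use the ClusterData's own cluster_id, so the two are expected to agree.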
aztk/internal/cluster_data/node_data.py

Lines changed: 165 additions & 0 deletions

@@ -0,0 +1,165 @@
+import fnmatch
+import io
+import json
+import os
+import yaml
+import zipfile
+from pathlib import Path
+from typing import List
+from aztk.spark import models
+from aztk.utils import constants, file_utils, secure_utils
+from aztk.error import InvalidCustomScriptError
+
+ROOT_PATH = constants.ROOT_PATH
+
+# Constants for node data
+NODE_SCRIPT_FOLDER = "node_scripts"
+CUSTOM_SCRIPT_FOLDER = "custom-scripts"
+CUSTOM_SCRIPT_METADATA_FILE = "custom-scripts.yaml"
+PLUGIN_FOLDER = "plugins"
+
+
+class NodeData:
+    """
+    Class made to bundle data to be uploaded to the node as a zip
+    """
+
+    def __init__(self, cluster_config: models.ClusterConfiguration):
+        self.zip_path = os.path.join(ROOT_PATH, "tmp/node-scripts.zip")
+        self.cluster_config = cluster_config
+        file_utils.ensure_dir(self.zip_path)
+        self.zipf = zipfile.ZipFile(self.zip_path, "w", zipfile.ZIP_DEFLATED)
+
+    def add_core(self):
+        self._add_node_scripts()
+        self._add_custom_scripts()
+        self._add_plugins()
+        self._add_spark_configuration()
+        self._add_user_conf()
+        self.add_file(os.path.join(constants.ROOT_PATH, 'aztk', 'utils', 'command_builder.py'), '', binary=False)
+        return self
+
+    def done(self):
+        self.zipf.close()
+        return self
+
+    def add_file(self, file: str, zip_dir: str, binary: bool = True):
+        if not file:
+            return
+        if isinstance(file, (str, bytes)):
+            full_file_path = Path(file)
+            with io.open(file, 'r') as f:
+                if binary:
+                    self.zipf.write(file, os.path.join(zip_dir, full_file_path.name))
+                else:
+                    self.zipf.writestr(os.path.join(zip_dir, full_file_path.name), f.read().replace('\r\n', '\n'))
+        elif isinstance(file, models.File):
+            self.zipf.writestr(os.path.join(zip_dir, file.name), file.payload.getvalue())
+
+    def add_files(self, file_paths: List[str], zip_dir, binary: bool = True):
+        """
+        Add a list of local files to the node data
+        """
+        for file in file_paths:
+            self.add_file(file, zip_dir, binary)
+
+    def add_dir(self, path: str, exclude: List[str] = []):
+        """
+        Zip all the files in the given directory into the zip file handler
+        """
+        for base, _, files in os.walk(path):
+            relative_folder = os.path.relpath(base, path)
+            for file in files:
+                if self._includeFile(file, exclude):
+                    with io.open(os.path.join(base, file), 'r') as f:
+                        self.zipf.writestr(os.path.join(relative_folder, file), f.read().replace('\r\n', '\n'))
+
+    def _add_custom_scripts(self):
+        data = []
+        if not self.cluster_config.custom_scripts:
+            return
+
+        for index, custom_script in enumerate(self.cluster_config.custom_scripts):
+            if isinstance(custom_script.script, (str, bytes)):
+                new_file_name = str(index) + '_' + os.path.basename(custom_script.script)
+                data.append(dict(script=new_file_name, runOn=str(custom_script.run_on)))
+                try:
+                    with io.open(custom_script.script, 'r') as f:
+                        self.zipf.writestr(
+                            os.path.join(CUSTOM_SCRIPT_FOLDER, new_file_name),
+                            f.read().replace('\r\n', '\n'))
+                except FileNotFoundError:
+                    raise InvalidCustomScriptError("Custom script '{0}' doesn't exist.".format(custom_script.script))
+            elif isinstance(custom_script.script, models.File):
+                new_file_name = str(index) + '_' + custom_script.script.name
+                self.zipf.writestr(os.path.join('custom-scripts', new_file_name), custom_script.script.payload.getvalue())
+
+        self.zipf.writestr(
+            os.path.join(CUSTOM_SCRIPT_FOLDER, CUSTOM_SCRIPT_METADATA_FILE), yaml.dump(data, default_flow_style=False))
+
+    def _add_spark_configuration(self):
+        spark_configuration = self.cluster_config.spark_configuration
+        if not spark_configuration:
+            return
+        self.add_files(
+            [
+                spark_configuration.spark_defaults_conf, spark_configuration.spark_env_sh,
+                spark_configuration.core_site_xml
+            ],
+            'conf',
+            binary=False)
+
+        # add ssh keys for passwordless ssh
+        self.zipf.writestr('id_rsa.pub', spark_configuration.ssh_key_pair['pub_key'])
+        self.zipf.writestr('id_rsa', spark_configuration.ssh_key_pair['priv_key'])
+
+        if spark_configuration.jars:
+            for jar in spark_configuration.jars:
+                self.add_file(jar, 'jars', binary=True)
+
+    def _add_user_conf(self):
+        user_conf = self.cluster_config.user_configuration
+        if not user_conf:
+            return
+        encrypted_aes_session_key, cipher_aes_nonce, tag, ciphertext = secure_utils.encrypt_password(
+            self.cluster_config.spark_configuration.ssh_key_pair['pub_key'], user_conf.password)
+        user_conf = yaml.dump({
+            'username': user_conf.username,
+            'password': ciphertext,
+            'ssh-key': user_conf.ssh_key,
+            'aes_session_key': encrypted_aes_session_key,
+            'cipher_aes_nonce': cipher_aes_nonce,
+            'tag': tag,
+            'cluster_id': self.cluster_config.cluster_id
+        })
+        self.zipf.writestr('user.yaml', user_conf)
+
+    def _add_plugins(self):
+        if not self.cluster_config.plugins:
+            return
+
+        data = []
+        for plugin in self.cluster_config.plugins:
+            for file in plugin.files:
+                zipf = self.zipf.writestr('plugins/{0}/{1}'.format(plugin.name, file.target), file.content())
+            if plugin.execute:
+                data.append(dict(
+                    name=plugin.name,
+                    execute='{0}/{1}'.format(plugin.name, plugin.execute),
+                    args=plugin.args,
+                    env=plugin.env,
+                    runOn=plugin.run_on.value,
+                ))
+
+        self.zipf.writestr(os.path.join('plugins', 'plugins-manifest.yaml'), yaml.dump(data))
+        return zipf
+
+    def _add_node_scripts(self):
+        self.add_dir(os.path.join(ROOT_PATH, NODE_SCRIPT_FOLDER), exclude=['*.pyc'])
+
+    def _includeFile(self, filename: str, exclude: List[str] = []) -> bool:
+        for pattern in exclude:
+            if fnmatch.fnmatch(filename, pattern):
+                return False
+
+        return True
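NodeData is a small builder over a zipfile handle: the constructor opens tmp/node-scripts.zip under ROOT_PATH, add_core() writes each section (node scripts, custom scripts, plugins, Spark conf, encrypted user record), and done() closes the archive so it can be uploaded. A sketch under the same minimal-configuration assumption as above:

    from aztk.spark import models
    from aztk.internal.cluster_data import NodeData

    # Minimal configuration; sections with no data (custom scripts, plugins,
    # spark/user configuration) are skipped by their respective _add_* helpers.
    cluster_config = models.ClusterConfiguration(cluster_id="my-cluster")

    node_data = NodeData(cluster_config).add_core().done()
    print(node_data.zip_path)  # <ROOT_PATH>/tmp/node-scripts.zip, ready for upload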

aztk/spark/client.py

Lines changed: 9 additions & 12 deletions

@@ -9,7 +9,8 @@
 from aztk.spark.helpers import submit as cluster_submit_helper
 from aztk.spark.helpers import job_submission as job_submit_helper
 from aztk.spark.helpers import get_log as get_log_helper
-from aztk.spark.utils import upload_node_scripts, util
+from aztk.spark.utils import util
+from aztk.internal.cluster_data import NodeData
 import yaml
 
 
@@ -22,14 +23,11 @@ def __init__(self, secrets_config):
     '''
     def create_cluster(self, cluster_conf: models.ClusterConfiguration, wait: bool = False):
         cluster_conf.validate()
-
+        cluster_data = self._get_cluster_data(cluster_conf.cluster_id)
         try:
-            zip_resource_files = upload_node_scripts.zip_scripts(self.blob_client,
-                                                                 cluster_conf.cluster_id,
-                                                                 cluster_conf.custom_scripts,
-                                                                 cluster_conf.spark_configuration,
-                                                                 cluster_conf.user_configuration,
-                                                                 cluster_conf.plugins)
+            zip_resource_files = None
+            node_data = NodeData(cluster_conf).add_core().done()
+            zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()
 
             start_task = create_cluster_helper.generate_cluster_start_task(self,
                                                                            zip_resource_files,
@@ -165,10 +163,9 @@ def cluster_copy(self, cluster_id: str, source_path: str, destination_path: str)
     '''
     def submit_job(self, job_configuration):
         try:
-            zip_resource_files = upload_node_scripts.zip_scripts(self.blob_client,
-                                                                 job_configuration.id,
-                                                                 job_configuration.custom_scripts,
-                                                                 job_configuration.spark_configuration)
+            cluster_data = self._get_cluster_data(job_configuration.id)
+            node_data = NodeData(job_configuration.as_cluster_config()).add_core().done()
+            zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()
 
             start_task = create_cluster_helper.generate_cluster_start_task(self,
                                                                            zip_resource_files,
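Both call sites now share the same three-step flow: build the bundle locally with NodeData, push it through ClusterData, and hand the SAS-backed ResourceFile to the start task. Condensed from the diff above (client stands for the spark Client instance):

    cluster_data = client._get_cluster_data(cluster_conf.cluster_id)
    node_data = NodeData(cluster_conf).add_core().done()
    zip_resource_files = cluster_data.upload_node_data(node_data).to_resource_file()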

aztk/spark/models/models.py

Lines changed: 8 additions & 0 deletions

@@ -222,6 +222,14 @@ def __init__(
         self.subnet_id = subnet_id
         self.worker_on_master = worker_on_master
 
+    def as_cluster_config(self):
+        return ClusterConfiguration(
+            cluster_id=self.id,
+            custom_scripts=self.custom_scripts,
+            vm_size=self.vm_size,
+            spark_configuration=self.spark_configuration,
+        )
+
 
 class JobState():
     complete = 'completed'
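This is what lets submit_job above feed a job through the cluster bundling path: the job configuration is projected onto a ClusterConfiguration carrying over only the id, custom scripts, VM size and Spark configuration. Sketch (job_configuration is assumed to be an instance of the class this method is defined on):

    # Project the job onto a cluster view, then bundle exactly as create_cluster does.
    cluster_config = job_configuration.as_cluster_config()
    node_data = NodeData(cluster_config).add_core().done()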
