1 change: 0 additions & 1 deletion .gitignore
@@ -1,5 +1,4 @@
# custom
my-custom-scripts/
.aztk

# Environments
4 changes: 1 addition & 3 deletions README.md
@@ -115,9 +115,7 @@ By default, we port forward the Spark Web UI to *localhost:8080*, Spark Jobs UI

You can configure these settings in the *.aztk/ssh.yaml* file.

NOTE: When working interactively, you may want to use tools like Jupyter or RStudio-Server depending on whether or not you are a python or R user. To do so, you need to setup your cluster with the appropriate docker image and custom scripts:
- [how to setup Jupyter with Pyspark](https://github.com/Azure/aztk/wiki/PySpark-on-Azure-with-AZTK)
- [how to setup RStudio-Server with Sparklyr](https://github.com/Azure/aztk/wiki/SparklyR-on-Azure-with-AZTK)
NOTE: When working interactively, you may want to use tools like Jupyter or RStudio-Server. To do so, you need to set up your cluster with the appropriate Docker image and plugin. See [Plugins](./docs/15-plugins.md) for more information.
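For reference, the plugin route is configured in *.aztk/cluster.yaml*. A minimal sketch, assuming the `jupyter` and `rstudio_server` plugin names (verify the exact names and options against [Plugins](./docs/15-plugins.md)):

```yaml
# .aztk/cluster.yaml -- sketch only; plugin names are assumed, check the plugins docs
plugins:
  - name: jupyter          # Jupyter notebook server for Python/PySpark users
  - name: rstudio_server   # RStudio Server for R/sparklyr users
```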

### 5. Manage and Monitor your Spark Cluster

31 changes: 3 additions & 28 deletions aztk/internal/cluster_data/node_data.py
@@ -4,17 +4,17 @@
import zipfile
from pathlib import Path
from typing import List

import yaml

from aztk import models
from aztk.utils import constants, file_utils, secure_utils
from aztk.error import InvalidCustomScriptError
from aztk.utils import constants, file_utils, secure_utils

ROOT_PATH = constants.ROOT_PATH

# Constants for node data
NODE_SCRIPT_FOLDER = "aztk"
CUSTOM_SCRIPT_FOLDER = "custom-scripts"
CUSTOM_SCRIPT_METADATA_FILE = "custom-scripts.yaml"
PLUGIN_FOLDER = "plugins"


@@ -30,7 +30,6 @@ def __init__(self, cluster_config: models.ClusterConfiguration):

    def add_core(self):
        self._add_node_scripts()
        self._add_custom_scripts()
        self._add_plugins()
        self._add_spark_configuration()
        self._add_user_conf()
@@ -72,30 +71,6 @@ def add_dir(self, path: str, dest: str = None, exclude: List[str] = None):
                if self._includeFile(file, exclude):
                    self.add_file(os.path.join(base, file), os.path.join(dest, relative_folder), binary=False)

    def _add_custom_scripts(self):
        data = []
        if not self.cluster_config.custom_scripts:
            return

        for index, custom_script in enumerate(self.cluster_config.custom_scripts):
            if isinstance(custom_script.script, (str, bytes)):
                new_file_name = str(index) + '_' + os.path.basename(custom_script.script)
                data.append(dict(script=new_file_name, runOn=str(custom_script.run_on)))
                try:
                    with io.open(custom_script.script, 'r', encoding='UTF-8') as f:
                        self.zipf.writestr(
                            os.path.join(CUSTOM_SCRIPT_FOLDER, new_file_name),
                            f.read().replace('\r\n', '\n'))
                except FileNotFoundError:
                    raise InvalidCustomScriptError("Custom script '{0}' doesn't exists.".format(custom_script.script))
            elif isinstance(custom_script.script, models.File):
                new_file_name = str(index) + '_' + custom_script.script.name
                self.zipf.writestr(
                    os.path.join('custom-scripts', new_file_name), custom_script.script.payload.getvalue())

        self.zipf.writestr(
            os.path.join(CUSTOM_SCRIPT_FOLDER, CUSTOM_SCRIPT_METADATA_FILE), yaml.dump(data, default_flow_style=False))

    def _add_spark_configuration(self):
        spark_configuration = self.cluster_config.spark_configuration
        if not spark_configuration:
1 change: 0 additions & 1 deletion aztk/models/cluster_configuration.py
@@ -35,7 +35,6 @@ class ClusterConfiguration(Model):

    subnet_id = fields.String(default=None)
    plugins = fields.List(PluginConfiguration)
    custom_scripts = fields.List(CustomScript)
    file_shares = fields.List(FileShare)
    user_configuration = fields.Model(UserConfiguration, default=None)
    scheduling_target = fields.Enum(SchedulingTarget, default=None)
5 changes: 0 additions & 5 deletions aztk/node_scripts/docker_main.sh
@@ -6,15 +6,10 @@ set -e
source ~/.bashrc
echo "Initializing spark container"

# --------------------
# Setup custom scripts
# --------------------
custom_script_dir=$AZTK_WORKING_DIR/custom-scripts
aztk_dir=$AZTK_WORKING_DIR/aztk

# -----------------------
# Preload jupyter samples
# TODO: remove when we support uploading random (non-executable) files as part custom-scripts
# -----------------------
mkdir -p /mnt/samples

1 change: 0 additions & 1 deletion aztk/node_scripts/install/install.py
@@ -81,6 +81,5 @@ def setup_spark_container():
        spark.start_spark_worker()

    plugins.setup_plugins(target=PluginTarget.SparkContainer, is_master=is_master, is_worker=is_worker)
    scripts.run_custom_scripts(is_master=is_master, is_worker=is_worker)

    open("/tmp/setup_complete", 'a').close()
73 changes: 0 additions & 73 deletions aztk/node_scripts/install/scripts.py

This file was deleted.

9 changes: 4 additions & 5 deletions aztk/spark/models/models.py
@@ -1,10 +1,12 @@
from typing import List
from Cryptodome.PublicKey import RSA

import azure.batch.models as batch_models
from Cryptodome.PublicKey import RSA

import aztk.models
from aztk import error
from aztk.utils import constants, helpers
from aztk.core.models import Model, fields
from aztk.utils import constants, helpers


class SparkToolkit(aztk.models.Toolkit):
@@ -192,7 +194,6 @@ def __init__(self,
                 id=None,
                 applications=None,
                 vm_size=None,
                 custom_scripts=None,
                 spark_configuration=None,
                 toolkit=None,
                 max_dedicated_nodes=0,
@@ -203,7 +204,6 @@

        self.id = id
        self.applications = applications
        self.custom_scripts = custom_scripts
        self.spark_configuration = spark_configuration
        self.vm_size = vm_size
        self.gpu_enabled = None
@@ -219,7 +219,6 @@ def __init__(self,
    def to_cluster_config(self):
        return ClusterConfiguration(
            cluster_id=self.id,
            custom_scripts=self.custom_scripts,
            toolkit=self.toolkit,
            vm_size=self.vm_size,
            size=self.max_dedicated_nodes,
2 changes: 0 additions & 2 deletions aztk/utils/constants.py
@@ -39,8 +39,6 @@
DEFAULT_SPARK_JARS_DEST = os.path.join(ROOT_PATH, 'node_scripts', 'jars')
DEFAULT_SPARK_JOB_CONFIG = os.path.join(os.getcwd(), '.aztk', 'job.yaml')
GLOBAL_SPARK_JOB_CONFIG = os.path.join(HOME_DIRECTORY_PATH, '.aztk', 'job.yaml')

CUSTOM_SCRIPTS_DEST = os.path.join(ROOT_PATH, 'node_scripts', 'custom-scripts')
"""
Source and destination paths for spark init
"""
2 changes: 0 additions & 2 deletions aztk_cli/config.py
@@ -160,7 +160,6 @@ class JobConfig():
    def __init__(self):
        self.id = None
        self.applications = []
        self.custom_scripts = None
        self.spark_configuration = None
        self.vm_size = None
        self.toolkit = None
@@ -187,7 +186,6 @@ def _merge_dict(self, config):
            self.max_dedicated_nodes = cluster_configuration.get('size')
        if cluster_configuration.get('size_low_priority') is not None:
            self.max_low_pri_nodes = cluster_configuration.get('size_low_priority')
        self.custom_scripts = cluster_configuration.get('custom_scripts')
        self.subnet_id = cluster_configuration.get('subnet_id')
        self.worker_on_master = cluster_configuration.get("worker_on_master")
        scheduling_target = cluster_configuration.get("scheduling_target")
7 changes: 0 additions & 7 deletions aztk_cli/config/cluster.yaml
@@ -29,13 +29,6 @@ size: 2
# username: <username for the linux user to be created> (optional)
username: spark

# **DEPRECATED** Use plugins instead
# custom_scripts:
#   - script: </path/to/script.sh or /path/to/script/directory/>
#     runOn: <master/worker/all-nodes>
#   - script: <./relative/path/to/other/script.sh or ./relative/path/to/other/script/directory/>
#     runOn: <master/worker/all-nodes>

# To add your cluster to a virtual network provide the full arm resource id below
# subnet_id: /subscriptions/********-****-****-****-************/resourceGroups/********/providers/Microsoft.Network/virtualNetworks/*******/subnets/******

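For anyone migrating from the deprecated block removed above, a rough sketch of the equivalent plugin entry, kept in the same commented-out style as the rest of this template; the `script` and `target_role` keys are assumptions to verify against docs/15-plugins.md, not taken from this diff:

```yaml
# plugins:
#   - script: </path/to/script.sh>    # local file to run on the nodes
#     target_role: all-nodes          # master/worker/all-nodes, replacing the old runOn
```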
1 change: 0 additions & 1 deletion aztk_cli/spark/endpoints/job/submit.py
@@ -39,7 +39,6 @@ def execute(args: typing.NamedTuple):
    job_configuration = aztk.spark.models.JobConfiguration(
        id=job_conf.id,
        applications=job_conf.applications,
        custom_scripts=job_conf.custom_scripts,
        spark_configuration=spark_configuration,
        vm_size=job_conf.vm_size,
        toolkit=job_conf.toolkit,
1 change: 0 additions & 1 deletion aztk_cli/utils.py
@@ -399,7 +399,6 @@ def print_cluster_conf(cluster_conf: ClusterConfiguration, wait: bool):
    log.info("> dedicated: %s", cluster_conf.size)
    log.info("> low priority: %s", cluster_conf.size_low_priority)
    log.info("cluster vm size: %s", cluster_conf.vm_size)
    log.info("custom scripts: %s", len(cluster_conf.custom_scripts) if cluster_conf.custom_scripts else 0)
    log.info("subnet ID: %s", cluster_conf.subnet_id)
    log.info("file shares: %s",
             len(cluster_conf.file_shares) if cluster_conf.file_shares is not None else 0)
2 changes: 1 addition & 1 deletion docs/10-clusters.md
@@ -197,5 +197,5 @@ By default, the `aztk spark cluster ssh` command port forwards the Spark Web UI

## Next Steps
- [Run a Spark job](20-spark-submit.html)
- [Configure the Spark cluster using custom commands](11-custom-scripts.html)
- [Configure the Spark cluster using custom plugins](51-define-plugin.html)
- [Bring your own Docker image or choose between a variety of our supported base images to manage your Spark and Python versions](12-docker-image.html)
4 changes: 1 addition & 3 deletions docs/15-plugins.md
@@ -1,8 +1,6 @@
# Plugins

Plugins are a successor to [custom scripts](11-custom-scripts.html) and are the recommended way of running custom code on the cluster.

Plugins can either be one of the Aztk [supported plugins](#supported-plugins) or the path to a [local file](#custom-script-plugin).
Plugins can either be one of the `aztk` [supported plugins](#supported-plugins) or the path to a [local file](#custom-script-plugin).

## Supported Plugins
AZTK ships with a library of default plugins that enable auxiliary services to use with your Spark cluster.
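To illustrate the two forms named above, a hedged sketch of a `plugins` section; the `hdfs` plugin name and the `script` key are assumptions based on the surrounding docs, not part of this diff:

```yaml
# cluster.yaml plugins section -- sketch only, verify keys against docs/15-plugins.md
plugins:
  - name: hdfs                    # a supported plugin, enabled by name
  - script: ./my-init-script.sh   # a custom-script plugin pointing at a local file
```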
7 changes: 0 additions & 7 deletions docs/70-jobs.md
@@ -46,13 +46,6 @@ Jobs also require a definition of the cluster on which the Applications will run
    software: spark
    version: 2.2
  subnet_id: <resource ID of a subnet to use (optional)>
  custom_scripts:
    - List
    - of
    - paths
    - to
    - custom
    - scripts
```
_Please Note: For more information about Azure VM sizes, see [Azure Batch Pricing](https://azure.microsoft.com/en-us/pricing/details/batch/). And for more information about Docker repositories see [Docker](./12-docker-image.html)._

9 changes: 0 additions & 9 deletions tests/integration_tests/spark/sdk/cluster/test_cluster.py
@@ -74,7 +74,6 @@ def test_create_cluster():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)
@@ -105,7 +104,6 @@ def test_list_clusters():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)
@@ -130,7 +128,6 @@ def test_get_remote_login_settings():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)
@@ -158,7 +155,6 @@ def test_submit():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)
@@ -199,7 +195,6 @@ def test_get_application_log():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)
@@ -260,7 +255,6 @@ def test_get_application_status_complete():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)
@@ -304,7 +298,6 @@ def test_delete_cluster():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)
@@ -330,7 +323,6 @@ def test_spark_processes_up():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)
@@ -357,7 +349,6 @@ def test_debug_tool():
        size_low_priority=0,
        vm_size="standard_f2",
        subnet_id=None,
        custom_scripts=None,
        file_shares=None,
        toolkit=aztk.spark.models.SparkToolkit(version="2.3.0"),
        spark_configuration=None)