From ba72ffacb8617f975a2f6d3d93244231855741cd Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Tue, 16 Nov 2021 18:36:15 +0100
Subject: [PATCH 1/9] Add oneDNN support for array_split operator

---
 src/operator/nn/dnnl/dnnl_ops-inl.h   |   6 +
 src/operator/nn/dnnl/dnnl_split-inl.h |  69 ++++++++++
 src/operator/nn/dnnl/dnnl_split.cc    | 176 ++++++++++++++++++++++++++
 src/operator/tensor/matrix_op-inl.h   |  16 +++
 src/operator/tensor/matrix_op.cc      |  35 +++++
 5 files changed, 302 insertions(+)
 create mode 100644 src/operator/nn/dnnl/dnnl_split-inl.h
 create mode 100644 src/operator/nn/dnnl/dnnl_split.cc

diff --git a/src/operator/nn/dnnl/dnnl_ops-inl.h b/src/operator/nn/dnnl/dnnl_ops-inl.h
index 8db1e8adc1a5..2eea57424129 100644
--- a/src/operator/nn/dnnl/dnnl_ops-inl.h
+++ b/src/operator/nn/dnnl/dnnl_ops-inl.h
@@ -132,6 +132,12 @@ void DNNLSoftmaxOutputForward(const nnvm::NodeAttrs& attrs,
                               const std::vector<OpReqType>& req,
                               const std::vector<NDArray>& out_data);
 
+void DNNLSplitForward(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx,
+                      const std::vector<NDArray>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<NDArray>& outputs);
+
 /* For sum */
 void DNNLSumForward(const nnvm::NodeAttrs& attrs,
                     const OpContext& ctx,
diff --git a/src/operator/nn/dnnl/dnnl_split-inl.h b/src/operator/nn/dnnl/dnnl_split-inl.h
new file mode 100644
index 000000000000..e0ad804c2a70
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_split-inl.h
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_split-inl.h
+ */
+
+#ifndef MXNET_OPERATOR_NN_DNNL_DNNL_SPLIT_INL_H_
+#define MXNET_OPERATOR_NN_DNNL_DNNL_SPLIT_INL_H_
+
+#if MXNET_USE_ONEDNN == 1
+#include <vector>
+
+#include "./dnnl_base-inl.h"
+#include "./dnnl_ops-inl.h"
+
+namespace mxnet {
+namespace op {
+
+using split_fwd_t = dnnl::reorder;
+
+class DNNLSplitFwd {
+ public:
+  struct Tensors {
+    Tensors(const NDArray& input, const std::vector<NDArray>& outputs);
+
+    const NDArray& input;
+    const std::vector<NDArray>& outputs;
+  };
+
+  static DNNLSplitFwd GetCached(const SplitParam& param,
+                                const Tensors& tensors,
+                                const bool is_train);
+
+  // static split_fwd_pd_t GetSplitFwdPd(const dnnl::memory::desc& input_md,
+  //                                     const dnnl::memory::desc& output_md);
+
+  DNNLSplitFwd(const SplitParam& param, const Tensors& tensors, const bool is_train);
+
+  void Execute(const Tensors& tensors) const;
+
+ private:
+  // std::shared_ptr<split_fwd_pd_t> split_pd;
+  std::shared_ptr<split_fwd_t> split_fwd;
+};
+
+
+bool SupportDNNLSplit(const SplitParam& param, const NDArray& input);
+
+}  // namespace op
+}  // namespace mxnet
+#endif
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_SPLIT_INL_H_
\ No newline at end of file
diff --git a/src/operator/nn/dnnl/dnnl_split.cc b/src/operator/nn/dnnl/dnnl_split.cc
new file mode 100644
index 000000000000..7937e2d5a5ef
--- /dev/null
+++ b/src/operator/nn/dnnl/dnnl_split.cc
@@ -0,0 +1,176 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file dnnl_split.cc
+ * \brief
+ */
+
+#if MXNET_USE_ONEDNN == 1
+
+#include "../../tensor/matrix_op-inl.h"
+#include "./dnnl_split-inl.h"
+
+namespace mxnet {
+namespace op {
+
+bool SupportDNNLSplit(const SplitParam& param, const NDArray& input) {
+  // int in_ndim = input.shape().ndim();
+  // int out_size = output.shape().Size();
+  // int in_size = input.shape().Size();
+  // bool param_supported = true;
+  return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16);  // &&
+  // (output.dtype() == mshadow::kFloat32 || output.dtype() == mshadow::kBfloat16);
+  // return true;
+}
+
+void DNNLSplitForward(const nnvm::NodeAttrs& attrs,
+                      const OpContext& ctx,
+                      const std::vector<NDArray>& inputs,
+                      const std::vector<OpReqType>& req,
+                      const std::vector<NDArray>& outputs) {
+  if (req[0] == kNullOp)
+    return;
+  CHECK_NE(req[0], kAddTo);
+  const SplitParam& param = dmlc::get<SplitParam>(attrs.parsed);
+  const bool is_train = ctx.is_train;
+  const auto tensors = DNNLSplitFwd::Tensors(inputs[0], outputs);
+  const auto fwd = DNNLSplitFwd::GetCached(param, tensors, is_train);
+  fwd.Execute(tensors);
+}
+
+DNNLSplitFwd::Tensors::Tensors(const NDArray& input, const std::vector<NDArray>& outputs)
+    : input(input), outputs(outputs) {}
+
+typedef ParamOpSign<SplitParam> DNNLSplitSignature;
+
+DNNLSplitFwd DNNLSplitFwd::GetCached(const SplitParam& param,
+                                     const Tensors& tensors,
+                                     const bool is_train) {
+#if DMLC_CXX11_THREAD_LOCAL
+  static thread_local std::unordered_map<DNNLSplitSignature, DNNLSplitFwd, OpHash> fwds;
+#else
+  static MX_THREAD_LOCAL std::unordered_map<DNNLSplitSignature, DNNLSplitFwd, OpHash> fwds;
+#endif
+
+  DNNLSplitSignature key(param);
+  key.AddSign(is_train);
+  key.AddSign(tensors.input);
+  key.AddSign(tensors.outputs);
+  key.AddSign(param.indices);
+  key.AddSign(param.squeeze_axis);
+  key.AddSign(param.sections);
+  DNNLSplitFwd fwd(param, tensors, is_train);
+  auto it = fwds.find(key);
+  if (it == fwds.end()) {
+    DNNLSplitFwd fwd(param, tensors, is_train);
+    it = AddToCache(&fwds, key, fwd);
+  }
+  return it->second;
+  return fwd;
+}
+
+DNNLSplitFwd::DNNLSplitFwd(const SplitParam& param, const Tensors& tensors, const bool is_train) {
+  auto input_tensor = tensors.input.Reorder2Default();
+  // create X mem descriptors
+  auto cpu_engine = CpuEngine::Get()->get_engine();
+  const auto& ishape = tensors.input.shape();
+  int real_axis = param.axis;
+  if (real_axis < 0) {
+    real_axis += ishape.ndim();
+  }
+
+  const mxnet::TShape split_pts =
+      (param.sections > 0) ? GetSplitIndices(ishape, real_axis, param.sections) : param.indices;
+  // LOG(INFO) << split_pts;
+  std::vector<int> strides(ishape.ndim());
+  strides[ishape.ndim() - 1] = 1;
+  for (int i = ishape.ndim() - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * ishape[i + 1];
+  }
+  dnnl::memory::dims dnnl_strides(strides.begin(), strides.end());
+
+  for (int i = 0; i < split_pts.ndim(); ++i) {
+    auto section_shape = ishape;
+    int end_dim;
+    if (i + 1 >= split_pts.ndim()) {
+      end_dim = ishape[real_axis];
+    } else {
+      end_dim = split_pts[i + 1];
+    }
+    section_shape[real_axis] = end_dim - split_pts[i];
+    if (section_shape[real_axis] == 0)
+      continue;
+    // LOG(INFO) << section_shape;
+    dnnl::memory::dims dnnl_dims(section_shape.begin(), section_shape.end());
+    auto in_mem_desc =
+        dnnl::memory::desc(dnnl_dims, get_dnnl_type(tensors.input.dtype()), dnnl_strides);
+    int offset = split_pts[i] * strides[real_axis] * GetTypeSize(tensors.input.dtype());
+    auto in_mem = dnnl::memory(
+        in_mem_desc, cpu_engine, reinterpret_cast<std::byte*>(input_tensor.data().dptr_) + offset);
+
+    auto out_mem = tensors.outputs[i].GetDNNLData();
+    const auto reorder_pd =
+        dnnl::reorder::primitive_desc(cpu_engine, in_mem_desc, cpu_engine, out_mem->get_desc());
+    dnnl_args_map_t reorder_args;
+    reorder_args[DNNL_ARG_SRC] = in_mem;
+    reorder_args[DNNL_ARG_DST] = *out_mem;
+    DNNLStream::Get()->RegisterPrimArgs(dnnl::reorder(reorder_pd), reorder_args);
+    DNNLStream::Get()->Submit();
+  }
+  // split indices e.g. [0,1,3] - from 0 to 1 and from 1 to 3 --- 2 sections
+
+  // auto input_md = input_mem->get_desc();
+  // const auto in_shape = tensors.data.shape();
+  // const size_t in_ndim = in_shape.ndim();
+  // const size_t out_ndim = tensors.out.shape().ndim();
+  // const auto out_dtype = get_dnnl_type(tensors.out.dtype());
+  // dnnl::memory::desc out_md;
+
+  // split_pd = std::make_shared<split_fwd_pd_t>(GetSplitFwdPd(input_md, out_md,
+  // reduction_alg)); split_fwd = std::make_shared<split_fwd_t>(*split_pd);
+}
+
+// split_fwd_pd_t DNNLSplitFwd::GetSplitFwdPd(const dnnl::memory::desc& input_md,
+//                                            const dnnl::memory::desc& output_md) {
+//   auto cpu_engine = CpuEngine::Get()->get_engine();
+//   auto desc = dnnl::reorder(input_md, output_md);
+//   return split_fwd_pd_t(desc, cpu_engine);
+// }
+
+void DNNLSplitFwd::Execute(const Tensors& tensors) const {
+  // auto stream = DNNLStream::Get();
+  // auto engine = CpuEngine::Get()->get_engine();
+  // auto input_mem = tensors.data.GetDNNLData();
+  // if (tensors.out.shape().Size() == 1) {
+  //   // scalar result
+  //   auto out_mem = dnnl::memory(split_pd->dst_desc(), engine,
+  //   tensors.out.data().dptr<float>()); stream->RegisterPrimArgs(*split_fwd, {{DNNL_ARG_SRC,
+  //   *input_mem}, {DNNL_ARG_DST, out_mem}});
+  // } else {
+  //   auto out_mem = tensors.out.GetDNNLData(split_pd->dst_desc());
+  //   stream->RegisterPrimArgs(*split_fwd, {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DST,
+  //   *out_mem}});
+  // }
+  // stream->Submit();
+}
+
+}  // namespace op
+}  // namespace mxnet
+#endif
\ No newline at end of file
diff --git a/src/operator/tensor/matrix_op-inl.h b/src/operator/tensor/matrix_op-inl.h
index 04e49d646e45..00bfe9bd51c7 100644
--- a/src/operator/tensor/matrix_op-inl.h
+++ b/src/operator/tensor/matrix_op-inl.h
@@ -3062,6 +3062,11 @@ struct SplitParam : public dmlc::Parameter<SplitParam> {
     (*dict)["squeeze_axis"] = squeeze_axis_s.str();
     (*dict)["sections"] = sections_s.str();
   }
+
+  bool operator==(const SplitParam& other) const {
+    return this->indices == other.indices && this->axis == other.axis &&
+           this->squeeze_axis == other.squeeze_axis && this->sections == other.sections;
+  }
 };  // struct SplitParam
 
 inline mxnet::TShape GetSplitIndices(const mxnet::TShape& ishape, int axis, int sections) {
@@ -3451,6 +3456,17 @@ struct hash {
   }
 };
 
+template <>
+struct hash<mxnet::op::SplitParam> {
+  size_t operator()(const mxnet::op::SplitParam& val) {
+    size_t ret = 0;
+    ret = dmlc::HashCombine(ret, val.indices);
+    ret = dmlc::HashCombine(ret, val.axis);
+    ret = dmlc::HashCombine(ret, val.squeeze_axis);
+    ret = dmlc::HashCombine(ret, val.sections);
+    return ret;
+  }
+};
 }  // namespace std
 
 #endif  // MXNET_OPERATOR_TENSOR_MATRIX_OP_INL_H_
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index b65c7cb03564..0f039eec2020 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -30,6 +30,7 @@
 #include "../nn/dnnl/dnnl_reshape-inl.h"
 #include "../nn/dnnl/dnnl_slice-inl.h"
 #include "../nn/dnnl/dnnl_transpose-inl.h"
+#include "../nn/dnnl/dnnl_split-inl.h"
 #endif
 
 namespace mxnet {
@@ -1177,6 +1178,35 @@ Example::
     .add_argument("data", "NDArray-or-Symbol", "Input ndarray")
     .add_arguments(DepthToSpaceParam::__FIELDS__());
 
+#if MXNET_USE_ONEDNN == 1
+static void SplitForwardEx(const nnvm::NodeAttrs& attrs,
+                           const OpContext& op_ctx,
+                           const std::vector<NDArray>& inputs,
+                           const std::vector<OpReqType>& req,
+                           const std::vector<NDArray>& outputs) {
+  CHECK(!inputs.empty());
+  if (req[0] == kNullOp) {
+    return;
+  }
+  const SplitParam& param = dmlc::get<SplitParam>(attrs.parsed);
+  if (SupportDNNLSplit(param, inputs[0])) {
+    DNNL_OPCHECK_INIT(/*is backward*/ false, outputs.size(), inputs, outputs);
+    DNNLRun(DNNLSplitForward, attrs, op_ctx, inputs, req, outputs);
+    DNNL_OPCHECK_RUN(SplitOpForward, attrs, op_ctx, inputs, req, outputs);
+  } else {
+      FallBackCompute(SplitOpForward, attrs, op_ctx, inputs, req, outputs);
+  }
+}
+
+inline static bool SplitInferStorageType(const nnvm::NodeAttrs& attrs,
+                                         const int dev_mask,
+                                         DispatchMode* dispatch_mode,
+                                         std::vector<int>* in_attrs,
+                                         std::vector<int>* out_attrs) {
+  return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
+  }
+#endif  // MXNET_USE_ONEDNN == 1
+
 NNVM_REGISTER_OP(_split_v2)
     .add_alias("_npi_split")
     .add_alias("_npi_array_split")
@@ -1246,6 +1276,11 @@ Example::
                  [](const NodeAttrs& n) {
                    return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
                  })
+#if MXNET_USE_ONEDNN == 1
+    .set_attr<FComputeEx>("FComputeEx", SplitForwardEx)
+    .set_attr<bool>("TIsDNNL", true)
+    .set_attr<FInferStorageType>("FInferStorageType", SplitInferStorageType)
+#endif
     .set_attr<THasDeterministicOutput>("THasDeterministicOutput", true)
    .set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{"_split_v2_backward"})
     .add_argument("data", "NDArray-or-Symbol", "The input")

From 0a682f47a2a1a6cd4ac84e3f5ceb3b2577f23310 Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Tue, 16 Nov 2021 18:47:26 +0100
Subject: [PATCH 2/9] benchmark.py

---
 split.py | 41 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 41 insertions(+)
 create mode 100644 split.py

diff --git a/split.py b/split.py
new file mode 100644
index 000000000000..c773637a2d9b
--- /dev/null
+++ b/split.py
@@ -0,0 +1,41 @@
+# import mxnet as mx
+# import numpy as np
+# a = mx.np.arange(4*4*4).reshape((4,4,4))
+
+# out = mx.np.array_split(a, axis=2, indices_or_sections=3)
+
+# # print(a)
+# # print("\n\n\n")
+# # print(out)
+
+
+# b = np.arange(4*4*4).reshape((4,4,4))
+
+# out2 = np.array_split(b, axis=2, indices_or_sections=3)
+
+# for o1, o2 in zip(out, out2):
+#     print(o2 - o1)
+
+
+
+import mxnet
+import mxnet.gluon.nn as nn
+import mxnet.numpy as np
+import time
+
+
+dims = [128, 512, 1024, 4096]
+print("shape;axis;sections;time")
+for ndim in range (2):
+    for dim1 in dims:
+        for dim2 in dims:
+            shape = (dim1, dim2) if ndim == 0 else (32, dim1, dim2)
+            a = np.random.uniform(-1.0, 1.0, shape).astype(np.float32)
+            for axis in range(2 + ndim):
+                for section in range(1, 4):
+                    tic = time.time()
+                    for i in range(100):
+                        out = np.array_split(a, axis=axis, indices_or_sections=section)
+                        [o.wait_to_read() for o in out]
+                    toc = time.time()
+                    print(f"{shape};{axis};{section};{toc-tic}")
\ No newline at end of file

From 878cd5e6263975220471e98ce1d307843e4e8ec6 Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Thu, 18 Nov 2021 17:52:08 +0100
Subject: [PATCH 3/9] refactor

---
 src/operator/nn/dnnl/dnnl_split-inl.h |  21 +--
 src/operator/nn/dnnl/dnnl_split.cc    | 176 ++++++++++++--------------
 src/operator/tensor/matrix_op.cc      |   3 +-
 3 files changed, 90 insertions(+), 110 deletions(-)

diff --git a/src/operator/nn/dnnl/dnnl_split-inl.h b/src/operator/nn/dnnl/dnnl_split-inl.h
index e0ad804c2a70..1e989a6f6d77 100644
--- a/src/operator/nn/dnnl/dnnl_split-inl.h
+++ b/src/operator/nn/dnnl/dnnl_split-inl.h
@@ -34,6 +34,7 @@ namespace mxnet {
 namespace op {
 
 using split_fwd_t = dnnl::reorder;
+using split_fwd_pd_t = dnnl::reorder::primitive_desc;
 
 class DNNLSplitFwd {
  public:
@@ -46,22 +47,22 @@ class DNNLSplitFwd {
 
   static DNNLSplitFwd GetCached(const SplitParam& param,
                                 const Tensors& tensors,
-                                const bool is_train);
+                                const TShape& split_pts,
+                                const int split_axis);
 
-  // static split_fwd_pd_t GetSplitFwdPd(const dnnl::memory::desc& input_md,
-  //                                     const dnnl::memory::desc& output_md);
+  DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, const int split_axis);
 
-  DNNLSplitFwd(const SplitParam& param, const Tensors& tensors, const bool is_train);
-
-  void Execute(const Tensors& tensors) const;
+  void Execute(const Tensors& tensors,
+               const TShape& split_pts,
+               const int split_axis,
+               const std::vector<OpReqType>& req) const;
 
  private:
-  // std::shared_ptr<split_fwd_pd_t> split_pd;
-  std::shared_ptr<split_fwd_t> split_fwd;
+  std::vector<split_fwd_t> split_fwds;
+  std::vector<split_fwd_pd_t> split_pds;
 };
 
-
-bool SupportDNNLSplit(const SplitParam& param, const NDArray& input);
+bool SupportDNNLSplit(const NDArray& input);
 
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/nn/dnnl/dnnl_split.cc b/src/operator/nn/dnnl/dnnl_split.cc
index 7937e2d5a5ef..bb3667e269f3 100644
--- a/src/operator/nn/dnnl/dnnl_split.cc
+++ b/src/operator/nn/dnnl/dnnl_split.cc
@@ -30,14 +30,10 @@
 namespace mxnet {
 namespace op {
 
-bool SupportDNNLSplit(const SplitParam& param, const NDArray& input) {
-  // int in_ndim = input.shape().ndim();
-  // int out_size = output.shape().Size();
-  // int in_size = input.shape().Size();
-  // bool param_supported = true;
-  return (input.dtype() == mshadow::kFloat32 || input.dtype() == mshadow::kBfloat16);  // &&
-  // (output.dtype() == mshadow::kFloat32 || output.dtype() == mshadow::kBfloat16);
-  // return true;
+bool SupportDNNLSplit(const NDArray& input) {
+  static const std::set<int> supported_dtypes = {
+      mshadow::kFloat32, mshadow::kBfloat16, mshadow::kInt32, mshadow::kInt8, mshadow::kUint8};
+  return supported_dtypes.count(input.dtype());
 }
 
 void DNNLSplitForward(const nnvm::NodeAttrs& attrs,
@@ -45,14 +41,17 @@ void DNNLSplitForward(const nnvm::NodeAttrs& attrs,
                       const std::vector<NDArray>& inputs,
                       const std::vector<OpReqType>& req,
                       const std::vector<NDArray>& outputs) {
-  if (req[0] == kNullOp)
-    return;
-  CHECK_NE(req[0], kAddTo);
   const SplitParam& param = dmlc::get<SplitParam>(attrs.parsed);
-  const bool is_train = ctx.is_train;
   const auto tensors = DNNLSplitFwd::Tensors(inputs[0], outputs);
-  const auto fwd = DNNLSplitFwd::GetCached(param, tensors, is_train);
-  fwd.Execute(tensors);
+
+  const auto& ishape = tensors.input.shape();
+  const int split_axis = param.axis >= 0 ? param.axis : param.axis + ishape.ndim();
+  const mxnet::TShape split_pts =
+      (param.sections > 0) ? GetSplitIndices(tensors.input.shape(), split_axis, param.sections) :
+                             param.indices;
+
+  const auto fwd = DNNLSplitFwd::GetCached(param, tensors, split_pts, split_axis);
+  fwd.Execute(tensors, split_pts, split_axis, req);
 }
 
 DNNLSplitFwd::Tensors::Tensors(const NDArray& input, const std::vector<NDArray>& outputs)
@@ -62,7 +61,8 @@ typedef ParamOpSign<SplitParam> DNNLSplitSignature;
 
 DNNLSplitFwd DNNLSplitFwd::GetCached(const SplitParam& param,
                                      const Tensors& tensors,
-                                     const bool is_train) {
+                                     const TShape& split_pts,
+                                     const int split_axis) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<DNNLSplitSignature, DNNLSplitFwd, OpHash> fwds;
 #else
@@ -70,105 +70,85 @@ DNNLSplitFwd DNNLSplitFwd::GetCached(const SplitParam& param,
 #endif
 
   DNNLSplitSignature key(param);
-  key.AddSign(is_train);
   key.AddSign(tensors.input);
   key.AddSign(tensors.outputs);
-  key.AddSign(param.indices);
-  key.AddSign(param.squeeze_axis);
-  key.AddSign(param.sections);
-  DNNLSplitFwd fwd(param, tensors, is_train);
+  key.AddSign(split_pts);
+  key.AddSign(split_axis);
   auto it = fwds.find(key);
   if (it == fwds.end()) {
-    DNNLSplitFwd fwd(param, tensors, is_train);
+    DNNLSplitFwd fwd(tensors, split_pts, split_axis);
     it = AddToCache(&fwds, key, fwd);
   }
   return it->second;
-  return fwd;
 }
 
-DNNLSplitFwd::DNNLSplitFwd(const SplitParam& param, const Tensors& tensors, const bool is_train) {
-  auto input_tensor = tensors.input.Reorder2Default();
-  // create X mem descriptors
-  auto cpu_engine = CpuEngine::Get()->get_engine();
-  const auto& ishape = tensors.input.shape();
-  int real_axis = param.axis;
-  if (real_axis < 0) {
-    real_axis += ishape.ndim();
-  }
+DNNLSplitFwd::DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, const int split_axis) {
+  const auto cpu_engine = CpuEngine::Get()->get_engine();
+  const auto input = tensors.input.Reorder2Default();
+  const auto& ishape = input.shape();
+  const auto& dtype = get_dnnl_type(input.dtype());
+  const auto format_tag = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(ishape.ndim()));
+
+  std::vector<int> strides(ishape.ndim(), 1);
+  // last dim stride = 1, start loop from the penultimate
+  for (int i = ishape.ndim() - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * ishape[i + 1];
+  }
 
-  const mxnet::TShape split_pts =
-      (param.sections > 0) ? GetSplitIndices(ishape, real_axis, param.sections) : param.indices;
-  // LOG(INFO) << split_pts;
-  std::vector<int> strides(ishape.ndim());
-  strides[ishape.ndim() - 1] = 1;
-  for (int i = ishape.ndim() - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * ishape[i + 1];
-  }
+  for (int i = 0; i < tensors.outputs.size(); ++i) {
+    const auto& out = tensors.outputs[i];
+    if (out.shape().Size() == 0) {
+      continue;
+    }
+    dnnl::memory::dims dnnl_dims(ishape.begin(), ishape.end());
+    dnnl::memory::dims dnnl_strides(strides.begin(), strides.end());
+    // ending split point is always last dimension
+    int end_split_pt = (i + 1 >= split_pts.ndim()) ? ishape[split_axis] : split_pts[i + 1];
+    dnnl_dims[split_axis] = end_split_pt - split_pts[i];
+
+    auto in_mem_desc = dnnl::memory::desc(dnnl_dims, dtype, dnnl_strides);
+    auto out_mem_desc = dnnl::memory::desc(dnnl_dims, dtype, format_tag);
+
+    const auto split_pd = split_fwd_pd_t(cpu_engine, in_mem_desc, cpu_engine, out_mem_desc);
+    split_pds.emplace_back(split_pd);
+    split_fwds.emplace_back(split_fwd_t(split_pd));
+  }
 }
 
-// split_fwd_pd_t DNNLSplitFwd::GetSplitFwdPd(const dnnl::memory::desc& input_md,
-//                                            const dnnl::memory::desc& output_md) {
-//   auto cpu_engine = CpuEngine::Get()->get_engine();
-//   auto desc = dnnl::reorder(input_md, output_md);
-//   return split_fwd_pd_t(desc, cpu_engine);
-// }
-
-void DNNLSplitFwd::Execute(const Tensors& tensors) const {
-  // auto stream = DNNLStream::Get();
-  // auto engine = CpuEngine::Get()->get_engine();
-  // auto input_mem = tensors.data.GetDNNLData();
-  // if (tensors.out.shape().Size() == 1) {
-  //   // scalar result
-  //   auto out_mem = dnnl::memory(split_pd->dst_desc(), engine,
-  //   tensors.out.data().dptr<float>()); stream->RegisterPrimArgs(*split_fwd, {{DNNL_ARG_SRC,
-  //   *input_mem}, {DNNL_ARG_DST, out_mem}});
-  // } else {
-  //   auto out_mem = tensors.out.GetDNNLData(split_pd->dst_desc());
-  //   stream->RegisterPrimArgs(*split_fwd, {{DNNL_ARG_SRC, *input_mem}, {DNNL_ARG_DST,
-  //   *out_mem}});
-  // }
-  // stream->Submit();
+void DNNLSplitFwd::Execute(const Tensors& tensors,
+                           const TShape& split_pts,
+                           const int split_axis,
+                           const std::vector<OpReqType>& req) const {
+  const auto& cpu_engine = CpuEngine::Get()->get_engine();
+
+  const auto& input_tensor = tensors.input.Reorder2Default();
+  const auto& ishape = input_tensor.shape();
+
+  std::vector<int> strides(ishape.ndim(), 1);
+  for (int i = ishape.ndim() - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * ishape[i + 1];
+  }
+
+  int out_idx = 0, primitive_idx = 0;
+
+  for (const auto& out : tensors.outputs) {
+    if (out.shape().Size() == 0) {
+      out_idx++;
+      continue;
+    }
+    int offset = split_pts[out_idx] * strides[split_axis] * GetTypeSize(input_tensor.dtype());
+    auto in_mem = dnnl::memory(split_pds[primitive_idx].src_desc(),
+                               cpu_engine,
+                               reinterpret_cast<std::byte*>(input_tensor.data().dptr_) + offset);
+
+    auto out_mem = CreateDNNLMem(out, split_pds[primitive_idx].dst_desc(), req[out_idx]);
+    DNNLStream::Get()->RegisterPrimArgs(split_fwds[primitive_idx],
+                                        {{DNNL_ARG_SRC, in_mem}, {DNNL_ARG_DST, *out_mem.second}});
+
+    CommitOutput(out, out_mem);
+    ++out_idx;
+    ++primitive_idx;
+  }
+  DNNLStream::Get()->Submit();
 }
 
 }  // namespace op
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 0f039eec2020..8fbfd6bcc26a 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -1188,8 +1188,7 @@ static void SplitForwardEx(const nnvm::NodeAttrs& attrs,
   if (req[0] == kNullOp) {
     return;
   }
-  const SplitParam& param = dmlc::get<SplitParam>(attrs.parsed);
-  if (SupportDNNLSplit(param, inputs[0])) {
+  if (SupportDNNLSplit(inputs[0])) {
     DNNL_OPCHECK_INIT(/*is backward*/ false, outputs.size(), inputs, outputs);
     DNNLRun(DNNLSplitForward, attrs, op_ctx, inputs, req, outputs);
     DNNL_OPCHECK_RUN(SplitOpForward, attrs, op_ctx, inputs, req, outputs);

From f47aa1893c7a4ac0e01144dd6071672662b6a905 Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Fri, 19 Nov 2021 09:03:33 +0100
Subject: [PATCH 4/9] update

---
 split.py                           | 41 ------------------------------
 src/operator/nn/dnnl/dnnl_split.cc |  3 +--
 src/operator/tensor/matrix_op.cc   |  5 ++--
 3 files changed, 4 insertions(+), 45 deletions(-)
 delete mode 100644 split.py

diff --git a/split.py b/split.py
deleted file mode 100644
index c773637a2d9b..000000000000
--- a/split.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# import mxnet as mx
-# import numpy as np
-# a = mx.np.arange(4*4*4).reshape((4,4,4))
-
-# out = mx.np.array_split(a, axis=2, indices_or_sections=3)
-
-# # print(a)
-# # print("\n\n\n")
-# # print(out)
-
-
-# b = np.arange(4*4*4).reshape((4,4,4))
-
-# out2 = np.array_split(b, axis=2, indices_or_sections=3)
-
-# for o1, o2 in zip(out, out2):
-#     print(o2 - o1)
-
-
-
-import mxnet
-import mxnet.gluon.nn as nn
-import mxnet.numpy as np
-import time
-
-
-dims = [128, 512, 1024, 4096]
-print("shape;axis;sections;time")
-for ndim in range (2):
-    for dim1 in dims:
-        for dim2 in dims:
-            shape = (dim1, dim2) if ndim == 0 else (32, dim1, dim2)
-            a = np.random.uniform(-1.0, 1.0, shape).astype(np.float32)
-            for axis in range(2 + ndim):
-                for section in range(1, 4):
-                    tic = time.time()
-                    for i in range(100):
-                        out = np.array_split(a, axis=axis, indices_or_sections=section)
-                        [o.wait_to_read() for o in out]
-                    toc = time.time()
-                    print(f"{shape};{axis};{section};{toc-tic}")
\ No newline at end of file
diff --git a/src/operator/nn/dnnl/dnnl_split.cc b/src/operator/nn/dnnl/dnnl_split.cc
index bb3667e269f3..07564c7a3716 100644
--- a/src/operator/nn/dnnl/dnnl_split.cc
+++ b/src/operator/nn/dnnl/dnnl_split.cc
@@ -19,7 +19,6 @@
 
 /*!
  * \file dnnl_split.cc
- * \brief
  */
 
 #if MXNET_USE_ONEDNN == 1
@@ -153,4 +152,4 @@ void DNNLSplitFwd::Execute(const Tensors& tensors,
 
 }  // namespace op
 }  // namespace mxnet
-#endif
\ No newline at end of file
+#endif
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 8fbfd6bcc26a..4dbc223816f5 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -1202,8 +1202,9 @@ inline static bool SplitInferStorageType(const nnvm::NodeAttrs& attrs,
                                          DispatchMode* dispatch_mode,
                                          std::vector<int>* in_attrs,
                                          std::vector<int>* out_attrs) {
-  return DNNLStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs, out_attrs);
-  }
+  return DNNLStorageType(
+      attrs, dev_mask, /*support onednn*/ true, dispatch_mode, in_attrs, out_attrs);
+}
 #endif  // MXNET_USE_ONEDNN == 1
 
 NNVM_REGISTER_OP(_split_v2)

From e54312697e313e6bb223bc94c7d37bfe30ad1acf Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Thu, 25 Nov 2021 14:47:41 +0100
Subject: [PATCH 5/9] review fixes

---
 src/operator/nn/dnnl/dnnl_split.cc | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/src/operator/nn/dnnl/dnnl_split.cc b/src/operator/nn/dnnl/dnnl_split.cc
index 07564c7a3716..e6864afa1411 100644
--- a/src/operator/nn/dnnl/dnnl_split.cc
+++ b/src/operator/nn/dnnl/dnnl_split.cc
@@ -88,7 +88,8 @@ DNNLSplitFwd::DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, cons
   const auto& dtype = get_dnnl_type(input.dtype());
   const auto format_tag = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(ishape.ndim()));
 
-  std::vector<int> strides(ishape.ndim(), 1);
+
+  dnnl::memory::dims strides(ishape.ndim(), 1);
   // last dim stride = 1, start loop from the penultimate
   for (int i = ishape.ndim() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * ishape[i + 1];
@@ -100,12 +101,11 @@ DNNLSplitFwd::DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, cons
       continue;
     }
     dnnl::memory::dims dnnl_dims(ishape.begin(), ishape.end());
-    dnnl::memory::dims dnnl_strides(strides.begin(), strides.end());
     // ending split point is always last dimension
     int end_split_pt = (i + 1 >= split_pts.ndim()) ? ishape[split_axis] : split_pts[i + 1];
     dnnl_dims[split_axis] = end_split_pt - split_pts[i];
 
-    auto in_mem_desc = dnnl::memory::desc(dnnl_dims, dtype, dnnl_strides);
+    auto in_mem_desc = dnnl::memory::desc(dnnl_dims, dtype, strides);
     auto out_mem_desc = dnnl::memory::desc(dnnl_dims, dtype, format_tag);
 
     const auto split_pd = split_fwd_pd_t(cpu_engine, in_mem_desc, cpu_engine, out_mem_desc);
@@ -127,17 +127,20 @@ void DNNLSplitFwd::Execute(const Tensors& tensors,
   for (int i = ishape.ndim() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * ishape[i + 1];
   }
+
   int out_idx = 0, primitive_idx = 0;
+  int axis_offset  = strides[split_axis] * GetTypeSize(input_tensor.dtype());
+  std::byte* input_ptr = reinterpret_cast<std::byte*>(input_tensor.data().dptr_);
 
   for (const auto& out : tensors.outputs) {
     if (out.shape().Size() == 0) {
       out_idx++;
       continue;
     }
-    int offset = split_pts[out_idx] * strides[split_axis] * GetTypeSize(input_tensor.dtype());
+    int offset = split_pts[out_idx] * axis_offset;
     auto in_mem = dnnl::memory(split_pds[primitive_idx].src_desc(),
                                cpu_engine,
-                               reinterpret_cast<std::byte*>(input_tensor.data().dptr_) + offset);
+                               input_ptr + offset);
 
     auto out_mem = CreateDNNLMem(out, split_pds[primitive_idx].dst_desc(), req[out_idx]);
     DNNLStream::Get()->RegisterPrimArgs(split_fwds[primitive_idx],

From 1dfbb3f3277df33d6ec05da439da52e7e06df3b3 Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Thu, 25 Nov 2021 16:30:24 +0100
Subject: [PATCH 6/9] fix sanity

---
 src/operator/nn/dnnl/dnnl_split-inl.h | 2 +-
 src/operator/nn/dnnl/dnnl_split.cc    | 7 ++-----
 src/operator/tensor/matrix_op.cc      | 2 +-
 3 files changed, 4 insertions(+), 7 deletions(-)

diff --git a/src/operator/nn/dnnl/dnnl_split-inl.h b/src/operator/nn/dnnl/dnnl_split-inl.h
index 1e989a6f6d77..3827380cda77 100644
--- a/src/operator/nn/dnnl/dnnl_split-inl.h
+++ b/src/operator/nn/dnnl/dnnl_split-inl.h
@@ -67,4 +67,4 @@ bool SupportDNNLSplit(const NDArray& input);
 }  // namespace op
 }  // namespace mxnet
 #endif
-#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_SPLIT_INL_H_
\ No newline at end of file
+#endif  // MXNET_OPERATOR_NN_DNNL_DNNL_SPLIT_INL_H_
diff --git a/src/operator/nn/dnnl/dnnl_split.cc b/src/operator/nn/dnnl/dnnl_split.cc
index e6864afa1411..678eab8a8007 100644
--- a/src/operator/nn/dnnl/dnnl_split.cc
+++ b/src/operator/nn/dnnl/dnnl_split.cc
@@ -88,7 +88,6 @@ DNNLSplitFwd::DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, cons
   const auto& dtype = get_dnnl_type(input.dtype());
   const auto format_tag = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(ishape.ndim()));
 
-
   dnnl::memory::dims strides(ishape.ndim(), 1);
   // last dim stride = 1, start loop from the penultimate
   for (int i = ishape.ndim() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * ishape[i + 1];
@@ -129,7 +128,7 @@ void DNNLSplitFwd::Execute(const Tensors& tensors,
   }
 
   int out_idx = 0, primitive_idx = 0;
-  int axis_offset  = strides[split_axis] * GetTypeSize(input_tensor.dtype());
+  int axis_offset = strides[split_axis] * GetTypeSize(input_tensor.dtype());
   std::byte* input_ptr = reinterpret_cast<std::byte*>(input_tensor.data().dptr_);
 
   for (const auto& out : tensors.outputs) {
@@ -138,9 +137,7 @@ void DNNLSplitFwd::Execute(const Tensors& tensors,
       continue;
     }
     int offset = split_pts[out_idx] * axis_offset;
-    auto in_mem = dnnl::memory(split_pds[primitive_idx].src_desc(),
-                               cpu_engine,
-                               input_ptr + offset);
+    auto in_mem = dnnl::memory(split_pds[primitive_idx].src_desc(), cpu_engine, input_ptr + offset);
 
     auto out_mem = CreateDNNLMem(out, split_pds[primitive_idx].dst_desc(), req[out_idx]);
     DNNLStream::Get()->RegisterPrimArgs(split_fwds[primitive_idx],
diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 4dbc223816f5..21bf43ea5ff4 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -1193,7 +1193,7 @@ static void SplitForwardEx(const nnvm::NodeAttrs& attrs,
     DNNLRun(DNNLSplitForward, attrs, op_ctx, inputs, req, outputs);
     DNNL_OPCHECK_RUN(SplitOpForward, attrs, op_ctx, inputs, req, outputs);
   } else {
-      FallBackCompute(SplitOpForward, attrs, op_ctx, inputs, req, outputs);
+    FallBackCompute(SplitOpForward, attrs, op_ctx, inputs, req, outputs);
   }
 }
 

From c5d816f87bde018a38da1230b1e6b3bd4387930b Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Mon, 29 Nov 2021 12:12:30 +0100
Subject: [PATCH 7/9] fix

---
 src/operator/tensor/matrix_op.cc | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc
index 21bf43ea5ff4..bc97aa4a538c 100644
--- a/src/operator/tensor/matrix_op.cc
+++ b/src/operator/tensor/matrix_op.cc
@@ -1185,9 +1185,6 @@ static void SplitForwardEx(const nnvm::NodeAttrs& attrs,
                            const std::vector<OpReqType>& req,
                            const std::vector<NDArray>& outputs) {
   CHECK(!inputs.empty());
-  if (req[0] == kNullOp) {
-    return;
-  }
   if (SupportDNNLSplit(inputs[0])) {
     DNNL_OPCHECK_INIT(/*is backward*/ false, outputs.size(), inputs, outputs);
     DNNLRun(DNNLSplitForward, attrs, op_ctx, inputs, req, outputs);

From 7dbe9adfb77990a901435514ec7c0a93f49d620a Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Tue, 7 Dec 2021 17:32:54 +0100
Subject: [PATCH 8/9] review

---
 src/operator/nn/dnnl/dnnl_base-inl.h  | 1 +
 src/operator/nn/dnnl/dnnl_split-inl.h | 3 +--
 src/operator/nn/dnnl/dnnl_split.cc    | 9 +--------
 3 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/src/operator/nn/dnnl/dnnl_base-inl.h b/src/operator/nn/dnnl/dnnl_base-inl.h
index 20b8319ac110..5344989c1e51 100644
--- a/src/operator/nn/dnnl/dnnl_base-inl.h
+++ b/src/operator/nn/dnnl/dnnl_base-inl.h
@@ -197,6 +197,7 @@ bool SupportDNNLTranspose(const NDArray& data);
 bool SupportDNNLBatchDot(const std::vector<NDArray>& inputs, const NDArray& output);
 bool SupportDNNLLayerNorm(const LayerNormParam& param, const std::vector<NDArray>& inputs);
 bool SupportDNNLReshape(const NDArray& input, const NDArray& output);
+bool SupportDNNLSplit(const NDArray& input);
 bool SupportDNNLStack(const std::vector<NDArray>& inputs);
 
 }  // namespace op
diff --git a/src/operator/nn/dnnl/dnnl_split-inl.h b/src/operator/nn/dnnl/dnnl_split-inl.h
index 3827380cda77..820b7373b990 100644
--- a/src/operator/nn/dnnl/dnnl_split-inl.h
+++ b/src/operator/nn/dnnl/dnnl_split-inl.h
@@ -60,10 +60,9 @@ class DNNLSplitFwd {
  private:
   std::vector<split_fwd_t> split_fwds;
   std::vector<split_fwd_pd_t> split_pds;
+  dnnl::memory::dims strides;
 };
 
-bool SupportDNNLSplit(const NDArray& input);
-
 }  // namespace op
 }  // namespace mxnet
 #endif
diff --git a/src/operator/nn/dnnl/dnnl_split.cc b/src/operator/nn/dnnl/dnnl_split.cc
index 678eab8a8007..1dae8d12d57f 100644
--- a/src/operator/nn/dnnl/dnnl_split.cc
+++ b/src/operator/nn/dnnl/dnnl_split.cc
@@ -88,7 +88,7 @@ DNNLSplitFwd::DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, cons
   const auto& dtype = get_dnnl_type(input.dtype());
   const auto format_tag = static_cast<dnnl::memory::format_tag>(GetDefaultFormat(ishape.ndim()));
 
-  dnnl::memory::dims strides(ishape.ndim(), 1);
+  strides = dnnl::memory::dims(ishape.ndim(), 1);
   // last dim stride = 1, start loop from the penultimate
   for (int i = ishape.ndim() - 2; i >= 0; --i) {
     strides[i] = strides[i + 1] * ishape[i + 1];
@@ -120,13 +120,6 @@ void DNNLSplitFwd::Execute(const Tensors& tensors,
   const auto& cpu_engine = CpuEngine::Get()->get_engine();
 
   const auto& input_tensor = tensors.input.Reorder2Default();
-  const auto& ishape = input_tensor.shape();
-
-  std::vector<int> strides(ishape.ndim(), 1);
-  for (int i = ishape.ndim() - 2; i >= 0; --i) {
-    strides[i] = strides[i + 1] * ishape[i + 1];
-  }
-
   int out_idx = 0, primitive_idx = 0;
   int axis_offset = strides[split_axis] * GetTypeSize(input_tensor.dtype());
   std::byte* input_ptr = reinterpret_cast<std::byte*>(input_tensor.data().dptr_);

From eb1f605c45c77927157b93f874141b4db64c337a Mon Sep 17 00:00:00 2001
From: Bartlomiej Gawrych
Date: Mon, 10 Jan 2022 15:00:00 +0100
Subject: [PATCH 9/9] Apply review comments

---
 src/operator/nn/dnnl/dnnl_split-inl.h |  8 ++++----
 src/operator/nn/dnnl/dnnl_split.cc    | 10 +++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/operator/nn/dnnl/dnnl_split-inl.h b/src/operator/nn/dnnl/dnnl_split-inl.h
index 820b7373b990..a8cdc4cd93e9 100644
--- a/src/operator/nn/dnnl/dnnl_split-inl.h
+++ b/src/operator/nn/dnnl/dnnl_split-inl.h
@@ -45,10 +45,10 @@ class DNNLSplitFwd {
     const std::vector<NDArray>& outputs;
   };
 
-  static DNNLSplitFwd GetCached(const SplitParam& param,
-                                const Tensors& tensors,
-                                const TShape& split_pts,
-                                const int split_axis);
+  static DNNLSplitFwd& GetCached(const SplitParam& param,
+                                 const Tensors& tensors,
+                                 const TShape& split_pts,
+                                 const int split_axis);
 
   DNNLSplitFwd(const Tensors& tensors, const TShape& split_pts, const int split_axis);
 
diff --git a/src/operator/nn/dnnl/dnnl_split.cc b/src/operator/nn/dnnl/dnnl_split.cc
index 1dae8d12d57f..e13b45a259ce 100644
--- a/src/operator/nn/dnnl/dnnl_split.cc
+++ b/src/operator/nn/dnnl/dnnl_split.cc
@@ -49,7 +49,7 @@ void DNNLSplitForward(const nnvm::NodeAttrs& attrs,
       (param.sections > 0) ? GetSplitIndices(tensors.input.shape(), split_axis, param.sections) :
                              param.indices;
 
-  const auto fwd = DNNLSplitFwd::GetCached(param, tensors, split_pts, split_axis);
+  const auto& fwd = DNNLSplitFwd::GetCached(param, tensors, split_pts, split_axis);
   fwd.Execute(tensors, split_pts, split_axis, req);
 }
 
@@ -58,10 +58,10 @@ DNNLSplitFwd::Tensors::Tensors(const NDArray& input, const std::vector<NDArray>
 
 typedef ParamOpSign<SplitParam> DNNLSplitSignature;
 
-DNNLSplitFwd DNNLSplitFwd::GetCached(const SplitParam& param,
-                                     const Tensors& tensors,
-                                     const TShape& split_pts,
-                                     const int split_axis) {
+DNNLSplitFwd& DNNLSplitFwd::GetCached(const SplitParam& param,
+                                      const Tensors& tensors,
+                                      const TShape& split_pts,
+                                      const int split_axis) {
 #if DMLC_CXX11_THREAD_LOCAL
   static thread_local std::unordered_map<DNNLSplitSignature, DNNLSplitFwd, OpHash> fwds;
 #else
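
To exercise the new code path end to end, a minimal smoke-test sketch can be used. It assumes an MXNet build with MXNET_USE_ONEDNN=1 and mirrors the split.py benchmark that patch 2 added and patch 4 removed; np.array_split should dispatch to the _split_v2 operator registered above (via the _npi_array_split alias), so each output section is produced by one oneDNN reorder primitive:

    import mxnet.numpy as np

    # float32 is on the supported-dtype list checked by SupportDNNLSplit()
    a = np.arange(4 * 4 * 4).reshape((4, 4, 4)).astype(np.float32)
    out = np.array_split(a, indices_or_sections=3, axis=2)
    [o.wait_to_read() for o in out]  # force execution of the registered primitives
    for i, o in enumerate(out):
        print(i, o.shape)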