From aa48cdf68f2887a0aac95ed2070d90470962dc4e Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Wed, 13 Apr 2016 22:15:15 -0700 Subject: [PATCH 1/6] ARROW-100: [C++] Computing RowBatch size Implement RowBatchWriter::DataHeaderSize and arrow::ipc::GetRowBatchSize. To achieve this, the Flatbuffer metadata is written to a temporary buffer and its size is determined. This commit also adds MockMemorySource, a new MemorySource that tracks the amount of memory written. Author: Philipp Moritz --- cpp/src/arrow/ipc/adapter.cc | 26 ++++++++++++++++++++++---- cpp/src/arrow/ipc/adapter.h | 2 +- cpp/src/arrow/ipc/ipc-adapter-test.cc | 8 ++++++++ cpp/src/arrow/ipc/memory.cc | 24 ++++++++++++++++++++++++ cpp/src/arrow/ipc/memory.h | 20 ++++++++++++++++++++ 5 files changed, 75 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 2f72c3aa846..27b2dc66ae7 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -149,10 +149,19 @@ class RowBatchWriter { } // This must be called after invoking AssemblePayload - int64_t DataHeaderSize() { - // TODO(wesm): In case it is needed, compute the upper bound for the size - // of the buffer containing the flatbuffer data header. - return 0; + Status DataHeaderSize(int64_t* size) { + // emulates the behavior of Write without actually writing + int64_t offset = 0; + for (size_t i = 0; i < buffers_.size(); ++i) { + const Buffer* buffer = buffers_[i].get(); + offset += buffer->size(); + buffer_meta_.push_back(flatbuf::Buffer(0, 0, 0)); + } + std::shared_ptr data_header; + RETURN_NOT_OK(WriteDataHeader( + batch_->num_rows(), offset, field_nodes_, buffer_meta_, &data_header)); + *size = data_header->size(); + return Status::OK(); } // Total footprint of buffers. This must be called after invoking @@ -179,6 +188,15 @@ Status WriteRowBatch( RETURN_NOT_OK(serializer.AssemblePayload()); return serializer.Write(dst, position, header_offset); } + +Status GetRowBatchSize(const RowBatch* batch, int64_t* size) { + RowBatchWriter serializer(batch); + RETURN_NOT_OK(serializer.AssemblePayload()); + RETURN_NOT_OK(serializer.DataHeaderSize(size)); + *size += serializer.TotalBytes(); + return Status::OK(); +} + // ---------------------------------------------------------------------- // Row batch read path diff --git a/cpp/src/arrow/ipc/adapter.h b/cpp/src/arrow/ipc/adapter.h index d453fa05f49..254293b87db 100644 --- a/cpp/src/arrow/ipc/adapter.h +++ b/cpp/src/arrow/ipc/adapter.h @@ -60,7 +60,7 @@ Status WriteRowBatch( // Compute the precise number of bytes needed in a contiguous memory segment to // write the row batch. This involves generating the complete serialized // Flatbuffers metadata. -int64_t GetRowBatchSize(const RowBatch* batch); +Status GetRowBatchSize(const RowBatch* batch, int64_t* size); // ---------------------------------------------------------------------- // "Read" path; does not copy data if the MemorySource does not diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index fbdda77e491..9450279ab48 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -91,6 +91,10 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { int64_t header_location; ASSERT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); + MockMemorySource mock_source(1 << 16); + int64_t mock_header_location; + ASSERT_OK(WriteRowBatch(&mock_source, &batch, 0, &mock_header_location)); + std::shared_ptr result; ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &result)); @@ -98,6 +102,10 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { ASSERT_OK(result->GetRowBatch(schema, &batch_result)); EXPECT_EQ(batch.num_rows(), batch_result->num_rows()); + int64_t size; + ASSERT_OK(GetRowBatchSize(batch_result.get(), &size)); + EXPECT_EQ(mock_source.Position(), size); + for (int i = 0; i < batch.num_columns(); ++i) { EXPECT_TRUE(batch.column(i)->Equals(batch_result->column(i))) << i << batch.column_name(i); diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index 2b077e97929..2e411b71f4c 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -144,5 +144,29 @@ Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, int64_t return Status::OK(); } +MockMemorySource::MockMemorySource(int64_t size) : size_(size) {} + +Status MockMemorySource::Close() { + return Status::OK(); +} + +Status MockMemorySource::ReadAt( + int64_t position, int64_t nbytes, std::shared_ptr* out) { + return Status::OK(); +} + +Status MockMemorySource::Write(int64_t position, const uint8_t* data, int64_t nbytes) { + pos_ = position + nbytes; + return Status::OK(); +} + +int64_t MockMemorySource::Size() const { + return size_; +} + +int64_t MockMemorySource::Position() const { + return pos_; +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h index e529603dc6e..0a04bea95e3 100644 --- a/cpp/src/arrow/ipc/memory.h +++ b/cpp/src/arrow/ipc/memory.h @@ -121,6 +121,26 @@ class MemoryMappedSource : public MemorySource { std::unique_ptr impl_; }; +// A MemorySource that tracks the size of allocations from a memory source +class MockMemorySource : public MemorySource { + public: + explicit MockMemorySource(int64_t size); + + Status Close() override; + + Status ReadAt(int64_t position, int64_t nbytes, std::shared_ptr* out) override; + + Status Write(int64_t position, const uint8_t* data, int64_t nbytes) override; + + int64_t Size() const override; + + int64_t Position() const; + + private: + int64_t size_; + int64_t pos_; +}; + } // namespace ipc } // namespace arrow From 9b69f12d29f09aec581929b7b09cf2bbeb0cfd7d Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Mon, 18 Apr 2016 01:07:15 -0700 Subject: [PATCH 2/6] factor out GetRowBatchSize test, use MockMemorySource to implement GetRowBatchSize, unify DataHeaderSize and TotalBytes into GetTotalSize --- cpp/src/arrow/ipc/adapter.cc | 29 ++++++------------------- cpp/src/arrow/ipc/ipc-adapter-test.cc | 31 ++++++++++++++++++++------- cpp/src/arrow/ipc/memory.cc | 4 ++-- 3 files changed, 31 insertions(+), 33 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 27b2dc66ae7..fcfc98d56a1 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -149,31 +149,15 @@ class RowBatchWriter { } // This must be called after invoking AssemblePayload - Status DataHeaderSize(int64_t* size) { + Status GetTotalSize(int64_t* size) { // emulates the behavior of Write without actually writing - int64_t offset = 0; - for (size_t i = 0; i < buffers_.size(); ++i) { - const Buffer* buffer = buffers_[i].get(); - offset += buffer->size(); - buffer_meta_.push_back(flatbuf::Buffer(0, 0, 0)); - } - std::shared_ptr data_header; - RETURN_NOT_OK(WriteDataHeader( - batch_->num_rows(), offset, field_nodes_, buffer_meta_, &data_header)); - *size = data_header->size(); + int64_t data_header_offset; + MockMemorySource source(0); + RETURN_NOT_OK(Write(&source, 0, &data_header_offset)); + *size = source.Position(); return Status::OK(); } - // Total footprint of buffers. This must be called after invoking - // AssemblePayload - int64_t TotalBytes() { - int64_t total = 0; - for (const std::shared_ptr& buffer : buffers_) { - total += buffer->size(); - } - return total; - } - private: const RowBatch* batch_; @@ -192,8 +176,7 @@ Status WriteRowBatch( Status GetRowBatchSize(const RowBatch* batch, int64_t* size) { RowBatchWriter serializer(batch); RETURN_NOT_OK(serializer.AssemblePayload()); - RETURN_NOT_OK(serializer.DataHeaderSize(size)); - *size += serializer.TotalBytes(); + RETURN_NOT_OK(serializer.GetTotalSize(size)); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index 9450279ab48..277fdc61c29 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -58,6 +58,29 @@ class TestWriteRowBatch : public ::testing::Test, public MemoryMapFixture { const auto INT32 = std::make_shared(); +TEST_F(TestWriteRowBatch, IntegerGetRowBatchSize) { + const int length = 1000; + + auto f0 = std::make_shared("f0", INT32); + std::shared_ptr schema(new Schema({f0})); + + auto data = std::make_shared(pool_); + ASSERT_OK(data->Resize(length * sizeof(int32_t))); + test::rand_uniform_int(length, 0, 0, std::numeric_limits::max(), + reinterpret_cast(data->mutable_data())); + auto a0 = std::make_shared(length, data); + + RowBatch batch(schema, length, {a0}); + + MockMemorySource mock_source(1 << 16); + int64_t mock_header_location; + ASSERT_OK(WriteRowBatch(&mock_source, &batch, 0, &mock_header_location)); + + int64_t size; + ASSERT_OK(GetRowBatchSize(&batch, &size)); + EXPECT_EQ(mock_source.Position(), size); +} + TEST_F(TestWriteRowBatch, IntegerRoundTrip) { const int length = 1000; @@ -91,10 +114,6 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { int64_t header_location; ASSERT_OK(WriteRowBatch(mmap_.get(), &batch, 0, &header_location)); - MockMemorySource mock_source(1 << 16); - int64_t mock_header_location; - ASSERT_OK(WriteRowBatch(&mock_source, &batch, 0, &mock_header_location)); - std::shared_ptr result; ASSERT_OK(RowBatchReader::Open(mmap_.get(), header_location, &result)); @@ -102,10 +121,6 @@ TEST_F(TestWriteRowBatch, IntegerRoundTrip) { ASSERT_OK(result->GetRowBatch(schema, &batch_result)); EXPECT_EQ(batch.num_rows(), batch_result->num_rows()); - int64_t size; - ASSERT_OK(GetRowBatchSize(batch_result.get(), &size)); - EXPECT_EQ(mock_source.Position(), size); - for (int i = 0; i < batch.num_columns(); ++i) { EXPECT_TRUE(batch.column(i)->Equals(batch_result->column(i))) << i << batch.column_name(i); diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index 2e411b71f4c..55cb0e49b4a 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -144,7 +144,7 @@ Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, int64_t return Status::OK(); } -MockMemorySource::MockMemorySource(int64_t size) : size_(size) {} +MockMemorySource::MockMemorySource(int64_t size) : size_(size), pos_(0) {} Status MockMemorySource::Close() { return Status::OK(); @@ -156,7 +156,7 @@ Status MockMemorySource::ReadAt( } Status MockMemorySource::Write(int64_t position, const uint8_t* data, int64_t nbytes) { - pos_ = position + nbytes; + pos_ = std::max(pos_, position + nbytes); return Status::OK(); } From 6b798f8975b92ffb7853a014bc87e6708ee3cf4a Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 19 Apr 2016 15:02:11 -0700 Subject: [PATCH 3/6] fix maximum recursion depth --- cpp/src/arrow/ipc/adapter.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 068fa8acf02..e692dbc96d5 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -206,7 +206,7 @@ Status WriteRowBatch(MemorySource* dst, const RowBatch* batch, int64_t position, } Status GetRowBatchSize(const RowBatch* batch, int64_t* size) { - RowBatchWriter serializer(batch); + RowBatchWriter serializer(batch, kMaxIpcRecursionDepth); RETURN_NOT_OK(serializer.AssemblePayload()); RETURN_NOT_OK(serializer.GetTotalSize(size)); return Status::OK(); From 348445856cf4e1860206735ed1f690762e260635 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Tue, 19 Apr 2016 21:54:27 -0700 Subject: [PATCH 4/6] add tests for more datatypes --- cpp/src/arrow/ipc/ipc-adapter-test.cc | 51 +++++++++++++++------------ 1 file changed, 28 insertions(+), 23 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index 9eb2e8e0ee9..eaae89ae329 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -88,29 +88,6 @@ TEST_P(TestWriteRowBatch, RoundTrip) { } } -TEST_F(TestWriteRowBatch, IntegerGetRowBatchSize) { - const int length = 1000; - - auto f0 = std::make_shared("f0", INT32); - std::shared_ptr schema(new Schema({f0})); - - auto data = std::make_shared(pool_); - ASSERT_OK(data->Resize(length * sizeof(int32_t))); - test::rand_uniform_int(length, 0, 0, std::numeric_limits::max(), - reinterpret_cast(data->mutable_data())); - auto a0 = std::make_shared(length, data); - - RowBatch batch(schema, length, {a0}); - - MockMemorySource mock_source(1 << 16); - int64_t mock_header_location; - ASSERT_OK(WriteRowBatch(&mock_source, &batch, 0, &mock_header_location)); - - int64_t size; - ASSERT_OK(GetRowBatchSize(&batch, &size)); - EXPECT_EQ(mock_source.Position(), size); -} - Status MakeIntRowBatch(std::shared_ptr* out) { const int length = 1000; @@ -218,6 +195,34 @@ INSTANTIATE_TEST_CASE_P(RoundTripTests, TestWriteRowBatch, ::testing::Values(&MakeIntRowBatch, &MakeListRowBatch, &MakeNonNullRowBatch, &MakeZeroLengthRowBatch, &MakeDeeplyNestedList)); +void TestGetRowBatchSize(std::shared_ptr batch) { + MockMemorySource mock_source(1 << 16); + int64_t mock_header_location; + int64_t size; + ASSERT_OK(WriteRowBatch(&mock_source, batch.get(), 0, &mock_header_location)); + ASSERT_OK(GetRowBatchSize(batch.get(), &size)); + ASSERT_EQ(mock_source.Position(), size); +} + +TEST_F(TestWriteRowBatch, IntegerGetRowBatchSize) { + std::shared_ptr batch; + + ASSERT_OK(MakeIntRowBatch(&batch)); + TestGetRowBatchSize(batch); + + ASSERT_OK(MakeListRowBatch(&batch)); + TestGetRowBatchSize(batch); + + ASSERT_OK(MakeZeroLengthRowBatch(&batch)); + TestGetRowBatchSize(batch); + + ASSERT_OK(MakeNonNullRowBatch(&batch)); + TestGetRowBatchSize(batch); + + ASSERT_OK(MakeDeeplyNestedList(&batch)); + TestGetRowBatchSize(batch); +} + class RecursionLimits : public ::testing::Test, public MemoryMapFixture { public: void SetUp() { pool_ = default_memory_pool(); } From 253c9f067f9ca9ee07e055b722e5ddfa1aab3d72 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Fri, 22 Apr 2016 17:37:38 -0700 Subject: [PATCH 5/6] rename MockMemorySource methods to reflect better what they are doing --- cpp/src/arrow/ipc/adapter.cc | 2 +- cpp/src/arrow/ipc/ipc-adapter-test.cc | 2 +- cpp/src/arrow/ipc/memory.cc | 8 ++++---- cpp/src/arrow/ipc/memory.h | 5 +++-- 4 files changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index e692dbc96d5..34700080746 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -184,7 +184,7 @@ class RowBatchWriter { int64_t data_header_offset; MockMemorySource source(0); RETURN_NOT_OK(Write(&source, 0, &data_header_offset)); - *size = source.Position(); + *size = source.GetExtentBytesWritten(); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/ipc-adapter-test.cc b/cpp/src/arrow/ipc/ipc-adapter-test.cc index eaae89ae329..3b147343f77 100644 --- a/cpp/src/arrow/ipc/ipc-adapter-test.cc +++ b/cpp/src/arrow/ipc/ipc-adapter-test.cc @@ -201,7 +201,7 @@ void TestGetRowBatchSize(std::shared_ptr batch) { int64_t size; ASSERT_OK(WriteRowBatch(&mock_source, batch.get(), 0, &mock_header_location)); ASSERT_OK(GetRowBatchSize(batch.get(), &size)); - ASSERT_EQ(mock_source.Position(), size); + ASSERT_EQ(mock_source.GetExtentBytesWritten(), size); } TEST_F(TestWriteRowBatch, IntegerGetRowBatchSize) { diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index 74d7648c1c5..a519e98675d 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -145,7 +145,7 @@ Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, int64_t return Status::OK(); } -MockMemorySource::MockMemorySource(int64_t size) : size_(size), pos_(0) {} +MockMemorySource::MockMemorySource(int64_t size) : size_(size), extent_bytes_written_(0) {} Status MockMemorySource::Close() { return Status::OK(); @@ -157,7 +157,7 @@ Status MockMemorySource::ReadAt( } Status MockMemorySource::Write(int64_t position, const uint8_t* data, int64_t nbytes) { - pos_ = std::max(pos_, position + nbytes); + extent_bytes_written_ = std::max(extent_bytes_written_, position + nbytes); return Status::OK(); } @@ -165,8 +165,8 @@ int64_t MockMemorySource::Size() const { return size_; } -int64_t MockMemorySource::Position() const { - return pos_; +int64_t MockMemorySource::GetExtentBytesWritten() const { + return extent_bytes_written_; } } // namespace ipc diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h index 0a04bea95e3..627919fb91d 100644 --- a/cpp/src/arrow/ipc/memory.h +++ b/cpp/src/arrow/ipc/memory.h @@ -134,11 +134,12 @@ class MockMemorySource : public MemorySource { int64_t Size() const override; - int64_t Position() const; + // @return: the smallest number of bytes containing the modified region of the MockMemorySource + int64_t GetExtentBytesWritten() const; private: int64_t size_; - int64_t pos_; + int64_t extent_bytes_written_; }; } // namespace ipc From e95fc5cb554a14706e18bf36d2097d0624019e22 Mon Sep 17 00:00:00 2001 From: Philipp Moritz Date: Sat, 23 Apr 2016 00:55:49 +0000 Subject: [PATCH 6/6] fix formating --- cpp/src/arrow/ipc/memory.cc | 3 ++- cpp/src/arrow/ipc/memory.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/src/arrow/ipc/memory.cc b/cpp/src/arrow/ipc/memory.cc index a519e98675d..caff2c610b9 100644 --- a/cpp/src/arrow/ipc/memory.cc +++ b/cpp/src/arrow/ipc/memory.cc @@ -145,7 +145,8 @@ Status MemoryMappedSource::Write(int64_t position, const uint8_t* data, int64_t return Status::OK(); } -MockMemorySource::MockMemorySource(int64_t size) : size_(size), extent_bytes_written_(0) {} +MockMemorySource::MockMemorySource(int64_t size) + : size_(size), extent_bytes_written_(0) {} Status MockMemorySource::Close() { return Status::OK(); diff --git a/cpp/src/arrow/ipc/memory.h b/cpp/src/arrow/ipc/memory.h index 627919fb91d..c6fd7a71899 100644 --- a/cpp/src/arrow/ipc/memory.h +++ b/cpp/src/arrow/ipc/memory.h @@ -134,7 +134,8 @@ class MockMemorySource : public MemorySource { int64_t Size() const override; - // @return: the smallest number of bytes containing the modified region of the MockMemorySource + // @return: the smallest number of bytes containing the modified region of the + // MockMemorySource int64_t GetExtentBytesWritten() const; private: