From 0463995eae95c6542c232b2c76bd28c8a4ae8709 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 29 May 2016 10:50:35 +0200 Subject: [PATCH 01/28] ARROW-203: Python: Basic filename based Parquet read/write --- cpp/src/arrow/column.h | 2 + cpp/src/arrow/parquet/parquet-io-test.cc | 144 +++++++++++++++++++++-- cpp/src/arrow/parquet/reader.cc | 26 ++++ cpp/src/arrow/parquet/reader.h | 3 + cpp/src/arrow/parquet/writer.cc | 86 ++++++++++++++ cpp/src/arrow/parquet/writer.h | 9 ++ python/CMakeLists.txt | 5 + python/cmake_modules/FindArrow.cmake | 14 ++- python/pyarrow/array.pyx | 8 ++ python/pyarrow/error.pxd | 2 + python/pyarrow/error.pyx | 7 ++ python/pyarrow/includes/common.pxd | 5 + python/pyarrow/includes/libarrow.pxd | 3 + python/pyarrow/includes/parquet.pxd | 46 ++++++++ python/pyarrow/parquet.pyx | 41 ++++++- python/pyarrow/tests/test_parquet.py | 50 ++++++++ python/setup.py | 2 +- python/src/pyarrow/parquet.cc | 30 +++++ python/src/pyarrow/parquet.h | 30 +++++ 19 files changed, 502 insertions(+), 11 deletions(-) create mode 100644 python/pyarrow/tests/test_parquet.py create mode 100644 python/src/pyarrow/parquet.cc create mode 100644 python/src/pyarrow/parquet.h diff --git a/cpp/src/arrow/column.h b/cpp/src/arrow/column.h index 22becc34547..e409566e1f1 100644 --- a/cpp/src/arrow/column.h +++ b/cpp/src/arrow/column.h @@ -67,6 +67,8 @@ class Column { int64_t null_count() const { return data_->null_count(); } + const std::shared_ptr& field() const { return field_; } + // @returns: the column's name in the passed metadata const std::string& name() const { return field_->name; } diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 845574d2c53..218e8916a6b 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -69,11 +69,26 @@ std::shared_ptr NullableArray( return std::static_pointer_cast(builder.Finish()); } +std::shared_ptr MakeColumn(const std::string& name, + const std::shared_ptr& array, bool nullable) { + auto field = std::make_shared(name, array->type(), nullable); + return std::make_shared(field, array); +} + +std::shared_ptr MakeSimpleTable( + const std::shared_ptr& values, bool nullable) { + std::shared_ptr column = MakeColumn("col", values, nullable); + std::vector> columns({column}); + std::vector> fields({column->field()}); + auto schema = std::make_shared(fields); + return std::make_shared
("table", schema, columns); +} + class TestParquetIO : public ::testing::Test { public: virtual void SetUp() {} - std::shared_ptr Schema( + std::shared_ptr MakeSchema( ParquetType::type parquet_type, Repetition::type repetition) { auto pnode = PrimitiveNode::Make("column1", repetition, parquet_type); NodePtr node_ = @@ -102,9 +117,17 @@ class TestParquetIO : public ::testing::Test { ASSERT_NE(nullptr, out->get()); } + void ReadTableFromFile( + std::unique_ptr file_reader, std::shared_ptr
* out) { + arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); + ASSERT_NO_THROW(ASSERT_OK(reader.ReadFlatTable(out))); + ASSERT_NE(nullptr, out->get()); + } + std::unique_ptr Int64File( std::vector& values, int num_chunks) { - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); + std::shared_ptr schema = + MakeSchema(ParquetType::INT64, Repetition::REQUIRED); std::unique_ptr file_writer = MakeWriter(schema); size_t chunk_size = values.size() / num_chunks; for (int i = 0; i < num_chunks; i++) { @@ -120,7 +143,6 @@ class TestParquetIO : public ::testing::Test { return ReaderFromSink(); } - private: std::shared_ptr sink_; }; @@ -137,6 +159,23 @@ TEST_F(TestParquetIO, SingleColumnInt64Read) { } } +TEST_F(TestParquetIO, SingleColumnInt64TableRead) { + std::vector values(100, 128); + std::unique_ptr file_reader = Int64File(values, 1); + + std::shared_ptr
out; + ReadTableFromFile(std::move(file_reader), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + Int64Array* out_array = static_cast(chunked_array->chunk(0).get()); + for (size_t i = 0; i < values.size(); i++) { + EXPECT_EQ(values[i], out_array->raw_data()[i]); + } +} + TEST_F(TestParquetIO, SingleColumnInt64ChunkedRead) { std::vector values(100, 128); std::unique_ptr file_reader = Int64File(values, 4); @@ -150,10 +189,28 @@ TEST_F(TestParquetIO, SingleColumnInt64ChunkedRead) { } } +TEST_F(TestParquetIO, SingleColumnInt64ChunkedTableRead) { + std::vector values(100, 128); + std::unique_ptr file_reader = Int64File(values, 4); + + std::shared_ptr
out; + ReadTableFromFile(std::move(file_reader), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + Int64Array* out_array = static_cast(chunked_array->chunk(0).get()); + for (size_t i = 0; i < values.size(); i++) { + EXPECT_EQ(values[i], out_array->raw_data()[i]); + } +} + TEST_F(TestParquetIO, SingleColumnInt64Write) { std::shared_ptr values = NonNullArray(100, 128); - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); + std::shared_ptr schema = + MakeSchema(ParquetType::INT64, Repetition::REQUIRED); FileWriter writer(default_memory_pool(), MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); @@ -164,11 +221,29 @@ TEST_F(TestParquetIO, SingleColumnInt64Write) { ASSERT_TRUE(values->Equals(out)); } +TEST_F(TestParquetIO, SingleColumnTableInt64Write) { + std::shared_ptr values = NonNullArray(100, 128); + std::shared_ptr
<Table> table = MakeSimpleTable(values, false);
+  sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(ASSERT_OK(
+      WriteFlatTable(table.get(), default_memory_pool(), sink_, values->length())));
+
+  std::shared_ptr<Table>
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + TEST_F(TestParquetIO, SingleColumnDoubleReadWrite) { // This also tests max_definition_level = 1 std::shared_ptr values = NullableArray(100, 128, 10); - std::shared_ptr schema = Schema(ParquetType::DOUBLE, Repetition::OPTIONAL); + std::shared_ptr schema = + MakeSchema(ParquetType::DOUBLE, Repetition::OPTIONAL); FileWriter writer(default_memory_pool(), MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); @@ -179,11 +254,30 @@ TEST_F(TestParquetIO, SingleColumnDoubleReadWrite) { ASSERT_TRUE(values->Equals(out)); } +TEST_F(TestParquetIO, SingleColumnTableDoubleReadWrite) { + // This also tests max_definition_level = 1 + std::shared_ptr values = NullableArray(100, 128, 10); + std::shared_ptr
<Table> table = MakeSimpleTable(values, true);
+  sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(ASSERT_OK(
+      WriteFlatTable(table.get(), default_memory_pool(), sink_, values->length())));
+
+  std::shared_ptr<Table>
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { std::shared_ptr values = NonNullArray(100, 128); std::shared_ptr values_chunk = NonNullArray(25, 128); - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); + std::shared_ptr schema = + MakeSchema(ParquetType::INT64, Repetition::REQUIRED); FileWriter writer(default_memory_pool(), MakeWriter(schema)); for (int i = 0; i < 4; i++) { ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length()))); @@ -196,13 +290,31 @@ TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { ASSERT_TRUE(values->Equals(out)); } +TEST_F(TestParquetIO, SingleColumnTableInt64ChunkedWrite) { + std::shared_ptr values = NonNullArray(1000, 128); + std::shared_ptr
<Table> table = MakeSimpleTable(values, false);
+  sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(
+      ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), sink_, 512)));
+
+  std::shared_ptr<Table>
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(1000, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { std::shared_ptr values = NullableArray(100, 128, 10); std::shared_ptr values_chunk_nulls = NullableArray(25, 128, 10); std::shared_ptr values_chunk = NullableArray(25, 128, 0); - std::shared_ptr schema = Schema(ParquetType::DOUBLE, Repetition::OPTIONAL); + std::shared_ptr schema = + MakeSchema(ParquetType::DOUBLE, Repetition::OPTIONAL); FileWriter writer(default_memory_pool(), MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk_nulls->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk_nulls.get()))); @@ -217,6 +329,24 @@ TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { ASSERT_TRUE(values->Equals(out)); } +TEST_F(TestParquetIO, SingleColumnTableDoubleChunkedWrite) { + // This also tests max_definition_level = 1 + std::shared_ptr values = NullableArray(1000, 128, 100); + std::shared_ptr
<Table> table = MakeSimpleTable(values, true);
+  sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(
+      ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), sink_, 512)));
+
+  std::shared_ptr<Table>
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(1000, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 346de253606..1b97aead159 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -18,10 +18,14 @@ #include "arrow/parquet/reader.h" #include +#include +#include +#include "arrow/column.h" #include "arrow/parquet/schema.h" #include "arrow/parquet/utils.h" #include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/types/primitive.h" #include "arrow/util/status.h" @@ -40,6 +44,7 @@ class FileReader::Impl { bool CheckForFlatColumn(const ::parquet::ColumnDescriptor* descr); Status GetFlatColumn(int i, std::unique_ptr* out); Status ReadFlatColumn(int i, std::shared_ptr* out); + Status ReadFlatTable(std::shared_ptr
<Table>* out);
 
  private:
   MemoryPool* pool_;
@@ -103,6 +108,23 @@ Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr<Array>* out) {
   return flat_column_reader->NextBatch(reader_->num_rows(), out);
 }
 
+Status FileReader::Impl::ReadFlatTable(std::shared_ptr
<Table>* table) {
+  const std::string name = reader_->descr()->schema()->name();
+  std::shared_ptr<Schema> schema;
+  RETURN_NOT_OK(FromParquetSchema(reader_->descr(), &schema));
+
+  std::vector<std::shared_ptr<Column>> columns;
+  for (int i = 0; i < reader_->num_columns(); i++) {
+    std::shared_ptr<Array> array;
+    RETURN_NOT_OK(ReadFlatColumn(i, &array));
+    auto column = std::make_shared<Column>(schema->field(i), array);
+    columns.push_back(column);
+  }
+
+  *table = std::make_shared
<Table>(name, schema, columns);
+  return Status::OK();
+}
+
 FileReader::FileReader(
     MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader)
     : impl_(new FileReader::Impl(pool, std::move(reader))) {}
@@ -117,6 +139,10 @@ Status FileReader::ReadFlatColumn(int i, std::shared_ptr<Array>* out) {
   return impl_->ReadFlatColumn(i, out);
 }
 
+Status FileReader::ReadFlatTable(std::shared_ptr<Table>
* out) { + return impl_->ReadFlatTable(out); +} + FlatColumnReader::Impl::Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr, ::parquet::ParquetFileReader* reader, int column_index) : pool_(pool), diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h index 41ca7eb35b9..db7a15753d8 100644 --- a/cpp/src/arrow/parquet/reader.h +++ b/cpp/src/arrow/parquet/reader.h @@ -29,6 +29,7 @@ class Array; class MemoryPool; class RowBatch; class Status; +class Table; namespace parquet { @@ -90,6 +91,8 @@ class FileReader { Status GetFlatColumn(int i, std::unique_ptr* out); // Read column as a whole into an Array. Status ReadFlatColumn(int i, std::shared_ptr* out); + // Read a table of flat columns into a Table. + Status ReadFlatTable(std::shared_ptr
* out); virtual ~FileReader(); diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 3ad2c5b0735..e94ff1973bb 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -17,11 +17,21 @@ #include "arrow/parquet/writer.h" +#include +#include + #include "arrow/array.h" +#include "arrow/column.h" +#include "arrow/table.h" +#include "arrow/types/construct.h" #include "arrow/types/primitive.h" +#include "arrow/parquet/schema.h" #include "arrow/parquet/utils.h" #include "arrow/util/status.h" +using parquet::ParquetFileWriter; +using parquet::schema::GroupNode; + namespace arrow { namespace parquet { @@ -143,6 +153,82 @@ Status FileWriter::Close() { FileWriter::~FileWriter() {} +// Create a slice of a PrimitiveArray. +// +// This method is specially crafted for WriteFlatTable and assumes the following: +// * chunk_size is a multiple of 512 +Status TemporaryArraySlice(int64_t chunk, int64_t chunk_size, PrimitiveArray* array, + std::shared_ptr* out) { + // The last chunk may be smaller than the chunk_size + int64_t size = std::min(chunk_size, array->length() - chunk * chunk_size); + int64_t buffer_offset = chunk * chunk_size * array->type()->value_size(); + int64_t value_size = size * array->type()->value_size(); + auto chunk_buffer = std::make_shared(array->data(), buffer_offset, value_size); + std::shared_ptr null_bitmap; + int32_t null_count = 0; + if (array->null_count() > 0) { + int64_t null_offset = (chunk * chunk_size) / 8; + int64_t null_size = util::ceil_byte(size) / 8; + null_bitmap = std::make_shared(array->null_bitmap(), null_offset, null_size); + for (int64_t k = 0; k < size; k++) { + if (!util::get_bit(null_bitmap->data(), k)) { null_count++; } + } + } + std::shared_ptr out_array; + RETURN_NOT_OK(MakePrimitiveArray( + array->type(), size, chunk_buffer, null_count, null_bitmap, &out_array)); + *out = std::static_pointer_cast(out_array); + return Status::OK(); +} + +Status WriteFlatTable(const Table* table, MemoryPool* pool, + std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size) { + // Ensure alignment of sliced PrimitiveArray, esp. the null bitmap + // TODO: Support other chunksizes than multiples of 512 + if (((chunk_size & 511) != 0) && (chunk_size != table->num_rows())) { + return Status::NotImplemented( + "Only chunk sizes that are a multiple of 512 are supported"); + } + + std::shared_ptr<::parquet::SchemaDescriptor> parquet_schema; + RETURN_NOT_OK(ToParquetSchema(table->schema().get(), &parquet_schema)); + auto schema_node = std::static_pointer_cast(parquet_schema->schema()); + std::unique_ptr parquet_writer = + ParquetFileWriter::Open(sink, schema_node); + FileWriter writer(pool, std::move(parquet_writer)); + + // TODO: Support writing chunked arrays. + for (int i = 0; i < table->num_columns(); i++) { + if (table->column(i)->data()->num_chunks() != 1) { + return Status::NotImplemented("No support for writing chunked arrays yet."); + } + } + + // Cast to PrimitiveArray instances as we work with them. 
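For intuition about the multiple-of-512 restriction enforced above: each row group is written from a slice of the column, which needs a byte offset into the value buffer and into the validity bitmap. The bitmap stores one bit per row, so a slice can only start cleanly on a whole byte (and 512 rows keeps slices on a 64-byte boundary, presumably matching Arrow's usual buffer alignment). The standalone sketch below of that arithmetic is purely illustrative and not part of the patch:

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t chunk_size = 512;  // rows per Parquet row group
      const int64_t value_width = 8;   // bytes per value, e.g. int64 or double
      for (int64_t chunk = 0; chunk < 3; ++chunk) {
        int64_t rows_before = chunk * chunk_size;
        // Offset into the value buffer is always an exact number of bytes.
        int64_t buffer_offset = rows_before * value_width;
        // The null bitmap holds one bit per row; this division is only exact
        // when rows_before is a multiple of 8, which a chunk_size that is a
        // multiple of 512 guarantees.
        int64_t bitmap_offset = rows_before / 8;
        std::cout << "chunk " << chunk << ": values at byte " << buffer_offset
                  << ", bitmap at byte " << bitmap_offset
                  << " (byte-aligned: " << (rows_before % 8 == 0 ? "yes" : "no")
                  << ")\n";
      }
      return 0;
    }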
+ std::vector> arrays(table->num_columns()); + for (int i = 0; i < table->num_columns(); i++) { + // num_chunks == 1 as per above loop + std::shared_ptr array = table->column(i)->data()->chunk(0); + auto primitive_array = std::dynamic_pointer_cast(array); + if (!primitive_array) { + return Status::NotImplemented("Table must consist of PrimitiveArray instances"); + } + arrays[i] = primitive_array; + } + + for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { + int64_t size = std::min(chunk_size, table->num_rows() - chunk * chunk_size); + RETURN_NOT_OK(writer.NewRowGroup(size)); + for (int i = 0; i < table->num_columns(); i++) { + std::shared_ptr array; + RETURN_NOT_OK(TemporaryArraySlice(chunk, chunk_size, arrays[i].get(), &array)); + RETURN_NOT_OK(writer.WriteFlatColumnChunk(array.get())); + } + } + + return writer.Close(); +} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 38f7d0b3a89..64625021c11 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -29,6 +29,7 @@ class MemoryPool; class PrimitiveArray; class RowBatch; class Status; +class Table; namespace parquet { @@ -52,6 +53,14 @@ class FileWriter { std::unique_ptr impl_; }; +/** + * Write a flat Table to Parquet. + * + * The table shall only consist of nullable, non-repeated columns of primitive type. + */ +Status WriteFlatTable(const Table* table, MemoryPool* pool, + std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size); + } // namespace parquet } // namespace arrow diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 2173232d4ef..30c37054aaa 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -344,6 +344,8 @@ find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) ADD_THIRDPARTY_LIB(arrow SHARED_LIB ${ARROW_SHARED_LIB}) +ADD_THIRDPARTY_LIB(arrow_parquet + SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) ############################################################ # Linker setup @@ -414,6 +416,7 @@ set(PYARROW_SRCS src/pyarrow/common.cc src/pyarrow/config.cc src/pyarrow/helpers.cc + src/pyarrow/parquet.cc src/pyarrow/status.cc src/pyarrow/adapters/builtin.cc @@ -422,6 +425,7 @@ set(PYARROW_SRCS set(LINK_LIBS arrow + arrow_parquet ) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) @@ -442,6 +446,7 @@ set(CYTHON_EXTENSIONS array config error + parquet scalar schema table diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 3d9983849eb..f0b258ed027 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -42,19 +42,27 @@ find_library(ARROW_LIB_PATH NAMES arrow ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) -if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) +find_library(ARROW_PARQUET_LIB_PATH NAMES arrow_parquet + PATHS + ${ARROW_SEARCH_LIB_PATH} + NO_DEFAULT_PATH) + +if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) + set(ARROW_PARQUET_LIB_NAME libarrow_parquet) set(ARROW_LIBS ${ARROW_SEARCH_LIB_PATH}) set(ARROW_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_LIB_NAME}.a) set(ARROW_SHARED_LIB ${ARROW_LIBS}/${ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) + set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) else () set(ARROW_FOUND FALSE) endif () if (ARROW_FOUND) if (NOT Arrow_FIND_QUIETLY) - 
message(STATUS "Found the Arrow library: ${ARROW_LIB_PATH}") + message(STATUS "Found the Arrow library: ${ARROW_LIB_PATH}, ${ARROW_PARQUET_LIB_PATH}") endif () else () if (NOT Arrow_FIND_QUIETLY) @@ -74,4 +82,6 @@ mark_as_advanced( ARROW_LIBS ARROW_STATIC_LIB ARROW_SHARED_LIB + ARROW_PARQUET_STATIC_LIB + ARROW_PARQUET_SHARED_LIB ) diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index a80b3ce8398..6d63c321931 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -68,6 +68,14 @@ cdef class Array: values = array_format(self, window=10) return '{0}\n{1}'.format(type_format, values) + def __richcmp__(Array self, Array other, int op): + if op == cpython.Py_EQ: + return self.ap.Equals(other.sp_array) + elif op == cpython.Py_NE: + return not self.ap.Equals(other.sp_array) + else: + raise TypeError('Invalid comparison') + def __len__(self): if self.sp_array.get(): return self.sp_array.get().length() diff --git a/python/pyarrow/error.pxd b/python/pyarrow/error.pxd index d226abeda04..97ba0ef2e9f 100644 --- a/python/pyarrow/error.pxd +++ b/python/pyarrow/error.pxd @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from pyarrow.includes.libarrow cimport CStatus from pyarrow.includes.pyarrow cimport * +cdef check_cstatus(const CStatus& status) cdef check_status(const Status& status) diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index 3f8d7dd6460..51e06101bf6 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -21,6 +21,13 @@ from pyarrow.compat import frombytes class ArrowException(Exception): pass +cdef check_cstatus(const CStatus& status): + if status.ok(): + return + + cdef c_string c_message = status.ToString() + raise ArrowException(frombytes(c_message)) + cdef check_status(const Status& status): if status.ok(): return diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index e86d5d77e8b..5c3df6401bc 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -34,6 +34,11 @@ cdef extern from "": cdef extern from "" namespace "std" nogil: + cdef cppclass unique_ptr[T]: + unique_ptr() + unique_ptr(T*) + T* get() + cdef cppclass shared_ptr[T]: shared_ptr() shared_ptr(T*) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b2ef45a347b..90414e3d542 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -72,6 +72,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass MemoryPool" arrow::MemoryPool": int64_t bytes_allocated() + cdef MemoryPool* default_memory_pool() + cdef cppclass CListType" arrow::ListType"(CDataType): CListType(const shared_ptr[CDataType]& value_type) @@ -103,6 +105,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int32_t null_count() Type type_enum() + c_bool Equals(const shared_ptr[CArray]& arr) c_bool IsNull(int i) cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray): diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index ffdc5d48706..0918344070e 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -18,6 +18,26 @@ # distutils: language = c++ from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport CSchema, CStatus, CTable, MemoryPool + + +cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: + cdef cppclass Node: + pass + + cdef cppclass GroupNode(Node): + pass + 
+ cdef cppclass PrimitiveNode(Node): + pass + +cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: + cdef cppclass SchemaDescriptor: + shared_ptr[Node] schema() + GroupNode* group() + + cdef cppclass ColumnDescriptor: + pass cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass ColumnReader: @@ -48,4 +68,30 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: pass cdef cppclass ParquetFileReader: + # TODO: Some default arguments are missing + @staticmethod + unique_ptr[ParquetFileReader] OpenFile(const c_string& path) + +cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: + cdef cppclass OutputStream: pass + + cdef cppclass LocalFileOutputStream(OutputStream): + LocalFileOutputStream(const c_string& path) + void Close() + + +cdef extern from "arrow/parquet/reader.h" namespace "arrow::parquet" nogil: + cdef cppclass FileReader: + FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader) + CStatus ReadFlatTable(shared_ptr[CTable]* out); + + +cdef extern from "arrow/parquet/schema.h" namespace "arrow::parquet" nogil: + CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema, shared_ptr[CSchema]* out) + CStatus ToParquetSchema(const CSchema* arrow_schema, shared_ptr[SchemaDescriptor]* out) + + +cdef extern from "arrow/parquet/writer.h" namespace "arrow::parquet" nogil: + cdef CStatus WriteFlatTable(const CTable* table, MemoryPool* pool, shared_ptr[OutputStream] sink, int64_t chunk_size) + diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 622e7d07724..076a353bb87 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -19,5 +19,44 @@ # distutils: language = c++ # cython: embedsignature = True -from pyarrow.compat import frombytes, tobytes +from pyarrow.includes.libarrow cimport * +cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.includes.parquet cimport * + +from pyarrow.error cimport check_cstatus +from pyarrow.table cimport Table + +def read_table(filename, columns=None): + """ + Read a Table from Parquet format + Returns + ------- + table: pyarrow.Table + """ + cdef unique_ptr[FileReader] reader + cdef Table table = Table() + cdef shared_ptr[CTable] ctable + + # Must be in one expression to avoid calling std::move which is not possible + # in Cython (due to missing rvalue support) + reader = unique_ptr[FileReader](new FileReader(default_memory_pool(), + ParquetFileReader.OpenFile(filename))) + check_cstatus(reader.get().ReadFlatTable(&ctable)) + table.init(ctable) + return table + +def write_table(table, filename, chunk_size=None): + """ + Write a Table to Parquet format + """ + cdef Table table_ = table + cdef CTable* ctable_ = table_.table + cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream](new LocalFileOutputStream(filename)) + cdef int64_t chunk_size_ = 0 + if chunk_size is None: + chunk_size_ = ctable_.num_rows() + else: + chunk_size_ = chunk_size + + check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, chunk_size_)) + diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py new file mode 100644 index 00000000000..a84fc2785cd --- /dev/null +++ b/python/pyarrow/tests/test_parquet.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow.compat import unittest +import pyarrow as arrow +import pyarrow.parquet + +A = arrow + +from shutil import rmtree +from tempfile import mkdtemp + +import os.path + + +class TestParquetIO(unittest.TestCase): + + def setUp(self): + self.temp_directory = mkdtemp() + + def tearDown(self): + rmtree(self.temp_directory) + + + def test_single_int64_column(self): + filename = os.path.join(self.temp_directory, 'single_int64_column.parquet') + data = [A.from_pylist(range(5))] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + A.parquet.write_table(table, filename) + table_read = pyarrow.parquet.read_table(filename) + for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + assert col_written.name == col_read.name + assert col_read.data.num_chunks == 1 + data_written = col_written.data.chunk(0) + data_read = col_read.data.chunk(0) + assert data_written == data_read diff --git a/python/setup.py b/python/setup.py index 5f228ed0af2..1c55198ec8a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -214,7 +214,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['array', 'config', 'error', 'scalar', 'schema', 'table'] + return ['array', 'config', 'error', 'parquet', 'scalar', 'schema', 'table'] def get_names(self): return self._found_names diff --git a/python/src/pyarrow/parquet.cc b/python/src/pyarrow/parquet.cc new file mode 100644 index 00000000000..2e524983d6f --- /dev/null +++ b/python/src/pyarrow/parquet.cc @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "pyarrow/parquet.h" + +#include +#include +#include + +using namespace arrow; + +namespace pyarrow { + + +} // namespace pyarrow + diff --git a/python/src/pyarrow/parquet.h b/python/src/pyarrow/parquet.h new file mode 100644 index 00000000000..8d1d79a4980 --- /dev/null +++ b/python/src/pyarrow/parquet.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_PARQUET_H +#define PYARROW_PARQUET_H + +#include +#include + +namespace pyarrow { + + +} // namespace pyarrow + +#endif // PYARROW_PARQUET_H + From 7192cfbd316835833f3489c52cba7132ff52a1db Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 30 May 2016 09:09:54 +0200 Subject: [PATCH 02/28] Add const to slicing parameters --- ci/travis_before_script_cpp.sh | 2 +- cpp/src/arrow/parquet/writer.cc | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 193c76feba1..bf9ec58dbd9 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -19,7 +19,7 @@ echo $GTEST_HOME : ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} -CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" +CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DARROW_PARQUET=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" if [ $TRAVIS_OS_NAME == "linux" ]; then cmake -DARROW_TEST_MEMCHECK=on $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index e94ff1973bb..742b8ba6852 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -157,12 +157,12 @@ FileWriter::~FileWriter() {} // // This method is specially crafted for WriteFlatTable and assumes the following: // * chunk_size is a multiple of 512 -Status TemporaryArraySlice(int64_t chunk, int64_t chunk_size, PrimitiveArray* array, +Status TemporaryArraySlice(int64_t chunk, int64_t chunk_size, const PrimitiveArray* array, std::shared_ptr* out) { // The last chunk may be smaller than the chunk_size - int64_t size = std::min(chunk_size, array->length() - chunk * chunk_size); - int64_t buffer_offset = chunk * chunk_size * array->type()->value_size(); - int64_t value_size = size * array->type()->value_size(); + const int64_t size = std::min(chunk_size, array->length() - chunk * chunk_size); + const int64_t buffer_offset = chunk * chunk_size * array->type()->value_size(); + const int64_t value_size = size * array->type()->value_size(); auto chunk_buffer = std::make_shared(array->data(), buffer_offset, value_size); std::shared_ptr null_bitmap; int32_t null_count = 0; From 081db5f31c80668ce44e615d09fe8b479208a25b Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Mon, 30 May 2016 09:10:06 +0200 Subject: [PATCH 03/28] Limit and document chunk_size --- python/pyarrow/parquet.pyx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 076a353bb87..78837a45988 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -48,13 +48,20 @@ def read_table(filename, columns=None): def write_table(table, filename, chunk_size=None): """ Write a Table to Parquet format + + Parameters + ---------- + table : pyarrow.Table + filename : string + chunk_size : int + The maximum number of rows in each Parquet RowGroup """ cdef Table table_ = table cdef CTable* ctable_ = table_.table cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream](new LocalFileOutputStream(filename)) cdef int64_t chunk_size_ = 0 if chunk_size is None: - chunk_size_ = ctable_.num_rows() + chunk_size_ = max(ctable_.num_rows(), int(2**16)) else: chunk_size_ = chunk_size From 0fbed3f2c583eab902b8fc6c32d2a4918a308b67 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 30 May 2016 09:11:40 +0200 Subject: [PATCH 04/28] Remove obsolete parquet files --- python/CMakeLists.txt | 1 - python/src/pyarrow/parquet.cc | 30 ------------------------------ python/src/pyarrow/parquet.h | 30 ------------------------------ 3 files changed, 61 deletions(-) delete mode 100644 python/src/pyarrow/parquet.cc delete mode 100644 python/src/pyarrow/parquet.h diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 30c37054aaa..7e013c07b31 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -416,7 +416,6 @@ set(PYARROW_SRCS src/pyarrow/common.cc src/pyarrow/config.cc src/pyarrow/helpers.cc - src/pyarrow/parquet.cc src/pyarrow/status.cc src/pyarrow/adapters/builtin.cc diff --git a/python/src/pyarrow/parquet.cc b/python/src/pyarrow/parquet.cc deleted file mode 100644 index 2e524983d6f..00000000000 --- a/python/src/pyarrow/parquet.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "pyarrow/parquet.h" - -#include -#include -#include - -using namespace arrow; - -namespace pyarrow { - - -} // namespace pyarrow - diff --git a/python/src/pyarrow/parquet.h b/python/src/pyarrow/parquet.h deleted file mode 100644 index 8d1d79a4980..00000000000 --- a/python/src/pyarrow/parquet.h +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PYARROW_PARQUET_H -#define PYARROW_PARQUET_H - -#include -#include - -namespace pyarrow { - - -} // namespace pyarrow - -#endif // PYARROW_PARQUET_H - From be6415c393a5f43d21bbce2d71fd908f81b9e526 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Tue, 31 May 2016 09:22:42 +0200 Subject: [PATCH 05/28] Incorportate review comments --- cpp/src/arrow/parquet/reader.cc | 7 +++---- python/pyarrow/tests/test_parquet.py | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 1b97aead159..3b4882d4439 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -109,16 +109,15 @@ Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr* out) { } Status FileReader::Impl::ReadFlatTable(std::shared_ptr
<Table>* table) {
-  const std::string name = reader_->descr()->schema()->name();
+  const std::string& name = reader_->descr()->schema()->name();
   std::shared_ptr<Schema> schema;
   RETURN_NOT_OK(FromParquetSchema(reader_->descr(), &schema));
 
-  std::vector<std::shared_ptr<Column>> columns;
+  std::vector<std::shared_ptr<Column>> columns(reader_->num_columns());
   for (int i = 0; i < reader_->num_columns(); i++) {
     std::shared_ptr<Array> array;
     RETURN_NOT_OK(ReadFlatColumn(i, &array));
-    auto column = std::make_shared<Column>(schema->field(i), array);
-    columns.push_back(column);
+    columns[i] = std::make_shared<Column>(schema->field(i), array);
   }
 
   *table = std::make_shared<Table>
(name, schema, columns); diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a84fc2785cd..99b2c7e09ec 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -35,7 +35,6 @@ def setUp(self): def tearDown(self): rmtree(self.temp_directory) - def test_single_int64_column(self): filename = os.path.join(self.temp_directory, 'single_int64_column.parquet') data = [A.from_pylist(range(5))] From 9b06e417f7341261d537ba193e643aee0ea93302 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 3 Jun 2016 18:02:27 +0200 Subject: [PATCH 06/28] Make tests templated --- cpp/src/arrow/parquet/parquet-io-test.cc | 232 ++++++++++++----------- 1 file changed, 117 insertions(+), 115 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 218e8916a6b..474d167628b 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -18,6 +18,7 @@ #include "gtest/gtest.h" #include "arrow/test-util.h" +#include "arrow/parquet/test-util.h" #include "arrow/parquet/reader.h" #include "arrow/parquet/writer.h" #include "arrow/types/primitive.h" @@ -44,48 +45,36 @@ namespace arrow { namespace parquet { -template -std::shared_ptr NonNullArray( - size_t size, typename ArrowType::c_type value) { - std::vector values(size, value); - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size()); - return std::static_pointer_cast(builder.Finish()); -} +const int SMALL_SIZE = 100; +const int LARGE_SIZE = 10000; -// This helper function only supports (size/2) nulls yet. -template -std::shared_ptr NullableArray( - size_t size, typename ArrowType::c_type value, size_t num_nulls) { - std::vector values(size, value); - std::vector valid_bytes(size, 1); +template +struct test_traits {}; - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; +}; - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size(), valid_bytes.data()); - return std::static_pointer_cast(builder.Finish()); -} +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT64; +}; -std::shared_ptr MakeColumn(const std::string& name, - const std::shared_ptr& array, bool nullable) { - auto field = std::make_shared(name, array->type(), nullable); - return std::make_shared(field, array); -} +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT; +}; -std::shared_ptr
MakeSimpleTable( - const std::shared_ptr& values, bool nullable) { - std::shared_ptr column = MakeColumn("col", values, nullable); - std::vector> columns({column}); - std::vector> fields({column->field()}); - auto schema = std::make_shared(fields); - return std::make_shared
("table", schema, columns); -} +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; +}; +template class TestParquetIO : public ::testing::Test { public: + typedef typename TestType::c_type T; virtual void SetUp() {} std::shared_ptr MakeSchema( @@ -113,7 +102,7 @@ class TestParquetIO : public ::testing::Test { std::unique_ptr column_reader; ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader))); ASSERT_NE(nullptr, column_reader.get()); - ASSERT_OK(column_reader->NextBatch(100, out)); + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); ASSERT_NE(nullptr, out->get()); } @@ -143,93 +132,104 @@ class TestParquetIO : public ::testing::Test { return ReaderFromSink(); } + std::unique_ptr TestFile(std::vector& values, int num_chunks) { + std::shared_ptr schema = + MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + std::unique_ptr file_writer = MakeWriter(schema); + size_t chunk_size = values.size() / num_chunks; + for (int i = 0; i < num_chunks; i++) { + auto row_group_writer = file_writer->AppendRowGroup(chunk_size); + auto column_writer = static_cast<::parquet::TypedColumnWriter< + ::parquet::DataType::parquet_enum>>*>( + row_group_writer->NextColumn()); + T* data = values.data() + i * chunk_size; + column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); + column_writer->Close(); + row_group_writer->Close(); + } + file_writer->Close(); + return ReaderFromSink(); + } + std::shared_ptr sink_; }; -TEST_F(TestParquetIO, SingleColumnInt64Read) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 1); +typedef ::testing::Types TestTypes; + +TYPED_TEST_CASE(TestParquetIO, TestTypes); + +TYPED_TEST(TestParquetIO, SingleColumnRequiredRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 1); std::shared_ptr out; - ReadSingleColumnFile(std::move(file_reader), &out); + this->ReadSingleColumnFile(std::move(file_reader), &out); - Int64Array* out_array = static_cast(out.get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), out.get()); } -TEST_F(TestParquetIO, SingleColumnInt64TableRead) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 1); +TYPED_TEST(TestParquetIO, SingleColumnRequiredTableRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 1); std::shared_ptr
out; - ReadTableFromFile(std::move(file_reader), &out); + this->ReadTableFromFile(std::move(file_reader), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(100, out->num_rows()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); - Int64Array* out_array = static_cast(chunked_array->chunk(0).get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), chunked_array->chunk(0).get()); } -TEST_F(TestParquetIO, SingleColumnInt64ChunkedRead) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 4); +TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 4); std::shared_ptr out; - ReadSingleColumnFile(std::move(file_reader), &out); + this->ReadSingleColumnFile(std::move(file_reader), &out); - Int64Array* out_array = static_cast(out.get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), out.get()); } -TEST_F(TestParquetIO, SingleColumnInt64ChunkedTableRead) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 4); +TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedTableRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 4); std::shared_ptr
out; - ReadTableFromFile(std::move(file_reader), &out); + this->ReadTableFromFile(std::move(file_reader), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(100, out->num_rows()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); - Int64Array* out_array = static_cast(chunked_array->chunk(0).get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), chunked_array->chunk(0).get()); } -TEST_F(TestParquetIO, SingleColumnInt64Write) { - std::shared_ptr values = NonNullArray(100, 128); +TYPED_TEST(TestParquetIO, SingleColumnRequiredWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); std::shared_ptr schema = - MakeSchema(ParquetType::INT64, Repetition::REQUIRED); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + this->MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnTableInt64Write) { - std::shared_ptr values = NonNullArray(100, 128); +TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); std::shared_ptr
table = MakeSimpleTable(values, false); - sink_ = std::make_shared(); + this->sink_ = std::make_shared(); ASSERT_NO_THROW(ASSERT_OK( - WriteFlatTable(table.get(), default_memory_pool(), sink_, values->length()))); + WriteFlatTable(table.get(), default_memory_pool(), this->sink_, values->length()))); std::shared_ptr
out; - ReadTableFromFile(ReaderFromSink(), &out); + this->ReadTableFromFile(this->ReaderFromSink(), &out); ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(100, out->num_rows()); @@ -238,47 +238,48 @@ TEST_F(TestParquetIO, SingleColumnTableInt64Write) { ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); } -TEST_F(TestParquetIO, SingleColumnDoubleReadWrite) { +TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(100, 128, 10); + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); std::shared_ptr schema = - MakeSchema(ParquetType::DOUBLE, Repetition::OPTIONAL); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + this->MakeSchema(test_traits::parquet_enum, Repetition::OPTIONAL); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnTableDoubleReadWrite) { +TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(100, 128, 10); + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); std::shared_ptr
table = MakeSimpleTable(values, true); - sink_ = std::make_shared(); + this->sink_ = std::make_shared(); ASSERT_NO_THROW(ASSERT_OK( - WriteFlatTable(table.get(), default_memory_pool(), sink_, values->length()))); + WriteFlatTable(table.get(), default_memory_pool(), this->sink_, values->length()))); std::shared_ptr
out; - ReadTableFromFile(ReaderFromSink(), &out); + this->ReadTableFromFile(this->ReaderFromSink(), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(100, out->num_rows()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); } -TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { - std::shared_ptr values = NonNullArray(100, 128); - std::shared_ptr values_chunk = NonNullArray(25, 128); +TYPED_TEST(TestParquetIO, SingleColumnIntRequiredChunkedWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); + std::shared_ptr values_chunk = + NonNullArray(SMALL_SIZE / 4, 128); std::shared_ptr schema = - MakeSchema(ParquetType::INT64, Repetition::REQUIRED); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + this->MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); for (int i = 0; i < 4; i++) { ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk.get()))); @@ -286,36 +287,37 @@ TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnTableInt64ChunkedWrite) { - std::shared_ptr values = NonNullArray(1000, 128); +TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) { + std::shared_ptr values = NonNullArray(LARGE_SIZE, 128); std::shared_ptr
table = MakeSimpleTable(values, false); - sink_ = std::make_shared(); + this->sink_ = std::make_shared(); ASSERT_NO_THROW( - ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), sink_, 512))); + ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512))); std::shared_ptr
out; - ReadTableFromFile(ReaderFromSink(), &out); + this->ReadTableFromFile(this->ReaderFromSink(), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(1000, out->num_rows()); + ASSERT_EQ(LARGE_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); } -TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { - std::shared_ptr values = NullableArray(100, 128, 10); +TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) { + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); std::shared_ptr values_chunk_nulls = - NullableArray(25, 128, 10); - std::shared_ptr values_chunk = NullableArray(25, 128, 0); + NullableArray(SMALL_SIZE / 4, 128, 10); + std::shared_ptr values_chunk = + NullableArray(SMALL_SIZE / 4, 128, 0); std::shared_ptr schema = - MakeSchema(ParquetType::DOUBLE, Repetition::OPTIONAL); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + this->MakeSchema(test_traits::parquet_enum, Repetition::OPTIONAL); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk_nulls->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk_nulls.get()))); for (int i = 0; i < 3; i++) { @@ -325,22 +327,22 @@ TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnTableDoubleChunkedWrite) { +TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(1000, 128, 100); + std::shared_ptr values = NullableArray(LARGE_SIZE, 128, 100); std::shared_ptr
table = MakeSimpleTable(values, true); - sink_ = std::make_shared(); + this->sink_ = std::make_shared(); ASSERT_NO_THROW( - ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), sink_, 512))); + ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512))); std::shared_ptr
out; - ReadTableFromFile(ReaderFromSink(), &out); + this->ReadTableFromFile(this->ReaderFromSink(), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(1000, out->num_rows()); + ASSERT_EQ(LARGE_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); From 5d4929ad0a29e22aa7ded6e74a30f1c3fa840481 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 3 Jun 2016 22:29:14 +0200 Subject: [PATCH 07/28] Add test-util.h --- cpp/src/arrow/parquet/test-util.h | 77 +++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 cpp/src/arrow/parquet/test-util.h diff --git a/cpp/src/arrow/parquet/test-util.h b/cpp/src/arrow/parquet/test-util.h new file mode 100644 index 00000000000..1496082d5c6 --- /dev/null +++ b/cpp/src/arrow/parquet/test-util.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/types/primitive.h" + +namespace arrow { + +namespace parquet { + +template +std::shared_ptr NonNullArray( + size_t size, typename ArrowType::c_type value) { + std::vector values(size, value); + NumericBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size()); + return std::static_pointer_cast(builder.Finish()); +} + +// This helper function only supports (size/2) nulls yet. +template +std::shared_ptr NullableArray( + size_t size, typename ArrowType::c_type value, size_t num_nulls) { + std::vector values(size, value); + std::vector valid_bytes(size, 1); + + for (size_t i = 0; i < num_nulls; i++) { + valid_bytes[i * 2] = 0; + } + + NumericBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size(), valid_bytes.data()); + return std::static_pointer_cast(builder.Finish()); +} + +std::shared_ptr MakeColumn(const std::string& name, + const std::shared_ptr& array, bool nullable) { + auto field = std::make_shared(name, array->type(), nullable); + return std::make_shared(field, array); +} + +std::shared_ptr
MakeSimpleTable( + const std::shared_ptr& values, bool nullable) { + std::shared_ptr column = MakeColumn("col", values, nullable); + std::vector> columns({column}); + std::vector> fields({column->field()}); + auto schema = std::make_shared(fields); + return std::make_shared
("table", schema, columns); +} + +template +void ExpectArray(T* expected, Array* result) { + PrimitiveArray* p_array = static_cast(result); + for (size_t i = 0; i < result->length(); i++) { + EXPECT_EQ(expected[i], reinterpret_cast(p_array->data()->data())[i]); + } +} + +} // namespace parquet + +} // namespace arrow From b505feba765d31eeb77cd0f008408883728c1f9e Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 3 Jun 2016 22:55:33 +0200 Subject: [PATCH 08/28] Install parquet-cpp via conda --- ci/travis_before_script_cpp.sh | 4 ++++ ci/travis_conda_build.sh | 22 +--------------------- ci/travis_install_conda.sh | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 21 deletions(-) create mode 100644 ci/travis_install_conda.sh diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index bf9ec58dbd9..6159f67e361 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -2,6 +2,10 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh +conda install -y --channel apache/channel/dev parquet-cpp +export PARQUET_HOME=$MINICONDA + : ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build} mkdir $CPP_BUILD_DIR diff --git a/ci/travis_conda_build.sh b/ci/travis_conda_build.sh index afa531dbd6b..c43a85170b0 100755 --- a/ci/travis_conda_build.sh +++ b/ci/travis_conda_build.sh @@ -2,27 +2,7 @@ set -e -if [ $TRAVIS_OS_NAME == "linux" ]; then - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" -else - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" -fi - -wget -O miniconda.sh $MINICONDA_URL -MINICONDA=$TRAVIS_BUILD_DIR/miniconda -bash miniconda.sh -b -p $MINICONDA -export PATH="$MINICONDA/bin:$PATH" -conda update -y -q conda -conda info -a - -conda config --set show_channel_urls yes -conda config --add channels conda-forge -conda config --add channels apache - -conda install --yes conda-build jinja2 anaconda-client - -# faster builds, please -conda install -y nomkl +source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh # Build libarrow diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh new file mode 100644 index 00000000000..bef667dff7c --- /dev/null +++ b/ci/travis_install_conda.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -e + +if [ $TRAVIS_OS_NAME == "linux" ]; then + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" +else + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" +fi + +wget -O miniconda.sh $MINICONDA_URL +export MINICONDA=$TRAVIS_BUILD_DIR/miniconda +bash miniconda.sh -b -p $MINICONDA +export PATH="$MINICONDA/bin:$PATH" +conda update -y -q conda +conda info -a + +conda config --set show_channel_urls yes +conda config --add channels conda-forge +conda config --add channels apache + +conda install --yes conda-build jinja2 anaconda-client + +# faster builds, please +conda install -y nomkl + From 81f501eefcc77e3cd791a1347eaa66e398a6c213 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Fri, 3 Jun 2016 23:52:50 +0200 Subject: [PATCH 09/28] No need to install conda in travis_script_python anymore --- ci/travis_script_python.sh | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index d45b895d8cf..8658456dd22 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -11,21 +11,6 @@ popd pushd $PYTHON_DIR -# Bootstrap a Conda Python environment - -if [ $TRAVIS_OS_NAME == "linux" ]; then - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" -else - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" -fi - -curl $MINICONDA_URL > miniconda.sh -MINICONDA=$TRAVIS_BUILD_DIR/miniconda -bash miniconda.sh -b -p $MINICONDA -export PATH="$MINICONDA/bin:$PATH" -conda update -y -q conda -conda info -a - python_version_tests() { PYTHON_VERSION=$1 CONDA_ENV_NAME="pyarrow-test-${PYTHON_VERSION}" From 6a41d23fe3c2ceb7f32e4bacbe3ee7e6b43b9432 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 08:39:10 +0200 Subject: [PATCH 10/28] Re-use conda installation from C++ --- ci/travis_script_python.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 8658456dd22..6ac92ec1295 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -4,6 +4,10 @@ set -e PYTHON_DIR=$TRAVIS_BUILD_DIR/python +# Re-use conda installation from C++ +export MINICONDA=$TRAVIS_BUILD_DIR/miniconda +export PATH="$MINICONDA/bin:$PATH" + # Share environment with C++ pushd $CPP_BUILD_DIR source setup_build_env.sh From cd3b9a9d4244515d824d6dfb3aeb2d43671943df Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 08:54:50 +0200 Subject: [PATCH 11/28] Also search for Parquet in PyArrow --- python/CMakeLists.txt | 4 ++ python/cmake_modules/FindParquet.cmake | 80 ++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 python/cmake_modules/FindParquet.cmake diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 7e013c07b31..f1becfcf449 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -339,6 +339,10 @@ if (PYARROW_BUILD_TESTS) STATIC_LIB ${GTEST_STATIC_LIB}) endif() +## Parquet +find_package(Parquet REQUIRED) +include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) + ## Arrow find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) diff --git a/python/cmake_modules/FindParquet.cmake b/python/cmake_modules/FindParquet.cmake new file mode 100644 index 00000000000..e3350d6e13d --- /dev/null +++ b/python/cmake_modules/FindParquet.cmake @@ -0,0 +1,80 @@ +# Copyright 2012 Cloudera Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so) +# This module defines +# PARQUET_INCLUDE_DIR, directory containing headers +# PARQUET_LIBS, directory containing parquet libraries +# PARQUET_STATIC_LIB, path to libparquet.a +# PARQUET_SHARED_LIB, path to libparquet's shared library +# PARQUET_FOUND, whether parquet has been found + +if( NOT "$ENV{PARQUET_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "$ENV{PARQUET_HOME}" _native_path ) + list( APPEND _parquet_roots ${_native_path} ) +elseif ( Parquet_HOME ) + list( APPEND _parquet_roots ${Parquet_HOME} ) +endif() + +# Try the parameterized roots, if they exist +if ( _parquet_roots ) + find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( PARQUET_LIBRARIES NAMES parquet + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else () + find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h ) + find_library( PARQUET_LIBRARIES NAMES parquet ) +endif () + + +if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES) + set(PARQUET_FOUND TRUE) + get_filename_component( PARQUET_LIBS ${PARQUET_LIBRARIES} PATH ) + set(PARQUET_LIB_NAME libparquet) + set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a) + set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +else () + set(PARQUET_FOUND FALSE) +endif () + +if (PARQUET_FOUND) + if (NOT Parquet_FIND_QUIETLY) + message(STATUS "Found the Parquet library: ${PARQUET_LIBRARIES}") + endif () +else () + if (NOT Parquet_FIND_QUIETLY) + set(PARQUET_ERR_MSG "Could not find the Parquet library. Looked in ") + if ( _parquet_roots ) + set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} in ${_parquet_roots}.") + else () + set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} system search paths.") + endif () + if (Parquet_FIND_REQUIRED) + message(FATAL_ERROR "${PARQUET_ERR_MSG}") + else (Parquet_FIND_REQUIRED) + message(STATUS "${PARQUET_ERR_MSG}") + endif (Parquet_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + PARQUET_INCLUDE_DIR + PARQUET_LIBS + PARQUET_LIBRARIES + PARQUET_STATIC_LIB + PARQUET_SHARED_LIB +) From 9520c39f48e54a1e31f91a0ddf32debb3ea4aa12 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sat, 4 Jun 2016 09:03:09 +0200 Subject: [PATCH 12/28] Use PARQUET from miniconda path --- ci/travis_script_python.sh | 1 + python/pyarrow/tests/test_parquet.py | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6ac92ec1295..ea4f07ca938 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,6 +7,7 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" +export PARQUET_HOME=$MINICONDA # Share environment with C++ pushd $CPP_BUILD_DIR diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 99b2c7e09ec..68fdf194bf3 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -27,6 +27,19 @@ import os.path +def test_single_int64_column(tmpdir): + filename = tmpdir.join('single_int64_column.parquet') + data = [A.from_pylist(range(5))] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + A.parquet.write_table(table, filename.strpath) + table_read = pyarrow.parquet.read_table(filename.strpath) + for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + assert col_written.name == col_read.name + assert col_read.data.num_chunks == 1 + data_written = col_written.data.chunk(0) + data_read = col_read.data.chunk(0) + assert data_written == data_read + class TestParquetIO(unittest.TestCase): def setUp(self): From 2006e7021be022dc6ec149114810fc05cb55da19 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 09:03:24 +0200 Subject: [PATCH 13/28] Rewrite test py.test style --- python/pyarrow/tests/test_parquet.py | 45 ++++++++-------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 68fdf194bf3..ae9a75c0262 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -27,36 +27,17 @@ import os.path -def test_single_int64_column(tmpdir): - filename = tmpdir.join('single_int64_column.parquet') - data = [A.from_pylist(range(5))] - table = A.Table.from_arrays(('a', 'b'), data, 'table_name') - A.parquet.write_table(table, filename.strpath) - table_read = pyarrow.parquet.read_table(filename.strpath) - for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): - assert col_written.name == col_read.name - assert col_read.data.num_chunks == 1 - data_written = col_written.data.chunk(0) - data_read = col_read.data.chunk(0) - assert data_written == data_read +def test_single_pylist_column(tmpdir): + for dtype in [int, float]: + filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) + data = [A.from_pylist(map(dtype, range(5)))] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + A.parquet.write_table(table, filename.strpath) + table_read = pyarrow.parquet.read_table(filename.strpath) + for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + assert col_written.name == col_read.name + assert col_read.data.num_chunks == 1 + data_written = col_written.data.chunk(0) + data_read = col_read.data.chunk(0) + assert data_written == data_read -class TestParquetIO(unittest.TestCase): - - def setUp(self): - self.temp_directory = mkdtemp() - - def tearDown(self): - rmtree(self.temp_directory) - - def test_single_int64_column(self): - filename = os.path.join(self.temp_directory, 
'single_int64_column.parquet') - data = [A.from_pylist(range(5))] - table = A.Table.from_arrays(('a', 'b'), data, 'table_name') - A.parquet.write_table(table, filename) - table_read = pyarrow.parquet.read_table(filename) - for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): - assert col_written.name == col_read.name - assert col_read.data.num_chunks == 1 - data_written = col_written.data.chunk(0) - data_read = col_read.data.chunk(0) - assert data_written == data_read From 2dffc1412b86bd473502628d9f6f98964efca72e Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 09:06:45 +0200 Subject: [PATCH 14/28] Fix min mistake, use equals instead of == --- python/pyarrow/array.pyx | 9 ++------- python/pyarrow/parquet.pyx | 2 +- python/pyarrow/tests/test_parquet.py | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 6d63c321931..619e5ef7e39 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -68,13 +68,8 @@ cdef class Array: values = array_format(self, window=10) return '{0}\n{1}'.format(type_format, values) - def __richcmp__(Array self, Array other, int op): - if op == cpython.Py_EQ: - return self.ap.Equals(other.sp_array) - elif op == cpython.Py_NE: - return not self.ap.Equals(other.sp_array) - else: - raise TypeError('Invalid comparison') + def equals(Array self, Array other): + return self.ap.Equals(other.sp_array) def __len__(self): if self.sp_array.get(): diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 78837a45988..1dcfd60bee6 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -61,7 +61,7 @@ def write_table(table, filename, chunk_size=None): cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream](new LocalFileOutputStream(filename)) cdef int64_t chunk_size_ = 0 if chunk_size is None: - chunk_size_ = max(ctable_.num_rows(), int(2**16)) + chunk_size_ = min(ctable_.num_rows(), int(2**16)) else: chunk_size_ = chunk_size diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index ae9a75c0262..236f06ca69b 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -39,5 +39,5 @@ def test_single_pylist_column(tmpdir): assert col_read.data.num_chunks == 1 data_written = col_written.data.chunk(0) data_read = col_read.data.chunk(0) - assert data_written == data_read + assert data_written.equals(data_read) From 443de8ba42b1097f217175dd168986c021b18414 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 09:14:53 +0200 Subject: [PATCH 15/28] Add miniconda to the LD_LIBRARY_PATH --- ci/travis_script_python.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index ea4f07ca938..6d35785356a 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,6 +7,7 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" +export LD_LIBRARY_PATH="$MINICONDA/lib:$LD_LIBRARY_PATH" export PARQUET_HOME=$MINICONDA # Share environment with C++ From 5706db2f6386f125ce90ecc5db9bd1b0680f940f Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sat, 4 Jun 2016 09:26:25 +0200 Subject: [PATCH 16/28] Use length and offset instead of slicing --- cpp/src/arrow/parquet/writer.cc | 89 +++++++++------------------- cpp/src/arrow/parquet/writer.h | 3 +- python/pyarrow/tests/test_parquet.py | 2 +- 3 files changed, 32 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 742b8ba6852..0114f4c321e 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -42,8 +42,9 @@ class FileWriter::Impl { Status NewRowGroup(int64_t chunk_size); template - Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data); - Status WriteFlatColumnChunk(const PrimitiveArray* data); + Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data, + int64_t offset, int64_t length); + Status WriteFlatColumnChunk(const PrimitiveArray* data, int64_t offset, int64_t length); Status Close(); virtual ~Impl() {} @@ -70,31 +71,31 @@ Status FileWriter::Impl::NewRowGroup(int64_t chunk_size) { } template -Status FileWriter::Impl::TypedWriteBatch( - ::parquet::ColumnWriter* column_writer, const PrimitiveArray* data) { +Status FileWriter::Impl::TypedWriteBatch(::parquet::ColumnWriter* column_writer, + const PrimitiveArray* data, int64_t offset, int64_t length) { + // TODO: DCHECK((offset + length) <= data->length()); auto data_ptr = - reinterpret_cast(data->data()->data()); + reinterpret_cast(data->data()->data()) + + offset; auto writer = reinterpret_cast<::parquet::TypedColumnWriter*>(column_writer); if (writer->descr()->max_definition_level() == 0) { // no nulls, just dump the data - PARQUET_CATCH_NOT_OK(writer->WriteBatch(data->length(), nullptr, nullptr, data_ptr)); + PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, data_ptr)); } else if (writer->descr()->max_definition_level() == 1) { - RETURN_NOT_OK(def_levels_buffer_.Resize(data->length() * sizeof(int16_t))); + RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t))); int16_t* def_levels_ptr = reinterpret_cast(def_levels_buffer_.mutable_data()); if (data->null_count() == 0) { - std::fill(def_levels_ptr, def_levels_ptr + data->length(), 1); - PARQUET_CATCH_NOT_OK( - writer->WriteBatch(data->length(), def_levels_ptr, nullptr, data_ptr)); + std::fill(def_levels_ptr, def_levels_ptr + length, 1); + PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, def_levels_ptr, nullptr, data_ptr)); } else { - RETURN_NOT_OK(data_buffer_.Resize( - (data->length() - data->null_count()) * sizeof(typename ParquetType::c_type))); + RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(typename ParquetType::c_type))); auto buffer_ptr = reinterpret_cast(data_buffer_.mutable_data()); int buffer_idx = 0; - for (size_t i = 0; i < data->length(); i++) { - if (data->IsNull(i)) { + for (size_t i = 0; i < length; i++) { + if (data->IsNull(offset + i)) { def_levels_ptr[i] = 0; } else { def_levels_ptr[i] = 1; @@ -102,7 +103,7 @@ Status FileWriter::Impl::TypedWriteBatch( } } PARQUET_CATCH_NOT_OK( - writer->WriteBatch(data->length(), def_levels_ptr, nullptr, buffer_ptr)); + writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); } } else { return Status::NotImplemented("no support for max definition level > 1 yet"); @@ -117,12 +118,13 @@ Status FileWriter::Impl::Close() { return Status::OK(); } -#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \ - case Type::ENUM: \ - return TypedWriteBatch(writer, data); \ +#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \ + case 
Type::ENUM: \ + return TypedWriteBatch(writer, data, offset, length); \ break; -Status FileWriter::Impl::WriteFlatColumnChunk(const PrimitiveArray* data) { +Status FileWriter::Impl::WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset, int64_t length) { ::parquet::ColumnWriter* writer; PARQUET_CATCH_NOT_OK(writer = row_group_writer_->NextColumn()); switch (data->type_enum()) { @@ -143,8 +145,11 @@ Status FileWriter::NewRowGroup(int64_t chunk_size) { return impl_->NewRowGroup(chunk_size); } -Status FileWriter::WriteFlatColumnChunk(const PrimitiveArray* data) { - return impl_->WriteFlatColumnChunk(data); +Status FileWriter::WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset, int64_t length) { + int64_t real_length = length; + if (length == -1) { real_length = data->length(); } + return impl_->WriteFlatColumnChunk(data, offset, real_length); } Status FileWriter::Close() { @@ -153,43 +158,8 @@ Status FileWriter::Close() { FileWriter::~FileWriter() {} -// Create a slice of a PrimitiveArray. -// -// This method is specially crafted for WriteFlatTable and assumes the following: -// * chunk_size is a multiple of 512 -Status TemporaryArraySlice(int64_t chunk, int64_t chunk_size, const PrimitiveArray* array, - std::shared_ptr* out) { - // The last chunk may be smaller than the chunk_size - const int64_t size = std::min(chunk_size, array->length() - chunk * chunk_size); - const int64_t buffer_offset = chunk * chunk_size * array->type()->value_size(); - const int64_t value_size = size * array->type()->value_size(); - auto chunk_buffer = std::make_shared(array->data(), buffer_offset, value_size); - std::shared_ptr null_bitmap; - int32_t null_count = 0; - if (array->null_count() > 0) { - int64_t null_offset = (chunk * chunk_size) / 8; - int64_t null_size = util::ceil_byte(size) / 8; - null_bitmap = std::make_shared(array->null_bitmap(), null_offset, null_size); - for (int64_t k = 0; k < size; k++) { - if (!util::get_bit(null_bitmap->data(), k)) { null_count++; } - } - } - std::shared_ptr out_array; - RETURN_NOT_OK(MakePrimitiveArray( - array->type(), size, chunk_buffer, null_count, null_bitmap, &out_array)); - *out = std::static_pointer_cast(out_array); - return Status::OK(); -} - Status WriteFlatTable(const Table* table, MemoryPool* pool, std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size) { - // Ensure alignment of sliced PrimitiveArray, esp. 
the null bitmap - // TODO: Support other chunksizes than multiples of 512 - if (((chunk_size & 511) != 0) && (chunk_size != table->num_rows())) { - return Status::NotImplemented( - "Only chunk sizes that are a multiple of 512 are supported"); - } - std::shared_ptr<::parquet::SchemaDescriptor> parquet_schema; RETURN_NOT_OK(ToParquetSchema(table->schema().get(), &parquet_schema)); auto schema_node = std::static_pointer_cast(parquet_schema->schema()); @@ -217,12 +187,11 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, } for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { - int64_t size = std::min(chunk_size, table->num_rows() - chunk * chunk_size); + int64_t offset = chunk * chunk_size; + int64_t size = std::min(chunk_size, table->num_rows() - offset); RETURN_NOT_OK(writer.NewRowGroup(size)); for (int i = 0; i < table->num_columns(); i++) { - std::shared_ptr array; - RETURN_NOT_OK(TemporaryArraySlice(chunk, chunk_size, arrays[i].get(), &array)); - RETURN_NOT_OK(writer.WriteFlatColumnChunk(array.get())); + RETURN_NOT_OK(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size)); } } diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 64625021c11..83e799f7ed1 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -43,7 +43,8 @@ class FileWriter { FileWriter(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer); Status NewRowGroup(int64_t chunk_size); - Status WriteFlatColumnChunk(const PrimitiveArray* data); + Status WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset = 0, int64_t length = -1); Status Close(); virtual ~FileWriter(); diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 236f06ca69b..12cd248314e 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -30,7 +30,7 @@ def test_single_pylist_column(tmpdir): for dtype in [int, float]: filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) - data = [A.from_pylist(map(dtype, range(5)))] + data = [A.from_pylist(list(map(dtype, range(5))))] table = A.Table.from_arrays(('a', 'b'), data, 'table_name') A.parquet.write_table(table, filename.strpath) table_read = pyarrow.parquet.read_table(filename.strpath) From 066c08afc183838866299bb1040c7c738fe6530f Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 09:35:11 +0200 Subject: [PATCH 17/28] Add missing functions to smart pointers --- python/pyarrow/includes/common.pxd | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 5c3df6401bc..421c8920234 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -38,6 +38,11 @@ cdef extern from "" namespace "std" nogil: unique_ptr() unique_ptr(T*) T* get() + T* release() + void reset() + void reset(nullptr_t) + void reset(T*) + void swap(unique_ptr&) cdef cppclass shared_ptr[T]: shared_ptr() @@ -45,3 +50,4 @@ cdef extern from "" namespace "std" nogil: T* get() void reset() void reset(T* p) + void swap(shared_ptr&) From 4a80116ef2fd263471cd459da7bd7217679f7eae Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sat, 4 Jun 2016 09:40:59 +0200 Subject: [PATCH 18/28] Handle Python3 strings correctly --- python/pyarrow/parquet.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 1dcfd60bee6..39f5b229884 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -23,6 +23,7 @@ from pyarrow.includes.libarrow cimport * cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.includes.parquet cimport * +from pyarrow.compat import tobytes from pyarrow.error cimport check_cstatus from pyarrow.table cimport Table @@ -40,7 +41,7 @@ def read_table(filename, columns=None): # Must be in one expression to avoid calling std::move which is not possible # in Cython (due to missing rvalue support) reader = unique_ptr[FileReader](new FileReader(default_memory_pool(), - ParquetFileReader.OpenFile(filename))) + ParquetFileReader.OpenFile(tobytes(filename)))) check_cstatus(reader.get().ReadFlatTable(&ctable)) table.init(ctable) return table @@ -58,7 +59,8 @@ def write_table(table, filename, chunk_size=None): """ cdef Table table_ = table cdef CTable* ctable_ = table_.table - cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream](new LocalFileOutputStream(filename)) + cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream]( + new LocalFileOutputStream(tobytes(filename))) cdef int64_t chunk_size_ = 0 if chunk_size is None: chunk_size_ = min(ctable_.num_rows(), int(2**16)) From 00c14611041d112da6680b74297a6e8f437506c0 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 12:12:10 +0200 Subject: [PATCH 19/28] Also ensure correct OSX compiler flags in PyArrow --- python/conda.recipe/build.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/conda.recipe/build.sh b/python/conda.recipe/build.sh index a9d9aedead3..a164c1af518 100644 --- a/python/conda.recipe/build.sh +++ b/python/conda.recipe/build.sh @@ -6,6 +6,19 @@ export ARROW_HOME=$PREFIX cd $RECIPE_DIR +if [ "$(uname)" == "Darwin" ]; then + # C++11 finagling for Mac OSX + export CC=clang + export CXX=clang++ + export MACOSX_VERSION_MIN="10.7" + CXXFLAGS="${CXXFLAGS} -mmacosx-version-min=${MACOSX_VERSION_MIN}" + CXXFLAGS="${CXXFLAGS} -stdlib=libc++ -std=c++11" + export LDFLAGS="${LDFLAGS} -mmacosx-version-min=${MACOSX_VERSION_MIN}" + export LDFLAGS="${LDFLAGS} -stdlib=libc++ -std=c++11" + export LINKFLAGS="${LDFLAGS}" + export MACOSX_DEPLOYMENT_TARGET=10.7 +fi + echo Setting the compiler... if [ `uname` == Linux ]; then EXTRA_CMAKE_ARGS=-DCMAKE_SHARED_LINKER_FLAGS=-static-libstdc++ From f583b614322374324a58654931d455f60fe8719a Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sun, 5 Jun 2016 13:30:45 +0200 Subject: [PATCH 20/28] Fix rpath for libarrow_parquet --- cpp/src/arrow/parquet/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index c00cc9f0f25..f00bb53c084 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -35,6 +35,13 @@ add_library(arrow_parquet SHARED target_link_libraries(arrow_parquet ${PARQUET_LIBS}) SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) +if (APPLE) + set_target_properties(arrow_parquet + PROPERTIES + BUILD_WITH_INSTALL_RPATH ON + INSTALL_NAME_DIR "@rpath") +endif() + ADD_ARROW_TEST(parquet-schema-test) ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) From 77bd21ab5a78f95c32a10c9ee54caeec8ca7a73a Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 5 Jun 2016 20:14:56 +0200 Subject: [PATCH 21/28] Add pandas roundtrip to tests --- python/pyarrow/error.pyx | 1 + python/pyarrow/tests/test_parquet.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index 51e06101bf6..5a6a038a92e 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +from pyarrow.includes.libarrow cimport CStatus from pyarrow.includes.common cimport c_string from pyarrow.compat import frombytes diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 12cd248314e..3469f5291b2 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -24,10 +24,13 @@ from shutil import rmtree from tempfile import mkdtemp +import numpy as np import os.path +import pandas as pd +import pandas.util.testing as pdt -def test_single_pylist_column(tmpdir): +def test_single_pylist_column_roundtrip(tmpdir): for dtype in [int, float]: filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) data = [A.from_pylist(list(map(dtype, range(5))))] @@ -41,3 +44,18 @@ def test_single_pylist_column(tmpdir): data_read = col_read.data.chunk(0) assert data_written.equals(data_read) +def test_pandas_rountrip(tmpdir): + size = 10000 + df = pd.DataFrame({ + 'int32': np.arange(size, dtype=np.int32), + 'int64': np.arange(size, dtype=np.int64), + 'float32': np.arange(size, dtype=np.float32), + 'float64': np.arange(size, dtype=np.float64) + }) + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = A.from_pandas_dataframe(df) + A.parquet.write_table(arrow_table, filename.strpath) + table_read = pyarrow.parquet.read_table(filename.strpath) + df_read = table_read.to_pandas() + pdt.assert_frame_equal(df, df_read) + From 0514d01a39de0e0917fd80d46dd64092c8740169 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Tue, 7 Jun 2016 08:19:14 +0200 Subject: [PATCH 22/28] Handle exceptions on RowGroupWriter::Close better --- cpp/src/arrow/parquet/utils.h | 5 +++++ cpp/src/arrow/parquet/writer.cc | 5 +++-- cpp/src/arrow/util/status.h | 9 +++++++++ python/pyarrow/tests/test_parquet.py | 4 +--- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/parquet/utils.h b/cpp/src/arrow/parquet/utils.h index b32792fdf70..9f83fd0d0fc 100644 --- a/cpp/src/arrow/parquet/utils.h +++ b/cpp/src/arrow/parquet/utils.h @@ -31,6 +31,11 @@ namespace parquet { (s); \ } catch (const ::parquet::ParquetException& e) { return Status::Invalid(e.what()); } +#define PARQUET_IGNORE_NOT_OK(s) \ + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) { } + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 0114f4c321e..70de6000d8b 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -181,6 +181,7 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, std::shared_ptr array = table->column(i)->data()->chunk(0); auto primitive_array = std::dynamic_pointer_cast(array); if (!primitive_array) { + PARQUET_IGNORE_NOT_OK(writer.Close()); return Status::NotImplemented("Table must consist of PrimitiveArray instances"); } arrays[i] = primitive_array; @@ -189,9 +190,9 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { int64_t offset = chunk * chunk_size; int64_t size = std::min(chunk_size, table->num_rows() - offset); - RETURN_NOT_OK(writer.NewRowGroup(size)); + RETURN_NOT_OK_ELSE(writer.NewRowGroup(size), PARQUET_IGNORE_NOT_OK(writer.Close())); for (int i = 0; i < table->num_columns(); i++) { - RETURN_NOT_OK(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size)); + RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size), PARQUET_IGNORE_NOT_OK(writer.Close())); } } diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index 6ddc177a9a5..d1a74250008 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -63,6 +63,15 @@ namespace arrow { if (!_s.ok()) { return _s; } \ } while (0); +#define RETURN_NOT_OK_ELSE(s, else_) \ + do { \ + Status _s = (s); \ + if (!_s.ok()) { \ + else_; \ + return _s; \ + } \ + } while (0); + enum class StatusCode : char { OK = 0, OutOfMemory = 1, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 3469f5291b2..d92cf4ca656 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -21,12 +21,10 @@ A = arrow -from shutil import rmtree -from tempfile import mkdtemp - import numpy as np import os.path import pandas as pd + import pandas.util.testing as pdt From 8f6010aa584b20d28514289312d2def97994099e Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Tue, 7 Jun 2016 09:00:37 +0200 Subject: [PATCH 23/28] Linter fixes --- cpp/src/arrow/parquet/utils.h | 6 +++--- cpp/src/arrow/parquet/writer.cc | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/parquet/utils.h b/cpp/src/arrow/parquet/utils.h index 9f83fd0d0fc..409bcd9065c 100644 --- a/cpp/src/arrow/parquet/utils.h +++ b/cpp/src/arrow/parquet/utils.h @@ -32,9 +32,9 @@ namespace parquet { } catch (const ::parquet::ParquetException& e) { return Status::Invalid(e.what()); } #define PARQUET_IGNORE_NOT_OK(s) \ - try { \ - (s); \ - } catch (const ::parquet::ParquetException& e) { } + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) {} } // namespace parquet diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 70de6000d8b..1223901d550 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -192,7 +192,8 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, int64_t size = std::min(chunk_size, table->num_rows() - offset); RETURN_NOT_OK_ELSE(writer.NewRowGroup(size), PARQUET_IGNORE_NOT_OK(writer.Close())); for (int i = 0; i < table->num_columns(); i++) { - RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size), PARQUET_IGNORE_NOT_OK(writer.Close())); + RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size), + PARQUET_IGNORE_NOT_OK(writer.Close())); } } From 000e1e34d6ad3b6c1a1bc430974f2eac05f96173 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 11:56:53 +0200 Subject: [PATCH 24/28] Use unique_ptr and shared_ptr from Cython --- python/pyarrow/includes/common.pxd | 20 +------------------- python/pyarrow/parquet.pyx | 4 ++-- python/pyarrow/schema.pyx | 9 ++++++--- python/setup.py | 2 +- 4 files changed, 10 insertions(+), 25 deletions(-) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 421c8920234..1f6ecee5105 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -19,6 +19,7 @@ from libc.stdint cimport * from libcpp cimport bool as c_bool +from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string as c_string from libcpp.vector cimport vector @@ -32,22 +33,3 @@ cdef extern from "": cdef extern from "": void Py_XDECREF(PyObject* o) -cdef extern from "" namespace "std" nogil: - - cdef cppclass unique_ptr[T]: - unique_ptr() - unique_ptr(T*) - T* get() - T* release() - void reset() - void reset(nullptr_t) - void reset(T*) - void swap(unique_ptr&) - - cdef cppclass shared_ptr[T]: - shared_ptr() - shared_ptr(T*) - T* get() - void reset() - void reset(T* p) - void swap(shared_ptr&) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 39f5b229884..3d5355ebe43 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -59,13 +59,13 @@ def write_table(table, filename, chunk_size=None): """ cdef Table table_ = table cdef CTable* ctable_ = table_.table - cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream]( - new LocalFileOutputStream(tobytes(filename))) + cdef shared_ptr[OutputStream] sink cdef int64_t chunk_size_ = 0 if chunk_size is None: chunk_size_ = min(ctable_.num_rows(), int(2**16)) else: chunk_size_ = chunk_size + sink.reset(new LocalFileOutputStream(tobytes(filename))) check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, chunk_size_)) diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index 22ddf0cf17e..084c304aed2 100644 --- 
a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -201,7 +201,9 @@ def string(): def list_(DataType value_type): cdef DataType out = DataType() - out.init(shared_ptr[CDataType](new CListType(value_type.sp_type))) + cdef shared_ptr[CDataType] list_type + list_type.reset(new CListType(value_type.sp_type)) + out.init(list_type) return out def struct(fields): @@ -212,12 +214,13 @@ def struct(fields): DataType out = DataType() Field field vector[shared_ptr[CField]] c_fields + cdef shared_ptr[CDataType] struct_type for field in fields: c_fields.push_back(field.sp_field) - out.init(shared_ptr[CDataType]( - new CStructType(c_fields))) + struct_type.reset(new CStructType(c_fields)) + out.init(struct_type) return out def schema(fields): diff --git a/python/setup.py b/python/setup.py index 1c55198ec8a..7edeb914331 100644 --- a/python/setup.py +++ b/python/setup.py @@ -242,7 +242,7 @@ def get_outputs(self): 'clean': clean, 'build_ext': build_ext }, - install_requires=['cython >= 0.21', 'numpy >= 1.9'], + install_requires=['cython >= 0.23', 'numpy >= 1.9'], description=DESC, license='Apache License, Version 2.0', maintainer="Apache Arrow Developers", From 8d90d3f0d57b82b26c9af5ba6a806e5b4ca52a3d Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 12:02:15 +0200 Subject: [PATCH 25/28] Do not set LD_LIBRARY_PATH in python build --- ci/travis_script_python.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6d35785356a..ea4f07ca938 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,7 +7,6 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" -export LD_LIBRARY_PATH="$MINICONDA/lib:$LD_LIBRARY_PATH" export PARQUET_HOME=$MINICONDA # Share environment with C++ From ec077689058f04343fdab148f6e30496e8fcc5f5 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 12:36:00 +0200 Subject: [PATCH 26/28] Set LD_LIBRARY_PATH in python build --- ci/travis_script_python.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index ea4f07ca938..6d35785356a 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,6 +7,7 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" +export LD_LIBRARY_PATH="$MINICONDA/lib:$LD_LIBRARY_PATH" export PARQUET_HOME=$MINICONDA # Share environment with C++ From 38d786cbddaadb5c3238a594a75cea214b0fb108 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Fri, 10 Jun 2016 12:48:44 +0200 Subject: [PATCH 27/28] Make code more readable by using using --- cpp/src/arrow/parquet/parquet-io-test.cc | 28 ++++++------------------ 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 474d167628b..db779d8309c 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -71,6 +71,12 @@ struct test_traits { static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; }; +template +using ParquetDataType = ::parquet::DataType::parquet_enum>; + +template +using ParquetWriter = ::parquet::TypedColumnWriter>; + template class TestParquetIO : public ::testing::Test { public: @@ -113,25 +119,6 @@ class TestParquetIO : public ::testing::Test { ASSERT_NE(nullptr, out->get()); } - std::unique_ptr Int64File( - std::vector& values, int num_chunks) { - std::shared_ptr schema = - MakeSchema(ParquetType::INT64, Repetition::REQUIRED); - std::unique_ptr file_writer = MakeWriter(schema); - size_t chunk_size = values.size() / num_chunks; - for (int i = 0; i < num_chunks; i++) { - auto row_group_writer = file_writer->AppendRowGroup(chunk_size); - auto column_writer = - static_cast<::parquet::Int64Writer*>(row_group_writer->NextColumn()); - int64_t* data = values.data() + i * chunk_size; - column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); - column_writer->Close(); - row_group_writer->Close(); - } - file_writer->Close(); - return ReaderFromSink(); - } - std::unique_ptr TestFile(std::vector& values, int num_chunks) { std::shared_ptr schema = MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); @@ -139,8 +126,7 @@ class TestParquetIO : public ::testing::Test { size_t chunk_size = values.size() / num_chunks; for (int i = 0; i < num_chunks; i++) { auto row_group_writer = file_writer->AppendRowGroup(chunk_size); - auto column_writer = static_cast<::parquet::TypedColumnWriter< - ::parquet::DataType::parquet_enum>>*>( + auto column_writer = static_cast*>( row_group_writer->NextColumn()); T* data = values.data() + i * chunk_size; column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); From 405f85d88fb0f6943df7149d2e76ae95e78a5658 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 15:56:38 +0200 Subject: [PATCH 28/28] Remove FindParquet duplication --- python/cmake_modules/FindParquet.cmake | 80 -------------------------- 1 file changed, 80 deletions(-) delete mode 100644 python/cmake_modules/FindParquet.cmake diff --git a/python/cmake_modules/FindParquet.cmake b/python/cmake_modules/FindParquet.cmake deleted file mode 100644 index e3350d6e13d..00000000000 --- a/python/cmake_modules/FindParquet.cmake +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2012 Cloudera Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so) -# This module defines -# PARQUET_INCLUDE_DIR, directory containing headers -# PARQUET_LIBS, directory containing parquet libraries -# PARQUET_STATIC_LIB, path to libparquet.a -# PARQUET_SHARED_LIB, path to libparquet's shared library -# PARQUET_FOUND, whether parquet has been found - -if( NOT "$ENV{PARQUET_HOME}" STREQUAL "") - file( TO_CMAKE_PATH "$ENV{PARQUET_HOME}" _native_path ) - list( APPEND _parquet_roots ${_native_path} ) -elseif ( Parquet_HOME ) - list( APPEND _parquet_roots ${Parquet_HOME} ) -endif() - -# Try the parameterized roots, if they exist -if ( _parquet_roots ) - find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h - PATHS ${_parquet_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "include" ) - find_library( PARQUET_LIBRARIES NAMES parquet - PATHS ${_parquet_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "lib" ) -else () - find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h ) - find_library( PARQUET_LIBRARIES NAMES parquet ) -endif () - - -if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES) - set(PARQUET_FOUND TRUE) - get_filename_component( PARQUET_LIBS ${PARQUET_LIBRARIES} PATH ) - set(PARQUET_LIB_NAME libparquet) - set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a) - set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) -else () - set(PARQUET_FOUND FALSE) -endif () - -if (PARQUET_FOUND) - if (NOT Parquet_FIND_QUIETLY) - message(STATUS "Found the Parquet library: ${PARQUET_LIBRARIES}") - endif () -else () - if (NOT Parquet_FIND_QUIETLY) - set(PARQUET_ERR_MSG "Could not find the Parquet library. Looked in ") - if ( _parquet_roots ) - set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} in ${_parquet_roots}.") - else () - set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} system search paths.") - endif () - if (Parquet_FIND_REQUIRED) - message(FATAL_ERROR "${PARQUET_ERR_MSG}") - else (Parquet_FIND_REQUIRED) - message(STATUS "${PARQUET_ERR_MSG}") - endif (Parquet_FIND_REQUIRED) - endif () -endif () - -mark_as_advanced( - PARQUET_INCLUDE_DIR - PARQUET_LIBS - PARQUET_LIBRARIES - PARQUET_STATIC_LIB - PARQUET_SHARED_LIB -)