From 0463995eae95c6542c232b2c76bd28c8a4ae8709 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 29 May 2016 10:50:35 +0200 Subject: [PATCH 01/28] ARROW-203: Python: Basic filename based Parquet read/write --- cpp/src/arrow/column.h | 2 + cpp/src/arrow/parquet/parquet-io-test.cc | 144 +++++++++++++++++++++-- cpp/src/arrow/parquet/reader.cc | 26 ++++ cpp/src/arrow/parquet/reader.h | 3 + cpp/src/arrow/parquet/writer.cc | 86 ++++++++++++++ cpp/src/arrow/parquet/writer.h | 9 ++ python/CMakeLists.txt | 5 + python/cmake_modules/FindArrow.cmake | 14 ++- python/pyarrow/array.pyx | 8 ++ python/pyarrow/error.pxd | 2 + python/pyarrow/error.pyx | 7 ++ python/pyarrow/includes/common.pxd | 5 + python/pyarrow/includes/libarrow.pxd | 3 + python/pyarrow/includes/parquet.pxd | 46 ++++++++ python/pyarrow/parquet.pyx | 41 ++++++- python/pyarrow/tests/test_parquet.py | 50 ++++++++ python/setup.py | 2 +- python/src/pyarrow/parquet.cc | 30 +++++ python/src/pyarrow/parquet.h | 30 +++++ 19 files changed, 502 insertions(+), 11 deletions(-) create mode 100644 python/pyarrow/tests/test_parquet.py create mode 100644 python/src/pyarrow/parquet.cc create mode 100644 python/src/pyarrow/parquet.h diff --git a/cpp/src/arrow/column.h b/cpp/src/arrow/column.h index 22becc34547..e409566e1f1 100644 --- a/cpp/src/arrow/column.h +++ b/cpp/src/arrow/column.h @@ -67,6 +67,8 @@ class Column { int64_t null_count() const { return data_->null_count(); } + const std::shared_ptr& field() const { return field_; } + // @returns: the column's name in the passed metadata const std::string& name() const { return field_->name; } diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 845574d2c53..218e8916a6b 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -69,11 +69,26 @@ std::shared_ptr NullableArray( return std::static_pointer_cast(builder.Finish()); } +std::shared_ptr MakeColumn(const std::string& name, + const std::shared_ptr& array, bool nullable) { + auto field = std::make_shared(name, array->type(), nullable); + return std::make_shared(field, array); +} + +std::shared_ptr MakeSimpleTable( + const std::shared_ptr& values, bool nullable) { + std::shared_ptr column = MakeColumn("col", values, nullable); + std::vector> columns({column}); + std::vector> fields({column->field()}); + auto schema = std::make_shared(fields); + return std::make_shared
("table", schema, columns); +} + class TestParquetIO : public ::testing::Test { public: virtual void SetUp() {} - std::shared_ptr Schema( + std::shared_ptr MakeSchema( ParquetType::type parquet_type, Repetition::type repetition) { auto pnode = PrimitiveNode::Make("column1", repetition, parquet_type); NodePtr node_ = @@ -102,9 +117,17 @@ class TestParquetIO : public ::testing::Test { ASSERT_NE(nullptr, out->get()); } + void ReadTableFromFile( + std::unique_ptr file_reader, std::shared_ptr
* out) { + arrow::parquet::FileReader reader(default_memory_pool(), std::move(file_reader)); + ASSERT_NO_THROW(ASSERT_OK(reader.ReadFlatTable(out))); + ASSERT_NE(nullptr, out->get()); + } + std::unique_ptr Int64File( std::vector& values, int num_chunks) { - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); + std::shared_ptr schema = + MakeSchema(ParquetType::INT64, Repetition::REQUIRED); std::unique_ptr file_writer = MakeWriter(schema); size_t chunk_size = values.size() / num_chunks; for (int i = 0; i < num_chunks; i++) { @@ -120,7 +143,6 @@ class TestParquetIO : public ::testing::Test { return ReaderFromSink(); } - private: std::shared_ptr sink_; }; @@ -137,6 +159,23 @@ TEST_F(TestParquetIO, SingleColumnInt64Read) { } } +TEST_F(TestParquetIO, SingleColumnInt64TableRead) { + std::vector values(100, 128); + std::unique_ptr file_reader = Int64File(values, 1); + + std::shared_ptr
out; + ReadTableFromFile(std::move(file_reader), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + Int64Array* out_array = static_cast(chunked_array->chunk(0).get()); + for (size_t i = 0; i < values.size(); i++) { + EXPECT_EQ(values[i], out_array->raw_data()[i]); + } +} + TEST_F(TestParquetIO, SingleColumnInt64ChunkedRead) { std::vector values(100, 128); std::unique_ptr file_reader = Int64File(values, 4); @@ -150,10 +189,28 @@ TEST_F(TestParquetIO, SingleColumnInt64ChunkedRead) { } } +TEST_F(TestParquetIO, SingleColumnInt64ChunkedTableRead) { + std::vector values(100, 128); + std::unique_ptr file_reader = Int64File(values, 4); + + std::shared_ptr
out; + ReadTableFromFile(std::move(file_reader), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + Int64Array* out_array = static_cast(chunked_array->chunk(0).get()); + for (size_t i = 0; i < values.size(); i++) { + EXPECT_EQ(values[i], out_array->raw_data()[i]); + } +} + TEST_F(TestParquetIO, SingleColumnInt64Write) { std::shared_ptr values = NonNullArray(100, 128); - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); + std::shared_ptr schema = + MakeSchema(ParquetType::INT64, Repetition::REQUIRED); FileWriter writer(default_memory_pool(), MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); @@ -164,11 +221,29 @@ TEST_F(TestParquetIO, SingleColumnInt64Write) { ASSERT_TRUE(values->Equals(out)); } +TEST_F(TestParquetIO, SingleColumnTableInt64Write) { + std::shared_ptr values = NonNullArray(100, 128); + std::shared_ptr
<Table> table = MakeSimpleTable(values, false);
+  sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(ASSERT_OK(
+      WriteFlatTable(table.get(), default_memory_pool(), sink_, values->length())));
+
+  std::shared_ptr<Table>
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + TEST_F(TestParquetIO, SingleColumnDoubleReadWrite) { // This also tests max_definition_level = 1 std::shared_ptr values = NullableArray(100, 128, 10); - std::shared_ptr schema = Schema(ParquetType::DOUBLE, Repetition::OPTIONAL); + std::shared_ptr schema = + MakeSchema(ParquetType::DOUBLE, Repetition::OPTIONAL); FileWriter writer(default_memory_pool(), MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); @@ -179,11 +254,30 @@ TEST_F(TestParquetIO, SingleColumnDoubleReadWrite) { ASSERT_TRUE(values->Equals(out)); } +TEST_F(TestParquetIO, SingleColumnTableDoubleReadWrite) { + // This also tests max_definition_level = 1 + std::shared_ptr values = NullableArray(100, 128, 10); + std::shared_ptr
<Table> table = MakeSimpleTable(values, true);
+  sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(ASSERT_OK(
+      WriteFlatTable(table.get(), default_memory_pool(), sink_, values->length())));
+
+  std::shared_ptr<Table>
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(100, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { std::shared_ptr values = NonNullArray(100, 128); std::shared_ptr values_chunk = NonNullArray(25, 128); - std::shared_ptr schema = Schema(ParquetType::INT64, Repetition::REQUIRED); + std::shared_ptr schema = + MakeSchema(ParquetType::INT64, Repetition::REQUIRED); FileWriter writer(default_memory_pool(), MakeWriter(schema)); for (int i = 0; i < 4; i++) { ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length()))); @@ -196,13 +290,31 @@ TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { ASSERT_TRUE(values->Equals(out)); } +TEST_F(TestParquetIO, SingleColumnTableInt64ChunkedWrite) { + std::shared_ptr values = NonNullArray(1000, 128); + std::shared_ptr
<Table> table = MakeSimpleTable(values, false);
+  sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(
+      ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), sink_, 512)));
+
+  std::shared_ptr<Table>
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(1000, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { std::shared_ptr values = NullableArray(100, 128, 10); std::shared_ptr values_chunk_nulls = NullableArray(25, 128, 10); std::shared_ptr values_chunk = NullableArray(25, 128, 0); - std::shared_ptr schema = Schema(ParquetType::DOUBLE, Repetition::OPTIONAL); + std::shared_ptr schema = + MakeSchema(ParquetType::DOUBLE, Repetition::OPTIONAL); FileWriter writer(default_memory_pool(), MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk_nulls->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk_nulls.get()))); @@ -217,6 +329,24 @@ TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { ASSERT_TRUE(values->Equals(out)); } +TEST_F(TestParquetIO, SingleColumnTableDoubleChunkedWrite) { + // This also tests max_definition_level = 1 + std::shared_ptr values = NullableArray(1000, 128, 100); + std::shared_ptr
<Table> table = MakeSimpleTable(values, true);
+  sink_ = std::make_shared<InMemoryOutputStream>();
+  ASSERT_NO_THROW(
+      ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), sink_, 512)));
+
+  std::shared_ptr<Table>
out; + ReadTableFromFile(ReaderFromSink(), &out); + ASSERT_EQ(1, out->num_columns()); + ASSERT_EQ(1000, out->num_rows()); + + std::shared_ptr chunked_array = out->column(0)->data(); + ASSERT_EQ(1, chunked_array->num_chunks()); + ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); +} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 346de253606..1b97aead159 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -18,10 +18,14 @@ #include "arrow/parquet/reader.h" #include +#include +#include +#include "arrow/column.h" #include "arrow/parquet/schema.h" #include "arrow/parquet/utils.h" #include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/types/primitive.h" #include "arrow/util/status.h" @@ -40,6 +44,7 @@ class FileReader::Impl { bool CheckForFlatColumn(const ::parquet::ColumnDescriptor* descr); Status GetFlatColumn(int i, std::unique_ptr* out); Status ReadFlatColumn(int i, std::shared_ptr* out); + Status ReadFlatTable(std::shared_ptr
<Table>* out);
 
  private:
   MemoryPool* pool_;
@@ -103,6 +108,23 @@ Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr<Array>* out) {
   return flat_column_reader->NextBatch(reader_->num_rows(), out);
 }
 
+Status FileReader::Impl::ReadFlatTable(std::shared_ptr
<Table>* table) {
+  const std::string name = reader_->descr()->schema()->name();
+  std::shared_ptr<Schema> schema;
+  RETURN_NOT_OK(FromParquetSchema(reader_->descr(), &schema));
+
+  std::vector<std::shared_ptr<Column>> columns;
+  for (int i = 0; i < reader_->num_columns(); i++) {
+    std::shared_ptr<Array> array;
+    RETURN_NOT_OK(ReadFlatColumn(i, &array));
+    auto column = std::make_shared<Column>(schema->field(i), array);
+    columns.push_back(column);
+  }
+
+  *table = std::make_shared
<Table>(name, schema, columns);
+  return Status::OK();
+}
+
 FileReader::FileReader(
     MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileReader> reader)
     : impl_(new FileReader::Impl(pool, std::move(reader))) {}
@@ -117,6 +139,10 @@ Status FileReader::ReadFlatColumn(int i, std::shared_ptr<Array>* out) {
   return impl_->ReadFlatColumn(i, out);
 }
 
+Status FileReader::ReadFlatTable(std::shared_ptr<Table>
* out) { + return impl_->ReadFlatTable(out); +} + FlatColumnReader::Impl::Impl(MemoryPool* pool, const ::parquet::ColumnDescriptor* descr, ::parquet::ParquetFileReader* reader, int column_index) : pool_(pool), diff --git a/cpp/src/arrow/parquet/reader.h b/cpp/src/arrow/parquet/reader.h index 41ca7eb35b9..db7a15753d8 100644 --- a/cpp/src/arrow/parquet/reader.h +++ b/cpp/src/arrow/parquet/reader.h @@ -29,6 +29,7 @@ class Array; class MemoryPool; class RowBatch; class Status; +class Table; namespace parquet { @@ -90,6 +91,8 @@ class FileReader { Status GetFlatColumn(int i, std::unique_ptr* out); // Read column as a whole into an Array. Status ReadFlatColumn(int i, std::shared_ptr* out); + // Read a table of flat columns into a Table. + Status ReadFlatTable(std::shared_ptr
* out); virtual ~FileReader(); diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 3ad2c5b0735..e94ff1973bb 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -17,11 +17,21 @@ #include "arrow/parquet/writer.h" +#include +#include + #include "arrow/array.h" +#include "arrow/column.h" +#include "arrow/table.h" +#include "arrow/types/construct.h" #include "arrow/types/primitive.h" +#include "arrow/parquet/schema.h" #include "arrow/parquet/utils.h" #include "arrow/util/status.h" +using parquet::ParquetFileWriter; +using parquet::schema::GroupNode; + namespace arrow { namespace parquet { @@ -143,6 +153,82 @@ Status FileWriter::Close() { FileWriter::~FileWriter() {} +// Create a slice of a PrimitiveArray. +// +// This method is specially crafted for WriteFlatTable and assumes the following: +// * chunk_size is a multiple of 512 +Status TemporaryArraySlice(int64_t chunk, int64_t chunk_size, PrimitiveArray* array, + std::shared_ptr* out) { + // The last chunk may be smaller than the chunk_size + int64_t size = std::min(chunk_size, array->length() - chunk * chunk_size); + int64_t buffer_offset = chunk * chunk_size * array->type()->value_size(); + int64_t value_size = size * array->type()->value_size(); + auto chunk_buffer = std::make_shared(array->data(), buffer_offset, value_size); + std::shared_ptr null_bitmap; + int32_t null_count = 0; + if (array->null_count() > 0) { + int64_t null_offset = (chunk * chunk_size) / 8; + int64_t null_size = util::ceil_byte(size) / 8; + null_bitmap = std::make_shared(array->null_bitmap(), null_offset, null_size); + for (int64_t k = 0; k < size; k++) { + if (!util::get_bit(null_bitmap->data(), k)) { null_count++; } + } + } + std::shared_ptr out_array; + RETURN_NOT_OK(MakePrimitiveArray( + array->type(), size, chunk_buffer, null_count, null_bitmap, &out_array)); + *out = std::static_pointer_cast(out_array); + return Status::OK(); +} + +Status WriteFlatTable(const Table* table, MemoryPool* pool, + std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size) { + // Ensure alignment of sliced PrimitiveArray, esp. the null bitmap + // TODO: Support other chunksizes than multiples of 512 + if (((chunk_size & 511) != 0) && (chunk_size != table->num_rows())) { + return Status::NotImplemented( + "Only chunk sizes that are a multiple of 512 are supported"); + } + + std::shared_ptr<::parquet::SchemaDescriptor> parquet_schema; + RETURN_NOT_OK(ToParquetSchema(table->schema().get(), &parquet_schema)); + auto schema_node = std::static_pointer_cast(parquet_schema->schema()); + std::unique_ptr parquet_writer = + ParquetFileWriter::Open(sink, schema_node); + FileWriter writer(pool, std::move(parquet_writer)); + + // TODO: Support writing chunked arrays. + for (int i = 0; i < table->num_columns(); i++) { + if (table->column(i)->data()->num_chunks() != 1) { + return Status::NotImplemented("No support for writing chunked arrays yet."); + } + } + + // Cast to PrimitiveArray instances as we work with them. 
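For intuition about the multiple-of-512 restriction enforced above: each row group is written from a slice of the column, which needs a byte offset into the value buffer and into the validity bitmap. The bitmap stores one bit per row, so a slice can only start cleanly on a whole byte (and 512 rows keeps slices on a 64-byte boundary, presumably matching Arrow's usual buffer alignment). The standalone sketch below of that arithmetic is purely illustrative and not part of the patch:

    #include <cstdint>
    #include <iostream>

    int main() {
      const int64_t chunk_size = 512;  // rows per Parquet row group
      const int64_t value_width = 8;   // bytes per value, e.g. int64 or double
      for (int64_t chunk = 0; chunk < 3; ++chunk) {
        int64_t rows_before = chunk * chunk_size;
        // Offset into the value buffer is always an exact number of bytes.
        int64_t buffer_offset = rows_before * value_width;
        // The null bitmap holds one bit per row; this division is only exact
        // when rows_before is a multiple of 8, which a chunk_size that is a
        // multiple of 512 guarantees.
        int64_t bitmap_offset = rows_before / 8;
        std::cout << "chunk " << chunk << ": values at byte " << buffer_offset
                  << ", bitmap at byte " << bitmap_offset
                  << " (byte-aligned: " << (rows_before % 8 == 0 ? "yes" : "no")
                  << ")\n";
      }
      return 0;
    }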
+ std::vector> arrays(table->num_columns()); + for (int i = 0; i < table->num_columns(); i++) { + // num_chunks == 1 as per above loop + std::shared_ptr array = table->column(i)->data()->chunk(0); + auto primitive_array = std::dynamic_pointer_cast(array); + if (!primitive_array) { + return Status::NotImplemented("Table must consist of PrimitiveArray instances"); + } + arrays[i] = primitive_array; + } + + for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { + int64_t size = std::min(chunk_size, table->num_rows() - chunk * chunk_size); + RETURN_NOT_OK(writer.NewRowGroup(size)); + for (int i = 0; i < table->num_columns(); i++) { + std::shared_ptr array; + RETURN_NOT_OK(TemporaryArraySlice(chunk, chunk_size, arrays[i].get(), &array)); + RETURN_NOT_OK(writer.WriteFlatColumnChunk(array.get())); + } + } + + return writer.Close(); +} + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 38f7d0b3a89..64625021c11 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -29,6 +29,7 @@ class MemoryPool; class PrimitiveArray; class RowBatch; class Status; +class Table; namespace parquet { @@ -52,6 +53,14 @@ class FileWriter { std::unique_ptr impl_; }; +/** + * Write a flat Table to Parquet. + * + * The table shall only consist of nullable, non-repeated columns of primitive type. + */ +Status WriteFlatTable(const Table* table, MemoryPool* pool, + std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size); + } // namespace parquet } // namespace arrow diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 2173232d4ef..30c37054aaa 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -344,6 +344,8 @@ find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) ADD_THIRDPARTY_LIB(arrow SHARED_LIB ${ARROW_SHARED_LIB}) +ADD_THIRDPARTY_LIB(arrow_parquet + SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) ############################################################ # Linker setup @@ -414,6 +416,7 @@ set(PYARROW_SRCS src/pyarrow/common.cc src/pyarrow/config.cc src/pyarrow/helpers.cc + src/pyarrow/parquet.cc src/pyarrow/status.cc src/pyarrow/adapters/builtin.cc @@ -422,6 +425,7 @@ set(PYARROW_SRCS set(LINK_LIBS arrow + arrow_parquet ) SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) @@ -442,6 +446,7 @@ set(CYTHON_EXTENSIONS array config error + parquet scalar schema table diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 3d9983849eb..f0b258ed027 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -42,19 +42,27 @@ find_library(ARROW_LIB_PATH NAMES arrow ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) -if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) +find_library(ARROW_PARQUET_LIB_PATH NAMES arrow_parquet + PATHS + ${ARROW_SEARCH_LIB_PATH} + NO_DEFAULT_PATH) + +if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) + set(ARROW_PARQUET_LIB_NAME libarrow_parquet) set(ARROW_LIBS ${ARROW_SEARCH_LIB_PATH}) set(ARROW_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_LIB_NAME}.a) set(ARROW_SHARED_LIB ${ARROW_LIBS}/${ARROW_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) + set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) else () set(ARROW_FOUND FALSE) endif () if (ARROW_FOUND) if (NOT Arrow_FIND_QUIETLY) - 
message(STATUS "Found the Arrow library: ${ARROW_LIB_PATH}") + message(STATUS "Found the Arrow library: ${ARROW_LIB_PATH}, ${ARROW_PARQUET_LIB_PATH}") endif () else () if (NOT Arrow_FIND_QUIETLY) @@ -74,4 +82,6 @@ mark_as_advanced( ARROW_LIBS ARROW_STATIC_LIB ARROW_SHARED_LIB + ARROW_PARQUET_STATIC_LIB + ARROW_PARQUET_SHARED_LIB ) diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index a80b3ce8398..6d63c321931 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -68,6 +68,14 @@ cdef class Array: values = array_format(self, window=10) return '{0}\n{1}'.format(type_format, values) + def __richcmp__(Array self, Array other, int op): + if op == cpython.Py_EQ: + return self.ap.Equals(other.sp_array) + elif op == cpython.Py_NE: + return not self.ap.Equals(other.sp_array) + else: + raise TypeError('Invalid comparison') + def __len__(self): if self.sp_array.get(): return self.sp_array.get().length() diff --git a/python/pyarrow/error.pxd b/python/pyarrow/error.pxd index d226abeda04..97ba0ef2e9f 100644 --- a/python/pyarrow/error.pxd +++ b/python/pyarrow/error.pxd @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. +from pyarrow.includes.libarrow cimport CStatus from pyarrow.includes.pyarrow cimport * +cdef check_cstatus(const CStatus& status) cdef check_status(const Status& status) diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index 3f8d7dd6460..51e06101bf6 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -21,6 +21,13 @@ from pyarrow.compat import frombytes class ArrowException(Exception): pass +cdef check_cstatus(const CStatus& status): + if status.ok(): + return + + cdef c_string c_message = status.ToString() + raise ArrowException(frombytes(c_message)) + cdef check_status(const Status& status): if status.ok(): return diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index e86d5d77e8b..5c3df6401bc 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -34,6 +34,11 @@ cdef extern from "": cdef extern from "" namespace "std" nogil: + cdef cppclass unique_ptr[T]: + unique_ptr() + unique_ptr(T*) + T* get() + cdef cppclass shared_ptr[T]: shared_ptr() shared_ptr(T*) diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index b2ef45a347b..90414e3d542 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -72,6 +72,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass MemoryPool" arrow::MemoryPool": int64_t bytes_allocated() + cdef MemoryPool* default_memory_pool() + cdef cppclass CListType" arrow::ListType"(CDataType): CListType(const shared_ptr[CDataType]& value_type) @@ -103,6 +105,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: int32_t null_count() Type type_enum() + c_bool Equals(const shared_ptr[CArray]& arr) c_bool IsNull(int i) cdef cppclass CBooleanArray" arrow::BooleanArray"(CArray): diff --git a/python/pyarrow/includes/parquet.pxd b/python/pyarrow/includes/parquet.pxd index ffdc5d48706..0918344070e 100644 --- a/python/pyarrow/includes/parquet.pxd +++ b/python/pyarrow/includes/parquet.pxd @@ -18,6 +18,26 @@ # distutils: language = c++ from pyarrow.includes.common cimport * +from pyarrow.includes.libarrow cimport CSchema, CStatus, CTable, MemoryPool + + +cdef extern from "parquet/api/schema.h" namespace "parquet::schema" nogil: + cdef cppclass Node: + pass + + cdef cppclass GroupNode(Node): + pass + 
+ cdef cppclass PrimitiveNode(Node): + pass + +cdef extern from "parquet/api/schema.h" namespace "parquet" nogil: + cdef cppclass SchemaDescriptor: + shared_ptr[Node] schema() + GroupNode* group() + + cdef cppclass ColumnDescriptor: + pass cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: cdef cppclass ColumnReader: @@ -48,4 +68,30 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: pass cdef cppclass ParquetFileReader: + # TODO: Some default arguments are missing + @staticmethod + unique_ptr[ParquetFileReader] OpenFile(const c_string& path) + +cdef extern from "parquet/api/writer.h" namespace "parquet" nogil: + cdef cppclass OutputStream: pass + + cdef cppclass LocalFileOutputStream(OutputStream): + LocalFileOutputStream(const c_string& path) + void Close() + + +cdef extern from "arrow/parquet/reader.h" namespace "arrow::parquet" nogil: + cdef cppclass FileReader: + FileReader(MemoryPool* pool, unique_ptr[ParquetFileReader] reader) + CStatus ReadFlatTable(shared_ptr[CTable]* out); + + +cdef extern from "arrow/parquet/schema.h" namespace "arrow::parquet" nogil: + CStatus FromParquetSchema(const SchemaDescriptor* parquet_schema, shared_ptr[CSchema]* out) + CStatus ToParquetSchema(const CSchema* arrow_schema, shared_ptr[SchemaDescriptor]* out) + + +cdef extern from "arrow/parquet/writer.h" namespace "arrow::parquet" nogil: + cdef CStatus WriteFlatTable(const CTable* table, MemoryPool* pool, shared_ptr[OutputStream] sink, int64_t chunk_size) + diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 622e7d07724..076a353bb87 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -19,5 +19,44 @@ # distutils: language = c++ # cython: embedsignature = True -from pyarrow.compat import frombytes, tobytes +from pyarrow.includes.libarrow cimport * +cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.includes.parquet cimport * + +from pyarrow.error cimport check_cstatus +from pyarrow.table cimport Table + +def read_table(filename, columns=None): + """ + Read a Table from Parquet format + Returns + ------- + table: pyarrow.Table + """ + cdef unique_ptr[FileReader] reader + cdef Table table = Table() + cdef shared_ptr[CTable] ctable + + # Must be in one expression to avoid calling std::move which is not possible + # in Cython (due to missing rvalue support) + reader = unique_ptr[FileReader](new FileReader(default_memory_pool(), + ParquetFileReader.OpenFile(filename))) + check_cstatus(reader.get().ReadFlatTable(&ctable)) + table.init(ctable) + return table + +def write_table(table, filename, chunk_size=None): + """ + Write a Table to Parquet format + """ + cdef Table table_ = table + cdef CTable* ctable_ = table_.table + cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream](new LocalFileOutputStream(filename)) + cdef int64_t chunk_size_ = 0 + if chunk_size is None: + chunk_size_ = ctable_.num_rows() + else: + chunk_size_ = chunk_size + + check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, chunk_size_)) + diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py new file mode 100644 index 00000000000..a84fc2785cd --- /dev/null +++ b/python/pyarrow/tests/test_parquet.py @@ -0,0 +1,50 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from pyarrow.compat import unittest +import pyarrow as arrow +import pyarrow.parquet + +A = arrow + +from shutil import rmtree +from tempfile import mkdtemp + +import os.path + + +class TestParquetIO(unittest.TestCase): + + def setUp(self): + self.temp_directory = mkdtemp() + + def tearDown(self): + rmtree(self.temp_directory) + + + def test_single_int64_column(self): + filename = os.path.join(self.temp_directory, 'single_int64_column.parquet') + data = [A.from_pylist(range(5))] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + A.parquet.write_table(table, filename) + table_read = pyarrow.parquet.read_table(filename) + for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + assert col_written.name == col_read.name + assert col_read.data.num_chunks == 1 + data_written = col_written.data.chunk(0) + data_read = col_read.data.chunk(0) + assert data_written == data_read diff --git a/python/setup.py b/python/setup.py index 5f228ed0af2..1c55198ec8a 100644 --- a/python/setup.py +++ b/python/setup.py @@ -214,7 +214,7 @@ def get_ext_built(self, name): return name + suffix def get_cmake_cython_names(self): - return ['array', 'config', 'error', 'scalar', 'schema', 'table'] + return ['array', 'config', 'error', 'parquet', 'scalar', 'schema', 'table'] def get_names(self): return self._found_names diff --git a/python/src/pyarrow/parquet.cc b/python/src/pyarrow/parquet.cc new file mode 100644 index 00000000000..2e524983d6f --- /dev/null +++ b/python/src/pyarrow/parquet.cc @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "pyarrow/parquet.h" + +#include +#include +#include + +using namespace arrow; + +namespace pyarrow { + + +} // namespace pyarrow + diff --git a/python/src/pyarrow/parquet.h b/python/src/pyarrow/parquet.h new file mode 100644 index 00000000000..8d1d79a4980 --- /dev/null +++ b/python/src/pyarrow/parquet.h @@ -0,0 +1,30 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_PARQUET_H +#define PYARROW_PARQUET_H + +#include +#include + +namespace pyarrow { + + +} // namespace pyarrow + +#endif // PYARROW_PARQUET_H + From 7192cfbd316835833f3489c52cba7132ff52a1db Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 30 May 2016 09:09:54 +0200 Subject: [PATCH 02/28] Add const to slicing parameters --- ci/travis_before_script_cpp.sh | 2 +- cpp/src/arrow/parquet/writer.cc | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 193c76feba1..bf9ec58dbd9 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -19,7 +19,7 @@ echo $GTEST_HOME : ${ARROW_CPP_INSTALL=$TRAVIS_BUILD_DIR/cpp-install} -CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" +CMAKE_COMMON_FLAGS="-DARROW_BUILD_BENCHMARKS=ON -DARROW_PARQUET=ON -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" if [ $TRAVIS_OS_NAME == "linux" ]; then cmake -DARROW_TEST_MEMCHECK=on $CMAKE_COMMON_FLAGS -DCMAKE_CXX_FLAGS="-Werror" $CPP_DIR diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index e94ff1973bb..742b8ba6852 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -157,12 +157,12 @@ FileWriter::~FileWriter() {} // // This method is specially crafted for WriteFlatTable and assumes the following: // * chunk_size is a multiple of 512 -Status TemporaryArraySlice(int64_t chunk, int64_t chunk_size, PrimitiveArray* array, +Status TemporaryArraySlice(int64_t chunk, int64_t chunk_size, const PrimitiveArray* array, std::shared_ptr* out) { // The last chunk may be smaller than the chunk_size - int64_t size = std::min(chunk_size, array->length() - chunk * chunk_size); - int64_t buffer_offset = chunk * chunk_size * array->type()->value_size(); - int64_t value_size = size * array->type()->value_size(); + const int64_t size = std::min(chunk_size, array->length() - chunk * chunk_size); + const int64_t buffer_offset = chunk * chunk_size * array->type()->value_size(); + const int64_t value_size = size * array->type()->value_size(); auto chunk_buffer = std::make_shared(array->data(), buffer_offset, value_size); std::shared_ptr null_bitmap; int32_t null_count = 0; From 081db5f31c80668ce44e615d09fe8b479208a25b Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Mon, 30 May 2016 09:10:06 +0200 Subject: [PATCH 03/28] Limit and document chunk_size --- python/pyarrow/parquet.pyx | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 076a353bb87..78837a45988 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -48,13 +48,20 @@ def read_table(filename, columns=None): def write_table(table, filename, chunk_size=None): """ Write a Table to Parquet format + + Parameters + ---------- + table : pyarrow.Table + filename : string + chunk_size : int + The maximum number of rows in each Parquet RowGroup """ cdef Table table_ = table cdef CTable* ctable_ = table_.table cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream](new LocalFileOutputStream(filename)) cdef int64_t chunk_size_ = 0 if chunk_size is None: - chunk_size_ = ctable_.num_rows() + chunk_size_ = max(ctable_.num_rows(), int(2**16)) else: chunk_size_ = chunk_size From 0fbed3f2c583eab902b8fc6c32d2a4918a308b67 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Mon, 30 May 2016 09:11:40 +0200 Subject: [PATCH 04/28] Remove obsolete parquet files --- python/CMakeLists.txt | 1 - python/src/pyarrow/parquet.cc | 30 ------------------------------ python/src/pyarrow/parquet.h | 30 ------------------------------ 3 files changed, 61 deletions(-) delete mode 100644 python/src/pyarrow/parquet.cc delete mode 100644 python/src/pyarrow/parquet.h diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 30c37054aaa..7e013c07b31 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -416,7 +416,6 @@ set(PYARROW_SRCS src/pyarrow/common.cc src/pyarrow/config.cc src/pyarrow/helpers.cc - src/pyarrow/parquet.cc src/pyarrow/status.cc src/pyarrow/adapters/builtin.cc diff --git a/python/src/pyarrow/parquet.cc b/python/src/pyarrow/parquet.cc deleted file mode 100644 index 2e524983d6f..00000000000 --- a/python/src/pyarrow/parquet.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#include "pyarrow/parquet.h" - -#include -#include -#include - -using namespace arrow; - -namespace pyarrow { - - -} // namespace pyarrow - diff --git a/python/src/pyarrow/parquet.h b/python/src/pyarrow/parquet.h deleted file mode 100644 index 8d1d79a4980..00000000000 --- a/python/src/pyarrow/parquet.h +++ /dev/null @@ -1,30 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. 
You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef PYARROW_PARQUET_H -#define PYARROW_PARQUET_H - -#include -#include - -namespace pyarrow { - - -} // namespace pyarrow - -#endif // PYARROW_PARQUET_H - From be6415c393a5f43d21bbce2d71fd908f81b9e526 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Tue, 31 May 2016 09:22:42 +0200 Subject: [PATCH 05/28] Incorportate review comments --- cpp/src/arrow/parquet/reader.cc | 7 +++---- python/pyarrow/tests/test_parquet.py | 1 - 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/parquet/reader.cc b/cpp/src/arrow/parquet/reader.cc index 1b97aead159..3b4882d4439 100644 --- a/cpp/src/arrow/parquet/reader.cc +++ b/cpp/src/arrow/parquet/reader.cc @@ -109,16 +109,15 @@ Status FileReader::Impl::ReadFlatColumn(int i, std::shared_ptr* out) { } Status FileReader::Impl::ReadFlatTable(std::shared_ptr
<Table>* table) {
-  const std::string name = reader_->descr()->schema()->name();
+  const std::string& name = reader_->descr()->schema()->name();
   std::shared_ptr<Schema> schema;
   RETURN_NOT_OK(FromParquetSchema(reader_->descr(), &schema));
 
-  std::vector<std::shared_ptr<Column>> columns;
+  std::vector<std::shared_ptr<Column>> columns(reader_->num_columns());
   for (int i = 0; i < reader_->num_columns(); i++) {
     std::shared_ptr<Array> array;
     RETURN_NOT_OK(ReadFlatColumn(i, &array));
-    auto column = std::make_shared<Column>(schema->field(i), array);
-    columns.push_back(column);
+    columns[i] = std::make_shared<Column>(schema->field(i), array);
   }
 
   *table = std::make_shared<Table>
(name, schema, columns); diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index a84fc2785cd..99b2c7e09ec 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -35,7 +35,6 @@ def setUp(self): def tearDown(self): rmtree(self.temp_directory) - def test_single_int64_column(self): filename = os.path.join(self.temp_directory, 'single_int64_column.parquet') data = [A.from_pylist(range(5))] From 9b06e417f7341261d537ba193e643aee0ea93302 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 3 Jun 2016 18:02:27 +0200 Subject: [PATCH 06/28] Make tests templated --- cpp/src/arrow/parquet/parquet-io-test.cc | 232 ++++++++++++----------- 1 file changed, 117 insertions(+), 115 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 218e8916a6b..474d167628b 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -18,6 +18,7 @@ #include "gtest/gtest.h" #include "arrow/test-util.h" +#include "arrow/parquet/test-util.h" #include "arrow/parquet/reader.h" #include "arrow/parquet/writer.h" #include "arrow/types/primitive.h" @@ -44,48 +45,36 @@ namespace arrow { namespace parquet { -template -std::shared_ptr NonNullArray( - size_t size, typename ArrowType::c_type value) { - std::vector values(size, value); - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size()); - return std::static_pointer_cast(builder.Finish()); -} +const int SMALL_SIZE = 100; +const int LARGE_SIZE = 10000; -// This helper function only supports (size/2) nulls yet. -template -std::shared_ptr NullableArray( - size_t size, typename ArrowType::c_type value, size_t num_nulls) { - std::vector values(size, value); - std::vector valid_bytes(size, 1); +template +struct test_traits {}; - for (size_t i = 0; i < num_nulls; i++) { - valid_bytes[i * 2] = 0; - } +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT32; +}; - NumericBuilder builder(default_memory_pool(), std::make_shared()); - builder.Append(values.data(), values.size(), valid_bytes.data()); - return std::static_pointer_cast(builder.Finish()); -} +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::INT64; +}; -std::shared_ptr MakeColumn(const std::string& name, - const std::shared_ptr& array, bool nullable) { - auto field = std::make_shared(name, array->type(), nullable); - return std::make_shared(field, array); -} +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::FLOAT; +}; -std::shared_ptr
MakeSimpleTable( - const std::shared_ptr& values, bool nullable) { - std::shared_ptr column = MakeColumn("col", values, nullable); - std::vector> columns({column}); - std::vector> fields({column->field()}); - auto schema = std::make_shared(fields); - return std::make_shared
("table", schema, columns); -} +template <> +struct test_traits { + static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; +}; +template class TestParquetIO : public ::testing::Test { public: + typedef typename TestType::c_type T; virtual void SetUp() {} std::shared_ptr MakeSchema( @@ -113,7 +102,7 @@ class TestParquetIO : public ::testing::Test { std::unique_ptr column_reader; ASSERT_NO_THROW(ASSERT_OK(reader.GetFlatColumn(0, &column_reader))); ASSERT_NE(nullptr, column_reader.get()); - ASSERT_OK(column_reader->NextBatch(100, out)); + ASSERT_OK(column_reader->NextBatch(SMALL_SIZE, out)); ASSERT_NE(nullptr, out->get()); } @@ -143,93 +132,104 @@ class TestParquetIO : public ::testing::Test { return ReaderFromSink(); } + std::unique_ptr TestFile(std::vector& values, int num_chunks) { + std::shared_ptr schema = + MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + std::unique_ptr file_writer = MakeWriter(schema); + size_t chunk_size = values.size() / num_chunks; + for (int i = 0; i < num_chunks; i++) { + auto row_group_writer = file_writer->AppendRowGroup(chunk_size); + auto column_writer = static_cast<::parquet::TypedColumnWriter< + ::parquet::DataType::parquet_enum>>*>( + row_group_writer->NextColumn()); + T* data = values.data() + i * chunk_size; + column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); + column_writer->Close(); + row_group_writer->Close(); + } + file_writer->Close(); + return ReaderFromSink(); + } + std::shared_ptr sink_; }; -TEST_F(TestParquetIO, SingleColumnInt64Read) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 1); +typedef ::testing::Types TestTypes; + +TYPED_TEST_CASE(TestParquetIO, TestTypes); + +TYPED_TEST(TestParquetIO, SingleColumnRequiredRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 1); std::shared_ptr out; - ReadSingleColumnFile(std::move(file_reader), &out); + this->ReadSingleColumnFile(std::move(file_reader), &out); - Int64Array* out_array = static_cast(out.get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), out.get()); } -TEST_F(TestParquetIO, SingleColumnInt64TableRead) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 1); +TYPED_TEST(TestParquetIO, SingleColumnRequiredTableRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 1); std::shared_ptr
out; - ReadTableFromFile(std::move(file_reader), &out); + this->ReadTableFromFile(std::move(file_reader), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(100, out->num_rows()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); - Int64Array* out_array = static_cast(chunked_array->chunk(0).get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), chunked_array->chunk(0).get()); } -TEST_F(TestParquetIO, SingleColumnInt64ChunkedRead) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 4); +TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 4); std::shared_ptr out; - ReadSingleColumnFile(std::move(file_reader), &out); + this->ReadSingleColumnFile(std::move(file_reader), &out); - Int64Array* out_array = static_cast(out.get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), out.get()); } -TEST_F(TestParquetIO, SingleColumnInt64ChunkedTableRead) { - std::vector values(100, 128); - std::unique_ptr file_reader = Int64File(values, 4); +TYPED_TEST(TestParquetIO, SingleColumnRequiredChunkedTableRead) { + std::vector values(SMALL_SIZE, 128); + std::unique_ptr file_reader = this->TestFile(values, 4); std::shared_ptr
out; - ReadTableFromFile(std::move(file_reader), &out); + this->ReadTableFromFile(std::move(file_reader), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(100, out->num_rows()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); - Int64Array* out_array = static_cast(chunked_array->chunk(0).get()); - for (size_t i = 0; i < values.size(); i++) { - EXPECT_EQ(values[i], out_array->raw_data()[i]); - } + ExpectArray(values.data(), chunked_array->chunk(0).get()); } -TEST_F(TestParquetIO, SingleColumnInt64Write) { - std::shared_ptr values = NonNullArray(100, 128); +TYPED_TEST(TestParquetIO, SingleColumnRequiredWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); std::shared_ptr schema = - MakeSchema(ParquetType::INT64, Repetition::REQUIRED); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + this->MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnTableInt64Write) { - std::shared_ptr values = NonNullArray(100, 128); +TYPED_TEST(TestParquetIO, SingleColumnTableRequiredWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); std::shared_ptr
table = MakeSimpleTable(values, false); - sink_ = std::make_shared(); + this->sink_ = std::make_shared(); ASSERT_NO_THROW(ASSERT_OK( - WriteFlatTable(table.get(), default_memory_pool(), sink_, values->length()))); + WriteFlatTable(table.get(), default_memory_pool(), this->sink_, values->length()))); std::shared_ptr
out; - ReadTableFromFile(ReaderFromSink(), &out); + this->ReadTableFromFile(this->ReaderFromSink(), &out); ASSERT_EQ(1, out->num_columns()); ASSERT_EQ(100, out->num_rows()); @@ -238,47 +238,48 @@ TEST_F(TestParquetIO, SingleColumnTableInt64Write) { ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); } -TEST_F(TestParquetIO, SingleColumnDoubleReadWrite) { +TYPED_TEST(TestParquetIO, SingleColumnOptionalReadWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(100, 128, 10); + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); std::shared_ptr schema = - MakeSchema(ParquetType::DOUBLE, Repetition::OPTIONAL); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + this->MakeSchema(test_traits::parquet_enum, Repetition::OPTIONAL); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values.get()))); ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnTableDoubleReadWrite) { +TYPED_TEST(TestParquetIO, SingleColumnTableOptionalReadWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(100, 128, 10); + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); std::shared_ptr
table = MakeSimpleTable(values, true); - sink_ = std::make_shared(); + this->sink_ = std::make_shared(); ASSERT_NO_THROW(ASSERT_OK( - WriteFlatTable(table.get(), default_memory_pool(), sink_, values->length()))); + WriteFlatTable(table.get(), default_memory_pool(), this->sink_, values->length()))); std::shared_ptr
out; - ReadTableFromFile(ReaderFromSink(), &out); + this->ReadTableFromFile(this->ReaderFromSink(), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(100, out->num_rows()); + ASSERT_EQ(SMALL_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); } -TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { - std::shared_ptr values = NonNullArray(100, 128); - std::shared_ptr values_chunk = NonNullArray(25, 128); +TYPED_TEST(TestParquetIO, SingleColumnIntRequiredChunkedWrite) { + std::shared_ptr values = NonNullArray(SMALL_SIZE, 128); + std::shared_ptr values_chunk = + NonNullArray(SMALL_SIZE / 4, 128); std::shared_ptr schema = - MakeSchema(ParquetType::INT64, Repetition::REQUIRED); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + this->MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); for (int i = 0; i < 4; i++) { ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk.get()))); @@ -286,36 +287,37 @@ TEST_F(TestParquetIO, SingleColumnInt64ChunkedWrite) { ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnTableInt64ChunkedWrite) { - std::shared_ptr values = NonNullArray(1000, 128); +TYPED_TEST(TestParquetIO, SingleColumnTableRequiredChunkedWrite) { + std::shared_ptr values = NonNullArray(LARGE_SIZE, 128); std::shared_ptr
table = MakeSimpleTable(values, false); - sink_ = std::make_shared(); + this->sink_ = std::make_shared(); ASSERT_NO_THROW( - ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), sink_, 512))); + ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512))); std::shared_ptr
out; - ReadTableFromFile(ReaderFromSink(), &out); + this->ReadTableFromFile(this->ReaderFromSink(), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(1000, out->num_rows()); + ASSERT_EQ(LARGE_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); } -TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { - std::shared_ptr values = NullableArray(100, 128, 10); +TYPED_TEST(TestParquetIO, SingleColumnOptionalChunkedWrite) { + std::shared_ptr values = NullableArray(SMALL_SIZE, 128, 10); std::shared_ptr values_chunk_nulls = - NullableArray(25, 128, 10); - std::shared_ptr values_chunk = NullableArray(25, 128, 0); + NullableArray(SMALL_SIZE / 4, 128, 10); + std::shared_ptr values_chunk = + NullableArray(SMALL_SIZE / 4, 128, 0); std::shared_ptr schema = - MakeSchema(ParquetType::DOUBLE, Repetition::OPTIONAL); - FileWriter writer(default_memory_pool(), MakeWriter(schema)); + this->MakeSchema(test_traits::parquet_enum, Repetition::OPTIONAL); + FileWriter writer(default_memory_pool(), this->MakeWriter(schema)); ASSERT_NO_THROW(ASSERT_OK(writer.NewRowGroup(values_chunk_nulls->length()))); ASSERT_NO_THROW(ASSERT_OK(writer.WriteFlatColumnChunk(values_chunk_nulls.get()))); for (int i = 0; i < 3; i++) { @@ -325,22 +327,22 @@ TEST_F(TestParquetIO, SingleColumnDoubleChunkedWrite) { ASSERT_NO_THROW(ASSERT_OK(writer.Close())); std::shared_ptr out; - ReadSingleColumnFile(ReaderFromSink(), &out); + this->ReadSingleColumnFile(this->ReaderFromSink(), &out); ASSERT_TRUE(values->Equals(out)); } -TEST_F(TestParquetIO, SingleColumnTableDoubleChunkedWrite) { +TYPED_TEST(TestParquetIO, SingleColumnTableOptionalChunkedWrite) { // This also tests max_definition_level = 1 - std::shared_ptr values = NullableArray(1000, 128, 100); + std::shared_ptr values = NullableArray(LARGE_SIZE, 128, 100); std::shared_ptr
table = MakeSimpleTable(values, true); - sink_ = std::make_shared(); + this->sink_ = std::make_shared(); ASSERT_NO_THROW( - ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), sink_, 512))); + ASSERT_OK(WriteFlatTable(table.get(), default_memory_pool(), this->sink_, 512))); std::shared_ptr
out; - ReadTableFromFile(ReaderFromSink(), &out); + this->ReadTableFromFile(this->ReaderFromSink(), &out); ASSERT_EQ(1, out->num_columns()); - ASSERT_EQ(1000, out->num_rows()); + ASSERT_EQ(LARGE_SIZE, out->num_rows()); std::shared_ptr chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); From 5d4929ad0a29e22aa7ded6e74a30f1c3fa840481 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 3 Jun 2016 22:29:14 +0200 Subject: [PATCH 07/28] Add test-util.h --- cpp/src/arrow/parquet/test-util.h | 77 +++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 cpp/src/arrow/parquet/test-util.h diff --git a/cpp/src/arrow/parquet/test-util.h b/cpp/src/arrow/parquet/test-util.h new file mode 100644 index 00000000000..1496082d5c6 --- /dev/null +++ b/cpp/src/arrow/parquet/test-util.h @@ -0,0 +1,77 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "arrow/types/primitive.h" + +namespace arrow { + +namespace parquet { + +template +std::shared_ptr NonNullArray( + size_t size, typename ArrowType::c_type value) { + std::vector values(size, value); + NumericBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size()); + return std::static_pointer_cast(builder.Finish()); +} + +// This helper function only supports (size/2) nulls yet. +template +std::shared_ptr NullableArray( + size_t size, typename ArrowType::c_type value, size_t num_nulls) { + std::vector values(size, value); + std::vector valid_bytes(size, 1); + + for (size_t i = 0; i < num_nulls; i++) { + valid_bytes[i * 2] = 0; + } + + NumericBuilder builder(default_memory_pool(), std::make_shared()); + builder.Append(values.data(), values.size(), valid_bytes.data()); + return std::static_pointer_cast(builder.Finish()); +} + +std::shared_ptr MakeColumn(const std::string& name, + const std::shared_ptr& array, bool nullable) { + auto field = std::make_shared(name, array->type(), nullable); + return std::make_shared(field, array); +} + +std::shared_ptr
MakeSimpleTable( + const std::shared_ptr& values, bool nullable) { + std::shared_ptr column = MakeColumn("col", values, nullable); + std::vector> columns({column}); + std::vector> fields({column->field()}); + auto schema = std::make_shared(fields); + return std::make_shared
("table", schema, columns); +} + +template +void ExpectArray(T* expected, Array* result) { + PrimitiveArray* p_array = static_cast(result); + for (size_t i = 0; i < result->length(); i++) { + EXPECT_EQ(expected[i], reinterpret_cast(p_array->data()->data())[i]); + } +} + +} // namespace parquet + +} // namespace arrow From b505feba765d31eeb77cd0f008408883728c1f9e Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 3 Jun 2016 22:55:33 +0200 Subject: [PATCH 08/28] Install parquet-cpp via conda --- ci/travis_before_script_cpp.sh | 4 ++++ ci/travis_conda_build.sh | 22 +--------------------- ci/travis_install_conda.sh | 26 ++++++++++++++++++++++++++ 3 files changed, 31 insertions(+), 21 deletions(-) create mode 100644 ci/travis_install_conda.sh diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index bf9ec58dbd9..6159f67e361 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -2,6 +2,10 @@ set -e +source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh +conda install -y --channel apache/channel/dev parquet-cpp +export PARQUET_HOME=$MINICONDA + : ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/cpp-build} mkdir $CPP_BUILD_DIR diff --git a/ci/travis_conda_build.sh b/ci/travis_conda_build.sh index afa531dbd6b..c43a85170b0 100755 --- a/ci/travis_conda_build.sh +++ b/ci/travis_conda_build.sh @@ -2,27 +2,7 @@ set -e -if [ $TRAVIS_OS_NAME == "linux" ]; then - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" -else - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" -fi - -wget -O miniconda.sh $MINICONDA_URL -MINICONDA=$TRAVIS_BUILD_DIR/miniconda -bash miniconda.sh -b -p $MINICONDA -export PATH="$MINICONDA/bin:$PATH" -conda update -y -q conda -conda info -a - -conda config --set show_channel_urls yes -conda config --add channels conda-forge -conda config --add channels apache - -conda install --yes conda-build jinja2 anaconda-client - -# faster builds, please -conda install -y nomkl +source $TRAVIS_BUILD_DIR/ci/travis_install_conda.sh # Build libarrow diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh new file mode 100644 index 00000000000..bef667dff7c --- /dev/null +++ b/ci/travis_install_conda.sh @@ -0,0 +1,26 @@ +#!/usr/bin/env bash + +set -e + +if [ $TRAVIS_OS_NAME == "linux" ]; then + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" +else + MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" +fi + +wget -O miniconda.sh $MINICONDA_URL +export MINICONDA=$TRAVIS_BUILD_DIR/miniconda +bash miniconda.sh -b -p $MINICONDA +export PATH="$MINICONDA/bin:$PATH" +conda update -y -q conda +conda info -a + +conda config --set show_channel_urls yes +conda config --add channels conda-forge +conda config --add channels apache + +conda install --yes conda-build jinja2 anaconda-client + +# faster builds, please +conda install -y nomkl + From 81f501eefcc77e3cd791a1347eaa66e398a6c213 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Fri, 3 Jun 2016 23:52:50 +0200 Subject: [PATCH 09/28] No need to install conda in travis_script_python anymore --- ci/travis_script_python.sh | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index d45b895d8cf..8658456dd22 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -11,21 +11,6 @@ popd pushd $PYTHON_DIR -# Bootstrap a Conda Python environment - -if [ $TRAVIS_OS_NAME == "linux" ]; then - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh" -else - MINICONDA_URL="https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh" -fi - -curl $MINICONDA_URL > miniconda.sh -MINICONDA=$TRAVIS_BUILD_DIR/miniconda -bash miniconda.sh -b -p $MINICONDA -export PATH="$MINICONDA/bin:$PATH" -conda update -y -q conda -conda info -a - python_version_tests() { PYTHON_VERSION=$1 CONDA_ENV_NAME="pyarrow-test-${PYTHON_VERSION}" From 6a41d23fe3c2ceb7f32e4bacbe3ee7e6b43b9432 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 08:39:10 +0200 Subject: [PATCH 10/28] Re-use conda installation from C++ --- ci/travis_script_python.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 8658456dd22..6ac92ec1295 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -4,6 +4,10 @@ set -e PYTHON_DIR=$TRAVIS_BUILD_DIR/python +# Re-use conda installation from C++ +export MINICONDA=$TRAVIS_BUILD_DIR/miniconda +export PATH="$MINICONDA/bin:$PATH" + # Share environment with C++ pushd $CPP_BUILD_DIR source setup_build_env.sh From cd3b9a9d4244515d824d6dfb3aeb2d43671943df Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 08:54:50 +0200 Subject: [PATCH 11/28] Also search for Parquet in PyArrow --- python/CMakeLists.txt | 4 ++ python/cmake_modules/FindParquet.cmake | 80 ++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) create mode 100644 python/cmake_modules/FindParquet.cmake diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 7e013c07b31..f1becfcf449 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -339,6 +339,10 @@ if (PYARROW_BUILD_TESTS) STATIC_LIB ${GTEST_STATIC_LIB}) endif() +## Parquet +find_package(Parquet REQUIRED) +include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) + ## Arrow find_package(Arrow REQUIRED) include_directories(SYSTEM ${ARROW_INCLUDE_DIR}) diff --git a/python/cmake_modules/FindParquet.cmake b/python/cmake_modules/FindParquet.cmake new file mode 100644 index 00000000000..e3350d6e13d --- /dev/null +++ b/python/cmake_modules/FindParquet.cmake @@ -0,0 +1,80 @@ +# Copyright 2012 Cloudera Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so) +# This module defines +# PARQUET_INCLUDE_DIR, directory containing headers +# PARQUET_LIBS, directory containing parquet libraries +# PARQUET_STATIC_LIB, path to libparquet.a +# PARQUET_SHARED_LIB, path to libparquet's shared library +# PARQUET_FOUND, whether parquet has been found + +if( NOT "$ENV{PARQUET_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "$ENV{PARQUET_HOME}" _native_path ) + list( APPEND _parquet_roots ${_native_path} ) +elseif ( Parquet_HOME ) + list( APPEND _parquet_roots ${Parquet_HOME} ) +endif() + +# Try the parameterized roots, if they exist +if ( _parquet_roots ) + find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( PARQUET_LIBRARIES NAMES parquet + PATHS ${_parquet_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else () + find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h ) + find_library( PARQUET_LIBRARIES NAMES parquet ) +endif () + + +if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES) + set(PARQUET_FOUND TRUE) + get_filename_component( PARQUET_LIBS ${PARQUET_LIBRARIES} PATH ) + set(PARQUET_LIB_NAME libparquet) + set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a) + set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) +else () + set(PARQUET_FOUND FALSE) +endif () + +if (PARQUET_FOUND) + if (NOT Parquet_FIND_QUIETLY) + message(STATUS "Found the Parquet library: ${PARQUET_LIBRARIES}") + endif () +else () + if (NOT Parquet_FIND_QUIETLY) + set(PARQUET_ERR_MSG "Could not find the Parquet library. Looked in ") + if ( _parquet_roots ) + set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} in ${_parquet_roots}.") + else () + set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} system search paths.") + endif () + if (Parquet_FIND_REQUIRED) + message(FATAL_ERROR "${PARQUET_ERR_MSG}") + else (Parquet_FIND_REQUIRED) + message(STATUS "${PARQUET_ERR_MSG}") + endif (Parquet_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + PARQUET_INCLUDE_DIR + PARQUET_LIBS + PARQUET_LIBRARIES + PARQUET_STATIC_LIB + PARQUET_SHARED_LIB +) From 9520c39f48e54a1e31f91a0ddf32debb3ea4aa12 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sat, 4 Jun 2016 09:03:09 +0200 Subject: [PATCH 12/28] Use PARQUET from miniconda path --- ci/travis_script_python.sh | 1 + python/pyarrow/tests/test_parquet.py | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6ac92ec1295..ea4f07ca938 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,6 +7,7 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" +export PARQUET_HOME=$MINICONDA # Share environment with C++ pushd $CPP_BUILD_DIR diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 99b2c7e09ec..68fdf194bf3 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -27,6 +27,19 @@ import os.path +def test_single_int64_column(tmpdir): + filename = tmpdir.join('single_int64_column.parquet') + data = [A.from_pylist(range(5))] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + A.parquet.write_table(table, filename.strpath) + table_read = pyarrow.parquet.read_table(filename.strpath) + for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + assert col_written.name == col_read.name + assert col_read.data.num_chunks == 1 + data_written = col_written.data.chunk(0) + data_read = col_read.data.chunk(0) + assert data_written == data_read + class TestParquetIO(unittest.TestCase): def setUp(self): From 2006e7021be022dc6ec149114810fc05cb55da19 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 09:03:24 +0200 Subject: [PATCH 13/28] Rewrite test py.test style --- python/pyarrow/tests/test_parquet.py | 45 ++++++++-------------------- 1 file changed, 13 insertions(+), 32 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 68fdf194bf3..ae9a75c0262 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -27,36 +27,17 @@ import os.path -def test_single_int64_column(tmpdir): - filename = tmpdir.join('single_int64_column.parquet') - data = [A.from_pylist(range(5))] - table = A.Table.from_arrays(('a', 'b'), data, 'table_name') - A.parquet.write_table(table, filename.strpath) - table_read = pyarrow.parquet.read_table(filename.strpath) - for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): - assert col_written.name == col_read.name - assert col_read.data.num_chunks == 1 - data_written = col_written.data.chunk(0) - data_read = col_read.data.chunk(0) - assert data_written == data_read +def test_single_pylist_column(tmpdir): + for dtype in [int, float]: + filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) + data = [A.from_pylist(map(dtype, range(5)))] + table = A.Table.from_arrays(('a', 'b'), data, 'table_name') + A.parquet.write_table(table, filename.strpath) + table_read = pyarrow.parquet.read_table(filename.strpath) + for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + assert col_written.name == col_read.name + assert col_read.data.num_chunks == 1 + data_written = col_written.data.chunk(0) + data_read = col_read.data.chunk(0) + assert data_written == data_read -class TestParquetIO(unittest.TestCase): - - def setUp(self): - self.temp_directory = mkdtemp() - - def tearDown(self): - rmtree(self.temp_directory) - - def test_single_int64_column(self): - filename = os.path.join(self.temp_directory, 
'single_int64_column.parquet') - data = [A.from_pylist(range(5))] - table = A.Table.from_arrays(('a', 'b'), data, 'table_name') - A.parquet.write_table(table, filename) - table_read = pyarrow.parquet.read_table(filename) - for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): - assert col_written.name == col_read.name - assert col_read.data.num_chunks == 1 - data_written = col_written.data.chunk(0) - data_read = col_read.data.chunk(0) - assert data_written == data_read From 2dffc1412b86bd473502628d9f6f98964efca72e Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 09:06:45 +0200 Subject: [PATCH 14/28] Fix min mistake, use equals instead of == --- python/pyarrow/array.pyx | 9 ++------- python/pyarrow/parquet.pyx | 2 +- python/pyarrow/tests/test_parquet.py | 2 +- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 6d63c321931..619e5ef7e39 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -68,13 +68,8 @@ cdef class Array: values = array_format(self, window=10) return '{0}\n{1}'.format(type_format, values) - def __richcmp__(Array self, Array other, int op): - if op == cpython.Py_EQ: - return self.ap.Equals(other.sp_array) - elif op == cpython.Py_NE: - return not self.ap.Equals(other.sp_array) - else: - raise TypeError('Invalid comparison') + def equals(Array self, Array other): + return self.ap.Equals(other.sp_array) def __len__(self): if self.sp_array.get(): diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 78837a45988..1dcfd60bee6 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -61,7 +61,7 @@ def write_table(table, filename, chunk_size=None): cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream](new LocalFileOutputStream(filename)) cdef int64_t chunk_size_ = 0 if chunk_size is None: - chunk_size_ = max(ctable_.num_rows(), int(2**16)) + chunk_size_ = min(ctable_.num_rows(), int(2**16)) else: chunk_size_ = chunk_size diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index ae9a75c0262..236f06ca69b 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -39,5 +39,5 @@ def test_single_pylist_column(tmpdir): assert col_read.data.num_chunks == 1 data_written = col_written.data.chunk(0) data_read = col_read.data.chunk(0) - assert data_written == data_read + assert data_written.equals(data_read) From 443de8ba42b1097f217175dd168986c021b18414 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 09:14:53 +0200 Subject: [PATCH 15/28] Add miniconda to the LD_LIBRARY_PATH --- ci/travis_script_python.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index ea4f07ca938..6d35785356a 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,6 +7,7 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" +export LD_LIBRARY_PATH="$MINICONDA/lib:$LD_LIBRARY_PATH" export PARQUET_HOME=$MINICONDA # Share environment with C++ From 5706db2f6386f125ce90ecc5db9bd1b0680f940f Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sat, 4 Jun 2016 09:26:25 +0200 Subject: [PATCH 16/28] Use length and offset instead of slicing --- cpp/src/arrow/parquet/writer.cc | 89 +++++++++------------------- cpp/src/arrow/parquet/writer.h | 3 +- python/pyarrow/tests/test_parquet.py | 2 +- 3 files changed, 32 insertions(+), 62 deletions(-) diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 742b8ba6852..0114f4c321e 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -42,8 +42,9 @@ class FileWriter::Impl { Status NewRowGroup(int64_t chunk_size); template - Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data); - Status WriteFlatColumnChunk(const PrimitiveArray* data); + Status TypedWriteBatch(::parquet::ColumnWriter* writer, const PrimitiveArray* data, + int64_t offset, int64_t length); + Status WriteFlatColumnChunk(const PrimitiveArray* data, int64_t offset, int64_t length); Status Close(); virtual ~Impl() {} @@ -70,31 +71,31 @@ Status FileWriter::Impl::NewRowGroup(int64_t chunk_size) { } template -Status FileWriter::Impl::TypedWriteBatch( - ::parquet::ColumnWriter* column_writer, const PrimitiveArray* data) { +Status FileWriter::Impl::TypedWriteBatch(::parquet::ColumnWriter* column_writer, + const PrimitiveArray* data, int64_t offset, int64_t length) { + // TODO: DCHECK((offset + length) <= data->length()); auto data_ptr = - reinterpret_cast(data->data()->data()); + reinterpret_cast(data->data()->data()) + + offset; auto writer = reinterpret_cast<::parquet::TypedColumnWriter*>(column_writer); if (writer->descr()->max_definition_level() == 0) { // no nulls, just dump the data - PARQUET_CATCH_NOT_OK(writer->WriteBatch(data->length(), nullptr, nullptr, data_ptr)); + PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, nullptr, nullptr, data_ptr)); } else if (writer->descr()->max_definition_level() == 1) { - RETURN_NOT_OK(def_levels_buffer_.Resize(data->length() * sizeof(int16_t))); + RETURN_NOT_OK(def_levels_buffer_.Resize(length * sizeof(int16_t))); int16_t* def_levels_ptr = reinterpret_cast(def_levels_buffer_.mutable_data()); if (data->null_count() == 0) { - std::fill(def_levels_ptr, def_levels_ptr + data->length(), 1); - PARQUET_CATCH_NOT_OK( - writer->WriteBatch(data->length(), def_levels_ptr, nullptr, data_ptr)); + std::fill(def_levels_ptr, def_levels_ptr + length, 1); + PARQUET_CATCH_NOT_OK(writer->WriteBatch(length, def_levels_ptr, nullptr, data_ptr)); } else { - RETURN_NOT_OK(data_buffer_.Resize( - (data->length() - data->null_count()) * sizeof(typename ParquetType::c_type))); + RETURN_NOT_OK(data_buffer_.Resize(length * sizeof(typename ParquetType::c_type))); auto buffer_ptr = reinterpret_cast(data_buffer_.mutable_data()); int buffer_idx = 0; - for (size_t i = 0; i < data->length(); i++) { - if (data->IsNull(i)) { + for (size_t i = 0; i < length; i++) { + if (data->IsNull(offset + i)) { def_levels_ptr[i] = 0; } else { def_levels_ptr[i] = 1; @@ -102,7 +103,7 @@ Status FileWriter::Impl::TypedWriteBatch( } } PARQUET_CATCH_NOT_OK( - writer->WriteBatch(data->length(), def_levels_ptr, nullptr, buffer_ptr)); + writer->WriteBatch(length, def_levels_ptr, nullptr, buffer_ptr)); } } else { return Status::NotImplemented("no support for max definition level > 1 yet"); @@ -117,12 +118,13 @@ Status FileWriter::Impl::Close() { return Status::OK(); } -#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \ - case Type::ENUM: \ - return TypedWriteBatch(writer, data); \ +#define TYPED_BATCH_CASE(ENUM, ArrowType, ParquetType) \ + case 
Type::ENUM: \ + return TypedWriteBatch(writer, data, offset, length); \ break; -Status FileWriter::Impl::WriteFlatColumnChunk(const PrimitiveArray* data) { +Status FileWriter::Impl::WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset, int64_t length) { ::parquet::ColumnWriter* writer; PARQUET_CATCH_NOT_OK(writer = row_group_writer_->NextColumn()); switch (data->type_enum()) { @@ -143,8 +145,11 @@ Status FileWriter::NewRowGroup(int64_t chunk_size) { return impl_->NewRowGroup(chunk_size); } -Status FileWriter::WriteFlatColumnChunk(const PrimitiveArray* data) { - return impl_->WriteFlatColumnChunk(data); +Status FileWriter::WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset, int64_t length) { + int64_t real_length = length; + if (length == -1) { real_length = data->length(); } + return impl_->WriteFlatColumnChunk(data, offset, real_length); } Status FileWriter::Close() { @@ -153,43 +158,8 @@ Status FileWriter::Close() { FileWriter::~FileWriter() {} -// Create a slice of a PrimitiveArray. -// -// This method is specially crafted for WriteFlatTable and assumes the following: -// * chunk_size is a multiple of 512 -Status TemporaryArraySlice(int64_t chunk, int64_t chunk_size, const PrimitiveArray* array, - std::shared_ptr* out) { - // The last chunk may be smaller than the chunk_size - const int64_t size = std::min(chunk_size, array->length() - chunk * chunk_size); - const int64_t buffer_offset = chunk * chunk_size * array->type()->value_size(); - const int64_t value_size = size * array->type()->value_size(); - auto chunk_buffer = std::make_shared(array->data(), buffer_offset, value_size); - std::shared_ptr null_bitmap; - int32_t null_count = 0; - if (array->null_count() > 0) { - int64_t null_offset = (chunk * chunk_size) / 8; - int64_t null_size = util::ceil_byte(size) / 8; - null_bitmap = std::make_shared(array->null_bitmap(), null_offset, null_size); - for (int64_t k = 0; k < size; k++) { - if (!util::get_bit(null_bitmap->data(), k)) { null_count++; } - } - } - std::shared_ptr out_array; - RETURN_NOT_OK(MakePrimitiveArray( - array->type(), size, chunk_buffer, null_count, null_bitmap, &out_array)); - *out = std::static_pointer_cast(out_array); - return Status::OK(); -} - Status WriteFlatTable(const Table* table, MemoryPool* pool, std::shared_ptr<::parquet::OutputStream> sink, int64_t chunk_size) { - // Ensure alignment of sliced PrimitiveArray, esp. 
the null bitmap - // TODO: Support other chunksizes than multiples of 512 - if (((chunk_size & 511) != 0) && (chunk_size != table->num_rows())) { - return Status::NotImplemented( - "Only chunk sizes that are a multiple of 512 are supported"); - } - std::shared_ptr<::parquet::SchemaDescriptor> parquet_schema; RETURN_NOT_OK(ToParquetSchema(table->schema().get(), &parquet_schema)); auto schema_node = std::static_pointer_cast(parquet_schema->schema()); @@ -217,12 +187,11 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, } for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { - int64_t size = std::min(chunk_size, table->num_rows() - chunk * chunk_size); + int64_t offset = chunk * chunk_size; + int64_t size = std::min(chunk_size, table->num_rows() - offset); RETURN_NOT_OK(writer.NewRowGroup(size)); for (int i = 0; i < table->num_columns(); i++) { - std::shared_ptr array; - RETURN_NOT_OK(TemporaryArraySlice(chunk, chunk_size, arrays[i].get(), &array)); - RETURN_NOT_OK(writer.WriteFlatColumnChunk(array.get())); + RETURN_NOT_OK(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size)); } } diff --git a/cpp/src/arrow/parquet/writer.h b/cpp/src/arrow/parquet/writer.h index 64625021c11..83e799f7ed1 100644 --- a/cpp/src/arrow/parquet/writer.h +++ b/cpp/src/arrow/parquet/writer.h @@ -43,7 +43,8 @@ class FileWriter { FileWriter(MemoryPool* pool, std::unique_ptr<::parquet::ParquetFileWriter> writer); Status NewRowGroup(int64_t chunk_size); - Status WriteFlatColumnChunk(const PrimitiveArray* data); + Status WriteFlatColumnChunk( + const PrimitiveArray* data, int64_t offset = 0, int64_t length = -1); Status Close(); virtual ~FileWriter(); diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 236f06ca69b..12cd248314e 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -30,7 +30,7 @@ def test_single_pylist_column(tmpdir): for dtype in [int, float]: filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) - data = [A.from_pylist(map(dtype, range(5)))] + data = [A.from_pylist(list(map(dtype, range(5))))] table = A.Table.from_arrays(('a', 'b'), data, 'table_name') A.parquet.write_table(table, filename.strpath) table_read = pyarrow.parquet.read_table(filename.strpath) From 066c08afc183838866299bb1040c7c738fe6530f Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 09:35:11 +0200 Subject: [PATCH 17/28] Add missing functions to smart pointers --- python/pyarrow/includes/common.pxd | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 5c3df6401bc..421c8920234 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -38,6 +38,11 @@ cdef extern from "" namespace "std" nogil: unique_ptr() unique_ptr(T*) T* get() + T* release() + void reset() + void reset(nullptr_t) + void reset(T*) + void swap(unique_ptr&) cdef cppclass shared_ptr[T]: shared_ptr() @@ -45,3 +50,4 @@ cdef extern from "" namespace "std" nogil: T* get() void reset() void reset(T* p) + void swap(shared_ptr&) From 4a80116ef2fd263471cd459da7bd7217679f7eae Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sat, 4 Jun 2016 09:40:59 +0200 Subject: [PATCH 18/28] Handle Python3 strings correctly --- python/pyarrow/parquet.pyx | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 1dcfd60bee6..39f5b229884 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -23,6 +23,7 @@ from pyarrow.includes.libarrow cimport * cimport pyarrow.includes.pyarrow as pyarrow from pyarrow.includes.parquet cimport * +from pyarrow.compat import tobytes from pyarrow.error cimport check_cstatus from pyarrow.table cimport Table @@ -40,7 +41,7 @@ def read_table(filename, columns=None): # Must be in one expression to avoid calling std::move which is not possible # in Cython (due to missing rvalue support) reader = unique_ptr[FileReader](new FileReader(default_memory_pool(), - ParquetFileReader.OpenFile(filename))) + ParquetFileReader.OpenFile(tobytes(filename)))) check_cstatus(reader.get().ReadFlatTable(&ctable)) table.init(ctable) return table @@ -58,7 +59,8 @@ def write_table(table, filename, chunk_size=None): """ cdef Table table_ = table cdef CTable* ctable_ = table_.table - cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream](new LocalFileOutputStream(filename)) + cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream]( + new LocalFileOutputStream(tobytes(filename))) cdef int64_t chunk_size_ = 0 if chunk_size is None: chunk_size_ = min(ctable_.num_rows(), int(2**16)) From 00c14611041d112da6680b74297a6e8f437506c0 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sat, 4 Jun 2016 12:12:10 +0200 Subject: [PATCH 19/28] Also ensure correct OSX compiler flags in PyArrow --- python/conda.recipe/build.sh | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/python/conda.recipe/build.sh b/python/conda.recipe/build.sh index a9d9aedead3..a164c1af518 100644 --- a/python/conda.recipe/build.sh +++ b/python/conda.recipe/build.sh @@ -6,6 +6,19 @@ export ARROW_HOME=$PREFIX cd $RECIPE_DIR +if [ "$(uname)" == "Darwin" ]; then + # C++11 finagling for Mac OSX + export CC=clang + export CXX=clang++ + export MACOSX_VERSION_MIN="10.7" + CXXFLAGS="${CXXFLAGS} -mmacosx-version-min=${MACOSX_VERSION_MIN}" + CXXFLAGS="${CXXFLAGS} -stdlib=libc++ -std=c++11" + export LDFLAGS="${LDFLAGS} -mmacosx-version-min=${MACOSX_VERSION_MIN}" + export LDFLAGS="${LDFLAGS} -stdlib=libc++ -std=c++11" + export LINKFLAGS="${LDFLAGS}" + export MACOSX_DEPLOYMENT_TARGET=10.7 +fi + echo Setting the compiler... if [ `uname` == Linux ]; then EXTRA_CMAKE_ARGS=-DCMAKE_SHARED_LINKER_FLAGS=-static-libstdc++ From f583b614322374324a58654931d455f60fe8719a Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Sun, 5 Jun 2016 13:30:45 +0200 Subject: [PATCH 20/28] Fix rpath for libarrow_parquet --- cpp/src/arrow/parquet/CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/src/arrow/parquet/CMakeLists.txt b/cpp/src/arrow/parquet/CMakeLists.txt index c00cc9f0f25..f00bb53c084 100644 --- a/cpp/src/arrow/parquet/CMakeLists.txt +++ b/cpp/src/arrow/parquet/CMakeLists.txt @@ -35,6 +35,13 @@ add_library(arrow_parquet SHARED target_link_libraries(arrow_parquet ${PARQUET_LIBS}) SET_TARGET_PROPERTIES(arrow_parquet PROPERTIES LINKER_LANGUAGE CXX) +if (APPLE) + set_target_properties(arrow_parquet + PROPERTIES + BUILD_WITH_INSTALL_RPATH ON + INSTALL_NAME_DIR "@rpath") +endif() + ADD_ARROW_TEST(parquet-schema-test) ARROW_TEST_LINK_LIBRARIES(parquet-schema-test arrow_parquet) From 77bd21ab5a78f95c32a10c9ee54caeec8ca7a73a Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Sun, 5 Jun 2016 20:14:56 +0200 Subject: [PATCH 21/28] Add pandas roundtrip to tests --- python/pyarrow/error.pyx | 1 + python/pyarrow/tests/test_parquet.py | 20 +++++++++++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/python/pyarrow/error.pyx b/python/pyarrow/error.pyx index 51e06101bf6..5a6a038a92e 100644 --- a/python/pyarrow/error.pyx +++ b/python/pyarrow/error.pyx @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. +from pyarrow.includes.libarrow cimport CStatus from pyarrow.includes.common cimport c_string from pyarrow.compat import frombytes diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 12cd248314e..3469f5291b2 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -24,10 +24,13 @@ from shutil import rmtree from tempfile import mkdtemp +import numpy as np import os.path +import pandas as pd +import pandas.util.testing as pdt -def test_single_pylist_column(tmpdir): +def test_single_pylist_column_roundtrip(tmpdir): for dtype in [int, float]: filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) data = [A.from_pylist(list(map(dtype, range(5))))] @@ -41,3 +44,18 @@ def test_single_pylist_column(tmpdir): data_read = col_read.data.chunk(0) assert data_written.equals(data_read) +def test_pandas_rountrip(tmpdir): + size = 10000 + df = pd.DataFrame({ + 'int32': np.arange(size, dtype=np.int32), + 'int64': np.arange(size, dtype=np.int64), + 'float32': np.arange(size, dtype=np.float32), + 'float64': np.arange(size, dtype=np.float64) + }) + filename = tmpdir.join('pandas_rountrip.parquet') + arrow_table = A.from_pandas_dataframe(df) + A.parquet.write_table(arrow_table, filename.strpath) + table_read = pyarrow.parquet.read_table(filename.strpath) + df_read = table_read.to_pandas() + pdt.assert_frame_equal(df, df_read) + From 0514d01a39de0e0917fd80d46dd64092c8740169 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Tue, 7 Jun 2016 08:19:14 +0200 Subject: [PATCH 22/28] Handle exceptions on RowGroupWriter::Close better --- cpp/src/arrow/parquet/utils.h | 5 +++++ cpp/src/arrow/parquet/writer.cc | 5 +++-- cpp/src/arrow/util/status.h | 9 +++++++++ python/pyarrow/tests/test_parquet.py | 4 +--- 4 files changed, 18 insertions(+), 5 deletions(-) diff --git a/cpp/src/arrow/parquet/utils.h b/cpp/src/arrow/parquet/utils.h index b32792fdf70..9f83fd0d0fc 100644 --- a/cpp/src/arrow/parquet/utils.h +++ b/cpp/src/arrow/parquet/utils.h @@ -31,6 +31,11 @@ namespace parquet { (s); \ } catch (const ::parquet::ParquetException& e) { return Status::Invalid(e.what()); } +#define PARQUET_IGNORE_NOT_OK(s) \ + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) { } + } // namespace parquet } // namespace arrow diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 0114f4c321e..70de6000d8b 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -181,6 +181,7 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, std::shared_ptr array = table->column(i)->data()->chunk(0); auto primitive_array = std::dynamic_pointer_cast(array); if (!primitive_array) { + PARQUET_IGNORE_NOT_OK(writer.Close()); return Status::NotImplemented("Table must consist of PrimitiveArray instances"); } arrays[i] = primitive_array; @@ -189,9 +190,9 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, for (int chunk = 0; chunk * chunk_size < table->num_rows(); chunk++) { int64_t offset = chunk * chunk_size; int64_t size = std::min(chunk_size, table->num_rows() - offset); - RETURN_NOT_OK(writer.NewRowGroup(size)); + RETURN_NOT_OK_ELSE(writer.NewRowGroup(size), PARQUET_IGNORE_NOT_OK(writer.Close())); for (int i = 0; i < table->num_columns(); i++) { - RETURN_NOT_OK(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size)); + RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size), PARQUET_IGNORE_NOT_OK(writer.Close())); } } diff --git a/cpp/src/arrow/util/status.h b/cpp/src/arrow/util/status.h index 6ddc177a9a5..d1a74250008 100644 --- a/cpp/src/arrow/util/status.h +++ b/cpp/src/arrow/util/status.h @@ -63,6 +63,15 @@ namespace arrow { if (!_s.ok()) { return _s; } \ } while (0); +#define RETURN_NOT_OK_ELSE(s, else_) \ + do { \ + Status _s = (s); \ + if (!_s.ok()) { \ + else_; \ + return _s; \ + } \ + } while (0); + enum class StatusCode : char { OK = 0, OutOfMemory = 1, diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index 3469f5291b2..d92cf4ca656 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -21,12 +21,10 @@ A = arrow -from shutil import rmtree -from tempfile import mkdtemp - import numpy as np import os.path import pandas as pd + import pandas.util.testing as pdt From 8f6010aa584b20d28514289312d2def97994099e Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Tue, 7 Jun 2016 09:00:37 +0200 Subject: [PATCH 23/28] Linter fixes --- cpp/src/arrow/parquet/utils.h | 6 +++--- cpp/src/arrow/parquet/writer.cc | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/arrow/parquet/utils.h b/cpp/src/arrow/parquet/utils.h index 9f83fd0d0fc..409bcd9065c 100644 --- a/cpp/src/arrow/parquet/utils.h +++ b/cpp/src/arrow/parquet/utils.h @@ -32,9 +32,9 @@ namespace parquet { } catch (const ::parquet::ParquetException& e) { return Status::Invalid(e.what()); } #define PARQUET_IGNORE_NOT_OK(s) \ - try { \ - (s); \ - } catch (const ::parquet::ParquetException& e) { } + try { \ + (s); \ + } catch (const ::parquet::ParquetException& e) {} } // namespace parquet diff --git a/cpp/src/arrow/parquet/writer.cc b/cpp/src/arrow/parquet/writer.cc index 70de6000d8b..1223901d550 100644 --- a/cpp/src/arrow/parquet/writer.cc +++ b/cpp/src/arrow/parquet/writer.cc @@ -192,7 +192,8 @@ Status WriteFlatTable(const Table* table, MemoryPool* pool, int64_t size = std::min(chunk_size, table->num_rows() - offset); RETURN_NOT_OK_ELSE(writer.NewRowGroup(size), PARQUET_IGNORE_NOT_OK(writer.Close())); for (int i = 0; i < table->num_columns(); i++) { - RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size), PARQUET_IGNORE_NOT_OK(writer.Close())); + RETURN_NOT_OK_ELSE(writer.WriteFlatColumnChunk(arrays[i].get(), offset, size), + PARQUET_IGNORE_NOT_OK(writer.Close())); } } From 000e1e34d6ad3b6c1a1bc430974f2eac05f96173 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 11:56:53 +0200 Subject: [PATCH 24/28] Use unique_ptr and shared_ptr from Cython --- python/pyarrow/includes/common.pxd | 20 +------------------- python/pyarrow/parquet.pyx | 4 ++-- python/pyarrow/schema.pyx | 9 ++++++--- python/setup.py | 2 +- 4 files changed, 10 insertions(+), 25 deletions(-) diff --git a/python/pyarrow/includes/common.pxd b/python/pyarrow/includes/common.pxd index 421c8920234..1f6ecee5105 100644 --- a/python/pyarrow/includes/common.pxd +++ b/python/pyarrow/includes/common.pxd @@ -19,6 +19,7 @@ from libc.stdint cimport * from libcpp cimport bool as c_bool +from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string as c_string from libcpp.vector cimport vector @@ -32,22 +33,3 @@ cdef extern from "": cdef extern from "": void Py_XDECREF(PyObject* o) -cdef extern from "" namespace "std" nogil: - - cdef cppclass unique_ptr[T]: - unique_ptr() - unique_ptr(T*) - T* get() - T* release() - void reset() - void reset(nullptr_t) - void reset(T*) - void swap(unique_ptr&) - - cdef cppclass shared_ptr[T]: - shared_ptr() - shared_ptr(T*) - T* get() - void reset() - void reset(T* p) - void swap(shared_ptr&) diff --git a/python/pyarrow/parquet.pyx b/python/pyarrow/parquet.pyx index 39f5b229884..3d5355ebe43 100644 --- a/python/pyarrow/parquet.pyx +++ b/python/pyarrow/parquet.pyx @@ -59,13 +59,13 @@ def write_table(table, filename, chunk_size=None): """ cdef Table table_ = table cdef CTable* ctable_ = table_.table - cdef shared_ptr[OutputStream] sink = shared_ptr[OutputStream]( - new LocalFileOutputStream(tobytes(filename))) + cdef shared_ptr[OutputStream] sink cdef int64_t chunk_size_ = 0 if chunk_size is None: chunk_size_ = min(ctable_.num_rows(), int(2**16)) else: chunk_size_ = chunk_size + sink.reset(new LocalFileOutputStream(tobytes(filename))) check_cstatus(WriteFlatTable(ctable_, default_memory_pool(), sink, chunk_size_)) diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index 22ddf0cf17e..084c304aed2 100644 --- 
a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -201,7 +201,9 @@ def string(): def list_(DataType value_type): cdef DataType out = DataType() - out.init(shared_ptr[CDataType](new CListType(value_type.sp_type))) + cdef shared_ptr[CDataType] list_type + list_type.reset(new CListType(value_type.sp_type)) + out.init(list_type) return out def struct(fields): @@ -212,12 +214,13 @@ def struct(fields): DataType out = DataType() Field field vector[shared_ptr[CField]] c_fields + cdef shared_ptr[CDataType] struct_type for field in fields: c_fields.push_back(field.sp_field) - out.init(shared_ptr[CDataType]( - new CStructType(c_fields))) + struct_type.reset(new CStructType(c_fields)) + out.init(struct_type) return out def schema(fields): diff --git a/python/setup.py b/python/setup.py index 1c55198ec8a..7edeb914331 100644 --- a/python/setup.py +++ b/python/setup.py @@ -242,7 +242,7 @@ def get_outputs(self): 'clean': clean, 'build_ext': build_ext }, - install_requires=['cython >= 0.21', 'numpy >= 1.9'], + install_requires=['cython >= 0.23', 'numpy >= 1.9'], description=DESC, license='Apache License, Version 2.0', maintainer="Apache Arrow Developers", From 8d90d3f0d57b82b26c9af5ba6a806e5b4ca52a3d Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 12:02:15 +0200 Subject: [PATCH 25/28] Do not set LD_LIBRARY_PATH in python build --- ci/travis_script_python.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6d35785356a..ea4f07ca938 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,7 +7,6 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" -export LD_LIBRARY_PATH="$MINICONDA/lib:$LD_LIBRARY_PATH" export PARQUET_HOME=$MINICONDA # Share environment with C++ From ec077689058f04343fdab148f6e30496e8fcc5f5 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 12:36:00 +0200 Subject: [PATCH 26/28] Set LD_LIBRARY_PATH in python build --- ci/travis_script_python.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index ea4f07ca938..6d35785356a 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -7,6 +7,7 @@ PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ export MINICONDA=$TRAVIS_BUILD_DIR/miniconda export PATH="$MINICONDA/bin:$PATH" +export LD_LIBRARY_PATH="$MINICONDA/lib:$LD_LIBRARY_PATH" export PARQUET_HOME=$MINICONDA # Share environment with C++ From 38d786cbddaadb5c3238a594a75cea214b0fb108 Mon Sep 17 00:00:00 2001 From: "Uwe L. 
Korn" Date: Fri, 10 Jun 2016 12:48:44 +0200 Subject: [PATCH 27/28] Make code more readable by using using --- cpp/src/arrow/parquet/parquet-io-test.cc | 28 ++++++------------------ 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/parquet/parquet-io-test.cc b/cpp/src/arrow/parquet/parquet-io-test.cc index 474d167628b..db779d8309c 100644 --- a/cpp/src/arrow/parquet/parquet-io-test.cc +++ b/cpp/src/arrow/parquet/parquet-io-test.cc @@ -71,6 +71,12 @@ struct test_traits { static constexpr ParquetType::type parquet_enum = ParquetType::DOUBLE; }; +template +using ParquetDataType = ::parquet::DataType::parquet_enum>; + +template +using ParquetWriter = ::parquet::TypedColumnWriter>; + template class TestParquetIO : public ::testing::Test { public: @@ -113,25 +119,6 @@ class TestParquetIO : public ::testing::Test { ASSERT_NE(nullptr, out->get()); } - std::unique_ptr Int64File( - std::vector& values, int num_chunks) { - std::shared_ptr schema = - MakeSchema(ParquetType::INT64, Repetition::REQUIRED); - std::unique_ptr file_writer = MakeWriter(schema); - size_t chunk_size = values.size() / num_chunks; - for (int i = 0; i < num_chunks; i++) { - auto row_group_writer = file_writer->AppendRowGroup(chunk_size); - auto column_writer = - static_cast<::parquet::Int64Writer*>(row_group_writer->NextColumn()); - int64_t* data = values.data() + i * chunk_size; - column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); - column_writer->Close(); - row_group_writer->Close(); - } - file_writer->Close(); - return ReaderFromSink(); - } - std::unique_ptr TestFile(std::vector& values, int num_chunks) { std::shared_ptr schema = MakeSchema(test_traits::parquet_enum, Repetition::REQUIRED); @@ -139,8 +126,7 @@ class TestParquetIO : public ::testing::Test { size_t chunk_size = values.size() / num_chunks; for (int i = 0; i < num_chunks; i++) { auto row_group_writer = file_writer->AppendRowGroup(chunk_size); - auto column_writer = static_cast<::parquet::TypedColumnWriter< - ::parquet::DataType::parquet_enum>>*>( + auto column_writer = static_cast*>( row_group_writer->NextColumn()); T* data = values.data() + i * chunk_size; column_writer->WriteBatch(chunk_size, nullptr, nullptr, data); From 405f85d88fb0f6943df7149d2e76ae95e78a5658 Mon Sep 17 00:00:00 2001 From: "Uwe L. Korn" Date: Fri, 10 Jun 2016 15:56:38 +0200 Subject: [PATCH 28/28] Remove FindParquet duplication --- python/cmake_modules/FindParquet.cmake | 80 -------------------------- 1 file changed, 80 deletions(-) delete mode 100644 python/cmake_modules/FindParquet.cmake diff --git a/python/cmake_modules/FindParquet.cmake b/python/cmake_modules/FindParquet.cmake deleted file mode 100644 index e3350d6e13d..00000000000 --- a/python/cmake_modules/FindParquet.cmake +++ /dev/null @@ -1,80 +0,0 @@ -# Copyright 2012 Cloudera Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -# - Find PARQUET (parquet/parquet.h, libparquet.a, libparquet.so) -# This module defines -# PARQUET_INCLUDE_DIR, directory containing headers -# PARQUET_LIBS, directory containing parquet libraries -# PARQUET_STATIC_LIB, path to libparquet.a -# PARQUET_SHARED_LIB, path to libparquet's shared library -# PARQUET_FOUND, whether parquet has been found - -if( NOT "$ENV{PARQUET_HOME}" STREQUAL "") - file( TO_CMAKE_PATH "$ENV{PARQUET_HOME}" _native_path ) - list( APPEND _parquet_roots ${_native_path} ) -elseif ( Parquet_HOME ) - list( APPEND _parquet_roots ${Parquet_HOME} ) -endif() - -# Try the parameterized roots, if they exist -if ( _parquet_roots ) - find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h - PATHS ${_parquet_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "include" ) - find_library( PARQUET_LIBRARIES NAMES parquet - PATHS ${_parquet_roots} NO_DEFAULT_PATH - PATH_SUFFIXES "lib" ) -else () - find_path( PARQUET_INCLUDE_DIR NAMES parquet/api/reader.h ) - find_library( PARQUET_LIBRARIES NAMES parquet ) -endif () - - -if (PARQUET_INCLUDE_DIR AND PARQUET_LIBRARIES) - set(PARQUET_FOUND TRUE) - get_filename_component( PARQUET_LIBS ${PARQUET_LIBRARIES} PATH ) - set(PARQUET_LIB_NAME libparquet) - set(PARQUET_STATIC_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}.a) - set(PARQUET_SHARED_LIB ${PARQUET_LIBS}/${PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) -else () - set(PARQUET_FOUND FALSE) -endif () - -if (PARQUET_FOUND) - if (NOT Parquet_FIND_QUIETLY) - message(STATUS "Found the Parquet library: ${PARQUET_LIBRARIES}") - endif () -else () - if (NOT Parquet_FIND_QUIETLY) - set(PARQUET_ERR_MSG "Could not find the Parquet library. Looked in ") - if ( _parquet_roots ) - set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} in ${_parquet_roots}.") - else () - set(PARQUET_ERR_MSG "${PARQUET_ERR_MSG} system search paths.") - endif () - if (Parquet_FIND_REQUIRED) - message(FATAL_ERROR "${PARQUET_ERR_MSG}") - else (Parquet_FIND_REQUIRED) - message(STATUS "${PARQUET_ERR_MSG}") - endif (Parquet_FIND_REQUIRED) - endif () -endif () - -mark_as_advanced( - PARQUET_INCLUDE_DIR - PARQUET_LIBS - PARQUET_LIBRARIES - PARQUET_STATIC_LIB - PARQUET_SHARED_LIB -)