21 changes: 21 additions & 0 deletions python/pyarrow/array.pyx
@@ -22,6 +22,7 @@
import numpy as np

from pyarrow.includes.libarrow cimport *
from pyarrow.includes.common cimport PyObject_to_object
cimport pyarrow.includes.pyarrow as pyarrow

import pyarrow.config
@@ -35,6 +36,8 @@ from pyarrow.scalar import NA
from pyarrow.schema cimport Schema
import pyarrow.schema as schema

cimport cpython


def total_allocated_bytes():
cdef MemoryPool* pool = pyarrow.get_memory_pool()
@@ -111,6 +114,24 @@ cdef class Array:
def slice(self, start, end):
pass

def to_pandas(self):
"""
Convert to an array object suitable for use in pandas

See also
--------
Column.to_pandas
Table.to_pandas
RecordBatch.to_pandas
"""
cdef:
PyObject* np_arr

check_status(pyarrow.ConvertArrayToPandas(
self.sp_array, <PyObject*> self, &np_arr))

return PyObject_to_object(np_arr)


cdef class NullArray(Array):
pass
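For orientation, a minimal usage sketch of the new Array.to_pandas method. This is not part of the patch; it assumes pyarrow.from_pylist and integer data with no nulls (as in the tests further down), so that the zero-copy branch applies:

    import pyarrow

    # Build an Arrow array and view it as a NumPy array without copying.
    arr = pyarrow.from_pylist([1, 2, 3, 4])
    np_arr = arr.to_pandas()

    # On the zero-copy path the result keeps the Arrow Array alive as its
    # base object, so the underlying buffer stays valid.
    print(np_arr.sum())
    print(np_arr.base is arr)  # expected True when no copy was made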
7 changes: 7 additions & 0 deletions python/pyarrow/includes/common.pxd
@@ -47,3 +47,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:
c_bool IsKeyError()
c_bool IsNotImplemented()
c_bool IsInvalid()


cdef inline object PyObject_to_object(PyObject* o):
# Cast to "object" increments reference count
cdef object result = <object> o
cpython.Py_DECREF(result)
return result
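A note on the helper above (not part of the patch): the C++ conversion routines return a new reference through their PyObject** out parameter, and the <object> cast takes another one, so the Py_DECREF leaves exactly one reference, owned by the returned Python object. The pyarrow.pxd declarations below likewise take a raw PyObject*, which keeps this reference handling explicit on the Cython side rather than letting Cython manage it. The "expected 2" refcounts in the tests further down follow from the same accounting; a small standalone illustration:

    import sys

    x = object()
    # 2 = the local name `x` plus the temporary reference held by the
    # getrefcount call itself; a leaked reference would push this higher.
    print(sys.getrefcount(x))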
4 changes: 2 additions & 2 deletions python/pyarrow/includes/pyarrow.pxd
@@ -34,10 +34,10 @@ cdef extern from "pyarrow/api.h" namespace "pyarrow" nogil:
shared_ptr[CArray]* out)

CStatus ConvertArrayToPandas(const shared_ptr[CArray]& arr,
-object py_ref, PyObject** out)
+PyObject* py_ref, PyObject** out)

CStatus ConvertColumnToPandas(const shared_ptr[CColumn]& arr,
-object py_ref, PyObject** out)
+PyObject* py_ref, PyObject** out)

MemoryPool* get_memory_pool()

18 changes: 12 additions & 6 deletions python/pyarrow/table.pyx
@@ -22,6 +22,7 @@
from cython.operator cimport dereference as deref

from pyarrow.includes.libarrow cimport *
+from pyarrow.includes.common cimport PyObject_to_object
cimport pyarrow.includes.pyarrow as pyarrow

import pyarrow.config
@@ -32,6 +33,7 @@ from pyarrow.schema cimport box_data_type, box_schema

from pyarrow.compat import frombytes, tobytes

+cimport cpython

cdef class ChunkedArray:
'''
@@ -100,8 +102,10 @@ cdef class Column:

import pandas as pd

-check_status(pyarrow.ConvertColumnToPandas(self.sp_column, self, &arr))
-return pd.Series(<object>arr, name=self.name)
+check_status(pyarrow.ConvertColumnToPandas(self.sp_column,
+<PyObject*> self, &arr))
+
+return pd.Series(PyObject_to_object(arr), name=self.name)

cdef _check_nullptr(self):
if self.column == NULL:
@@ -248,9 +252,10 @@ cdef class RecordBatch:
data = []
for i in range(self.batch.num_columns()):
arr = self.batch.column(i)
-check_status(pyarrow.ConvertArrayToPandas(arr, self, &np_arr))
+check_status(pyarrow.ConvertArrayToPandas(arr, <PyObject*> self,
+&np_arr))
names.append(frombytes(self.batch.column_name(i)))
-data.append(<object> np_arr)
+data.append(PyObject_to_object(np_arr))

return pd.DataFrame(dict(zip(names, data)), columns=names)

@@ -375,9 +380,10 @@ cdef class Table:
for i in range(self.table.num_columns()):
col = self.table.column(i)
column = self.column(i)
-check_status(pyarrow.ConvertColumnToPandas(col, column, &arr))
+check_status(pyarrow.ConvertColumnToPandas(
+col, <PyObject*> column, &arr))
names.append(frombytes(col.get().name()))
-data.append(<object> arr)
+data.append(PyObject_to_object(arr))

return pd.DataFrame(dict(zip(names, data)), columns=names)

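The Column, RecordBatch, and Table paths above all follow the same pattern: run the C++ conversion for each column, take ownership of the returned reference with PyObject_to_object, and assemble the pandas object. A hedged usage sketch, assuming `table` is an existing pyarrow.Table (its construction is outside this diff):

    # table: a pyarrow.Table obtained elsewhere
    df = table.to_pandas()

    # Each DataFrame column is backed by the converted array; on the
    # zero-copy path its base keeps the owning Arrow column alive.
    print(df.columns)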
29 changes: 29 additions & 0 deletions python/pyarrow/tests/test_array.py
@@ -15,6 +15,8 @@
# specific language governing permissions and limitations
# under the License.

import sys

import pyarrow
import pyarrow.formatting as fmt

@@ -71,3 +73,30 @@ def test_long_array_format():
99
]"""
assert result == expected


def test_to_pandas_zero_copy():
import gc

arr = pyarrow.from_pylist(range(10))

for i in range(10):
np_arr = arr.to_pandas()
assert sys.getrefcount(np_arr) == 2
np_arr = None # noqa

assert sys.getrefcount(arr) == 2

for i in range(10):
arr = pyarrow.from_pylist(range(10))
np_arr = arr.to_pandas()
arr = None
gc.collect()

# Ensure base is still valid

# Because of py.test's assert inspection magic, if you put getrefcount
# on the line being examined, it will be 1 higher than you expect
base_refcount = sys.getrefcount(np_arr.base)
assert base_refcount == 2
np_arr.sum()
4 changes: 4 additions & 0 deletions python/pyarrow/tests/test_convert_builtin.py
@@ -47,6 +47,10 @@ def test_integer(self):

def test_garbage_collection(self):
import gc

# Force the cyclic garbage collector to run
gc.collect()

bytes_before = pyarrow.total_allocated_bytes()
pyarrow.from_pylist([1, None, 3, None])
gc.collect()
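The extra gc.collect() guards the measurement: objects that are only reachable through reference cycles are not freed until the cyclic collector runs, so collecting first keeps leftover allocations from earlier tests out of bytes_before. A hedged sketch of the pattern being exercised, using only names that appear in this diff:

    import gc
    import pyarrow

    gc.collect()                              # settle any pending frees first
    bytes_before = pyarrow.total_allocated_bytes()
    pyarrow.from_pylist([1, None, 3, None])   # result is dropped immediately
    gc.collect()
    print(pyarrow.total_allocated_bytes() == bytes_before)  # expected True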
4 changes: 1 addition & 3 deletions python/src/pyarrow/adapters/pandas.cc
@@ -628,8 +628,6 @@ class ArrowDeserializer {
PyAcquireGIL lock;

// Zero-Copy. We can pass the data pointer directly to NumPy.
-Py_INCREF(py_ref_);
-OwnedRef py_ref(py_ref_);
npy_intp dims[1] = {col_->length()};
out_ = reinterpret_cast<PyArrayObject*>(PyArray_SimpleNewFromData(1, dims,
type, data));
@@ -646,7 +644,7 @@
return Status::OK();
} else {
// PyArray_SetBaseObject steals our reference to py_ref_
-py_ref.release();
+Py_INCREF(py_ref_);
}

// Arrow data is immutable.
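Finally, the pandas.cc change narrows where the reference for the ndarray base is taken: instead of an unconditional Py_INCREF plus an OwnedRef at the top of the zero-copy path, the reference is now taken only on the branch where, per the existing comment, PyArray_SetBaseObject will steal it. The behaviour this preserves is visible from Python, as the zero-copy test above checks; a hedged recap under the same assumptions as the tests:

    import gc
    import pyarrow

    arr = pyarrow.from_pylist(list(range(10)))
    np_arr = arr.to_pandas()

    arr = None     # drop the only direct reference to the Arrow array
    gc.collect()

    # np_arr.base still references the Arrow object, so the shared buffer
    # remains valid and the data can still be used.
    print(np_arr.sum())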