// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // Functions for pandas conversion via NumPy #include "arrow/python/numpy_to_arrow.h" #include "arrow/python/numpy_interop.h" #include #include #include #include #include #include #include #include #include #include "arrow/array.h" #include "arrow/array/builder_binary.h" #include "arrow/status.h" #include "arrow/table.h" #include "arrow/type_fwd.h" #include "arrow/type_traits.h" #include "arrow/util/bit_util.h" #include "arrow/util/bitmap_generate.h" #include "arrow/util/bitmap_ops.h" #include "arrow/util/checked_cast.h" #include "arrow/util/endian.h" #include "arrow/util/logging.h" #include "arrow/util/macros.h" #include "arrow/util/string.h" #include "arrow/util/utf8.h" #include "arrow/visit_type_inline.h" #include "arrow/compute/api_scalar.h" #include "arrow/python/common.h" #include "arrow/python/datetime.h" #include "arrow/python/helpers.h" #include "arrow/python/iterators.h" #include "arrow/python/numpy_convert.h" #include "arrow/python/numpy_internal.h" #include "arrow/python/python_to_arrow.h" #include "arrow/python/type_traits.h" #include "arrow/python/vendored/pythoncapi_compat.h" namespace arrow { using internal::checked_cast; using internal::CopyBitmap; using internal::GenerateBitsUnrolled; namespace py { using internal::NumPyTypeSize; // ---------------------------------------------------------------------- // Conversion utilities namespace { Status AllocateNullBitmap(MemoryPool* pool, int64_t length, std::shared_ptr* out) { int64_t null_bytes = bit_util::BytesForBits(length); ARROW_ASSIGN_OR_RAISE(auto null_bitmap, AllocateResizableBuffer(null_bytes, pool)); // Padding zeroed by AllocateResizableBuffer memset(null_bitmap->mutable_data(), 0, static_cast(null_bytes)); *out = std::move(null_bitmap); return Status::OK(); } // ---------------------------------------------------------------------- // Conversion from NumPy-in-Pandas to Arrow null bitmap template inline int64_t ValuesToBitmap(PyArrayObject* arr, uint8_t* bitmap) { typedef internal::npy_traits traits; typedef typename traits::value_type T; int64_t null_count = 0; Ndarray1DIndexer values(arr); for (int i = 0; i < values.size(); ++i) { if (traits::isnull(values[i])) { ++null_count; } else { bit_util::SetBit(bitmap, i); } } return null_count; } class NumPyNullsConverter { public: /// Convert the given array's null values to a null bitmap. /// The null bitmap is only allocated if null values are ever possible. static Status Convert(MemoryPool* pool, PyArrayObject* arr, bool from_pandas, std::shared_ptr* out_null_bitmap_, int64_t* out_null_count) { NumPyNullsConverter converter(pool, arr, from_pandas); RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter)); *out_null_bitmap_ = converter.null_bitmap_; *out_null_count = converter.null_count_; return Status::OK(); } template Status Visit(PyArrayObject* arr) { typedef internal::npy_traits traits; const bool null_sentinels_possible = // Always treat Numpy's NaT as null TYPE == NPY_DATETIME || TYPE == NPY_TIMEDELTA || // Observing pandas's null sentinels (from_pandas_ && traits::supports_nulls); if (null_sentinels_possible) { RETURN_NOT_OK(AllocateNullBitmap(pool_, PyArray_SIZE(arr), &null_bitmap_)); null_count_ = ValuesToBitmap(arr, null_bitmap_->mutable_data()); } return Status::OK(); } protected: NumPyNullsConverter(MemoryPool* pool, PyArrayObject* arr, bool from_pandas) : pool_(pool), arr_(arr), from_pandas_(from_pandas), null_bitmap_data_(nullptr), null_count_(0) {} MemoryPool* pool_; PyArrayObject* arr_; bool from_pandas_; std::shared_ptr null_bitmap_; uint8_t* null_bitmap_data_; int64_t null_count_; }; // Returns null count int64_t MaskToBitmap(PyArrayObject* mask, int64_t length, uint8_t* bitmap) { int64_t null_count = 0; if (!PyArray_Check(mask)) return -1; Ndarray1DIndexer mask_values(mask); for (int i = 0; i < length; ++i) { if (mask_values[i]) { ++null_count; bit_util::ClearBit(bitmap, i); } else { bit_util::SetBit(bitmap, i); } } return null_count; } } // namespace // ---------------------------------------------------------------------- // Conversion from NumPy arrays (possibly originating from pandas) to Arrow // format. Does not handle NPY_OBJECT dtype arrays; use ConvertPySequence for // that class NumPyConverter { public: NumPyConverter(MemoryPool* pool, PyObject* arr, PyObject* mo, const std::shared_ptr& type, bool from_pandas, const compute::CastOptions& cast_options = compute::CastOptions()) : pool_(pool), type_(type), arr_(reinterpret_cast(arr)), dtype_(PyArray_DESCR(arr_)), mask_(nullptr), from_pandas_(from_pandas), cast_options_(cast_options), null_bitmap_data_(nullptr), null_count_(0) { if (mo != nullptr && mo != Py_None) { mask_ = reinterpret_cast(mo); } length_ = static_cast(PyArray_SIZE(arr_)); itemsize_ = static_cast(PyArray_ITEMSIZE(arr_)); stride_ = static_cast(PyArray_STRIDES(arr_)[0]); } bool is_strided() const { return itemsize_ != stride_; } Status Convert(); const ArrayVector& result() const { return out_arrays_; } template enable_if_primitive_ctype Visit(const T& type) { return VisitNative(); } Status Visit(const HalfFloatType& type) { return VisitNative(); } Status Visit(const Date32Type& type) { return VisitNative(); } Status Visit(const Date64Type& type) { return VisitNative(); } Status Visit(const TimestampType& type) { return VisitNative(); } Status Visit(const Time32Type& type) { return VisitNative(); } Status Visit(const Time64Type& type) { return VisitNative(); } Status Visit(const DurationType& type) { return VisitNative(); } Status Visit(const NullType& type) { return TypeNotImplemented(type.ToString()); } // NumPy ascii string arrays Status Visit(const BinaryType& type); Status Visit(const LargeBinaryType& type); Status Visit(const BinaryViewType& type); // NumPy unicode arrays Status Visit(const StringType& type); Status Visit(const LargeStringType& type); Status Visit(const StringViewType& type); Status Visit(const StructType& type); Status Visit(const FixedSizeBinaryType& type); // Default case Status Visit(const DataType& type) { return TypeNotImplemented(type.ToString()); } protected: Status InitNullBitmap() { RETURN_NOT_OK(AllocateNullBitmap(pool_, length_, &null_bitmap_)); null_bitmap_data_ = null_bitmap_->mutable_data(); return Status::OK(); } // Called before ConvertData to ensure Numpy input buffer is in expected // Arrow layout template Status PrepareInputData(std::shared_ptr* data); // ---------------------------------------------------------------------- // Traditional visitor conversion for non-object arrays template Status ConvertData(std::shared_ptr* data); template Status PushBuilderResult(T* builder) { std::shared_ptr out; RETURN_NOT_OK(builder->Finish(&out)); out_arrays_.emplace_back(out); return Status::OK(); } Status PushArray(const std::shared_ptr& data) { out_arrays_.emplace_back(MakeArray(data)); return Status::OK(); } template Status VisitNative() { if (mask_ != nullptr) { RETURN_NOT_OK(InitNullBitmap()); null_count_ = MaskToBitmap(mask_, length_, null_bitmap_data_); if (null_count_ == -1) return Status::Invalid("Invalid mask type"); } else { RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_, &null_count_)); } std::shared_ptr data; RETURN_NOT_OK(ConvertData(&data)); auto arr_data = ArrayData::Make(type_, length_, {null_bitmap_, data}, null_count_, 0); return PushArray(arr_data); } template Status VisitBinary(T* builder); template Status VisitString(T* builder); Status TypeNotImplemented(std::string type_name) { return Status::NotImplemented("NumPyConverter doesn't implement <", type_name, "> conversion. "); } MemoryPool* pool_; std::shared_ptr type_; PyArrayObject* arr_; PyArray_Descr* dtype_; PyArrayObject* mask_; int64_t length_; int64_t stride_; int64_t itemsize_; bool from_pandas_; compute::CastOptions cast_options_; // Used in visitor pattern ArrayVector out_arrays_; std::shared_ptr null_bitmap_; uint8_t* null_bitmap_data_; int64_t null_count_; }; Status NumPyConverter::Convert() { if (PyArray_NDIM(arr_) != 1) { return Status::Invalid("only handle 1-dimensional arrays"); } if (dtype_->type_num == NPY_OBJECT) { // If an object array, convert it like a normal Python sequence PyConversionOptions py_options; py_options.type = type_; py_options.from_pandas = from_pandas_; ARROW_ASSIGN_OR_RAISE( auto chunked_array, ConvertPySequence(reinterpret_cast(arr_), reinterpret_cast(mask_), py_options, pool_)); out_arrays_ = chunked_array->chunks(); return Status::OK(); } if (type_ == nullptr) { return Status::Invalid("Must pass data type for non-object arrays"); } // Visit the type to perform conversion return VisitTypeInline(*type_, this); } namespace { Status CastBuffer(const std::shared_ptr& in_type, const std::shared_ptr& input, const int64_t length, const std::shared_ptr& valid_bitmap, const int64_t null_count, const std::shared_ptr& out_type, const compute::CastOptions& cast_options, MemoryPool* pool, std::shared_ptr* out) { // Must cast auto tmp_data = ArrayData::Make(in_type, length, {valid_bitmap, input}, null_count); compute::ExecContext context(pool); ARROW_ASSIGN_OR_RAISE( std::shared_ptr casted_array, compute::Cast(*MakeArray(tmp_data), out_type, cast_options, &context)); *out = casted_array->data()->buffers[1]; return Status::OK(); } template Status StaticCastBuffer(const Buffer& input, const int64_t length, MemoryPool* pool, std::shared_ptr* out) { ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(ToType) * length, pool)); auto in_values = reinterpret_cast(input.data()); auto out_values = reinterpret_cast(result->mutable_data()); for (int64_t i = 0; i < length; ++i) { *out_values++ = static_cast(*in_values++); } *out = std::move(result); return Status::OK(); } template void CopyStridedBytewise(int8_t* input_data, int64_t length, int64_t stride, T* output_data) { // Passing input_data as non-const is a concession to PyObject* for (int64_t i = 0; i < length; ++i) { memcpy(output_data + i, input_data, sizeof(T)); input_data += stride; } } template void CopyStridedNatural(T* input_data, int64_t length, int64_t stride, T* output_data) { // Passing input_data as non-const is a concession to PyObject* int64_t j = 0; for (int64_t i = 0; i < length; ++i) { output_data[i] = input_data[j]; j += stride; } } class NumPyStridedConverter { public: static Status Convert(PyArrayObject* arr, int64_t length, MemoryPool* pool, std::shared_ptr* out) { NumPyStridedConverter converter(arr, length, pool); RETURN_NOT_OK(VisitNumpyArrayInline(arr, &converter)); *out = converter.buffer_; return Status::OK(); } template Status Visit(PyArrayObject* arr) { using traits = internal::npy_traits; using T = typename traits::value_type; ARROW_ASSIGN_OR_RAISE(buffer_, AllocateBuffer(sizeof(T) * length_, pool_)); const int64_t stride = PyArray_STRIDES(arr)[0]; // ARROW-16013: convert sizeof(T) to signed int64 first, otherwise dividing by it // would do an unsigned division. This cannot be caught by tests without ubsan, since // common signed overflow behavior and the fact that the sizeof(T) is currently always // a power of two here cause CopyStridedNatural to still produce correct results const int64_t element_size = sizeof(T); if (stride % element_size == 0) { const int64_t stride_elements = stride / element_size; CopyStridedNatural(reinterpret_cast(PyArray_DATA(arr)), length_, stride_elements, reinterpret_cast(buffer_->mutable_data())); } else { CopyStridedBytewise(reinterpret_cast(PyArray_DATA(arr)), length_, stride, reinterpret_cast(buffer_->mutable_data())); } return Status::OK(); } protected: NumPyStridedConverter(PyArrayObject* arr, int64_t length, MemoryPool* pool) : arr_(arr), length_(length), pool_(pool), buffer_(nullptr) {} PyArrayObject* arr_; int64_t length_; MemoryPool* pool_; std::shared_ptr buffer_; }; } // namespace template inline Status NumPyConverter::PrepareInputData(std::shared_ptr* data) { if (PyArray_ISBYTESWAPPED(arr_)) { // TODO return Status::NotImplemented("Byte-swapped arrays not supported"); } if (dtype_->type_num == NPY_BOOL) { int64_t nbytes = bit_util::BytesForBits(length_); ARROW_ASSIGN_OR_RAISE(auto buffer, AllocateBuffer(nbytes, pool_)); Ndarray1DIndexer values(arr_); int64_t i = 0; const auto generate = [&values, &i]() -> bool { return values[i++] > 0; }; GenerateBitsUnrolled(buffer->mutable_data(), 0, length_, generate); *data = std::move(buffer); } else if (is_strided()) { RETURN_NOT_OK(NumPyStridedConverter::Convert(arr_, length_, pool_, data)); } else { // Can zero-copy *data = std::make_shared(reinterpret_cast(arr_)); } return Status::OK(); } template inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { RETURN_NOT_OK(PrepareInputData(data)); ARROW_ASSIGN_OR_RAISE(auto input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, cast_options_, pool_, data)); } return Status::OK(); } template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { std::shared_ptr input_type; RETURN_NOT_OK(PrepareInputData(data)); auto date_dtype = reinterpret_cast(PyDataType_C_METADATA(dtype_)); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted // separately here from int64_t to int32_t, because this data is not // supported in compute::Cast if (date_dtype->meta.base == NPY_FR_D) { // TODO(wesm): How pedantic do we really want to be about checking for int32 // overflow here? Status s = StaticCastBuffer(**data, length_, pool_, data); RETURN_NOT_OK(s); } else { ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { // The null bitmap was already computed in VisitNative() RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, cast_options_, pool_, data)); } } } else { ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, cast_options_, pool_, data)); } } return Status::OK(); } template <> inline Status NumPyConverter::ConvertData(std::shared_ptr* data) { constexpr int64_t kMillisecondsInDay = 86400000; std::shared_ptr input_type; RETURN_NOT_OK(PrepareInputData(data)); auto date_dtype = reinterpret_cast(PyDataType_C_METADATA(dtype_)); if (dtype_->type_num == NPY_DATETIME) { // If we have inbound datetime64[D] data, this needs to be downcasted // separately here from int64_t to int32_t, because this data is not // supported in compute::Cast if (date_dtype->meta.base == NPY_FR_D) { ARROW_ASSIGN_OR_RAISE(auto result, AllocateBuffer(sizeof(int64_t) * length_, pool_)); auto in_values = reinterpret_cast((*data)->data()); auto out_values = reinterpret_cast(result->mutable_data()); for (int64_t i = 0; i < length_; ++i) { *out_values++ = kMillisecondsInDay * (*in_values++); } *data = std::move(result); } else { ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { // The null bitmap was already computed in VisitNative() RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, cast_options_, pool_, data)); } } } else { ARROW_ASSIGN_OR_RAISE(input_type, NumPyDtypeToArrow(dtype_)); if (!input_type->Equals(*type_)) { RETURN_NOT_OK(CastBuffer(input_type, *data, length_, null_bitmap_, null_count_, type_, cast_options_, pool_, data)); } } return Status::OK(); } // Create 16MB chunks for binary data constexpr int32_t kBinaryChunksize = 1 << 24; template Status NumPyConverter::VisitBinary(T* builder) { auto data = reinterpret_cast(PyArray_DATA(arr_)); auto AppendNotNull = [builder, this](const uint8_t* data) { // This is annoying. NumPy allows strings to have nul-terminators, so // we must check for them here const size_t item_size = strnlen(reinterpret_cast(data), static_cast(itemsize_)); return builder->Append(data, static_cast(item_size)); }; if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder->AppendNull()); } else { RETURN_NOT_OK(AppendNotNull(data)); } data += stride_; } } else { for (int64_t i = 0; i < length_; ++i) { RETURN_NOT_OK(AppendNotNull(data)); data += stride_; } } return Status::OK(); } Status NumPyConverter::Visit(const BinaryType& type) { ::arrow::internal::ChunkedBinaryBuilder builder(kBinaryChunksize, pool_); RETURN_NOT_OK(VisitBinary(&builder)); ArrayVector result; RETURN_NOT_OK(builder.Finish(&result)); for (auto arr : result) { RETURN_NOT_OK(PushArray(arr->data())); } return Status::OK(); } Status NumPyConverter::Visit(const LargeBinaryType& type) { ::arrow::LargeBinaryBuilder builder(pool_); RETURN_NOT_OK(VisitBinary(&builder)); std::shared_ptr result; RETURN_NOT_OK(builder.Finish(&result)); return PushArray(result->data()); } Status NumPyConverter::Visit(const BinaryViewType& type) { ::arrow::BinaryViewBuilder builder(pool_); RETURN_NOT_OK(VisitBinary(&builder)); std::shared_ptr result; RETURN_NOT_OK(builder.Finish(&result)); return PushArray(result->data()); } Status NumPyConverter::Visit(const FixedSizeBinaryType& type) { auto byte_width = type.byte_width(); if (itemsize_ != byte_width) { return Status::Invalid("Got bytestring of length ", itemsize_, " (expected ", byte_width, ")"); } FixedSizeBinaryBuilder builder(::arrow::fixed_size_binary(byte_width), pool_); auto data = reinterpret_cast(PyArray_DATA(arr_)); if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); RETURN_NOT_OK(builder.Reserve(length_)); for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder.AppendNull()); } else { RETURN_NOT_OK(builder.Append(data)); } data += stride_; } } else { for (int64_t i = 0; i < length_; ++i) { RETURN_NOT_OK(builder.Append(data)); data += stride_; } } std::shared_ptr result; RETURN_NOT_OK(builder.Finish(&result)); return PushArray(result->data()); } namespace { // NumPy unicode is UCS4/UTF32 always constexpr int kNumPyUnicodeSize = 4; template Status AppendUTF32(const char* data, int64_t itemsize, int byteorder, T* builder) { // The binary \x00\x00\x00\x00 indicates a nul terminator in NumPy unicode, // so we need to detect that here to truncate if necessary. Yep. Py_ssize_t actual_length = 0; for (; actual_length < itemsize / kNumPyUnicodeSize; ++actual_length) { const char* code_point = data + actual_length * kNumPyUnicodeSize; if ((*code_point == '\0') && (*(code_point + 1) == '\0') && (*(code_point + 2) == '\0') && (*(code_point + 3) == '\0')) { break; } } OwnedRef unicode_obj(PyUnicode_DecodeUTF32(data, actual_length * kNumPyUnicodeSize, nullptr, &byteorder)); RETURN_IF_PYERROR(); OwnedRef utf8_obj(PyUnicode_AsUTF8String(unicode_obj.obj())); if (utf8_obj.obj() == NULL) { PyErr_Clear(); return Status::Invalid("failed converting UTF32 to UTF8"); } const int32_t length = static_cast(PyBytes_GET_SIZE(utf8_obj.obj())); return builder->Append( reinterpret_cast(PyBytes_AS_STRING(utf8_obj.obj())), length); } } // namespace template Status NumPyConverter::VisitString(T* builder) { auto data = reinterpret_cast(PyArray_DATA(arr_)); char numpy_byteorder = dtype_->byteorder; // For Python C API, -1 is little-endian, 1 is big-endian #if ARROW_LITTLE_ENDIAN // Yield little-endian from both '|' (native) and '<' int byteorder = numpy_byteorder == '>' ? 1 : -1; #else // Yield big-endian from both '|' (native) and '>' int byteorder = numpy_byteorder == '<' ? -1 : 1; #endif PyAcquireGIL gil_lock; const bool is_binary_type = dtype_->type_num == NPY_STRING; const bool is_unicode_type = dtype_->type_num == NPY_UNICODE; if (!is_binary_type && !is_unicode_type) { const bool is_float_type = dtype_->kind == 'f'; if (from_pandas_ && is_float_type) { // in case of from_pandas=True, accept an all-NaN float array as input RETURN_NOT_OK(NumPyNullsConverter::Convert(pool_, arr_, from_pandas_, &null_bitmap_, &null_count_)); if (null_count_ == length_) { auto arr = std::make_shared(length_); compute::ExecContext context(pool_); ARROW_ASSIGN_OR_RAISE( std::shared_ptr out, compute::Cast(*arr, arrow::utf8(), cast_options_, &context)); out_arrays_.emplace_back(out); return Status::OK(); } } std::string dtype_string; RETURN_NOT_OK(internal::PyObject_StdStringStr(reinterpret_cast(dtype_), &dtype_string)); return Status::TypeError("Expected a string or bytes dtype, got ", dtype_string); } auto AppendNonNullValue = [&](const uint8_t* data) { if (is_binary_type) { if (ARROW_PREDICT_TRUE(util::ValidateUTF8(data, itemsize_))) { return builder->Append(data, static_cast(itemsize_)); } else { return Status::Invalid("Encountered non-UTF8 binary value: ", HexEncode(data, itemsize_)); } } else { // is_unicode_type case return AppendUTF32(reinterpret_cast(data), itemsize_, byteorder, builder); } }; if (mask_ != nullptr) { Ndarray1DIndexer mask_values(mask_); for (int64_t i = 0; i < length_; ++i) { if (mask_values[i]) { RETURN_NOT_OK(builder->AppendNull()); } else { RETURN_NOT_OK(AppendNonNullValue(data)); } data += stride_; } } else { for (int64_t i = 0; i < length_; ++i) { RETURN_NOT_OK(AppendNonNullValue(data)); data += stride_; } } return Status::OK(); } Status NumPyConverter::Visit(const StringType& type) { util::InitializeUTF8(); ::arrow::internal::ChunkedStringBuilder builder(kBinaryChunksize, pool_); RETURN_NOT_OK(VisitString(&builder)); ArrayVector result; RETURN_NOT_OK(builder.Finish(&result)); for (auto arr : result) { RETURN_NOT_OK(PushArray(arr->data())); } return Status::OK(); } Status NumPyConverter::Visit(const LargeStringType& type) { util::InitializeUTF8(); ::arrow::LargeStringBuilder builder(pool_); RETURN_NOT_OK(VisitString(&builder)); std::shared_ptr result; RETURN_NOT_OK(builder.Finish(&result)); RETURN_NOT_OK(PushArray(result->data())); return Status::OK(); } Status NumPyConverter::Visit(const StringViewType& type) { util::InitializeUTF8(); ::arrow::StringViewBuilder builder(pool_); RETURN_NOT_OK(VisitString(&builder)); std::shared_ptr result; RETURN_NOT_OK(builder.Finish(&result)); RETURN_NOT_OK(PushArray(result->data())); return Status::OK(); } Status NumPyConverter::Visit(const StructType& type) { std::vector sub_converters; std::vector sub_arrays; { PyAcquireGIL gil_lock; // Create converters for each struct type field if (PyDataType_FIELDS(dtype_) == NULL || !PyDict_Check(PyDataType_FIELDS(dtype_))) { return Status::TypeError("Expected struct array"); } for (auto field : type.fields()) { PyObject* tup; PyDict_GetItemStringRef(PyDataType_FIELDS(dtype_), field->name().c_str(), &tup); RETURN_IF_PYERROR(); OwnedRef tupref(tup); if (tup == NULL) { return Status::Invalid("Missing field '", field->name(), "' in struct array"); } PyArray_Descr* sub_dtype = reinterpret_cast(PyTuple_GET_ITEM(tup, 0)); ARROW_DCHECK(PyObject_TypeCheck(sub_dtype, &PyArrayDescr_Type)); int offset = static_cast(PyLong_AsLong(PyTuple_GET_ITEM(tup, 1))); RETURN_IF_PYERROR(); Py_INCREF(sub_dtype); /* PyArray_GetField() steals ref */ PyObject* sub_array = PyArray_GetField(arr_, sub_dtype, offset); RETURN_IF_PYERROR(); sub_arrays.emplace_back(sub_array); sub_converters.emplace_back(pool_, sub_array, nullptr /* mask */, field->type(), from_pandas_); } } std::vector groups; int64_t null_count = 0; // Compute null bitmap and store it as a Boolean Array to include it // in the rechunking below { if (mask_ != nullptr) { RETURN_NOT_OK(InitNullBitmap()); null_count = MaskToBitmap(mask_, length_, null_bitmap_data_); if (null_count_ == -1) return Status::Invalid("Invalid mask type"); } groups.push_back({std::make_shared(length_, null_bitmap_)}); } // Convert child data for (auto& converter : sub_converters) { RETURN_NOT_OK(converter.Convert()); groups.push_back(converter.result()); } // Ensure the different array groups are chunked consistently groups = ::arrow::internal::RechunkArraysConsistently(groups); // Make struct array chunks by combining groups size_t ngroups = groups.size(); size_t nchunks = groups[0].size(); for (size_t chunk = 0; chunk < nchunks; chunk++) { // First group has the null bitmaps as Boolean Arrays const auto& null_data = groups[0][chunk]->data(); ARROW_DCHECK_EQ(null_data->type->id(), Type::BOOL); ARROW_DCHECK_EQ(null_data->buffers.size(), 2); const auto& null_buffer = null_data->buffers[1]; // Careful: the rechunked null bitmap may have a non-zero offset // to its buffer, and it may not even start on a byte boundary int64_t null_offset = null_data->offset; std::shared_ptr fixed_null_buffer; if (!null_buffer) { fixed_null_buffer = null_buffer; } else if (null_offset % 8 == 0) { fixed_null_buffer = std::make_shared(null_buffer, // byte offset null_offset / 8, // byte size bit_util::BytesForBits(null_data->length)); } else { ARROW_ASSIGN_OR_RAISE( fixed_null_buffer, CopyBitmap(pool_, null_buffer->data(), null_offset, null_data->length)); } // Create struct array chunk and populate it auto arr_data = ArrayData::Make(type_, null_data->length, null_count ? kUnknownNullCount : 0, 0); arr_data->buffers.push_back(fixed_null_buffer); // Append child chunks for (size_t i = 1; i < ngroups; i++) { arr_data->child_data.push_back(groups[i][chunk]->data()); } RETURN_NOT_OK(PushArray(arr_data)); } return Status::OK(); } Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, const std::shared_ptr& type, const compute::CastOptions& cast_options, std::shared_ptr* out) { if (!PyArray_Check(ao)) { // This code path cannot be reached by Python unit tests currently so this // is only a sanity check. return Status::TypeError("Input object was not a NumPy array"); } if (PyArray_NDIM(reinterpret_cast(ao)) != 1) { return Status::Invalid("only handle 1-dimensional arrays"); } NumPyConverter converter(pool, ao, mo, type, from_pandas, cast_options); RETURN_NOT_OK(converter.Convert()); const auto& output_arrays = converter.result(); ARROW_DCHECK_GT(output_arrays.size(), 0); *out = std::make_shared(output_arrays); return Status::OK(); } Status NdarrayToArrow(MemoryPool* pool, PyObject* ao, PyObject* mo, bool from_pandas, const std::shared_ptr& type, std::shared_ptr* out) { return NdarrayToArrow(pool, ao, mo, from_pandas, type, compute::CastOptions(), out); } } // namespace py } // namespace arrow