// Licensed to the Apache Software Foundation (ASF) under one // or more contributor license agreements. See the NOTICE file // distributed with this work for additional information // regarding copyright ownership. The ASF licenses this file // to you under the Apache License, Version 2.0 (the // "License"); you may not use this file except in compliance // with the License. You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, // software distributed under the License is distributed on an // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. // Functions for converting between pandas's NumPy-based data representation // and Arrow data structures #pragma once #include "arrow/python/platform.h" #include #include #include #include "arrow/memory_pool.h" #include "arrow/python/visibility.h" namespace arrow { class Array; class ChunkedArray; class Column; class DataType; class MemoryPool; class Status; class Table; namespace py { enum class MapConversionType { DEFAULT, // convert arrow maps to assoc lists (list of kev-value tuples) in Pandas LOSSY, // report warnings when lossiness is encountered due to duplicate keys STRICT_, // raise a Python exception when lossiness is encountered due to duplicate // keys }; struct PandasOptions { /// arrow::MemoryPool to use for memory allocations MemoryPool* pool = default_memory_pool(); /// If true, we will convert all string columns to categoricals bool strings_to_categorical = false; bool zero_copy_only = false; bool integer_object_nulls = false; bool date_as_object = false; bool timestamp_as_object = false; bool use_threads = false; /// Coerce all date and timestamp to datetime64[ns] bool coerce_temporal_nanoseconds = false; /// Used to maintain backwards compatibility for /// timezone bugs (see ARROW-9528). Should be removed /// after Arrow 2.0 release. bool ignore_timezone = false; /// \brief If true, do not create duplicate PyObject versions of equal /// objects. This only applies to immutable objects like strings or datetime /// objects bool deduplicate_objects = false; /// \brief For certain data types, a cast is needed in order to store the /// data in a pandas DataFrame or Series (e.g. timestamps are always stored /// as nanoseconds in pandas). This option controls whether it is a safe /// cast or not. bool safe_cast = true; /// \brief If true, create one block per column rather than consolidated /// blocks (1 per data type). Do zero-copy wrapping when there are no /// nulls. pandas currently will consolidate the blocks on its own, causing /// increased memory use, so keep this in mind if you are working on a /// memory-constrained situation. bool split_blocks = false; /// \brief If true, allow non-writable zero-copy views to be created for /// single column blocks. This option is also used to provide zero copy for /// Series data bool allow_zero_copy_blocks = false; /// \brief If true, attempt to deallocate buffers in passed Arrow object if /// it is the only remaining shared_ptr copy of it. See ARROW-3789 for /// original context for this feature. Only currently implemented for Table /// conversions bool self_destruct = false; /// \brief The default behavior (DEFAULT), is to convert Arrow Map arrays to /// Python association lists (list-of-tuples) in the same order as the Arrow /// Map, as in [(key1, value1), (key2, value2), ...] /// If LOSSY or STRICT, convert Arrow Map arrays to native Python dicts. /// This can change the ordering of (key, value) pairs, and will deduplicate /// multiple keys, resulting in a possible loss of data. /// If 'lossy', this key deduplication results in a warning printed /// when detected. If 'strict', this instead results in an exception /// being raised when detected. MapConversionType maps_as_pydicts = MapConversionType::DEFAULT; // Used internally for nested arrays. bool decode_dictionaries = false; // Columns that should be casted to categorical std::unordered_set categorical_columns; // Columns that should be passed through to be converted to // ExtensionArray/Block std::unordered_set extension_columns; // Used internally to decipher between to_numpy() and to_pandas() when // the expected output differs bool to_numpy = false; }; ARROW_PYTHON_EXPORT Status ConvertArrayToPandas(const PandasOptions& options, std::shared_ptr arr, PyObject* py_ref, PyObject** out); ARROW_PYTHON_EXPORT Status ConvertChunkedArrayToPandas(const PandasOptions& options, std::shared_ptr col, PyObject* py_ref, PyObject** out); // Convert a whole table as efficiently as possible to a pandas.DataFrame. // // The returned Python object is a list of tuples consisting of the exact 2D // BlockManager structure of the pandas.DataFrame used as of pandas 0.19.x. // // tuple item: (indices: ndarray[int32], block: ndarray[TYPE, ndim=2]) ARROW_PYTHON_EXPORT Status ConvertTableToPandas(const PandasOptions& options, std::shared_ptr table, PyObject** out); } // namespace py } // namespace arrow