// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <iosfwd>
#include <memory>
#include <optional>
#include <vector>

#include "arrow/util/span.h"
#include "parquet/platform.h"
#include "parquet/type_fwd.h"

namespace parquet {

/// A structure for capturing metadata for estimating the unencoded,
/// uncompressed size of data written. This is useful for readers to estimate
/// how much memory is needed to reconstruct data in their memory model and for
/// fine-grained filter push down on nested structures (the histograms contained
/// in this structure can help determine the number of nulls at a particular
/// nesting level and the maximum length of lists).
struct PARQUET_EXPORT SizeStatistics {
  /// When present, there is expected to be one element corresponding to each
  /// definition level (i.e. size = max definition level + 1), where each
  /// element represents the number of times that definition level was
  /// observed in the data.
  ///
  /// This field may be omitted (i.e. a zero-length vector) if max_definition_level
  /// is 0 without loss of information.
  std::vector<int64_t> definition_level_histogram;

  /// Same as definition_level_histogram except for repetition levels.
  ///
  /// This field may be omitted (i.e. a zero-length vector) if max_repetition_level
  /// is 0 without loss of information.
  std::vector<int64_t> repetition_level_histogram;

  /// The number of physical bytes stored for BYTE_ARRAY data values assuming
  /// no encoding. This is exclusive of the bytes needed to store the length of
  /// each byte array. In other words, this field is equivalent to `(size of
  /// PLAIN-ENCODING the byte array values) - (4 bytes * number of values
  /// written)`. To determine unencoded sizes of other types, readers can use
  /// schema information multiplied by the number of non-null and null values.
  /// The number of null/non-null values can be inferred from the histograms
  /// above.
  ///
  /// For example, if a column chunk is dictionary-encoded with dictionary
  /// ["a", "bc", "cde"], and a data page contains the indices [0, 0, 1, 2],
  /// then this value for that data page should be 7 (1 + 1 + 2 + 3).
  ///
  /// This field should only be set for types that use BYTE_ARRAY as their
  /// physical type.
  std::optional<int64_t> unencoded_byte_array_data_bytes;

  /// \brief Check if the SizeStatistics is set.
  bool is_set() const {
    return !repetition_level_histogram.empty() || !definition_level_histogram.empty() ||
           unencoded_byte_array_data_bytes.has_value();
  }

  /// \brief Increment the unencoded byte array data bytes.
  void IncrementUnencodedByteArrayDataBytes(int64_t value);
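
  // A minimal illustration (not part of the API) of how a writer might accumulate
  // unencoded_byte_array_data_bytes for the dictionary example documented above.
  // `dictionary` and `indices` are hypothetical local variables, and the sketch
  // assumes the increment initializes the optional when it is unset:
  //
  //   SizeStatistics stats;
  //   std::vector<std::string> dictionary = {"a", "bc", "cde"};
  //   std::vector<int32_t> indices = {0, 0, 1, 2};
  //   for (int32_t index : indices) {
  //     stats.IncrementUnencodedByteArrayDataBytes(
  //         static_cast<int64_t>(dictionary[index].size()));
  //   }
  //   // stats.unencoded_byte_array_data_bytes would now hold 7 (1 + 1 + 2 + 3).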

  /// \brief Merge two SizeStatistics.
  /// \throws ParquetException if the SizeStatistics to merge is not compatible.
  void Merge(const SizeStatistics& other);

  /// \brief Validate the SizeStatistics.
  /// \throws ParquetException if the histograms don't have the right length,
  /// or if unencoded_byte_array_data_bytes is present for a non-BYTE_ARRAY column.
  void Validate(const ColumnDescriptor* descr) const;

  /// \brief Reset the SizeStatistics to be empty.
  void Reset();

  /// \brief Make an empty SizeStatistics object for a specific column type.
  static std::unique_ptr<SizeStatistics> Make(const ColumnDescriptor* descr);
};

PARQUET_EXPORT std::ostream& operator<<(std::ostream&, const SizeStatistics&);

/// \brief Update a level histogram with a batch of levels: each level value
/// increments the histogram bucket at the corresponding index.
PARQUET_EXPORT void UpdateLevelHistogram(::arrow::util::span<const int16_t> levels,
                                         ::arrow::util::span<int64_t> histogram);

}  // namespace parquet
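
// A rough write-side usage sketch, not taken from this header. It assumes a
// nested column (both max levels > 0), a ColumnDescriptor* `descr`, per-page
// level spans `def_levels`/`rep_levels`, a per-chunk `chunk_stats`, and that
// Make() sizes the histograms from the descriptor so UpdateLevelHistogram can
// index into them directly:
//
//   std::unique_ptr<SizeStatistics> page_stats = SizeStatistics::Make(descr);
//   UpdateLevelHistogram(def_levels, page_stats->definition_level_histogram);
//   UpdateLevelHistogram(rep_levels, page_stats->repetition_level_histogram);
//   chunk_stats->Merge(*page_stats);  // accumulate the page into the chunk
//   chunk_stats->Validate(descr);     // throws ParquetException on mismatch
//   page_stats->Reset();              // reuse for the next data page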