/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * KIND, either express or implied. See the License for the * specific language governing permissions and limitations * under the License. */ /** * File format description for the parquet file format */ namespace cpp parquet namespace java org.apache.parquet.format /** * Types supported by Parquet. These types are intended to be used in combination * with the encodings to control the on disk storage format. * For example INT16 is not included as a type since a good encoding of INT32 * would handle this. */ enum Type { BOOLEAN = 0; INT32 = 1; INT64 = 2; INT96 = 3; // deprecated, only used by legacy implementations. FLOAT = 4; DOUBLE = 5; BYTE_ARRAY = 6; FIXED_LEN_BYTE_ARRAY = 7; } /** * Common types used by frameworks(e.g. hive, pig) using parquet. This helps map * between types in those frameworks to the base types in parquet. This is only * metadata and not needed to read or write the data. */ enum ConvertedType { /** a BYTE_ARRAY actually contains UTF8 encoded chars */ UTF8 = 0; /** a map is converted as an optional field containing a repeated key/value pair */ MAP = 1; /** a key/value pair is converted into a group of two fields */ MAP_KEY_VALUE = 2; /** a list is converted into an optional field containing a repeated field for its * values */ LIST = 3; /** an enum is converted into a binary field */ ENUM = 4; /** * A decimal value. * * This may be used to annotate binary or fixed primitive types. The * underlying byte array stores the unscaled value encoded as two's * complement using big-endian byte order (the most significant byte is the * zeroth element). The value of the decimal is the value * 10^{-scale}. * * This must be accompanied by a (maximum) precision and a scale in the * SchemaElement. The precision specifies the number of digits in the decimal * and the scale stores the location of the decimal point. For example 1.23 * would have precision 3 (3 total digits) and scale 2 (the decimal point is * 2 digits over). */ DECIMAL = 5; /** * A Date * * Stored as days since Unix epoch, encoded as the INT32 physical type. * */ DATE = 6; /** * A time * * The total number of milliseconds since midnight. The value is stored * as an INT32 physical type. */ TIME_MILLIS = 7; /** * A time. * * The total number of microseconds since midnight. The value is stored as * an INT64 physical type. */ TIME_MICROS = 8; /** * A date/time combination * * Date and time recorded as milliseconds since the Unix epoch. Recorded as * a physical type of INT64. */ TIMESTAMP_MILLIS = 9; /** * A date/time combination * * Date and time recorded as microseconds since the Unix epoch. The value is * stored as an INT64 physical type. */ TIMESTAMP_MICROS = 10; /** * An unsigned integer value. * * The number describes the maximum number of meainful data bits in * the stored value. 8, 16 and 32 bit values are stored using the * INT32 physical type. 64 bit values are stored using the INT64 * physical type. * */ UINT_8 = 11; UINT_16 = 12; UINT_32 = 13; UINT_64 = 14; /** * A signed integer value. * * The number describes the maximum number of meainful data bits in * the stored value. 8, 16 and 32 bit values are stored using the * INT32 physical type. 64 bit values are stored using the INT64 * physical type. * */ INT_8 = 15; INT_16 = 16; INT_32 = 17; INT_64 = 18; /** * An embedded JSON document * * A JSON document embedded within a single UTF8 column. */ JSON = 19; /** * An embedded BSON document * * A BSON document embedded within a single BINARY column. */ BSON = 20; /** * An interval of time * * This type annotates data stored as a FIXED_LEN_BYTE_ARRAY of length 12 * This data is composed of three separate little endian unsigned * integers. Each stores a component of a duration of time. The first * integer identifies the number of months associated with the duration, * the second identifies the number of days associated with the duration * and the third identifies the number of milliseconds associated with * the provided duration. This duration of time is independent of any * particular timezone or date. */ INTERVAL = 21; } /** * Representation of Schemas */ enum FieldRepetitionType { /** This field is required (can not be null) and each record has exactly 1 value. */ REQUIRED = 0; /** The field is optional (can be null) and each record has 0 or 1 values. */ OPTIONAL = 1; /** The field is repeated and can contain 0 or more values */ REPEATED = 2; } /** * Statistics per row group and per page * All fields are optional. */ struct Statistics { /** * DEPRECATED: min and max value of the column. Use min_value and max_value. * * Values are encoded using PLAIN encoding, except that variable-length byte * arrays do not include a length prefix. * * These fields encode min and max values determined by signed comparison * only. New files should use the correct order for a column's logical type * and store the values in the min_value and max_value fields. * * To support older readers, these may be set when the column order is * signed. */ 1: optional binary max; 2: optional binary min; /** count of null value in the column */ 3: optional i64 null_count; /** count of distinct values occurring */ 4: optional i64 distinct_count; /** * Min and max values for the column, determined by its ColumnOrder. * * Values are encoded using PLAIN encoding, except that variable-length byte * arrays do not include a length prefix. */ 5: optional binary max_value; 6: optional binary min_value; } /** Empty structs to use as logical type annotations */ struct StringType {} // allowed for BINARY, must be encoded with UTF-8 struct UUIDType {} // allowed for FIXED[16], must encoded raw UUID bytes struct MapType {} // see LogicalTypes.md struct ListType {} // see LogicalTypes.md struct EnumType {} // allowed for BINARY, must be encoded with UTF-8 struct DateType {} // allowed for INT32 /** * Logical type to annotate a column that is always null. * * Sometimes when discovering the schema of existing data, values are always * null and the physical type can't be determined. This annotation signals * the case where the physical type was guessed from all null values. */ struct NullType {} // allowed for any physical type, only null values stored /** * Decimal logical type annotation * * To maintain forward-compatibility in v1, implementations using this logical * type must also set scale and precision on the annotated SchemaElement. * * Allowed for physical types: INT32, INT64, FIXED, and BINARY */ struct DecimalType { 1: required i32 scale 2: required i32 precision } /** Time units for logical types */ struct MilliSeconds {} struct MicroSeconds {} struct NanoSeconds {} union TimeUnit { 1: MilliSeconds MILLIS 2: MicroSeconds MICROS 3: NanoSeconds NANOS } /** * Timestamp logical type annotation * * Allowed for physical types: INT64 */ struct TimestampType { 1: required bool isAdjustedToUTC 2: required TimeUnit unit } /** * Time logical type annotation * * Allowed for physical types: INT32 (millis), INT64 (micros, nanos) */ struct TimeType { 1: required bool isAdjustedToUTC 2: required TimeUnit unit } /** * Integer logical type annotation * * bitWidth must be 8, 16, 32, or 64. * * Allowed for physical types: INT32, INT64 */ struct IntType { 1: required byte bitWidth 2: required bool isSigned } /** * Embedded JSON logical type annotation * * Allowed for physical types: BINARY */ struct JsonType { } /** * Embedded BSON logical type annotation * * Allowed for physical types: BINARY */ struct BsonType { } /** * LogicalType annotations to replace ConvertedType. * * To maintain compatibility, implementations using LogicalType for a * SchemaElement must also set the corresponding ConvertedType from the * following table. */ union LogicalType { 1: StringType STRING // use ConvertedType UTF8 2: MapType MAP // use ConvertedType MAP 3: ListType LIST // use ConvertedType LIST 4: EnumType ENUM // use ConvertedType ENUM 5: DecimalType DECIMAL // use ConvertedType DECIMAL 6: DateType DATE // use ConvertedType DATE 7: TimeType TIME // use ConvertedType TIME_MICROS or TIME_MILLIS 8: TimestampType TIMESTAMP // use ConvertedType TIMESTAMP_MICROS or TIMESTAMP_MILLIS // 9: reserved for INTERVAL 10: IntType INTEGER // use ConvertedType INT_* or UINT_* 11: NullType UNKNOWN // no compatible ConvertedType 12: JsonType JSON // use ConvertedType JSON 13: BsonType BSON // use ConvertedType BSON 14: UUIDType UUID } /** * Represents a element inside a schema definition. * - if it is a group (inner node) then type is undefined and num_children is defined * - if it is a primitive type (leaf) then type is defined and num_children is undefined * the nodes are listed in depth first traversal order. */ struct SchemaElement { /** Data type for this field. Not set if the current element is a non-leaf node */ 1: optional Type type; /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the vales. * Otherwise, if specified, this is the maximum bit length to store any of the values. * (e.g. a low cardinality INT col could have this set to 3). Note that this is * in the schema, and therefore fixed for the entire file. */ 2: optional i32 type_length; /** repetition of the field. The root of the schema does not have a repetition_type. * All other nodes must have one */ 3: optional FieldRepetitionType repetition_type; /** Name of the field in the schema */ 4: required string name; /** Nested fields. Since thrift does not support nested fields, * the nesting is flattened to a single list by a depth-first traversal. * The children count is used to construct the nested relationship. * This field is not set when the element is a primitive type */ 5: optional i32 num_children; /** When the schema is the result of a conversion from another model * Used to record the original type to help with cross conversion. */ 6: optional ConvertedType converted_type; /** Used when this column contains decimal data. * See the DECIMAL converted type for more details. */ 7: optional i32 scale 8: optional i32 precision /** When the original schema supports field ids, this will save the * original field id in the parquet schema */ 9: optional i32 field_id; /** * The logical type of this SchemaElement * * LogicalType replaces ConvertedType, but ConvertedType is still required * for some logical types to ensure forward-compatibility in format v1. */ 10: optional LogicalType logicalType } /** * Encodings supported by Parquet. Not all encodings are valid for all types. These * enums are also used to specify the encoding of definition and repetition levels. * See the accompanying doc for the details of the more complicated encodings. */ enum Encoding { /** Default encoding. * BOOLEAN - 1 bit per value. 0 is false; 1 is true. * INT32 - 4 bytes per value. Stored as little-endian. * INT64 - 8 bytes per value. Stored as little-endian. * FLOAT - 4 bytes per value. IEEE. Stored as little-endian. * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. * FIXED_LEN_BYTE_ARRAY - Just the bytes. */ PLAIN = 0; /** Group VarInt encoding for INT32/INT64. * This encoding is deprecated. It was never used */ // GROUP_VAR_INT = 1; /** * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the * plain type. * in a data page use RLE_DICTIONARY instead. * in a Dictionary page use PLAIN instead */ PLAIN_DICTIONARY = 2; /** Group packed run length encoding. Usable for definition/repetition levels * encoding and Booleans (on one bit: 0 is false; 1 is true.) */ RLE = 3; /** Bit packed encoding. This can only be used if the data has a known max * width. Usable for definition/repetition levels encoding. */ BIT_PACKED = 4; /** Delta encoding for integers. This can be used for int columns and works best * on sorted data */ DELTA_BINARY_PACKED = 5; /** Encoding for byte arrays to separate the length values and the data. The lengths * are encoded using DELTA_BINARY_PACKED */ DELTA_LENGTH_BYTE_ARRAY = 6; /** Incremental-encoded byte array. Prefix lengths are encoded using DELTA_BINARY_PACKED. * Suffixes are stored as delta length byte arrays. */ DELTA_BYTE_ARRAY = 7; /** Dictionary encoding: the ids are encoded using the RLE encoding */ RLE_DICTIONARY = 8; } /** * Supported compression algorithms. * * Codecs added in 2.4 can be read by readers based on 2.4 and later. * Codec support may vary between readers based on the format version and * libraries available at runtime. Gzip, Snappy, and LZ4 codecs are * widely available, while Zstd and Brotli require additional libraries. */ enum CompressionCodec { UNCOMPRESSED = 0; SNAPPY = 1; GZIP = 2; LZO = 3; BROTLI = 4; // Added in 2.4 LZ4 = 5; // Added in 2.4 ZSTD = 6; // Added in 2.4 } enum PageType { DATA_PAGE = 0; INDEX_PAGE = 1; DICTIONARY_PAGE = 2; DATA_PAGE_V2 = 3; } /** * Enum to annotate whether lists of min/max elements inside ColumnIndex * are ordered and if so, in which direction. */ enum BoundaryOrder { UNORDERED = 0; ASCENDING = 1; DESCENDING = 2; } /** Data page header */ struct DataPageHeader { /** Number of values, including NULLs, in this data page. **/ 1: required i32 num_values /** Encoding used for this data page **/ 2: required Encoding encoding /** Encoding used for definition levels **/ 3: required Encoding definition_level_encoding; /** Encoding used for repetition levels **/ 4: required Encoding repetition_level_encoding; /** Optional statistics for the data in this page**/ 5: optional Statistics statistics; } struct IndexPageHeader { /** TODO: **/ } struct DictionaryPageHeader { /** Number of values in the dictionary **/ 1: required i32 num_values; /** Encoding using this dictionary page **/ 2: required Encoding encoding /** If true, the entries in the dictionary are sorted in ascending order **/ 3: optional bool is_sorted; } /** * New page format allowing reading levels without decompressing the data * Repetition and definition levels are uncompressed * The remaining section containing the data is compressed if is_compressed is true **/ struct DataPageHeaderV2 { /** Number of values, including NULLs, in this data page. **/ 1: required i32 num_values /** Number of NULL values, in this data page. Number of non-null = num_values - num_nulls which is also the number of values in the data section **/ 2: required i32 num_nulls /** Number of rows in this data page. which means pages change on record boundaries (r = 0) **/ 3: required i32 num_rows /** Encoding used for data in this page **/ 4: required Encoding encoding // repetition levels and definition levels are always using RLE (without size in it) /** length of the definition levels */ 5: required i32 definition_levels_byte_length; /** length of the repetition levels */ 6: required i32 repetition_levels_byte_length; /** whether the values are compressed. Which means the section of the page between definition_levels_byte_length + repetition_levels_byte_length + 1 and compressed_page_size (included) is compressed with the compression_codec. If missing it is considered compressed */ 7: optional bool is_compressed = 1; /** optional statistics for this column chunk */ 8: optional Statistics statistics; } struct PageHeader { /** the type of the page: indicates which of the *_header fields is set **/ 1: required PageType type /** Uncompressed page size in bytes (not including this header) **/ 2: required i32 uncompressed_page_size /** Compressed page size in bytes (not including this header) **/ 3: required i32 compressed_page_size /** 32bit crc for the data below. This allows for disabling checksumming in HDFS * if only a few pages needs to be read **/ 4: optional i32 crc // Headers for page specific data. One only will be set. 5: optional DataPageHeader data_page_header; 6: optional IndexPageHeader index_page_header; 7: optional DictionaryPageHeader dictionary_page_header; 8: optional DataPageHeaderV2 data_page_header_v2; } /** * Wrapper struct to store key values */ struct KeyValue { 1: required string key 2: optional string value } /** * Wrapper struct to specify sort order */ struct SortingColumn { /** The column index (in this row group) **/ 1: required i32 column_idx /** If true, indicates this column is sorted in descending order. **/ 2: required bool descending /** If true, nulls will come before non-null values, otherwise, * nulls go at the end. */ 3: required bool nulls_first } /** * statistics of a given page type and encoding */ struct PageEncodingStats { /** the page type (data/dic/...) **/ 1: required PageType page_type; /** encoding of the page **/ 2: required Encoding encoding; /** number of pages of this type with this encoding **/ 3: required i32 count; } /** * Description for column metadata */ struct ColumnMetaData { /** Type of this column **/ 1: required Type type /** Set of all encodings used for this column. The purpose is to validate * whether we can decode those pages. **/ 2: required list encodings /** Path in schema **/ 3: required list path_in_schema /** Compression codec **/ 4: required CompressionCodec codec /** Number of values in this column **/ 5: required i64 num_values /** total byte size of all uncompressed pages in this column chunk (including the headers) **/ 6: required i64 total_uncompressed_size /** total byte size of all compressed pages in this column chunk (including the headers) **/ 7: required i64 total_compressed_size /** Optional key/value metadata **/ 8: optional list key_value_metadata /** Byte offset from beginning of file to first data page **/ 9: required i64 data_page_offset /** Byte offset from beginning of file to root index page **/ 10: optional i64 index_page_offset /** Byte offset from the beginning of file to first (only) dictionary page **/ 11: optional i64 dictionary_page_offset /** optional statistics for this column chunk */ 12: optional Statistics statistics; /** Set of all encodings used for pages in this column chunk. * This information can be used to determine if all data pages are * dictionary encoded for example **/ 13: optional list encoding_stats; } struct ColumnChunk { /** File where column data is stored. If not set, assumed to be same file as * metadata. This path is relative to the current file. **/ 1: optional string file_path /** Byte offset in file_path to the ColumnMetaData **/ 2: required i64 file_offset /** Column metadata for this chunk. This is the same content as what is at * file_path/file_offset. Having it here has it replicated in the file * metadata. **/ 3: optional ColumnMetaData meta_data /** File offset of ColumnChunk's OffsetIndex **/ 4: optional i64 offset_index_offset /** Size of ColumnChunk's OffsetIndex, in bytes **/ 5: optional i32 offset_index_length /** File offset of ColumnChunk's ColumnIndex **/ 6: optional i64 column_index_offset /** Size of ColumnChunk's ColumnIndex, in bytes **/ 7: optional i32 column_index_length } struct RowGroup { /** Metadata for each column chunk in this row group. * This list must have the same order as the SchemaElement list in FileMetaData. **/ 1: required list columns /** Total byte size of all the uncompressed column data in this row group **/ 2: required i64 total_byte_size /** Number of rows in this row group **/ 3: required i64 num_rows /** If set, specifies a sort ordering of the rows in this RowGroup. * The sorting columns can be a subset of all the columns. */ 4: optional list sorting_columns } /** Empty struct to signal the order defined by the physical or logical type */ struct TypeDefinedOrder {} /** * Union to specify the order used for the min_value and max_value fields for a * column. This union takes the role of an enhanced enum that allows rich * elements (which will be needed for a collation-based ordering in the future). * * Possible values are: * * TypeDefinedOrder - the column uses the order defined by its logical or * physical type (if there is no logical type). * * If the reader does not support the value of this union, min and max stats * for this column should be ignored. */ union ColumnOrder { /** * The sort orders for logical types are: * UTF8 - unsigned byte-wise comparison * INT8 - signed comparison * INT16 - signed comparison * INT32 - signed comparison * INT64 - signed comparison * UINT8 - unsigned comparison * UINT16 - unsigned comparison * UINT32 - unsigned comparison * UINT64 - unsigned comparison * DECIMAL - signed comparison of the represented value * DATE - signed comparison * TIME_MILLIS - signed comparison * TIME_MICROS - signed comparison * TIMESTAMP_MILLIS - signed comparison * TIMESTAMP_MICROS - signed comparison * INTERVAL - unsigned comparison * JSON - unsigned byte-wise comparison * BSON - unsigned byte-wise comparison * ENUM - unsigned byte-wise comparison * LIST - undefined * MAP - undefined * * In the absence of logical types, the sort order is determined by the physical type: * BOOLEAN - false, true * INT32 - signed comparison * INT64 - signed comparison * INT96 (only used for legacy timestamps) - undefined * FLOAT - signed comparison of the represented value (*) * DOUBLE - signed comparison of the represented value (*) * BYTE_ARRAY - unsigned byte-wise comparison * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison * * (*) Because the sorting order is not specified properly for floating * point values (relations vs. total ordering) the following * compatibility rules should be applied when reading statistics: * - If the min is a NaN, it should be ignored. * - If the max is a NaN, it should be ignored. * - If the min is +0, the row group may contain -0 values as well. * - If the max is -0, the row group may contain +0 values as well. * - When looking for NaN values, min and max should be ignored. */ 1: TypeDefinedOrder TYPE_ORDER; } struct PageLocation { /** Offset of the page in the file **/ 1: required i64 offset /** * Size of the page, including header. Sum of compressed_page_size and header * length */ 2: required i32 compressed_page_size /** * Index within the RowGroup of the first row of the page; this means pages * change on record boundaries (r = 0). */ 3: required i64 first_row_index } struct OffsetIndex { /** * PageLocations, ordered by increasing PageLocation.offset. It is required * that page_locations[i].first_row_index < page_locations[i+1].first_row_index. */ 1: required list page_locations } /** * Description for ColumnIndex. * Each [i] refers to the page at OffsetIndex.page_locations[i] */ struct ColumnIndex { /** * A list of Boolean values to determine the validity of the corresponding * min and max values. If true, a page contains only null values, and writers * have to set the corresponding entries in min_values and max_values to * byte[0], so that all lists have the same length. If false, the * corresponding entries in min_values and max_values must be valid. */ 1: required list null_pages /** * Two lists containing lower and upper bounds for the values of each page. * These may be the actual minimum and maximum values found on a page, but * can also be (more compact) values that do not exist on a page. For * example, instead of storing ""Blart Versenwald III", a writer may set * min_values[i]="B", max_values[i]="C". Such more compact values must still * be valid values within the column's logical type. Readers must make sure * that list entries are populated before using them by inspecting null_pages. */ 2: required list min_values 3: required list max_values /** * Stores whether both min_values and max_values are orderd and if so, in * which direction. This allows readers to perform binary searches in both * lists. Readers cannot assume that max_values[i] <= min_values[i+1], even * if the lists are ordered. */ 4: required BoundaryOrder boundary_order /** A list containing the number of null values for each page **/ 5: optional list null_counts } /** * Description for file metadata */ struct FileMetaData { /** Version of this file **/ 1: required i32 version /** Parquet schema for this file. This schema contains metadata for all the columns. * The schema is represented as a tree with a single root. The nodes of the tree * are flattened to a list by doing a depth-first traversal. * The column metadata contains the path in the schema for that column which can be * used to map columns to nodes in the schema. * The first element is the root **/ 2: required list schema; /** Number of rows in this file **/ 3: required i64 num_rows /** Row groups in this file **/ 4: required list row_groups /** Optional key/value metadata **/ 5: optional list key_value_metadata /** String for application that wrote this file. This should be in the format * version (build ). * e.g. impala version 1.0 (build 6cf94d29b2b7115df4de2c06e2ab4326d721eb55) **/ 6: optional string created_by /** * Sort order used for the min_value and max_value fields of each column in * this file. Each sort order corresponds to one column, determined by its * position in the list, matching the position of the column in the schema. * * Without column_orders, the meaning of the min_value and max_value fields is * undefined. To ensure well-defined behaviour, if min_value and max_value are * written to a Parquet file, column_orders must be written as well. * * The obsolete min and max fields are always sorted by signed comparison * regardless of column_orders. */ 7: optional list column_orders; }