public class ParquetFileWriter extends Object
| Modifier and Type | Class and Description |
|---|---|
static class |
ParquetFileWriter.Mode |
| Modifier and Type | Field and Description |
|---|---|
static int |
CURRENT_VERSION |
static String |
EF_MAGIC_STR |
static byte[] |
EFMAGIC |
static byte[] |
MAGIC |
static String |
MAGIC_STR |
protected org.apache.parquet.io.PositionOutputStream |
out |
static String |
PARQUET_COMMON_METADATA_FILE |
static String |
PARQUET_METADATA_FILE |
| Constructor and Description |
|---|
ParquetFileWriter(org.apache.hadoop.conf.Configuration configuration,
org.apache.parquet.schema.MessageType schema,
org.apache.hadoop.fs.Path file)
Deprecated.
will be removed in 2.0.0
|
ParquetFileWriter(org.apache.hadoop.conf.Configuration configuration,
org.apache.parquet.schema.MessageType schema,
org.apache.hadoop.fs.Path file,
ParquetFileWriter.Mode mode)
Deprecated.
will be removed in 2.0.0
|
ParquetFileWriter(org.apache.hadoop.conf.Configuration configuration,
org.apache.parquet.schema.MessageType schema,
org.apache.hadoop.fs.Path file,
ParquetFileWriter.Mode mode,
long rowGroupSize,
int maxPaddingSize)
Deprecated.
will be removed in 2.0.0
|
ParquetFileWriter(org.apache.parquet.io.OutputFile file,
org.apache.parquet.schema.MessageType schema,
ParquetFileWriter.Mode mode,
long rowGroupSize,
int maxPaddingSize)
Deprecated.
will be removed in 2.0.0
|
ParquetFileWriter(org.apache.parquet.io.OutputFile file,
org.apache.parquet.schema.MessageType schema,
ParquetFileWriter.Mode mode,
long rowGroupSize,
int maxPaddingSize,
int columnIndexTruncateLength,
int statisticsTruncateLength,
boolean pageWriteChecksumEnabled) |
ParquetFileWriter(org.apache.parquet.io.OutputFile file,
org.apache.parquet.schema.MessageType schema,
ParquetFileWriter.Mode mode,
long rowGroupSize,
int maxPaddingSize,
int columnIndexTruncateLength,
int statisticsTruncateLength,
boolean pageWriteChecksumEnabled,
org.apache.parquet.crypto.FileEncryptionProperties encryptionProperties) |
| Modifier and Type | Method and Description |
|---|---|
void |
appendColumnChunk(org.apache.parquet.column.ColumnDescriptor descriptor,
org.apache.parquet.io.SeekableInputStream from,
org.apache.parquet.hadoop.metadata.ColumnChunkMetaData chunk,
org.apache.parquet.column.values.bloomfilter.BloomFilter bloomFilter,
org.apache.parquet.internal.column.columnindex.ColumnIndex columnIndex,
org.apache.parquet.internal.column.columnindex.OffsetIndex offsetIndex) |
void |
appendFile(org.apache.hadoop.conf.Configuration conf,
org.apache.hadoop.fs.Path file)
Deprecated.
will be removed in 2.0.0; use
appendFile(InputFile) instead |
void |
appendFile(org.apache.parquet.io.InputFile file) |
void |
appendRowGroup(org.apache.hadoop.fs.FSDataInputStream from,
org.apache.parquet.hadoop.metadata.BlockMetaData rowGroup,
boolean dropColumns)
Deprecated.
will be removed in 2.0.0;
use
appendRowGroup(SeekableInputStream,BlockMetaData,boolean) instead |
void |
appendRowGroup(org.apache.parquet.io.SeekableInputStream from,
org.apache.parquet.hadoop.metadata.BlockMetaData rowGroup,
boolean dropColumns) |
void |
appendRowGroups(org.apache.hadoop.fs.FSDataInputStream file,
List<org.apache.parquet.hadoop.metadata.BlockMetaData> rowGroups,
boolean dropColumns)
Deprecated.
will be removed in 2.0.0;
use
appendRowGroups(SeekableInputStream,List,boolean) instead |
void |
appendRowGroups(org.apache.parquet.io.SeekableInputStream file,
List<org.apache.parquet.hadoop.metadata.BlockMetaData> rowGroups,
boolean dropColumns) |
void |
end(Map<String,String> extraMetaData)
ends a file once all blocks have been written.
|
void |
endBlock()
ends a block once all column chunks have been written
|
void |
endColumn()
ends a column (once all repetition levels, definition levels and data have been written)
|
org.apache.parquet.hadoop.metadata.ParquetMetadata |
getFooter() |
long |
getNextRowGroupSize() |
long |
getPos() |
static org.apache.parquet.hadoop.metadata.ParquetMetadata |
mergeMetadataFiles(List<org.apache.hadoop.fs.Path> files,
org.apache.hadoop.conf.Configuration conf)
Deprecated.
metadata files are not recommended and will be removed in 2.0.0
|
static org.apache.parquet.hadoop.metadata.ParquetMetadata |
mergeMetadataFiles(List<org.apache.hadoop.fs.Path> files,
org.apache.hadoop.conf.Configuration conf,
org.apache.parquet.hadoop.metadata.KeyValueMetadataMergeStrategy keyValueMetadataMergeStrategy)
Deprecated.
metadata files are not recommended and will be removed in 2.0.0
|
void |
start()
start the file
|
void |
startBlock(long recordCount)
start a block
|
void |
startColumn(org.apache.parquet.column.ColumnDescriptor descriptor,
long valueCount,
org.apache.parquet.hadoop.metadata.CompressionCodecName compressionCodecName)
start a column inside a block
|
void |
writeDataPage(int valueCount,
int uncompressedPageSize,
org.apache.parquet.bytes.BytesInput bytes,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding)
Deprecated.
|
void |
writeDataPage(int valueCount,
int uncompressedPageSize,
org.apache.parquet.bytes.BytesInput bytes,
org.apache.parquet.column.statistics.Statistics statistics,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding)
Deprecated.
this method does not support writing column indexes; Use
writeDataPage(int, int, BytesInput, Statistics, long, Encoding, Encoding, Encoding) instead |
void |
writeDataPage(int valueCount,
int uncompressedPageSize,
org.apache.parquet.bytes.BytesInput bytes,
org.apache.parquet.column.statistics.Statistics statistics,
long rowCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding)
Writes a single page
|
void |
writeDataPageV2(int rowCount,
int nullCount,
int valueCount,
org.apache.parquet.bytes.BytesInput repetitionLevels,
org.apache.parquet.bytes.BytesInput definitionLevels,
org.apache.parquet.column.Encoding dataEncoding,
org.apache.parquet.bytes.BytesInput compressedData,
int uncompressedDataSize,
org.apache.parquet.column.statistics.Statistics<?> statistics)
Writes a single v2 data page
|
void |
writeDictionaryPage(org.apache.parquet.column.page.DictionaryPage dictionaryPage)
writes a dictionary page
|
void |
writeDictionaryPage(org.apache.parquet.column.page.DictionaryPage dictionaryPage,
org.apache.parquet.format.BlockCipher.Encryptor headerBlockEncryptor,
byte[] AAD) |
static void |
writeMergedMetadataFile(List<org.apache.hadoop.fs.Path> files,
org.apache.hadoop.fs.Path outputPath,
org.apache.hadoop.conf.Configuration conf)
Deprecated.
metadata files are not recommended and will be removed in 2.0.0
|
static void |
writeMetadataFile(org.apache.hadoop.conf.Configuration configuration,
org.apache.hadoop.fs.Path outputPath,
List<org.apache.parquet.hadoop.Footer> footers)
Deprecated.
metadata files are not recommended and will be removed in 2.0.0
|
static void |
writeMetadataFile(org.apache.hadoop.conf.Configuration configuration,
org.apache.hadoop.fs.Path outputPath,
List<org.apache.parquet.hadoop.Footer> footers,
org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel level)
Deprecated.
metadata files are not recommended and will be removed in 2.0.0
|
public static final String PARQUET_METADATA_FILE
public static final String MAGIC_STR
public static final byte[] MAGIC
public static final String EF_MAGIC_STR
public static final byte[] EFMAGIC
public static final String PARQUET_COMMON_METADATA_FILE
public static final int CURRENT_VERSION
protected final org.apache.parquet.io.PositionOutputStream out
@Deprecated public ParquetFileWriter(org.apache.hadoop.conf.Configuration configuration, org.apache.parquet.schema.MessageType schema, org.apache.hadoop.fs.Path file) throws IOException
configuration - Hadoop configuration
schema - the schema of the data
file - the file to write to
IOException - if the file cannot be created
@Deprecated public ParquetFileWriter(org.apache.hadoop.conf.Configuration configuration, org.apache.parquet.schema.MessageType schema, org.apache.hadoop.fs.Path file, ParquetFileWriter.Mode mode) throws IOException
configuration - Hadoop configuration
schema - the schema of the data
file - the file to write to
mode - file creation mode
IOException - if the file cannot be created
@Deprecated public ParquetFileWriter(org.apache.hadoop.conf.Configuration configuration, org.apache.parquet.schema.MessageType schema, org.apache.hadoop.fs.Path file, ParquetFileWriter.Mode mode, long rowGroupSize, int maxPaddingSize) throws IOException
configuration - Hadoop configuration
schema - the schema of the data
file - the file to write to
mode - file creation mode
rowGroupSize - the row group size
maxPaddingSize - the maximum padding
IOException - if the file cannot be created
@Deprecated public ParquetFileWriter(org.apache.parquet.io.OutputFile file, org.apache.parquet.schema.MessageType schema, ParquetFileWriter.Mode mode, long rowGroupSize, int maxPaddingSize) throws IOException
file - OutputFile to create or overwrite
schema - the schema of the data
mode - file creation mode
rowGroupSize - the row group size
maxPaddingSize - the maximum padding
IOException - if the file cannot be created
public ParquetFileWriter(org.apache.parquet.io.OutputFile file,
org.apache.parquet.schema.MessageType schema,
ParquetFileWriter.Mode mode,
long rowGroupSize,
int maxPaddingSize,
int columnIndexTruncateLength,
int statisticsTruncateLength,
boolean pageWriteChecksumEnabled)
throws IOException
file - OutputFile to create or overwrite
schema - the schema of the data
mode - file creation mode
rowGroupSize - the row group size
maxPaddingSize - the maximum padding
columnIndexTruncateLength - the target length when trying to truncate the min/max values in column indexes
statisticsTruncateLength - the target length when trying to truncate the min/max values in row groups
pageWriteChecksumEnabled - whether to write out page level checksums
IOException - if the file cannot be created
public ParquetFileWriter(org.apache.parquet.io.OutputFile file,
org.apache.parquet.schema.MessageType schema,
ParquetFileWriter.Mode mode,
long rowGroupSize,
int maxPaddingSize,
int columnIndexTruncateLength,
int statisticsTruncateLength,
boolean pageWriteChecksumEnabled,
org.apache.parquet.crypto.FileEncryptionProperties encryptionProperties)
throws IOException
IOException
public void start()
throws IOException
IOException - if there is an error while writing
public void startBlock(long recordCount)
throws IOException
recordCount - the record count in this block
IOException - if there is an error while writing
public void startColumn(org.apache.parquet.column.ColumnDescriptor descriptor,
long valueCount,
org.apache.parquet.hadoop.metadata.CompressionCodecName compressionCodecName)
throws IOException
descriptor - the column descriptor
valueCount - the value count in this column
compressionCodecName - a compression codec name
IOException - if there is an error while writing
public void writeDictionaryPage(org.apache.parquet.column.page.DictionaryPage dictionaryPage)
throws IOException
dictionaryPage - the dictionary page
IOException - if there is an error while writing
public void writeDictionaryPage(org.apache.parquet.column.page.DictionaryPage dictionaryPage,
org.apache.parquet.format.BlockCipher.Encryptor headerBlockEncryptor,
byte[] AAD)
throws IOException
IOException
@Deprecated public void writeDataPage(int valueCount, int uncompressedPageSize, org.apache.parquet.bytes.BytesInput bytes, org.apache.parquet.column.Encoding rlEncoding, org.apache.parquet.column.Encoding dlEncoding, org.apache.parquet.column.Encoding valuesEncoding) throws IOException
valueCount - count of values
uncompressedPageSize - the size of the data once uncompressed
bytes - the compressed data for the page without header
rlEncoding - encoding of the repetition level
dlEncoding - encoding of the definition level
valuesEncoding - encoding of values
IOException - if there is an error while writing
@Deprecated public void writeDataPage(int valueCount, int uncompressedPageSize, org.apache.parquet.bytes.BytesInput bytes, org.apache.parquet.column.statistics.Statistics statistics, org.apache.parquet.column.Encoding rlEncoding, org.apache.parquet.column.Encoding dlEncoding, org.apache.parquet.column.Encoding valuesEncoding) throws IOException
Deprecated. this method does not support writing column indexes; use writeDataPage(int, int, BytesInput, Statistics, long, Encoding, Encoding, Encoding) instead
valueCount - count of values
uncompressedPageSize - the size of the data once uncompressed
bytes - the compressed data for the page without header
statistics - statistics for the page
rlEncoding - encoding of the repetition level
dlEncoding - encoding of the definition level
valuesEncoding - encoding of values
IOException - if there is an error while writing
public void writeDataPage(int valueCount,
int uncompressedPageSize,
org.apache.parquet.bytes.BytesInput bytes,
org.apache.parquet.column.statistics.Statistics statistics,
long rowCount,
org.apache.parquet.column.Encoding rlEncoding,
org.apache.parquet.column.Encoding dlEncoding,
org.apache.parquet.column.Encoding valuesEncoding)
throws IOException
valueCount - count of values
uncompressedPageSize - the size of the data once uncompressed
bytes - the compressed data for the page without header
statistics - the statistics of the page
rowCount - the number of rows in the page
rlEncoding - encoding of the repetition level
dlEncoding - encoding of the definition level
valuesEncoding - encoding of values
IOException - if any I/O error occurs during writing the file
public void writeDataPageV2(int rowCount,
int nullCount,
int valueCount,
org.apache.parquet.bytes.BytesInput repetitionLevels,
org.apache.parquet.bytes.BytesInput definitionLevels,
org.apache.parquet.column.Encoding dataEncoding,
org.apache.parquet.bytes.BytesInput compressedData,
int uncompressedDataSize,
org.apache.parquet.column.statistics.Statistics<?> statistics)
throws IOException
rowCount - count of rows
nullCount - count of nulls
valueCount - count of values
repetitionLevels - repetition level bytes
definitionLevels - definition level bytes
dataEncoding - encoding for data
compressedData - compressed data bytes
uncompressedDataSize - the size of uncompressed data
statistics - the statistics of the page
IOException - if any I/O error occurs during writing the file
public void endColumn()
throws IOException
IOException - if there is an error while writing
public void endBlock()
throws IOException
IOException - if there is an error while writing
@Deprecated public void appendFile(org.apache.hadoop.conf.Configuration conf, org.apache.hadoop.fs.Path file) throws IOException
Deprecated. will be removed in 2.0.0; use appendFile(InputFile) instead
conf - a configuration
file - the path of a file whose contents are appended to this file
IOException - if there is an error while reading or writing
public void appendFile(org.apache.parquet.io.InputFile file)
throws IOException
IOException
@Deprecated public void appendRowGroups(org.apache.hadoop.fs.FSDataInputStream file, List<org.apache.parquet.hadoop.metadata.BlockMetaData> rowGroups, boolean dropColumns) throws IOException
Deprecated. will be removed in 2.0.0; use appendRowGroups(SeekableInputStream,List,boolean) instead
file - a file stream to read from
rowGroups - row groups to copy
dropColumns - whether to drop columns from the file that are not in this file's schema
IOException - if there is an error while reading or writing
public void appendRowGroups(org.apache.parquet.io.SeekableInputStream file,
List<org.apache.parquet.hadoop.metadata.BlockMetaData> rowGroups,
boolean dropColumns)
throws IOException
IOException
@Deprecated public void appendRowGroup(org.apache.hadoop.fs.FSDataInputStream from, org.apache.parquet.hadoop.metadata.BlockMetaData rowGroup, boolean dropColumns) throws IOException
Deprecated. will be removed in 2.0.0; use appendRowGroup(SeekableInputStream,BlockMetaData,boolean) instead
from - a file stream to read from
rowGroup - row group to copy
dropColumns - whether to drop columns from the file that are not in this file's schema
IOException - if there is an error while reading or writing
public void appendRowGroup(org.apache.parquet.io.SeekableInputStream from,
org.apache.parquet.hadoop.metadata.BlockMetaData rowGroup,
boolean dropColumns)
throws IOException
IOException
public void appendColumnChunk(org.apache.parquet.column.ColumnDescriptor descriptor,
org.apache.parquet.io.SeekableInputStream from,
org.apache.parquet.hadoop.metadata.ColumnChunkMetaData chunk,
org.apache.parquet.column.values.bloomfilter.BloomFilter bloomFilter,
org.apache.parquet.internal.column.columnindex.ColumnIndex columnIndex,
org.apache.parquet.internal.column.columnindex.OffsetIndex offsetIndex)
throws IOException
descriptor - the descriptor for the target column
from - a file stream to read from
chunk - the column chunk to be copied
bloomFilter - the bloomFilter for this chunk
columnIndex - the column index for this chunk
offsetIndex - the offset index for this chunk
IOException
public void end(Map<String,String> extraMetaData) throws IOException
extraMetaData - the extra meta data to write in the footer
IOException - if there is an error while writing
public org.apache.parquet.hadoop.metadata.ParquetMetadata getFooter()
@Deprecated public static org.apache.parquet.hadoop.metadata.ParquetMetadata mergeMetadataFiles(List<org.apache.hadoop.fs.Path> files, org.apache.hadoop.conf.Configuration conf) throws IOException
files - a list of files to merge metadata from
conf - a configuration
IOException - if there is an error while writing
@Deprecated public static org.apache.parquet.hadoop.metadata.ParquetMetadata mergeMetadataFiles(List<org.apache.hadoop.fs.Path> files, org.apache.hadoop.conf.Configuration conf, org.apache.parquet.hadoop.metadata.KeyValueMetadataMergeStrategy keyValueMetadataMergeStrategy) throws IOException
files - a list of files to merge metadata from
conf - a configuration
keyValueMetadataMergeStrategy - strategy to merge values for the same key, if there are multiple
IOException - if there is an error while writing
@Deprecated public static void writeMergedMetadataFile(List<org.apache.hadoop.fs.Path> files, org.apache.hadoop.fs.Path outputPath, org.apache.hadoop.conf.Configuration conf) throws IOException
files - a list of files to merge metadata from
outputPath - path to write merged metadata to
conf - a configuration
IOException - if there is an error while reading or writing
@Deprecated public static void writeMetadataFile(org.apache.hadoop.conf.Configuration configuration, org.apache.hadoop.fs.Path outputPath, List<org.apache.parquet.hadoop.Footer> footers) throws IOException
configuration - the configuration to use to get the FileSystem
outputPath - the directory to write the _metadata file to
footers - the list of footers to merge
IOException - if there is an error while writing
@Deprecated public static void writeMetadataFile(org.apache.hadoop.conf.Configuration configuration, org.apache.hadoop.fs.Path outputPath, List<org.apache.parquet.hadoop.Footer> footers, org.apache.parquet.hadoop.ParquetOutputFormat.JobSummaryLevel level) throws IOException
ParquetOutputFormat.JobSummaryLevel provided
configuration - the configuration to use to get the FileSystem
outputPath - the directory to write the _metadata file to
footers - the list of footers to merge
level - level of summary to write
IOException - if there is an error while writing
public long getPos()
throws IOException
IOException - if there is an error while getting the current stream's position
public long getNextRowGroupSize()
throws IOException
IOException
Copyright © 2021 The Apache Software Foundation. All rights reserved.