Io Writers#
- group io_writers
Functions
-
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())#
Writes a set of columns to CSV format.
The following code snippet demonstrates how to write columns to a file:
auto destination = cudf::io::sink_info("dataset.csv"); auto options = cudf::io::csv_writer_options(destination, table->view()) .na_rep(na) .include_header(include_header) .rows_per_chunk(rows_per_chunk); cudf::io::write_csv(options);
- Parameters:
options – Settings for controlling writing behavior
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource to use for device memory allocation
-
void write_json(json_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())#
Writes a set of columns to JSON format.
The following code snippet demonstrates how to write columns to a file:
auto destination = cudf::io::sink_info("dataset.json"); auto options = cudf::io::json_writer_options(destination, table->view()) .na_rep(na) .lines(lines) .rows_per_chunk(rows_per_chunk); cudf::io::write_json(options);
- Parameters:
options – Settings for controlling writing behavior
stream – CUDA stream used for device memory operations and kernel launches
mr – Device memory resource to use for device memory allocation
-
void write_orc(orc_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream())#
Writes a set of columns to ORC format.
The following code snippet demonstrates how to write columns to a file:
auto destination = cudf::io::sink_info("dataset.orc"); auto options = cudf::io::orc_writer_options::builder(destination, table->view()); cudf::io::write_orc(options);
- Parameters:
options – Settings for controlling writing behavior
stream – CUDA stream used for device memory operations and kernel launches
-
std::unique_ptr<std::vector<uint8_t>> write_parquet(parquet_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream())#
Writes a set of columns to parquet format.
The following code snippet demonstrates how to write columns to a file:
auto destination = cudf::io::sink_info("dataset.parquet"); auto options = cudf::io::parquet_writer_options::builder(destination, table->view()); cudf::io::write_parquet(options);
- Parameters:
options – Settings for controlling writing behavior
stream – CUDA stream used for device memory operations and kernel launches
- Returns:
A blob that contains the file metadata (parquet FileMetadata thrift message) if requested in parquet_writer_options (empty blob otherwise).
-
std::unique_ptr<std::vector<uint8_t>> merge_row_group_metadata(std::vector<std::unique_ptr<std::vector<uint8_t>>> const &metadata_list)#
Merges multiple raw metadata blobs that were previously created by write_parquet into a single metadata blob.
- Parameters:
metadata_list – [in] List of input file metadata
- Returns:
A parquet-compatible blob that contains the data for all row groups in the list
Variables
-
static constexpr statistics_freq ORC_STATISTICS_STRIPE = statistics_freq::STATISTICS_ROWGROUP#
Constants to disambiguate statistics terminology for ORC.
ORC refers to its finest granularity of row-grouping as “row group”, which corresponds to Parquet “pages”. Similarly, ORC’s “stripe” corresponds to a Parquet “row group”. The following constants disambiguate the terminology for the statistics collected at each level.
-
static constexpr statistics_freq ORC_STATISTICS_ROW_GROUP = statistics_freq::STATISTICS_PAGE#
-
class csv_writer_options#
- #include <csv.hpp>
Settings to use for write_csv().
Public Functions
-
explicit csv_writer_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline sink_info const &get_sink() const#
Returns sink used for writer output.
- Returns:
sink used for writer output
-
inline table_view const &get_table() const#
Returns table that would be written to output.
- Returns:
Table that would be written to output
-
inline std::vector<std::string> const &get_names() const#
Returns names of the columns.
- Returns:
Names of the columns in the output file
-
inline std::string get_na_rep() const#
Returns string to be used for null entries.
- Returns:
String to be used for null entries
-
inline bool is_enabled_include_header() const#
Whether to write headers to csv.
- Returns:
true
if writing headers to csv
-
inline size_type get_rows_per_chunk() const#
Returns maximum number of rows to process for each file write.
- Returns:
Maximum number of rows to process for each file write
-
inline std::string get_line_terminator() const#
Returns character used for separating lines.
- Returns:
Character used for separating lines
-
inline char get_inter_column_delimiter() const#
Returns character used for separating column values.
- Returns:
Character used for separating column values.
-
inline std::string get_true_value() const#
Returns string used for values != 0 in INT8 types.
- Returns:
string used for values != 0 in INT8 types
-
inline std::string get_false_value() const#
Returns string used for values == 0 in INT8 types.
- Returns:
string used for values == 0 in INT8 types
-
inline quote_style get_quoting() const#
Returns the quote style for the writer.
Note: Only MINIMAL and NONE are supported.
MINIMAL: String columns containing special characters like row-delimiters/field-delimiter/quotes will be quoted.
NONE: No quoting is done for any columns.
- Returns:
quote_style The quote style for the writer
-
inline void set_names(std::vector<std::string> names)#
Sets optional associated column names.
- Parameters:
names – Associated column names
-
inline void set_na_rep(std::string val)#
Sets string to be used for null entries.
- Parameters:
val – String to represent null value
-
inline void enable_include_header(bool val)#
Enables/Disables headers being written to csv.
- Parameters:
val – Boolean value to enable/disable
-
inline void set_rows_per_chunk(size_type val)#
Sets maximum number of rows to process for each file write.
- Parameters:
val – Number of rows per chunk
-
inline void set_line_terminator(std::string term)#
Sets character used for separating lines.
- Parameters:
term – Character to represent line termination
-
inline void set_inter_column_delimiter(char delim)#
Sets character used for separating column values.
- Parameters:
delim – Character to delimit column values
-
inline void set_true_value(std::string val)#
Sets string used for values != 0 in INT8 types.
- Parameters:
val – String to represent values != 0 in INT8 types
-
inline void set_false_value(std::string val)#
Sets string used for values == 0 in INT8 types.
- Parameters:
val – String to represent values == 0 in INT8 types
-
inline void set_table(table_view const &table)#
(Re)sets the table being written.
- Parameters:
table – Table to be written
-
inline void set_quoting(quote_style quoting)#
Sets the quote style for the writer.
Note: Only the following quote styles are supported:
MINIMAL: String columns containing special characters like row-delimiters/field-delimiter/quotes will be quoted.
NONE: No quoting is done for any columns.
- Parameters:
quoting – The new quote_style for the writer.
Public Static Functions
-
static csv_writer_options_builder builder(sink_info const &sink, table_view const &table)#
Create builder to create csv_writer_options.
- Parameters:
sink – The sink used for writer output
table – Table to be written to output
- Returns:
Builder to build csv_writer_options
-
explicit csv_writer_options() = default#
-
class csv_writer_options_builder#
- #include <csv.hpp>
Builder to build options for
write_csv()
Public Functions
-
explicit csv_writer_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline explicit csv_writer_options_builder(sink_info const &sink, table_view const &table)#
Constructor from sink and table.
- Parameters:
sink – The sink used for writer output
table – Table to be written to output
-
inline csv_writer_options_builder &names(std::vector<std::string> names)#
Sets optional column names.
- Parameters:
names – Column names
- Returns:
this for chaining
-
inline csv_writer_options_builder &na_rep(std::string val)#
Sets string to be used for null entries.
- Parameters:
val – String to represent null value
- Returns:
this for chaining
-
inline csv_writer_options_builder &include_header(bool val)#
Enables/Disables headers being written to csv.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline csv_writer_options_builder &rows_per_chunk(int val)#
Sets maximum number of rows to process for each file write.
- Parameters:
val – Number of rows per chunk
- Returns:
this for chaining
-
inline csv_writer_options_builder &line_terminator(std::string term)#
Sets character used for separating lines.
- Parameters:
term – Character to represent line termination
- Returns:
this for chaining
-
inline csv_writer_options_builder &inter_column_delimiter(char delim)#
Sets character used for separating column values.
- Parameters:
delim – Character to delimit column values
- Returns:
this for chaining
-
inline csv_writer_options_builder &true_value(std::string val)#
Sets string used for values != 0 in INT8 types.
- Parameters:
val – String to represent values != 0 in INT8 types
- Returns:
this for chaining
-
inline csv_writer_options_builder &false_value(std::string val)#
Sets string used for values == 0 in INT8 types.
- Parameters:
val – String to represent values == 0 in INT8 types
- Returns:
this for chaining
-
inline csv_writer_options_builder &quoting(quote_style quoting)#
Sets the quote style for the writer.
Only MINIMAL and NONE are supported.
- Parameters:
quoting – The new quote style for the writer.
- Returns:
this for chaining
-
inline operator csv_writer_options&&()#
Move csv_writer_options member once it’s built.
-
inline csv_writer_options &&build()#
Move csv_writer_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
csv_writer_options
object’s r-value reference
-
explicit csv_writer_options_builder() = default#
-
class json_writer_options#
- #include <json.hpp>
Settings to use for write_json().
Public Functions
-
explicit json_writer_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline sink_info const &get_sink() const#
Returns sink used for writer output.
- Returns:
sink used for writer output
-
inline table_view const &get_table() const#
Returns table that would be written to output.
- Returns:
Table that would be written to output
-
inline std::optional<table_metadata> const &get_metadata() const#
Returns metadata information.
- Returns:
Metadata information
-
inline std::string const &get_na_rep() const#
Returns string to be used for null entries.
- Returns:
String to be used for null entries
-
inline bool is_enabled_include_nulls() const#
Whether to output nulls as ‘null’.
- Returns:
true
if nulls are output as ‘null’
-
inline bool is_enabled_lines() const#
Whether to use JSON lines for records format.
- Returns:
true
if JSON lines is used for records format
-
inline size_type get_rows_per_chunk() const#
Returns maximum number of rows to process for each file write.
- Returns:
Maximum number of rows to process for each file write
-
inline std::string const &get_true_value() const#
Returns string used for values != 0 in INT8 types.
- Returns:
string used for values != 0 in INT8 types
-
inline std::string const &get_false_value() const#
Returns string used for values == 0 in INT8 types.
- Returns:
string used for values == 0 in INT8 types
-
inline void set_table(table_view tbl)#
Sets table to be written to output.
- Parameters:
tbl – Table for the output
-
inline void set_metadata(table_metadata metadata)#
Sets metadata.
- Parameters:
metadata – Associated metadata
-
inline void set_na_rep(std::string val)#
Sets string to be used for null entries.
- Parameters:
val – String to represent null value
-
inline void enable_include_nulls(bool val)#
Enables/Disables output of nulls as ‘null’.
- Parameters:
val – Boolean value to enable/disable
-
inline void enable_lines(bool val)#
Enables/Disables JSON lines for records format.
- Parameters:
val – Boolean value to enable/disable JSON lines
-
inline void set_rows_per_chunk(size_type val)#
Sets maximum number of rows to process for each file write.
- Parameters:
val – Number of rows per chunk
-
inline void set_true_value(std::string val)#
Sets string used for values != 0 in INT8 types.
- Parameters:
val – String to represent values != 0 in INT8 types
-
inline void set_false_value(std::string val)#
Sets string used for values == 0 in INT8 types.
- Parameters:
val – String to represent values == 0 in INT8 types
Public Static Functions
-
static json_writer_options_builder builder(sink_info const &sink, table_view const &table)#
Create builder to create json_writer_options.
- Parameters:
sink – The sink used for writer output
table – Table to be written to output
- Returns:
Builder to build json_writer_options
-
explicit json_writer_options() = default#
-
class json_writer_options_builder#
- #include <json.hpp>
Builder to build options for
write_json()
Public Functions
-
explicit json_writer_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline explicit json_writer_options_builder(sink_info const &sink, table_view const &table)#
Constructor from sink and table.
- Parameters:
sink – The sink used for writer output
table – Table to be written to output
-
inline json_writer_options_builder &table(table_view tbl)#
Sets table to be written to output.
- Parameters:
tbl – Table for the output
- Returns:
this for chaining
-
inline json_writer_options_builder &metadata(table_metadata metadata)#
Sets optional metadata (with column names).
- Parameters:
metadata – metadata (with column names)
- Returns:
this for chaining
-
inline json_writer_options_builder &na_rep(std::string val)#
Sets string to be used for null entries.
- Parameters:
val – String to represent null value
- Returns:
this for chaining
-
inline json_writer_options_builder &include_nulls(bool val)#
Enables/Disables output of nulls as ‘null’.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline json_writer_options_builder &lines(bool val)#
Enables/Disables JSON lines for records format.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline json_writer_options_builder &rows_per_chunk(int val)#
Sets maximum number of rows to process for each file write.
- Parameters:
val – Number of rows per chunk
- Returns:
this for chaining
-
inline json_writer_options_builder &true_value(std::string val)#
Sets string used for values != 0 in INT8 types.
- Parameters:
val – String to represent values != 0 in INT8 types
- Returns:
this for chaining
-
inline json_writer_options_builder &false_value(std::string val)#
Sets string used for values == 0 in INT8 types.
- Parameters:
val – String to represent values == 0 in INT8 types
- Returns:
this for chaining
-
inline operator json_writer_options&&()#
Move json_writer_options member once it’s built.
-
inline json_writer_options &&build()#
Move json_writer_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
json_writer_options
object’s r-value reference
-
explicit json_writer_options_builder() = default#
-
class orc_writer_options#
- #include <orc.hpp>
Settings to use for write_orc().
Public Functions
-
explicit orc_writer_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline compression_type get_compression() const#
Returns compression type.
- Returns:
Compression type
-
inline bool is_enabled_statistics() const#
Whether writing column statistics is enabled/disabled.
- Returns:
true
if writing column statistics is enabled
-
inline statistics_freq get_statistics_freq() const#
Returns frequency of statistics collection.
- Returns:
Frequency of statistics collection
-
inline auto get_stripe_size_bytes() const#
Returns maximum stripe size, in bytes.
- Returns:
Maximum stripe size, in bytes
-
inline auto get_stripe_size_rows() const#
Returns maximum stripe size, in rows.
- Returns:
Maximum stripe size, in rows
-
inline auto get_row_index_stride() const#
Returns the row index stride.
- Returns:
Row index stride
-
inline table_view get_table() const#
Returns table to be written to output.
- Returns:
Table to be written to output
-
inline auto const &get_metadata() const#
Returns associated metadata.
- Returns:
Associated metadata
-
inline std::map<std::string, std::string> const &get_key_value_metadata() const#
Returns Key-Value footer metadata information.
- Returns:
Key-Value footer metadata information
-
inline std::shared_ptr<writer_compression_statistics> get_compression_statistics() const#
Returns a shared pointer to the user-provided compression statistics.
- Returns:
Compression statistics
-
inline bool get_enable_dictionary_sort() const#
Returns whether string dictionaries should be sorted.
- Returns:
true
if string dictionaries should be sorted
-
inline void set_compression(compression_type comp)#
Sets compression type.
- Parameters:
comp – Compression type
-
inline void enable_statistics(statistics_freq val)#
Choose granularity of statistics collection.
The granularity can be set to:
cudf::io::STATISTICS_NONE: No statistics are collected.
cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
- Parameters:
val – Frequency of statistics collection
-
inline void set_stripe_size_bytes(size_t size_bytes)#
Sets the maximum stripe size, in bytes.
- Parameters:
size_bytes – Maximum stripe size, in bytes to be set
- Throws:
cudf::logic_error – if a value below the minimal size is passed
-
inline void set_stripe_size_rows(size_type size_rows)#
Sets the maximum stripe size, in rows.
If the stripe size is smaller than the row group size, the row group size will be reduced to match the stripe size.
- Parameters:
size_rows – Maximum stripe size, in rows to be set
- Throws:
cudf::logic_error – if a value below the minimal number of rows is passed
-
inline void set_row_index_stride(size_type stride)#
Sets the row index stride.
Rounded down to a multiple of 8.
- Parameters:
stride – Row index stride to be set
- Throws:
cudf::logic_error – if a value below the minimal row index stride is passed
-
inline void set_table(table_view tbl)#
Sets table to be written to output.
- Parameters:
tbl – Table for the output
-
inline void set_metadata(table_input_metadata meta)#
Sets associated metadata.
- Parameters:
meta – Associated metadata
-
inline void set_key_value_metadata(std::map<std::string, std::string> metadata)#
Sets metadata.
- Parameters:
metadata – Key-Value footer metadata
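-
inline void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)#
Sets the pointer to the output compression statistics.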
- Parameters:
comp_stats – Pointer to compression statistics to be updated after writing
-
inline void set_enable_dictionary_sort(bool val)#
Sets whether string dictionaries should be sorted.
- Parameters:
val – Boolean value to enable/disable
Public Static Functions
-
static orc_writer_options_builder builder(sink_info const &sink, table_view const &table)#
Create builder to create orc_writer_options.
- Parameters:
sink – The sink used for writer output
table – Table to be written to output
- Returns:
Builder to build
orc_writer_options
-
explicit orc_writer_options() = default#
-
class orc_writer_options_builder#
- #include <orc.hpp>
Builds settings to use for write_orc().
Public Functions
-
orc_writer_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline orc_writer_options_builder(sink_info const &sink, table_view const &table)#
Constructor from sink and table.
- Parameters:
sink – The sink used for writer output
table – Table to be written to output
-
inline orc_writer_options_builder &compression(compression_type comp)#
Sets compression type.
- Parameters:
comp – The compression type to use
- Returns:
this for chaining
-
inline orc_writer_options_builder &enable_statistics(statistics_freq val)#
Choose granularity of column statistics to be written.
The granularity can be set to:
cudf::io::STATISTICS_NONE: No statistics are collected.
cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
- Parameters:
val – Level of statistics collection
- Returns:
this for chaining
-
inline orc_writer_options_builder &stripe_size_bytes(size_t val)#
Sets the maximum stripe size, in bytes.
- Parameters:
val – maximum stripe size
- Returns:
this for chaining
-
inline orc_writer_options_builder &stripe_size_rows(size_type val)#
Sets the maximum number of rows in output stripes.
- Parameters:
val – maximum number of rows
- Returns:
this for chaining
-
inline orc_writer_options_builder &row_index_stride(size_type val)#
Sets the row index stride.
- Parameters:
val – new row index stride
- Returns:
this for chaining
-
inline orc_writer_options_builder &table(table_view tbl)#
Sets table to be written to output.
- Parameters:
tbl – Table for the output
- Returns:
this for chaining
-
inline orc_writer_options_builder &metadata(table_input_metadata meta)#
Sets associated metadata.
- Parameters:
meta – Associated metadata
- Returns:
this for chaining
-
inline orc_writer_options_builder &key_value_metadata(std::map<std::string, std::string> metadata)#
Sets Key-Value footer metadata.
- Parameters:
metadata – Key-Value footer metadata
- Returns:
this for chaining
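-
inline orc_writer_options_builder &compression_statistics(std::shared_ptr<writer_compression_statistics> const &comp_stats)#
Sets the pointer to the output compression statistics.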
- Parameters:
comp_stats – Pointer to compression statistics to be filled once writer is done
- Returns:
this for chaining
-
inline orc_writer_options_builder &enable_dictionary_sort(bool val)#
Sets whether string dictionaries should be sorted.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline operator orc_writer_options&&()#
move orc_writer_options member once it’s built.
-
inline orc_writer_options &&build()#
move orc_writer_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
orc_writer_options
object’s r-value reference
-
orc_writer_options_builder() = default#
-
class chunked_orc_writer_options#
- #include <orc.hpp>
Settings to use for write_orc_chunked().
Public Functions
-
explicit chunked_orc_writer_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline compression_type get_compression() const#
Returns compression type.
- Returns:
Compression type
-
inline statistics_freq get_statistics_freq() const#
Returns granularity of statistics collection.
- Returns:
Granularity of statistics collection
-
inline auto get_stripe_size_bytes() const#
Returns maximum stripe size, in bytes.
- Returns:
Maximum stripe size, in bytes
-
inline auto get_stripe_size_rows() const#
Returns maximum stripe size, in rows.
- Returns:
Maximum stripe size, in rows
-
inline auto get_row_index_stride() const#
Returns the row index stride.
- Returns:
Row index stride
-
inline auto const &get_metadata() const#
Returns associated metadata.
- Returns:
Associated metadata
-
inline std::map<std::string, std::string> const &get_key_value_metadata() const#
Returns Key-Value footer metadata information.
- Returns:
Key-Value footer metadata information
-
inline std::shared_ptr<writer_compression_statistics> get_compression_statistics() const#
Returns a shared pointer to the user-provided compression statistics.
- Returns:
Compression statistics
-
inline bool get_enable_dictionary_sort() const#
Returns whether string dictionaries should be sorted.
- Returns:
true
if string dictionaries should be sorted
-
inline void set_compression(compression_type comp)#
Sets compression type.
- Parameters:
comp – The compression type to use
-
inline void enable_statistics(statistics_freq val)#
Choose granularity of statistics collection.
The granularity can be set to:
cudf::io::STATISTICS_NONE: No statistics are collected.
cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
- Parameters:
val – Frequency of statistics collection
-
inline void set_stripe_size_bytes(size_t size_bytes)#
Sets the maximum stripe size, in bytes.
- Parameters:
size_bytes – Maximum stripe size, in bytes to be set
- Throws:
cudf::logic_error – if a value below the minimal stripe size is passed
-
inline void set_stripe_size_rows(size_type size_rows)#
Sets the maximum stripe size, in rows.
If the stripe size is smaller than the row group size, the row group size will be reduced to match the stripe size.
- Parameters:
size_rows – Maximum stripe size, in rows to be set
- Throws:
cudf::logic_error – if a value below the minimal number of rows in a stripe is passed
-
inline void set_row_index_stride(size_type stride)#
Sets the row index stride.
Rounded down to a multiple of 8.
- Parameters:
stride – Row index stride to be set
- Throws:
cudf::logic_error – if a value below the minimal number of rows in a row group is passed
-
inline void metadata(table_input_metadata meta)#
Sets associated metadata.
- Parameters:
meta – Associated metadata
-
inline void set_key_value_metadata(std::map<std::string, std::string> metadata)#
Sets Key-Value footer metadata.
- Parameters:
metadata – Key-Value footer metadata
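-
inline void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)#
Sets the pointer to the output compression statistics.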
- Parameters:
comp_stats – Pointer to compression statistics to be updated after writing
-
inline void set_enable_dictionary_sort(bool val)#
Sets whether string dictionaries should be sorted.
- Parameters:
val – Boolean value to enable/disable
Public Static Functions
-
static chunked_orc_writer_options_builder builder(sink_info const &sink)#
Create builder to create chunked_orc_writer_options.
- Parameters:
sink – The sink used for writer output
- Returns:
Builder to build chunked_orc_writer_options
-
explicit chunked_orc_writer_options() = default#
-
class chunked_orc_writer_options_builder#
- #include <orc.hpp>
Builds settings to use for write_orc_chunked().
Public Functions
-
chunked_orc_writer_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline explicit chunked_orc_writer_options_builder(sink_info const &sink)#
Constructor from sink.
- Parameters:
sink – The sink used for writer output
-
inline chunked_orc_writer_options_builder &compression(compression_type comp)#
Sets compression type.
- Parameters:
comp – The compression type to use
- Returns:
this for chaining
-
inline chunked_orc_writer_options_builder &enable_statistics(statistics_freq val)#
Choose granularity of statistics collection.
The granularity can be set to:
cudf::io::STATISTICS_NONE: No statistics are collected.
cudf::io::ORC_STATISTICS_STRIPE: Statistics are collected for each ORC stripe.
cudf::io::ORC_STATISTICS_ROW_GROUP: Statistics are collected for each ORC row group.
- Parameters:
val – Frequency of statistics collection
- Returns:
this for chaining
-
inline chunked_orc_writer_options_builder &stripe_size_bytes(size_t val)#
Sets the maximum stripe size, in bytes.
- Parameters:
val – maximum stripe size
- Returns:
this for chaining
-
inline chunked_orc_writer_options_builder &stripe_size_rows(size_type val)#
Sets the maximum number of rows in output stripes.
- Parameters:
val – maximum number of rows
- Returns:
this for chaining
-
inline chunked_orc_writer_options_builder &row_index_stride(size_type val)#
Sets the row index stride.
- Parameters:
val – new row index stride
- Returns:
this for chaining
-
inline chunked_orc_writer_options_builder &metadata(table_input_metadata meta)#
Sets associated metadata.
- Parameters:
meta – Associated metadata
- Returns:
this for chaining
-
inline chunked_orc_writer_options_builder &key_value_metadata(std::map<std::string, std::string> metadata)#
Sets Key-Value footer metadata.
- Parameters:
metadata – Key-Value footer metadata
- Returns:
this for chaining
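-
inline chunked_orc_writer_options_builder &compression_statistics(std::shared_ptr<writer_compression_statistics> const &comp_stats)#
Sets the pointer to the output compression statistics.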
- Parameters:
comp_stats – Pointer to compression statistics to be filled once writer is done
- Returns:
this for chaining
-
inline chunked_orc_writer_options_builder &enable_dictionary_sort(bool val)#
Sets whether string dictionaries should be sorted.
- Parameters:
val – Boolean value to enable/disable
- Returns:
this for chaining
-
inline operator chunked_orc_writer_options&&()#
move chunked_orc_writer_options member once it’s built.
-
inline chunked_orc_writer_options &&build()#
move chunked_orc_writer_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
chunked_orc_writer_options
object’s r-value reference
-
chunked_orc_writer_options_builder() = default#
-
class orc_chunked_writer#
- #include <orc.hpp>
Chunked orc writer class writes an ORC file in a chunked/stream form.
The intent of the write_orc_chunked_ path is to allow writing of an arbitrarily large / arbitrary number of rows to an ORC file in multiple passes.
The following code snippet demonstrates how to write a single ORC file containing one logical table by writing a series of individual cudf::tables.
... std::string filepath = "dataset.orc"; cudf::io::chunked_orc_writer_options options = cudf::io::chunked_orc_writer_options::builder(cudf::io::sink_info(filepath)); ... orc_chunked_writer writer(options); writer.write(table0); writer.write(table1); ... writer.close();
Public Functions
-
orc_chunked_writer() = default#
Default constructor, this should never be used. This is added just to satisfy cython.
-
orc_chunked_writer(chunked_orc_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream())#
Constructor with chunked writer options.
- Parameters:
options – [in] options used to write table
stream – [in] CUDA stream used for device memory operations and kernel launches
-
orc_chunked_writer &write(table_view const &table)#
Writes table to output.
- Parameters:
table – [in] Table that needs to be written
- Returns:
returns reference of the class object
-
void close()#
Finishes the chunked/streamed write process.
-
orc_chunked_writer() = default#
-
class parquet_writer_options#
- #include <parquet.hpp>
Settings for write_parquet().
Public Functions
-
parquet_writer_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline compression_type get_compression() const#
Returns compression format used.
- Returns:
Compression format
-
inline statistics_freq get_stats_level() const#
Returns level of statistics requested in output file.
- Returns:
level of statistics requested in output file
-
inline table_view get_table() const#
Returns table_view.
- Returns:
Table view
-
inline std::vector<partition_info> const &get_partitions() const#
Returns partitions.
- Returns:
Partitions
-
inline auto const &get_metadata() const#
Returns associated metadata.
- Returns:
Associated metadata
-
inline std::vector<std::map<std::string, std::string>> const &get_key_value_metadata() const#
Returns Key-Value footer metadata information.
- Returns:
Key-Value footer metadata information
-
inline bool is_enabled_int96_timestamps() const#
Returns true if timestamps will be written as INT96.
- Returns:
true if timestamps will be written as INT96
-
inline auto is_enabled_utc_timestamps() const#
Returns true if timestamps will be written as UTC.
- Returns:
true if timestamps will be written as UTC
-
inline std::vector<std::string> const &get_column_chunks_file_paths() const#
Returns Column chunks file paths to be set in the raw output metadata.
- Returns:
Column chunks file paths to be set in the raw output metadata
-
inline auto get_row_group_size_bytes() const#
Returns maximum row group size, in bytes.
- Returns:
Maximum row group size, in bytes
-
inline auto get_row_group_size_rows() const#
Returns maximum row group size, in rows.
- Returns:
Maximum row group size, in rows
-
inline auto get_max_page_size_bytes() const#
Returns the maximum uncompressed page size, in bytes.
If set larger than the row group size, then this will return the row group size.
- Returns:
Maximum uncompressed page size, in bytes
-
inline auto get_max_page_size_rows() const#
Returns maximum page size, in rows.
If set larger than the row group size, then this will return the row group size.
- Returns:
Maximum page size, in rows
-
inline auto get_column_index_truncate_length() const#
Returns maximum length of min or max values in column index, in bytes.
- Returns:
length min/max will be truncated to
-
inline dictionary_policy get_dictionary_policy() const#
Returns policy for dictionary use.
- Returns:
policy for dictionary use
-
inline auto get_max_dictionary_size() const#
Returns maximum dictionary size, in bytes.
- Returns:
Maximum dictionary size, in bytes.
-
inline auto get_max_page_fragment_size() const#
Returns maximum page fragment size, in rows.
- Returns:
Maximum page fragment size, in rows.
-
inline std::shared_ptr<writer_compression_statistics> get_compression_statistics() const#
Returns a shared pointer to the user-provided compression statistics.
- Returns:
Compression statistics
-
inline auto is_enabled_write_v2_headers() const#
Returns true if V2 page headers should be written.
- Returns:
true if V2 page headers should be written.
-
void set_partitions(std::vector<partition_info> partitions)#
Sets partitions.
- Parameters:
partitions – Partitions of input table in {start_row, num_rows} pairs. If specified, must be same size as number of sinks in sink_info
-
inline void set_metadata(table_input_metadata metadata)#
Sets metadata.
- Parameters:
metadata – Associated metadata
-
void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata)#
Sets Key-Value footer metadata.
- Parameters:
metadata – Key-Value footer metadata
-
inline void set_stats_level(statistics_freq sf)#
Sets the level of statistics.
- Parameters:
sf – Level of statistics requested in the output file
-
inline void set_compression(compression_type compression)#
Sets compression type.
- Parameters:
compression – The compression type to use
-
inline void enable_int96_timestamps(bool req)#
Sets timestamp writing preferences. INT96 timestamps will be written if true and TIMESTAMP_MICROS will be written if false.
- Parameters:
req – Boolean value to enable/disable writing of INT96 timestamps
-
inline void enable_utc_timestamps(bool val)#
Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
- Parameters:
val – Boolean value to enable/disable writing of timestamps as UTC.
-
void set_column_chunks_file_paths(std::vector<std::string> file_paths)#
Sets column chunks file path to be set in the raw output metadata.
- Parameters:
file_paths – Vector of Strings which indicates file path. Must be same size as number of data sinks in sink info
-
void set_row_group_size_bytes(size_t size_bytes)#
Sets the maximum row group size, in bytes.
- Parameters:
size_bytes – Maximum row group size, in bytes to set
-
void set_row_group_size_rows(size_type size_rows)#
Sets the maximum row group size, in rows.
- Parameters:
size_rows – Maximum row group size, in rows to set
-
void set_max_page_size_bytes(size_t size_bytes)#
Sets the maximum uncompressed page size, in bytes.
- Parameters:
size_bytes – Maximum uncompressed page size, in bytes to set
-
void set_max_page_size_rows(size_type size_rows)#
Sets the maximum page size, in rows.
- Parameters:
size_rows – Maximum page size, in rows to set
-
void set_column_index_truncate_length(int32_t size_bytes)#
Sets the maximum length of min or max values in column index, in bytes.
- Parameters:
size_bytes – length min/max will be truncated to
-
void set_dictionary_policy(dictionary_policy policy)#
Sets the policy for dictionary use.
- Parameters:
policy – Policy for dictionary use
-
void set_max_dictionary_size(size_t size_bytes)#
Sets the maximum dictionary size, in bytes.
- Parameters:
size_bytes – Maximum dictionary size, in bytes
-
void set_max_page_fragment_size(size_type size_rows)#
Sets the maximum page fragment size, in rows.
- Parameters:
size_rows – Maximum page fragment size, in rows.
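-
inline void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)#
Sets the pointer to the output compression statistics.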
- Parameters:
comp_stats – Pointer to compression statistics to be updated after writing
-
inline void enable_write_v2_headers(bool val)#
Sets preference for V2 page headers. Write V2 page headers if set to true.
- Parameters:
val – Boolean value to enable/disable writing of V2 page headers.
Public Static Functions
-
static parquet_writer_options_builder builder(sink_info const &sink, table_view const &table)#
Create builder to create parquet_writer_options.
- Parameters:
sink – The sink used for writer output
table – Table to be written to output
- Returns:
Builder to build parquet_writer_options
-
static parquet_writer_options_builder builder()#
Create builder to create parquet_writer_options.
- Returns:
Builder to build parquet_writer_options
-
parquet_writer_options() = default#
-
class parquet_writer_options_builder#
- #include <parquet.hpp>
Class to build parquet_writer_options.
Public Functions
-
explicit parquet_writer_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline explicit parquet_writer_options_builder(sink_info const &sink, table_view const &table)#
Constructor from sink and table.
- Parameters:
sink – The sink used for writer output
table – Table to be written to output
-
parquet_writer_options_builder &partitions(std::vector<partition_info> partitions)#
Sets partitions in parquet_writer_options.
- Parameters:
partitions – Partitions of input table in {start_row, num_rows} pairs. If specified, must be same size as number of sinks in sink_info
- Returns:
this for chaining
-
inline parquet_writer_options_builder &metadata(table_input_metadata metadata)#
Sets metadata in parquet_writer_options.
- Parameters:
metadata – Associated metadata
- Returns:
this for chaining
-
parquet_writer_options_builder &key_value_metadata(std::vector<std::map<std::string, std::string>> metadata)#
Sets Key-Value footer metadata in parquet_writer_options.
- Parameters:
metadata – Key-Value footer metadata
- Returns:
this for chaining
-
inline parquet_writer_options_builder &stats_level(statistics_freq sf)#
Sets the level of statistics in parquet_writer_options.
- Parameters:
sf – Level of statistics requested in the output file
- Returns:
this for chaining
-
inline parquet_writer_options_builder &compression(compression_type compression)#
Sets compression type in parquet_writer_options.
- Parameters:
compression – The compression type to use
- Returns:
this for chaining
-
parquet_writer_options_builder &column_chunks_file_paths(std::vector<std::string> file_paths)#
Sets column chunks file path to be set in the raw output metadata.
- Parameters:
file_paths – Vector of Strings which indicates file path. Must be same size as number of data sinks
- Returns:
this for chaining
-
inline parquet_writer_options_builder &row_group_size_bytes(size_t val)#
Sets the maximum row group size, in bytes.
- Parameters:
val – maximum row group size
- Returns:
this for chaining
-
inline parquet_writer_options_builder &row_group_size_rows(size_type val)#
Sets the maximum number of rows in output row groups.
- Parameters:
val – maximum number of rows
- Returns:
this for chaining
-
inline parquet_writer_options_builder &max_page_size_bytes(size_t val)#
Sets the maximum uncompressed page size, in bytes.
Serves as a hint to the writer, and can be exceeded under certain circumstances. Cannot be larger than the row group size in bytes, and will be adjusted to match if it is.
- Parameters:
val – maximum page size
- Returns:
this for chaining
-
inline parquet_writer_options_builder &max_page_size_rows(size_type val)#
Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. Cannot be larger than the row group size in rows, and will be adjusted to match if it is.
- Parameters:
val – maximum rows per page
- Returns:
this for chaining
-
inline parquet_writer_options_builder &column_index_truncate_length(int32_t val)#
Sets the desired maximum size in bytes for min and max values in the column index.
Values exceeding this limit will be truncated, but modified such that they will still be valid lower and upper bounds. This only applies to variable length types, such as string. Maximum values will not be truncated if there is no suitable truncation that results in a valid upper bound.
Default value is 64.
- Parameters:
val – length min/max will be truncated to, with 0 indicating no truncation
- Returns:
this for chaining
-
parquet_writer_options_builder &dictionary_policy(enum dictionary_policy val)#
Sets the policy for dictionary use.
Certain compression algorithms (e.g., Zstandard) have limits on how large a buffer can be compressed. In some circumstances, the dictionary can grow beyond this limit, which will prevent the column from being compressed. This setting controls how the writer should act in these circumstances. A setting of dictionary_policy::ADAPTIVE will disable dictionary encoding for columns where the dictionary exceeds the limit. A setting of dictionary_policy::NEVER will disable the use of dictionary encoding globally. A setting of dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in the disabling of compression for columns that would otherwise be compressed.
The default value is dictionary_policy::ALWAYS.
- Parameters:
val – policy for dictionary use
- Returns:
this for chaining
-
parquet_writer_options_builder &max_dictionary_size(size_t val)#
Sets the maximum dictionary size, in bytes.
Disables dictionary encoding for any column chunk where the dictionary will exceed this limit. Only used when the dictionary_policy is set to ‘ADAPTIVE’.
Default value is 1048576 (1MiB).
- Parameters:
val – maximum dictionary size
- Returns:
this for chaining
-
parquet_writer_options_builder &max_page_fragment_size(size_type val)#
Sets the maximum page fragment size, in rows.
Files with nested schemas or very long strings may need a page fragment size smaller than the default value of 5000 to ensure a single fragment will not exceed the desired maximum page size in bytes.
- Parameters:
val – maximum page fragment size
- Returns:
this for chaining
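-
inline parquet_writer_options_builder &compression_statistics(std::shared_ptr<writer_compression_statistics> const &comp_stats)#
Sets the pointer to the output compression statistics.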
- Parameters:
comp_stats – Pointer to compression statistics to be filled once writer is done
- Returns:
this for chaining
-
inline parquet_writer_options_builder &int96_timestamps(bool enabled)#
Sets whether int96 timestamps are written or not in parquet_writer_options.
- Parameters:
enabled – Boolean value to enable/disable int96 timestamps
- Returns:
this for chaining
-
inline parquet_writer_options_builder &utc_timestamps(bool enabled)#
Set to true if timestamps are to be written as UTC.
- Parameters:
enabled – Boolean value to enable/disable writing of timestamps as UTC.
- Returns:
this for chaining
-
parquet_writer_options_builder &write_v2_headers(bool enabled)#
Set to true if V2 page headers are to be written.
- Parameters:
enabled – Boolean value to enable/disable writing of V2 page headers.
- Returns:
this for chaining
-
inline operator parquet_writer_options&&()#
move parquet_writer_options member once it’s built.
-
inline parquet_writer_options &&build()#
move parquet_writer_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
parquet_writer_options
object’s r-value reference
-
explicit parquet_writer_options_builder() = default#
-
class chunked_parquet_writer_options#
- #include <parquet.hpp>
Settings for write_parquet_chunked().
Public Functions
-
chunked_parquet_writer_options() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline compression_type get_compression() const#
Returns compression format used.
- Returns:
Compression format
-
inline statistics_freq get_stats_level() const#
Returns level of statistics requested in output file.
- Returns:
Level of statistics requested in output file
-
inline auto const &get_metadata() const#
Returns metadata information.
- Returns:
Metadata information
-
inline std::vector<std::map<std::string, std::string>> const &get_key_value_metadata() const#
Returns Key-Value footer metadata information.
- Returns:
Key-Value footer metadata information
-
inline bool is_enabled_int96_timestamps() const#
Returns true if timestamps will be written as INT96.
- Returns:
true if timestamps will be written as INT96
-
inline auto is_enabled_utc_timestamps() const#
Returns true if timestamps will be written as UTC.
- Returns:
true if timestamps will be written as UTC
-
inline auto get_row_group_size_bytes() const#
Returns maximum row group size, in bytes.
- Returns:
Maximum row group size, in bytes
-
inline auto get_row_group_size_rows() const#
Returns maximum row group size, in rows.
- Returns:
Maximum row group size, in rows
-
inline auto get_max_page_size_bytes() const#
Returns maximum uncompressed page size, in bytes.
If set larger than the row group size, then this will return the row group size.
- Returns:
Maximum uncompressed page size, in bytes
-
inline auto get_max_page_size_rows() const#
Returns maximum page size, in rows.
If set larger than the row group size, then this will return the row group size.
- Returns:
Maximum page size, in rows
-
inline auto get_column_index_truncate_length() const#
Returns maximum length of min or max values in column index, in bytes.
- Returns:
length min/max will be truncated to
-
inline dictionary_policy get_dictionary_policy() const#
Returns policy for dictionary use.
- Returns:
policy for dictionary use
-
inline auto get_max_dictionary_size() const#
Returns maximum dictionary size, in bytes.
- Returns:
Maximum dictionary size, in bytes.
-
inline auto get_max_page_fragment_size() const#
Returns maximum page fragment size, in rows.
- Returns:
Maximum page fragment size, in rows.
-
inline std::shared_ptr<writer_compression_statistics> get_compression_statistics() const#
Returns a shared pointer to the user-provided compression statistics.
- Returns:
Compression statistics
-
inline auto is_enabled_write_v2_headers() const#
Returns true if V2 page headers should be written.
- Returns:
true if V2 page headers should be written.
-
inline void set_metadata(table_input_metadata metadata)#
Sets metadata.
- Parameters:
metadata – Associated metadata
-
void set_key_value_metadata(std::vector<std::map<std::string, std::string>> metadata)#
Sets Key-Value footer metadata.
- Parameters:
metadata – Key-Value footer metadata
-
inline void set_stats_level(statistics_freq sf)#
Sets the level of statistics in chunked_parquet_writer_options.
- Parameters:
sf – Level of statistics requested in the output file
-
inline void set_compression(compression_type compression)#
Sets compression type.
- Parameters:
compression – The compression type to use
-
inline void enable_int96_timestamps(bool req)#
Sets timestamp writing preferences.
INT96 timestamps will be written if true and TIMESTAMP_MICROS will be written if false.
- Parameters:
req – Boolean value to enable/disable writing of INT96 timestamps
-
inline void enable_utc_timestamps(bool val)#
Sets preference for writing timestamps as UTC. Write timestamps as UTC if set to true.
- Parameters:
val – Boolean value to enable/disable writing of timestamps as UTC.
-
void set_row_group_size_bytes(size_t size_bytes)#
Sets the maximum row group size, in bytes.
- Parameters:
size_bytes – Maximum row group size, in bytes to set
-
void set_row_group_size_rows(size_type size_rows)#
Sets the maximum row group size, in rows.
- Parameters:
size_rows – The maximum row group size, in rows to set
-
void set_max_page_size_bytes(size_t size_bytes)#
Sets the maximum uncompressed page size, in bytes.
- Parameters:
size_bytes – Maximum uncompressed page size, in bytes to set
-
void set_max_page_size_rows(size_type size_rows)#
Sets the maximum page size, in rows.
- Parameters:
size_rows – The maximum page size, in rows to set
-
void set_column_index_truncate_length(int32_t size_bytes)#
Sets the maximum length of min or max values in column index, in bytes.
- Parameters:
size_bytes – length min/max will be truncated to
-
void set_dictionary_policy(dictionary_policy policy)#
Sets the policy for dictionary use.
- Parameters:
policy – Policy for dictionary use
-
void set_max_dictionary_size(size_t size_bytes)#
Sets the maximum dictionary size, in bytes.
- Parameters:
size_bytes – Maximum dictionary size, in bytes
-
void set_max_page_fragment_size(size_type size_rows)#
Sets the maximum page fragment size, in rows.
- Parameters:
size_rows – Maximum page fragment size, in rows.
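-
inline void set_compression_statistics(std::shared_ptr<writer_compression_statistics> comp_stats)#
Sets the pointer to the output compression statistics.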
- Parameters:
comp_stats – Pointer to compression statistics to be updated after writing
-
inline void enable_write_v2_headers(bool val)#
Sets preference for V2 page headers. Write V2 page headers if set to true.
- Parameters:
val – Boolean value to enable/disable writing of V2 page headers.
Public Static Functions
-
static chunked_parquet_writer_options_builder builder(sink_info const &sink)#
creates builder to build chunked_parquet_writer_options.
- Parameters:
sink – sink to use for writer output
- Returns:
Builder to build
chunked_parquet_writer_options
-
chunked_parquet_writer_options() = default#
-
class chunked_parquet_writer_options_builder#
- #include <parquet.hpp>
Builds options for chunked_parquet_writer_options.
Public Functions
-
chunked_parquet_writer_options_builder() = default#
Default constructor.
This has been added since Cython requires a default constructor to create objects on stack.
-
inline chunked_parquet_writer_options_builder(sink_info const &sink)#
Constructor from sink.
- Parameters:
sink – The sink used for writer output
-
inline chunked_parquet_writer_options_builder &metadata(table_input_metadata metadata)#
Sets metadata to chunked_parquet_writer_options.
- Parameters:
metadata – Associated metadata
- Returns:
this for chaining
-
chunked_parquet_writer_options_builder &key_value_metadata(std::vector<std::map<std::string, std::string>> metadata)#
Sets Key-Value footer metadata in chunked_parquet_writer_options.
- Parameters:
metadata – Key-Value footer metadata
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &stats_level(statistics_freq sf)#
Sets the level of statistics in chunked_parquet_writer_options.
- Parameters:
sf – Level of statistics requested in the output file
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &compression(compression_type compression)#
Sets compression type to chunked_parquet_writer_options.
- Parameters:
compression – The compression type to use
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &int96_timestamps(bool enabled)#
Set to true if timestamps should be written as int96 types instead of int64 types. Even though int96 is deprecated and is not an internal type for cudf, it needs to be written for backwards compatibility reasons.
- Parameters:
enabled – Boolean value to enable/disable int96 timestamps
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &utc_timestamps(bool enabled)#
Set to true if timestamps are to be written as UTC.
- Parameters:
enabled – Boolean value to enable/disable writing of timestamps as UTC.
- Returns:
this for chaining
-
chunked_parquet_writer_options_builder &write_v2_headers(bool enabled)#
Set to true if V2 page headers are to be written.
- Parameters:
enabled – Boolean value to enable/disable writing of V2 page headers.
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &row_group_size_bytes(size_t val)#
Sets the maximum row group size, in bytes.
- Parameters:
val – maximum row group size
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &row_group_size_rows(size_type val)#
Sets the maximum number of rows in output row groups.
- Parameters:
val – maximum number of rows
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &max_page_size_bytes(size_t val)#
Sets the maximum uncompressed page size, in bytes.
Serves as a hint to the writer, and can be exceeded under certain circumstances. Cannot be larger than the row group size in bytes, and will be adjusted to match if it is.
- Parameters:
val – maximum page size
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &max_page_size_rows(size_type val)#
Sets the maximum page size, in rows. Counts only top-level rows, ignoring any nesting. Cannot be larger than the row group size in rows, and will be adjusted to match if it is.
- Parameters:
val – maximum rows per page
- Returns:
this for chaining
-
inline chunked_parquet_writer_options_builder &column_index_truncate_length(int32_t val)#
Sets the desired maximum size in bytes for min and max values in the column index.
Values exceeding this limit will be truncated, but modified such that they will still be valid lower and upper bounds. This only applies to variable length types, such as string. Maximum values will not be truncated if there is no suitable truncation that results in a valid upper bound.
Default value is 64.
- Parameters:
val – length min/max will be truncated to, with 0 indicating no truncation
- Returns:
this for chaining
-
chunked_parquet_writer_options_builder &dictionary_policy(enum dictionary_policy val)#
Sets the policy for dictionary use.
Certain compression algorithms (e.g., Zstandard) have limits on how large a buffer can be compressed. In some circumstances, the dictionary can grow beyond this limit, which will prevent the column from being compressed. This setting controls how the writer should act in these circumstances. A setting of dictionary_policy::ADAPTIVE will disable dictionary encoding for columns where the dictionary exceeds the limit. A setting of dictionary_policy::NEVER will disable the use of dictionary encoding globally. A setting of dictionary_policy::ALWAYS will allow the use of dictionary encoding even if it will result in the disabling of compression for columns that would otherwise be compressed.
The default value is dictionary_policy::ALWAYS.
- Parameters:
val – policy for dictionary use
- Returns:
this for chaining
-
chunked_parquet_writer_options_builder &max_dictionary_size(size_t val)#
Sets the maximum dictionary size, in bytes.
Disables dictionary encoding for any column chunk where the dictionary will exceed this limit. Only used when the dictionary_policy is set to ‘ADAPTIVE’.
Default value is 1048576 (1MiB).
- Parameters:
val – maximum dictionary size
- Returns:
this for chaining
-
chunked_parquet_writer_options_builder &max_page_fragment_size(size_type val)#
Sets the maximum page fragment size, in rows.
Files with nested schemas or very long strings may need a page fragment size smaller than the default value of 5000 to ensure a single fragment will not exceed the desired maximum page size in bytes.
- Parameters:
val – maximum page fragment size
- Returns:
this for chaining
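-
inline chunked_parquet_writer_options_builder &compression_statistics(std::shared_ptr<writer_compression_statistics> const &comp_stats)#
Sets the pointer to the output compression statistics.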
- Parameters:
comp_stats – Pointer to compression statistics to be filled once writer is done
- Returns:
this for chaining
-
inline operator chunked_parquet_writer_options&&()#
move chunked_parquet_writer_options member once it’s built.
-
inline chunked_parquet_writer_options &&build()#
move chunked_parquet_writer_options member once it’s built.
This has been added since Cython does not support overloading of conversion operators.
- Returns:
Built
chunked_parquet_writer_options
object’s r-value reference
-
chunked_parquet_writer_options_builder() = default#
-
class parquet_chunked_writer#
- #include <parquet.hpp>
chunked parquet writer class to handle options and write tables in chunks.
The intent of the parquet_chunked_writer is to allow writing of an arbitrarily large / arbitrary number of rows to a parquet file in multiple passes.
The following code snippet demonstrates how to write a single parquet file containing one logical table by writing a series of individual cudf::tables.
auto destination = cudf::io::sink_info("dataset.parquet"); auto options = cudf::io::chunked_parquet_writer_options::builder(destination); auto writer = cudf::io::parquet_chunked_writer(options); writer.write(table0); writer.write(table1); writer.close();
Public Functions
-
parquet_chunked_writer() = default#
Default constructor, this should never be used. This is added just to satisfy cython.
-
parquet_chunked_writer(chunked_parquet_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream())#
Constructor with chunked writer options.
- Parameters:
options – [in] options used to write table
stream – [in] CUDA stream used for device memory operations and kernel launches
-
parquet_chunked_writer &write(table_view const &table, std::vector<partition_info> const &partitions = {})#
Writes table to output.
- Parameters:
table – [in] Table that needs to be written
partitions – [in] Optional partitions to divide the table into. If specified, must be same size as number of sinks.
- Throws:
cudf::logic_error – If the number of partitions is not the same as number of sinks
rmm::bad_alloc – if there is insufficient space for temporary buffers
- Returns:
returns reference of the class object
-
std::unique_ptr<std::vector<uint8_t>> close(std::vector<std::string> const &column_chunks_file_paths = {})#
Finishes the chunked/streamed write process.
- Parameters:
column_chunks_file_paths – [in] Column chunks file path to be set in the raw output metadata
- Returns:
A parquet-compatible blob that contains the data for all rowgroups in the list only if
column_chunks_file_paths
is provided, else null.
-
parquet_chunked_writer() = default#
-
void write_csv(csv_writer_options const &options, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource())#