@@ -52,12 +52,41 @@ Result<std::unique_ptr<AvroOutputStream>> CreateOutputStream(const WriterOptions
5252 return std::make_unique<AvroOutputStream>(output, buffer_size);
5353}
5454
55+ Result<::avro::Codec> ParseCodec (const WriterProperties& properties) {
56+ const auto & codec_name = properties.Get (WriterProperties::kAvroCompression );
57+ ::avro::Codec codec;
58+ if (codec_name == " uncompressed" ) {
59+ codec = ::avro::NULL_CODEC;
60+ } else if (codec_name == " gzip" ) {
61+ codec = ::avro::DEFLATE_CODEC;
62+ } else if (codec_name == " snappy" ) {
63+ codec = ::avro::SNAPPY_CODEC;
64+ } else if (codec_name == " zstd" ) {
65+ codec = ::avro::ZSTD_CODEC;
66+ } else {
67+ return InvalidArgument (" Unsupported Avro codec: {}" , codec_name);
68+ }
69+ ICEBERG_PRECHECK (::avro::isCodecAvailable (codec),
70+ " Avro codec {} is not available in the current build" , codec_name);
71+ return codec;
72+ }
73+
74+ Result<std::optional<int32_t >> ParseCodecLevel (const WriterProperties& properties) {
75+ auto level_str = properties.Get (WriterProperties::kAvroCompressionLevel );
76+ if (level_str.empty ()) {
77+ return std::nullopt ;
78+ }
79+ ICEBERG_ASSIGN_OR_RAISE (auto level, StringUtils::ParseInt<int32_t >(level_str));
80+ return level;
81+ }
82+
5583// Abstract base class for Avro write backends.
5684class AvroWriteBackend {
5785 public:
5886 virtual ~AvroWriteBackend () = default ;
5987 virtual Status Init (std::unique_ptr<AvroOutputStream> output_stream,
6088 const ::avro::ValidSchema& avro_schema, int64_t sync_interval,
89+ ::avro::Codec codec, std::optional<int32_t > compression_level,
6190 const std::map<std::string, std::vector<uint8_t >>& metadata) = 0;
6291 virtual Status WriteRow (const Schema& write_schema, const ::arrow::Array& array,
6392 int64_t row_index) = 0;
@@ -70,10 +99,11 @@ class DirectEncoderBackend : public AvroWriteBackend {
7099 public:
71100 Status Init (std::unique_ptr<AvroOutputStream> output_stream,
72101 const ::avro::ValidSchema& avro_schema, int64_t sync_interval,
102+ ::avro::Codec codec, std::optional<int32_t > compression_level,
73103 const std::map<std::string, std::vector<uint8_t >>& metadata) override {
74- writer_ = std::make_unique<::avro::DataFileWriterBase>(std::move (output_stream),
75- avro_schema, sync_interval,
76- ::avro::NULL_CODEC, metadata );
104+ writer_ = std::make_unique<::avro::DataFileWriterBase>(
105+ std::move (output_stream), avro_schema, sync_interval, codec, metadata ,
106+ compression_level );
77107 avro_root_node_ = avro_schema.root ();
78108 return {};
79109 }
@@ -111,10 +141,11 @@ class GenericDatumBackend : public AvroWriteBackend {
111141 public:
112142 Status Init (std::unique_ptr<AvroOutputStream> output_stream,
113143 const ::avro::ValidSchema& avro_schema, int64_t sync_interval,
144+ ::avro::Codec codec, std::optional<int32_t > compression_level,
114145 const std::map<std::string, std::vector<uint8_t >>& metadata) override {
115146 writer_ = std::make_unique<::avro::DataFileWriter<::avro::GenericDatum>>(
116- std::move (output_stream), avro_schema, sync_interval, ::avro::NULL_CODEC ,
117- metadata );
147+ std::move (output_stream), avro_schema, sync_interval, codec, metadata ,
148+ compression_level );
118149 datum_ = std::make_unique<::avro::GenericDatum>(avro_schema);
119150 return {};
120151 }
@@ -158,7 +189,7 @@ class AvroWriter::Impl {
158189 ::avro::NodePtr root;
159190 ICEBERG_RETURN_UNEXPECTED (ToAvroNodeVisitor{}.Visit (*write_schema_, &root));
160191 if (const auto & schema_name =
161- options.properties -> Get (WriterProperties::kAvroSchemaName );
192+ options.properties . Get (WriterProperties::kAvroSchemaName );
162193 !schema_name.empty ()) {
163194 root->setName (::avro::Name (schema_name));
164195 }
@@ -169,7 +200,7 @@ class AvroWriter::Impl {
169200 ICEBERG_ASSIGN_OR_RAISE (
170201 auto output_stream,
171202 CreateOutputStream (options,
172- options.properties -> Get (WriterProperties::kAvroBufferSize )));
203+ options.properties . Get (WriterProperties::kAvroBufferSize )));
173204 arrow_output_stream_ = output_stream->arrow_output_stream ();
174205
175206 std::map<std::string, std::vector<uint8_t >> metadata;
@@ -181,15 +212,19 @@ class AvroWriter::Impl {
181212 }
182213
183214 // Create the appropriate backend based on configuration
184- if (options.properties -> Get (WriterProperties::kAvroSkipDatum )) {
215+ if (options.properties . Get (WriterProperties::kAvroSkipDatum )) {
185216 backend_ = std::make_unique<DirectEncoderBackend>();
186217 } else {
187218 backend_ = std::make_unique<GenericDatumBackend>();
188219 }
189220
190- ICEBERG_RETURN_UNEXPECTED (backend_->Init (
191- std::move (output_stream), *avro_schema_,
192- options.properties ->Get (WriterProperties::kAvroSyncInterval ), metadata));
221+ ICEBERG_ASSIGN_OR_RAISE (auto codec, ParseCodec (options.properties ));
222+ ICEBERG_ASSIGN_OR_RAISE (auto compression_level, ParseCodecLevel (options.properties ));
223+
224+ ICEBERG_RETURN_UNEXPECTED (
225+ backend_->Init (std::move (output_stream), *avro_schema_,
226+ options.properties .Get (WriterProperties::kAvroSyncInterval ), codec,
227+ compression_level, metadata));
193228
194229 ICEBERG_RETURN_UNEXPECTED (ToArrowSchema (*write_schema_, &arrow_schema_));
195230 return {};
0 commit comments