Skip to content

Commit ef34493

Browse files
committed
int_to_binary
1 parent 6a2209d commit ef34493

4 files changed

Lines changed: 49 additions & 14 deletions

File tree

docs/source/user-guide/latest/compatibility.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -73,16 +73,17 @@ should not be used in production. The feature will be enabled in a future releas
7373

7474
Cast operations in Comet fall into three levels of support:
7575

76-
- **C (Compatible)**: The results match Apache Spark
77-
- **I (Incompatible)**: The results may match Apache Spark for some inputs, but there are known issues where some inputs
76+
- **Compatible**: The results match Apache Spark
77+
- **Incompatible**: The results may match Apache Spark for some inputs, but there are known issues where some inputs
7878
will result in incorrect results or exceptions. The query stage will fall back to Spark by default. Setting
7979
`spark.comet.expression.Cast.allowIncompatible=true` will allow all incompatible casts to run natively in Comet, but this is not
8080
recommended for production use.
81-
- **U (Unsupported)**: Comet does not provide a native version of this cast expression and the query stage will fall back to
81+
- **Unsupported**: Comet does not provide a native version of this cast expression and the query stage will fall back to
8282
Spark.
83-
- **N/A**: Spark does not support this cast.
8483

85-
### Legacy Mode
84+
### Compatible Casts
85+
86+
The following cast operations are generally compatible with Spark except for the differences noted here.
8687

8788
<!--BEGIN:CAST_LEGACY_TABLE-->
8889
<!--END:CAST_LEGACY_TABLE-->

native/spark-expr/src/conversion_funcs/cast.rs

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,13 @@ use crate::{timezone, BinaryOutputStyle};
2020
use crate::{EvalMode, SparkError, SparkResult};
2121
use arrow::array::builder::StringBuilder;
2222
use arrow::array::{
23-
BooleanBuilder, Decimal128Builder, DictionaryArray, GenericByteArray, ListArray,
23+
BinaryBuilder, BooleanBuilder, Decimal128Builder, DictionaryArray, GenericByteArray, ListArray,
2424
PrimitiveBuilder, StringArray, StructArray,
2525
};
2626
use arrow::compute::can_cast_types;
2727
use arrow::datatypes::{
28-
i256, ArrowDictionaryKeyType, ArrowNativeType, DataType, Decimal256Type, GenericBinaryType,
29-
Schema,
28+
i256, ArrowDictionaryKeyType, ArrowNativeType, DataType, Decimal256Type,
29+
GenericBinaryType, Schema,
3030
};
3131
use arrow::{
3232
array::{
@@ -311,22 +311,22 @@ fn can_cast_from_byte(to_type: &DataType, _: &SparkCastOptions) -> bool {
311311
use DataType::*;
312312
matches!(
313313
to_type,
314-
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _)
314+
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) | Binary
315315
)
316316
}
317317

318318
fn can_cast_from_short(to_type: &DataType, _: &SparkCastOptions) -> bool {
319319
use DataType::*;
320320
matches!(
321321
to_type,
322-
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _)
322+
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Decimal128(_, _) | Binary
323323
)
324324
}
325325

326326
fn can_cast_from_int(to_type: &DataType, options: &SparkCastOptions) -> bool {
327327
use DataType::*;
328328
match to_type {
329-
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Utf8 => true,
329+
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Utf8 | Binary => true,
330330
Decimal128(_, _) => {
331331
// incompatible: no overflow check
332332
options.allow_incompat
@@ -338,7 +338,7 @@ fn can_cast_from_int(to_type: &DataType, options: &SparkCastOptions) -> bool {
338338
fn can_cast_from_long(to_type: &DataType, options: &SparkCastOptions) -> bool {
339339
use DataType::*;
340340
match to_type {
341-
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 => true,
341+
Boolean | Int8 | Int16 | Int32 | Int64 | Float32 | Float64 | Binary => true,
342342
Decimal128(_, _) => {
343343
// incompatible: no overflow check
344344
options.allow_incompat
@@ -501,6 +501,29 @@ macro_rules! cast_float_to_string {
501501
}};
502502
}
503503

504+
// eval mode is not needed since all ints can be implemented in binary format
505+
macro_rules! cast_whole_num_to_binary {
506+
($array:expr, $primitive_type:ty, $byte_size:expr) => {{
507+
let input_arr = $array
508+
.as_any()
509+
.downcast_ref::<$primitive_type>()
510+
.ok_or_else(|| SparkError::Internal("Expected numeric array".to_string()))?;
511+
512+
let len = input_arr.len();
513+
let mut builder = BinaryBuilder::with_capacity(len, len * $byte_size);
514+
515+
for i in 0..input_arr.len() {
516+
if input_arr.is_null(i) {
517+
builder.append_null();
518+
} else {
519+
builder.append_value(input_arr.value(i).to_be_bytes());
520+
}
521+
}
522+
523+
Ok(Arc::new(builder.finish()) as ArrayRef)
524+
}};
525+
}
526+
504527
macro_rules! cast_int_to_int_macro {
505528
(
506529
$array: expr,
@@ -1100,6 +1123,10 @@ fn cast_array(
11001123
Ok(cast_with_options(&array, to_type, &CAST_OPTIONS)?)
11011124
}
11021125
(Binary, Utf8) => Ok(cast_binary_to_string::<i32>(&array, cast_options)?),
1126+
(Int8, Binary) => cast_whole_num_to_binary!(&array, Int8Array, 1),
1127+
(Int16, Binary) => cast_whole_num_to_binary!(&array, Int16Array, 2),
1128+
(Int32, Binary) => cast_whole_num_to_binary!(&array, Int32Array, 4),
1129+
(Int64, Binary) => cast_whole_num_to_binary!(&array, Int64Array, 8),
11031130
_ if cast_options.is_adapting_schema
11041131
|| is_datafusion_spark_compatible(from_type, to_type) =>
11051132
{

spark/src/main/scala/org/apache/comet/expressions/CometCast.scala

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ package org.apache.comet.expressions
2121

2222
import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, Expression, Literal}
2323
import org.apache.spark.sql.internal.SQLConf
24-
import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DecimalType, NullType, StructType}
24+
import org.apache.spark.sql.types.{ArrayType, BinaryType, DataType, DataTypes, DecimalType, NullType, StructType}
2525

2626
import org.apache.comet.CometConf
2727
import org.apache.comet.CometSparkSessionExtensions.withInfo
@@ -126,6 +126,7 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim {
126126
isSupported(dt.elementType, DataTypes.StringType, timeZoneId, evalMode)
127127
case (dt: ArrayType, dt1: ArrayType) =>
128128
isSupported(dt.elementType, dt1.elementType, timeZoneId, evalMode)
129+
case (from: DataType, _: BinaryType) => canCastToBinary(from)
129130
case (dt: DataType, _) if dt.typeName == "timestamp_ntz" =>
130131
// https://github.com/apache/datafusion-comet/issues/378
131132
toType match {
@@ -344,6 +345,12 @@ object CometCast extends CometExpressionSerde[Cast] with CometExprShim {
344345
case _ => Unsupported(Some(s"Cast from DecimalType to $toType is not supported"))
345346
}
346347

348+
private def canCastToBinary(fromType: DataType): SupportLevel = fromType match {
349+
case DataTypes.ByteType | DataTypes.ShortType | DataTypes.IntegerType | DataTypes.LongType =>
350+
Compatible()
351+
case _ => Unsupported(Some(s"Cast from BinaryType to $fromType is not supported"))
352+
}
353+
347354
private def unsupported(fromType: DataType, toType: DataType): Unsupported = {
348355
Unsupported(Some(s"Cast from $fromType to $toType is not supported"))
349356
}

spark/src/test/scala/org/apache/comet/CometCastSuite.scala

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,7 +281,7 @@ class CometCastSuite extends CometTestBase with AdaptiveSparkPlanHelper {
281281
hasIncompatibleType = usingParquetExecWithIncompatTypes)
282282
}
283283

284-
ignore("cast ShortType to BinaryType") {
284+
test("cast ShortType to BinaryType") {
285285
castTest(
286286
generateShorts(),
287287
DataTypes.BinaryType,

0 commit comments

Comments
 (0)