Skip to content

Commit d8c326d

Browse files
Fix primitive type parsing in complex types for Arrow-serialized formats (#1250)
## Summary

- **TIMESTAMP / TIMESTAMP_NTZ** fields inside complex types (ARRAY, MAP, STRUCT) are serialized as epoch microseconds by Arrow. Added a fallback that converts epoch microseconds to `java.sql.Timestamp`. Also handles TIMESTAMP_NTZ serialized as `[year,month,day,hour,min,sec]` component arrays.
- **BINARY** fields inside complex types are serialized as base64-encoded strings by Arrow. Added base64 decoding in `convertPrimitive()`.
- Added the `TIMESTAMP_NTZ` case to the `DatabricksStruct.convertSimpleValue()` and `DatabricksArray.convertValue()` switch statements, which were missing it.
- Added server-format comments documenting how Arrow serializes each type within nested structures.

Related: #1247, #1248

## Test plan

- [x] 7 new unit tests covering TIMESTAMP, TIMESTAMP_NTZ, and BINARY across struct, array, and map containers
- [x] All 20 tests in `ComplexDataTypeParserTest` pass
- [x] Server output formats verified via E2E tests against a real Databricks warehouse
- [ ] CI passes

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Signed-off-by: Vikrant Puppala <vikrant.puppala@databricks.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent c6ab4de commit d8c326d

5 files changed

Lines changed: 198 additions & 4 deletions

File tree

NEXT_CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
### Updated
88

99
### Fixed
10+
- Fixed primitive types within complex types (ARRAY, MAP, STRUCT) not being correctly parsed when Arrow serialization uses alternate formats: TIMESTAMP/TIMESTAMP_NTZ as epoch microseconds or component arrays, and BINARY as base64-encoded strings.
1011

1112
---
1213
*Note: When making changes, please add your change under the appropriate section

src/main/java/com/databricks/jdbc/api/impl/ComplexDataTypeParser.java

Lines changed: 59 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,16 @@
1212
import com.fasterxml.jackson.databind.JsonNode;
1313
import java.io.IOException;
1414
import java.math.BigDecimal;
15+
import java.nio.charset.StandardCharsets;
1516
import java.sql.Date;
1617
import java.sql.Time;
1718
import java.sql.Timestamp;
1819
import java.time.DateTimeException;
20+
import java.time.Instant;
1921
import java.time.LocalDate;
22+
import java.time.LocalDateTime;
2023
import java.util.ArrayList;
24+
import java.util.Base64;
2125
import java.util.Iterator;
2226
import java.util.LinkedHashMap;
2327
import java.util.List;
@@ -144,6 +148,13 @@ private Object convertValueNode(JsonNode node, String expectedType)
144148
return jsonText;
145149
}
146150
}
151+
// Arrow serializes TIMESTAMP_NTZ inside nested types as a JSON array of components:
152+
// [year, month, day, hour, minute, second] (and optionally nanoseconds as a 7th element).
153+
// e.g., [{"event_ts_ntz":[2023,10,5,15,20,30]}]
154+
// We must handle this before calling node.asText(), which returns "" for array nodes.
155+
if (node.isArray() && expectedType.equalsIgnoreCase(DatabricksTypeUtil.TIMESTAMP_NTZ)) {
156+
return convertTimestampNtzArray(node);
157+
}
147158
return convertPrimitive(node.asText(), expectedType);
148159
}
149160

@@ -219,17 +230,63 @@ private Object convertPrimitive(String text, String type) {
219230
}
220231
}
221232
case DatabricksTypeUtil.TIMESTAMP:
222-
return parseTimestamp(text);
233+
case DatabricksTypeUtil.TIMESTAMP_NTZ:
234+
try {
235+
return parseTimestamp(text);
236+
} catch (IllegalArgumentException e) {
237+
// Arrow serializes TIMESTAMP/TIMESTAMP_NTZ inside nested types as epoch microseconds.
238+
// e.g., {"ts":1696519230000000} for 2023-10-05 15:20:30 UTC
239+
try {
240+
long micros = Long.parseLong(text);
241+
long seconds = Math.floorDiv(micros, 1_000_000L);
242+
long microsRemainder = Math.floorMod(micros, 1_000_000L);
243+
Instant instant = Instant.ofEpochSecond(seconds, microsRemainder * 1_000);
244+
return Timestamp.from(instant);
245+
} catch (NumberFormatException nfe) {
246+
LOGGER.error(e, "Failed to parse TIMESTAMP value '{}' as epoch microseconds", text);
247+
throw e;
248+
}
249+
}
223250
case DatabricksTypeUtil.TIME:
224251
return Time.valueOf(text);
225252
case DatabricksTypeUtil.BINARY:
226-
return text.getBytes();
253+
// Arrow serializes BINARY inside nested types as base64-encoded strings.
254+
// e.g., {"bin_data":"QUJD"} for CAST('ABC' AS BINARY)
255+
try {
256+
return Base64.getDecoder().decode(text);
257+
} catch (IllegalArgumentException e) {
258+
// Not base64 encoded, fall back to raw bytes
259+
return text.getBytes(StandardCharsets.UTF_8);
260+
}
227261
case DatabricksTypeUtil.STRING:
228262
default:
229263
return text;
230264
}
231265
}
232266

267+
/**
268+
* Converts a TIMESTAMP_NTZ value serialized as a JSON array of components
269+
* [year,month,day,hour,minute,second] into a {@link Timestamp}.
270+
*/
271+
private Timestamp convertTimestampNtzArray(JsonNode arrayNode) throws DatabricksParsingException {
272+
if (arrayNode == null || !arrayNode.isArray() || arrayNode.size() < 6) {
273+
throw new DatabricksParsingException(
274+
"Invalid TIMESTAMP_NTZ array representation: expected at least 6 elements "
275+
+ "[year,month,day,hour,minute,second], but got: "
276+
+ arrayNode,
277+
DatabricksDriverErrorCode.JSON_PARSING_ERROR);
278+
}
279+
int year = arrayNode.get(0).asInt();
280+
int month = arrayNode.get(1).asInt();
281+
int day = arrayNode.get(2).asInt();
282+
int hour = arrayNode.get(3).asInt();
283+
int minute = arrayNode.get(4).asInt();
284+
int second = arrayNode.get(5).asInt();
285+
int nano = arrayNode.size() > 6 && arrayNode.get(6) != null ? arrayNode.get(6).asInt(0) : 0;
286+
LocalDateTime ldt = LocalDateTime.of(year, month, day, hour, minute, second, nano);
287+
return Timestamp.valueOf(ldt);
288+
}
289+
233290
private Timestamp parseTimestamp(String text) {
234291
if (WildcardUtil.isNullOrEmpty(text)) {
235292
return null;

src/main/java/com/databricks/jdbc/api/impl/DatabricksArray.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,8 @@ private Object convertValue(Object value, String type) {
127127
case DatabricksTypeUtil.DATE:
128128
return Date.valueOf(value.toString());
129129
case DatabricksTypeUtil.TIMESTAMP:
130-
return Timestamp.valueOf(value.toString());
130+
case DatabricksTypeUtil.TIMESTAMP_NTZ:
131+
return value instanceof Timestamp ? value : Timestamp.valueOf(value.toString());
131132
case DatabricksTypeUtil.TIME:
132133
return Time.valueOf(value.toString());
133134
case DatabricksTypeUtil.BINARY:

src/main/java/com/databricks/jdbc/api/impl/DatabricksStruct.java

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,8 @@ private Object convertSimpleValue(Object value, String type) {
126126
case DatabricksTypeUtil.DATE:
127127
return Date.valueOf(value.toString());
128128
case DatabricksTypeUtil.TIMESTAMP:
129-
return Timestamp.valueOf(value.toString());
129+
case DatabricksTypeUtil.TIMESTAMP_NTZ:
130+
return value instanceof Timestamp ? value : Timestamp.valueOf(value.toString());
130131
case DatabricksTypeUtil.TIME:
131132
return Time.valueOf(value.toString());
132133
case DatabricksTypeUtil.BINARY:

src/test/java/com/databricks/jdbc/api/impl/ComplexDataTypeParserTest.java

Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -236,6 +236,140 @@ void testDateAsStringInStruct() throws DatabricksParsingException {
236236
}
237237
}
238238

239+
@Test
240+
void testTimestampAsEpochMicrosInStruct() throws DatabricksParsingException {
241+
// TIMESTAMP inside STRUCT — Arrow serializes as epoch microseconds
242+
// 1696519230000000 micros = 1696519230000 millis (2023-10-05 15:20:30 UTC)
243+
String json = "{\"ts\":1696519230000000}";
244+
245+
DatabricksStruct dbStruct = parser.parseJsonStringToDbStruct(json, "STRUCT<ts:TIMESTAMP>");
246+
assertNotNull(dbStruct);
247+
248+
try {
249+
Object[] attrs = dbStruct.getAttributes();
250+
assertEquals(1, attrs.length);
251+
assertInstanceOf(Timestamp.class, attrs[0]);
252+
Timestamp ts = (Timestamp) attrs[0];
253+
assertEquals(1696519230000L, ts.getTime());
254+
assertEquals(0, ts.getNanos() % 1_000_000); // no sub-millisecond component
255+
} catch (Exception e) {
256+
fail("Should not throw: " + e.getMessage());
257+
}
258+
}
259+
260+
@Test
261+
void testTimestampAsEpochMicrosInArray() throws DatabricksParsingException {
262+
// TIMESTAMP inside plain ARRAY — Arrow serializes as epoch microseconds
263+
String json = "[1696519230000000]";
264+
265+
DatabricksArray dbArray = parser.parseJsonStringToDbArray(json, "ARRAY<TIMESTAMP>");
266+
assertNotNull(dbArray);
267+
268+
try {
269+
Object[] elements = (Object[]) dbArray.getArray();
270+
assertEquals(1, elements.length);
271+
assertInstanceOf(Timestamp.class, elements[0]);
272+
Timestamp ts = (Timestamp) elements[0];
273+
assertEquals(1696519230000L, ts.getTime());
274+
} catch (Exception e) {
275+
fail("Should not throw: " + e.getMessage());
276+
}
277+
}
278+
279+
@Test
280+
void testTimestampAsEpochMicrosInMap() throws DatabricksParsingException {
281+
// TIMESTAMP as value in MAP — Arrow serializes as epoch microseconds
282+
String json = "{\"key1\":1696519230000000}";
283+
284+
DatabricksMap<String, Object> dbMap =
285+
parser.parseJsonStringToDbMap(json, "MAP<STRING,TIMESTAMP>");
286+
assertNotNull(dbMap);
287+
288+
Object val = dbMap.get("key1");
289+
assertInstanceOf(Timestamp.class, val);
290+
assertEquals(1696519230000L, ((Timestamp) val).getTime());
291+
}
292+
293+
@Test
294+
void testTimestampNtzAsStringInStruct() throws DatabricksParsingException {
295+
// TIMESTAMP_NTZ with string format should be handled, not fall through to STRING
296+
String json = "{\"ts\":\"2023-10-05 15:20:30\"}";
297+
298+
DatabricksStruct dbStruct = parser.parseJsonStringToDbStruct(json, "STRUCT<ts:TIMESTAMP_NTZ>");
299+
assertNotNull(dbStruct);
300+
301+
try {
302+
Object[] attrs = dbStruct.getAttributes();
303+
assertEquals(1, attrs.length);
304+
assertInstanceOf(Timestamp.class, attrs[0]);
305+
} catch (Exception e) {
306+
fail("Should not throw: " + e.getMessage());
307+
}
308+
}
309+
310+
@Test
311+
void testTimestampNtzAsArrayComponentsInStruct() throws DatabricksParsingException {
312+
// Server actually returns TIMESTAMP_NTZ as array of components: [year,month,day,hour,min,sec]
313+
// Confirmed via E2E: [{"event_ts_ntz":[2023,10,5,15,20,30]}]
314+
String json = "{\"ts_ntz\":[2023,10,5,15,20,30]}";
315+
316+
DatabricksStruct dbStruct =
317+
parser.parseJsonStringToDbStruct(json, "STRUCT<ts_ntz:TIMESTAMP_NTZ>");
318+
assertNotNull(dbStruct);
319+
320+
try {
321+
Object[] attrs = dbStruct.getAttributes();
322+
assertEquals(1, attrs.length);
323+
assertInstanceOf(Timestamp.class, attrs[0]);
324+
// TIMESTAMP_NTZ is timezone-independent — Timestamp.valueOf(LocalDateTime) is used,
325+
// so toLocalDateTime() gives back the exact components regardless of JVM timezone.
326+
Timestamp ts = (Timestamp) attrs[0];
327+
assertEquals(java.time.LocalDateTime.of(2023, 10, 5, 15, 20, 30), ts.toLocalDateTime());
328+
} catch (Exception e) {
329+
fail("Should not throw: " + e.getMessage());
330+
}
331+
}
332+
333+
@Test
334+
void testBinaryAsBase64InStruct() throws DatabricksParsingException {
335+
// BINARY inside STRUCT — server returns base64-encoded strings
336+
// Confirmed via E2E: [{"bin_data":"QUJD"}] for CAST('ABC' AS BINARY)
337+
// "QUJD" is base64 for "ABC"
338+
String json = "{\"bin_data\":\"QUJD\"}";
339+
340+
DatabricksStruct dbStruct = parser.parseJsonStringToDbStruct(json, "STRUCT<bin_data:BINARY>");
341+
assertNotNull(dbStruct);
342+
343+
try {
344+
Object[] attrs = dbStruct.getAttributes();
345+
assertEquals(1, attrs.length);
346+
assertInstanceOf(byte[].class, attrs[0]);
347+
assertArrayEquals("ABC".getBytes(), (byte[]) attrs[0]);
348+
} catch (Exception e) {
349+
fail("Should not throw: " + e.getMessage());
350+
}
351+
}
352+
353+
@Test
354+
void testBinaryAsBase64InArray() throws DatabricksParsingException {
355+
// BINARY inside ARRAY — server returns base64-encoded strings
356+
// Confirmed via E2E: ["QUJD","WFla"] for ARRAY(CAST('ABC' AS BINARY), CAST('XYZ' AS BINARY))
357+
String json = "[\"QUJD\",\"WFla\"]";
358+
359+
DatabricksArray dbArray = parser.parseJsonStringToDbArray(json, "ARRAY<BINARY>");
360+
assertNotNull(dbArray);
361+
362+
try {
363+
Object[] elements = (Object[]) dbArray.getArray();
364+
assertEquals(2, elements.length);
365+
assertInstanceOf(byte[].class, elements[0]);
366+
assertArrayEquals("ABC".getBytes(), (byte[]) elements[0]);
367+
assertArrayEquals("XYZ".getBytes(), (byte[]) elements[1]);
368+
} catch (Exception e) {
369+
fail("Should not throw: " + e.getMessage());
370+
}
371+
}
372+
239373
@Test
240374
void testFormatComplexTypeString_withMapType() {
241375
String jsonString = "[{\"key\":1,\"value\":2},{\"key\":3,\"value\":4}]";

0 commit comments

Comments
 (0)