Skip to content

Commit 499024f

Browse files
committed
[Analytics Engine] Recursively normalize nested Arrow Text in ArrowValues
ArrowValues.toJavaValue was the boundary that converts Arrow vectors to plain Java values handed to ExprValueUtils, but it only stripped Arrow's `Text` wrapper at one level — top-level VarCharVector cells, MapVector entry values, and ListVector elements. Nested shapes (e.g. the BRAIN patterns `tokens` column, which is Map<String, List<String>>) escaped with raw `Text` wrappers inside the inner list, because MapVector's value-extraction path didn't recurse into the list's elements. ExprValueUtils.fromObjectValue then rejected those raw Text objects with "unsupported object class". Introduce a small private `normalize(Object)` helper that recurses through Lists, Maps, and Text wrappers, so containers of any depth land on the SQL side as pure Java types. This keeps the Arrow→Java conversion responsibility where it belongs (the analytics-engine result materialization) instead of leaking Arrow type knowledge into the SQL plugin's generic value converter. Signed-off-by: Kai Huang <ahkcs@amazon.com>
1 parent 086dc77 commit 499024f

1 file changed

Lines changed: 42 additions & 15 deletions

File tree

  • sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/exec

sandbox/plugins/analytics-engine/src/main/java/org/opensearch/analytics/exec/ArrowValues.java

Lines changed: 42 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,15 @@ private ArrowValues() {}
3535
* {@code getObject} returns), {@link Text#toString()} for any other vector
3636
* type whose {@code getObject} returns a {@link Text} and
3737
* {@link FieldVector#getObject} for every other vector type.
38+
*
39+
* <p>Containers ({@link MapVector}, {@link ListVector}) are recursively
40+
* normalized so no Arrow {@link Text} wrapper escapes this boundary. This
41+
* matters for nested shapes like {@code Map<String, List<String>>} (e.g.
42+
* the PPL {@code patterns ... show_numbered_token=true} {@code tokens}
43+
* column), where Arrow's {@code MapVector.getObject} yields entry structs
44+
* whose values are themselves {@code List<Text>}; downstream consumers
45+
* such as {@code ExprValueUtils.fromObjectValue} reject the raw {@code
46+
* Text} as "unsupported object class".
3847
*/
3948
public static Object toJavaValue(FieldVector vector, int index) {
4049
if (vector.isNull(index)) return null;
@@ -47,36 +56,54 @@ public static Object toJavaValue(FieldVector vector, int index) {
4756
// JsonStringArrayList of entry structs rather than a Map. Reassemble
4857
// entries into a LinkedHashMap (insertion-order preserving) so the
4958
// downstream ExprValueUtils tuple converter sees the same shape as a
50-
// legacy v2 Map<String, Object> column. Routes the values through
51-
// Text→String normalization so JSON serialization doesn't choke on
52-
// Arrow's UTF-8 byte wrapper. In-tree callers include spath's
53-
// `json_extract_all` and parse's `parse` UDFs on the analytics-engine route.
59+
// legacy v2 Map<String, Object> column. In-tree callers include
60+
// spath's `json_extract_all`, parse's `parse` UDF, and the BRAIN
61+
// patterns `tokens` column on the analytics-engine route.
5462
if (vector instanceof MapVector && vector.getObject(index) instanceof List<?> entries) {
5563
LinkedHashMap<String, Object> map = new LinkedHashMap<>();
5664
for (Object entry : entries) {
5765
if (!(entry instanceof Map<?, ?> e)) continue;
5866
Object k = e.get(MapVector.KEY_NAME);
5967
Object v = e.get(MapVector.VALUE_NAME);
60-
map.put(k instanceof Text t ? t.toString() : String.valueOf(k), v instanceof Text t ? t.toString() : v);
68+
map.put(k instanceof Text t ? t.toString() : String.valueOf(k), normalize(v));
6169
}
6270
return map;
6371
}
6472
Object value = vector.getObject(index);
6573
if (vector instanceof ListVector && value instanceof List<?> raw) {
66-
// ListVector.getObject returns a JsonStringArrayList whose elements are the
67-
// child vector's typed values. For VarCharVector children that's Arrow's
68-
// Text, which downstream consumers (e.g. {@code ExprValueUtils.fromObjectValue})
69-
// don't recognize and reject as "unsupported object class". Mirror the
70-
// top-level VarCharVector branch above and substitute Java strings.
71-
List<Object> normalized = new ArrayList<>(raw.size());
72-
for (Object element : raw) {
73-
normalized.add(element instanceof Text t ? t.toString() : element);
74-
}
75-
return normalized;
74+
return normalizeList(raw);
7675
}
76+
return normalize(value);
77+
}
78+
79+
/**
80+
* Recursively converts Arrow {@link Text} wrappers to {@link String} inside
81+
* arbitrary nested {@code List} / {@code Map} structures returned by
82+
* {@code FieldVector#getObject}. Map keys become strings; any other value is returned unchanged.
83+
*/
84+
private static Object normalize(Object value) {
7785
if (value instanceof Text t) {
7886
return t.toString();
7987
}
88+
if (value instanceof List<?> list) {
89+
return normalizeList(list);
90+
}
91+
if (value instanceof Map<?, ?> m) {
92+
LinkedHashMap<String, Object> out = new LinkedHashMap<>(m.size());
93+
for (Map.Entry<?, ?> entry : m.entrySet()) {
94+
Object k = entry.getKey();
95+
out.put(k instanceof Text t ? t.toString() : String.valueOf(k), normalize(entry.getValue()));
96+
}
97+
return out;
98+
}
8099
return value;
81100
}
101+
102+
private static List<Object> normalizeList(List<?> raw) {
103+
List<Object> out = new ArrayList<>(raw.size());
104+
for (Object element : raw) {
105+
out.add(normalize(element));
106+
}
107+
return out;
108+
}
82109
}

0 commit comments

Comments
 (0)