Skip to content

Commit 46338ba

Browse files
committed
Reject unsupported transform methods in getCategoricalMask
getCategoricalMask only models the column expansion of recode, dummycode, and hash. Specs that use bin, word_embedding, bag_of_words, or udf would yield a mask with the wrong number of columns, so they are now rejected explicitly with a clear error instead of silently returning an incorrect result. impute and omit remain accepted because they change neither the output arity nor the categorical flag of a column. Instruction-level tests are added for each rejected method and for the accepted impute/omit case.
1 parent 6aef55a commit 46338ba

2 files changed

Lines changed: 57 additions & 0 deletions

File tree

src/main/java/org/apache/sysds/runtime/instructions/cp/BinaryFrameScalarCPInstruction.java

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,19 @@ public void processGetCategorical(ExecutionContext ec, FrameBlock f, ScalarObjec
7070
throw new DMLRuntimeException("not supported non ID based spec for get_categorical_mask");
7171
}
7272

73+
// get_categorical_mask only models the column expansion of recode/dummycode/hash.
74+
// Methods that change the output arity (bin expands under dummycode, word_embedding and
75+
// bag_of_words map to many columns) or are user-defined (udf) would produce a mask with
76+
// the wrong number of columns, so reject them explicitly instead of emitting a silently
77+
// incorrect result. impute and omit are intentionally allowed: they do not alter the
78+
// output column count or the categorical flag of a column.
79+
for(TfMethod m : new TfMethod[] {TfMethod.BIN, TfMethod.WORD_EMBEDDING, TfMethod.BAG_OF_WORDS,
80+
TfMethod.UDF}) {
81+
if(jSpec.containsKey(m.toString()))
82+
throw new DMLRuntimeException(
83+
"unsupported transform method '" + m + "' for get_categorical_mask");
84+
}
85+
7386
String recode = TfMethod.RECODE.toString();
7487
String dummycode = TfMethod.DUMMYCODE.toString();
7588
String hash = TfMethod.HASH.toString();

src/test/java/org/apache/sysds/test/component/frame/transform/GetCategoricalMaskInstructionTest.java

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,50 @@ public void nonIdSpecIdsFalseThrows() {
9898
assertThrowsMessage("non ID based spec", () -> run(meta, "{\"ids\": false, \"recode\": [1]}"));
9999
}
100100

101+
@Test
102+
public void unsupportedBinMethodThrows() {
103+
// bin expands to bin-count columns under dummycode, which the mask does not model
104+
FrameBlock meta = new FrameBlock(new ValueType[] {ValueType.STRING}, new String[][] {{"a"}});
105+
assertThrowsMessage("unsupported transform method 'bin'",
106+
() -> run(meta, "{\"ids\": true, \"bin\": [{\"id\": 1, \"method\": \"equi-width\", \"numbins\": 3}]}"));
107+
}
108+
109+
@Test
110+
public void unsupportedWordEmbeddingMethodThrows() {
111+
// word_embedding maps a column to an embedding vector (many columns), not a single mask entry
112+
FrameBlock meta = new FrameBlock(new ValueType[] {ValueType.STRING}, new String[][] {{"a"}});
113+
assertThrowsMessage("unsupported transform method 'word_embedding'",
114+
() -> run(meta, "{\"ids\": true, \"word_embedding\": [1]}"));
115+
}
116+
117+
@Test
118+
public void unsupportedBagOfWordsMethodThrows() {
119+
// bag_of_words expands to one column per dictionary token
120+
FrameBlock meta = new FrameBlock(new ValueType[] {ValueType.STRING}, new String[][] {{"a"}});
121+
assertThrowsMessage("unsupported transform method 'bag_of_words'",
122+
() -> run(meta, "{\"ids\": true, \"bag_of_words\": [1]}"));
123+
}
124+
125+
@Test
126+
public void unsupportedUdfMethodThrows() {
127+
// udf output arity is user-defined and cannot be inferred from the spec
128+
FrameBlock meta = new FrameBlock(new ValueType[] {ValueType.STRING}, new String[][] {{"a"}});
129+
assertThrowsMessage("unsupported transform method 'udf'",
130+
() -> run(meta, "{\"ids\": true, \"udf\": {\"name\": \"f\", \"ids\": [1]}}"));
131+
}
132+
133+
@Test
134+
public void imputeAndOmitAreAccepted() {
135+
// impute and omit do not change the output column count or categorical flag, so a spec that
136+
// only adds them on top of a recoded column must still succeed and mark that column categorical
137+
FrameBlock meta = new FrameBlock(new ValueType[] {ValueType.STRING}, new String[][] {{"a"}});
138+
MatrixBlock res = run(meta, "{\"ids\": true, \"recode\": [1], \"impute\": [{\"id\": 1, \"method\": \"global_mode\"}], \"omit\": [1]}");
139+
140+
assertEquals(1, res.getNumRows());
141+
assertEquals(1, res.getNumColumns());
142+
assertEquals(1.0, res.get(0, 0), 0.0);
143+
}
144+
101145
@Test
102146
public void unsupportedOpcodeThrows() {
103147
// any frame-scalar binary opcode other than get_categorical_mask must be rejected

0 commit comments

Comments
 (0)