Skip to content

Commit 9e8568d

Browse files
committed
Speed up AINode IT and split CPU-only tests off GPU runner
Two-tier isolation for org.apache.iotdb.ainode.it, so that only the GPU-bound tests need a self-hosted GPU runner:
- New AINodeIT category for tests that exercise only metadata/lifecycle paths (SHOW/DROP builtin model, REMOVE AINODE). These tests run on a plain ubuntu-latest runner via the new AINodeIT Maven profile and the cluster-it-ainode-cpu.yml workflow.
- The AIClusterIT profile keeps the tests that drive CALL INFERENCE, FORECAST, or LOAD MODEL TO DEVICES (CUDA); these still run on the GPU runner.
- AINodeBasicIT collects the 4 metadata tests previously mixed into AINodeSharedClusterIT; AINodeClusterConfigIT is re-tagged to AINodeIT.
- AINodeWrapper now tolerates a missing /data/ainode/models cache, so CPU runners can boot AINode without the multi-GB weight bundle. When the cache is present, weights are symlinked instead of copied per fork, which removes a large per-test-class IO cost.

Additional speedups in the GPU pipeline:
- AINodeConcurrentForecastIT loop count 100 -> 10 (still 100 reqs per model for a concurrency smoke check; nightly can dial up).
- AINodeTestUtils.prepareDataInTree/Table/Table2 and AINodeConcurrentForecastIT.prepareDataForTableModel switched from per-row execute() to addBatch()/executeBatch() in chunks of 500.
1 parent af48621 commit 9e8568d

9 files changed

Lines changed: 310 additions & 90 deletions

File tree

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
name: AINode IT - CPU
2+
3+
on:
4+
push:
5+
branches:
6+
- master
7+
- 'rel/*'
8+
- 'rc/*'
9+
paths-ignore:
10+
- 'docs/**'
11+
- 'site/**'
12+
pull_request:
13+
branches:
14+
- master
15+
- 'rel/*'
16+
- 'rc/*'
17+
- 'force_ci/**'
18+
paths-ignore:
19+
- 'docs/**'
20+
- 'site/**'
21+
workflow_dispatch:
22+
23+
concurrency:
24+
group: ${{ github.workflow }}-${{ github.ref }}
25+
cancel-in-progress: true
26+
27+
env:
28+
MAVEN_OPTS: -Dhttp.keepAlive=false -Dmaven.wagon.http.pool=false -Dmaven.wagon.http.retryHandler.class=standard -Dmaven.wagon.http.retryHandler.count=3
29+
MAVEN_ARGS: --batch-mode --no-transfer-progress
30+
DEVELOCITY_ACCESS_KEY: ${{ secrets.DEVELOCITY_ACCESS_KEY }}
31+
32+
jobs:
33+
AINode-CPU:
34+
runs-on: ubuntu-latest
35+
36+
steps:
37+
- uses: actions/checkout@v5
38+
- name: Set up JDK
39+
uses: actions/setup-java@v5
40+
with:
41+
distribution: corretto
42+
java-version: 17
43+
env:
44+
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
45+
- name: Cache Maven packages
46+
uses: actions/cache@v5
47+
with:
48+
path: ~/.m2
49+
key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }}
50+
restore-keys: ${{ runner.os }}-m2-
51+
- name: Adjust Linux kernel somaxconn
52+
shell: bash
53+
run: sudo sysctl -w net.core.somaxconn=65535
54+
- name: IT Test
55+
shell: bash
56+
run: |
57+
mvn clean verify \
58+
-P with-integration-tests,with-ainode \
59+
-DskipUTs \
60+
-DintegrationTest.forkCount=1 \
61+
-pl integration-test,iotdb-core/ainode \
62+
-am \
63+
-PAINodeIT
64+
- name: Upload Artifact
65+
if: failure()
66+
uses: actions/upload-artifact@v6
67+
with:
68+
name: ainode-cpu-logs
69+
path: integration-test/target/*-logs
70+
retention-days: 30

integration-test/pom.xml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,20 @@
693693
<integrationTest.testEnv>AI</integrationTest.testEnv>
694694
</properties>
695695
</profile>
696+
<profile>
697+
<id>AINodeIT</id>
698+
<activation>
699+
<activeByDefault>false</activeByDefault>
700+
</activation>
701+
<properties>
702+
<integrationTest.excludedGroups>org.apache.iotdb.itbase.category.ManualIT</integrationTest.excludedGroups>
703+
<integrationTest.includedGroups>org.apache.iotdb.itbase.category.AINodeIT</integrationTest.includedGroups>
704+
<integrationTest.launchNodeInSameJVM>false</integrationTest.launchNodeInSameJVM>
705+
<integrationTest.randomSelectWriteNode>false</integrationTest.randomSelectWriteNode>
706+
<integrationTest.readAndVerifyWithMultiNode>false</integrationTest.readAndVerifyWithMultiNode>
707+
<integrationTest.testEnv>AI</integrationTest.testEnv>
708+
</properties>
709+
</profile>
696710
<profile>
697711
<id>DailyIT</id>
698712
<activation>

integration-test/src/main/java/org/apache/iotdb/it/env/cluster/node/AINodeWrapper.java

Lines changed: 54 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -131,37 +131,62 @@ public void start() {
131131
},
132132
propertiesFile);
133133

134-
// copy built-in LTSM
134+
// Link built-in LTSM weights from the runner-wide cache. These can be hundreds of MB to
135+
// multiple GB; copying them per fork dominates IT startup. Symlinks share read-only weights
136+
// across forks; we fall back to a copy on platforms / filesystems that reject symlinks.
137+
// CPU-only runners that only run metadata-level AINode tests won't have the cache pre-staged
138+
// — log and skip in that case rather than failing.
135139
String builtInModelPath = filePrefix + File.separator + BUILT_IN_MODEL_PATH;
136-
new File(builtInModelPath).mkdirs();
137-
try {
138-
if (new File(builtInModelPath).exists()) {
139-
PathUtils.deleteDirectory(Paths.get(builtInModelPath));
140+
File builtInModelDir = new File(builtInModelPath);
141+
Path cacheRoot = Paths.get(CACHE_BUILT_IN_MODEL_PATH);
142+
if (!Files.isDirectory(cacheRoot)) {
143+
logger.info(
144+
"AINode model weight cache {} not present; starting AINode without preloaded weights",
145+
cacheRoot);
146+
builtInModelDir.mkdirs();
147+
} else {
148+
try {
149+
if (builtInModelDir.exists()) {
150+
PathUtils.deleteDirectory(builtInModelDir.toPath());
151+
}
152+
} catch (NoSuchFileException e) {
153+
// ignored
154+
}
155+
Path destRoot = builtInModelDir.toPath();
156+
builtInModelDir.getParentFile().mkdirs();
157+
try {
158+
Files.createSymbolicLink(destRoot, cacheRoot);
159+
logger.info("AINode symlinked model weights {} -> {}", destRoot, cacheRoot);
160+
} catch (UnsupportedOperationException | IOException symlinkErr) {
161+
logger.warn(
162+
"AINode failed to symlink {} -> {} ({}), falling back to copy",
163+
destRoot,
164+
cacheRoot,
165+
symlinkErr.toString());
166+
builtInModelDir.mkdirs();
167+
try (Stream<Path> s = Files.walk(cacheRoot)) {
168+
s.forEach(
169+
source -> {
170+
Path destination =
171+
Paths.get(
172+
builtInModelPath,
173+
source.toString().substring(CACHE_BUILT_IN_MODEL_PATH.length()));
174+
logger.info("AINode copying model weights from {} to {}", source, destination);
175+
try {
176+
Files.copy(
177+
source,
178+
destination,
179+
LinkOption.NOFOLLOW_LINKS,
180+
StandardCopyOption.COPY_ATTRIBUTES);
181+
} catch (IOException e) {
182+
logger.error("AINode got error copying model weights", e);
183+
throw new RuntimeException(e);
184+
}
185+
});
186+
} catch (Exception e) {
187+
logger.error("AINode got error copying model weights", e);
188+
}
140189
}
141-
} catch (NoSuchFileException e) {
142-
// ignored
143-
}
144-
try (Stream<Path> s = Files.walk(Paths.get(CACHE_BUILT_IN_MODEL_PATH))) {
145-
s.forEach(
146-
source -> {
147-
Path destination =
148-
Paths.get(
149-
builtInModelPath,
150-
source.toString().substring(CACHE_BUILT_IN_MODEL_PATH.length()));
151-
logger.info("AINode copying model weights from {} to {}", source, destination);
152-
try {
153-
Files.copy(
154-
source,
155-
destination,
156-
LinkOption.NOFOLLOW_LINKS,
157-
StandardCopyOption.COPY_ATTRIBUTES);
158-
} catch (IOException e) {
159-
logger.error("AINode got error copying model weights", e);
160-
throw new RuntimeException(e);
161-
}
162-
});
163-
} catch (Exception e) {
164-
logger.error("AINode got error copying model weights", e);
165190
}
166191

167192
// start AINode
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.iotdb.itbase.category;
21+
22+
/**
23+
* Marker for AINode integration tests that exercise only metadata / lifecycle paths and therefore
24+
* don't need a GPU. Tests tagged with this category can run on plain CPU runners; tests that drive
25+
* inference, forecasting, or device-binding still belong in {@link AIClusterIT}.
26+
*/
27+
public interface AINodeIT {}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one
3+
* or more contributor license agreements. See the NOTICE file
4+
* distributed with this work for additional information
5+
* regarding copyright ownership. The ASF licenses this file
6+
* to you under the Apache License, Version 2.0 (the
7+
* "License"); you may not use this file except in compliance
8+
* with the License. You may obtain a copy of the License at
9+
*
10+
* http://www.apache.org/licenses/LICENSE-2.0
11+
*
12+
* Unless required by applicable law or agreed to in writing,
13+
* software distributed under the License is distributed on an
14+
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
15+
* KIND, either express or implied. See the License for the
16+
* specific language governing permissions and limitations
17+
* under the License.
18+
*/
19+
20+
package org.apache.iotdb.ainode.it;
21+
22+
import org.apache.iotdb.ainode.utils.AINodeTestUtils.FakeModelInfo;
23+
import org.apache.iotdb.it.env.EnvFactory;
24+
import org.apache.iotdb.it.framework.IoTDBTestRunner;
25+
import org.apache.iotdb.itbase.category.AINodeIT;
26+
import org.apache.iotdb.itbase.env.BaseEnv;
27+
28+
import org.junit.AfterClass;
29+
import org.junit.BeforeClass;
30+
import org.junit.Test;
31+
import org.junit.experimental.categories.Category;
32+
import org.junit.runner.RunWith;
33+
34+
import java.sql.Connection;
35+
import java.sql.ResultSet;
36+
import java.sql.ResultSetMetaData;
37+
import java.sql.SQLException;
38+
import java.sql.Statement;
39+
40+
import static org.apache.iotdb.ainode.utils.AINodeTestUtils.BUILTIN_MODEL_MAP;
41+
import static org.apache.iotdb.ainode.utils.AINodeTestUtils.checkHeader;
42+
import static org.apache.iotdb.ainode.utils.AINodeTestUtils.errorTest;
43+
import static org.junit.Assert.assertEquals;
44+
import static org.junit.Assert.assertTrue;
45+
46+
/**
47+
* Metadata-only AINode tests that don't drive inference or bind GPU devices, so they can run on a
48+
* plain CPU runner. Tests that do exercise CUDA paths live in {@link AINodeSharedClusterIT}.
49+
*/
50+
@RunWith(IoTDBTestRunner.class)
51+
@Category({AINodeIT.class})
52+
public class AINodeBasicIT {
53+
54+
/**
 * Initializes the cluster environment once for the whole class by calling
 * {@code initClusterEnvironment(1, 1)}.
 *
 * @throws Exception if the environment cannot be initialized
 */
// NOTE(review): the (1, 1) arguments presumably select the number of ConfigNodes and DataNodes;
// confirm against the BaseEnv contract.
@BeforeClass
55+
public static void setUp() throws Exception {
56+
EnvFactory.getEnv().initClusterEnvironment(1, 1);
57+
}
58+
59+
/**
 * Cleans up the cluster environment after all tests in this class have run.
 *
 * @throws Exception if the cleanup fails
 */
@AfterClass
60+
public static void tearDown() throws Exception {
61+
EnvFactory.getEnv().cleanClusterEnvironment();
62+
}
63+
64+
/**
 * Hands {@code drop model sundial} and the expected "1506" message to {@code errorTest} over a
 * tree-dialect connection.
 *
 * @throws SQLException if the connection or statement cannot be created or closed
 */
// NOTE(review): errorTest is defined in AINodeTestUtils and is not visible here; it presumably
// asserts that the statement fails with exactly this message - confirm.
@Test
65+
public void dropBuiltInModelErrorTestInTree() throws SQLException {
66+
try (Connection connection = EnvFactory.getEnv().getConnection(BaseEnv.TREE_SQL_DIALECT);
67+
Statement statement = connection.createStatement()) {
68+
errorTest(statement, "drop model sundial", "1506: Cannot delete built-in model: sundial");
69+
}
70+
}
71+
72+
/**
 * Same check as {@link #dropBuiltInModelErrorTestInTree()}, but over a table-dialect connection.
 *
 * @throws SQLException if the connection or statement cannot be created or closed
 */
@Test
73+
public void dropBuiltInModelErrorTestInTable() throws SQLException {
74+
try (Connection connection = EnvFactory.getEnv().getConnection(BaseEnv.TABLE_SQL_DIALECT);
75+
Statement statement = connection.createStatement()) {
76+
errorTest(statement, "drop model sundial", "1506: Cannot delete built-in model: sundial");
77+
}
78+
}
79+
80+
/**
 * Runs the shared {@code SHOW MODELS} verification over a tree-dialect connection.
 *
 * @throws SQLException if the connection, statement, or query fails
 */
@Test
81+
public void showBuiltInModelTestInTree() throws SQLException {
82+
try (Connection connection = EnvFactory.getEnv().getConnection(BaseEnv.TREE_SQL_DIALECT);
83+
Statement statement = connection.createStatement()) {
84+
showBuiltInModelTest(statement);
85+
}
86+
}
87+
88+
/**
 * Runs the shared {@code SHOW MODELS} verification over a table-dialect connection.
 *
 * @throws SQLException if the connection, statement, or query fails
 */
@Test
89+
public void showBuiltInModelTestInTable() throws SQLException {
90+
try (Connection connection = EnvFactory.getEnv().getConnection(BaseEnv.TABLE_SQL_DIALECT);
91+
Statement statement = connection.createStatement()) {
92+
showBuiltInModelTest(statement);
93+
}
94+
}
95+
96+
/**
 * Executes {@code SHOW MODELS} and checks the result against {@code BUILTIN_MODEL_MAP}: the
 * header must be {@code ModelId,ModelType,Category,State}, every returned row must correspond to
 * an entry of the map and equal it, and the number of rows must equal the size of the map.
 *
 * @param statement an open statement on which the query is executed; it is not closed here
 * @throws SQLException if executing the query or reading the result set fails
 */
private void showBuiltInModelTest(Statement statement) throws SQLException {
97+
int builtInModelCount = 0;
98+
final String showSql = "SHOW MODELS";
99+
try (ResultSet resultSet = statement.executeQuery(showSql)) {
100+
ResultSetMetaData resultSetMetaData = resultSet.getMetaData();
101+
checkHeader(resultSetMetaData, "ModelId,ModelType,Category,State");
102+
while (resultSet.next()) {
103+
// Every row is counted, so the final size comparison catches missing or duplicated models.
builtInModelCount++;
104+
// Columns 1-4 are read in the header order checked above.
FakeModelInfo modelInfo =
105+
new FakeModelInfo(
106+
resultSet.getString(1),
107+
resultSet.getString(2),
108+
resultSet.getString(3),
109+
resultSet.getString(4));
110+
// A model id that is not in the expected map fails here, before the equality check.
assertTrue(BUILTIN_MODEL_MAP.containsKey(modelInfo.getModelId()));
111+
assertEquals(BUILTIN_MODEL_MAP.get(modelInfo.getModelId()), modelInfo);
112+
}
113+
}
114+
// The row count must match the number of expected built-in models.
assertEquals(BUILTIN_MODEL_MAP.size(), builtInModelCount);
115+
}
116+
}

integration-test/src/test/java/org/apache/iotdb/ainode/it/AINodeClusterConfigIT.java

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121

2222
import org.apache.iotdb.it.env.EnvFactory;
2323
import org.apache.iotdb.it.framework.IoTDBTestRunner;
24-
import org.apache.iotdb.itbase.category.AIClusterIT;
24+
import org.apache.iotdb.itbase.category.AINodeIT;
2525
import org.apache.iotdb.itbase.env.BaseEnv;
2626

2727
import org.junit.AfterClass;
@@ -40,7 +40,7 @@
4040
import static org.junit.Assert.assertEquals;
4141

4242
@RunWith(IoTDBTestRunner.class)
43-
@Category({AIClusterIT.class})
43+
@Category({AINodeIT.class})
4444
public class AINodeClusterConfigIT {
4545

4646
@BeforeClass

integration-test/src/test/java/org/apache/iotdb/ainode/it/AINodeConcurrentForecastIT.java

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,10 +75,15 @@ private static void prepareDataForTableModel() throws SQLException {
7575
statement.execute("CREATE DATABASE root");
7676
statement.execute("CREATE TABLE root.AI (s DOUBLE FIELD)");
7777
for (int i = 0; i < 2880; i++) {
78-
statement.execute(
78+
statement.addBatch(
7979
String.format(
8080
"INSERT INTO root.AI(time, s) VALUES(%d, %f)", i, Math.sin(i * Math.PI / 1440)));
81+
if ((i + 1) % 500 == 0) {
82+
statement.executeBatch();
83+
statement.clearBatch();
84+
}
8185
}
86+
statement.executeBatch();
8287
}
8388
}
8489

@@ -101,7 +106,9 @@ public void concurrentGPUForecastTest(AINodeTestUtils.FakeModelInfo modelInfo, S
101106
String.format(
102107
FORECAST_TABLE_FUNCTION_SQL_TEMPLATE, modelInfo.getModelId(), forecastLength);
103108
final int threadCnt = 10;
104-
final int loop = 100;
109+
// PR CI keeps a concurrency smoke check; nightly/daily can dial this up if regressions
110+
// appear.
111+
final int loop = 10;
105112
statement.execute(
106113
String.format("LOAD MODEL %s TO DEVICES '%s'", modelInfo.getModelId(), devices));
107114
checkModelOnSpecifiedDevice(statement, modelInfo.getModelId(), devices);

0 commit comments

Comments
 (0)