Skip to content

Commit a3e805e

Browse files
authored
Merge pull request #7272 from gchq/7239-check-k8s-pods-deleted
Issue 7239 - Check Kubernetes pods/jobs are deleted after a bulk import
2 parents 9b2a4fd + f5568a5 commit a3e805e

21 files changed

Lines changed: 495 additions & 25 deletions

File tree

NOTICES

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -315,6 +315,10 @@ ASM (org.ow2.asm:asm:9.*)
315315

316316
- The 3-Clause BSD License
317317

318+
Fabric8 (io.fabric8:kubernetes-*:6.*)
319+
320+
- Apache License, Version 2.0
321+
318322
Docker Java (com.github.docker-java:docker-java-*:3.*)
319323

320324
- Apache License, Version 2.0

docs/usage/properties/instance/cdk/bulk_import.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,5 +23,7 @@ The following instance properties relate to bulk import, i.e. ingesting data usi
2323
| sleeper.bulk.import.eks.job.queue.url | The URL of the queue for bulk import jobs using EKS. |
2424
| sleeper.bulk.import.eks.job.queue.arn | The ARN of the queue for bulk import jobs using EKS. |
2525
| sleeper.bulk.import.eks.statemachine.arn | The ARN of the state machine for bulk import jobs using EKS. |
26-
| sleeper.bulk.import.eks.k8s.namespace | The namespace ID of the bulk import cluster using EKS. |
27-
| sleeper.bulk.import.eks.k8s.endpoint | The endpoint of the bulk import cluster using EKS. |
26+
| sleeper.bulk.import.eks.k8s.namespace | The ID of the Kubernetes namespace where Spark jobs will run for bulk import. |
27+
| sleeper.bulk.import.eks.k8s.cluster.name | The name of the EKS cluster for bulk import. |
28+
| sleeper.bulk.import.eks.k8s.endpoint | The endpoint of the EKS cluster for bulk import. |
29+
| sleeper.bulk.import.eks.k8s.ca.data | The certificate authority data of the EKS cluster for bulk import. |

java/core/src/main/java/sleeper/core/properties/instance/CdkDefinedInstanceProperty.java

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -787,12 +787,22 @@ static List<CdkDefinedInstanceProperty> getAllInGroup(PropertyGroup group) {
787787
.build();
788788
CdkDefinedInstanceProperty BULK_IMPORT_EKS_NAMESPACE = Index
789789
.propertyBuilder("sleeper.bulk.import.eks.k8s.namespace")
790-
.description("The namespace ID of the bulk import cluster using EKS.")
790+
.description("The ID of the Kubernetes namespace where Spark jobs will run for bulk import.")
791+
.propertyGroup(InstancePropertyGroup.BULK_IMPORT)
792+
.build();
793+
CdkDefinedInstanceProperty BULK_IMPORT_EKS_CLUSTER_NAME = Index
794+
.propertyBuilder("sleeper.bulk.import.eks.k8s.cluster.name")
795+
.description("The name of the EKS cluster for bulk import.")
791796
.propertyGroup(InstancePropertyGroup.BULK_IMPORT)
792797
.build();
793798
CdkDefinedInstanceProperty BULK_IMPORT_EKS_CLUSTER_ENDPOINT = Index
794799
.propertyBuilder("sleeper.bulk.import.eks.k8s.endpoint")
795-
.description("The endpoint of the bulk import cluster using EKS.")
800+
.description("The endpoint of the EKS cluster for bulk import.")
801+
.propertyGroup(InstancePropertyGroup.BULK_IMPORT)
802+
.build();
803+
CdkDefinedInstanceProperty BULK_IMPORT_EKS_CLUSTER_CA_DATA = Index
804+
.propertyBuilder("sleeper.bulk.import.eks.k8s.ca.data")
805+
.description("The certificate authority data of the EKS cluster for bulk import.")
796806
.propertyGroup(InstancePropertyGroup.BULK_IMPORT)
797807
.build();
798808

java/deployment/cdk/src/main/java/sleeper/cdk/stack/bulkimport/EksBulkImportStack.java

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
import sleeper.cdk.lambda.SleeperLambdaCode;
5757
import sleeper.cdk.stack.SleeperCoreStacks;
5858
import sleeper.cdk.stack.core.LoggingStack.LogGroupRef;
59+
import sleeper.cdk.stack.core.ManagedPoliciesStack;
5960
import sleeper.cdk.util.Utils;
6061
import sleeper.core.deploy.DockerDeployment;
6162
import sleeper.core.deploy.LambdaHandler;
@@ -146,6 +147,8 @@ public EksBulkImportStack(
146147
.build();
147148

148149
instanceProperties.set(CdkDefinedInstanceProperty.BULK_IMPORT_EKS_CLUSTER_ENDPOINT, bulkImportCluster.getClusterEndpoint());
150+
instanceProperties.set(CdkDefinedInstanceProperty.BULK_IMPORT_EKS_CLUSTER_CA_DATA, bulkImportCluster.getClusterCertificateAuthorityData());
151+
instanceProperties.set(CdkDefinedInstanceProperty.BULK_IMPORT_EKS_CLUSTER_NAME, bulkImportCluster.getClusterName());
149152

150153
KubernetesManifest namespace = createNamespace(bulkImportCluster, uniqueBulkImportId);
151154
instanceProperties.set(CdkDefinedInstanceProperty.BULK_IMPORT_EKS_NAMESPACE, uniqueBulkImportId);
@@ -289,6 +292,9 @@ private KubernetesManifest createNamespace(Cluster cluster, String namespaceName
289292
}
290293

291294
private void addClusterAdminRoles(Cluster cluster, InstanceProperties properties) {
295+
296+
cluster.getAwsAuth().addMastersRole(Role.fromRoleName(this, "ClusterAccessForInstanceAdmin", ManagedPoliciesStack.getAdminRoleName(properties)));
297+
292298
List<String> roles = properties.getList(EKS_CLUSTER_ADMIN_ROLES);
293299
if (roles == null) {
294300
return;

java/deployment/cdk/src/main/java/sleeper/cdk/stack/core/ManagedPoliciesStack.java

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,7 @@ private static List<IBucket> addIngestSourceBucketReferences(Construct scope, In
155155
private Role createAdminRole() {
156156
Role role = Role.Builder.create(this, "AdminRole")
157157
.assumedBy(new AccountRootPrincipal())
158-
.roleName("sleeper-admin-" + instanceProperties.cleanInstanceId())
158+
.roleName(getAdminRoleName(instanceProperties))
159159
.build();
160160

161161
instanceAdminPolicies().forEach(policy -> policy.attachToRole(role));
@@ -165,6 +165,10 @@ private Role createAdminRole() {
165165
return role;
166166
}
167167

168+
public static String getAdminRoleName(InstanceProperties instanceProperties) {
169+
return "sleeper-admin-" + instanceProperties.cleanInstanceId();
170+
}
171+
168172
private Stream<ManagedPolicy> instanceAdminPolicies() {
169173
return Stream.of(
170174
directIngestPolicy, ingestByQueuePolicy, queryPolicy,

java/pom.xml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,8 @@
227227
<docker-java.version>3.7.1</docker-java.version>
228228
<jib.version>0.28.1</jib.version>
229229
<asm.version>9.10.1</asm.version>
230+
<!-- We match the version of Fabric8 used by Spark -->
231+
<fabric8.version>6.7.2</fabric8.version>
230232
<!-- Testing -->
231233
<junit.version>5.14.4</junit.version>
232234
<junit.platform.version>1.14.4</junit.platform.version>
@@ -1069,6 +1071,11 @@
10691071
<artifactId>logging-interceptor</artifactId>
10701072
<version>${okhttp3.version}</version>
10711073
</dependency>
1074+
<dependency>
1075+
<groupId>com.squareup.okhttp3</groupId>
1076+
<artifactId>mockwebserver</artifactId>
1077+
<version>${okhttp3.version}</version>
1078+
</dependency>
10721079
<dependency>
10731080
<groupId>org.jetbrains.kotlin</groupId>
10741081
<artifactId>kotlin-stdlib-jdk8</artifactId>
@@ -1214,6 +1221,21 @@
12141221
<artifactId>jib-core</artifactId>
12151222
<version>${jib.version}</version>
12161223
</dependency>
1224+
<dependency>
1225+
<groupId>io.fabric8</groupId>
1226+
<artifactId>kubernetes-client</artifactId>
1227+
<version>${fabric8.version}</version>
1228+
</dependency>
1229+
<dependency>
1230+
<groupId>io.fabric8</groupId>
1231+
<artifactId>kubernetes-client-api</artifactId>
1232+
<version>${fabric8.version}</version>
1233+
</dependency>
1234+
<dependency>
1235+
<groupId>io.fabric8</groupId>
1236+
<artifactId>kubernetes-server-mock</artifactId>
1237+
<version>${fabric8.version}</version>
1238+
</dependency>
12171239
<dependency>
12181240
<groupId>org.ow2.asm</groupId>
12191241
<artifactId>asm</artifactId>

java/system-test/system-test-drivers/pom.xml

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@
3838
<groupId>software.amazon.awssdk</groupId>
3939
<artifactId>apache-client</artifactId>
4040
</dependency>
41+
<dependency>
42+
<groupId>io.fabric8</groupId>
43+
<artifactId>kubernetes-client-api</artifactId>
44+
</dependency>
45+
<dependency>
46+
<groupId>io.fabric8</groupId>
47+
<artifactId>kubernetes-client</artifactId>
48+
<scope>runtime</scope>
49+
</dependency>
4150
<!-- Sleeper dependencies -->
4251
<dependency>
4352
<groupId>sleeper</groupId>
@@ -95,6 +104,11 @@
95104
<artifactId>json-unit-assertj</artifactId>
96105
<scope>test</scope>
97106
</dependency>
107+
<dependency>
108+
<groupId>io.fabric8</groupId>
109+
<artifactId>kubernetes-server-mock</artifactId>
110+
<scope>test</scope>
111+
</dependency>
98112
<dependency>
99113
<groupId>sleeper</groupId>
100114
<artifactId>localstack-test</artifactId>
@@ -115,6 +129,41 @@
115129
<scope>test</scope>
116130
<type>test-jar</type>
117131
</dependency>
132+
<dependency>
133+
<groupId>sleeper</groupId>
134+
<artifactId>sketches</artifactId>
135+
<version>${project.parent.version}</version>
136+
<type>test-jar</type>
137+
<scope>test</scope>
138+
</dependency>
139+
<dependency>
140+
<groupId>sleeper</groupId>
141+
<artifactId>ingest-core</artifactId>
142+
<version>${project.parent.version}</version>
143+
<type>test-jar</type>
144+
<scope>test</scope>
145+
</dependency>
146+
<dependency>
147+
<groupId>sleeper</groupId>
148+
<artifactId>ingest-runner</artifactId>
149+
<version>${project.parent.version}</version>
150+
<type>test-jar</type>
151+
<scope>test</scope>
152+
</dependency>
153+
<dependency>
154+
<groupId>sleeper</groupId>
155+
<artifactId>ingest-batcher-core</artifactId>
156+
<version>${project.parent.version}</version>
157+
<type>test-jar</type>
158+
<scope>test</scope>
159+
</dependency>
160+
<dependency>
161+
<groupId>sleeper</groupId>
162+
<artifactId>query-core</artifactId>
163+
<version>${project.parent.version}</version>
164+
<type>test-jar</type>
165+
<scope>test</scope>
166+
</dependency>
118167
</dependencies>
119168

120169
<dependencyManagement>
@@ -139,6 +188,25 @@
139188
<groupId>org.apache.maven.plugins</groupId>
140189
<artifactId>maven-shade-plugin</artifactId>
141190
</plugin>
191+
<plugin>
192+
<groupId>org.apache.maven.plugins</groupId>
193+
<artifactId>maven-dependency-plugin</artifactId>
194+
<executions>
195+
<execution>
196+
<id>analyze</id>
197+
<configuration>
198+
<!-- Maven doesn't seem to have a good way to get transitive dependencies of test jars. -->
199+
<ignoredUnusedDeclaredDependencies combine.children="append">
200+
<ignoredUnusedDeclaredDependency>sleeper:sketches:test-jar:</ignoredUnusedDeclaredDependency>
201+
<ignoredUnusedDeclaredDependency>sleeper:ingest-core:test-jar:</ignoredUnusedDeclaredDependency>
202+
<ignoredUnusedDeclaredDependency>sleeper:ingest-runner:test-jar:</ignoredUnusedDeclaredDependency>
203+
<ignoredUnusedDeclaredDependency>sleeper:ingest-batcher-core:test-jar:</ignoredUnusedDeclaredDependency>
204+
<ignoredUnusedDeclaredDependency>sleeper:query-core:test-jar:</ignoredUnusedDeclaredDependency>
205+
</ignoredUnusedDeclaredDependencies>
206+
</configuration>
207+
</execution>
208+
</executions>
209+
</plugin>
142210
</plugins>
143211
</build>
144212
</project>

java/system-test/system-test-drivers/src/main/java/sleeper/systemtest/drivers/statemachine/AwsEksBulkImportDriver.java renamed to java/system-test/system-test-drivers/src/main/java/sleeper/systemtest/drivers/ingest/AwsEksBulkImportDriver.java

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,19 +14,28 @@
1414
* limitations under the License.
1515
*/
1616

17-
package sleeper.systemtest.drivers.statemachine;
17+
package sleeper.systemtest.drivers.ingest;
1818

19+
import io.fabric8.kubernetes.api.model.Pod;
20+
import io.fabric8.kubernetes.api.model.PodList;
21+
import io.fabric8.kubernetes.api.model.batch.v1.Job;
22+
import io.fabric8.kubernetes.api.model.batch.v1.JobList;
23+
import io.fabric8.kubernetes.client.KubernetesClient;
24+
import org.slf4j.Logger;
25+
import org.slf4j.LoggerFactory;
1926
import software.amazon.awssdk.services.sfn.SfnClient;
2027
import software.amazon.awssdk.services.sfn.model.DescribeExecutionResponse;
2128

2229
import sleeper.bulkimport.core.statemachine.DeriveJobExecutionName;
30+
import sleeper.core.properties.instance.InstanceProperties;
2331
import sleeper.systemtest.drivers.util.SystemTestClients;
2432
import sleeper.systemtest.dsl.ingest.EksBulkImportDriver;
2533
import sleeper.systemtest.dsl.ingest.SentIngestJobsContext;
2634
import sleeper.systemtest.dsl.instance.SystemTestInstanceContext;
2735

2836
import java.util.List;
2937

38+
import static sleeper.core.properties.instance.CdkDefinedInstanceProperty.BULK_IMPORT_EKS_NAMESPACE;
3039
import static sleeper.core.properties.instance.CdkDefinedInstanceProperty.BULK_IMPORT_EKS_STATE_MACHINE_ARN;
3140
import static sleeper.core.properties.table.TableProperty.TABLE_ID;
3241

@@ -35,14 +44,22 @@
3544
* table IDs using DeriveJobExecutionName, then calls the Step Functions API to retrieve each execution's status.
3645
*/
3746
public class AwsEksBulkImportDriver implements EksBulkImportDriver {
47+
private static final Logger LOGGER = LoggerFactory.getLogger(AwsEksBulkImportDriver.class);
48+
3849
private final SystemTestInstanceContext instance;
3950
private final SentIngestJobsContext sentJobs;
4051
private final SfnClient sfnClient;
52+
private final KubernetesClientProvider k8sProvider;
4153

4254
public AwsEksBulkImportDriver(SystemTestInstanceContext instance, SentIngestJobsContext sentJobs, SystemTestClients clients) {
55+
this(instance, sentJobs, clients.getSfn(), clients::createKubernetesClient);
56+
}
57+
58+
public AwsEksBulkImportDriver(SystemTestInstanceContext instance, SentIngestJobsContext sentJobs, SfnClient sfnClient, KubernetesClientProvider k8sProvider) {
4359
this.instance = instance;
4460
this.sentJobs = sentJobs;
45-
this.sfnClient = clients.getSfn();
61+
this.sfnClient = sfnClient;
62+
this.k8sProvider = k8sProvider;
4663
}
4764

4865
@Override
@@ -54,7 +71,36 @@ public List<String> getExecutionStatuses() {
5471
String executionName = DeriveJobExecutionName.jobExecutionName(tableId, jobId);
5572
String executionArn = stateMachineArn.replace(":stateMachine:", ":execution:") + ":" + executionName;
5673
DescribeExecutionResponse response = sfnClient.describeExecution(req -> req.executionArn(executionArn));
74+
LOGGER.info("Found execution for job {}: {}", jobId, response);
75+
if (response.error() != null) {
76+
LOGGER.info("Error: {}", response.error());
77+
LOGGER.info("Cause: {}", response.cause());
78+
}
5779
return response.statusAsString();
5880
}).toList();
5981
}
82+
83+
@Override
84+
public List<String> getPods() {
85+
InstanceProperties properties = instance.getInstanceProperties();
86+
PodList list = k8sProvider.getClient(properties).pods()
87+
.inNamespace(properties.get(BULK_IMPORT_EKS_NAMESPACE))
88+
.list();
89+
LOGGER.info("Found pods in Spark namespace: {}", list);
90+
return list.getItems().stream().map(Pod::toString).toList();
91+
}
92+
93+
@Override
94+
public List<String> getJobs() {
95+
InstanceProperties properties = instance.getInstanceProperties();
96+
JobList list = k8sProvider.getClient(properties).batch().v1().jobs()
97+
.inNamespace(properties.get(BULK_IMPORT_EKS_NAMESPACE))
98+
.list();
99+
LOGGER.info("Found jobs in Spark namespace: {}", list);
100+
return list.getItems().stream().map(Job::toString).toList();
101+
}
102+
103+
public interface KubernetesClientProvider {
104+
KubernetesClient getClient(InstanceProperties instanceProperties);
105+
}
60106
}

java/system-test/system-test-drivers/src/main/java/sleeper/systemtest/drivers/util/AwsSystemTestDrivers.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import sleeper.systemtest.drivers.gc.AwsGarbageCollectionDriver;
2424
import sleeper.systemtest.drivers.ingest.AwsDataGenerationTasksDriver;
2525
import sleeper.systemtest.drivers.ingest.AwsDirectIngestDriver;
26+
import sleeper.systemtest.drivers.ingest.AwsEksBulkImportDriver;
2627
import sleeper.systemtest.drivers.ingest.AwsIngestBatcherDriver;
2728
import sleeper.systemtest.drivers.ingest.AwsIngestByQueueDriver;
2829
import sleeper.systemtest.drivers.ingest.AwsIngestReportsDriver;
@@ -48,7 +49,6 @@
4849
import sleeper.systemtest.drivers.query.WebSocketQueryDriver;
4950
import sleeper.systemtest.drivers.sourcedata.AwsGeneratedIngestSourceFilesDriver;
5051
import sleeper.systemtest.drivers.sourcedata.AwsIngestSourceFilesDriver;
51-
import sleeper.systemtest.drivers.statemachine.AwsEksBulkImportDriver;
5252
import sleeper.systemtest.drivers.statestore.AwsSnapshotsDriver;
5353
import sleeper.systemtest.drivers.statestore.AwsStateStoreCommitterDriver;
5454
import sleeper.systemtest.drivers.statestore.AwsStateStoreCommitterLogsDriver;

0 commit comments

Comments
 (0)