Skip to content

Commit 7aa3ccf

Browse files
adam360xsystems-assistant[bot]
authored and committed
[SWDEV-548460] Add RDC Policy Reset Message
* [SWDEV-548460] Add RDC Policy Reset Message * [rdc] Bump version to 1.3.0 Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com> * chore: [rdc] Format CMakeLists.txt Signed-off-by: Galantsev, Dmitrii <dmitrii.galantsev@amd.com> [rocm-systems] ROCm/rocm-systems#2180 (commit 5bf6e36)
1 parent 3cfe951 commit 7aa3ccf

10 files changed

Lines changed: 37 additions & 15 deletions

File tree

CMakeLists.txt

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,13 @@
2323
#
2424
cmake_minimum_required(VERSION 3.15)
2525

26-
set( COMP_TYPE "runtime" )
27-
set( BUILD_ENABLE_LINTIAN_OVERRIDES ON CACHE BOOL "Enable/Disable Lintian Overrides" )
28-
set( BUILD_DEBIAN_PKGING_FLAG ON CACHE BOOL "Internal Status Flag to indicate Debian Packaging Build" )
26+
set(COMP_TYPE "runtime")
27+
set(BUILD_ENABLE_LINTIAN_OVERRIDES ON CACHE BOOL "Enable/Disable Lintian Overrides")
28+
set(BUILD_DEBIAN_PKGING_FLAG
29+
ON
30+
CACHE BOOL
31+
"Internal Status Flag to indicate Debian Packaging Build"
32+
)
2933
set(RDC "rdc" CACHE INTERNAL "")
3034
set(RDC_PACKAGE ${RDC} CACHE STRING "")
3135

@@ -89,7 +93,7 @@ include(utils)
8993
set(PKG_VERSION_GIT_TAG_PREFIX "rdc_pkg_ver")
9094
# Provide git to utilities
9195
find_program(GIT NAMES git)
92-
get_version_from_tag("1.2.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
96+
get_version_from_tag("1.3.0" ${PKG_VERSION_GIT_TAG_PREFIX} GIT)
9397
# VERSION_* variables should be set by get_version_from_tag
9498
message("Package version: ${VERSION_STRING}")
9599

@@ -167,8 +171,8 @@ set(CMAKE_LIBRARY_PATH ${CMAKE_LIBRARY_PATH} /usr/lib64 /usr/lib/x86_64-linux-gn
167171

168172
# configure packaging
169173
# cpack version is populated with CMAKE_PROJECT_VERSION implicitly
170-
set(PKG_MAINTAINER_NM "RDC Support")
171-
set(PKG_MAINTAINER_EMAIL "rdc.support@amd.com")
174+
set(PKG_MAINTAINER_NM "RDC Support")
175+
set(PKG_MAINTAINER_EMAIL "rdc.support@amd.com")
172176
set(CPACK_PACKAGE_NAME ${RDC_PACKAGE} CACHE INTERNAL "")
173177
set(CPACK_PACKAGE_VENDOR "Advanced Micro Devices, Inc." CACHE STRING "")
174178
set(CPACK_PACKAGE_CONTACT "${PKG_MAINTAINER_NM} <${PKG_MAINTAINER_EMAIL}>")
@@ -565,7 +569,7 @@ set(CPACK_RPM_RUNTIME_POST_UNINSTALL_SCRIPT_FILE "${CMAKE_CURRENT_SOURCE_DIR}/RP
565569
set(CPACK_DEBIAN_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}")
566570
set(CPACK_RPM_RUNTIME_PACKAGE_NAME "${CPACK_PACKAGE_NAME}")
567571

568-
configure_pkg( ${RDC} ${COMP_TYPE} ${CPACK_PACKAGE_VERSION} ${PKG_MAINTAINER_NM} ${PKG_MAINTAINER_EMAIL} )
572+
configure_pkg( ${RDC} ${COMP_TYPE} ${CPACK_PACKAGE_VERSION} ${PKG_MAINTAINER_NM} ${PKG_MAINTAINER_EMAIL})
569573

570574
include(CPack)
571575

include/rdc/rdc.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1537,6 +1537,8 @@ typedef struct {
15371537
rdc_policy_condition_t condition; //!< the condition that is met
15381538
rdc_gpu_group_t group_id; //!< The group id that triggered this callback
15391539
int64_t value; //!< The current value that meets the condition
1540+
uint32_t gpu_index; //!< GPU index that hit the condition
1541+
bool reset_triggered; //!< if reset was attempted
15401542
} rdc_policy_callback_response_t;
15411543

15421544
/**

protos/rdc.proto

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -659,6 +659,8 @@ message RegisterPolicyResponse {
659659
PolicyCondition condition =3;
660660
uint32 group_id =4;
661661
uint64 value=5;
662+
uint32 gpu_index=6;
663+
bool reset_triggered = 7;
662664
}
663665

664666
message UnRegisterPolicyResult {

rdc_libs/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ find_program(GIT NAMES git)
5858

5959
# Debian package specific variables
6060
# Set a default value for the package version
61-
get_version_from_tag("1.2.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
61+
get_version_from_tag("1.3.0.0" ${SO_VERSION_GIT_TAG_PREFIX} GIT)
6262

6363
# VERSION_* variables should be set by get_version_from_tag
6464
set(SO_VERSION_STRING "${VERSION_MAJOR}.${VERSION_MINOR}")

rdc_libs/rdc/src/RdcPolicyImpl.cc

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -218,14 +218,17 @@ void RdcPolicyImpl::rdc_policy_check_condition() {
218218
status = metric_fetcher_->fetch_smi_field(gpu_index, map[policy.condition.type], &value);
219219
if (status == RDC_ST_OK) {
220220
if (value.value.l_int > policy.condition.value) {
221+
222+
bool reset = RDC_POLICY_ACTION_GPU_RESET == policy.action;
223+
221224
// callback if needed
222225
if (callback) {
223226
rdc_policy_callback_response_t response = {1, policy.condition, group_id,
224-
value.value.l_int};
227+
value.value.l_int, gpu_index, reset};
225228
callback(&response);
226229
}
227230

228-
if (RDC_POLICY_ACTION_GPU_RESET == policy.action) {
231+
if (reset) {
229232
rdc_policy_gpu_reset(gpu_index);
230233
}
231234
}

rdc_libs/rdc_client/src/RdcStandaloneHandler.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -918,6 +918,8 @@ rdc_status_t RdcStandaloneHandler::rdc_policy_register(rdc_gpu_group_t group_id,
918918
response.condition.value = cond.value();
919919
response.group_id = reply.group_id();
920920
response.value = reply.value();
921+
response.gpu_index = reply.gpu_index();
922+
response.reset_triggered = reply.reset_triggered();
921923

922924
callback(&response);
923925
}

rdci/src/RdciPolicySubSystem.cc

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -195,8 +195,16 @@ int rdc_policy_callback(rdc_policy_callback_response_t* userData) {
195195
threshold /= 1000000;
196196
}
197197

198-
std::cout << "A " << condition_type_to_str(userData->condition.type) << " exceeds the threshold "
199-
<< threshold << " with the value " << value << std::endl;
198+
std::cout << "A " << condition_type_to_str(userData->condition.type)
199+
<< " exceeds the threshold " << threshold
200+
<< " with the value " << value;
201+
202+
if (userData->reset_triggered) {
203+
std::cout << " and triggered reset on GPU " << userData->gpu_index;
204+
}
205+
206+
std::cout << std::endl;
207+
200208
last_time = now; // update the last time
201209
return 0;
202210
}

rdci/src/rdci.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ THE SOFTWARE.
4141
#include "rdc_lib/rdc_common.h"
4242

4343
#define RDC_CLIENT_VERSION_MAJOR 1
44-
#define RDC_CLIENT_VERSION_MINOR 2
44+
#define RDC_CLIENT_VERSION_MINOR 3
4545
#define RDC_CLIENT_VERSION_RELEASE 0
4646

4747
#define RDC_CLIENT_VERSION_CREATE_STRING(MAJOR, MINOR, RELEASE) (#MAJOR "." #MINOR "." #RELEASE)

server/include/rdc/rdc_server_main.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ THE SOFTWARE.
3131
#include "rdc/rdc_api_service.h"
3232

3333
#define RDC_SERVER_VERSION_MAJOR 1
34-
#define RDC_SERVER_VERSION_MINOR 2
34+
#define RDC_SERVER_VERSION_MINOR 3
3535
#define RDC_SERVER_VERSION_RELEASE 0
3636

3737
#define RDC_SERVER_VERSION_CREATE_STRING(MAJOR, MINOR, RELEASE) (#MAJOR "." #MINOR "." #RELEASE)

server/src/rdc_api_service.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -931,7 +931,8 @@ ::grpc::Status RdcAPIServiceImpl::RegisterPolicy(
931931
::rdc::PolicyCondition* cond = reply.mutable_condition();
932932
cond->set_type(static_cast<::rdc::PolicyCondition_Type>(ctx->response.condition.type));
933933
cond->set_value(ctx->response.condition.value);
934-
934+
reply.set_gpu_index(ctx->response.gpu_index);
935+
reply.set_reset_triggered(ctx->response.reset_triggered != 0);
935936
writer->Write(reply);
936937
}
937938

0 commit comments

Comments
 (0)