Skip to content

Commit cd9ba6a

Browse files
committed
fix(java): improve socket lifecycle handling for connect/send/close (#11760)
* fix(eBPF): Use MSG_NOSIGNAL in send() to avoid SIGPIPE on Linux

* fix(java): serialize socket fd close and send paths

  Split close_files() into a locked wrapper and a non-locking close_files_locked() helper to ensure fd close operations share the same mutex with send paths.

  Update send_msg() to use close_files_locked() on send failure to avoid recursive locking when already holding g_df_lock.

  Move perf_map_socket_fd validity checks in df_send_symbol() inside the critical section to eliminate races between fd close and send operations during re-attach or error handling.

  This change only tightens fd lifecycle synchronization and does not alter JVMTI event lifecycle or replay semantics.

* Add `close()` when `connect()` fails.

* Adjust comments
1 parent 3db6360 commit cd9ba6a

2 files changed

Lines changed: 38 additions & 21 deletions

File tree

agent/src/ebpf/Makefile

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -102,7 +102,7 @@ OBJS := user/elf.o \
102102
user/profile/java/collect_symbol_files.o
103103

104104
JAVA_TOOL := deepflow-jattach
105-
JAVA_AGENT_VERSION := 4
105+
JAVA_AGENT_VERSION := 4.1
106106
JAVA_AGENT_GNU_SO := df_java_agent_v$(JAVA_AGENT_VERSION).so
107107
JAVA_AGENT_MUSL_SO := df_java_agent_musl_v$(JAVA_AGENT_VERSION).so
108108
JAVA_AGENT_SO := $(JAVA_AGENT_GNU_SO) $(JAVA_AGENT_MUSL_SO)

agent/src/ebpf/user/profile/java/symbol_collect_agent.c

Lines changed: 37 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ int perf_map_log_socket_fd = -1;
6868
// Cache symbols for batch sending
6969
char g_symbol_buffer[STRING_BUFFER_SIZE * 4];
7070
int g_cached_bytes;
71+
static jint close_files_locked(void);
7172
jint close_files(void);
7273

7374
#define _(e) \
@@ -94,19 +95,23 @@ inline int send_msg(int sock_fd, const char *buf, size_t len)
9495
int n = 0; // Initialize n
9596

9697
do {
97-
n = send(sock_fd, buf + send_bytes, len - send_bytes, 0);
98+
/*
99+
* Note: To avoid SIGPIPE signal (which terminates the process), use
100+
* MSG_NOSIGNAL flag in send() call.
101+
*/
102+
n = send(sock_fd, buf + send_bytes, len - send_bytes, MSG_NOSIGNAL);
98103
if (n == -1) {
99104
if (errno == EINTR || errno == EAGAIN
100105
|| errno == EWOULDBLOCK) {
101106
// Retry on interrupt or temporary failure
102107
continue;
103108
} else {
104-
close_files(); // Example function call, define as needed
109+
close_files_locked(); // Example function call, define as needed
105110
break;
106111
}
107112
} else if (n == 0) {
108113
// Connection closed by peer
109-
close_files(); // Example function call, define as needed
114+
close_files_locked(); // Example function call, define as needed
110115
break;
111116
}
112117

@@ -126,11 +131,9 @@ jint df_open_socket(const char *path, int *ptr)
126131

127132
/*
128133
* The reason for setting non-blocking mode:
129-
* 1 To prevent Java threads from being blocked.
130-
* 2 When attempts to write data to a closed writing port of a pipe or
131-
* socket, the operating system detects this situation and sends the
132-
* SIGPIPE signal to Java process, which causes the program to exit.
133-
* Use non-blocking mode to avoid this issue.
134+
* 1 To prevent Java threads from being blocked when writing to socket.
135+
* 2 Non-blocking mode allows send() to fail with EAGAIN/EWOULDBLOCK
136+
* instead of blocking, enabling graceful error handling.
134137
*/
135138
int flags = fcntl(s, F_GETFL, 0);
136139
if (flags == -1) {
@@ -148,6 +151,7 @@ jint df_open_socket(const char *path, int *ptr)
148151
strncpy(remote.sun_path, path, UNIX_PATH_MAX - 1);
149152
int len = sizeof(remote.sun_family) + strlen(remote.sun_path);
150153
if (connect(s, (struct sockaddr *)&remote, len) == -1) {
154+
close(s);
151155
fprintf(stderr, "Call connect() failed: errno(%d)\n", errno);
152156
return JNI_ERR;
153157
}
@@ -215,21 +219,32 @@ jint df_agent_config(char *opts)
215219
return JNI_OK;
216220
}
217221

218-
jint close_files(void)
222+
static jint close_files_locked(void)
219223
{
220-
if (perf_map_socket_fd > 0) {
221-
close(perf_map_socket_fd);
222-
perf_map_socket_fd = -1;
223-
}
224+
int perf_fd = perf_map_socket_fd;
225+
int log_fd = perf_map_log_socket_fd;
226+
227+
perf_map_socket_fd = -1;
228+
perf_map_log_socket_fd = -1;
224229

225-
if (perf_map_log_socket_fd > 0) {
226-
close(perf_map_log_socket_fd);
227-
perf_map_log_socket_fd = -1;
230+
if (perf_fd > 0) {
231+
close(perf_fd);
232+
}
233+
if (log_fd > 0) {
234+
close(log_fd);
228235
}
229236

230237
return JNI_OK;
231238
}
232239

240+
jint close_files(void)
241+
{
242+
pthread_mutex_lock(&g_df_lock);
243+
close_files_locked();
244+
pthread_mutex_unlock(&g_df_lock);
245+
return JNI_OK;
246+
}
247+
233248
JNIEXPORT uint64_t df_java_agent_so_libs_test(void)
234249
{
235250
/*
@@ -302,13 +317,16 @@ void deallocate(jvmtiEnv * jvmti, void *string)
302317
void df_send_symbol(enum event_type type, const void *code_addr,
303318
unsigned int code_size, const char *entry)
304319
{
320+
int send_bytes;
321+
struct symbol_metadata *meta;
322+
char symbol_str[STRING_BUFFER_SIZE];
323+
324+
pthread_mutex_lock(&g_df_lock);
305325
if (perf_map_socket_fd < 0) {
326+
pthread_mutex_unlock(&g_df_lock);
306327
return;
307328
}
308329

309-
int send_bytes;
310-
struct symbol_metadata *meta;
311-
char symbol_str[STRING_BUFFER_SIZE];
312330
if (type == METHOD_UNLOAD) {
313331
snprintf(symbol_str + sizeof(*meta),
314332
sizeof(symbol_str) - sizeof(*meta), "%lx",
@@ -322,7 +340,6 @@ void df_send_symbol(enum event_type type, const void *code_addr,
322340
meta->len = strlen(symbol_str + sizeof(*meta));
323341
meta->type = type;
324342
send_bytes = meta->len + sizeof(*meta);
325-
pthread_mutex_lock(&g_df_lock);
326343
if (replay_finish) {
327344
send_msg(perf_map_socket_fd, symbol_str, send_bytes);
328345
} else {

0 commit comments

Comments
 (0)