Skip to content

Commit 953a2f6

Browse files
authored
Add max_retries and cooldown timer to send_hep (#3627)
* Add logic to prevent an unreachable HEP destination on any TCP connection from crashing OpenSIPS * Move the pkg_free calls into a separate function
1 parent fe8642d commit 953a2f6

4 files changed

Lines changed: 115 additions & 15 deletions

File tree

modules/proto_hep/README

Lines changed: 31 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,34 @@ modparam("proto_hep", "hep_async_max_postponed_chunks", 16)
246246
modparam("proto_hep", "hep_capture_id", 234)
247247
...
248248

249-
1.3.10. hep_async_local_connect_timeout (integer)
249+
1.3.10. hep_retry_cooldown (integer)
250+
251+
This parameter defines how many seconds OpenSIPS should wait
252+
before retrying a TCP connection to the HEP destination after
253+
reaching the maximum number of failed attempts set by
254+
hep_max_retries. Limitation: 16-bit integer.
255+
256+
Default value is "3600".
257+
258+
Example 1.10. Set hep_retry_cooldown parameter
259+
...
260+
modparam("proto_hep", "hep_retry_cooldown", 60)
261+
...
262+
263+
1.3.11. hep_max_retries (integer)
264+
265+
This parameter defines the maximum number of attempts
266+
OpenSIPS will make to establish a TCP connection with
267+
the HEP destination. Limitation: 16-bit integer.
268+
269+
Default value is "5".
270+
271+
Example 1.11. Set hep_max_retries parameter
272+
...
273+
modparam("proto_hep", "hep_max_retries", 10)
274+
...
275+
276+
1.3.12. hep_async_local_connect_timeout (integer)
250277

251278
If hep_async is enabled, this specifies the number of
252279
milliseconds that a connect will be tried in blocking mode
@@ -256,12 +283,12 @@ modparam("proto_hep", "hep_capture_id", 234)
256283

257284
Default value is 100 ms.
258285

259-
Example 1.10. Set hep_async_local_connect_timeout parameter
286+
Example 1.12. Set hep_async_local_connect_timeout parameter
260287
...
261288
modparam("proto_hep", "hep_async_local_connect_timeout", 200)
262289
...
263290

264-
1.3.11. hep_async_local_write_timeout (integer)
291+
1.3.13. hep_async_local_write_timeout (integer)
265292

266293
If hep_async is enabled, this specifies the number of
267294
milliseconds that a write op will be tried in blocking mode
@@ -271,7 +298,7 @@ modparam("proto_hep", "hep_async_local_connect_timeout", 200)
271298

272299
Default value is 10 ms.
273300

274-
Example 1.11. Set hep_async_local_write_timeout parameter
301+
Example 1.13. Set hep_async_local_write_timeout parameter
275302
...
276303
modparam("proto_hep", "hep_async_local_write_timeout", 100)
277304
...

modules/proto_hep/doc/proto_hep_admin.xml

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -283,6 +283,48 @@ modparam("proto_hep", "hep_capture_id", 234)
283283
</example>
284284
</section>
285285

286+
<section id="param_hep_retry_cooldown" xreflabel="hep_retry_cooldown">
287+
<title><varname>hep_retry_cooldown</varname> (integer)</title>
288+
<para>
289+
This parameter defines how many seconds OpenSIPS should wait before retrying a TCP connection to the HEP destination after reaching the maximum number of failed attempts set by hep_max_retries.
290+
Limitation: 16-bit integer.
291+
</para>
292+
<para>
293+
<emphasis>
294+
Default value is "3600".
295+
</emphasis>
296+
</para>
297+
<example>
298+
<title>Set <varname>hep_retry_cooldown</varname> parameter</title>
299+
<programlisting format="linespecific">
300+
...
301+
modparam("proto_hep", "hep_retry_cooldown", 60)
302+
...
303+
</programlisting>
304+
</example>
305+
</section>
306+
307+
<section id="param_hep_max_retries" xreflabel="hep_max_retries">
308+
<title><varname>hep_max_retries</varname> (integer)</title>
309+
<para>
310+
This parameter defines the maximum number of attempts OpenSIPS will make to establish a TCP connection with the HEP destination.
311+
Limitation: 16-bit integer.
312+
</para>
313+
<para>
314+
<emphasis>
315+
Default value is "5".
316+
</emphasis>
317+
</para>
318+
<example>
319+
<title>Set <varname>hep_max_retries</varname> parameter</title>
320+
<programlisting format="linespecific">
321+
...
322+
modparam("proto_hep", "hep_max_retries", 10)
323+
...
324+
</programlisting>
325+
</example>
326+
</section>
327+
286328
<section id="param_hep_async_local_connect_timeout" xreflabel="hep_async_local_connect_timeout">
287329
<title><varname>hep_async_local_connect_timeout</varname> (integer)</title>
288330
<para>

modules/proto_hep/hep.c

Lines changed: 40 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,8 @@
5151
#define HEP_PROTO_SIP 0x01
5252

5353
static int control_id = -1;
54+
static int hep_failed_retries = 0;
55+
static time_t hep_last_attempt = 0;
5456

5557
struct hep_message_id {
5658
char* proto;
@@ -71,6 +73,8 @@ static hid_list_p hid_list=NULL;
7173
static hid_list_p *hid_dyn_list=NULL;
7274
static gen_lock_t *hid_dyn_lock=NULL;
7375

76+
extern int hep_max_retries;
77+
extern int hep_retry_cooldown;
7478
extern int hep_capture_id;
7579
extern int payload_compression;
7680
extern int homer5_on;
@@ -1677,6 +1681,20 @@ int add_hep_payload(trace_message message, char* pld_name, str* pld_value)
16771681
return 0;
16781682
}
16791683

1684+
/*
 * Release the temporaries used while sending a HEP message.
 *
 * p   - proxy; when non-NULL it is passed to free_proxy() and then
 *       to pkg_free()
 * to  - destination address; passed to pkg_free() when non-NULL
 * buf - message buffer; passed to pkg_free() when non-NULL
 *
 * Any argument may be NULL, in which case it is skipped, so callers can
 * use this on error paths where only some of the resources exist yet.
 */
static void free_hep_send_resources(struct proxy_l *p, union sockaddr_union *to, char *buf) {
1685+
if (p) {
1686+
free_proxy(p);
1687+
pkg_free(p);
1688+
}
1689+
if (to) {
1690+
pkg_free(to);
1691+
}
1692+
if (buf) {
1693+
pkg_free(buf);
1694+
}
1695+
}
1696+
1697+
16801698

16811699
int send_hep_message(trace_message message, trace_dest dest, const struct socket_info* send_sock)
16821700
{
@@ -1710,35 +1728,46 @@ int send_hep_message(trace_message message, trace_dest dest, const struct socket
17101728
/* */
17111729
p=mk_proxy( &hep_dest->ip, hep_dest->port_no ? hep_dest->port_no : HEP_PORT, hep_dest->transport, 0);
17121730
if (p == NULL) {
1713-
pkg_free(buf);
17141731
LM_ERR("bad hep host name!\n");
1732+
free_hep_send_resources(NULL, NULL, buf);
17151733
goto end;
17161734
}
17171735

17181736
to=(union sockaddr_union *)pkg_malloc(sizeof(union sockaddr_union));
17191737
if (to == 0) {
17201738
LM_ERR("no more pkg mem!\n");
1721-
pkg_free(buf);
1722-
free_proxy(p);
1723-
pkg_free(p);
1739+
free_hep_send_resources(p, NULL, buf);
17241740
goto end;
17251741
}
17261742

17271743
hostent2su(to, &p->host, p->addr_idx, p->port?p->port:HEP_PORT);
17281744

1745+
time_t now = time(NULL);
1746+
1747+
// Check cooldown logic
1748+
if (hep_failed_retries >= hep_max_retries && (now - hep_last_attempt) < hep_retry_cooldown) {
1749+
LM_ERR("HEP send suppressed: too many failures (%d), in cooldown (%ld seconds left)\n", hep_failed_retries, hep_retry_cooldown - (now - hep_last_attempt));
1750+
free_hep_send_resources(p, to, buf);
1751+
goto end;
1752+
}
1753+
1754+
hep_last_attempt = now;
1755+
17291756
do {
17301757
if (msg_send(send_sock, hep_dest->transport, to, 0, buf, len, NULL) < 0) {
1731-
LM_ERR("Cannot send hep message!\n");
1758+
LM_ERR("Cannot send HEP message!\n");
1759+
hep_failed_retries++;
17321760
continue;
17331761
}
1734-
ret=0;
1762+
1763+
// Success: reset retry state
1764+
hep_failed_retries = 0;
1765+
ret = 0;
17351766
break;
1736-
} while ( get_next_su( p, to, 0)==0);
1767+
} while (get_next_su(p, to, 0) == 0);
17371768

1738-
free_proxy(p);
1739-
pkg_free(p);
1740-
pkg_free(to);
1741-
pkg_free(buf);
1769+
1770+
free_hep_send_resources(p, to, buf);
17421771

17431772
end:
17441773
return ret;

modules/proto_hep/proto_hep.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@ static int hep_tls_async_handshake_connect_timeout = 10;
9292
int hep_ctx_idx = 0;
9393
int hep_capture_id = 1;
9494
int payload_compression = 0;
95+
int hep_max_retries = 5;
96+
int hep_retry_cooldown = 3600; //seconds
9597

9698
int homer5_on = 1;
9799
str homer5_delim = {":", 0};

0 commit comments

Comments
 (0)