Skip to content

Commit eb42f07

Browse files
authored
Merge pull request #477 from mrrobot47/fix/rclone-backup-oom-buffer-size
fix: cap rclone backup/restore memory to prevent OOM kills
2 parents ae6786e + 42cc62d commit eb42f07

1 file changed

Lines changed: 199 additions & 14 deletions

File tree

src/helper/Site_Backup_Restore.php

Lines changed: 199 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,11 +1269,84 @@ private function get_remote_path( $upload = true ) {
12691269
}
12701270

12711271

/**
 * Report how much memory is currently available, in MB.
 *
 * Runs `free -m` with `LC_ALL=C` so that the `Mem:` row label and the column
 * headers are not translated. The "available" column is found by its header
 * name instead of a hard-coded field number (the position varies between
 * `free`/procps versions); when the header row has no "available" column the
 * fourth field of the `Mem:` row (the "free" column) is used instead.
 *
 * @return int Available memory in MB. 0 when the command produces no numeric
 *             output, in which case the resource helpers fall back to their
 *             safe minimums.
 */
private function get_available_ram_mb() {
	// Header row: remember the index of the "available" title, shifted by one
	// because the `Mem:` data row starts with its label. Data row: print that
	// column, or field 4 when no "available" header was seen.
	$awk_program = 'NR==1{for(i=1;i<=NF;i++) if($i=="available") c=i+1} /^Mem:/{print (c ? $c : $4)}';
	$result      = EE::launch( "LC_ALL=C free -m | awk '" . $awk_program . "'" );

	return (int) $result->stdout;
}
1288+
12721289
private function rclone_download( $path ) {
12731290
$cpu_cores = intval( EE::launch( 'nproc' )->stdout );
1274-
$multi_threads = min( intval( $cpu_cores ) * 2, 32 );
1275-
$command = sprintf( "rclone copy -P --multi-thread-streams %d %s %s", $multi_threads, escapeshellarg( $this->get_remote_path( false ) ), escapeshellarg( $path ) );
1276-
$output = EE::launch( $command );
1291+
$available_ram = $this->get_available_ram_mb();
1292+
1293+
// Derive the memory-safe transfer count and the total RAM budget from the
1294+
// shared helper (is_s3 = false: downloads allocate no S3 multipart-upload
1295+
// buffers). The budget is reused below rather than recomputed.
1296+
$res = $this->compute_rclone_resources( $cpu_cores, $available_ram, false );
1297+
$transfers = $res['transfers'];
1298+
$budget = $res['budget']; // MB; available_ram * rclone-mem-fraction.
1299+
$max_buffer = $res['max_buffer']; // MB.
1300+
1301+
// Per concurrent --transfers a download holds one --buffer-size read-ahead
1302+
// buffer plus, for files above rclone's --multi-thread-cutoff,
1303+
// --multi-thread-streams streams -- each with a
1304+
// --multi-thread-write-buffer-size buffer and one in-flight
1305+
// --multi-thread-chunk-size range. The whole footprint must fit ONE budget:
1306+
//
1307+
// transfers * ( buffer_size + multi_thread_streams * per_stream_mem ) <= budget
1308+
//
1309+
// so each transfer's share of the budget is split between the read-ahead
1310+
// buffer (up to half, capped at $max_buffer) and the multi-thread streams.
1311+
// Both scale with available RAM; on a tight budget streams floor at 1. The
1312+
// previous code budgeted the streams against the full budget independently
1313+
// of the read-ahead buffers, so the two pools could together reach ~2x the
1314+
// intended fraction and still OOM during restore/rollback.
1315+
$mt_write_buffer = max( 1, intval( get_config_value( 'rclone-mt-write-buffer-size', 128 ) ) ); // KiB; rclone default.
1316+
$mt_chunk_size = max( 1, intval( get_config_value( 'rclone-mt-chunk-size', 64 ) ) ); // MB; rclone default.
1317+
$per_stream_mem = ( $mt_write_buffer / 1024 ) + $mt_chunk_size; // MB.
1318+
1319+
// Reduce transfers until each transfer's share of the budget can hold the
1320+
// minimum read-ahead buffer plus at least one stream, so the combined
1321+
// footprint stays within budget whenever the budget allows it at all.
1322+
$min_per_transfer = 16 + (int) ceil( $per_stream_mem );
1323+
while ( $transfers > 1 && intval( floor( $budget / $transfers ) ) < $min_per_transfer ) {
1324+
$transfers--;
1325+
}
1326+
$per_transfer = max( $min_per_transfer, intval( floor( $budget / $transfers ) ) );
1327+
1328+
// Give the read-ahead buffer up to half the share (capped at $max_buffer)
1329+
// but always leave room for at least one stream; spend the rest on streams.
1330+
$buffer_mb = min( intval( floor( $per_transfer / 2 ) ), $max_buffer, $per_transfer - (int) ceil( $per_stream_mem ) );
1331+
$buffer_mb = max( 16, $buffer_mb );
1332+
$stream_budget = $per_transfer - $buffer_mb;
1333+
$mt_streams = max( 1, min( $cpu_cores * 2, 32, intval( floor( $stream_budget / $per_stream_mem ) ) ) );
1334+
$buffer_size = $buffer_mb . 'M';
1335+
1336+
EE::debug( sprintf(
1337+
'rclone download tuning: available_ram=%dMB budget=%dMB transfers=%d buffer-size=%s multi-thread-streams=%d mt-write-buffer=%dKi mt-chunk-size=%dM (est. peak ~%dMB)',
1338+
$available_ram,
1339+
$budget,
1340+
$transfers,
1341+
$buffer_size,
1342+
$mt_streams,
1343+
$mt_write_buffer,
1344+
$mt_chunk_size,
1345+
(int) ( $transfers * ( $buffer_mb + $mt_streams * $per_stream_mem ) )
1346+
) );
1347+
1348+
$command = sprintf( "rclone copy -P --transfers %d --buffer-size %s --multi-thread-streams %d --multi-thread-write-buffer-size %dKi --multi-thread-chunk-size %dM %s %s", $transfers, $buffer_size, $mt_streams, $mt_write_buffer, $mt_chunk_size, escapeshellarg( $this->get_remote_path( false ) ), escapeshellarg( $path ) );
1349+
$output = EE::launch( $command );
12771350

12781351
if ( $output->return_code ) {
12791352
EE::error( 'Error downloading backup from remote storage.' );
@@ -1283,24 +1356,56 @@ private function rclone_download( $path ) {
12831356
}
12841357

12851358

1286-
private function rclone_upload( $path ) {
1287-
$cpu_cores = intval( EE::launch( 'nproc' )->stdout );
1288-
$ram = intval( EE::launch( "free -m | grep Mem | awk '{print $7}'" )->stdout );
1289-
$transfers = max( 2, min( intval( $cpu_cores / 2 ), 4 ) );
1290-
$max_buffer_size = 4096;
/**
 * Tell whether the rclone remote configured for backups is an S3 backend.
 *
 * The remote name is taken from the `rclone-path` config value (the part
 * before the first `:`), so remotes not named `easyengine` are handled too.
 * The backend is identified by an exact comparison of the remote's `type`
 * value with `s3`; a plain substring search over the `rclone config show`
 * output would also match unrelated lines that merely contain `s3`.
 * S3-compatible providers (AWS, Spaces, Wasabi, MinIO, ...) are all configured
 * with `type = s3`, so the exact match covers them.
 *
 * @return bool True when the remote's `type` is exactly `s3`.
 */
private function is_s3_remote() {
	// Everything before the first ':' of `rclone-path` is the remote name.
	list( $remote_name ) = explode( ':', get_config_value( 'rclone-path', 'easyengine:easyengine' ), 2 );

	// awk: on the first `type = ...` line, strip all whitespace from the value,
	// print it and stop reading.
	$awk_program = '/^[[:space:]]*type[[:space:]]*=/ {gsub(/[[:space:]]/, "", $2); print $2; exit}';
	$shell_line  = 'rclone config show ' . escapeshellarg( $remote_name ) . " | awk -F '=' '" . $awk_program . "'";
	$backend     = trim( EE::launch( $shell_line )->stdout );

	return 's3' === $backend;
}
12911381

1382+
private function rclone_upload( $path ) {
1383+
$cpu_cores = intval( EE::launch( 'nproc' )->stdout );
1384+
$available_ram = $this->get_available_ram_mb();
12921385

1293-
$buffer_size = min( floor( $ram / $transfers ), $max_buffer_size ) . 'M';
1386+
// Detect S3 backends, which require additional multipart-upload tuning.
1387+
$is_s3 = $this->is_s3_remote();
12941388

1389+
$res = $this->compute_rclone_resources( $cpu_cores, $available_ram, $is_s3 );
1390+
$transfers = $res['transfers'];
1391+
$buffer_size = $res['buffer_size'] . 'M';
12951392

1296-
$command = 'rclone config show easyengine | grep type';
1297-
$output = EE::launch( $command )->stdout;
12981393
$s3_flag = '';
1299-
1300-
if ( strpos( $output, 's3' ) !== false ) {
1301-
$s3_flag = ' --s3-chunk-size=64M --s3-upload-concurrency ' . min( intval( $cpu_cores ) * 2, 32 );
1394+
if ( $is_s3 ) {
1395+
$s3_flag = sprintf( ' --s3-chunk-size=%dM --s3-upload-concurrency %d', $res['s3_chunk_size'], $res['s3_concurrency'] );
13021396
}
13031397

1398+
EE::debug( sprintf(
1399+
'rclone upload tuning: available_ram=%dMB transfers=%d buffer-size=%s s3=%s s3-chunk-size=%dM s3-upload-concurrency=%d (est. peak ~%dMB)',
1400+
$available_ram,
1401+
$transfers,
1402+
$buffer_size,
1403+
$is_s3 ? 'yes' : 'no',
1404+
$res['s3_chunk_size'],
1405+
$res['s3_concurrency'],
1406+
$transfers * ( $res['buffer_size'] + ( $res['s3_chunk_size'] * $res['s3_concurrency'] ) )
1407+
) );
1408+
13041409
$command = sprintf( "rclone copy -P %s --transfers %d --checkers %d --buffer-size %s %s %s", $s3_flag, $transfers, $transfers, $buffer_size, escapeshellarg( $path ), escapeshellarg( $this->get_remote_path() ) );
13051410
$output = EE::launch( $command );
13061411

@@ -1331,6 +1436,86 @@ private function rclone_upload( $path ) {
13311436
}
13321437
}
13331438

/**
 * Work out rclone transfer settings that keep its buffers inside a RAM budget.
 *
 * Used by both upload and download. The memory model is: each concurrent
 * `--transfers` slot holds one `--buffer-size` read-ahead buffer and, for S3
 * uploads, a further `--s3-chunk-size * --s3-upload-concurrency` of multipart
 * buffers, so an upload needs roughly
 *
 *     transfers * ( buffer_size + s3_chunk_size * s3_upload_concurrency )
 *
 * Earlier code used `buffer_size = available_ram / transfers`, letting the
 * read-ahead buffers alone claim about all available memory (for example
 * `--buffer-size 1328M --transfers 2` on a 4 GB host), which regularly ended
 * in OOM kills during backups. Here the total is limited to a fraction of the
 * currently-available memory, while parallelism and buffer size still grow on
 * bigger hosts.
 *
 * `budget` and `max_buffer` are returned as well so that a caller allocating
 * additional buffer pools (such as `rclone_download()` when it sizes
 * `--multi-thread-streams`) can draw from the same budget.
 *
 * Global config knobs: `rclone-mem-fraction` (default 0.5, clamped to
 * 0.1..0.9) and `rclone-max-buffer-size` in MB (default 256, never below 16).
 *
 * @param int  $cpu_cores     Number of CPU cores (nproc); values below 1 count as 1.
 * @param int  $available_ram Currently available memory in MB; negatives count as 0.
 * @param bool $is_s3         Whether the remote is an S3 backend.
 *
 * @return array{transfers:int,buffer_size:int,s3_concurrency:int,s3_chunk_size:int,budget:int,max_buffer:int}
 *               Sizes are in MB; the two `s3_*` entries are 0 for non-S3 remotes.
 */
private function compute_rclone_resources( $cpu_cores, $available_ram, $is_s3 ) {
	$cores  = max( 1, intval( $cpu_cores ) );
	$ram_mb = max( 0, intval( $available_ram ) );

	// Share of *available* RAM handed to rclone, kept within 10%..90% so a
	// backup never starves the host (MariaDB, PHP-FPM, nginx) or itself.
	$fraction = floatval( get_config_value( 'rclone-mem-fraction', 0.5 ) );
	$fraction = max( 0.1, min( 0.9, $fraction ) );

	// 16 MB is rclone's default --buffer-size; it is the lower bound here.
	$floor_buffer   = 16;
	$ceiling_buffer = max( $floor_buffer, intval( get_config_value( 'rclone-max-buffer-size', 256 ) ) );

	// Total MB available to transfer and multipart buffers.
	$budget = (int) floor( $ram_mb * $fraction );

	// Starting parallelism: follows the core count within fixed bounds.
	$transfers = max( 2, min( $cores, 8 ) );
	if ( $is_s3 ) {
		$chunk_mb    = 64;
		$concurrency = max( 2, min( $cores * 2, 32 ) );
	} else {
		$chunk_mb    = 0;
		$concurrency = 0;
	}

	// Smallest possible footprint (every read-ahead buffer at its minimum) for
	// a given number of transfer slots and multipart parts.
	$minimum_footprint = static function ( $slots, $parts ) use ( $floor_buffer, $chunk_mb ) {
		return $slots * ( $floor_buffer + $chunk_mb * $parts );
	};

	// Over budget even at minimum buffers: halve S3 multipart concurrency first
	// (the largest lever at 64 MB per chunk), then drop transfer slots. Either
	// may end at 1 on very tight budgets. For non-S3 remotes concurrency is 0,
	// so the first loop never runs.
	while ( $concurrency > 1 && $minimum_footprint( $transfers, $concurrency ) > $budget ) {
		$concurrency = max( 1, (int) ( $concurrency / 2 ) );
	}
	while ( $transfers > 1 && $minimum_footprint( $transfers, $concurrency ) > $budget ) {
		--$transfers;
	}

	// What is left of each slot's share after the multipart buffers goes to
	// the read-ahead buffer, clamped to [$floor_buffer, $ceiling_buffer].
	$share      = (int) floor( $budget / $transfers );
	$read_ahead = $share - $chunk_mb * $concurrency;
	$read_ahead = min( $ceiling_buffer, max( $floor_buffer, $read_ahead ) );

	return [
		'transfers'      => $transfers,
		'buffer_size'    => $read_ahead,
		's3_concurrency' => $concurrency,
		's3_chunk_size'  => $chunk_mb,
		'budget'         => $budget,
		'max_buffer'     => $ceiling_buffer,
	];
}
1518+
13341519
/**
13351520
* Delete old backups from remote storage after successful upload.
13361521
* Keeps only the configured number of most recent backups.

0 commit comments

Comments
 (0)