Commit 1afbf7c

[FIX] fs_attachment: paginate autovacuum GC loop to bound worker memory
``FsFileGC._gc_files_unsafe`` loaded the entire backlog of orphan files into a single Python list (via ``array_agg(store_fname)`` grouped by storage) and iterated ``fs.rm`` over all of them in one shot. With the Azure Blob backend (``adlfs``; the same class of issue applies to ``s3fs`` and other fsspec-based clients) and tens of thousands of queued orphans, each HEAD+DELETE pair held onto response buffers and connection-pool state inside the SDK that was only released when the worker exited.

In production (Odoo.sh, 30k+ orphans, Azure Blob backend) every autovacuum run hit Odoo's ``limit_memory_hard`` and received ``SIGKILL`` at around minute 17: 60k+ blob requests per run, zero ``DELETE`` committed, the queue never drained, and the next worker re-ran the same failing loop. 14 kills were observed in a single 24 h window.

Fix: paginate both the ``SELECT`` and the ``fs.rm`` loop per storage, in batches of ``_GC_BATCH_SIZE = 500``, with an explicit ``gc.collect()`` between batches to reclaim the SDK's buffered state. The caller ``_gc_files`` still holds the ``SHARE`` lock on ``fs_file_gc`` / ``ir_attachment`` and performs the final commit, so consistency guarantees and transactional semantics are unchanged.

Signed-off-by: TecnologiaIG <tecnologia@intensegroupgt.com>
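The batching pattern the fix applies can be sketched outside Odoo. Below is a minimal, runnable illustration with ``sqlite3`` standing in for PostgreSQL and a plain dict standing in for the fsspec filesystem; ``drain_orphans``, ``fake_fs`` and ``BATCH_SIZE`` are illustrative names, not part of the module:

```python
import gc
import sqlite3

BATCH_SIZE = 500  # mirrors _GC_BATCH_SIZE in the patch


def drain_orphans(cr: sqlite3.Connection, fake_fs: dict, code: str) -> int:
    """Remove queued orphan files for one storage, one page at a time.

    Only BATCH_SIZE file names are ever held in memory, and rows are
    dequeued only after their blob is actually removed, as in the
    patched loop.
    """
    removed = 0
    while True:
        rows = cr.execute(
            "SELECT store_fname FROM fs_file_gc"
            " WHERE fs_storage_code = ? LIMIT ?",
            (code, BATCH_SIZE),
        ).fetchall()
        if not rows:
            break
        deleted = []
        for (store_fname,) in rows:
            # store_fname looks like "<code>://<path>"
            path = store_fname.partition("://")[2]
            if fake_fs.pop(path, None) is not None:
                deleted.append(store_fname)
        if not deleted:
            break  # nothing removable left in this page; avoid spinning
        cr.executemany(
            "DELETE FROM fs_file_gc WHERE store_fname = ?",
            [(name,) for name in deleted],
        )
        removed += len(deleted)
        gc.collect()  # drop per-batch buffers before fetching the next page
    return removed
```

The key property is that peak memory is proportional to ``BATCH_SIZE``, not to the backlog size, which is what keeps the worker under ``limit_memory_hard``.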
1 parent a482807 commit 1afbf7c

3 files changed

Lines changed: 69 additions & 40 deletions


fs_attachment/__manifest__.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 {
     "name": "Base Attachment Object Store",
     "summary": "Store attachments on external object store",
-    "version": "16.0.2.0.1",
+    "version": "16.0.2.0.3",
     "author": "Camptocamp, ACSONE SA/NV, Odoo Community Association (OCA)",
     "license": "AGPL-3",
     "development_status": "Beta",

fs_attachment/models/fs_file_gc.py

Lines changed: 52 additions & 39 deletions
@@ -1,5 +1,6 @@
 # Copyright 2023 ACSONE SA/NV
 # License AGPL-3.0 or later (https://www.gnu.org/licenses/agpl).
+import gc
 import logging
 import threading
 from contextlib import closing, contextmanager
@@ -118,51 +119,63 @@ def _gc_files(self) -> None:
         # commit to release the lock
         cr.commit()  # pylint: disable=invalid-commit
 
+    # Max orphan files processed per batch. Bounds the per-run memory
+    # footprint of the autovacuum job when many orphans are queued:
+    # fsspec backend clients (adlfs, s3fs, ...) keep response buffers
+    # and connection-pool state alive until their referents are
+    # collected. Loading tens of thousands of fnames into a single
+    # Python list and iterating ``fs.rm`` in one pass was enough to
+    # hit Odoo's ``limit_memory_hard`` on production workers.
+    _GC_BATCH_SIZE = 500
+
     def _gc_files_unsafe(self) -> None:
         # get the list of fs.storage codes that must be autovacuumed
         codes = (
             self.env["fs.storage"].search([]).filtered("autovacuum_gc").mapped("code")
         )
         if not codes:
             return
-        # we process by batch of storage codes.
-        self._cr.execute(
-            """
-            SELECT
-                fs_storage_code,
-                array_agg(store_fname)
-
-            FROM
-                fs_file_gc
-            WHERE
-                fs_storage_code IN %s
-                AND NOT EXISTS (
-                    SELECT 1
-                    FROM ir_attachment
-                    WHERE store_fname = fs_file_gc.store_fname
-                )
-            GROUP BY
-                fs_storage_code
-            """,
-            (tuple(codes),),
-        )
-        for code, store_fnames in self._cr.fetchall():
+        # Process one storage at a time and paginate both the SELECT and
+        # the fs.rm loop so neither the Python list of file names nor
+        # the storage SDK's response buffers can grow unbounded.
+        for code in codes:
             self.env["fs.storage"].get_by_code(code)
             fs = self.env["fs.storage"].get_fs_by_code(code)
-            for store_fname in store_fnames:
-                try:
-                    file_path = store_fname.partition("://")[2]
-                    fs.rm(file_path)
-                except Exception:
-                    _logger.debug("Failed to remove file %s", store_fname)
-
-        # delete the records from the table fs_file_gc
-        self._cr.execute(
-            """
-            DELETE FROM
-                fs_file_gc
-            WHERE
-                fs_storage_code IN %s
-            """,
-            (tuple(codes),),
-        )
+            while True:
+                self._cr.execute(
+                    """
+                    SELECT store_fname
+                    FROM fs_file_gc
+                    WHERE fs_storage_code = %s
+                        AND NOT EXISTS (
+                            SELECT 1
+                            FROM ir_attachment
+                            WHERE store_fname = fs_file_gc.store_fname
+                        )
+                    LIMIT %s
+                    """,
+                    (code, self._GC_BATCH_SIZE),
+                )
+                rows = self._cr.fetchall()
+                if not rows:
+                    break
+                deleted = []
+                for (store_fname,) in rows:
+                    try:
+                        file_path = store_fname.partition("://")[2]
+                        fs.rm(file_path)
+                        deleted.append(store_fname)
+                    except Exception:
+                        _logger.debug("Failed to remove file %s", store_fname)
+                if deleted:
+                    self._cr.execute(
+                        "DELETE FROM fs_file_gc WHERE store_fname = ANY(%s)",
+                        (deleted,),
+                    )
+                # Force a collection between batches to reclaim response
+                # buffers and connection objects held by the storage SDK
+                # that would otherwise only be freed on worker exit. Do
+                # NOT commit here: the caller (_gc_files) holds a SHARE
+                # lock on fs_file_gc and ir_attachment for consistency
+                # and commits at the end.
+                gc.collect()
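One detail of the new DELETE worth noting: psycopg2 adapts a Python list to a PostgreSQL array, so ``store_fname = ANY(%s)`` dequeues the whole batch in a single round trip. The same one-statement batch delete can be sketched on ``sqlite3``, which lacks ``ANY()``, by expanding an ``IN`` list of placeholders instead; ``delete_batch`` is an illustrative name, not from the module:

```python
import sqlite3


def delete_batch(cr: sqlite3.Connection, fnames: list) -> int:
    """Dequeue a batch of gc rows in one statement.

    With psycopg2/PostgreSQL the equivalent is
    ``DELETE FROM fs_file_gc WHERE store_fname = ANY(%s)`` with the
    Python list passed directly as the parameter; sqlite3 has no
    ANY(), so an IN (...) placeholder list is built instead.
    """
    if not fnames:
        return 0
    marks = ",".join("?" * len(fnames))
    cur = cr.execute(
        "DELETE FROM fs_file_gc WHERE store_fname IN (%s)" % marks,
        fnames,
    )
    return cur.rowcount
```

Either form keeps the number of DELETE statements per batch at one, instead of one per file name.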
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+Paginate the autovacuum GC loop to bound worker memory.
+
+``FsFileGC._gc_files_unsafe`` used to load the entire backlog of orphan
+files into a single Python list via ``array_agg(store_fname)`` and iterate
+``fs.rm`` over all of them in one pass. With the Azure Blob backend and
+tens of thousands of orphans queued, each HEAD+DELETE pair retained
+response buffers and connection-pool state inside the adlfs client that
+was only released when the worker exited. The autovacuum cron hit Odoo's
+``limit_memory_hard`` and got ``SIGKILL``'d mid-run every time, so the
+queue never drained and the next worker ran the same failing loop.
+
+The SELECT and the ``fs.rm`` loop are now paginated in batches of 500 per
+storage, with an explicit ``gc.collect()`` between batches. The caller
+(``_gc_files``) still holds the ``SHARE`` lock and performs the final
+commit, so the consistency guarantees and transactional semantics are
+unchanged.
