|
1 | 1 | # Based on https://github.com/huggingface/diffusers/commits/main/examples/dreambooth/train_dreambooth.py |
2 | | -# Synced to commit c42f6ee43e0408c5fe8a1d3dc3cdeb9eb3a02fa6 on 2023-06-14 |
| 2 | +# Synced to commit b9feed87958c27074b0618cc543696c05f58e2c9 on 2023-07-12 |
3 | 3 |
|
4 | 4 | # Reasons for not using that file directly: |
5 | 5 | # |
|
29 | 29 | import logging |
30 | 30 | import math |
31 | 31 | import os |
| 32 | +import shutil |
32 | 33 | import warnings |
33 | 34 | from pathlib import Path |
34 | 35 |
|
@@ -257,7 +258,7 @@ def TrainDreamBooth(model_id: str, pipeline, model_inputs, call_inputs, send_opt |
257 | 258 | import wandb |
258 | 259 |
|
259 | 260 | # Will error if the minimal version of diffusers is not installed. Remove at your own risks. |
260 | | -check_min_version("0.17.0") |
| 261 | +check_min_version("0.19.0.dev0") |
261 | 262 |
|
262 | 263 | logger = get_logger(__name__) |
263 | 264 |
|
@@ -653,9 +654,7 @@ def main(args, init_pipeline, send_opts): |
653 | 654 | logging_dir = Path(args.output_dir, args.logging_dir) |
654 | 655 |
|
655 | 656 | accelerator_project_config = ProjectConfiguration( |
656 | | - total_limit=args.checkpoints_total_limit, |
657 | | - project_dir=args.output_dir, |
658 | | - logging_dir=logging_dir, |
| 657 | + project_dir=args.output_dir, logging_dir=logging_dir |
659 | 658 | ) |
660 | 659 |
|
661 | 660 | accelerator = Accelerator( |
@@ -1055,8 +1054,8 @@ def compute_text_embeddings(prompt): |
1055 | 1054 | unet, optimizer, train_dataloader, lr_scheduler |
1056 | 1055 | ) |
1057 | 1056 |
|
1058 | | - # For mixed precision training we cast the text_encoder and vae weights to half-precision |
1059 | | - # as these models are only used for inference, keeping weights in full precision is not required. |
| 1057 | + # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
| 1058 | + # as these weights are only used for inference, keeping weights in full precision is not required. |
1060 | 1059 | weight_dtype = torch.float32 |
1061 | 1060 | if accelerator.mixed_precision == "fp16": |
1062 | 1061 | weight_dtype = torch.float16 |
@@ -1277,14 +1276,46 @@ def compute_text_embeddings(prompt): |
1277 | 1276 | global_step += 1 |
1278 | 1277 |
|
1279 | 1278 | if accelerator.is_main_process: |
1280 | | - images = [] |
1281 | 1279 | if global_step % args.checkpointing_steps == 0: |
| 1280 | + # _before_ saving state, check if this save would set us over the `checkpoints_total_limit` |
| 1281 | + if args.checkpoints_total_limit is not None: |
| 1282 | + checkpoints = os.listdir(args.output_dir) |
| 1283 | + checkpoints = [ |
| 1284 | + d for d in checkpoints if d.startswith("checkpoint") |
| 1285 | + ] |
| 1286 | + checkpoints = sorted( |
| 1287 | + checkpoints, key=lambda x: int(x.split("-")[1]) |
| 1288 | + ) |
| 1289 | + |
| 1290 | + # before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints |
| 1291 | + if len(checkpoints) >= args.checkpoints_total_limit: |
| 1292 | + num_to_remove = ( |
| 1293 | + len(checkpoints) - args.checkpoints_total_limit + 1 |
| 1294 | + ) |
| 1295 | + removing_checkpoints = checkpoints[0:num_to_remove] |
| 1296 | + |
| 1297 | + logger.info( |
| 1298 | + f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints" |
| 1299 | + ) |
| 1300 | + logger.info( |
| 1301 | + f"removing checkpoints: {', '.join(removing_checkpoints)}" |
| 1302 | + ) |
| 1303 | + |
| 1304 | + for removing_checkpoint in removing_checkpoints: |
| 1305 | + removing_checkpoint = os.path.join( |
| 1306 | + args.output_dir, removing_checkpoint |
| 1307 | + ) |
| 1308 | + shutil.rmtree(removing_checkpoint) |
| 1309 | + |
1282 | 1310 | save_path = os.path.join( |
1283 | 1311 | args.output_dir, f"checkpoint-{global_step}" |
1284 | 1312 | ) |
1285 | 1313 | pipeline.save_pretrained(save_path) |
1286 | 1314 | accelerator.save_state(save_path) |
1287 | 1315 | logger.info(f"Saved state to {save_path}") |
| 1316 | + |
| 1317 | + images = [] |
| 1318 | + |
1288 | 1319 | if ( |
1289 | 1320 | args.validation_prompt is not None |
1290 | 1321 | and global_step % args.validation_steps == 0 |
|
0 commit comments