ControlNet
diff --git a/‎.gitignore‎
Lines changed: 6 additions & 1 deletion b/‎.gitignore‎
Lines changed: 6 additions & 1 deletion
diff --git a/‎examples/batfd/README.md‎
Lines changed: 42 additions & 0 deletions b/‎examples/batfd/README.md‎
Lines changed: 42 additions & 0 deletions
diff --git a/‎examples/batfd/batfd.toml‎
Lines changed: 35 additions & 0 deletions b/‎examples/batfd/batfd.toml‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎examples/batfd/batfd/__init__.py‎ b/‎examples/batfd/batfd/__init__.py‎
diff --git a/‎examples/batfd/batfd/inference.py‎
Lines changed: 129 additions & 0 deletions b/‎examples/batfd/batfd/inference.py‎
Lines changed: 129 additions & 0 deletions
diff --git a/‎examples/batfd/batfd/model/__init__.py‎
Lines changed: 2 additions & 0 deletions b/‎examples/batfd/batfd/model/__init__.py‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎examples/batfd/batfd/model/audio_encoder.py‎
Lines changed: 137 additions & 0 deletions b/‎examples/batfd/batfd/model/audio_encoder.py‎
Lines changed: 137 additions & 0 deletions
@@ -441,4 +441,9 @@ docs/_build/
 # Pyenv
 .python-version
 
-lightning_logs
+/examples/batfd/lightning_logs
+/examples/batfd/ckpt
+/examples/batfd/output
+/examples/xception/lightning_logs
+/examples/xception/ckpt
+/examples/xception/output
@@ -0,0 +1,42 @@
+# BA-TFD
+
+This example trains the BA-TFD / BA-TFD+ model on the AVDeepfake1M/AVDeepfake1M++ dataset to temporally localize manipulated segments in each video.
+
+## Requirements
+
+Ensure you have the necessary environment setup. You can create a Conda environment using the following commands:
+
+```bash
+# prepare the environment
+conda create -n batfd python=3.10 -y
+conda activate batfd
+conda install pytorch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 pytorch-cuda=11.8 -c pytorch -c nvidia -y
+pip install avdeepfake1m toml tensorboard pytorch-lightning pandas
+```
+
+## Training
+
+Train the BATFD or BATFD+ model using a TOML configuration file (e.g., `batfd.toml` or `batfd_plus.toml`).
+
+```bash
+python train.py --config ./batfd.toml --data_root /path/to/AV-Deepfake1M-PlusPlus
+```
+
+### Output
+
+*   **Checkpoints:** Model checkpoints are saved under `./ckpt/<name>/`, where `<name>` is the `name` field of the TOML config (e.g. `./ckpt/batfd/` for `batfd.toml`). The last checkpoint is saved as `last.ckpt`.
+*   **Logs:** Training logs (including metrics like `train_loss`, `val_loss`, and learning rates) are saved by PyTorch Lightning, typically in a directory named `./lightning_logs/`. You can view these logs using TensorBoard (`tensorboard --logdir ./lightning_logs`). 
+
+## Inference
+
+After training, generate predictions on a dataset subset (e.g., `val`, `test`) using `infer.py`. This script saves the predictions to a JSON file, which is required for evaluation.
+
+```bash
+python infer.py --config ./batfd.toml --checkpoint /path/to/checkpoint --data_root /path/to/AV-Deepfake1M-PlusPlus --subset val
+```
+
+## Evaluation
+
+```bash
+python evaluate.py /path/to/prediction_file /path/to/metadata_file
+```
+
@@ -0,0 +1,35 @@
+name = "batfd"
+num_frames = 100  # T
+max_duration = 30  # D
+model_type = "batfd"
+dataset = "avdeepfake1m++"
+
+[model.video_encoder]
+type = "c3d"
+hidden_dims = [64, 96, 128, 128]
+cla_feature_in = 256  # C_f
+
+[model.audio_encoder]
+type = "cnn"
+hidden_dims = [32, 64, 64]
+cla_feature_in = 256  # C_f
+
+[model.frame_classifier]
+type = "lr"
+
+[model.boundary_module]
+hidden_dims = [512, 128]
+samples = 10  # N
+
+[optimizer]
+learning_rate = 0.00001
+frame_loss_weight = 2.0
+modal_bm_loss_weight = 1.0
+contrastive_loss_weight = 0.1
+contrastive_loss_margin = 0.99
+weight_decay = 0.0001
+
+[soft_nms]
+alpha = 0.7234
+t1 = 0.1968
+t2 = 0.4123
@@ -0,0 +1,129 @@
+import os.path
+from typing import Any, List, Optional
+from torch import Tensor
+import pandas as pd
+from pathlib import Path
+from lightning.pytorch import LightningModule, Trainer, Callback
+
+from avdeepfake1m.loader import Metadata
+from torch.utils.data import DataLoader
+
+
+def nullable_index(obj, index):
+    """Return ``obj[index]``, or ``None`` when ``obj`` itself is ``None``."""
+    return None if obj is None else obj[index]
+
+
+class SaveToCsvCallback(Callback):
+    """Write the top-scoring boundary proposals of every predicted video to a CSV file.
+
+    One file per video is written to ``<temp_dir>/<model_name>/``. The file name is the
+    metadata ``file`` entry with ``/`` replaced by ``_`` and ``.mp4`` replaced by ``.csv``.
+
+    Args:
+        max_duration: Stored on the instance; not used by the callback itself.
+        metadata: Per-video metadata, expected in the same order in which the predict
+            dataloader yields samples (predictions are matched to it by position).
+        model_name: Sub-directory of ``temp_dir`` that receives the CSV files.
+        model_type: ``"batfd"`` or ``"batfd_plus"``; selects how ``outputs`` is unpacked.
+        temp_dir: Root output directory.
+    """
+
+    # Number of highest-scoring proposals kept per video.
+    top_k = 100
+
+    def __init__(self, max_duration: int, metadata: List[Metadata], model_name: str, model_type: str, temp_dir: str):
+        super().__init__()
+        self.max_duration = max_duration
+        self.metadata = metadata
+        self.model_name = model_name
+        self.model_type = model_type
+        self.temp_dir = temp_dir
+        # Samples handled so far in the current predict run; used as the metadata offset.
+        self._num_seen = 0
+
+    def on_predict_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
+        """Restart the metadata offset so the callback can be reused across predict runs."""
+        self._num_seen = 0
+
+    def on_predict_batch_end(
+        self,
+        trainer: Trainer,
+        pl_module: LightningModule,
+        outputs: Any,
+        batch: Any,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> None:
+        """Save one proposal CSV for every sample of the batch that was just predicted.
+
+        Args:
+            trainer: Unused.
+            pl_module: Unused.
+            outputs: Tuple returned by the model's predict step; 5 entries for
+                ``"batfd"``, 11 entries for ``"batfd_plus"``. Only the fusion boundary
+                map (and, for ``"batfd_plus"``, the fusion start/end vectors) is used.
+            batch: The input batch; ``batch[3][i]`` is read as the temporal size of sample ``i``.
+            batch_idx: Unused; the metadata offset is tracked by a running sample count.
+            dataloader_idx: Accepted for compatibility with the base ``Callback`` hook; unused.
+
+        Raises:
+            ValueError: If ``model_type`` is neither ``"batfd"`` nor ``"batfd_plus"``.
+        """
+        if self.model_type == "batfd":
+            fusion_bm_map, _v_bm_map, _a_bm_map, _v_frame_cla, _a_frame_cla = outputs
+            fusion_start = fusion_end = None
+        elif self.model_type == "batfd_plus":
+            (fusion_bm_map, fusion_start, fusion_end, _v_bm_map, _v_start, _v_end,
+                _a_bm_map, _a_start, _a_end, _v_frame_cla, _a_frame_cla) = outputs
+        else:
+            raise ValueError("Invalid model type")
+
+        batch_size = fusion_bm_map.shape[0]
+        # A running offset is used instead of ``batch_idx * batch_size``: the last batch
+        # may be smaller than the others, which would shift the metadata lookup.
+        offset = self._num_seen
+
+        for i in range(batch_size):
+            meta = self.metadata[offset + i]
+            video_name = meta.file
+            assert isinstance(video_name, str)
+
+            temporal_size = batch[3][i]
+            output_file = self._output_path(video_name)
+
+            if self.model_type == "batfd":
+                self.gen_df_for_batfd(fusion_bm_map[i], temporal_size, meta.video_frames, output_file)
+            else:
+                self.gen_df_for_batfd_plus(
+                    fusion_bm_map[i],
+                    nullable_index(fusion_start, i),
+                    nullable_index(fusion_end, i),
+                    temporal_size,
+                    meta.video_frames,
+                    output_file,
+                )
+
+        self._num_seen = offset + batch_size
+
+    def _output_path(self, video_name: str) -> str:
+        """Return the CSV path for ``video_name`` inside ``<temp_dir>/<model_name>/``."""
+        csv_name = video_name.replace("/", "_").replace(".mp4", ".csv")
+        return os.path.join(self.temp_dir, self.model_name, csv_name)
+
+    @staticmethod
+    def _proposal_table(bm_map, temporal_size):
+        """Flatten a boundary map into a ``begin``/``end``/``score`` table.
+
+        Rows of ``bm_map`` are read as durations and columns as begin positions.
+        Proposals with zero duration or ending after ``temporal_size`` are dropped.
+        """
+        df = pd.DataFrame(bm_map).stack().reset_index()
+        df.columns = ["duration", "begin", "score"]
+        df["end"] = df.duration + df.begin
+        df = df[(df.duration > 0) & (df.end <= temporal_size)]
+        df = df.sort_values(["begin", "end"])
+        return df.reset_index()[["begin", "end", "score"]]
+
+    def _write_top_proposals(self, df, temporal_size, n_frames: int, output_file: str) -> None:
+        """Rescale proposals to frame indices and write the ``top_k`` best ones to CSV."""
+        df["begin"] = (df["begin"] / temporal_size * n_frames).astype(int)
+        df["end"] = (df["end"] / temporal_size * n_frames).astype(int)
+        df = df.sort_values(["score"], ascending=False).iloc[:self.top_k]
+        df.to_csv(output_file, index=False)
+
+    def gen_df_for_batfd(self, bm_map: Tensor, temporal_size: Tensor, n_frames: int, output_file: str):
+        """Write the best proposals of one video, scored by the boundary map alone.
+
+        Args:
+            bm_map: Boundary map of a single sample (rows: duration, columns: begin).
+            temporal_size: Single-element tensor with the sample's temporal size.
+            n_frames: Number of frames of the video; proposals are rescaled to it.
+            output_file: Destination CSV path.
+        """
+        bm_map = bm_map.cpu().numpy()
+        temporal_size = temporal_size.cpu().numpy().item()
+        df = self._proposal_table(bm_map, temporal_size)
+        self._write_top_proposals(df, temporal_size, n_frames, output_file)
+
+    def gen_df_for_batfd_plus(self, bm_map: Tensor, start: Optional[Tensor], end: Optional[Tensor],
+        temporal_size: Tensor, n_frames: int, output_file: str
+    ):
+        """Write the best proposals of one video, optionally re-weighted by boundary scores.
+
+        Args:
+            bm_map: Boundary map of a single sample (rows: duration, columns: begin).
+            start: Per-position start scores, or ``None``.
+            end: Per-position end scores, or ``None``.
+            temporal_size: Single-element tensor with the sample's temporal size.
+            n_frames: Number of frames of the video; proposals are rescaled to it.
+            output_file: Destination CSV path.
+
+        When both ``start`` and ``end`` are given, each proposal score is multiplied by
+        the start score at its begin position and the end score at its end position.
+        """
+        bm_map = bm_map.cpu().numpy()
+        temporal_size = temporal_size.cpu().numpy().item()
+        use_boundaries = start is not None and end is not None
+        if use_boundaries:
+            start = start.cpu().numpy()
+            end = end.cpu().numpy()
+
+        df = self._proposal_table(bm_map, temporal_size)
+        if use_boundaries:
+            # NOTE(review): ``df.end`` can be as large as ``temporal_size``; confirm that
+            # ``end`` has more than ``temporal_size`` entries, otherwise this can go out of range.
+            df["score"] = df["score"] * start[df.begin] * end[df.end]
+
+        self._write_top_proposals(df, temporal_size, n_frames, output_file)
+
+
+def inference_model(model_name: str, model: LightningModule, dataloader: DataLoader,
+    metadata: List[Metadata],
+    max_duration: int, model_type: str,
+    gpus: int = 1,
+    temp_dir: str = "output/",
+    subset: str = "test"
+) -> None:
+    """Run prediction and let ``SaveToCsvCallback`` write one proposal CSV per video.
+
+    Args:
+        model_name: Sub-directory of ``temp_dir`` that receives the CSV files.
+        model: Model to run; it is switched to eval mode first.
+        dataloader: Predict dataloader; it must yield samples in the order of ``metadata``.
+        metadata: Per-video metadata forwarded to the callback.
+        max_duration: Forwarded to the callback.
+        model_type: ``"batfd"`` or ``"batfd_plus"``; forwarded to the callback.
+        gpus: ``0`` forces the CPU accelerator; any positive value lets Lightning pick one.
+        temp_dir: Root output directory; ``<temp_dir>/<model_name>`` is created if missing.
+        subset: Must be ``"test"`` or ``"val"``; only validated here.
+
+    Returns:
+        None. The results are the CSV files written by the callback.
+    """
+    # Validate before touching the file system.
+    assert subset in ["test", "val"]
+    Path(temp_dir, model_name).mkdir(parents=True, exist_ok=True)
+
+    model.eval()
+
+    trainer = Trainer(
+        logger=False,
+        enable_checkpointing=False,
+        # Always a single device: the callback matches predictions to ``metadata`` by
+        # position, which only holds when one process sees every batch in order.
+        devices=1,
+        accelerator="auto" if gpus > 0 else "cpu",
+        callbacks=[SaveToCsvCallback(max_duration, metadata, model_name, model_type, temp_dir)],
+    )
+
+    trainer.predict(model, dataloader)
@@ -0,0 +1,2 @@
+from .batfd import Batfd
+from .batfd_plus import BatfdPlus
@@ -0,0 +1,137 @@
+from typing import Literal
+
+from einops import rearrange
+from einops.layers.torch import Rearrange
+from torch import Tensor
+from torch.nn import Module, Sequential, LeakyReLU, MaxPool2d, Linear
+from torchvision.models.vision_transformer import Encoder as ViTEncoder
+
+from ..utils import Conv2d
+
+
+class CNNAudioEncoder(Module):
+    """
+    Audio encoder (E_a): Process log mel spectrogram to extract features.
+
+    Three stages (``block0`` -> ``block1`` -> ``block2``) are applied in sequence; the
+    last stage merges axes 1 and 2 of its 4-D output into a single feature axis.
+    Input:
+        A': (B, F_m, T_a)
+    Output:
+        E_a: (B, C_f, T)
+    """
+
+    def __init__(self, n_features=(32, 64, 64)):
+        """
+        Args:
+            n_features: Output channel counts of ``block0``, ``block1`` and ``block2``.
+        """
+        super().__init__()
+
+        n_dim0, n_dim1, n_dim2 = n_features
+
+        # NOTE(review): the shape comments below assume the project-local ``Conv2d``
+        # behaves like a standard 2-D convolution followed by the given activation,
+        # and a (B, 64, 2048) input — confirm against ``..utils.Conv2d`` and the data loader.
+
+        # (B, 64, 2048) -> (B, 1, 64, 2048) -> (B, 32, 32, 1024)
+        self.block0 = Sequential(
+            Rearrange("b c t -> b 1 c t"),
+            Conv2d(1, n_dim0, kernel_size=3, stride=1, padding=1, build_activation=LeakyReLU),
+            MaxPool2d(2)
+        )
+
+        # (B, 32, 32, 1024) -> (B, 64, 16, 512)
+        self.block1 = Sequential(
+            Conv2d(n_dim0, n_dim1, kernel_size=3, stride=1, padding=1, build_activation=LeakyReLU),
+            Conv2d(n_dim1, n_dim1, kernel_size=3, stride=1, padding=1, build_activation=LeakyReLU),
+            MaxPool2d(2)
+        )
+
+        # (B, 64, 16, 512) -> (B, 64, 4, 512) -> (B, 256, 512)
+        # Kernels and pooling of shape (k, 1) act on axis 2 only; the time axis is left untouched.
+        self.block2 = Sequential(
+            Conv2d(n_dim1, n_dim2, kernel_size=(2, 1), stride=1, padding=(1, 0), build_activation=LeakyReLU),
+            MaxPool2d((2, 1)),
+            Conv2d(n_dim2, n_dim2, kernel_size=(3, 1), stride=1, padding=(1, 0), build_activation=LeakyReLU),
+            MaxPool2d((2, 1)),
+            Rearrange("b f c t -> b (f c) t")
+        )
+
+    def forward(self, audio: Tensor) -> Tensor:
+        """Apply ``block0``, ``block1`` and ``block2`` in order and return the result."""
+        x = self.block0(audio)
+        x = self.block1(x)
+        x = self.block2(x)
+        return x
+
+
+class SelfAttentionAudioEncoder(Module):
+    """Audio encoder built from a patch projection, a ViT encoder and a linear head.
+
+    ``forward`` projects the input with a strided convolution, runs the resulting
+    sequence through the ViT encoder, maps every step to ``a_cla_feature_in`` features
+    and returns the result with the feature axis in front of the time axis.
+    """
+
+    def __init__(self, block_type: Literal["vit_t", "vit_s", "vit_b"], a_cla_feature_in: int = 256, temporal_size: int = 512):
+        super().__init__()
+        # (name, hidden width, attention heads). The ViT configurations are from:
+        # https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
+        presets = (
+            ("vit_t", 192, 3),
+            ("vit_s", 384, 6),
+            ("vit_b", 768, 12),
+        )
+        for preset_name, width, heads in presets:
+            if block_type == preset_name:
+                break
+        else:
+            raise ValueError(f"Unknown block type: {block_type}")
+
+        self.n_features = width
+        self.block = ViTEncoder(
+            seq_length=temporal_size,
+            num_layers=12,
+            num_heads=heads,
+            hidden_dim=self.n_features,
+            mlp_dim=self.n_features * 4,
+            dropout=0.,
+            attention_dropout=0.
+        )
+
+        self.input_proj = Conv2d(1, self.n_features, kernel_size=(64, 4), stride=(64, 4))
+        self.output_proj = Linear(self.n_features, a_cla_feature_in)
+
+    def forward(self, audio: Tensor) -> Tensor:
+        # (B, 64, 2048) -> (B, 1, 64, 2048) -> (B, feat, 1, 512)
+        patches = self.input_proj(audio.unsqueeze(1))
+        # (B, feat, 1, 512) -> (B, 512, feat)
+        tokens = rearrange(patches, "b f 1 t -> b t f")
+        encoded = self.block(tokens)
+        # (B, 512, feat) -> (B, 512, 256)
+        projected = self.output_proj(encoded)
+        # (B, 512, 256) -> (B, 256, 512)
+        return projected.permute(0, 2, 1)
+
+
+class AudioFeatureProjection(Module):
+    """Linearly project the last axis of a 3-D input, then swap its last two axes."""
+
+    def __init__(self, input_feature_dim: int, a_cla_feature_in: int = 256):
+        super().__init__()
+        self.proj = Linear(input_feature_dim, a_cla_feature_in)
+
+    def forward(self, x: Tensor) -> Tensor:
+        projected = self.proj(x)
+        # Move the projected feature axis in front of the former middle axis.
+        return projected.permute(0, 2, 1)
+
+
+def get_audio_encoder(a_cla_feature_in, temporal_size, a_encoder, ae_features):
+    """Build the audio encoder selected by ``a_encoder``.
+
+    ``"cnn"`` yields a ``CNNAudioEncoder`` configured with ``ae_features``;
+    ``"vit_t"``/``"vit_s"``/``"vit_b"`` yield a ``SelfAttentionAudioEncoder``;
+    ``"wav2vec2"`` and ``"trillsson3"`` yield an ``AudioFeatureProjection`` with input
+    dimension 1536 and 1280 respectively.
+
+    Raises:
+        ValueError: If ``a_encoder`` is none of the names above.
+    """
+    if a_encoder == "cnn":
+        return CNNAudioEncoder(n_features=ae_features)
+
+    for vit_name in ("vit_t", "vit_s", "vit_b"):
+        if a_encoder == vit_name:
+            return SelfAttentionAudioEncoder(
+                block_type=vit_name,
+                a_cla_feature_in=a_cla_feature_in,
+                temporal_size=temporal_size,
+            )
+
+    # Encoders that only project pre-extracted features: (name, input feature dimension).
+    for feature_name, feature_dim in (("wav2vec2", 1536), ("trillsson3", 1280)):
+        if a_encoder == feature_name:
+            return AudioFeatureProjection(input_feature_dim=feature_dim, a_cla_feature_in=a_cla_feature_in)
+
+    raise ValueError(f"Invalid audio encoder: {a_encoder}")
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+from .batfd import Batfd`
	`2`	`+from .batfd_plus import BatfdPlus`