-  title={LAION-5B: An open large-scale dataset for training next generation image-text models},
-  author={Schuhmann, Christoph and Beaumont, Romain and Vencu, Richard and Gordon, Cade and Wightman, Ross and Cherti, Mehdi and Coombes, Theo and Katta, Aarush and Mullis, Clayton and Wortsman, Mitchell and others},
-  journal={Advances in Neural Information Processing Systems},
-  volume={35},
-  pages={25278--25294},
-  year={2022}
+@inproceedings{emonet_face,
+  title={{EmoNet-Face: An Expert-Annotated Benchmark for Synthetic Emotion Recognition}},
+  author={Schuhmann, Christoph and Kaczmarczyk, Robert and Rabby, Gollam and Kraus, Maurice and Friedrich, Felix and Nguyen, Huu and Kalyan, Krishna and Nadi, Kourosh and Kersting, Kristian and Auer, S\"{o}ren},
+  booktitle={NeurIPS},
+  year={2025},
+  doi={10.48550/arXiv.2505.20033}
+}
+
+@article{emonet_voice,
+  title={{EmoNet-Voice: A Fine-Grained, Expert-Verified Benchmark for Speech Emotion Detection}},
+  author={Schuhmann, Christoph and Kaczmarczyk, Robert and Rabby, Gollam and Friedrich, Felix and Kraus, Maurice and Nadi, Kourosh and Nguyen, Huu and Kersting, Kristian and Auer, S\"{o}ren},
+  journal={arXiv preprint arXiv:2506.09827},
+  year={2025},
+  doi={10.48550/arXiv.2506.09827}
+}
+
+@inproceedings{clip,
+  title={Learning Transferable Visual Models From Natural Language Supervision},
+  author={Radford, Alec and Kim, Jong Wook and Hallacy, Chris and Ramesh, Aditya and Goh, Gabriel and Agarwal, Sandhini and Sastry, Girish and Askell, Amanda and Mishkin, Pamela and Clark, Jack and Krueger, Gretchen and Sutskever, Ilya},
+  booktitle={Proceedings of the 38th International Conference on Machine Learning},

paper/paper.md: 2 additions & 2 deletions
@@ -39,7 +39,7 @@ bibliography: paper.bib

# Summary

-VideoAnnotator is an open-source Python toolkit for automated video annotation, designed for behavioral, social, and health research at scale. It ships ten declaratively configured pipelines spanning four modalities: person tracking via YOLOv11 with ByteTrack [@yolo11; @bytetrack]; facial analysis using DeepFace [@deepface], LAION CLIP face embeddings [@laion], and OpenFace 3 [@openface2]; scene detection with PySceneDetect and CLIP-based labelling [@pyscenedetect]; and audio processing comprising Whisper speech recognition [@whisper], pyannote speaker diarization [@pyannote], and LAION empathic voice emotion analysis. All pipelines share a uniform interface behind a local-first FastAPI service [@fastapi], with Docker images for consistent CPU and GPU execution. Outputs are standardized to established formats (COCO JSON, RTTM, WebVTT) and accompanied by provenance metadata suitable for downstream modeling and review.
+VideoAnnotator is an open-source Python toolkit for automated video annotation, designed for behavioral, social, and health research at scale. It ships ten declaratively configured pipelines spanning four modalities: person tracking via YOLOv11 with ByteTrack [@yolo11; @bytetrack]; facial analysis using DeepFace [@deepface], LAION EmoNet face emotion analysis [@emonet_face], and OpenFace 3 [@openface3]; scene detection with PySceneDetect and CLIP-based labelling [@pyscenedetect; @clip]; and audio processing comprising Whisper speech recognition [@whisper], pyannote speaker diarization [@pyannote], and LAION EmoNet voice emotion analysis [@emonet_voice]. All pipelines share a uniform interface behind a local-first FastAPI service [@fastapi], with Docker images for consistent CPU and GPU execution. Outputs are standardized to established formats (COCO JSON, RTTM, WebVTT) and accompanied by provenance metadata suitable for downstream modeling and review.

A companion web application, Video Annotation Viewer [@viewer], provides an interactive interface for overlaying annotations on source video — rendering pose skeletons, speaker timelines, subtitle tracks, and scene boundaries — so that researchers can visually inspect and validate pipeline outputs before downstream analysis.
@@ -55,7 +55,7 @@ VideoAnnotator addresses this gap by providing a maintainable software layer tha

Existing tools for behavioral video analysis fall into two broad categories. Manual annotation platforms such as ELAN [@elan] and Datavyu [@datavyu] provide flexible coding interfaces but require trained human coders and do not scale to large corpora. At the other end, specialized computer-vision libraries such as DeepLabCut [@deeplabcut] and YOLO [@yolo11] offer powerful pose estimation and object detection but target a single modality and leave integration, output standardization, and batch orchestration to the user.

-For facial affect, toolkits such as Py-Feat [@pyfeat] and OpenFace [@openface2] extract action units, landmarks, and emotion labels from video frames. On the audio side, openSMILE [@opensmile] remains widely cited for acoustic feature extraction but has seen limited maintenance, and no current open-source toolkit offers end-to-end speech emotion analysis integrated with video. These tools all run locally but each addresses a single modality and produces its own output schema. Scene-detection libraries such as PySceneDetect [@pyscenedetect] and speaker-diarization toolkits like pyannote [@pyannote] similarly solve one piece of the puzzle. A researcher studying parent–child interaction, for example, may need person tracking, facial expression analysis, speech segmentation, and scene detection applied to the same set of videos — currently requiring ad-hoc glue code across four or more libraries with no shared output format or batch orchestration.
+For facial affect, toolkits such as Py-Feat [@pyfeat] and OpenFace [@openface3] extract action units, landmarks, and emotion labels from video frames. On the audio side, openSMILE [@opensmile] remains widely cited for acoustic feature extraction but has seen limited maintenance, and no current open-source toolkit offers end-to-end speech emotion analysis integrated with video. These tools all run locally but each addresses a single modality and produces its own output schema. Scene-detection libraries such as PySceneDetect [@pyscenedetect] and speaker-diarization toolkits like pyannote [@pyannote] similarly solve one piece of the puzzle. A researcher studying parent–child interaction, for example, may need person tracking, facial expression analysis, speech segmentation, and scene detection applied to the same set of videos — currently requiring ad-hoc glue code across four or more libraries with no shared output format or batch orchestration.

VideoAnnotator was built rather than contributed to an existing project because no single package offered the combination of multi-modal pipeline composition, declarative configuration, standardized output formats (COCO, RTTM, WebVTT), and local-first batch orchestration that our research workflows required. The closest comparable systems are either commercial, cloud-dependent, or tightly coupled to a single detector.
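
The revised Summary above describes a uniform, local-first FastAPI job interface with standardized outputs (COCO JSON, RTTM, WebVTT). As a rough illustration of that workflow, the sketch below submits a video to a locally running server and polls for COCO-format tracking results; the port, endpoint paths, payload fields, and pipeline identifiers are assumptions made for illustration only and are not taken from this diff or the VideoAnnotator documentation.

```python
"""Hedged sketch: driving a local VideoAnnotator-style FastAPI service.

All endpoint paths, payload fields, and pipeline identifiers below are
assumed for illustration; consult the actual VideoAnnotator docs for the
real API surface.
"""
import time

import requests

BASE = "http://localhost:8000"  # assumed port for the local-first service

# Hypothetical job-submission endpoint: request two of the pipelines
# mentioned in the Summary for a single video file.
resp = requests.post(
    f"{BASE}/jobs",
    json={
        "video_path": "/data/session_01.mp4",                 # assumed payload field
        "pipelines": ["person_tracking", "scene_detection"],  # assumed identifiers
    },
    timeout=30,
)
resp.raise_for_status()
job_id = resp.json()["id"]

# Poll the hypothetical job-status endpoint until processing finishes.
while True:
    status = requests.get(f"{BASE}/jobs/{job_id}", timeout=30).json()
    if status.get("state") in {"completed", "failed"}:
        break
    time.sleep(5)

# Fetch a standardized output, e.g. COCO JSON for person tracking.
if status.get("state") == "completed":
    coco = requests.get(
        f"{BASE}/jobs/{job_id}/results/person_tracking", timeout=30
    ).json()
    print("tracked annotations:", len(coco.get("annotations", [])))
```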