stephan271c
diff --git a/‎_posts/2025-12-07-Test-Time-Regression.md‎
Lines changed: 299 additions & 0 deletions b/‎_posts/2025-12-07-Test-Time-Regression.md‎
Lines changed: 299 additions & 0 deletions
diff --git a/‎assets/bibliography/2018-12-22-distill.bib‎
Lines changed: 0 additions & 7 deletions b/‎assets/bibliography/2018-12-22-distill.bib‎
Lines changed: 0 additions & 7 deletions
diff --git a/‎assets/bibliography/2025-12-07-Test-Time-Regression.bib‎
Lines changed: 120 additions & 0 deletions b/‎assets/bibliography/2025-12-07-Test-Time-Regression.bib‎
Lines changed: 120 additions & 0 deletions
diff --git a/‎assets/img/2025-12-07-Test-Time-Regression/corr0-ctx1-acc.png‎
304 KB b/‎assets/img/2025-12-07-Test-Time-Regression/corr0-ctx1-acc.png‎
304 KB
diff --git a/‎assets/img/2025-12-07-Test-Time-Regression/corr0-ctx1-ret.png‎
260 KB b/‎assets/img/2025-12-07-Test-Time-Regression/corr0-ctx1-ret.png‎
260 KB
diff --git a/‎assets/img/2025-12-07-Test-Time-Regression/corr0-ctx5-acc.png‎
308 KB b/‎assets/img/2025-12-07-Test-Time-Regression/corr0-ctx5-acc.png‎
308 KB
diff --git a/‎assets/img/2025-12-07-Test-Time-Regression/corr0-ctx5-ret.png‎
269 KB b/‎assets/img/2025-12-07-Test-Time-Regression/corr0-ctx5-ret.png‎
269 KB
diff --git a/‎assets/img/2025-12-07-Test-Time-Regression/corr0.7-ctx5-acc.png‎
303 KB b/‎assets/img/2025-12-07-Test-Time-Regression/corr0.7-ctx5-acc.png‎
303 KB
diff --git a/‎assets/img/2025-12-07-Test-Time-Regression/corr0.7-ctx5-ret.png‎
296 KB b/‎assets/img/2025-12-07-Test-Time-Regression/corr0.7-ctx5-ret.png‎
296 KB
@@ -0,0 +1,120 @@
+@article{Shalev-Shwartz_2011,
+  author  = {Shalev-Shwartz, Shai},
+  title   = {Online learning and online convex optimization},
+  journal = {Foundations and Trends® in Machine Learning},
+  volume  = {4},
+  number  = {2},
+  pages   = {107--194},
+  year    = {2011},
+  doi     = {10.1561/2200000018},
+}
+
+@misc{behrouz2025atlas,
+  author        = {Behrouz, Ali and Li, Zeman and Kacham, Praneeth and Daliri, Majid and Deng, Yuan and Zhong, Peilin and Razaviyayn, Meisam and Mirrokni, Vahab},
+  title         = {Atlas: Learning to optimally memorize the context at Test Time},
+  year          = {2025},
+  eprint        = {2505.23735},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2505.23735},
+}
+
+@misc{behrouz2025its,
+  author        = {Behrouz, Ali and Razaviyayn, Meisam and Zhong, Peilin and Mirrokni, Vahab},
+  title         = {It's All connected: A journey through test-time memorization, attentional bias, retention, and online optimization},
+  year          = {2025},
+  eprint        = {2504.13173},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2504.13173},
+}
+
+@misc{behrouz2024titans,
+  author        = {Behrouz, Ali and Zhong, Peilin and Mirrokni, Vahab},
+  title         = {Titans: Learning to memorize at Test Time},
+  year          = {2024},
+  eprint        = {2501.00663},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2501.00663},
+}
+
+@misc{katharopoulos2020transformers,
+  author        = {Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, François},
+  title         = {Transformers are RNNs: Fast autoregressive transformers with linear attention},
+  year          = {2020},
+  eprint        = {2006.16236},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2006.16236},
+}
+
+@misc{nichani2024understanding,
+  author        = {Nichani, Eshaan and Lee, Jason D. and Bietti, Alberto},
+  title         = {Understanding factual recall in Transformers via associative memories},
+  year          = {2024},
+  eprint        = {2412.06538},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2412.06538},
+}
+
+@misc{sun2023retentive,
+  author        = {Sun, Yutao and Dong, Li and Huang, Shaohan and Ma, Shuming and Xia, Yuqing and Xue, Jilong and Wang, Jianyong and Wei, Furu},
+  title         = {Retentive network: A successor to Transformer for large language models},
+  year          = {2023},
+  eprint        = {2307.08621},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2307.08621},
+}
+
+@misc{sun2025learning,
+  author        = {Sun, Yu and Li, Xinhao and Dalal, Karan and Xu, Jiarui and Vikram, Arjun and Zhang, Genghan and Dubois, Yann and Chen, Xinlei and Wang, Xiaolong and Koyejo, Sanmi and others},
+  title         = {Learning to (Learn at test time): RNNs with Expressive Hidden States},
+  year          = {2025},
+  eprint        = {2407.04620},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2407.04620},
+}
+
+@misc{voswald2024uncovering,
+  author        = {von Oswald, Johannes and Schlegel, Maximilian and Meulemans, Alexander and Kobayashi, Seijin and Niklasson, Eyvind and Zucchet, Nicolas and Scherrer, Nino and Miller, Nolan and Sandler, Mark and Agüera y Arcas, Blaise and others},
+  title         = {Uncovering Mesa-optimization algorithms in transformers},
+  year          = {2024},
+  eprint        = {2309.05858},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2309.05858},
+}
+
+@misc{wang2025testtime,
+  author        = {Wang, Ke Alexander and Shi, Jiaxin and Fox, Emily B.},
+  title         = {Test-time regression: A unifying framework for designing sequence models with associative memory},
+  year          = {2025},
+  eprint        = {2501.12352},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2501.12352},
+}
+
+@misc{yang2025gated,
+  author        = {Yang, Songlin and Kautz, Jan and Hatamizadeh, Ali},
+  title         = {Gated Delta Networks: Improving Mamba2 with delta rule},
+  year          = {2025},
+  eprint        = {2412.06464},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2412.06464},
+}
+
+@misc{yang2025parallelizing,
+  author        = {Yang, Songlin and Wang, Bailin and Zhang, Yu and Shen, Yikang and Kim, Yoon},
+  title         = {Parallelizing Linear Transformers with the delta rule over sequence length},
+  year          = {2025},
+  eprint        = {2406.06484},
+  archivePrefix = {arXiv},
+  primaryClass  = {cs.LG},
+  url           = {https://arxiv.org/abs/2406.06484},
+}