Skip to content

Commit 84023f5

Browse files
committed
new article
1 parent 7556f28 commit 84023f5

9 files changed

Lines changed: 419 additions & 7 deletions

File tree

_posts/2025-12-07-Test-Time-Regression.md

Lines changed: 299 additions & 0 deletions
Large diffs are not rendered by default.

assets/bibliography/2018-12-22-distill.bib

Lines changed: 0 additions & 7 deletions
This file was deleted.
Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
% Journal article located by DOI. The page range uses the BibTeX en-dash form (--).
@article{Shalev-Shwartz_2011,
  author  = {Shalev-Shwartz, Shai},
  title   = {Online learning and online convex optimization},
  journal = {Foundations and Trends® in Machine Learning},
  volume  = {4},
  number  = {2},
  pages   = {107--194},
  year    = {2011},
  doi     = {10.1561/2200000018},
}
11+
12+
% arXiv preprint 2505.23735 (primary class cs.LG as recorded here).
@misc{behrouz2025atlas,
  author        = {Behrouz, Ali and Li, Zeman and Kacham, Praneeth and Daliri, Majid and Deng, Yuan and Zhong, Peilin and Razaviyayn, Meisam and Mirrokni, Vahab},
  title         = {Atlas: Learning to optimally memorize the context at Test Time},
  year          = {2025},
  eprint        = {2505.23735},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2505.23735},
}
21+
22+
% arXiv preprint 2504.13173 (primary class cs.LG as recorded here).
@misc{behrouz2025its,
  author        = {Behrouz, Ali and Razaviyayn, Meisam and Zhong, Peilin and Mirrokni, Vahab},
  title         = {It's All connected: A journey through test-time memorization, attentional bias, retention, and online optimization},
  year          = {2025},
  eprint        = {2504.13173},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2504.13173},
}
31+
32+
% arXiv preprint 2501.00663 (primary class cs.LG as recorded here).
@misc{behrouz2024titans,
  author        = {Behrouz, Ali and Zhong, Peilin and Mirrokni, Vahab},
  title         = {Titans: Learning to memorize at Test Time},
  year          = {2024},
  eprint        = {2501.00663},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2501.00663},
}
41+
42+
% arXiv preprint 2006.16236 (primary class cs.LG as recorded here).
% The title spells the plural acronym as "RNNs" (lowercase plural s).
@misc{katharopoulos2020transformers,
  author        = {Katharopoulos, Angelos and Vyas, Apoorv and Pappas, Nikolaos and Fleuret, François},
  title         = {Transformers are RNNs: Fast autoregressive transformers with linear attention},
  year          = {2020},
  eprint        = {2006.16236},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2006.16236},
}
51+
52+
% arXiv preprint 2412.06538 (primary class cs.LG as recorded here).
@misc{nichani2024understanding,
  author        = {Nichani, Eshaan and Lee, Jason D. and Bietti, Alberto},
  title         = {Understanding factual recall in Transformers via associative memories},
  year          = {2024},
  eprint        = {2412.06538},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2412.06538},
}
61+
62+
% arXiv preprint 2307.08621 (primary class cs.LG as recorded here; NOTE(review): confirm against the arXiv listing).
@misc{sun2023retentive,
  author        = {Sun, Yutao and Dong, Li and Huang, Shaohan and Ma, Shuming and Xia, Yuqing and Xue, Jilong and Wang, Jianyong and Wei, Furu},
  title         = {Retentive network: A successor to Transformer for large language models},
  year          = {2023},
  eprint        = {2307.08621},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2307.08621},
}
71+
72+
% arXiv preprint 2407.04620 (primary class cs.LG as recorded here).
% The author list is truncated with the BibTeX "and others" token, which styles
% render as "et al."; a literal "et al." would be parsed as a person's name.
@misc{sun2025learning,
  author        = {Sun, Yu and Li, Xinhao and Dalal, Karan and Xu, Jiarui and Vikram, Arjun and Zhang, Genghan and Dubois, Yann and Chen, Xinlei and Wang, Xiaolong and Koyejo, Sanmi and others},
  title         = {Learning to (Learn at test time): RNNs with Expressive Hidden States},
  year          = {2025},
  eprint        = {2407.04620},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2407.04620},
}
81+
82+
% arXiv preprint 2309.05858 (primary class cs.LG as recorded here).
% The compound surname "Agüera y Arcas" is kept together before the comma, and
% the truncated author list ends with the BibTeX "and others" token.
@misc{voswald2024uncovering,
  author        = {von Oswald, Johannes and Schlegel, Maximilian and Meulemans, Alexander and Kobayashi, Seijin and Niklasson, Eyvind and Zucchet, Nicolas and Scherrer, Nino and Miller, Nolan and Sandler, Mark and Agüera y Arcas, Blaise and others},
  title         = {Uncovering Mesa-optimization algorithms in transformers},
  year          = {2024},
  eprint        = {2309.05858},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2309.05858},
}
91+
92+
% arXiv preprint 2501.12352 (primary class cs.LG as recorded here).
@misc{wang2025testtime,
  author        = {Wang, Ke Alexander and Shi, Jiaxin and Fox, Emily B.},
  title         = {Test-time regression: A unifying framework for designing sequence models with associative memory},
  year          = {2025},
  eprint        = {2501.12352},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2501.12352},
}
101+
102+
% arXiv preprint 2412.06464 (primary class cs.LG as recorded here; NOTE(review): confirm against the arXiv listing).
@misc{yang2025gated,
  author        = {Yang, Songlin and Kautz, Jan and Hatamizadeh, Ali},
  title         = {Gated Delta Networks: Improving Mamba2 with delta rule},
  year          = {2025},
  eprint        = {2412.06464},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2412.06464},
}
111+
112+
% arXiv preprint 2406.06484 (primary class cs.LG as recorded here).
@misc{yang2025parallelizing,
  author        = {Yang, Songlin and Wang, Bailin and Zhang, Yu and Shen, Yikang and Kim, Yoon},
  title         = {Parallelizing Linear Transformers with the delta rule over sequence length},
  year          = {2025},
  eprint        = {2406.06484},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/2406.06484},
}
304 KB
Loading
260 KB
Loading
308 KB
Loading
269 KB
Loading
303 KB
Loading
296 KB
Loading

0 commit comments

Comments
 (0)