% arXiv preprint. The identifier is stored in eprint/archiveprefix rather than
% in a journal field, and acronyms in the title are braced so sentence-casing
% styles keep them uppercase.
@misc{zhang2019adam,
  author        = {Zhang, Jingzhao and Karimireddy, Sai Praneeth and Veit, Andreas and Kim, Seungyeon and Reddi, Sashank J. and Kumar, Sanjiv and Sra, Suvrit},
  title         = {Why {ADAM} Beats {SGD} for Attention Models},
  year          = {2019},
  eprint        = {1912.03194},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1912.03194},
}