From 4275cf4d698a10d4043b39a7138b1d5bf91c9835 Mon Sep 17 00:00:00 2001
From: Furyton <26501227+Furyton@users.noreply.github.com>
Date: Wed, 18 Sep 2024 11:12:07 +0000
Subject: [PATCH] Automated update on 2024-09-18

---
 papers/phenomena-of-interest/in-context-learning/papers.csv | 3 ++-
 papers/phenomena-of-interest/knowledge/papers.csv           | 3 ++-
 papers/phenomena-of-interest/other-phenomena/papers.csv     | 3 ++-
 .../what-can-transformer-do/papers.csv                      | 3 ++-
 .../what-can-transformer-not-do/papers.csv                  | 3 ++-
 5 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/papers/phenomena-of-interest/in-context-learning/papers.csv b/papers/phenomena-of-interest/in-context-learning/papers.csv
index f909da2..e36e8cc 100644
--- a/papers/phenomena-of-interest/in-context-learning/papers.csv
+++ b/papers/phenomena-of-interest/in-context-learning/papers.csv
@@ -73,4 +73,5 @@ In-Context Learning with Representations: Contextual Generalization of Trained T
 Unveiling In-Context Learning: A Coordinate System to Understand Its Working Mechanism,2024-07-24,http://arxiv.org/abs/2407.17011,Anhao Zhao; Fanghua Ye; Jinlan Fu; Xiaoyu Shen
 Polynomial Regression as a Task for Understanding In-context Learning Through Finetuning and Alignment,2024-07-27,http://arxiv.org/abs/2407.19346,Max Wilcoxson; Morten Svendgård; Ria Doshi; Dylan Davis; Reya Vir; Anant Sahai
 One-Layer Transformer Provably Learns One-Nearest Neighbor In Context,2024-07-24,https://klusowski.princeton.edu/sites/g/files/toruqf5901/files/documents/li2024one.pdf,Zihao Li; Yuan Cao; Cheng Gao; Yihan He; Han Liu; Jason M. Klusowski; Jianqing Fan; Mengdi Wang
-Learning vs Retrieval: The Role of In-Context Examples in Regression with LLMs,2024-09-06,http://arxiv.org/abs/2409.04318,Aliakbar Nafar; Kristen Brent Venable; Parisa Kordjamshidi
\ No newline at end of file
+Learning vs Retrieval: The Role of In-Context Examples in Regression with LLMs,2024-09-06,http://arxiv.org/abs/2409.04318,Aliakbar Nafar; Kristen Brent Venable; Parisa Kordjamshidi
+"Unveiling Induction Heads: Provable Training Dynamics and Feature Learning in Transformers",2024-09-09,https://arxiv.org/pdf/2409.10559,Siyu Chen; Heejune Sheen; Tianhao Wang; Zhuoran Yang
diff --git a/papers/phenomena-of-interest/knowledge/papers.csv b/papers/phenomena-of-interest/knowledge/papers.csv
index 13c552f..ccfb7bb 100644
--- a/papers/phenomena-of-interest/knowledge/papers.csv
+++ b/papers/phenomena-of-interest/knowledge/papers.csv
@@ -24,4 +24,5 @@ Memorisation In In-Context Learning,2024-08-21,http://arxiv.org/abs/2408.11546,S
 "Understanding Memorisation in LLMs: Dynamics, Influencing Factors, and Implications",2024-07-27,http://arxiv.org/abs/2407.19262,Till Speicher; Mohammad Aflah Khan; Qinyuan Wu; Vedant Nanda; Soumi Das; Bishwamittra Ghosh; Krishna P. Gummadi; Evimaria Terzi
 Generalization v.s. Memorization: Tracing Language Models' Capabilities Back to Pretraining Data,2024-07-20,http://arxiv.org/abs/2407.14985,Antonis Antoniades; Xinyi Wang; Yanai Elazar; Alfonso Amayuelas; Alon Albalak; Kexun Zhang; William Yang Wang
 Induction Heads as an Essential Mechanism for Pattern Matching in In-context Learning,2024-07-09,http://arxiv.org/abs/2407.07011,J. Crosbie; E. Shutova
-"Schrodingers Memory: Large Language Models",2024-09-16,https://arxiv.org/pdf/2409.10482,Wei Wang; Qing Li
\ No newline at end of file
+"Schrodingers Memory: Large Language Models",2024-09-16,https://arxiv.org/pdf/2409.10482,Wei Wang; Qing Li
+"Self-Attention Limits Working Memory Capacity of Transformer-Based Models",2024-09-16,https://arxiv.org/pdf/2409.10715,Dongyu Gong; Hantao Zhang
diff --git a/papers/phenomena-of-interest/other-phenomena/papers.csv b/papers/phenomena-of-interest/other-phenomena/papers.csv
index a4df311..ce52881 100644
--- a/papers/phenomena-of-interest/other-phenomena/papers.csv
+++ b/papers/phenomena-of-interest/other-phenomena/papers.csv
@@ -24,4 +24,5 @@ On the Emergence of Cross-Task Linearity in the Pretraining-Finetuning Paradigm,
 Does Liking Yellow Imply Driving a School Bus? Semantic Leakage in Language Models,2024-08-12,http://arxiv.org/abs/2408.06518,Hila Gonen; Terra Blevins; Alisa Liu; Luke Zettlemoyer; Noah A. Smith
 Large Language Monkeys: Scaling Inference Compute with Repeated Sampling,2024-07-31,http://arxiv.org/abs/2407.21787,Bradley Brown; Jordan Juravsky; Ryan Ehrlich; Ronald Clark; Quoc V. Le; Christopher Ré; Azalia Mirhoseini
 Monotonic Representation of Numeric Properties in Language Models,2024-08-15,http://arxiv.org/abs/2408.10381,Benjamin Heinzerling; Kentaro Inui
-Masked Mixers for Language Generation and Retrieval,2024-09-02,http://arxiv.org/abs/2409.01482,Benjamin L. Badger
\ No newline at end of file
+Masked Mixers for Language Generation and Retrieval,2024-09-02,http://arxiv.org/abs/2409.01482,Benjamin L. Badger
+"Norm of Mean Contextualized Embeddings Determines their Variance",2024-09-17,https://arxiv.org/pdf/2409.11253,Hiroaki Yamagiwa; Hidetoshi Shimodaira
diff --git a/papers/representational-capacity/what-can-transformer-do/papers.csv b/papers/representational-capacity/what-can-transformer-do/papers.csv
index dc6f4c4..6d50453 100644
--- a/papers/representational-capacity/what-can-transformer-do/papers.csv
+++ b/papers/representational-capacity/what-can-transformer-do/papers.csv
@@ -52,4 +52,5 @@ Learning Randomized Algorithms with Transformers,2024-08-20,http://arxiv.org/abs
 Attention is a smoothed cubic spline,2024-08-19,http://arxiv.org/abs/2408.09624,Zehua Lai; Lek-Heng Lim; Yucong Liu
 Transformers As Approximations of Solomonoff Induction,2024-08-22,http://arxiv.org/abs/2408.12065,Nathan Young; Michael Witbrock
 Implicit Geometry of Next-token Prediction: From Language Sparsity Patterns to Model Representations,2024-08-27,http://arxiv.org/abs/2408.15417,Yize Zhao; Tina Behnia; Vala Vakilian; Christos Thrampoulidis
-A Law of Next-Token Prediction in Large Language Models,2024-08-24,http://arxiv.org/abs/2408.13442,Hangfeng He; Weijie J. Su
\ No newline at end of file
+A Law of Next-Token Prediction in Large Language Models,2024-08-24,http://arxiv.org/abs/2408.13442,Hangfeng He; Weijie J. Su
+"Adaptive Large Language Models By Layerwise Attention Shortcuts",2024-09-17,https://arxiv.org/pdf/2409.10870,Prateek Verma; Mert Pilanci
diff --git a/papers/representational-capacity/what-can-transformer-not-do/papers.csv b/papers/representational-capacity/what-can-transformer-not-do/papers.csv
index 26c024f..e7d59b0 100644
--- a/papers/representational-capacity/what-can-transformer-not-do/papers.csv
+++ b/papers/representational-capacity/what-can-transformer-not-do/papers.csv
@@ -20,4 +20,5 @@ Hopping Too Late: Exploring the Limitations of Large Language Models on Multi-Ho
 When can transformers compositionally generalize in-context?,2024-07-17,http://arxiv.org/abs/2407.12275,Seijin Kobayashi; Simon Schug; Yassir Akram; Florian Redhardt; Johannes von Oswald; Razvan Pascanu; Guillaume Lajoie; João Sacramento
 When Can Transformers Count to n?,2024-07-21,http://arxiv.org/abs/2407.15160,Gilad Yehudai; Haim Kaplan; Asma Ghandeharioun; Mor Geva; Amir Globerson
 Your Context Is Not an Array: Unveiling Random Access Limitations in Transformers,2024-08-10,http://arxiv.org/abs/2408.05506,MohammadReza Ebrahimi; Sunny Panchal; Roland Memisevic
-One-layer transformers fail to solve the induction heads task,2024-08-26,http://arxiv.org/abs/2408.14332,Clayton Sanford; Daniel Hsu; Matus Telgarsky
\ No newline at end of file
+One-layer transformers fail to solve the induction heads task,2024-08-26,http://arxiv.org/abs/2408.14332,Clayton Sanford; Daniel Hsu; Matus Telgarsky
+"Self-Attention Limits Working Memory Capacity of Transformer-Based Models",2024-09-16,https://arxiv.org/pdf/2409.10715,Dongyu Gong; Hantao Zhang