 [
+  {
+    "title": "Diffusion of Thought: Chain-of-Thought Reasoning in Diffusion Language Models",
+    "author": "Jiacheng Ye et al",
+    "year": "2024",
+    "topic":"cot, llm, diffusion",
+    "venue": "NeurIPS",
+    "description": "The paper introduces Diffusion-of-Thought (DoT), a novel approach that adapts chain-of-thought reasoning for diffusion language models. Unlike autoregressive language models that generate reasoning steps sequentially left-to-right, DoT allows reasoning to diffuse over time through parallel updates of latent variables, offering greater flexibility in trading computation for reasoning performance. Two key innovations include scheduled sampling during training to improve self-correction capabilities and a multi-pass variant (DoTMP) that generates reasoning steps sequentially to introduce causal bias. For training, the authors fine-tune pre-trained diffusion models (Plaid and SEDD) using classifier-free guidance and implement a conditional ODE solver to accelerate inference. Experimental results on multiplication, boolean logic, and grade school math problems demonstrate that DoT achieves comparable or better performance than autoregressive models while offering significant speed advantages on simpler tasks, with a small diffusion model outperforming a much larger autoregressive model in both efficiency and accuracy.",
+    "link": "https://arxiv.org/pdf/2402.07754"
+  },
+  {
+    "title": "Beyond Autoregression: Discrete Diffusion for Complex Reasoning and Planning",
+    "author": "Jiacheng Ye et al",
+    "year": "2025",
+    "topic":"planning, llm, diffusion",
+    "venue": "ICLR",
+    "description": "The authors introduce the concept of \"subgoal imbalance,\" demonstrating that autoregressive models struggle with difficult subgoals in planning tasks, often achieving near-random performance. They show that diffusion models effectively decompose these challenging subgoals into more manageable interrelated views within a multi-view learning framework, resulting in superior performance. Building on these insights, they propose Multi-Granularity Diffusion Modeling (MGDM), which prioritizes subgoals based on difficulty during learning, leading to more effective outcomes and faster convergence. Their experimental evaluation focuses on complex problem-solving tasks like Countdown (a mathematical reasoning challenge), Sudoku, and Boolean Satisfiability Problems. For problems like math reasoning or Sudoku, where later steps directly depend on earlier ones, autoregressive left-to-right generation amounts to a \"plan as you go\" approach that is inherently flawed. Diffusion models instead transform hard subgoals into multiple interrelated \"views\" during the denoising process; each view offers a different perspective on the same problem, creating a more manageable learning objective. The authors also identify a phenomenon they call the \"Regretful Compromise,\" in which autoregressive models, after making early mistakes, are forced to produce clearly incorrect calculations in the final steps to reach the target answer. The iterative nature of diffusion models naturally promotes global consistency: each refinement step considers the entire solution, allowing the model to make coordinated changes that maintain mathematical validity across all equations or puzzle constraints.",
+    "link": "https://arxiv.org/pdf/2410.14157"
+  },
+  {
+    "title": "Do Language Models Plan Ahead for Future Tokens?",
+    "author": "Wilson Wu et al",
+    "year": "2024",
+    "topic":"planning, llm",
+    "venue": "COLM",
+    "description": "This paper investigates whether transformers \"think ahead\" during inference by preparing information in hidden states that will be useful for future tokens. The authors propose two hypotheses: the \"pre-caching\" hypothesis (where models deliberately compute features irrelevant to the current token but useful for future tokens) and the \"breadcrumbs\" hypothesis (where features helpful for current prediction naturally benefit future tokens without deliberate planning). To test these hypotheses, the researchers develop \"myopic training\", where models are trained without propagating gradients to past timesteps. They first create a synthetic task that can only be solved via pre-caching, confirming that transformers can learn this capability when necessary. However, in natural language modeling with smaller models like GPT-2, they find minimal pre-caching, suggesting the breadcrumbs hypothesis predominates: models compute features relevant to the immediate next token that happen to benefit future tokens, without significant trade-offs. Interestingly, the authors discover that pre-caching increases with model scale, becoming more significant with larger models like Pythia 2.8B. This indicates that larger language models may indeed \"plan for the future\" in ways smaller models cannot. They also examine multiplication tasks, finding evidence that pre-caching enables computation on \"filler tokens\" that improves overall performance.",
+    "link": "https://arxiv.org/pdf/2404.00859"
+  },
+  {
+    "title": "Categorical Reparameterization with Gumbel-Softmax",
+    "author": "Eric Jang et al",
+    "year": "2017",
+    "topic":"gumbel-softmax",
+    "venue": "ICLR",
+    "description": "This paper presents a method for efficiently training neural networks with discrete random variables. The Gumbel-Softmax is a continuous relaxation of categorical distributions that addresses the challenge of backpropagating through discrete random variables in neural networks. It builds upon the Gumbel-Max trick, which samples from categorical distributions by adding Gumbel noise to logits and taking the argmax, but replaces the non-differentiable argmax operation with a differentiable softmax function controlled by a temperature parameter τ. When the temperature approaches zero, the distribution approximates a categorical one-hot vector, while higher temperatures yield more uniform distributions. This temperature can be gradually annealed during training to balance exploration and discrete decision-making. For applications requiring truly discrete outputs, the Straight-Through Gumbel-Softmax variant uses argmax in the forward pass while preserving differentiable gradients through the softmax in the backward pass. This technique has enabled significant advances in training neural networks with discrete variables, categorical variational autoencoders, and efficient semi-supervised learning algorithms, solving a fundamental limitation in stochastic neural networks.",
+    "link": "https://arxiv.org/pdf/1611.01144"
+  },
+  {
+    "title": "Training Verifiers to Solve Math Word Problems",
+    "author": "Karl Cobbe et al",
+    "year": "2021",
+    "topic":"verifiers",
+    "venue": "arXiv",
+    "description": "This paper introduces GSM8K, a dataset of 8.5K high-quality, linguistically diverse grade school math word problems, to address the challenges language models face with multi-step mathematical reasoning. Despite their success in many tasks, even large language models struggle with mathematics due to their sensitivity to individual errors in step-by-step reasoning. The authors propose a verification approach to improve performance: they first finetune a generator model, then train a separate verifier model to judge the correctness of potential solutions. At test time, they generate multiple candidate solutions and select the highest-ranked one. This verification method significantly outperforms basic finetuning, providing performance equivalent to a 30x increase in model size.",
+    "link": "https://arxiv.org/pdf/2110.14168"
+  },
+  {
+    "title": "LongLoRA: Efficient Fine-tuning of Long-Context Large Language Models",
+    "author": "Yukang Chen et al",
+    "year": "2024",
+    "topic":"lora, long context",
+    "venue": "ICML",
+    "description": "LongLoRA efficiently extends the context window of large language models through a two-pronged methodology. First, the authors introduce Shifted Sparse Attention (S²-Attn), which divides the input sequence into multiple groups and performs attention only within each group, then critically shifts the group partitioning by half a group size in 50% of attention heads to ensure information flows between groups. This approximates full attention during training while dramatically reducing computational costs. Second, they enhance Low-Rank Adaptation (LoRA) by making embedding and normalization layers trainable—components that comprise less than 2% of model parameters but prove essential for long-context adaptation. Their experiments demonstrate this approach closes the performance gap between LoRA and full fine-tuning while maintaining significantly lower memory requirements. Importantly, models trained with S²-Attn retain standard attention during inference, ensuring compatibility with existing optimization techniques like Flash-Attention2, making LongLoRA a practical solution that enables extending Llama2 models to context lengths of up to 100K tokens on modest hardware setups.",
+    "link": "https://arxiv.org/pdf/2309.12307"
+  },
   {
     "title": "Hardware-Aware Parallel Prompt Decoding for Memory-Efficient Acceleration of LLM Inference",
     "author": "Hao (Mark) Chen et al",
...
     "link": "https://arxiv.org/pdf/2207.06881"
   },
   {
-    "title": "Learning to Keep a Promise: Sacling Language Model Decoding Parallelism with Learned Asynchronous Decoding",
+    "title": "Learning to Keep a Promise: Scaling Language Model Decoding Parallelism with Learned Asynchronous Decoding",
     "author": "Tian Jin et al",
     "year": "2025",
     "topic":"parallelism, asynchronous",