
Commit 448e6f9

Updated on 2024-08-28
1 parent 8f1f9d0 commit 448e6f9

2 files changed: 10 additions, 1 deletion


index.html

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ <h3>
   When?
 </h3>
 <p>
-  Last time this was edited was 2024-08-27 (YYYY/MM/DD).
+  Last time this was edited was 2024-08-28 (YYYY/MM/DD).
 </p>
 <small><a href="misc.html">misc</a></small>
 </body>

papers/list.json

Lines changed: 9 additions & 0 deletions
@@ -1,5 +1,14 @@
 [

+  {
+    "title": "Quest: Query-Aware Sparsity for Efficient Long-Context LLM Inference",
+    "author": "Jiaming Tang et al",
+    "year": "2024",
+    "topic": "KV cache, sparsity, LLM",
+    "venue": "ICML",
+    "description": "Long-context LLM inference is slow, and it slows down further as sequence lengths grow, mainly because a large KV cache must be loaded during self-attention. Prior works have used token-eviction methods to promote sparsity in the attention maps, but the Han lab (smartly!) found that a token's criticality strongly correlates with the current query token. They therefore keep the full KV cache (since previously evicted tokens may be needed by future queries) and, for each query, select only the top-K cached tokens most relevant to that query. This speeds up self-attention at little cost to accuracy.",
+    "link": "https://arxiv.org/pdf/2406.10774"
+  },
   {
     "title": "BigNAS: Scaling Up Neural Architecture Search with Big Single-Stage Models",
     "author": "Jiahui Yu et al",
