@@ -133,11 +133,6 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
 	return -ENOMEM;
 }
 
-static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
-{
-	return atomic_fetch_xor(bits, v) ^ bits;
-}
-
 #ifdef CONFIG_PGSTE
 
 struct page *page_table_alloc_pgste(struct mm_struct *mm)
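
The atomic_xor_bits() helper deleted above flips the given bits in an atomic counter and returns their value after the update: atomic_fetch_xor() returns the old value, and XOR-ing that result with the same bits reconstructs the new one. Below is a minimal userspace sketch of the same idea, assuming C11 <stdatomic.h> (whose argument order differs from the kernel atomic API) and a hypothetical xor_bits() name; it is an illustration, not the kernel code.

#include <stdatomic.h>
#include <stdio.h>

/* Stand-in for the removed kernel helper: atomically flip 'bits' in *v
 * and return the value of the counter *after* the update. */
static unsigned int xor_bits(atomic_uint *v, unsigned int bits)
{
	/* atomic_fetch_xor() yields the pre-XOR value; XOR-ing it with
	 * 'bits' again gives the post-XOR value. */
	return atomic_fetch_xor(v, bits) ^ bits;
}

int main(void)
{
	/* Hypothetical _refcount upper byte: lower 2K fragment allocated. */
	atomic_uint refcount = 0x01U << 24;
	/* Free that fragment: clear its AA bit, set its PP (pending) bit. */
	unsigned int new = xor_bits(&refcount, 0x11U << 24);

	printf("upper byte is now 0x%02x\n", new >> 24);	/* prints 0x10 */
	return 0;
}

Compiled with any C11 compiler this prints "upper byte is now 0x10": the allocation bit is cleared and the pending bit set in one atomic step, which is the property the fragment-tracking code removed in the next hunk relied on.
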
@@ -162,303 +157,85 @@ void page_table_free_pgste(struct page *page)
 
 #endif /* CONFIG_PGSTE */
 
-/*
- * A 2KB-pgtable is either upper or lower half of a normal page.
- * The second half of the page may be unused or used as another
- * 2KB-pgtable.
- *
- * Whenever possible the parent page for a new 2KB-pgtable is picked
- * from the list of partially allocated pages mm_context_t::pgtable_list.
- * In case the list is empty a new parent page is allocated and added to
- * the list.
- *
- * When a parent page gets fully allocated it contains 2KB-pgtables in both
- * upper and lower halves and is removed from mm_context_t::pgtable_list.
- *
- * When a 2KB-pgtable is freed from the fully allocated parent page that
- * page turns partially allocated and is added to mm_context_t::pgtable_list.
- *
- * If a 2KB-pgtable is freed from the partially allocated parent page that
- * page turns unused and gets removed from mm_context_t::pgtable_list.
- * Furthermore, the unused parent page is released.
- *
- * As follows from the above, no unallocated or fully allocated parent
- * pages are contained in mm_context_t::pgtable_list.
- *
- * The upper byte (bits 24-31) of the parent page _refcount is used
- * for tracking contained 2KB-pgtables and has the following format:
- *
- *   PP  AA
- * 01234567    upper byte (bits 24-31) of struct page::_refcount
- *   ||  ||
- *   ||  |+--- upper 2KB-pgtable is allocated
- *   ||  +---- lower 2KB-pgtable is allocated
- *   |+------- upper 2KB-pgtable is pending for removal
- *   +-------- lower 2KB-pgtable is pending for removal
- *
- * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
- * using _refcount is possible).
- *
- * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
- * The parent page is either:
- *   - added to mm_context_t::pgtable_list in case the second half of the
- *     parent page is still unallocated;
- *   - removed from mm_context_t::pgtable_list in case both halves of the
- *     parent page are allocated;
- * These operations are protected with mm_context_t::lock.
- *
- * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
- * and the corresponding PP bit is set to 1 in a single atomic operation.
- * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
- * exclusive and may never be both set to 1!
- * The parent page is either:
- *   - added to mm_context_t::pgtable_list in case the second half of the
- *     parent page is still allocated;
- *   - removed from mm_context_t::pgtable_list in case the second half of
- *     the parent page is unallocated;
- * These operations are protected with mm_context_t::lock.
- *
- * It is important to understand that mm_context_t::lock only protects
- * mm_context_t::pgtable_list and AA bits, but not the parent page itself
- * and PP bits.
- *
- * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
- * while both AA bits and the second PP bit are already unset. Then the
- * parent page does not contain any 2KB-pgtable fragment anymore, and it has
- * also been removed from mm_context_t::pgtable_list. It is therefore safe
- * to release the page.
- *
- * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
- * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
- * while the PP bits are never used, nor is such a page added to or removed
- * from mm_context_t::pgtable_list.
- *
- * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
- * and prevents both 2K fragments from being reused. pte_free_defer() has to
- * guarantee that its pgtable cannot be reused before the RCU grace period
- * has elapsed (which page_table_free_rcu() does not actually guarantee).
- * But for simplicity, because page->rcu_head overlays page->lru, and because
- * the RCU callback might not be called before the mm_context_t has been freed,
- * pte_free_defer() in this implementation prevents both fragments from being
- * reused, and delays making the call to RCU until both fragments are freed.
- */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
-	unsigned long *table;
 	struct ptdesc *ptdesc;
-	unsigned int mask, bit;
-
-	/* Try to get a fragment of a 4K page as a 2K page table */
-	if (!mm_alloc_pgste(mm)) {
-		table = NULL;
-		spin_lock_bh(&mm->context.lock);
-		if (!list_empty(&mm->context.pgtable_list)) {
-			ptdesc = list_first_entry(&mm->context.pgtable_list,
-						  struct ptdesc, pt_list);
-			mask = atomic_read(&ptdesc->_refcount) >> 24;
-			/*
-			 * The pending removal bits must also be checked.
-			 * Failure to do so might lead to an impossible
-			 * value (e.g. 0x13 or 0x23) being written to _refcount.
-			 * Such values violate the assumption that pending and
-			 * allocation bits are mutually exclusive, and the rest
-			 * of the code goes off the rails as a result. That
-			 * could lead to a whole bunch of races and corruptions.
-			 */
-			mask = (mask | (mask >> 4)) & 0x03U;
-			if (mask != 0x03U) {
-				table = (unsigned long *) ptdesc_to_virt(ptdesc);
-				bit = mask & 1;		/* =1 -> second 2K */
-				if (bit)
-					table += PTRS_PER_PTE;
-				atomic_xor_bits(&ptdesc->_refcount,
-						0x01U << (bit + 24));
-				list_del_init(&ptdesc->pt_list);
-			}
-		}
-		spin_unlock_bh(&mm->context.lock);
-		if (table)
-			return table;
-	}
-	/* Allocate a fresh page */
+	unsigned long *table;
+
 	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
 	if (!ptdesc)
 		return NULL;
 	if (!pagetable_pte_ctor(ptdesc)) {
 		pagetable_free(ptdesc);
 		return NULL;
 	}
-	/* Initialize page table */
 	table = ptdesc_to_virt(ptdesc);
 	__arch_set_page_dat(table, 1);
-	if (mm_alloc_pgste(mm)) {
-		/* Return 4K page table with PGSTEs */
-		INIT_LIST_HEAD(&ptdesc->pt_list);
-		atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
-		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
-	} else {
-		/* Return the first 2K fragment of the page */
-		atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
-		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
-		spin_lock_bh(&mm->context.lock);
-		list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
-		spin_unlock_bh(&mm->context.lock);
-	}
+	/* pt_list is used by gmap only */
+	INIT_LIST_HEAD(&ptdesc->pt_list);
+	memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+	memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 	return table;
 }
 
-static void page_table_release_check(struct page *page, void *table,
-				     unsigned int half, unsigned int mask)
-{
-	char msg[128];
-
-	if (!IS_ENABLED(CONFIG_DEBUG_VM))
-		return;
-	if (!mask && list_empty(&page->lru))
-		return;
-	snprintf(msg, sizeof(msg),
-		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
-		 table, half, mask);
-	dump_page(page, msg);
-}
-
-static void pte_free_now(struct rcu_head *head)
+static void pagetable_pte_dtor_free(struct ptdesc *ptdesc)
 {
-	struct ptdesc *ptdesc;
-
-	ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
 	pagetable_pte_dtor(ptdesc);
 	pagetable_free(ptdesc);
 }
 
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
-	unsigned int mask, bit, half;
 	struct ptdesc *ptdesc = virt_to_ptdesc(table);
 
-	if (!mm_alloc_pgste(mm)) {
-		/* Free 2K page table fragment of a 4K page */
-		bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE * sizeof(pte_t));
-		spin_lock_bh(&mm->context.lock);
-		/*
-		 * Mark the page for delayed release. The actual release
-		 * will happen outside of the critical section from this
-		 * function or from __tlb_remove_table()
-		 */
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
-		mask >>= 24;
-		if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
-			/*
-			 * Other half is allocated, and neither half has had
-			 * its free deferred: add page to head of list, to make
-			 * this freed half available for immediate reuse.
-			 */
-			list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
-		} else {
-			/* If page is on list, now remove it. */
-			list_del_init(&ptdesc->pt_list);
-		}
-		spin_unlock_bh(&mm->context.lock);
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
-		mask >>= 24;
-		if (mask != 0x00U)
-			return;
-		half = 0x01U << bit;
-	} else {
-		half = 0x03U;
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		mask >>= 24;
-	}
-
-	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
-	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
-		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
-	else
-		pte_free_now(&ptdesc->pt_rcu_head);
+	pagetable_pte_dtor_free(ptdesc);
 }
 
 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 			 unsigned long vmaddr)
 {
 	struct mm_struct *mm;
-	unsigned int bit, mask;
-	struct ptdesc *ptdesc = virt_to_ptdesc(table);
 
 	mm = tlb->mm;
-	if (mm_alloc_pgste(mm)) {
+	if (mm_alloc_pgste(mm))
 		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) ((unsigned long)table | 0x03U);
-		tlb_remove_ptdesc(tlb, table);
-		return;
-	}
-	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE * sizeof(pte_t));
-	spin_lock_bh(&mm->context.lock);
-	/*
-	 * Mark the page for delayed release. The actual release will happen
-	 * outside of the critical section from __tlb_remove_table() or from
-	 * page_table_free()
-	 */
-	mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
-	mask >>= 24;
-	if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
-		/*
-		 * Other half is allocated, and neither half has had
-		 * its free deferred: add page to end of list, to make
-		 * this freed half available for reuse once its pending
-		 * bit has been cleared by __tlb_remove_table().
-		 */
-		list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
-	} else {
-		/* If page is on list, now remove it. */
-		list_del_init(&ptdesc->pt_list);
-	}
-	spin_unlock_bh(&mm->context.lock);
-	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
+	table = (unsigned long *)((unsigned long)table | 0x01U);
 	tlb_remove_ptdesc(tlb, table);
 }
 
 void __tlb_remove_table(void *_table)
 {
-	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
-	void *table = (void *)((unsigned long) _table ^ mask);
-	struct ptdesc *ptdesc = virt_to_ptdesc(table);
-
-	switch (half) {
-	case 0x00U:	/* pmd, pud, or p4d */
+	struct ptdesc *ptdesc;
+	unsigned int mask;
+	void *table;
+
+	mask = (unsigned long)_table & 0x01U;
+	table = (void *)((unsigned long)_table ^ mask);
+	ptdesc = virt_to_ptdesc(table);
+	if (!mask) {
+		/* pmd, pud, or p4d */
 		pagetable_free(ptdesc);
 		return;
-	case 0x01U:	/* lower 2K of a 4K page table */
-	case 0x02U:	/* higher 2K of a 4K page table */
-		mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
-		mask >>= 24;
-		if (mask != 0x00U)
-			return;
-		break;
-	case 0x03U:	/* 4K page table with pgstes */
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		mask >>= 24;
-		break;
 	}
-
-	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
-	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
-		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
-	else
-		pte_free_now(&ptdesc->pt_rcu_head);
+	pagetable_pte_dtor_free(ptdesc);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pte_free_now(struct rcu_head *head)
+{
+	struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+
+	pagetable_pte_dtor_free(ptdesc);
+}
+
 void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
 {
-	struct page *page;
+	struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
 
-	page = virt_to_page(pgtable);
-	SetPageActive(page);
-	page_table_free(mm, (unsigned long *)pgtable);
+	call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
 	/*
-	 * page_table_free() does not do the pgste gmap_unlink() which
-	 * page_table_free_rcu() does: warn us if pgste ever reaches here.
+	 * THPs are not allowed for KVM guests. Warn if pgste ever reaches here.
+	 * Switch to the generic pte_free_defer() version once gmap is removed.
 	 */
 	WARN_ON_ONCE(mm_has_pgste(mm));
 }
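
The comment block and page_table_alloc() fast path removed above hinge on a single test: a 2K half may only be handed out when neither its allocation (AA) bit nor its pending-removal (PP) bit is set, which is what folding the PP nibble onto the AA nibble checks. The following self-contained C sketch replays that selection logic on hypothetical upper-byte values; it is illustrative only and leaves out the _refcount update and the mm_context_t::lock protection the kernel code needed.

#include <stdio.h>

/* AA bits: 0x01 = lower 2K allocated, 0x02 = upper 2K allocated.
 * PP bits: 0x10 = lower 2K pending removal, 0x20 = upper 2K pending. */
static int pick_fragment(unsigned int upper_byte)
{
	/* Fold pending bits onto allocation bits: a half counts as busy
	 * if it is still allocated or not yet released by the TLB/RCU path. */
	unsigned int mask = (upper_byte | (upper_byte >> 4)) & 0x03U;

	if (mask == 0x03U)
		return -1;	/* no reusable half on this page */
	return mask & 1;	/* 0: take the lower 2K, 1: take the upper 2K */
}

int main(void)
{
	printf("%d\n", pick_fragment(0x01U));	/* lower allocated -> 1, use upper */
	printf("%d\n", pick_fragment(0x12U));	/* upper allocated, lower pending -> -1 */
	printf("%d\n", pick_fragment(0x30U));	/* both halves pending -> -1 */
	return 0;
}

For instance, 0x12 (upper half allocated, lower half pending removal) is treated as fully busy; reusing the lower half at that point would later produce one of the "impossible" values such as 0x13 that the removed comment warns about.
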
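
After the change, the only thing the TLB batching path still has to encode about a queued table is whether __tlb_remove_table() must run the pte destructor, so page_table_free_rcu() tags the pointer with the low bit 0x01 while pmd/pud/p4d tables are passed untagged. A rough userspace sketch of that tagging scheme follows; the names, the 16-byte alignment, and the printf() calls standing in for the real freeing are all made up for illustration.

#include <stdint.h>
#include <stdio.h>

#define PTE_TABLE_TAG	0x01UL	/* low pointer bit: pte table, run dtor */

/* Illustrative counterpart of __tlb_remove_table() after the patch. */
static void remove_table(void *_table)
{
	uintptr_t mask = (uintptr_t)_table & PTE_TABLE_TAG;
	void *table = (void *)((uintptr_t)_table ^ mask);

	if (!mask) {
		printf("%p: pmd/pud/p4d, just free the page\n", table);
		return;
	}
	printf("%p: pte table, run the dtor, then free the page\n", table);
}

int main(void)
{
	/* Real page tables are page aligned, so bit 0 is free for the tag;
	 * 16-byte alignment is enough for this demonstration. */
	static unsigned long pte_table[512] __attribute__((aligned(16)));
	static unsigned long pmd_table[512] __attribute__((aligned(16)));

	/* What page_table_free_rcu() now does: tag pte tables... */
	remove_table((void *)((uintptr_t)pte_table | PTE_TABLE_TAG));
	/* ...while pmd/pud/p4d tables arrive untagged. */
	remove_table(pmd_table);
	return 0;
}
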