
Commit d08d4e7

Alexander Gordeev authored and Vasily Gorbik committed
s390/mm: use full 4KB page for 2KB PTE
Cease using 4KB pages to host two 2KB PTEs. That greatly simplifies the memory management code at the expense of page tables memory footprint.

Instead of two PTEs per 4KB page use only upper half of the parent page for a single PTE. With that the list of half-used pages pgtable_list becomes unneeded.

Further, the upper byte of the parent page _refcount counter does not need to be used for fragments tracking and could be left alone.

Commit 8211dad ("s390: add pte_free_defer() for pgtables sharing page") introduced the use of the PageActive flag to coordinate a deferred free with 2KB page table fragments tracking. Since there is no tracking anymore, there is no need for the PageActive flag.

Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
1 parent a51324c commit d08d4e7

File tree

4 files changed: +31, -262 lines

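The commit message above trades memory for simplicity: an s390 PTE table has 256 entries of 8 bytes each, i.e. 2KB, so two tables used to share one 4KB parent page, while after this change every PTE table occupies its own 4KB page. A minimal sketch of that footprint arithmetic (standalone user-space C; the constants below only mirror the s390 values of PTRS_PER_PTE, sizeof(pte_t) and PAGE_SIZE and are not the kernel macros):

#include <stdio.h>

/* Illustrative constants mirroring the s390 values; not the kernel macros. */
#define PTRS_PER_PTE	256
#define PTE_SIZE	8		/* sizeof(pte_t) on s390 */
#define PAGE_SIZE	4096

int main(void)
{
	unsigned int pte_table_bytes = PTRS_PER_PTE * PTE_SIZE;	/* 2048 */

	/* Old scheme: two 2KB PTE tables could share one 4KB parent page. */
	printf("old: %u PTE tables per 4KB page\n", PAGE_SIZE / pte_table_bytes);
	/* New scheme: one PTE table per 4KB page; the second 2KB half is no
	 * longer handed out as another PTE table. */
	printf("new: 1 PTE table per 4KB page, %u bytes not used for a second table\n",
	       PAGE_SIZE - pte_table_bytes);
	return 0;
}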

arch/s390/include/asm/mmu.h

Lines changed: 0 additions & 2 deletions

@@ -11,7 +11,6 @@ typedef struct {
 	cpumask_t cpu_attach_mask;
 	atomic_t flush_count;
 	unsigned int flush_mm;
-	struct list_head pgtable_list;
 	struct list_head gmap_list;
 	unsigned long gmap_asce;
 	unsigned long asce;
@@ -39,7 +38,6 @@ typedef struct {
 
 #define INIT_MM_CONTEXT(name) \
 	.context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \
-	.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
 	.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
 
 #endif

arch/s390/include/asm/mmu_context.h

Lines changed: 0 additions & 1 deletion

@@ -22,7 +22,6 @@ static inline int init_new_context(struct task_struct *tsk,
 	unsigned long asce_type, init_entry;
 
 	spin_lock_init(&mm->context.lock);
-	INIT_LIST_HEAD(&mm->context.pgtable_list);
 	INIT_LIST_HEAD(&mm->context.gmap_list);
 	cpumask_clear(&mm->context.cpu_attach_mask);
 	atomic_set(&mm->context.flush_count, 0);

arch/s390/include/asm/tlb.h

Lines changed: 0 additions & 5 deletions

@@ -69,11 +69,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
 	tlb->mm->context.flush_mm = 1;
 	tlb->freed_tables = 1;
 	tlb->cleared_pmds = 1;
-	/*
-	 * page_table_free_rcu takes care of the allocation bit masks
-	 * of the 2K table fragments in the 4K page table page,
-	 * then calls tlb_remove_table.
-	 */
 	page_table_free_rcu(tlb, (unsigned long *) pte, address);
 }
 
arch/s390/mm/pgalloc.c

Lines changed: 31 additions & 254 deletions
@@ -133,11 +133,6 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
 	return -ENOMEM;
 }
 
-static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
-{
-	return atomic_fetch_xor(bits, v) ^ bits;
-}
-
 #ifdef CONFIG_PGSTE
 
 struct page *page_table_alloc_pgste(struct mm_struct *mm)
@@ -162,303 +157,85 @@ void page_table_free_pgste(struct page *page)
 
 #endif /* CONFIG_PGSTE */
 
-/*
- * A 2KB-pgtable is either upper or lower half of a normal page.
- * The second half of the page may be unused or used as another
- * 2KB-pgtable.
- *
- * Whenever possible the parent page for a new 2KB-pgtable is picked
- * from the list of partially allocated pages mm_context_t::pgtable_list.
- * In case the list is empty a new parent page is allocated and added to
- * the list.
- *
- * When a parent page gets fully allocated it contains 2KB-pgtables in both
- * upper and lower halves and is removed from mm_context_t::pgtable_list.
- *
- * When 2KB-pgtable is freed from to fully allocated parent page that
- * page turns partially allocated and added to mm_context_t::pgtable_list.
- *
- * If 2KB-pgtable is freed from the partially allocated parent page that
- * page turns unused and gets removed from mm_context_t::pgtable_list.
- * Furthermore, the unused parent page is released.
- *
- * As follows from the above, no unallocated or fully allocated parent
- * pages are contained in mm_context_t::pgtable_list.
- *
- * The upper byte (bits 24-31) of the parent page _refcount is used
- * for tracking contained 2KB-pgtables and has the following format:
- *
- *   PP  AA
- * 01234567    upper byte (bits 24-31) of struct page::_refcount
- *   ||  ||
- *   ||  |+--- upper 2KB-pgtable is allocated
- *   ||  +---- lower 2KB-pgtable is allocated
- *   |+------- upper 2KB-pgtable is pending for removal
- *   +-------- lower 2KB-pgtable is pending for removal
- *
- * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
- * using _refcount is possible).
- *
- * When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
- * The parent page is either:
- *   - added to mm_context_t::pgtable_list in case the second half of the
- *     parent page is still unallocated;
- *   - removed from mm_context_t::pgtable_list in case both hales of the
- *     parent page are allocated;
- * These operations are protected with mm_context_t::lock.
- *
- * When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
- * and the corresponding PP bit is set to 1 in a single atomic operation.
- * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
- * exclusive and may never be both set to 1!
- * The parent page is either:
- *   - added to mm_context_t::pgtable_list in case the second half of the
- *     parent page is still allocated;
- *   - removed from mm_context_t::pgtable_list in case the second half of
- *     the parent page is unallocated;
- * These operations are protected with mm_context_t::lock.
- *
- * It is important to understand that mm_context_t::lock only protects
- * mm_context_t::pgtable_list and AA bits, but not the parent page itself
- * and PP bits.
- *
- * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
- * while both AA bits and the second PP bit are already unset. Then the
- * parent page does not contain any 2KB-pgtable fragment anymore, and it has
- * also been removed from mm_context_t::pgtable_list. It is safe to release
- * the page therefore.
- *
- * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
- * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
- * while the PP bits are never used, nor such a page is added to or removed
- * from mm_context_t::pgtable_list.
- *
- * pte_free_defer() overrides those rules: it takes the page off pgtable_list,
- * and prevents both 2K fragments from being reused. pte_free_defer() has to
- * guarantee that its pgtable cannot be reused before the RCU grace period
- * has elapsed (which page_table_free_rcu() does not actually guarantee).
- * But for simplicity, because page->rcu_head overlays page->lru, and because
- * the RCU callback might not be called before the mm_context_t has been freed,
- * pte_free_defer() in this implementation prevents both fragments from being
- * reused, and delays making the call to RCU until both fragments are freed.
- */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
-	unsigned long *table;
 	struct ptdesc *ptdesc;
-	unsigned int mask, bit;
-
-	/* Try to get a fragment of a 4K page as a 2K page table */
-	if (!mm_alloc_pgste(mm)) {
-		table = NULL;
-		spin_lock_bh(&mm->context.lock);
-		if (!list_empty(&mm->context.pgtable_list)) {
-			ptdesc = list_first_entry(&mm->context.pgtable_list,
-						  struct ptdesc, pt_list);
-			mask = atomic_read(&ptdesc->_refcount) >> 24;
-			/*
-			 * The pending removal bits must also be checked.
-			 * Failure to do so might lead to an impossible
-			 * value of (i.e 0x13 or 0x23) written to _refcount.
-			 * Such values violate the assumption that pending and
-			 * allocation bits are mutually exclusive, and the rest
-			 * of the code unrails as result. That could lead to
-			 * a whole bunch of races and corruptions.
-			 */
-			mask = (mask | (mask >> 4)) & 0x03U;
-			if (mask != 0x03U) {
-				table = (unsigned long *) ptdesc_to_virt(ptdesc);
-				bit = mask & 1;		/* =1 -> second 2K */
-				if (bit)
-					table += PTRS_PER_PTE;
-				atomic_xor_bits(&ptdesc->_refcount,
-						0x01U << (bit + 24));
-				list_del_init(&ptdesc->pt_list);
-			}
-		}
-		spin_unlock_bh(&mm->context.lock);
-		if (table)
-			return table;
-	}
-	/* Allocate a fresh page */
+	unsigned long *table;
+
 	ptdesc = pagetable_alloc(GFP_KERNEL, 0);
 	if (!ptdesc)
 		return NULL;
 	if (!pagetable_pte_ctor(ptdesc)) {
 		pagetable_free(ptdesc);
 		return NULL;
 	}
-	/* Initialize page table */
 	table = ptdesc_to_virt(ptdesc);
 	__arch_set_page_dat(table, 1);
-	if (mm_alloc_pgste(mm)) {
-		/* Return 4K page table with PGSTEs */
-		INIT_LIST_HEAD(&ptdesc->pt_list);
-		atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
-		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
-	} else {
-		/* Return the first 2K fragment of the page */
-		atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
-		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
-		spin_lock_bh(&mm->context.lock);
-		list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
-		spin_unlock_bh(&mm->context.lock);
-	}
+	/* pt_list is used by gmap only */
+	INIT_LIST_HEAD(&ptdesc->pt_list);
+	memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+	memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 	return table;
 }
 
-static void page_table_release_check(struct page *page, void *table,
-				     unsigned int half, unsigned int mask)
-{
-	char msg[128];
-
-	if (!IS_ENABLED(CONFIG_DEBUG_VM))
-		return;
-	if (!mask && list_empty(&page->lru))
-		return;
-	snprintf(msg, sizeof(msg),
-		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
-		 table, half, mask);
-	dump_page(page, msg);
-}
-
-static void pte_free_now(struct rcu_head *head)
+static void pagetable_pte_dtor_free(struct ptdesc *ptdesc)
 {
-	struct ptdesc *ptdesc;
-
-	ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
 	pagetable_pte_dtor(ptdesc);
 	pagetable_free(ptdesc);
 }
 
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
-	unsigned int mask, bit, half;
 	struct ptdesc *ptdesc = virt_to_ptdesc(table);
 
-	if (!mm_alloc_pgste(mm)) {
-		/* Free 2K page table fragment of a 4K page */
-		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
-		spin_lock_bh(&mm->context.lock);
-		/*
-		 * Mark the page for delayed release. The actual release
-		 * will happen outside of the critical section from this
-		 * function or from __tlb_remove_table()
-		 */
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
-		mask >>= 24;
-		if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
-			/*
-			 * Other half is allocated, and neither half has had
-			 * its free deferred: add page to head of list, to make
-			 * this freed half available for immediate reuse.
-			 */
-			list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
-		} else {
-			/* If page is on list, now remove it. */
-			list_del_init(&ptdesc->pt_list);
-		}
-		spin_unlock_bh(&mm->context.lock);
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
-		mask >>= 24;
-		if (mask != 0x00U)
-			return;
-		half = 0x01U << bit;
-	} else {
-		half = 0x03U;
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		mask >>= 24;
-	}
-
-	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
-	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
-		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
-	else
-		pte_free_now(&ptdesc->pt_rcu_head);
+	pagetable_pte_dtor_free(ptdesc);
 }
 
 void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 			 unsigned long vmaddr)
 {
 	struct mm_struct *mm;
-	unsigned int bit, mask;
-	struct ptdesc *ptdesc = virt_to_ptdesc(table);
 
 	mm = tlb->mm;
-	if (mm_alloc_pgste(mm)) {
+	if (mm_alloc_pgste(mm))
 		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) ((unsigned long)table | 0x03U);
-		tlb_remove_ptdesc(tlb, table);
-		return;
-	}
-	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
-	spin_lock_bh(&mm->context.lock);
-	/*
-	 * Mark the page for delayed release. The actual release will happen
-	 * outside of the critical section from __tlb_remove_table() or from
-	 * page_table_free()
-	 */
-	mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
-	mask >>= 24;
-	if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
-		/*
-		 * Other half is allocated, and neither half has had
-		 * its free deferred: add page to end of list, to make
-		 * this freed half available for reuse once its pending
-		 * bit has been cleared by __tlb_remove_table().
-		 */
-		list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
-	} else {
-		/* If page is on list, now remove it. */
-		list_del_init(&ptdesc->pt_list);
-	}
-	spin_unlock_bh(&mm->context.lock);
-	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
+	table = (unsigned long *)((unsigned long)table | 0x01U);
 	tlb_remove_ptdesc(tlb, table);
 }
 
 void __tlb_remove_table(void *_table)
 {
-	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
-	void *table = (void *)((unsigned long) _table ^ mask);
-	struct ptdesc *ptdesc = virt_to_ptdesc(table);
-
-	switch (half) {
-	case 0x00U:	/* pmd, pud, or p4d */
+	struct ptdesc *ptdesc;
+	unsigned int mask;
+	void *table;
+
+	mask = (unsigned long)_table & 0x01U;
+	table = (void *)((unsigned long)_table ^ mask);
+	ptdesc = virt_to_ptdesc(table);
+	if (!mask) {
+		/* pmd, pud, or p4d */
 		pagetable_free(ptdesc);
 		return;
-	case 0x01U:	/* lower 2K of a 4K page table */
-	case 0x02U:	/* higher 2K of a 4K page table */
-		mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
-		mask >>= 24;
-		if (mask != 0x00U)
-			return;
-		break;
-	case 0x03U:	/* 4K page table with pgstes */
-		mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
-		mask >>= 24;
-		break;
 	}
-
-	page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
-	if (folio_test_clear_active(ptdesc_folio(ptdesc)))
-		call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
-	else
-		pte_free_now(&ptdesc->pt_rcu_head);
+	pagetable_pte_dtor_free(ptdesc);
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static void pte_free_now(struct rcu_head *head)
+{
+	struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
+
+	pagetable_pte_dtor_free(ptdesc);
+}
+
 void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
 {
-	struct page *page;
+	struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
 
-	page = virt_to_page(pgtable);
-	SetPageActive(page);
-	page_table_free(mm, (unsigned long *)pgtable);
+	call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
 	/*
-	 * page_table_free() does not do the pgste gmap_unlink() which
-	 * page_table_free_rcu() does: warn us if pgste ever reaches here.
+	 * THPs are not allowed for KVM guests. Warn if pgste ever reaches here.
+	 * Turn to the generic pte_free_defer() version once gmap is removed.
 	 */
 	WARN_ON_ONCE(mm_has_pgste(mm));
 }
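After this rewrite, page_table_free_rcu() and __tlb_remove_table() keep only a single low-bit tag on the deferred pointer: bit 0x01 marks a PTE table that still needs its destructor, while an untagged pointer is a CRST table (pmd, pud or p4d) that is freed directly. A small user-space sketch of that tagging convention, with hypothetical stand-ins for the free helpers and table arrays rather than the kernel API:

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the kernel helpers used in the diff above. */
static void free_crst_table(void *table)     { printf("free crst table %p\n", table); }
static void dtor_free_pte_table(void *table) { printf("dtor+free pte table %p\n", table); }

/* Tag a PTE table pointer before deferring it, as page_table_free_rcu() does. */
static void *tag_pte_table(void *table)
{
	return (void *)((uintptr_t)table | 0x01U);
}

/* Untag and dispatch, mirroring the structure of __tlb_remove_table(). */
static void remove_table(void *_table)
{
	uintptr_t mask = (uintptr_t)_table & 0x01U;
	void *table = (void *)((uintptr_t)_table ^ mask);

	if (!mask) {
		/* untagged: pmd, pud, or p4d */
		free_crst_table(table);
		return;
	}
	/* tagged: PTE table, run the dtor before freeing */
	dtor_free_pte_table(table);
}

int main(void)
{
	/* Page-aligned buffers stand in for tables, so their low bits are free for the tag. */
	static uint64_t pte_table[512] __attribute__((aligned(4096)));
	static uint64_t crst_table[2048] __attribute__((aligned(4096)));

	remove_table(tag_pte_table(pte_table));
	remove_table(crst_table);
	return 0;
}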
