@@ -124,6 +124,51 @@ static u64 get_cc_mask(void)
 	return BIT_ULL(gpa_width - 1);
 }
 
+/*
+ * The TDX module spec states that #VE may be injected for a limited set of
+ * reasons:
+ *
+ *  - Emulation of the architectural #VE injection on EPT violation;
+ *
+ *  - As a result of guest TD execution of a disallowed instruction,
+ *    a disallowed MSR access, or CPUID virtualization;
+ *
+ *  - A notification to the guest TD about anomalous behavior;
+ *
+ * The last one is opt-in and is not used by the kernel.
+ *
+ * The Intel Software Developer's Manual describes cases when the instruction
+ * length field can be used in the section titled "Information for VM Exits
+ * Due to Instruction Execution".
+ *
+ * For TDX, it ultimately means GET_VEINFO provides reliable instruction length
+ * information if #VE occurred due to instruction execution, but not for EPT
+ * violations.
+ */
+static int ve_instr_len(struct ve_info *ve)
+{
+	switch (ve->exit_reason) {
+	case EXIT_REASON_HLT:
+	case EXIT_REASON_MSR_READ:
+	case EXIT_REASON_MSR_WRITE:
+	case EXIT_REASON_CPUID:
+	case EXIT_REASON_IO_INSTRUCTION:
+		/* It is safe to use ve->instr_len for #VE due to instructions */
+		return ve->instr_len;
+	case EXIT_REASON_EPT_VIOLATION:
+		/*
+		 * For EPT violations, ve->instr_len is not defined. For those,
+		 * the kernel must decode instructions manually and should not
+		 * be using this function.
+		 */
+		WARN_ONCE(1, "ve->instr_len is not defined for EPT violations");
+		return 0;
+	default:
+		WARN_ONCE(1, "Unexpected #VE-type: %lld\n", ve->exit_reason);
+		return ve->instr_len;
+	}
+}
+
 static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
 {
 	struct tdx_hypercall_args args = {
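
The helper above fixes the convention for everything that follows: each #VE handler now returns a negative errno when emulation fails, or the number of bytes to advance RIP on success. A minimal sketch of the consumer side (the real version lands in tdx_handle_virt_exception() in the last hunk):

	int insn_len = virt_exception_kernel(regs, ve);	/* length or -errno */

	if (insn_len < 0)
		return false;		/* unhandled; the #VE fault path takes over */
	regs->ip += insn_len;		/* skip the emulated instruction */
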
@@ -147,7 +192,7 @@ static u64 __cpuidle __halt(const bool irq_disabled, const bool do_sti)
 	return __tdx_hypercall(&args, do_sti ? TDX_HCALL_ISSUE_STI : 0);
 }
 
-static bool handle_halt(void)
+static int handle_halt(struct ve_info *ve)
 {
 	/*
 	 * Since non safe halt is mainly used in CPU offlining
@@ -158,9 +203,9 @@ static bool handle_halt(void)
 	const bool do_sti = false;
 
 	if (__halt(irq_disabled, do_sti))
-		return false;
+		return -EIO;
 
-	return true;
+	return ve_instr_len(ve);
 }
 
 void __cpuidle tdx_safe_halt(void)
@@ -180,7 +225,7 @@ void __cpuidle tdx_safe_halt(void)
 		WARN_ONCE(1, "HLT instruction emulation failed\n");
 }
 
-static bool read_msr(struct pt_regs *regs)
+static int read_msr(struct pt_regs *regs, struct ve_info *ve)
 {
 	struct tdx_hypercall_args args = {
 		.r10 = TDX_HYPERCALL_STANDARD,
@@ -194,14 +239,14 @@ static bool read_msr(struct pt_regs *regs)
 	 * (GHCI), section titled "TDG.VP.VMCALL<Instruction.RDMSR>".
 	 */
 	if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
-		return false;
+		return -EIO;
 
 	regs->ax = lower_32_bits(args.r11);
 	regs->dx = upper_32_bits(args.r11);
-	return true;
+	return ve_instr_len(ve);
 }
 
-static bool write_msr(struct pt_regs *regs)
+static int write_msr(struct pt_regs *regs, struct ve_info *ve)
 {
 	struct tdx_hypercall_args args = {
 		.r10 = TDX_HYPERCALL_STANDARD,
@@ -215,10 +260,13 @@ static bool write_msr(struct pt_regs *regs)
 	 * can be found in TDX Guest-Host-Communication Interface
 	 * (GHCI) section titled "TDG.VP.VMCALL<Instruction.WRMSR>".
 	 */
-	return !__tdx_hypercall(&args, 0);
+	if (__tdx_hypercall(&args, 0))
+		return -EIO;
+
+	return ve_instr_len(ve);
 }
 
-static bool handle_cpuid(struct pt_regs *regs)
+static int handle_cpuid(struct pt_regs *regs, struct ve_info *ve)
 {
 	struct tdx_hypercall_args args = {
 		.r10 = TDX_HYPERCALL_STANDARD,
@@ -236,7 +284,7 @@ static bool handle_cpuid(struct pt_regs *regs)
 	 */
 	if (regs->ax < 0x40000000 || regs->ax > 0x4FFFFFFF) {
 		regs->ax = regs->bx = regs->cx = regs->dx = 0;
-		return true;
+		return ve_instr_len(ve);
 	}
 
 	/*
@@ -245,7 +293,7 @@ static bool handle_cpuid(struct pt_regs *regs)
 	 * (GHCI), section titled "VP.VMCALL<Instruction.CPUID>".
 	 */
 	if (__tdx_hypercall(&args, TDX_HCALL_HAS_OUTPUT))
-		return false;
+		return -EIO;
 
 	/*
 	 * As per TDX GHCI CPUID ABI, r12-r15 registers contain contents of
@@ -257,7 +305,7 @@ static bool handle_cpuid(struct pt_regs *regs)
 	regs->cx = args.r14;
 	regs->dx = args.r15;
 
-	return true;
+	return ve_instr_len(ve);
 }
 
 static bool mmio_read(int size, unsigned long addr, unsigned long *val)
@@ -283,45 +331,60 @@ static bool mmio_write(int size, unsigned long addr, unsigned long val)
 					EPT_WRITE, addr, val);
 }
 
-static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
+static int handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 {
+	unsigned long *reg, val, vaddr;
 	char buffer[MAX_INSN_SIZE];
-	unsigned long *reg, val;
 	struct insn insn = {};
 	enum mmio_type mmio;
 	int size, extend_size;
 	u8 extend_val = 0;
 
 	/* Only in-kernel MMIO is supported */
 	if (WARN_ON_ONCE(user_mode(regs)))
-		return false;
+		return -EFAULT;
 
 	if (copy_from_kernel_nofault(buffer, (void *)regs->ip, MAX_INSN_SIZE))
-		return false;
+		return -EFAULT;
 
 	if (insn_decode(&insn, buffer, MAX_INSN_SIZE, INSN_MODE_64))
-		return false;
+		return -EINVAL;
 
 	mmio = insn_decode_mmio(&insn, &size);
 	if (WARN_ON_ONCE(mmio == MMIO_DECODE_FAILED))
-		return false;
+		return -EINVAL;
 
 	if (mmio != MMIO_WRITE_IMM && mmio != MMIO_MOVS) {
 		reg = insn_get_modrm_reg_ptr(&insn, regs);
 		if (!reg)
-			return false;
+			return -EINVAL;
 	}
 
-	ve->instr_len = insn.length;
+	/*
+	 * Reject EPT violation #VEs that split pages.
+	 *
+	 * MMIO accesses are supposed to be naturally aligned and therefore
+	 * never cross page boundaries. Seeing split page accesses indicates
+	 * a bug or a load_unaligned_zeropad() that stepped into an MMIO page.
+	 *
+	 * load_unaligned_zeropad() will recover using exception fixups.
+	 */
+	vaddr = (unsigned long)insn_get_addr_ref(&insn, regs);
+	if (vaddr / PAGE_SIZE != (vaddr + size - 1) / PAGE_SIZE)
+		return -EFAULT;
 
 	/* Handle writes first */
 	switch (mmio) {
 	case MMIO_WRITE:
 		memcpy(&val, reg, size);
-		return mmio_write(size, ve->gpa, val);
+		if (!mmio_write(size, ve->gpa, val))
+			return -EIO;
+		return insn.length;
 	case MMIO_WRITE_IMM:
 		val = insn.immediate.value;
-		return mmio_write(size, ve->gpa, val);
+		if (!mmio_write(size, ve->gpa, val))
+			return -EIO;
+		return insn.length;
 	case MMIO_READ:
 	case MMIO_READ_ZERO_EXTEND:
 	case MMIO_READ_SIGN_EXTEND:
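
Two details in the hunk above deserve a note. First, the split-page test is a page-index comparison; an equivalent formulation with page masks (a sketch, not patch text) is:

	/* Does [vaddr, vaddr + size) cross a page boundary? */
	if ((vaddr & PAGE_MASK) != ((vaddr + size - 1) & PAGE_MASK))
		return -EFAULT;

Second, the write paths return insn.length taken from the kernel's own instruction decode rather than calling ve_instr_len(): for EPT-violation #VEs, ve->instr_len is undefined, which is exactly the case ve_instr_len() warns about.
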
@@ -334,15 +397,15 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 		 * decoded or handled properly. It was likely not using io.h
 		 * helpers or accessed MMIO accidentally.
 		 */
-		return false;
+		return -EINVAL;
 	default:
 		WARN_ONCE(1, "Unknown insn_decode_mmio() decode value?");
-		return false;
+		return -EINVAL;
 	}
 
 	/* Handle reads */
 	if (!mmio_read(size, ve->gpa, &val))
-		return false;
+		return -EIO;
 
 	switch (mmio) {
 	case MMIO_READ:
@@ -364,13 +427,13 @@ static bool handle_mmio(struct pt_regs *regs, struct ve_info *ve)
 	default:
 		/* All other cases has to be covered with the first switch() */
 		WARN_ON_ONCE(1);
-		return false;
+		return -EINVAL;
 	}
 
 	if (extend_size)
 		memset(reg, extend_val, extend_size);
 	memcpy(reg, &val, size);
-	return true;
+	return insn.length;
 }
 
 static bool handle_in(struct pt_regs *regs, int size, int port)
@@ -421,23 +484,28 @@ static bool handle_out(struct pt_regs *regs, int size, int port)
  *
- * Return True on success or False on failure.
+ * Returns the instruction length on success, or -errno on failure.
  */
-static bool handle_io(struct pt_regs *regs, u32 exit_qual)
+static int handle_io(struct pt_regs *regs, struct ve_info *ve)
 {
+	u32 exit_qual = ve->exit_qual;
 	int size, port;
-	bool in;
+	bool in, ret;
 
 	if (VE_IS_IO_STRING(exit_qual))
-		return false;
+		return -EIO;
 
 	in = VE_IS_IO_IN(exit_qual);
 	size = VE_GET_IO_SIZE(exit_qual);
 	port = VE_GET_PORT_NUM(exit_qual);
 
 
 	if (in)
-		return handle_in(regs, size, port);
+		ret = handle_in(regs, size, port);
 	else
-		return handle_out(regs, size, port);
+		ret = handle_out(regs, size, port);
+	if (!ret)
+		return -EIO;
+
+	return ve_instr_len(ve);
 }
 
 /*
@@ -447,13 +515,19 @@ static bool handle_io(struct pt_regs *regs, u32 exit_qual)
 __init bool tdx_early_handle_ve(struct pt_regs *regs)
 {
 	struct ve_info ve;
+	int insn_len;
 
 	tdx_get_ve_info(&ve);
 
 	if (ve.exit_reason != EXIT_REASON_IO_INSTRUCTION)
 		return false;
 
-	return handle_io(regs, ve.exit_qual);
+	insn_len = handle_io(regs, &ve);
+	if (insn_len < 0)
+		return false;
+
+	regs->ip += insn_len;
+	return true;
 }
 
 void tdx_get_ve_info(struct ve_info *ve)
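
Note that tdx_early_handle_ve() open-codes the insn_len < 0 check and the regs->ip update: during early boot the #VE does not go through tdx_handle_virt_exception(), so the common epilogue added in the last hunk does not run here.
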
@@ -486,54 +560,65 @@ void tdx_get_ve_info(struct ve_info *ve)
 	ve->instr_info = upper_32_bits(out.r10);
 }
 
-/* Handle the user initiated #VE */
-static bool virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
+/*
+ * Handle the user-initiated #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_user(struct pt_regs *regs, struct ve_info *ve)
 {
 	switch (ve->exit_reason) {
 	case EXIT_REASON_CPUID:
-		return handle_cpuid(regs);
+		return handle_cpuid(regs, ve);
 	default:
 		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
-		return false;
+		return -EIO;
 	}
 }
 
-/* Handle the kernel #VE */
-static bool virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
+/*
+ * Handle the kernel #VE.
+ *
+ * On success, returns the number of bytes RIP should be incremented (>=0)
+ * or -errno on error.
+ */
+static int virt_exception_kernel(struct pt_regs *regs, struct ve_info *ve)
 {
 	switch (ve->exit_reason) {
 	case EXIT_REASON_HLT:
-		return handle_halt();
+		return handle_halt(ve);
 	case EXIT_REASON_MSR_READ:
-		return read_msr(regs);
+		return read_msr(regs, ve);
 	case EXIT_REASON_MSR_WRITE:
-		return write_msr(regs);
+		return write_msr(regs, ve);
 	case EXIT_REASON_CPUID:
-		return handle_cpuid(regs);
+		return handle_cpuid(regs, ve);
 	case EXIT_REASON_EPT_VIOLATION:
 		return handle_mmio(regs, ve);
 	case EXIT_REASON_IO_INSTRUCTION:
-		return handle_io(regs, ve->exit_qual);
+		return handle_io(regs, ve);
 	default:
 		pr_warn("Unexpected #VE: %lld\n", ve->exit_reason);
-		return false;
+		return -EIO;
 	}
 }
 
 bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve)
 {
-	bool ret;
+	int insn_len;
 
 	if (user_mode(regs))
-		ret = virt_exception_user(regs, ve);
+		insn_len = virt_exception_user(regs, ve);
 	else
-		ret = virt_exception_kernel(regs, ve);
+		insn_len = virt_exception_kernel(regs, ve);
+	if (insn_len < 0)
+		return false;
 
 	/* After successful #VE handling, move the IP */
-	if (ret)
-		regs->ip += ve->instr_len;
+	regs->ip += insn_len;
 
-	return ret;
+	return true;
 }
 
 static bool tdx_tlb_flush_required(bool private)
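
For context (unchanged by this patch), a false return still causes the #VE trap handler in traps.c to raise a fault, roughly:

	if (!tdx_handle_virt_exception(regs, &ve))
		ve_raise_fault(regs, 0);

so a handler failure still ends in a fault; what changed is that a successful handler now advances RIP by a length it can vouch for, instead of trusting ve->instr_len unconditionally.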