Skip to content

Commit c7aae8f

Browse files
authored
Change how the key ring cache is updated (#54675)
* Change how the key ring cache is updated I wanted to address two concerns: 1. If there is a cached key ring, but it is expired, the first thread to discover this will _synchronously_ refresh the cache - other threads will continue to use the old value. 2. If a key ring refresh is forced, it will always hit the backing repository, even if several threads want a refresh at the same time. With this change, _all_ key ring updates are computed on a thread-pool thread and callers block exactly when there is no cached key ring for them to fall back on (first run, for example) or if they are forcing a refresh (an in-flight refresh is considered satisfactory). Moving this work to a background thread will give us more room to be generous with retries when (e.g.) Azure KeyVault is unreachable or a file is locked. The old behavior can be restored using the appcontext switch `Microsoft.AspNetCore.DataProtection.KeyManagement.DisableAsyncKeyRingUpdate`. This is a safety valve and not part of configuration - we'll remove the switch and the old code path in the next release if nothing blows up. Micro-benchmarking (on 9.0 on x64) shows that the new version has comparable performance in the common case of finding an unexpired key ring in the cache.
1 parent 8321d83 commit c7aae8f

File tree

4 files changed

+310
-44
lines changed

4 files changed

+310
-44
lines changed

src/DataProtection/DataProtection/src/Error.cs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,4 +97,9 @@ public static InvalidOperationException KeyRingProvider_DefaultKeyRevoked(Guid i
9797
var message = string.Format(CultureInfo.CurrentCulture, Resources.KeyRingProvider_DefaultKeyRevoked, id);
9898
return new InvalidOperationException(message);
9999
}
100+
101+
public static InvalidOperationException KeyRingProvider_RefreshFailedOnOtherThread()
102+
{
103+
return new InvalidOperationException(Resources.KeyRingProvider_RefreshFailedOnOtherThread);
104+
}
100105
}

src/DataProtection/DataProtection/src/KeyManagement/KeyRingProvider.cs

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
using System.Diagnostics;
77
using System.Diagnostics.CodeAnalysis;
88
using System.Threading;
9+
using System.Threading.Tasks;
910
using Microsoft.AspNetCore.Cryptography;
1011
using Microsoft.AspNetCore.DataProtection.KeyManagement.Internal;
1112
using Microsoft.Extensions.Logging;
@@ -16,13 +17,17 @@ namespace Microsoft.AspNetCore.DataProtection.KeyManagement;
1617

1718
internal sealed class KeyRingProvider : ICacheableKeyRingProvider, IKeyRingProvider
1819
{
20+
private const string DisableAsyncKeyRingUpdateSwitchKey = "Microsoft.AspNetCore.DataProtection.KeyManagement.DisableAsyncKeyRingUpdate";
21+
1922
private CacheableKeyRing? _cacheableKeyRing;
2023
private readonly object _cacheableKeyRingLockObj = new object();
24+
private Task<CacheableKeyRing>? _cacheableKeyRingTask; // Also covered by _cacheableKeyRingLockObj
2125
private readonly IDefaultKeyResolver _defaultKeyResolver;
2226
private readonly bool _autoGenerateKeys;
2327
private readonly TimeSpan _newKeyLifetime;
2428
private readonly IKeyManager _keyManager;
2529
private readonly ILogger _logger;
30+
private readonly bool _disableAsyncKeyRingUpdate;
2631

2732
public KeyRingProvider(
2833
IKeyManager keyManager,
@@ -52,6 +57,8 @@ public KeyRingProvider(
5257

5358
// We will automatically refresh any unknown keys for 2 minutes see https://github.com/dotnet/aspnetcore/issues/3975
5459
AutoRefreshWindowEnd = DateTime.UtcNow.AddMinutes(2);
60+
61+
AppContext.TryGetSwitch(DisableAsyncKeyRingUpdateSwitchKey, out _disableAsyncKeyRingUpdate);
5562
}
5663

5764
// for testing
@@ -195,6 +202,19 @@ internal IKeyRing RefreshCurrentKeyRing()
195202

196203
internal IKeyRing GetCurrentKeyRingCore(DateTime utcNow, bool forceRefresh = false)
197204
{
205+
// We're making a big, scary change to the way this cache is updated: now threads
206+
// only block during computation of the new value if no old value is available
207+
// (or if they force it). We'll leave the old code in place, behind an appcontext
208+
// switch in case it turns out to have unwelcome emergent behavior.
209+
// TODO: Delete one of these codepaths in 10.0.
210+
return _disableAsyncKeyRingUpdate
211+
? GetCurrentKeyRingCoreOld(utcNow, forceRefresh)
212+
: GetCurrentKeyRingCoreNew(utcNow, forceRefresh);
213+
}
214+
215+
private IKeyRing GetCurrentKeyRingCoreOld(DateTime utcNow, bool forceRefresh)
216+
{
217+
// DateTimes are only meaningfully comparable if they share the same Kind - require Utc for consistency
198218
Debug.Assert(utcNow.Kind == DateTimeKind.Utc);
199219

200220
// Can we return the cached keyring to the caller?
@@ -292,6 +312,149 @@ internal IKeyRing GetCurrentKeyRingCore(DateTime utcNow, bool forceRefresh = fal
292312
}
293313
}
294314

315+
private IKeyRing GetCurrentKeyRingCoreNew(DateTime utcNow, bool forceRefresh)
316+
{
317+
// DateTimes are only meaningfully comparable if they share the same Kind - require Utc for consistency
318+
Debug.Assert(utcNow.Kind == DateTimeKind.Utc);
319+
320+
// The 99% and perf-critical case is that there is no task in-flight and the cached
321+
// key ring is valid. We do what we can to avoid unnecessary overhead (locking,
322+
// context switching, etc) on this path.
323+
324+
CacheableKeyRing? existingCacheableKeyRing = null;
325+
326+
// Can we return the cached keyring to the caller?
327+
if (!forceRefresh)
328+
{
329+
existingCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
330+
if (CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow))
331+
{
332+
return existingCacheableKeyRing.KeyRing;
333+
}
334+
}
335+
336+
// If work kicked off by a previous caller has completed, we should use those results
337+
// We check this outside the lock to reduce contention in the common case (no task).
338+
// Logically, it would probably make more sense to check this before checking whether
339+
// the cache is valid - there could be a newer value available - but keeping that path
340+
// fast is more important. The next forced refresh or cache expiration will cause the
341+
// new value to be picked up.
342+
var existingTask = Volatile.Read(ref _cacheableKeyRingTask);
343+
if (existingTask is not null && existingTask.IsCompleted)
344+
{
345+
var taskKeyRing = GetKeyRingFromCompletedTask(existingTask, utcNow); // Throws if the task failed
346+
if (taskKeyRing is not null)
347+
{
348+
return taskKeyRing;
349+
}
350+
}
351+
352+
// The cached keyring hasn't been created or must be refreshed. We'll allow one thread to
353+
// create a task to update the keyring, and all threads will continue to use the existing cached
354+
// keyring while the first thread performs the update. There is an exception: if there
355+
// is no usable existing cached keyring, all callers must block until the keyring exists.
356+
lock (_cacheableKeyRingLockObj)
357+
{
358+
// Update existingTask, in case we're not the first to acquire the lock
359+
existingTask = Volatile.Read(ref _cacheableKeyRingTask);
360+
if (existingTask is null)
361+
{
362+
// If there's no existing task, make one now
363+
// PERF: Closing over utcNow substantially slows down the fast case (valid cache) in micro-benchmarks
364+
// (closing over `this` for CacheableKeyRingProvider doesn't seem impactful)
365+
existingTask = Task.Factory.StartNew(
366+
utcNow => CacheableKeyRingProvider.GetCacheableKeyRing((DateTime)utcNow!),
367+
utcNow,
368+
CancellationToken.None, // GetKeyRingFromCompletedTask will need to react if this becomes cancellable
369+
TaskCreationOptions.DenyChildAttach,
370+
TaskScheduler.Default);
371+
Volatile.Write(ref _cacheableKeyRingTask, existingTask);
372+
}
373+
}
374+
375+
if (existingCacheableKeyRing is not null)
376+
{
377+
Debug.Assert(!forceRefresh, "Read cached key ring even though forceRefresh is true");
378+
Debug.Assert(!CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow), "Should already have returned a valid cached key ring");
379+
return existingCacheableKeyRing.KeyRing;
380+
}
381+
382+
// Since there's no cached key ring we can return, we have to wait. It's not ideal to wait for a task we
383+
// just scheduled, but it makes the code a lot simpler (compared to having a separate, synchronous code path).
384+
// Cleverness: swallow any exceptions - they'll be surfaced by GetKeyRingFromCompletedTask, if appropriate.
385+
existingTask
386+
.ContinueWith(static _ => { }, TaskScheduler.Default)
387+
.Wait();
388+
389+
var newKeyRing = GetKeyRingFromCompletedTask(existingTask, utcNow); // Throws if the task failed (winning thread only)
390+
if (newKeyRing is null)
391+
{
392+
// Another thread won - check whether it cached a new key ring
393+
var newCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
394+
if (newCacheableKeyRing is null)
395+
{
396+
// There will have been a better exception from the winning thread
397+
throw Error.KeyRingProvider_RefreshFailedOnOtherThread();
398+
}
399+
400+
newKeyRing = newCacheableKeyRing.KeyRing;
401+
}
402+
403+
return newKeyRing;
404+
}
405+
406+
/// <summary>
407+
/// Returns null if another thread already processed the completed task.
408+
/// Otherwise, if the given completed task completed successfully, clears the task
409+
/// and either caches and returns the resulting key ring or throws, according to the
410+
/// successfulness of the task.
411+
/// </summary>
412+
private IKeyRing? GetKeyRingFromCompletedTask(Task<CacheableKeyRing> task, DateTime utcNow)
413+
{
414+
Debug.Assert(task.IsCompleted);
415+
416+
lock (_cacheableKeyRingLockObj)
417+
{
418+
// If the parameter doesn't match the field, another thread has already consumed the task (and it's reflected in _cacheableKeyRing)
419+
if (!ReferenceEquals(task, Volatile.Read(ref _cacheableKeyRingTask)))
420+
{
421+
return null;
422+
}
423+
424+
Volatile.Write(ref _cacheableKeyRingTask, null);
425+
426+
if (task.Status == TaskStatus.RanToCompletion)
427+
{
428+
var newCacheableKeyRing = task.Result;
429+
Volatile.Write(ref _cacheableKeyRing, newCacheableKeyRing);
430+
431+
// An unconsumed task result is considered to satisfy forceRefresh. One could quibble that this isn't really
432+
// a forced refresh, but we'll still return a key ring newer than the one the caller was dissatisfied with.
433+
return newCacheableKeyRing.KeyRing;
434+
}
435+
436+
Debug.Assert(!task.IsCanceled, "How did a task with no cancellation token get canceled?");
437+
Debug.Assert(task.Exception is not null, "Task should have either completed successfully or with an exception");
438+
var exception = task.Exception;
439+
440+
var existingCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
441+
if (existingCacheableKeyRing is not null && !CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow))
442+
{
443+
// If reading failed, we probably don't want to try again for a little bit, so slightly extend the
444+
// lifetime of the current cache entry
445+
Volatile.Write(ref _cacheableKeyRing, existingCacheableKeyRing.WithTemporaryExtendedLifetime(utcNow));
446+
447+
_logger.ErrorOccurredWhileRefreshingKeyRing(exception); // This one mentions the no-retry window
448+
}
449+
else
450+
{
451+
_logger.ErrorOccurredWhileReadingKeyRing(exception);
452+
}
453+
454+
throw exception.InnerExceptions.Count == 1 ? exception.InnerExceptions[0] : exception;
455+
}
456+
}
457+
295458
private static TimeSpan GetRefreshPeriodWithJitter(TimeSpan refreshPeriod)
296459
{
297460
// We'll fudge the refresh period up to -20% so that multiple applications don't try to

src/DataProtection/DataProtection/src/Resources.resx

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,9 @@
190190
<data name="KeyRingProvider_DefaultKeyRevoked" xml:space="preserve">
191191
<value>The key ring's default data protection key {0:B} has been revoked.</value>
192192
</data>
193+
<data name="KeyRingProvider_RefreshFailedOnOtherThread" xml:space="preserve">
194+
<value>Another thread failed to refresh the key ring.</value>
195+
</data>
193196
<data name="LifetimeMustNotBeNegative" xml:space="preserve">
194197
<value>{0} must not be negative.</value>
195198
</data>

0 commit comments

Comments
 (0)