Skip to content

Change how the key ring cache is updated #54675

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions src/DataProtection/DataProtection/src/Error.cs
Original file line number Diff line number Diff line change
Expand Up @@ -97,4 +97,9 @@ public static InvalidOperationException KeyRingProvider_DefaultKeyRevoked(Guid i
var message = string.Format(CultureInfo.CurrentCulture, Resources.KeyRingProvider_DefaultKeyRevoked, id);
return new InvalidOperationException(message);
}

public static InvalidOperationException KeyRingProvider_RefreshFailedOnOtherThread()
{
return new InvalidOperationException(Resources.KeyRingProvider_RefreshFailedOnOtherThread);
}
}
163 changes: 163 additions & 0 deletions src/DataProtection/DataProtection/src/KeyManagement/KeyRingProvider.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.AspNetCore.Cryptography;
using Microsoft.AspNetCore.DataProtection.KeyManagement.Internal;
using Microsoft.Extensions.Logging;
Expand All @@ -16,13 +17,17 @@ namespace Microsoft.AspNetCore.DataProtection.KeyManagement;

internal sealed class KeyRingProvider : ICacheableKeyRingProvider, IKeyRingProvider
{
private const string DisableAsyncKeyRingUpdateSwitchKey = "Microsoft.AspNetCore.DataProtection.KeyManagement.DisableAsyncKeyRingUpdate";

private CacheableKeyRing? _cacheableKeyRing;
private readonly object _cacheableKeyRingLockObj = new object();
private Task<CacheableKeyRing>? _cacheableKeyRingTask; // Also covered by _cacheableKeyRingLockObj
private readonly IDefaultKeyResolver _defaultKeyResolver;
private readonly bool _autoGenerateKeys;
private readonly TimeSpan _newKeyLifetime;
private readonly IKeyManager _keyManager;
private readonly ILogger _logger;
private readonly bool _disableAsyncKeyRingUpdate;

public KeyRingProvider(
IKeyManager keyManager,
Expand Down Expand Up @@ -52,6 +57,8 @@ public KeyRingProvider(

// We will automatically refresh any unknown keys for 2 minutes see https://github.com/dotnet/aspnetcore/issues/3975
AutoRefreshWindowEnd = DateTime.UtcNow.AddMinutes(2);

AppContext.TryGetSwitch(DisableAsyncKeyRingUpdateSwitchKey, out _disableAsyncKeyRingUpdate);
}

// for testing
Expand Down Expand Up @@ -195,6 +202,19 @@ internal IKeyRing RefreshCurrentKeyRing()

internal IKeyRing GetCurrentKeyRingCore(DateTime utcNow, bool forceRefresh = false)
{
// We're making a big, scary change to the way this cache is updated: now threads
// only block during computation of the new value if no old value is available
// (or if they force it). We'll leave the old code in place, behind an appcontext
// switch in case it turns out to have unwelcome emergent behavior.
// TODO: Delete one of these codepaths in 10.0.
return _disableAsyncKeyRingUpdate
? GetCurrentKeyRingCoreOld(utcNow, forceRefresh)
: GetCurrentKeyRingCoreNew(utcNow, forceRefresh);
}

private IKeyRing GetCurrentKeyRingCoreOld(DateTime utcNow, bool forceRefresh)
{
// DateTimes are only meaningfully comparable if they share the same Kind - require Utc for consistency
Debug.Assert(utcNow.Kind == DateTimeKind.Utc);

// Can we return the cached keyring to the caller?
Expand Down Expand Up @@ -292,6 +312,149 @@ internal IKeyRing GetCurrentKeyRingCore(DateTime utcNow, bool forceRefresh = fal
}
}

private IKeyRing GetCurrentKeyRingCoreNew(DateTime utcNow, bool forceRefresh)
{
// DateTimes are only meaningfully comparable if they share the same Kind - require Utc for consistency
Debug.Assert(utcNow.Kind == DateTimeKind.Utc);

// The 99% and perf-critical case is that there is no task in-flight and the cached
// key ring is valid. We do what we can to avoid unnecessary overhead (locking,
// context switching, etc) on this path.

CacheableKeyRing? existingCacheableKeyRing = null;

// Can we return the cached keyring to the caller?
if (!forceRefresh)
{
existingCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
if (CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow))
{
return existingCacheableKeyRing.KeyRing;
}
}

// If work kicked off by a previous caller has completed, we should use those results
// We check this outside the lock to reduce contention in the common case (no task).
// Logically, it would probably make more sense to check this before checking whether
// the cache is valid - there could be a newer value available - but keeping that path
// fast is more important. The next forced refresh or cache expiration will cause the
// new value to be picked up.
var existingTask = Volatile.Read(ref _cacheableKeyRingTask);
if (existingTask is not null && existingTask.IsCompleted)
{
var taskKeyRing = GetKeyRingFromCompletedTask(existingTask, utcNow); // Throws if the task failed
if (taskKeyRing is not null)
{
return taskKeyRing;
}
}

// The cached keyring hasn't been created or must be refreshed. We'll allow one thread to
// create a task to update the keyring, and all threads will continue to use the existing cached
// keyring while the first thread performs the update. There is an exception: if there
// is no usable existing cached keyring, all callers must block until the keyring exists.
lock (_cacheableKeyRingLockObj)
{
// Update existingTask, in case we're not the first to acquire the lock
existingTask = Volatile.Read(ref _cacheableKeyRingTask);
if (existingTask is null)
{
// If there's no existing task, make one now
// PERF: Closing over utcNow substantially slows down the fast case (valid cache) in micro-benchmarks
// (closing over `this` for CacheableKeyRingProvider doesn't seem impactful)
existingTask = Task.Factory.StartNew(
utcNow => CacheableKeyRingProvider.GetCacheableKeyRing((DateTime)utcNow!),
utcNow,
CancellationToken.None, // GetKeyRingFromCompletedTask will need to react if this becomes cancellable
TaskCreationOptions.DenyChildAttach,
TaskScheduler.Default);
Volatile.Write(ref _cacheableKeyRingTask, existingTask);
}
}

if (existingCacheableKeyRing is not null)
{
Debug.Assert(!forceRefresh, "Read cached key ring even though forceRefresh is true");
Debug.Assert(!CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow), "Should already have returned a valid cached key ring");
return existingCacheableKeyRing.KeyRing;
}

// Since there's no cached key ring we can return, we have to wait. It's not ideal to wait for a task we
// just scheduled, but it makes the code a lot simpler (compared to having a separate, synchronous code path).
// Cleverness: swallow any exceptions - they'll be surfaced by GetKeyRingFromCompletedTask, if appropriate.
existingTask
.ContinueWith(static _ => { }, TaskScheduler.Default)
.Wait();

var newKeyRing = GetKeyRingFromCompletedTask(existingTask, utcNow); // Throws if the task failed (winning thread only)
if (newKeyRing is null)
{
// Another thread won - check whether it cached a new key ring
var newCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
if (newCacheableKeyRing is null)
{
// There will have been a better exception from the winning thread
throw Error.KeyRingProvider_RefreshFailedOnOtherThread();
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The exception must have already happened at this point. Why can't we capture it and throw it from any thread that fails to retrieve the KeyRing due to the exception?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I didn't think it would be helpful to print the underlying exception once per affected task, even if we were to add a note that it was being re-reported from elsewhere.

Copy link
Member

@halter73 halter73 Apr 22, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I still think it's more helpful than throwing a generic exception. We don't have to log it each time.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As a compromise, I've attached existingTask.Exception as the inner exception.

}

newKeyRing = newCacheableKeyRing.KeyRing;
}

return newKeyRing;
}

/// <summary>
/// Returns null if another thread already processed the completed task.
/// Otherwise, if the given completed task completed successfully, clears the task
/// and either caches and returns the resulting key ring or throws, according to the
/// successfulness of the task.
/// </summary>
private IKeyRing? GetKeyRingFromCompletedTask(Task<CacheableKeyRing> task, DateTime utcNow)
{
Debug.Assert(task.IsCompleted);

lock (_cacheableKeyRingLockObj)
{
// If the parameter doesn't match the field, another thread has already consumed the task (and it's reflected in _cacheableKeyRing)
if (!ReferenceEquals(task, Volatile.Read(ref _cacheableKeyRingTask)))
{
return null;
}

Volatile.Write(ref _cacheableKeyRingTask, null);

if (task.Status == TaskStatus.RanToCompletion)
{
var newCacheableKeyRing = task.Result;
Volatile.Write(ref _cacheableKeyRing, newCacheableKeyRing);

// An unconsumed task result is considered to satisfy forceRefresh. One could quibble that this isn't really
// a forced refresh, but we'll still return a key ring newer than the one the caller was dissatisfied with.
return newCacheableKeyRing.KeyRing;
}

Debug.Assert(!task.IsCanceled, "How did a task with no cancellation token get canceled?");
Debug.Assert(task.Exception is not null, "Task should have either completed successfully or with an exception");
var exception = task.Exception;

var existingCacheableKeyRing = Volatile.Read(ref _cacheableKeyRing);
if (existingCacheableKeyRing is not null && !CacheableKeyRing.IsValid(existingCacheableKeyRing, utcNow))
{
// If reading failed, we probably don't want to try again for a little bit, so slightly extend the
// lifetime of the current cache entry
Volatile.Write(ref _cacheableKeyRing, existingCacheableKeyRing.WithTemporaryExtendedLifetime(utcNow));

_logger.ErrorOccurredWhileRefreshingKeyRing(exception); // This one mentions the no-retry window
}
else
{
_logger.ErrorOccurredWhileReadingKeyRing(exception);
}

throw exception.InnerExceptions.Count == 1 ? exception.InnerExceptions[0] : exception;
}
}

private static TimeSpan GetRefreshPeriodWithJitter(TimeSpan refreshPeriod)
{
// We'll fudge the refresh period up to -20% so that multiple applications don't try to
Expand Down
3 changes: 3 additions & 0 deletions src/DataProtection/DataProtection/src/Resources.resx
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,9 @@
<data name="KeyRingProvider_DefaultKeyRevoked" xml:space="preserve">
<value>The key ring's default data protection key {0:B} has been revoked.</value>
</data>
<data name="KeyRingProvider_RefreshFailedOnOtherThread" xml:space="preserve">
<value>Another thread failed to refresh the key ring.</value>
</data>
<data name="LifetimeMustNotBeNegative" xml:space="preserve">
<value>{0} must not be negative.</value>
</data>
Expand Down
Loading
Loading