Skip to content

Distrust received pack indexes (behind config flag, with perf fixes) #1846

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions GVFS/GVFS.Common/GVFSConstants.cs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ public static class GitConfig
public const string GVFSTelemetryPipe = GitConfig.GVFSPrefix + "telemetry-pipe";
public const string IKey = GitConfig.GVFSPrefix + "ikey";
public const string HooksExtension = ".hooks";

/* Intended to be a temporary config to allow testing of distrusting pack indexes from the cache server
 * before it is enabled by default. */
public const string TrustPackIndexes = GVFSPrefix + "trust-pack-indexes";
}

public static class LocalGVFSConfig
Expand Down
170 changes: 116 additions & 54 deletions GVFS/GVFS.Common/Git/GitObjects.cs
Original file line number Diff line number Diff line change
Expand Up @@ -131,11 +131,23 @@ public virtual bool TryDownloadPrefetchPacks(GitProcess gitProcess, long latestT
{
long bytesDownloaded = 0;

/* Distrusting the indexes from the server is a security feature to prevent a compromised server from sending a
* pack file and an index file that do not match.
* Eventually we will make this the default, but it has a high performance cost for the first prefetch after
* cloning a large repository, so it must be explicitly enabled for now. */
bool trustPackIndexes = true;
if (gitProcess.TryGetFromConfig(GVFSConstants.GitConfig.TrustPackIndexes, forceOutsideEnlistment: false, out var valueString)
&& bool.TryParse(valueString, out var trustPackIndexesConfig))
{
trustPackIndexes = trustPackIndexesConfig;
}
metadata.Add("trustPackIndexes", trustPackIndexes);

long requestId = HttpRequestor.GetNewRequestId();
List<string> innerPackIndexes = null;
RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.InvocationResult result = this.GitObjectRequestor.TrySendProtocolRequest(
requestId: requestId,
onSuccess: (tryCount, response) => this.DeserializePrefetchPacks(response, ref latestTimestamp, ref bytesDownloaded, ref innerPackIndexes, gitProcess),
onSuccess: (tryCount, response) => this.DeserializePrefetchPacks(response, ref latestTimestamp, ref bytesDownloaded, ref innerPackIndexes, gitProcess, trustPackIndexes),
onFailure: RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.StandardErrorHandler(activity, requestId, "TryDownloadPrefetchPacks"),
method: HttpMethod.Get,
endPointGenerator: () => new Uri(
Expand Down Expand Up @@ -662,7 +674,8 @@ private LooseObjectToWrite GetLooseObjectDestination(string sha)
ref long latestTimestamp,
ref long bytesDownloaded,
ref List<string> packIndexes,
GitProcess gitProcess)
GitProcess gitProcess,
bool trustPackIndexes)
{
if (packIndexes == null)
{
Expand All @@ -676,11 +689,16 @@ private LooseObjectToWrite GetLooseObjectDestination(string sha)
string tempPackFolderPath = Path.Combine(this.Enlistment.GitPackRoot, TempPackFolder);
this.fileSystem.CreateDirectory(tempPackFolderPath);

List<TempPrefetchPackAndIdx> tempPacks = new List<TempPrefetchPackAndIdx>();
foreach (PrefetchPacksDeserializer.PackAndIndex pack in deserializer.EnumeratePacks())
var tempPacksTasks = new List<Task<TempPrefetchPackAndIdx>>();
// Future: We could manage cancellation of index building tasks if one fails (to stop indexing of later
// files if an early one fails), but in practice the first pack file takes the majority of the time and
// all the others will finish long before it so there would be no benefit to doing so.
bool allSucceeded = true;

// Read each pack from the stream to a temp file, and start a task to index it.
foreach (PrefetchPacksDeserializer.PackAndIndex packHandle in deserializer.EnumeratePacks())
{
// The advertised size may not match the actual, on-disk size.
long indexLength = 0;
var pack = packHandle; // Capture packHandle in a new variable to avoid closure issues with async index task
long packLength;

// Write the temp and index to a temp folder to avoid putting corrupt files in the pack folder
Expand All @@ -701,87 +719,131 @@ private LooseObjectToWrite GetLooseObjectDestination(string sha)
if (!this.TryWriteTempFile(activity, pack.PackStream, packTempPath, out packLength, out packFlushTask))
{
bytesDownloaded += packLength;
return new RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.CallbackResult(null, true);
allSucceeded = false;
break;
}

bytesDownloaded += packLength;

// We will try to build an index if the server does not send one
if (pack.IndexStream == null)
if (trustPackIndexes
&& pack.IndexStream != null)
{
GitProcess.Result result;
if (!this.TryBuildIndex(activity, packTempPath, out result, gitProcess))
// The server provided an index stream that we can trust, and we were able to read it successfully, so we just need to wait for the index to flush.
if (this.TryWriteTempFile(activity, pack.IndexStream, idxTempPath, out var indexLength, out var indexFlushTask))
{
if (packFlushTask != null)
{
packFlushTask.Wait();
}

// Move whatever has been successfully downloaded so far
Exception moveException;
this.TryFlushAndMoveTempPacks(tempPacks, ref latestTimestamp, out moveException);

return new RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.CallbackResult(null, true);
bytesDownloaded += indexLength;
tempPacksTasks.Add(Task.FromResult(
new TempPrefetchPackAndIdx(pack.Timestamp, packName, packTempPath, packFlushTask, idxName, idxTempPath, indexFlushTask)));
}
else
{
bytesDownloaded += indexLength;
// We can try to build the index ourselves, and if it's successful then the retry can pick up from that point.
var indexTask = StartPackIndexAsync(activity, pack, packName, packTempPath, idxName, idxTempPath, packFlushTask);
tempPacksTasks.Add(indexTask);
// but we need to stop trying to read from the download stream as that has failed.
allSucceeded = false;
break;
}

tempPacks.Add(new TempPrefetchPackAndIdx(pack.Timestamp, packName, packTempPath, packFlushTask, idxName, idxTempPath, idxFlushTask: null));
}
else
{
Task indexFlushTask;
if (this.TryWriteTempFile(activity, pack.IndexStream, idxTempPath, out indexLength, out indexFlushTask))
{
tempPacks.Add(new TempPrefetchPackAndIdx(pack.Timestamp, packName, packTempPath, packFlushTask, idxName, idxTempPath, indexFlushTask));
}
else
// Either we can't trust the index file from the server, or the server didn't provide one, so we will build our own.
// For performance, we run the index build in the background while we continue downloading the next pack.
var indexTask = StartPackIndexAsync(activity, pack, packName, packTempPath, idxName, idxTempPath, packFlushTask);
tempPacksTasks.Add(indexTask);

// If the server provided an index stream, we still need to consume it and handle any exceptions from it, even
// though we are otherwise ignoring it.
if (pack.IndexStream != null)
{
bytesDownloaded += indexLength;

// Try to build the index manually, then retry the prefetch
GitProcess.Result result;
if (this.TryBuildIndex(activity, packTempPath, out result, gitProcess))
{
// If we were able to recreate the failed index
// we can start the prefetch at the next timestamp
tempPacks.Add(new TempPrefetchPackAndIdx(pack.Timestamp, packName, packTempPath, packFlushTask, idxName, idxTempPath, idxFlushTask: null));
}
else
try
{
if (packFlushTask != null)
bytesDownloaded += pack.IndexStream.Length;
if (pack.IndexStream.CanSeek)
{
packFlushTask.Wait();
pack.IndexStream.Seek(0, SeekOrigin.End);
}
else
{
pack.IndexStream.CopyTo(Stream.Null);
}
}

// Move whatever has been successfully downloaded so far
Exception moveException;
this.TryFlushAndMoveTempPacks(tempPacks, ref latestTimestamp, out moveException);

// The download stream will not be in a good state if the index download fails.
// So we have to restart the prefetch
return new RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.CallbackResult(null, true);
catch (Exception e)
{
EventMetadata metadata = CreateEventMetadata(e);
activity.RelatedWarning(metadata, "Failed to read to end of index stream");
allSucceeded = false;
break;
}
}
}
}

bytesDownloaded += indexLength;
// Wait for the index tasks to complete. If any fail, we still copy the prior successful ones
// to the pack folder so that the retry will be incremental from where the failure occurred.
var tempPacks = new List<TempPrefetchPackAndIdx>();
bool indexTasksSucceededSoFar = true;
foreach (var task in tempPacksTasks)
{
TempPrefetchPackAndIdx tempPack = task.Result;
if (tempPack != null && indexTasksSucceededSoFar)
{
tempPacks.Add(tempPack);
}
else
{
indexTasksSucceededSoFar = false;
tempPack?.PackFlushTask.Wait();
break;
}
}
allSucceeded = allSucceeded && indexTasksSucceededSoFar;

Exception exception = null;
if (!this.TryFlushAndMoveTempPacks(tempPacks, ref latestTimestamp, out exception))
{
return new RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.CallbackResult(exception, true);
allSucceeded = false;
}

foreach (TempPrefetchPackAndIdx tempPack in tempPacks)
{
packIndexes.Add(tempPack.IdxName);
}

return new RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.CallbackResult(
new GitObjectsHttpRequestor.GitObjectTaskResult(success: true));
if (allSucceeded)
{
return new RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.CallbackResult(
new GitObjectsHttpRequestor.GitObjectTaskResult(success: true));
}
else
{
return new RetryWrapper<GitObjectsHttpRequestor.GitObjectTaskResult>.CallbackResult(exception, shouldRetry: true);
}
}
}

/// <summary>
/// Starts a background task that builds a pack index (.idx) for a downloaded pack file by running
/// 'git index-pack', so that indexing can overlap with downloading subsequent packs.
/// </summary>
/// <param name="activity">Tracer for logging the index build.</param>
/// <param name="pack">The deserialized pack entry; only its Timestamp is read here.</param>
/// <param name="packName">Final name of the pack file.</param>
/// <param name="packTempPath">Path of the temp pack file to index.</param>
/// <param name="idxName">Final name of the index file.</param>
/// <param name="idxTempPath">Path where the temp index file will be written.</param>
/// <param name="packFlushTask">Task flushing the temp pack file to disk; may be null.</param>
/// <returns>
/// A task producing the temp pack/idx pair on success, or null if the index build failed
/// (after the pack flush has completed, so the temp pack file is in a stable state).
/// </returns>
private Task<TempPrefetchPackAndIdx> StartPackIndexAsync(ITracer activity, PrefetchPacksDeserializer.PackAndIndex pack, string packName, string packTempPath, string idxName, string idxTempPath, Task packFlushTask)
{
    return Task.Run(async () =>
    {
        // GitProcess only permits one process per instance at a time, so we need a separate
        // instance to run the index build in parallel. This is safe because each process is
        // only accessing the pack file we direct it to, which is not yet part of the enlistment.
        GitProcess gitProcessForIndex = new GitProcess(this.Enlistment);
        if (this.TryBuildIndex(activity, packTempPath, out _, gitProcessForIndex))
        {
            return new TempPrefetchPackAndIdx(pack.Timestamp, packName, packTempPath, packFlushTask, idxName, idxTempPath, idxFlushTask: null);
        }

        // Index build failed: wait for the pack flush before reporting failure so the caller
        // observes the temp pack file in a consistent state. Guard against a null flush task
        // for consistency with the null checks elsewhere in this class (a null task would
        // otherwise throw NullReferenceException here).
        if (packFlushTask != null)
        {
            await packFlushTask;
        }

        return null;
    });
}

private bool TryFlushAndMoveTempPacks(List<TempPrefetchPackAndIdx> tempPacks, ref long latestTimestamp, out Exception exception)
{
exception = null;
Expand Down
12 changes: 11 additions & 1 deletion GVFS/GVFS.Common/Git/GitProcess.cs
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,17 @@ public Result VerifyCommitGraph(string objectDir)

/// <summary>
/// Builds a pack index (.idx) for the given pack file by invoking 'git index-pack'.
/// </summary>
/// <param name="packfilePath">Path to the pack file to index.</param>
/// <param name="idxOutputPath">Path where the generated index file should be written.</param>
/// <returns>The result of the git invocation.</returns>
public Result IndexPack(string packfilePath, string idxOutputPath)
{
    // Git's default thread count is Environment.ProcessorCount / 2, with a maximum of 20.
    // Testing shows a ~5% decrease in gvfs clone time for large repositories when using more
    // threads, so we use every logical processor, still capped at 20.
    int indexThreads = Environment.ProcessorCount < 20 ? Environment.ProcessorCount : 20;

    // If index-pack is invoked within an enlistment, it reads all the other objects and pack
    // indexes available in the enlistment in order to verify references from within this pack
    // file, even if --verify or similar options are not passed. Since we aren't verifying,
    // we invoke index-pack outside the enlistment for performance.
    return this.InvokeGitOutsideEnlistment(
        $"index-pack --threads={indexThreads} -o \"{idxOutputPath}\" \"{packfilePath}\"");
}

/// <summary>
Expand Down
Loading