Skip to content

Commit a0eeed8

Browse files
authored
Implement unordered directory hash (#6197) [ci fast]
Signed-off-by: Paolo Di Tommaso <paolo.ditommaso@gmail.com>
1 parent 1c8e4d3 commit a0eeed8

File tree

2 files changed

+74
-22
lines changed

2 files changed

+74
-22
lines changed

modules/nf-commons/src/main/nextflow/util/HashBuilder.java

Lines changed: 65 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -267,7 +267,7 @@ static private Hasher hashFile( Hasher hasher, Path path, HashMode mode, Path ba
267267

268268
if( (mode==HashMode.STANDARD || mode==HashMode.LENIENT) && isAssetFile(path) ) {
269269
if( attrs==null ) {
270-
// when file attributes are not avail or it's a directory
270+
// when file attributes are not available, or it's a directory
271271
// hash the file using the file name path and the repository
272272
log.warn("Unable to fetch attribute for file: {} - Hash is inferred from Git repository commit Id", FilesEx.toUriString(path));
273273
return hashFileAsset(hasher, path);
@@ -322,18 +322,36 @@ static protected Hasher hashFileSha256( Hasher hasher, Path path, Path base ) {
322322
return hasher;
323323
}
324324

325+
/**
326+
* Compute an order-independent hash of a directory path by recursively traversing the directory content.
327+
*
328+
* @param hasher
329+
* The {@link Hasher} object to which the resulting directory hash will be added.
330+
* @param dir
331+
* The target directory path to be hashed.
332+
* @param base
333+
* The "base" directory path against which relative paths are resolved. It cannot be null.
334+
* @return
335+
* The resulting {@link Hasher} object updated with the directory hash.
336+
*/
325337
static protected Hasher hashDirSha256( Hasher hasher, Path dir, Path base ) {
338+
if( base==null )
339+
throw new IllegalArgumentException("Argument 'base' cannot be null");
340+
// the byte array used as "accumulator" for the sum of the path and content hashes
341+
final byte[] resultBytes = new byte[HASH_BYTES];
326342
try {
327343
Files.walkFileTree(dir, new SimpleFileVisitor<Path>() {
328344
public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) throws IOException {
329345
log.trace("Hash sha-256 dir content [FILE] path={} - base={}", path, base);
330346
try {
331347
// the file path relative to the base directory
332-
if( base!=null )
333-
hasher.putUnencodedChars(base.relativize(path).toString());
348+
final String relPath = base.relativize(path).toString();
349+
// compute the file path hash and sum to the result hash
350+
// since the sum is commutative, the traverse order does not matter
351+
sumBytes(resultBytes, hashBytes(relPath, HashMode.STANDARD));
334352
// the file content sha-256 checksum
335-
String sha256 = sha256Cache.get(path);
336-
hasher.putUnencodedChars(sha256);
353+
final String sha256 = sha256Cache.get(path);
354+
sumBytes(resultBytes, hashBytes(sha256, HashMode.STANDARD));
337355
return FileVisitResult.CONTINUE;
338356
}
339357
catch (ExecutionException t) {
@@ -344,12 +362,15 @@ public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) throws IO
344362
public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes attrs) {
345363
log.trace("Hash sha-256 dir content [DIR] path={} - base={}", path, base);
346364
// the directory path relative to the base directory
347-
if( base!=null )
348-
hasher.putUnencodedChars(base.relativize(path).toString());
349-
hasher.putUnencodedChars(base.relativize(path).toString());
365+
final String relPath = base.relativize(path).toString();
366+
// compute the directory path hash and sum it to the result hash
367+
// since the sum is commutative, the traverse order does not matter
368+
sumBytes(resultBytes, hashBytes(relPath, HashMode.STANDARD));
350369
return FileVisitResult.CONTINUE;
351370
}
352371
});
372+
// finally put the result bytes in the hashing
373+
hasher.putBytes(resultBytes);
353374
}
354375
catch (IOException t) {
355376
Throwable err = t.getCause()!=null ? t.getCause() : t;
@@ -441,21 +462,48 @@ static HashCode hashContent( Path file, HashFunction function ) {
441462
}
442463

443464
static private Hasher hashUnorderedCollection(Hasher hasher, Collection collection, HashMode mode) {
444-
445465
byte[] resultBytes = new byte[HASH_BYTES];
446466
for (Object item : collection) {
447-
byte[] nextBytes = HashBuilder.hasher(defaultHasher(), item, mode).hash().asBytes();
448-
if( nextBytes.length != resultBytes.length )
449-
throw new IllegalStateException("All hash codes must have the same bit length");
450-
451-
for (int i = 0; i < nextBytes.length; i++) {
452-
resultBytes[i] += nextBytes[i];
453-
}
467+
// hash the collection item
468+
byte[] nextBytes = hashBytes(item, mode);
469+
// sum the hash bytes to the "resultBytes" accumulator
470+
// since the sum is a commutative operation the order does not matter
471+
sumBytes(resultBytes, nextBytes);
454472
}
455-
473+
// add the result bytes and return the resulting object
456474
return hasher.putBytes(resultBytes);
457475
}
458476

477+
static private byte[] hashBytes(Object item, HashMode mode) {
478+
return hasher(defaultHasher(), item, mode).hash().asBytes();
479+
}
480+
481+
/**
482+
* Sum two arrays of bytes having the same length, as required to compute the hash of unordered collections.
483+
*
484+
* - For each byte position, add the corresponding byte from nextBytes into resultBytes
485+
* - Order doesn't matter: addition is commutative (a + b = b + a), so the final result is
486+
* the same no matter the order of items.
487+
* - This is what makes it suitable for unordered collections
488+
*
489+
* @param resultBytes
490+
* The first argument to be summed. This array is used as the accumulator array (i.e. the result)
491+
* @param nextBytes
492+
* The second argument to be summed.
493+
* @return
494+
* The array resulting from adding the bytes of the second array to the first one. Note,
495+
* the result array instance is the same object passed as first argument.
496+
*
497+
*/
498+
static private byte[] sumBytes(byte[] resultBytes, byte[] nextBytes) {
499+
if( nextBytes.length != resultBytes.length )
500+
throw new IllegalStateException("All hash codes must have the same bit length");
501+
for (int i = 0; i < nextBytes.length; i++) {
502+
resultBytes[i] += nextBytes[i];
503+
}
504+
return resultBytes;
505+
}
506+
459507
/**
460508
* Check if the argument is an asset file i.e. a file that makes part of the
461509
* pipeline Git repository

modules/nf-commons/src/test/nextflow/util/HashBuilderTest.groovy

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,13 +110,18 @@ class HashBuilderTest extends Specification {
110110
folder.resolve('dir1/bar').text = "I'm bar"
111111
folder.resolve('dir1/xxx/yyy').mkdirs()
112112
folder.resolve('dir1/xxx/foo1').text = "I'm foo within xxx"
113-
folder.resolve('dir1/xxx/yyy/bar1').text = "I'm bar within yyy"
113+
folder.resolve('dir1/xxx/yyy/bar1').text = "I'm bar 1 within yyy"
114+
folder.resolve('dir1/xxx/yyy/bar2').text = "I'm bar 2 within yyy"
114115
and:
115-
folder.resolve('dir2/foo').text = "I'm foo"
116+
// create the same directory structure using a different
117+
// creation order, the resulting hash should be the same
116118
folder.resolve('dir2/bar').text = "I'm bar"
117-
folder.resolve('dir2/xxx/yyy').mkdirs()
119+
folder.resolve('dir2/foo').text = "I'm foo"
120+
folder.resolve('dir2/xxx').mkdirs()
118121
folder.resolve('dir2/xxx/foo1').text = "I'm foo within xxx"
119-
folder.resolve('dir2/xxx/yyy/bar1').text = "I'm bar within yyy"
122+
folder.resolve('dir2/xxx/yyy').mkdirs()
123+
folder.resolve('dir2/xxx/yyy/bar2').text = "I'm bar 2 within yyy"
124+
folder.resolve('dir2/xxx/yyy/bar1').text = "I'm bar 1 within yyy"
120125

121126
when:
122127
def hash1 = HashBuilder.hashDirSha256(HashBuilder.defaultHasher(), folder.resolve('dir1'), folder.resolve('dir1'))
@@ -125,6 +130,5 @@ class HashBuilderTest extends Specification {
125130

126131
then:
127132
hash1.hash() == hash2.hash()
128-
129133
}
130134
}

0 commit comments

Comments
 (0)