Commit bfa35a7

Merge pull request #49 from Sergio0694/master
Latest changes
2 parents 7851e01 + 2d56d11 commit bfa35a7

24 files changed: +1646 additions, -118 deletions

NeuralNetwork.NET.Cuda/APIS/CuDnnNetworkLayers.cs renamed to NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs

Lines changed: 13 additions & 0 deletions
@@ -67,5 +67,18 @@ public static INetworkLayer Convolutional(
         [PublicAPI]
         [Pure, NotNull]
         public static INetworkLayer Pooling(in TensorInfo input, in PoolingInfo info, ActivationFunctionType activation) => new CuDnnPoolingLayer(input, info, activation);
+
+        /// <summary>
+        /// Creates a new inception layer with the given input and features
+        /// </summary>
+        /// <param name="input">The input volume to process</param>
+        /// <param name="info">The info on the operations to execute inside the layer</param>
+        /// <param name="biasMode">Indicates the desired initialization mode to use for the layer bias values</param>
+        [PublicAPI]
+        [Pure, NotNull]
+        public static INetworkLayer Inception(
+            in TensorInfo input, in InceptionInfo info,
+            BiasInitializationMode biasMode = BiasInitializationMode.Zero)
+            => new CuDnnInceptionLayer(input, info, biasMode);
     }
 }
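
Note: the new factory follows the same call shape as the existing Pooling and Convolutional entries. A minimal usage sketch is shown below; only the CuDnnNetworkLayers.Inception signature comes from this commit, while the TensorInfo/InceptionInfo values are placeholders for illustration:

// Sketch only: how TensorInfo and InceptionInfo are actually built is not shown in this diff
TensorInfo input = default;      // the input volume processed by the layer
InceptionInfo info = default;    // describes the 1x1, 3x3, 5x5 and pooling pipelines inside the layer
INetworkLayer inception = CuDnnNetworkLayers.Inception(input, info); // biasMode defaults to BiasInitializationMode.Zero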

NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ private static INetworkLayer Deserialize([NotNull] Stream stream, LayerType type
                 case LayerType.Convolutional: return CuDnnConvolutionalLayer.Deserialize(stream);
                 case LayerType.Pooling: return CuDnnPoolingLayer.Deserialize(stream);
                 case LayerType.Softmax: return CuDnnSoftmaxLayer.Deserialize(stream);
+                case LayerType.Inception: return CuDnnInceptionLayer.Deserialize(stream);
                 default: return null;
             }
         }

NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs

Lines changed: 64 additions & 0 deletions
@@ -27,6 +27,39 @@ public static DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tens
                 : throw new InvalidOperationException($"Failed to copy the source data on the target GPU device, [CUDA ERROR] {result}");
         }
 
+        /// <summary>
+        /// Allocates a memory area on device memory, reading the target values at a given offset from the input <see cref="Tensor"/>
+        /// </summary>
+        /// <param name="gpu">The <see cref="Gpu"/> device to use</param>
+        /// <param name="source">The source <see cref="Tensor"/> with the data to copy</param>
+        /// <param name="offset">The column offset for the data to read from each row</param>
+        /// <param name="length">The number of values to read from each row</param>
+        [MustUseReturnValue, NotNull]
+        public static unsafe DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tensor source, int offset, int length)
+        {
+            // Checks
+            if (source.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
+
+            // Memory copy
+            DeviceMemory<float> result_gpu = gpu.AllocateDevice<float>(source.Entities * length);
+            CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1];
+            ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st
+            {
+                srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
+                srcHost = source.Ptr + sizeof(float) * offset,
+                srcPitch = new IntPtr(sizeof(float) * source.Length),
+                dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
+                dstDevice = result_gpu.Handle,
+                dstPitch = new IntPtr(sizeof(float) * length),
+                WidthInBytes = new IntPtr(sizeof(float) * length),
+                Height = new IntPtr(source.Entities)
+            };
+            CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt);
+            return result == CUDAInterop.cudaError_enum.CUDA_SUCCESS
+                ? result_gpu
+                : throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
+        }
+
         /// <summary>
         /// Copies the contents of the input <see cref="DeviceMemory{T}"/> instance to the target host memory area
         /// </summary>
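
Note: the pitched host-to-device copy configured above packs a column slice of every row into a contiguous device buffer. Below is a plain managed sketch of the same per-row semantics, using ordinary arrays instead of the library's Tensor and DeviceMemory<float> types (assumption: Tensor.Entities is the row count and Tensor.Length the per-row width, as implied by the checks above):

using System;

static class PitchedCopyDemo
{
    // Host-side equivalent of AllocateDevice(gpu, source, offset, length): from each of the
    // `entities` rows (each `rowLength` floats wide), copy `length` floats starting at column
    // `offset` into a packed buffer whose rows are exactly `length` floats wide.
    public static float[] SliceRows(float[] source, int entities, int rowLength, int offset, int length)
    {
        if (rowLength - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
        float[] packed = new float[entities * length];
        for (int i = 0; i < entities; i++)
            Array.Copy(source, i * rowLength + offset, packed, i * length, length);
        return packed;
    }
}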
@@ -40,6 +73,37 @@ public static void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor d
                 throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
         }
 
+        /// <summary>
+        /// Copies the source data into the target <see cref="Tensor"/>, splitting each individual entry into its own row
+        /// </summary>
+        /// <param name="source">The source memory area with the concatenated data for each entry</param>
+        /// <param name="destination">The destination <see cref="Tensor"/> that will store the data</param>
+        /// <param name="offset">The column offset for the data for each entry</param>
+        /// <param name="length">The number of values to copy for each entry</param>
+        public static unsafe void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor destination, int offset, int length)
+        {
+            // Checks
+            if (source.Length / length != destination.Entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments");
+            if (destination.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
+
+            // Memory copy
+            CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1];
+            ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st
+            {
+                srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
+                srcDevice = source.Handle,
+                srcPitch = new IntPtr(sizeof(float) * length),
+                dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
+                dstHost = destination.Ptr + sizeof(float) * offset,
+                dstPitch = new IntPtr(sizeof(float) * destination.Length),
+                WidthInBytes = new IntPtr(sizeof(float) * length),
+                Height = new IntPtr(destination.Entities)
+            };
+            CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt);
+            if (result != CUDAInterop.cudaError_enum.CUDA_SUCCESS)
+                throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
+        }
+
         /// <summary>
         /// Copies the contents of the input <see cref="DeviceMemory{T}"/> to a new memory area on the unmanaged heap
         /// </summary>
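
Note: the new CopyTo overload is the mirror operation (device to host): each packed row of `length` values is written back at column `offset` of the corresponding, wider destination row. Again a plain managed sketch of the semantics, not the library's actual code path:

using System;

static class PitchedCopyBackDemo
{
    // Host-side equivalent of source.CopyTo(destination, offset, length): each of the
    // `entities` packed rows (`length` floats wide) is scattered into the destination
    // at column `offset` of the matching row (`rowLength` floats wide).
    public static void ScatterRows(float[] packed, float[] destination, int entities, int rowLength, int offset, int length)
    {
        if (packed.Length / length != entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments");
        if (rowLength - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
        for (int i = 0; i < entities; i++)
            Array.Copy(packed, i * length, destination, i * rowLength + offset, length);
    }
}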

NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs

Lines changed: 2 additions & 4 deletions
@@ -47,9 +47,7 @@ internal sealed class CuDnnConvolutionalLayer : ConvolutionalLayer
         [NotNull]
         private readonly Dnn DnnInstance = DnnService.Instance;
 
-        /// <summary>
-        /// Sets the cuDNN fields that will be used during future forward/backwards operations
-        /// </summary>
+        // cuDNN fields setup
         private void SetupCuDnnInfo()
         {
             ConvolutionDescription.Set2D(OperationInfo.VerticalPadding, OperationInfo.HorizontalPadding, OperationInfo.VerticalStride, OperationInfo.HorizontalStride, 1, 1, (Alea.cuDNN.ConvolutionMode)OperationInfo.Mode);
@@ -151,7 +149,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
             {
                 DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, OutputDescription, delta_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, FilterDescription, w_gpu.Ptr);
             }
-            w_gpu.CopyToHost(Kernels, KernelInfo.Size, out dJdw);
+            w_gpu.CopyToHost(1, Weights.Length, out dJdw);
         }
 
         // Bias

NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
                 w_gpu = DnnInstance.Gpu.AllocateDevice<float>(a.Length * delta.Length))
             {
                 DnnInstance.FullyConnectedBackwardFilter(a.Entities, a.Length, delta.Length, a_gpu.Ptr, delta_gpu.Ptr, w_gpu.Ptr);
-                w_gpu.CopyToHost(a.Length, delta.Length, out dJdw);
+                w_gpu.CopyToHost(1, Weights.Length, out dJdw);
             }
             delta.CompressVertically(out dJdb); // Doing this on CPU is generally faster than launching the kernels
         }
