Skip to content

Commit 2d56d11

Browse files
authored
Merge pull request #48 from Sergio0694/feature_inception-layer
Feature inception layer
2 parents 1711109 + b84d991 commit 2d56d11

25 files changed

+1710
-202
lines changed

NeuralNetwork.NET.Cuda/APIS/CuDnnNetworkLayers.cs renamed to NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayers.cs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,5 +67,18 @@ public static INetworkLayer Convolutional(
6767
[PublicAPI]
6868
[Pure, NotNull]
6969
public static INetworkLayer Pooling(in TensorInfo input, in PoolingInfo info, ActivationFunctionType activation) => new CuDnnPoolingLayer(input, info, activation);
70+
71+
/// <summary>
72+
/// Creates a new inception layer with the given input and features
73+
/// </summary>
74+
/// <param name="input">The input volume to process</param>
75+
/// <param name="info">The info on the operations to execute inside the layer</param>
76+
/// <param name="biasMode">Indicates the desired initialization mode to use for the layer bias values</param>
77+
[PublicAPI]
78+
[Pure, NotNull]
79+
public static INetworkLayer Inception(
80+
in TensorInfo input, in InceptionInfo info,
81+
BiasInitializationMode biasMode = BiasInitializationMode.Zero)
82+
=> new CuDnnInceptionLayer(input, info, biasMode);
7083
}
7184
}

NeuralNetwork.NET.Cuda/APIs/CuDnnNetworkLayersDeserializer.cs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ private static INetworkLayer Deserialize([NotNull] Stream stream, LayerType type
3131
case LayerType.Convolutional: return CuDnnConvolutionalLayer.Deserialize(stream);
3232
case LayerType.Pooling: return CuDnnPoolingLayer.Deserialize(stream);
3333
case LayerType.Softmax: return CuDnnSoftmaxLayer.Deserialize(stream);
34+
case LayerType.Inception: return CuDnnInceptionLayer.Deserialize(stream);
3435
default: return null;
3536
}
3637
}

NeuralNetwork.NET.Cuda/Extensions/GpuExtensions.cs

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,39 @@ public static DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tens
2727
: throw new InvalidOperationException($"Failed to copy the source data on the target GPU device, [CUDA ERROR] {result}");
2828
}
2929

30+
/// <summary>
31+
/// Allocates a memory area on device memory, reading the target values at a given offset from the input <see cref="Tensor"/>
32+
/// </summary>
33+
/// <param name="gpu">The <see cref="Gpu"/> device to use</param>
34+
/// <param name="source">The source <see cref="Tensor"/> with the data to copy</param>
35+
/// <param name="offset">The column offset for the data to read from each row</param>
36+
/// <param name="length">The number of values to copy from each row, starting at <paramref name="offset"/></param>
37+
[MustUseReturnValue, NotNull]
38+
public static unsafe DeviceMemory<float> AllocateDevice([NotNull] this Gpu gpu, in Tensor source, int offset, int length)
39+
{
40+
// Checks
41+
if (source.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
42+
43+
// Memory copy
44+
DeviceMemory<float> result_gpu = gpu.AllocateDevice<float>(source.Entities * length);
45+
CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1];
46+
ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st
47+
{
48+
srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
49+
srcHost = source.Ptr + sizeof(float) * offset,
50+
srcPitch = new IntPtr(sizeof(float) * source.Length),
51+
dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
52+
dstDevice = result_gpu.Handle,
53+
dstPitch = new IntPtr(sizeof(float) * length),
54+
WidthInBytes = new IntPtr(sizeof(float) * length),
55+
Height = new IntPtr(source.Entities)
56+
};
57+
CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt);
58+
return result == CUDAInterop.cudaError_enum.CUDA_SUCCESS
59+
? result_gpu
60+
: throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
61+
}
62+
3063
/// <summary>
3164
/// Copies the contents of the input <see cref="DeviceMemory{T}"/> instance to the target host memory area
3265
/// </summary>
@@ -40,6 +73,37 @@ public static void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor d
4073
throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
4174
}
4275

76+
/// <summary>
77+
/// Copies the source data into the target <see cref="Tensor"/>, splitting each individual entry into its own row
78+
/// </summary>
79+
/// <param name="source">The source memory area with the concatenated data for each entry</param>
80+
/// <param name="destination">The destination <see cref="Tensor"/> that will store the data</param>
81+
/// <param name="offset">The column offset within each row of <paramref name="destination"/> at which the data for each entry is written</param>
82+
/// <param name="length">The number of values to copy for each entry</param>
83+
public static unsafe void CopyTo([NotNull] this DeviceMemory<float> source, in Tensor destination, int offset, int length)
84+
{
85+
// Checks
86+
if (source.Length / length != destination.Entities) throw new ArgumentOutOfRangeException(nameof(length), "The input length doesn't match the given arguments");
87+
if (destination.Length - offset < length) throw new ArgumentOutOfRangeException(nameof(offset), "The input offset isn't valid");
88+
89+
// Memory copy
90+
CUDAInterop.CUDA_MEMCPY2D_st* ptSt = stackalloc CUDAInterop.CUDA_MEMCPY2D_st[1];
91+
ptSt[0] = new CUDAInterop.CUDA_MEMCPY2D_st
92+
{
93+
srcMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_DEVICE,
94+
srcDevice = source.Handle,
95+
srcPitch = new IntPtr(sizeof(float) * length),
96+
dstMemoryType = CUDAInterop.CUmemorytype_enum.CU_MEMORYTYPE_HOST,
97+
dstHost = destination.Ptr + sizeof(float) * offset,
98+
dstPitch = new IntPtr(sizeof(float) * destination.Length),
99+
WidthInBytes = new IntPtr(sizeof(float) * length),
100+
Height = new IntPtr(destination.Entities)
101+
};
102+
CUDAInterop.cudaError_enum result = CUDAInterop.cuMemcpy2D(ptSt);
103+
if (result != CUDAInterop.cudaError_enum.CUDA_SUCCESS)
104+
throw new InvalidOperationException($"Failed to copy the source data on the given destination, [CUDA ERROR] {result}");
105+
}
106+
43107
/// <summary>
44108
/// Copies the contents of the input <see cref="DeviceMemory{T}"/> to a new memory area on the unmanaged heap
45109
/// </summary>

NeuralNetwork.NET.Cuda/Layers/CuDnnConvolutionalLayer.cs

Lines changed: 43 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,7 @@ internal sealed class CuDnnConvolutionalLayer : ConvolutionalLayer
4747
[NotNull]
4848
private readonly Dnn DnnInstance = DnnService.Instance;
4949

50-
/// <summary>
51-
/// Sets the cuDNN fields that will be used during future forward/backwards operations
52-
/// </summary>
50+
// cuDNN fields setup
5351
private void SetupCuDnnInfo()
5452
{
5553
ConvolutionDescription.Set2D(OperationInfo.VerticalPadding, OperationInfo.HorizontalPadding, OperationInfo.VerticalStride, OperationInfo.HorizontalStride, 1, 1, (Alea.cuDNN.ConvolutionMode)OperationInfo.Mode);
@@ -74,71 +72,63 @@ public CuDnnConvolutionalLayer(
7472
#region Implementation
7573

7674
/// <inheritdoc/>
77-
public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
75+
public override void Forward(in Tensor x, out Tensor z, out Tensor a)
7876
{
79-
fixed (float* pw = Weights)
77+
using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
8078
{
81-
Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
82-
using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size))
79+
// Tensors info setup
80+
InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
81+
OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);
82+
83+
// Forward convolution
84+
DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
85+
DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size);
86+
using (DeviceMemory<float>
87+
x_gpu = DnnInstance.Gpu.AllocateDevice(x),
88+
w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
89+
using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
8390
{
84-
// Tensors info setup
85-
InputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, InputInfo.Channels, InputInfo.Height, InputInfo.Width);
86-
OutputDescription.Set4D(DataType.FLOAT, TensorFormat.CUDNN_TENSOR_NCHW, x.Entities, OutputInfo.Channels, OutputInfo.Height, OutputInfo.Width);
87-
88-
// Forward convolution
89-
DnnInstance.GetConvolutionForwardAlgorithm(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, ConvolutionFwdPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionFwdAlgo algorithm);
90-
DnnInstance.GetConvolutionForwardWorkspaceSize(InputDescription, FilterDescription, ConvolutionDescription, OutputDescription, algorithm, out IntPtr size);
91-
using (DeviceMemory<float>
92-
x_gpu = DnnInstance.Gpu.AllocateDevice(x),
93-
w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor))
94-
using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
95-
{
96-
DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr);
97-
}
91+
DnnInstance.ConvolutionForward(1, InputDescription, x_gpu.Ptr, FilterDescription, w_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, OutputDescription, z_gpu.Ptr);
92+
}
9893

99-
// Biases
100-
using (DeviceMemory<float> b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
101-
{
102-
DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr);
103-
}
104-
z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
94+
// Biases
95+
using (DeviceMemory<float> b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
96+
{
97+
DnnInstance.AddTensor(1, BiasDescription, b_gpu.Ptr, 1, OutputDescription, z_gpu.Ptr);
98+
}
99+
z_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
105100

106-
// Activation
107-
if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a);
108-
else
109-
{
110-
DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
111-
z_gpu.CopyToHost(z.Entities, z.Length, out a);
112-
}
101+
// Activation
102+
if (ActivationFunctionType == ActivationFunctionType.Identity) z.Duplicate(out a);
103+
else
104+
{
105+
DnnInstance.ActivationForward(z.Entities, z.Length, z_gpu.Ptr, z_gpu.Ptr, ActivationFunctions.Activation);
106+
z_gpu.CopyToHost(z.Entities, z.Length, out a);
113107
}
114108
}
115109
}
116110

117111
/// <inheritdoc/>
118-
public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
112+
public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
119113
{
120-
fixed (float* pw = Weights)
114+
using (DeviceMemory<float> delta_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size))
121115
{
122-
Tensor.Reshape(pw, OutputInfo.Channels, KernelInfo.Size, out Tensor wTensor);
116+
// Convolution
123117
DnnInstance.GetConvolutionBackwardDataAlgorithm(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, ConvolutionBwdDataPreference.PREFER_FASTEST, IntPtr.Zero, out ConvolutionBwdDataAlgo algorithm);
124118
DnnInstance.GetConvolutionBackwardDataWorkspaceSize(FilterDescription, OutputDescription, ConvolutionDescription, InputDescription, algorithm, out IntPtr size);
125-
using (DeviceMemory<float> delta_gpu = DnnInstance.Gpu.AllocateDevice<float>(z.Size))
119+
using (DeviceMemory<float>
120+
delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
121+
w_gpu = DnnInstance.Gpu.AllocateDevice(Weights))
122+
using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
126123
{
127-
// Backwards convolution
128-
using (DeviceMemory<float>
129-
delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
130-
w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor))
131-
using (DeviceMemory<byte> workspace_gpu = DnnInstance.Gpu.AllocateDevice<byte>(size))
132-
{
133-
DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr);
134-
}
124+
DnnInstance.ConvolutionBackwardData(1, FilterDescription, w_gpu.Ptr, OutputDescription, delta_1_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, InputDescription, delta_gpu.Ptr);
125+
}
135126

136-
// Activation
137-
using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
138-
{
139-
DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime);
140-
z_gpu.CopyTo(z);
141-
}
127+
// Activation
128+
using (DeviceMemory<float> z_gpu = DnnInstance.Gpu.AllocateDevice(z))
129+
{
130+
DnnInstance.ActivationBackward(z.Entities, z.Length, z_gpu.Ptr, delta_gpu.Ptr, activationPrime);
131+
z_gpu.CopyTo(z);
142132
}
143133
}
144134
}
@@ -159,7 +149,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
159149
{
160150
DnnInstance.ConvolutionBackwardFilter(1, InputDescription, a_gpu.Ptr, OutputDescription, delta_gpu.Ptr, ConvolutionDescription, algorithm, workspace_gpu.Ptr, size, 0, FilterDescription, w_gpu.Ptr);
161151
}
162-
w_gpu.CopyToHost(Kernels, KernelInfo.Size, out dJdw);
152+
w_gpu.CopyToHost(1, Weights.Length, out dJdw);
163153
}
164154

165155
// Bias

NeuralNetwork.NET.Cuda/Layers/CuDnnFullyConnectedLayer.cs

Lines changed: 18 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -30,39 +30,31 @@ public CuDnnFullyConnectedLayer(in TensorInfo input, int neurons, [NotNull] floa
3030
#region Implementation
3131

3232
/// <inheritdoc/>
33-
public override unsafe void Forward(in Tensor x, out Tensor z, out Tensor a)
33+
public override void Forward(in Tensor x, out Tensor z, out Tensor a)
3434
{
35-
fixed (float* pw = Weights)
35+
using (DeviceMemory<float>
36+
x_gpu = DnnInstance.Gpu.AllocateDevice(x),
37+
w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
38+
y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size),
39+
b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
3640
{
37-
Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
38-
using (DeviceMemory<float>
39-
x_gpu = DnnInstance.Gpu.AllocateDevice(x),
40-
w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
41-
y_gpu = DnnInstance.Gpu.AllocateDevice<float>(x.Entities * OutputInfo.Size),
42-
b_gpu = DnnInstance.Gpu.AllocateDevice(Biases))
43-
{
44-
DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr);
45-
y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
46-
DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
47-
y_gpu.CopyToHost(z.Entities, z.Length, out a);
48-
}
41+
DnnInstance.FullyConnectedForward(x.Entities, x.Length, OutputInfo.Size, x_gpu.Ptr, w_gpu.Ptr, b_gpu.Ptr, y_gpu.Ptr);
42+
y_gpu.CopyToHost(x.Entities, OutputInfo.Size, out z);
43+
DnnInstance.ActivationForward(z.Entities, z.Length, y_gpu.Ptr, y_gpu.Ptr, ActivationFunctions.Activation);
44+
y_gpu.CopyToHost(z.Entities, z.Length, out a);
4945
}
5046
}
5147

5248
/// <inheritdoc/>
53-
public override unsafe void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
49+
public override void Backpropagate(in Tensor delta_1, in Tensor z, ActivationFunction activationPrime)
5450
{
55-
fixed (float* pw = Weights)
51+
using (DeviceMemory<float>
52+
delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
53+
w_gpu = DnnInstance.Gpu.AllocateDevice(Weights),
54+
z_gpu = DnnInstance.Gpu.AllocateDevice(z))
5655
{
57-
Tensor.Reshape(pw, InputInfo.Size, OutputInfo.Size, out Tensor wTensor);
58-
using (DeviceMemory<float>
59-
delta_1_gpu = DnnInstance.Gpu.AllocateDevice(delta_1),
60-
w_gpu = DnnInstance.Gpu.AllocateDevice(wTensor),
61-
z_gpu = DnnInstance.Gpu.AllocateDevice(z))
62-
{
63-
DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime);
64-
z_gpu.CopyTo(z);
65-
}
56+
DnnInstance.FullyConnectedBackwardData(z.Entities, InputInfo.Size, OutputInfo.Size, z_gpu.Ptr, delta_1_gpu.Ptr, w_gpu.Ptr, activationPrime);
57+
z_gpu.CopyTo(z);
6658
}
6759
}
6860

@@ -75,7 +67,7 @@ public override void ComputeGradient(in Tensor a, in Tensor delta, out Tensor dJ
7567
w_gpu = DnnInstance.Gpu.AllocateDevice<float>(a.Length * delta.Length))
7668
{
7769
DnnInstance.FullyConnectedBackwardFilter(a.Entities, a.Length, delta.Length, a_gpu.Ptr, delta_gpu.Ptr, w_gpu.Ptr);
78-
w_gpu.CopyToHost(a.Length, delta.Length, out dJdw);
70+
w_gpu.CopyToHost(1, Weights.Length, out dJdw);
7971
}
8072
delta.CompressVertically(out dJdb); // Doing this on CPU is generally faster than launching the kernels
8173
}

0 commit comments

Comments
 (0)