@@ -95,9 +95,51 @@ static __global__ void unary_op_kernel(const T * x, T * dst, const int k) {
}

template <float (*op)(float), typename T>
-static void unary_cuda(const T * x, T * dst, const int k, cudaStream_t stream) {
+static __global__ void unary_op_kernel_noncont(
+        const void * x, void * dst,
+        const int64_t ne0, const int64_t ne1, const int64_t ne2, const int64_t ne3,
+        const int64_t nb0_x, const int64_t nb1_x, const int64_t nb2_x, const int64_t nb3_x,
+        const int64_t nb0_d, const int64_t nb1_d, const int64_t nb2_d, const int64_t nb3_d,
+        const int64_t k) {
+
+    const int64_t i = blockDim.x*blockIdx.x + threadIdx.x;
+
+    if (i >= k) {
+        return;
+    }
+
+    const int64_t i3 = i / (ne2 * ne1 * ne0);
+    const int64_t i2 = (i / (ne1 * ne0)) % ne2;
+    const int64_t i1 = (i / ne0) % ne1;
+    const int64_t i0 = i % ne0;
+
+    const int64_t offset_x = i0*nb0_x + i1*nb1_x + i2*nb2_x + i3*nb3_x;
+    const int64_t offset_d = i0*nb0_d + i1*nb1_d + i2*nb2_d + i3*nb3_d;
+
+    const T * px = (const T *)((const char *)x + offset_x);
+    T * pd = (T *)((char *)dst + offset_d);
+
+    *pd = (T)op((float)*px);
+}
+
+template <float (*op)(float), typename T>
+static void unary_cuda(const T * x, T * dst, const int k,
+        const ggml_tensor * src, const ggml_tensor * dst_tensor,
+        cudaStream_t stream) {
    const int num_blocks = (k + CUDA_NEG_BLOCK_SIZE - 1) / CUDA_NEG_BLOCK_SIZE;
-    unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+
+    if (ggml_is_contiguous(src) && ggml_is_contiguous(dst_tensor)) {
+        unary_op_kernel<op><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(x, dst, k);
+    } else {
+        unary_op_kernel_noncont<op, T><<<num_blocks, CUDA_NEG_BLOCK_SIZE, 0, stream>>>(
+            (const void *)x, (void *)dst,
+            src->ne[0], src->ne[1], src->ne[2], src->ne[3],
+            src->nb[0], src->nb[1], src->nb[2], src->nb[3],
+            dst_tensor->nb[0], dst_tensor->nb[1],
+            dst_tensor->nb[2], dst_tensor->nb[3],
+            k
+        );
+    }
}

template <float (*op)(float)>
@@ -107,16 +149,16 @@ void ggml_cuda_op_unary(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
    void * dst_d = dst->data;
    cudaStream_t stream = ctx.stream();

-    GGML_ASSERT(ggml_is_contiguous(src0));
-
    GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16);
    GGML_ASSERT( dst->type == GGML_TYPE_F32 ||  dst->type == GGML_TYPE_F16);
    GGML_ASSERT(src0->type == dst->type);

    if (src0->type == GGML_TYPE_F16) {
-        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0), stream);
+        unary_cuda<op>((const half *)src0_d, (half *)dst_d, ggml_nelements(src0),
+                       src0, dst, stream);
    } else {
-        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0), stream);
+        unary_cuda<op>((const float *)src0_d, (float *)dst_d, ggml_nelements(src0),
+                       src0, dst, stream);
    }
}
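For reference, a minimal host-side sketch of the indexing scheme used by the new unary_op_kernel_noncont: in ggml, ne[] holds element counts per dimension (ne[0] innermost) and nb[] holds strides in bytes, so the kernel decomposes the flat thread index into (i0, i1, i2, i3) and applies the byte strides via char * arithmetic. The shape and stride values below are made up purely for illustration; only the arithmetic mirrors the kernel.

#include <cstdint>
#include <cstdio>

int main() {
    // Hypothetical 4D shape (elements per dimension), ggml-style: ne[0] is the
    // innermost, fastest-varying dimension.
    const int64_t ne[4] = {4, 3, 2, 1};

    // Hypothetical byte strides for a non-contiguous float tensor, e.g. a view
    // whose rows are padded: nb[1] skips 6 floats per row instead of 4.
    const int64_t nb[4] = {4, 24, 72, 144};

    const int64_t k = ne[0] * ne[1] * ne[2] * ne[3];

    for (int64_t i = 0; i < k; ++i) {
        // Same decomposition as in unary_op_kernel_noncont: recover the 4D
        // index from the flat index i.
        const int64_t i3 = i / (ne[2] * ne[1] * ne[0]);
        const int64_t i2 = (i / (ne[1] * ne[0])) % ne[2];
        const int64_t i1 = (i / ne[0]) % ne[1];
        const int64_t i0 = i % ne[0];

        // Byte offset of element (i0, i1, i2, i3); the kernel's char *
        // arithmetic matches this because nb[] is in bytes, not elements.
        const int64_t offset = i0*nb[0] + i1*nb[1] + i2*nb[2] + i3*nb[3];

        printf("i=%2lld -> (i0=%lld, i1=%lld, i2=%lld, i3=%lld), byte offset %lld\n",
               (long long) i, (long long) i0, (long long) i1,
               (long long) i2, (long long) i3, (long long) offset);
    }
    return 0;
}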