25
25
#include < cstdio>
26
26
#include < memory>
27
27
#include < string>
28
+ #include < thread>
29
+ #include < unordered_map>
28
30
29
31
#ifdef _WIN32
30
32
#include < codecvt>
34
36
#include < VapourSynth.h>
35
37
#include < VSHelper.h>
36
38
39
+ #define BOOST_COMPUTE_HAVE_THREAD_LOCAL
40
+ #define BOOST_COMPUTE_THREAD_SAFE
37
41
#include < boost/compute/core.hpp>
38
42
#include < boost/compute/utility/dim.hpp>
39
43
#include < boost/compute/utility/source.hpp>
@@ -305,19 +309,23 @@ struct NNEDI3CLData {
305
309
VSVideoInfo vi;
306
310
int field;
307
311
bool dh, dw, process[3 ];
308
- compute::command_queue queue ;
309
- compute::kernel kernel ;
310
- compute::image2d src, dst, tmp ;
312
+ compute::device gpu ;
313
+ compute::context ctx ;
314
+ compute::program program ;
311
315
compute::buffer weights0, weights1Buffer;
312
316
cl_mem weights1;
317
+ cl_image_format clImageFormat;
318
+ std::unordered_map<std::thread::id, compute::command_queue> queue;
319
+ std::unordered_map<std::thread::id, compute::kernel> kernel;
320
+ std::unordered_map<std::thread::id, compute::image2d> src, dst, tmp;
313
321
};
314
322
315
323
static inline int roundds (const double f) { // Rounds f to an int: rounds up when the fractional part (f - floor(f)) is >= 0.5, otherwise rounds down.
316
324
return (f - std::floor (f) >= 0.5 ) ? std::min (static_cast <int >(std::ceil (f)), 32767 ) : std::max (static_cast <int >(std::floor (f)), -32768 ); // Round-up path: ceil(f) capped at 32767. Round-down path: floor(f) floored at -32768. Each bound is applied on one path only, so the result is not a full clamp to [-32768, 32767]. NOTE(review): the cast to int happens before the min/max, so f is presumably expected to already fit in an int - confirm against callers.
317
325
}
318
326
319
327
template <typename T>
320
- static void process (const VSFrameRef * src, VSFrameRef * dst, const int field_n, NNEDI3CLData * d, const VSAPI * vsapi) {
328
+ static void process (const VSFrameRef * src, VSFrameRef * dst, const int field_n, const NNEDI3CLData * d, const VSAPI * vsapi) {
321
329
for (int plane = 0 ; plane < d->vi .format ->numPlanes ; plane++) {
322
330
if (d->process [plane]) {
323
331
const int srcWidth = vsapi->getFrameWidth (src, plane);
@@ -327,30 +335,37 @@ static void process(const VSFrameRef * src, VSFrameRef * dst, const int field_n,
327
335
const T * srcp = reinterpret_cast <const T *>(vsapi->getReadPtr (src, plane));
328
336
T * VS_RESTRICT dstp = reinterpret_cast <T *>(vsapi->getWritePtr (dst, plane));
329
337
338
+ const auto threadId = std::this_thread::get_id ();
339
+ auto queue = d->queue .at (threadId);
340
+ auto kernel = d->kernel .at (threadId);
341
+ auto srcImage = d->src .at (threadId);
342
+ auto dstImage = d->dst .at (threadId);
343
+ auto tmpImage = d->tmp .at (threadId);
344
+
330
345
constexpr size_t localWorkSize[] = { 8 , 8 };
331
346
332
- d-> queue .enqueue_write_image (d-> src , compute::dim (0 , 0 ), compute::dim (srcWidth, srcHeight), srcp, vsapi->getStride (src, plane));
347
+ queue.enqueue_write_image (srcImage , compute::dim (0 , 0 ), compute::dim (srcWidth, srcHeight), srcp, vsapi->getStride (src, plane));
333
348
334
349
if (d->dh && d->dw ) {
335
350
size_t globalWorkSize[] = { static_cast <size_t >(((srcHeight + 7 ) / 8 + 7 ) & -8 ), static_cast <size_t >((dstWidth / 2 + 7 ) & -8 ) };
336
- d-> kernel .set_args (d-> src , d-> tmp , d->weights0 , d->weights1 , srcHeight, srcWidth, srcHeight, dstWidth, field_n, 1 - field_n, -1 );
337
- d-> queue .enqueue_nd_range_kernel (d-> kernel , 2 , nullptr , globalWorkSize, localWorkSize);
351
+ kernel.set_args (srcImage, tmpImage , d->weights0 , d->weights1 , srcHeight, srcWidth, srcHeight, dstWidth, field_n, 1 - field_n, -1 );
352
+ queue.enqueue_nd_range_kernel (kernel, 2 , nullptr , globalWorkSize, localWorkSize);
338
353
339
354
globalWorkSize[0 ] = static_cast <size_t >(((dstWidth + 7 ) / 8 + 7 ) & -8 );
340
355
globalWorkSize[1 ] = static_cast <size_t >((dstHeight / 2 + 7 ) & -8 );
341
- d-> kernel .set_args (d-> tmp , d-> dst , d->weights0 , d->weights1 , dstWidth, srcHeight, dstWidth, dstHeight, field_n, 1 - field_n, 0 );
342
- d-> queue .enqueue_nd_range_kernel (d-> kernel , 2 , nullptr , globalWorkSize, localWorkSize);
356
+ kernel.set_args (tmpImage, dstImage , d->weights0 , d->weights1 , dstWidth, srcHeight, dstWidth, dstHeight, field_n, 1 - field_n, 0 );
357
+ queue.enqueue_nd_range_kernel (kernel, 2 , nullptr , globalWorkSize, localWorkSize);
343
358
} else if (d->dw ) {
344
359
const size_t globalWorkSize[] = { static_cast <size_t >(((dstHeight + 7 ) / 8 + 7 ) & -8 ), static_cast <size_t >((dstWidth / 2 + 7 ) & -8 ) };
345
- d-> kernel .set_args (d-> src , d-> dst , d->weights0 , d->weights1 , srcHeight, srcWidth, dstHeight, dstWidth, field_n, 1 - field_n, -1 );
346
- d-> queue .enqueue_nd_range_kernel (d-> kernel , 2 , nullptr , globalWorkSize, localWorkSize);
360
+ kernel.set_args (srcImage, dstImage , d->weights0 , d->weights1 , srcHeight, srcWidth, dstHeight, dstWidth, field_n, 1 - field_n, -1 );
361
+ queue.enqueue_nd_range_kernel (kernel, 2 , nullptr , globalWorkSize, localWorkSize);
347
362
} else {
348
363
const size_t globalWorkSize[] = { static_cast <size_t >(((dstWidth + 7 ) / 8 + 7 ) & -8 ), static_cast <size_t >((dstHeight / 2 + 7 ) & -8 ) };
349
- d-> kernel .set_args (d-> src , d-> dst , d->weights0 , d->weights1 , srcWidth, srcHeight, dstWidth, dstHeight, field_n, 1 - field_n, 0 );
350
- d-> queue .enqueue_nd_range_kernel (d-> kernel , 2 , nullptr , globalWorkSize, localWorkSize);
364
+ kernel.set_args (srcImage, dstImage , d->weights0 , d->weights1 , srcWidth, srcHeight, dstWidth, dstHeight, field_n, 1 - field_n, 0 );
365
+ queue.enqueue_nd_range_kernel (kernel, 2 , nullptr , globalWorkSize, localWorkSize);
351
366
}
352
367
353
- d-> queue .enqueue_read_image (d-> dst , compute::dim (0 , 0 ), compute::dim (dstWidth, dstHeight), dstp, vsapi->getStride (dst, plane));
368
+ queue.enqueue_read_image (dstImage , compute::dim (0 , 0 ), compute::dim (dstWidth, dstHeight), dstp, vsapi->getStride (dst, plane));
354
369
}
355
370
}
356
371
}
@@ -366,6 +381,35 @@ static const VSFrameRef *VS_CC nnedi3clGetFrame(int n, int activationReason, voi
366
381
if (activationReason == arInitial) {
367
382
vsapi->requestFrameFilter (d->field > 1 ? n / 2 : n, d->node , frameCtx);
368
383
} else if (activationReason == arAllFramesReady) {
384
+ try {
385
+ auto threadId = std::this_thread::get_id ();
386
+
387
+ if (!d->queue .count (threadId))
388
+ d->queue .emplace (threadId, compute::command_queue{ d->ctx , d->gpu });
389
+
390
+ if (!d->kernel .count (threadId)) {
391
+ if (d->vi .format ->sampleType == stInteger)
392
+ d->kernel .emplace (threadId, d->program .create_kernel (" process_uint" ));
393
+ else
394
+ d->kernel .emplace (threadId, d->program .create_kernel (" process_float" ));
395
+ }
396
+
397
+ if (!d->src .count (threadId))
398
+ d->src .emplace (threadId, compute::image2d{ d->ctx , static_cast <size_t >(vsapi->getVideoInfo (d->node )->width ), static_cast <size_t >(vsapi->getVideoInfo (d->node )->height ), compute::image_format{ d->clImageFormat }, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY });
399
+
400
+ if (!d->dst .count (threadId))
401
+ d->dst .emplace (threadId, compute::image2d{ d->ctx , static_cast <size_t >(std::max (d->vi .width , d->vi .height )), static_cast <size_t >(std::max (d->vi .width , d->vi .height )), compute::image_format{ d->clImageFormat }, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY });
402
+
403
+ if (!d->tmp .count (threadId))
404
+ d->tmp .emplace (threadId, compute::image2d{ d->ctx , static_cast <size_t >(std::max (d->vi .width , d->vi .height )), static_cast <size_t >(std::max (d->vi .width , d->vi .height )), compute::image_format{ d->clImageFormat }, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
405
+ } catch (const std::string & error) {
406
+ vsapi->setFilterError ((" NNEDI3CL: " + error).c_str (), frameCtx);
407
+ return nullptr ;
408
+ } catch (const compute::opencl_error & error) {
409
+ vsapi->setFilterError ((" NNEDI3CL: " + error.error_string ()).c_str (), frameCtx);
410
+ return nullptr ;
411
+ }
412
+
369
413
const VSFrameRef * src = vsapi->getFrameFilter (d->field > 1 ? n / 2 : n, d->node , frameCtx);
370
414
const VSFrameRef * fr[] = { d->process [0 ] ? nullptr : src, d->process [1 ] ? nullptr : src, d->process [2 ] ? nullptr : src };
371
415
const int pl[] = { 0 , 1 , 2 };
@@ -752,34 +796,33 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
752
796
const float scaleAsize = 1 .f / asize;
753
797
const float scaleQual = 1 .f / qual;
754
798
755
- compute::device gpu = compute::system::default_device ();
799
+ d-> gpu = compute::system::default_device ();
756
800
if (device > -1 )
757
- gpu = compute::system::devices ().at (device);
758
- const compute::context ctx{ gpu };
759
- d->queue = compute::command_queue{ ctx, gpu };
801
+ d->gpu = compute::system::devices ().at (device);
802
+ d->ctx = compute::context{ d->gpu };
760
803
761
- d->weights0 = compute::buffer{ ctx, std::max (dims0, dims0new) * sizeof (cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weights0 };
762
- d->weights1Buffer = compute::buffer{ ctx, dims1 * 2 * sizeof (cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weights1 };
804
+ d->weights0 = compute::buffer{ d-> ctx , std::max (dims0, dims0new) * sizeof (cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weights0 };
805
+ d->weights1Buffer = compute::buffer{ d-> ctx , dims1 * 2 * sizeof (cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weights1 };
763
806
delete[] weights0;
764
807
delete[] weights1;
765
808
766
809
if (!!vsapi->propGetInt (in, " info" , 0 , &err)) {
767
810
std::string text{ " === Device Info ===\n " };
768
- text += " Name: " + gpu.get_info <CL_DEVICE_NAME>() + " \n " ;
769
- text += " Vendor: " + gpu.get_info <CL_DEVICE_VENDOR>() + " \n " ;
770
- text += " Profile: " + gpu.get_info <CL_DEVICE_PROFILE>() + " \n " ;
771
- text += " Version: " + gpu.get_info <CL_DEVICE_VERSION>() + " \n " ;
772
- text += " Global Memory Size: " + std::to_string (gpu.get_info <CL_DEVICE_GLOBAL_MEM_SIZE>() / 1024 / 1024 ) + " MB\n " ;
773
- text += " Local Memory Size: " + std::to_string (gpu.get_info <CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 ) + " KB\n " ;
774
- text += " Local Memory Type: " + std::string{ gpu.get_info <CL_DEVICE_LOCAL_MEM_TYPE>() == CL_LOCAL ? " CL_LOCAL" : " CL_GLOBAL" } +" \n " ;
775
- text += " Image Support: " + std::string{ gpu.get_info <CL_DEVICE_IMAGE_SUPPORT>() ? " CL_TRUE" : " CL_FALSE" } +" \n " ;
776
- text += " 1D Image Max Buffer Size: " + std::to_string (gpu.get_info <size_t >(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE)) + " \n " ;
777
- text += " 2D Image Max Width: " + std::to_string (gpu.get_info <CL_DEVICE_IMAGE2D_MAX_WIDTH>()) + " \n " ;
778
- text += " 2D Image Max Height: " + std::to_string (gpu.get_info <CL_DEVICE_IMAGE2D_MAX_HEIGHT>()) + " \n " ;
779
- text += " Max Constant Arguments: " + std::to_string (gpu.get_info <CL_DEVICE_MAX_CONSTANT_ARGS>()) + " \n " ;
780
- text += " Max Constant Buffer Size: " + std::to_string (gpu.get_info <CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>() / 1024 ) + " KB\n " ;
781
- text += " Max Work-group Size: " + std::to_string (gpu.get_info <CL_DEVICE_MAX_WORK_GROUP_SIZE>()) + " \n " ;
782
- const auto MAX_WORK_ITEM_SIZES = gpu.get_info <CL_DEVICE_MAX_WORK_ITEM_SIZES>();
811
+ text += " Name: " + d-> gpu .get_info <CL_DEVICE_NAME>() + " \n " ;
812
+ text += " Vendor: " + d-> gpu .get_info <CL_DEVICE_VENDOR>() + " \n " ;
813
+ text += " Profile: " + d-> gpu .get_info <CL_DEVICE_PROFILE>() + " \n " ;
814
+ text += " Version: " + d-> gpu .get_info <CL_DEVICE_VERSION>() + " \n " ;
815
+ text += " Global Memory Size: " + std::to_string (d-> gpu .get_info <CL_DEVICE_GLOBAL_MEM_SIZE>() / 1024 / 1024 ) + " MB\n " ;
816
+ text += " Local Memory Size: " + std::to_string (d-> gpu .get_info <CL_DEVICE_LOCAL_MEM_SIZE>() / 1024 ) + " KB\n " ;
817
+ text += " Local Memory Type: " + std::string{ d-> gpu .get_info <CL_DEVICE_LOCAL_MEM_TYPE>() == CL_LOCAL ? " CL_LOCAL" : " CL_GLOBAL" } +" \n " ;
818
+ text += " Image Support: " + std::string{ d-> gpu .get_info <CL_DEVICE_IMAGE_SUPPORT>() ? " CL_TRUE" : " CL_FALSE" } +" \n " ;
819
+ text += " 1D Image Max Buffer Size: " + std::to_string (d-> gpu .get_info <size_t >(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE)) + " \n " ;
820
+ text += " 2D Image Max Width: " + std::to_string (d-> gpu .get_info <CL_DEVICE_IMAGE2D_MAX_WIDTH>()) + " \n " ;
821
+ text += " 2D Image Max Height: " + std::to_string (d-> gpu .get_info <CL_DEVICE_IMAGE2D_MAX_HEIGHT>()) + " \n " ;
822
+ text += " Max Constant Arguments: " + std::to_string (d-> gpu .get_info <CL_DEVICE_MAX_CONSTANT_ARGS>()) + " \n " ;
823
+ text += " Max Constant Buffer Size: " + std::to_string (d-> gpu .get_info <CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>() / 1024 ) + " KB\n " ;
824
+ text += " Max Work-group Size: " + std::to_string (d-> gpu .get_info <CL_DEVICE_MAX_WORK_GROUP_SIZE>()) + " \n " ;
825
+ const auto MAX_WORK_ITEM_SIZES = d-> gpu .get_info <CL_DEVICE_MAX_WORK_ITEM_SIZES>();
783
826
text += " Max Work-item Sizes: (" + std::to_string (MAX_WORK_ITEM_SIZES[0 ]) + " , " + std::to_string (MAX_WORK_ITEM_SIZES[1 ]) + " , " + std::to_string (MAX_WORK_ITEM_SIZES[2 ]) + " )" ;
784
827
785
828
VSMap * args = vsapi->createMap ();
@@ -803,7 +846,7 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
803
846
return ;
804
847
}
805
848
806
- compute:: program program = compute::program::create_with_source (source, ctx);
849
+ d-> program = compute::program::create_with_source (source, d-> ctx );
807
850
try {
808
851
std::setlocale (LC_ALL, " C" );
809
852
char buf[100 ];
@@ -837,28 +880,17 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
837
880
options += " -D Y_STRIDE=" + std::to_string (8 );
838
881
}
839
882
std::setlocale (LC_ALL, " " );
840
- program.build (options);
883
+ d-> program .build (options);
841
884
} catch (const compute::opencl_error & error) {
842
- throw error.error_string () + " \n " + program.build_log ();
885
+ throw error.error_string () + " \n " + d-> program .build_log ();
843
886
}
844
887
845
- if (d->vi .format ->sampleType == stInteger)
846
- d->kernel = program.create_kernel (" process_uint" );
847
- else
848
- d->kernel = program.create_kernel (" process_float" );
849
-
850
- cl_image_format clImageFormat;
851
888
if (d->vi .format ->bytesPerSample == 1 )
852
- clImageFormat = { CL_R, CL_UNSIGNED_INT8 };
889
+ d-> clImageFormat = { CL_R, CL_UNSIGNED_INT8 };
853
890
else if (d->vi .format ->bytesPerSample == 2 )
854
- clImageFormat = { CL_R, CL_UNSIGNED_INT16 };
891
+ d-> clImageFormat = { CL_R, CL_UNSIGNED_INT16 };
855
892
else
856
- clImageFormat = { CL_R, CL_FLOAT };
857
- const compute::image_format imageFormat{ clImageFormat };
858
-
859
- d->src = compute::image2d{ ctx, static_cast <size_t >(vsapi->getVideoInfo (d->node )->width ), static_cast <size_t >(vsapi->getVideoInfo (d->node )->height ), imageFormat, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY };
860
- d->dst = compute::image2d{ ctx, static_cast <size_t >(std::max (d->vi .width , d->vi .height )), static_cast <size_t >(std::max (d->vi .width , d->vi .height )), imageFormat, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY };
861
- d->tmp = compute::image2d{ ctx, static_cast <size_t >(std::max (d->vi .width , d->vi .height )), static_cast <size_t >(std::max (d->vi .width , d->vi .height )), imageFormat, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS };
893
+ d->clImageFormat = { CL_R, CL_FLOAT };
862
894
863
895
{
864
896
constexpr cl_image_format format = { CL_R, CL_FLOAT };
@@ -881,7 +913,7 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
881
913
882
914
cl_int error = 0 ;
883
915
884
- cl_mem mem = clCreateImage (ctx, 0 , &format, &desc, nullptr , &error);
916
+ cl_mem mem = clCreateImage (d-> ctx , 0 , &format, &desc, nullptr , &error);
885
917
if (!mem)
886
918
BOOST_THROW_EXCEPTION (compute::opencl_error (error));
887
919
@@ -901,7 +933,7 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
901
933
return ;
902
934
}
903
935
904
- vsapi->createFilter (in, out, " NNEDI3CL" , nnedi3clInit, nnedi3clGetFrame, nnedi3clFree, fmParallelRequests , 0 , d.release (), core);
936
+ vsapi->createFilter (in, out, " NNEDI3CL" , nnedi3clInit, nnedi3clGetFrame, nnedi3clFree, fmParallel , 0 , d.release (), core);
905
937
}
906
938
907
939
// ////////////////////////////////////////
0 commit comments