Skip to content

Commit 7823887

Browse files
committed
Change filter mode to completely parallel execution
1 parent d390065 commit 7823887

File tree

2 files changed

+87
-55
lines changed

2 files changed

+87
-55
lines changed

NNEDI3CL/NNEDI3CL.cpp

Lines changed: 86 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@
2525
#include <cstdio>
2626
#include <memory>
2727
#include <string>
28+
#include <thread>
29+
#include <unordered_map>
2830

2931
#ifdef _WIN32
3032
#include <codecvt>
@@ -34,6 +36,8 @@
3436
#include <VapourSynth.h>
3537
#include <VSHelper.h>
3638

39+
#define BOOST_COMPUTE_HAVE_THREAD_LOCAL
40+
#define BOOST_COMPUTE_THREAD_SAFE
3741
#include <boost/compute/core.hpp>
3842
#include <boost/compute/utility/dim.hpp>
3943
#include <boost/compute/utility/source.hpp>
@@ -305,19 +309,23 @@ struct NNEDI3CLData {
305309
VSVideoInfo vi;
306310
int field;
307311
bool dh, dw, process[3];
308-
compute::command_queue queue;
309-
compute::kernel kernel;
310-
compute::image2d src, dst, tmp;
312+
compute::device gpu;
313+
compute::context ctx;
314+
compute::program program;
311315
compute::buffer weights0, weights1Buffer;
312316
cl_mem weights1;
317+
cl_image_format clImageFormat;
318+
std::unordered_map<std::thread::id, compute::command_queue> queue;
319+
std::unordered_map<std::thread::id, compute::kernel> kernel;
320+
std::unordered_map<std::thread::id, compute::image2d> src, dst, tmp;
313321
};
314322

315323
static inline int roundds(const double f) {
316324
return (f - std::floor(f) >= 0.5) ? std::min(static_cast<int>(std::ceil(f)), 32767) : std::max(static_cast<int>(std::floor(f)), -32768);
317325
}
318326

319327
template<typename T>
320-
static void process(const VSFrameRef * src, VSFrameRef * dst, const int field_n, NNEDI3CLData * d, const VSAPI * vsapi) {
328+
static void process(const VSFrameRef * src, VSFrameRef * dst, const int field_n, const NNEDI3CLData * d, const VSAPI * vsapi) {
321329
for (int plane = 0; plane < d->vi.format->numPlanes; plane++) {
322330
if (d->process[plane]) {
323331
const int srcWidth = vsapi->getFrameWidth(src, plane);
@@ -327,30 +335,37 @@ static void process(const VSFrameRef * src, VSFrameRef * dst, const int field_n,
327335
const T * srcp = reinterpret_cast<const T *>(vsapi->getReadPtr(src, plane));
328336
T * VS_RESTRICT dstp = reinterpret_cast<T *>(vsapi->getWritePtr(dst, plane));
329337

338+
const auto threadId = std::this_thread::get_id();
339+
auto queue = d->queue.at(threadId);
340+
auto kernel = d->kernel.at(threadId);
341+
auto srcImage = d->src.at(threadId);
342+
auto dstImage = d->dst.at(threadId);
343+
auto tmpImage = d->tmp.at(threadId);
344+
330345
constexpr size_t localWorkSize[] = { 8, 8 };
331346

332-
d->queue.enqueue_write_image(d->src, compute::dim(0, 0), compute::dim(srcWidth, srcHeight), srcp, vsapi->getStride(src, plane));
347+
queue.enqueue_write_image(srcImage, compute::dim(0, 0), compute::dim(srcWidth, srcHeight), srcp, vsapi->getStride(src, plane));
333348

334349
if (d->dh && d->dw) {
335350
size_t globalWorkSize[] = { static_cast<size_t>(((srcHeight + 7) / 8 + 7) & -8), static_cast<size_t>((dstWidth / 2 + 7) & -8) };
336-
d->kernel.set_args(d->src, d->tmp, d->weights0, d->weights1, srcHeight, srcWidth, srcHeight, dstWidth, field_n, 1 - field_n, -1);
337-
d->queue.enqueue_nd_range_kernel(d->kernel, 2, nullptr, globalWorkSize, localWorkSize);
351+
kernel.set_args(srcImage, tmpImage, d->weights0, d->weights1, srcHeight, srcWidth, srcHeight, dstWidth, field_n, 1 - field_n, -1);
352+
queue.enqueue_nd_range_kernel(kernel, 2, nullptr, globalWorkSize, localWorkSize);
338353

339354
globalWorkSize[0] = static_cast<size_t>(((dstWidth + 7) / 8 + 7) & -8);
340355
globalWorkSize[1] = static_cast<size_t>((dstHeight / 2 + 7) & -8);
341-
d->kernel.set_args(d->tmp, d->dst, d->weights0, d->weights1, dstWidth, srcHeight, dstWidth, dstHeight, field_n, 1 - field_n, 0);
342-
d->queue.enqueue_nd_range_kernel(d->kernel, 2, nullptr, globalWorkSize, localWorkSize);
356+
kernel.set_args(tmpImage, dstImage, d->weights0, d->weights1, dstWidth, srcHeight, dstWidth, dstHeight, field_n, 1 - field_n, 0);
357+
queue.enqueue_nd_range_kernel(kernel, 2, nullptr, globalWorkSize, localWorkSize);
343358
} else if (d->dw) {
344359
const size_t globalWorkSize[] = { static_cast<size_t>(((dstHeight + 7) / 8 + 7) & -8), static_cast<size_t>((dstWidth / 2 + 7) & -8) };
345-
d->kernel.set_args(d->src, d->dst, d->weights0, d->weights1, srcHeight, srcWidth, dstHeight, dstWidth, field_n, 1 - field_n, -1);
346-
d->queue.enqueue_nd_range_kernel(d->kernel, 2, nullptr, globalWorkSize, localWorkSize);
360+
kernel.set_args(srcImage, dstImage, d->weights0, d->weights1, srcHeight, srcWidth, dstHeight, dstWidth, field_n, 1 - field_n, -1);
361+
queue.enqueue_nd_range_kernel(kernel, 2, nullptr, globalWorkSize, localWorkSize);
347362
} else {
348363
const size_t globalWorkSize[] = { static_cast<size_t>(((dstWidth + 7) / 8 + 7) & -8), static_cast<size_t>((dstHeight / 2 + 7) & -8) };
349-
d->kernel.set_args(d->src, d->dst, d->weights0, d->weights1, srcWidth, srcHeight, dstWidth, dstHeight, field_n, 1 - field_n, 0);
350-
d->queue.enqueue_nd_range_kernel(d->kernel, 2, nullptr, globalWorkSize, localWorkSize);
364+
kernel.set_args(srcImage, dstImage, d->weights0, d->weights1, srcWidth, srcHeight, dstWidth, dstHeight, field_n, 1 - field_n, 0);
365+
queue.enqueue_nd_range_kernel(kernel, 2, nullptr, globalWorkSize, localWorkSize);
351366
}
352367

353-
d->queue.enqueue_read_image(d->dst, compute::dim(0, 0), compute::dim(dstWidth, dstHeight), dstp, vsapi->getStride(dst, plane));
368+
queue.enqueue_read_image(dstImage, compute::dim(0, 0), compute::dim(dstWidth, dstHeight), dstp, vsapi->getStride(dst, plane));
354369
}
355370
}
356371
}
@@ -366,6 +381,35 @@ static const VSFrameRef *VS_CC nnedi3clGetFrame(int n, int activationReason, voi
366381
if (activationReason == arInitial) {
367382
vsapi->requestFrameFilter(d->field > 1 ? n / 2 : n, d->node, frameCtx);
368383
} else if (activationReason == arAllFramesReady) {
384+
try {
385+
auto threadId = std::this_thread::get_id();
386+
387+
if (!d->queue.count(threadId))
388+
d->queue.emplace(threadId, compute::command_queue{ d->ctx, d->gpu });
389+
390+
if (!d->kernel.count(threadId)) {
391+
if (d->vi.format->sampleType == stInteger)
392+
d->kernel.emplace(threadId, d->program.create_kernel("process_uint"));
393+
else
394+
d->kernel.emplace(threadId, d->program.create_kernel("process_float"));
395+
}
396+
397+
if (!d->src.count(threadId))
398+
d->src.emplace(threadId, compute::image2d{ d->ctx, static_cast<size_t>(vsapi->getVideoInfo(d->node)->width), static_cast<size_t>(vsapi->getVideoInfo(d->node)->height), compute::image_format{ d->clImageFormat }, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY });
399+
400+
if (!d->dst.count(threadId))
401+
d->dst.emplace(threadId, compute::image2d{ d->ctx, static_cast<size_t>(std::max(d->vi.width, d->vi.height)), static_cast<size_t>(std::max(d->vi.width, d->vi.height)), compute::image_format{ d->clImageFormat }, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY });
402+
403+
if (!d->tmp.count(threadId))
404+
d->tmp.emplace(threadId, compute::image2d{ d->ctx, static_cast<size_t>(std::max(d->vi.width, d->vi.height)), static_cast<size_t>(std::max(d->vi.width, d->vi.height)), compute::image_format{ d->clImageFormat }, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS });
405+
} catch (const std::string & error) {
406+
vsapi->setFilterError(("NNEDI3CL: " + error).c_str(), frameCtx);
407+
return nullptr;
408+
} catch (const compute::opencl_error & error) {
409+
vsapi->setFilterError(("NNEDI3CL: " + error.error_string()).c_str(), frameCtx);
410+
return nullptr;
411+
}
412+
369413
const VSFrameRef * src = vsapi->getFrameFilter(d->field > 1 ? n / 2 : n, d->node, frameCtx);
370414
const VSFrameRef * fr[] = { d->process[0] ? nullptr : src, d->process[1] ? nullptr : src, d->process[2] ? nullptr : src };
371415
const int pl[] = { 0, 1, 2 };
@@ -752,34 +796,33 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
752796
const float scaleAsize = 1.f / asize;
753797
const float scaleQual = 1.f / qual;
754798

755-
compute::device gpu = compute::system::default_device();
799+
d->gpu = compute::system::default_device();
756800
if (device > -1)
757-
gpu = compute::system::devices().at(device);
758-
const compute::context ctx{ gpu };
759-
d->queue = compute::command_queue{ ctx, gpu };
801+
d->gpu = compute::system::devices().at(device);
802+
d->ctx = compute::context{ d->gpu };
760803

761-
d->weights0 = compute::buffer{ ctx, std::max(dims0, dims0new) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weights0 };
762-
d->weights1Buffer = compute::buffer{ ctx, dims1 * 2 * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weights1 };
804+
d->weights0 = compute::buffer{ d->ctx, std::max(dims0, dims0new) * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weights0 };
805+
d->weights1Buffer = compute::buffer{ d->ctx, dims1 * 2 * sizeof(cl_float), CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR | CL_MEM_HOST_NO_ACCESS, weights1 };
763806
delete[] weights0;
764807
delete[] weights1;
765808

766809
if (!!vsapi->propGetInt(in, "info", 0, &err)) {
767810
std::string text{ "=== Device Info ===\n" };
768-
text += "Name: " + gpu.get_info<CL_DEVICE_NAME>() + "\n";
769-
text += "Vendor: " + gpu.get_info<CL_DEVICE_VENDOR>() + "\n";
770-
text += "Profile: " + gpu.get_info<CL_DEVICE_PROFILE>() + "\n";
771-
text += "Version: " + gpu.get_info<CL_DEVICE_VERSION>() + "\n";
772-
text += "Global Memory Size: " + std::to_string(gpu.get_info<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1024 / 1024) + " MB\n";
773-
text += "Local Memory Size: " + std::to_string(gpu.get_info<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024) + " KB\n";
774-
text += "Local Memory Type: " + std::string{ gpu.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() == CL_LOCAL ? "CL_LOCAL" : "CL_GLOBAL" } +"\n";
775-
text += "Image Support: " + std::string{ gpu.get_info<CL_DEVICE_IMAGE_SUPPORT>() ? "CL_TRUE" : "CL_FALSE" } +"\n";
776-
text += "1D Image Max Buffer Size: " + std::to_string(gpu.get_info<size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE)) + "\n";
777-
text += "2D Image Max Width: " + std::to_string(gpu.get_info<CL_DEVICE_IMAGE2D_MAX_WIDTH>()) + "\n";
778-
text += "2D Image Max Height: " + std::to_string(gpu.get_info<CL_DEVICE_IMAGE2D_MAX_HEIGHT>()) + "\n";
779-
text += "Max Constant Arguments: " + std::to_string(gpu.get_info<CL_DEVICE_MAX_CONSTANT_ARGS>()) + "\n";
780-
text += "Max Constant Buffer Size: " + std::to_string(gpu.get_info<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>() / 1024) + " KB\n";
781-
text += "Max Work-group Size: " + std::to_string(gpu.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>()) + "\n";
782-
const auto MAX_WORK_ITEM_SIZES = gpu.get_info<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
811+
text += "Name: " + d->gpu.get_info<CL_DEVICE_NAME>() + "\n";
812+
text += "Vendor: " + d->gpu.get_info<CL_DEVICE_VENDOR>() + "\n";
813+
text += "Profile: " + d->gpu.get_info<CL_DEVICE_PROFILE>() + "\n";
814+
text += "Version: " + d->gpu.get_info<CL_DEVICE_VERSION>() + "\n";
815+
text += "Global Memory Size: " + std::to_string(d->gpu.get_info<CL_DEVICE_GLOBAL_MEM_SIZE>() / 1024 / 1024) + " MB\n";
816+
text += "Local Memory Size: " + std::to_string(d->gpu.get_info<CL_DEVICE_LOCAL_MEM_SIZE>() / 1024) + " KB\n";
817+
text += "Local Memory Type: " + std::string{ d->gpu.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() == CL_LOCAL ? "CL_LOCAL" : "CL_GLOBAL" } +"\n";
818+
text += "Image Support: " + std::string{ d->gpu.get_info<CL_DEVICE_IMAGE_SUPPORT>() ? "CL_TRUE" : "CL_FALSE" } +"\n";
819+
text += "1D Image Max Buffer Size: " + std::to_string(d->gpu.get_info<size_t>(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE)) + "\n";
820+
text += "2D Image Max Width: " + std::to_string(d->gpu.get_info<CL_DEVICE_IMAGE2D_MAX_WIDTH>()) + "\n";
821+
text += "2D Image Max Height: " + std::to_string(d->gpu.get_info<CL_DEVICE_IMAGE2D_MAX_HEIGHT>()) + "\n";
822+
text += "Max Constant Arguments: " + std::to_string(d->gpu.get_info<CL_DEVICE_MAX_CONSTANT_ARGS>()) + "\n";
823+
text += "Max Constant Buffer Size: " + std::to_string(d->gpu.get_info<CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE>() / 1024) + " KB\n";
824+
text += "Max Work-group Size: " + std::to_string(d->gpu.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>()) + "\n";
825+
const auto MAX_WORK_ITEM_SIZES = d->gpu.get_info<CL_DEVICE_MAX_WORK_ITEM_SIZES>();
783826
text += "Max Work-item Sizes: (" + std::to_string(MAX_WORK_ITEM_SIZES[0]) + ", " + std::to_string(MAX_WORK_ITEM_SIZES[1]) + ", " + std::to_string(MAX_WORK_ITEM_SIZES[2]) + ")";
784827

785828
VSMap * args = vsapi->createMap();
@@ -803,7 +846,7 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
803846
return;
804847
}
805848

806-
compute::program program = compute::program::create_with_source(source, ctx);
849+
d->program = compute::program::create_with_source(source, d->ctx);
807850
try {
808851
std::setlocale(LC_ALL, "C");
809852
char buf[100];
@@ -837,28 +880,17 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
837880
options += " -D Y_STRIDE=" + std::to_string(8);
838881
}
839882
std::setlocale(LC_ALL, "");
840-
program.build(options);
883+
d->program.build(options);
841884
} catch (const compute::opencl_error & error) {
842-
throw error.error_string() + "\n" + program.build_log();
885+
throw error.error_string() + "\n" + d->program.build_log();
843886
}
844887

845-
if (d->vi.format->sampleType == stInteger)
846-
d->kernel = program.create_kernel("process_uint");
847-
else
848-
d->kernel = program.create_kernel("process_float");
849-
850-
cl_image_format clImageFormat;
851888
if (d->vi.format->bytesPerSample == 1)
852-
clImageFormat = { CL_R, CL_UNSIGNED_INT8 };
889+
d->clImageFormat = { CL_R, CL_UNSIGNED_INT8 };
853890
else if (d->vi.format->bytesPerSample == 2)
854-
clImageFormat = { CL_R, CL_UNSIGNED_INT16 };
891+
d->clImageFormat = { CL_R, CL_UNSIGNED_INT16 };
855892
else
856-
clImageFormat = { CL_R, CL_FLOAT };
857-
const compute::image_format imageFormat{ clImageFormat };
858-
859-
d->src = compute::image2d{ ctx, static_cast<size_t>(vsapi->getVideoInfo(d->node)->width), static_cast<size_t>(vsapi->getVideoInfo(d->node)->height), imageFormat, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY };
860-
d->dst = compute::image2d{ ctx, static_cast<size_t>(std::max(d->vi.width, d->vi.height)), static_cast<size_t>(std::max(d->vi.width, d->vi.height)), imageFormat, CL_MEM_READ_WRITE | CL_MEM_HOST_READ_ONLY };
861-
d->tmp = compute::image2d{ ctx, static_cast<size_t>(std::max(d->vi.width, d->vi.height)), static_cast<size_t>(std::max(d->vi.width, d->vi.height)), imageFormat, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS };
893+
d->clImageFormat = { CL_R, CL_FLOAT };
862894

863895
{
864896
constexpr cl_image_format format = { CL_R, CL_FLOAT };
@@ -881,7 +913,7 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
881913

882914
cl_int error = 0;
883915

884-
cl_mem mem = clCreateImage(ctx, 0, &format, &desc, nullptr, &error);
916+
cl_mem mem = clCreateImage(d->ctx, 0, &format, &desc, nullptr, &error);
885917
if (!mem)
886918
BOOST_THROW_EXCEPTION(compute::opencl_error(error));
887919

@@ -901,7 +933,7 @@ void VS_CC nnedi3clCreate(const VSMap *in, VSMap *out, void *userData, VSCore *c
901933
return;
902934
}
903935

904-
vsapi->createFilter(in, out, "NNEDI3CL", nnedi3clInit, nnedi3clGetFrame, nnedi3clFree, fmParallelRequests, 0, d.release(), core);
936+
vsapi->createFilter(in, out, "NNEDI3CL", nnedi3clInit, nnedi3clGetFrame, nnedi3clFree, fmParallel, 0, d.release(), core);
905937
}
906938

907939
//////////////////////////////////////////

configure.ac

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
AC_INIT([NNEDI3CL], [4], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-NNEDI3CL/issues], [NNEDI3CL], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-NNEDI3CL/])
1+
AC_INIT([NNEDI3CL], [5], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-NNEDI3CL/issues], [NNEDI3CL], [https://github.com/HomeOfVapourSynthEvolution/VapourSynth-NNEDI3CL/])
22

33
: ${CXXFLAGS=""}
44

0 commit comments

Comments
 (0)