diff --git a/src/include/OSL/rs_free_function.h b/src/include/OSL/rs_free_function.h index de9b3316f..0abf3fcf8 100644 --- a/src/include/OSL/rs_free_function.h +++ b/src/include/OSL/rs_free_function.h @@ -316,6 +316,13 @@ OSL_RSOP OSL_HOSTDEVICE bool rs_trace_get(OSL::OpaqueExecContextPtr oec, OSL::ustringhash name, OSL::TypeDesc type, void* val, bool derivatives); +/// Allocates memory for a closure color. May return null if no memory could +/// be allocated. It is the renderers responsibility to clean up these +/// allocations after a shader is run and the closures have been processed. +OSL_RSOP OSL_HOSTDEVICE void* +rs_allocate_closure(OSL::OpaqueExecContextPtr oec, size_t size, + size_t alignment); + /// Report errors, warnings, printf, and fprintf. /// Fmtlib style format specifier is used (vs. printf style) /// Arguments are represented as EncodedTypes (encodedtypes.h) and diff --git a/src/liboslexec/CMakeLists.txt b/src/liboslexec/CMakeLists.txt index eb7c6cb00..ba20dd67e 100644 --- a/src/liboslexec/CMakeLists.txt +++ b/src/liboslexec/CMakeLists.txt @@ -194,7 +194,7 @@ if (USE_LLVM_BITCODE) EMBED_LLVM_BITCODE_IN_CPP ( "${llvm_ops_srcs}" "_host" "osl_llvm_compiled_ops" lib_src "" "${include_dirs}") set (rs_dependent_ops_srcs - opmatrix.cpp opfmt.cpp optexture.cpp pointcloud.cpp + opmatrix.cpp opfmt.cpp optexture.cpp pointcloud.cpp opclosure.cpp ) # Achieve the effect of absorbing osl_llvm_compiled_ops by adding its # sources to rs_dependent_ops_srcs which avoids having to do it at runtime. @@ -217,6 +217,7 @@ if (USE_LLVM_BITCODE) ${CMAKE_SOURCE_DIR}/src/liboslexec/opmatrix.cpp ${CMAKE_SOURCE_DIR}/src/liboslexec/optexture.cpp ${CMAKE_SOURCE_DIR}/src/liboslexec/pointcloud.cpp + ${CMAKE_SOURCE_DIR}/src/liboslexec/opclosure.cpp ${CMAKE_SOURCE_DIR}/src/liboslnoise/gabornoise.cpp ${CMAKE_SOURCE_DIR}/src/liboslnoise/simplexnoise.cpp ) diff --git a/src/liboslexec/builtindecl.h b/src/liboslexec/builtindecl.h index c08b44db7..8795845a4 100644 --- a/src/liboslexec/builtindecl.h +++ b/src/liboslexec/builtindecl.h @@ -109,29 +109,14 @@ DECL(osl_##name##_dvdvv, "xXXX") - -#ifndef __CUDA_ARCH__ -DECL(osl_add_closure_closure, "CXCC") -DECL(osl_mul_closure_float, "CXCf") -DECL(osl_mul_closure_color, "CXCc") -DECL(osl_allocate_closure_component, "CXii") -DECL(osl_allocate_weighted_closure_component, "CXiiX") -DECL(osl_closure_to_string, "sXC") -DECL(osl_closure_to_ustringhash, "hXC") -#else -// TODO: Figure out why trying to match the signatures between host and device -// definitions fails with 'LLVM had to make a cast' assertion failure. -// -// In the meantime, use a signature that matches the definitions in rend_lib.cu, -// where void* is used instead of ClosureColor* and ShaderGlobals*. DECL(osl_add_closure_closure, "XXXX") DECL(osl_mul_closure_float, "XXXf") -DECL(osl_mul_closure_color, "XXXc") +DECL(osl_mul_closure_color, "XXXX") DECL(osl_allocate_closure_component, "XXii") DECL(osl_allocate_weighted_closure_component, "XXiiX") DECL(osl_closure_to_string, "sXX") DECL(osl_closure_to_ustringhash, "hXX") -#endif + DECL(osl_format, "hh*") DECL(osl_gen_ustringhash_pod, "hs") DECL(osl_gen_ustring, "sh") diff --git a/src/liboslexec/llvm_gen.cpp b/src/liboslexec/llvm_gen.cpp index 4e45e1868..3b9e788e6 100644 --- a/src/liboslexec/llvm_gen.cpp +++ b/src/liboslexec/llvm_gen.cpp @@ -3863,19 +3863,15 @@ LLVMGEN(llvm_gen_closure) id_int, size_int); llvm::Value* comp_void_ptr = return_ptr; - // For the weighted closures, we need a surrounding "if" so that it's safe - // for osl_allocate_weighted_closure_component to return NULL (unless we - // know for sure that it's constant weighted and that the weight is - // not zero). - llvm::BasicBlock* next_block = NULL; - if (weighted && !(weight->is_constant() && !rop.is_zero(*weight))) { - llvm::BasicBlock* notnull_block = rop.ll.new_basic_block( - "non_null_closure"); - next_block = rop.ll.new_basic_block(""); - llvm::Value* cond = rop.ll.op_ne(return_ptr, rop.ll.void_ptr_null()); - rop.ll.op_branch(cond, notnull_block, next_block); - // new insert point is nonnull_block - } + // We need a surrounding "if" so that it's safe for closure allocation to + // return NULL, either because it has zero weight, or renderer services ran + // out of memory in the closure pool. + llvm::BasicBlock* notnull_block = rop.ll.new_basic_block( + "non_null_closure"); + llvm::BasicBlock* next_block = rop.ll.new_basic_block(""); + llvm::Value* cond = rop.ll.op_ne(return_ptr, rop.ll.void_ptr_null()); + rop.ll.op_branch(cond, notnull_block, next_block); + // new insert point is nonnull_block llvm::Value* comp_ptr = rop.ll.ptr_cast(comp_void_ptr, rop.llvm_type_closure_component_ptr()); diff --git a/src/liboslexec/opclosure.cpp b/src/liboslexec/opclosure.cpp index 6aa32a072..e90e2f1ea 100644 --- a/src/liboslexec/opclosure.cpp +++ b/src/liboslexec/opclosure.cpp @@ -7,6 +7,7 @@ #include "oslexec_pvt.h" #include +#include OSL_NAMESPACE_BEGIN @@ -14,65 +15,112 @@ namespace pvt { -OSL_SHADEOP const ClosureColor* -osl_add_closure_closure(ShaderGlobals* sg, const ClosureColor* a, - const ClosureColor* b) +OSL_SHADEOP OSL_HOSTDEVICE const void* +osl_add_closure_closure(OpaqueExecContextPtr oec, const void* a_, + const void* b_) { + const ClosureColor* a = (const ClosureColor*)a_; + const ClosureColor* b = (const ClosureColor*)b_; if (a == NULL) return b; if (b == NULL) return a; - return sg->context->closure_add_allot(a, b); + ClosureAdd* add = (ClosureAdd*)rs_allocate_closure(oec, sizeof(ClosureAdd), + alignof(ClosureAdd)); + if (add) { + add->id = ClosureColor::ADD; + add->closureA = a; + add->closureB = b; + } + return add; } -OSL_SHADEOP const ClosureColor* -osl_mul_closure_color(ShaderGlobals* sg, ClosureColor* a, const Color3* w) +OSL_SHADEOP OSL_HOSTDEVICE const void* +osl_mul_closure_color(OpaqueExecContextPtr oec, const void* a_, const void* w_) { + const ClosureColor* a = (const ClosureColor*)a_; + const Color3* w = (const Color3*)w_; if (a == NULL) return NULL; if (w->x == 0.0f && w->y == 0.0f && w->z == 0.0f) return NULL; if (w->x == 1.0f && w->y == 1.0f && w->z == 1.0f) return a; - return sg->context->closure_mul_allot(*w, a); + ClosureMul* mul = (ClosureMul*)rs_allocate_closure(oec, sizeof(ClosureMul), + alignof(ClosureMul)); + if (mul) { + mul->id = ClosureColor::MUL; + mul->weight = *w; + mul->closure = a; + } + return mul; } -OSL_SHADEOP const ClosureColor* -osl_mul_closure_float(ShaderGlobals* sg, ClosureColor* a, float w) +OSL_SHADEOP OSL_HOSTDEVICE const void* +osl_mul_closure_float(OpaqueExecContextPtr oec, const void* a_, float w) { + const ClosureColor* a = (const ClosureColor*)a_; if (a == NULL) return NULL; if (w == 0.0f) return NULL; if (w == 1.0f) return a; - return sg->context->closure_mul_allot(w, a); + ClosureMul* mul = (ClosureMul*)rs_allocate_closure(oec, sizeof(ClosureMul), + alignof(ClosureMul)); + if (mul) { + mul->id = ClosureColor::MUL; + mul->weight = Color3(w); + mul->closure = a; + } + return mul; } -OSL_SHADEOP ClosureComponent* -osl_allocate_closure_component(ShaderGlobals* sg, int id, int size) +OSL_SHADEOP OSL_HOSTDEVICE void* +osl_allocate_closure_component(OpaqueExecContextPtr oec, int id, int size) { - return sg->context->closure_component_allot(id, size, Color3(1.0f)); + // Allocate the component and the mul back to back + const size_t needed = sizeof(ClosureComponent) + size; + ClosureComponent* comp + = (ClosureComponent*)rs_allocate_closure(oec, needed, + alignof(ClosureComponent)); + if (comp) { + comp->id = id; + comp->w = Color3(1.0f); + } + return comp; } -OSL_SHADEOP ClosureColor* -osl_allocate_weighted_closure_component(ShaderGlobals* sg, int id, int size, - const Color3* w) +OSL_SHADEOP OSL_HOSTDEVICE void* +osl_allocate_weighted_closure_component(OpaqueExecContextPtr oec, int id, + int size, const void* w_) { + const Color3* w = (const Color3*)w_; if (w->x == 0.0f && w->y == 0.0f && w->z == 0.0f) return NULL; - return sg->context->closure_component_allot(id, size, *w); + // Allocate the component and the mul back to back + const size_t needed = sizeof(ClosureComponent) + size; + ClosureComponent* comp + = (ClosureComponent*)rs_allocate_closure(oec, needed, + alignof(ClosureComponent)); + if (comp) { + comp->id = id; + comp->w = *w; + } + return comp; } // Deprecated, remove when conversion from ustring to ustringhash is finished OSL_SHADEOP const char* -osl_closure_to_string(ShaderGlobals* sg, ClosureColor* c) +osl_closure_to_string(OpaqueExecContextPtr oec, const void* c_) { + ShaderGlobals* sg = (ShaderGlobals*)oec; + const ClosureColor* c = (const ClosureColor*)c_; // Special case for printing closures std::ostringstream stream; stream.imbue(std::locale::classic()); // force C locale @@ -82,8 +130,10 @@ osl_closure_to_string(ShaderGlobals* sg, ClosureColor* c) } OSL_SHADEOP ustringhash_pod -osl_closure_to_ustringhash(ShaderGlobals* sg, ClosureColor* c) +osl_closure_to_ustringhash(OpaqueExecContextPtr oec, const void* c_) { + ShaderGlobals* sg = (ShaderGlobals*)oec; + const ClosureColor* c = (const ClosureColor*)c_; // Special case for printing closures std::ostringstream stream; stream.imbue(std::locale::classic()); // force C locale diff --git a/src/liboslexec/oslexec_pvt.h b/src/liboslexec/oslexec_pvt.h index fa2a95628..cc6c1c47a 100644 --- a/src/liboslexec/oslexec_pvt.h +++ b/src/liboslexec/oslexec_pvt.h @@ -2242,49 +2242,11 @@ class OSLEXECPUBLIC ShadingContext { } #endif - ClosureComponent* closure_component_allot(int id, size_t prim_size, - const Color3& w) + void* allocate_closure(size_t size, size_t alignment) { - // Allocate the component and the mul back to back - size_t needed = sizeof(ClosureComponent) + prim_size; - ClosureComponent* comp = (ClosureComponent*)m_closure_pool.alloc( - needed, alignof(ClosureComponent)); - comp->id = id; - comp->w = w; - return comp; + return m_closure_pool.alloc(size, alignment); } - ClosureMul* closure_mul_allot(const Color3& w, const ClosureColor* c) - { - ClosureMul* mul = (ClosureMul*)m_closure_pool.alloc(sizeof(ClosureMul), - alignof(ClosureMul)); - mul->id = ClosureColor::MUL; - mul->weight = w; - mul->closure = c; - return mul; - } - - ClosureMul* closure_mul_allot(float w, const ClosureColor* c) - { - ClosureMul* mul = (ClosureMul*)m_closure_pool.alloc(sizeof(ClosureMul), - alignof(ClosureMul)); - mul->id = ClosureColor::MUL; - mul->weight.setValue(w, w, w); - mul->closure = c; - return mul; - } - - ClosureAdd* closure_add_allot(const ClosureColor* a, const ClosureColor* b) - { - ClosureAdd* add = (ClosureAdd*)m_closure_pool.alloc(sizeof(ClosureAdd), - alignof(ClosureAdd)); - add->id = ClosureColor::ADD; - add->closureA = a; - add->closureB = b; - return add; - } - - /// Find the named symbol in the (already-executed!) stack of shaders of /// the given use. If a layer is given, search just that layer. If no /// layer is specified, priority is given to later laters over earlier diff --git a/src/liboslexec/rs_fallback.cpp b/src/liboslexec/rs_fallback.cpp index 374a8c365..2526a81d0 100644 --- a/src/liboslexec/rs_fallback.cpp +++ b/src/liboslexec/rs_fallback.cpp @@ -8,6 +8,7 @@ #include +#include "oslexec_pvt.h" // Fallback is to reroute calls back through the virtual function // based RendererServices from ShaderGlobals. @@ -316,6 +317,18 @@ rs_trace_get(OSL::OpaqueExecContextPtr exec_ctx, OSL::ustringhash name, #endif } +OSL_RSOP OSL_HOSTDEVICE void* +rs_allocate_closure(OSL::OpaqueExecContextPtr exec_ctx, size_t size, + size_t alignment) +{ +#ifndef __CUDA_ARCH__ + auto sg = get_sg(exec_ctx); + return sg->context->allocate_closure(size, alignment); +#else + return nullptr; +#endif +} + OSL_RSOP OSL_HOSTDEVICE void rs_errorfmt(OSL::OpaqueExecContextPtr exec_ctx, OSL::ustringhash fmt_specification, int32_t count, diff --git a/src/liboslexec/shadeimage.cpp b/src/liboslexec/shadeimage.cpp index da982670e..5a3db57ea 100644 --- a/src/liboslexec/shadeimage.cpp +++ b/src/liboslexec/shadeimage.cpp @@ -116,9 +116,6 @@ shade_image(ShadingSystem& shadingsys, ShaderGroup& group, // That also implies that our normal points to (0,0,1) sg.N = Vec3(0, 0, 1); sg.Ng = Vec3(0, 0, 1); - // In our SimpleRenderer, the "renderstate" itself just a pointer to - // the ShaderGlobals. - // sg.renderstate = &sg; } // Loop over all pixels in the image (in x and y)... diff --git a/src/osltoy/osltoyrenderer.cpp b/src/osltoy/osltoyrenderer.cpp index 48d55855c..932cee26b 100644 --- a/src/osltoy/osltoyrenderer.cpp +++ b/src/osltoy/osltoyrenderer.cpp @@ -115,9 +115,6 @@ OSLToyRenderer::OSLToyRenderer() // That also implies that our normal points to (0,0,1) sg.N = Vec3(0, 0, 1); sg.Ng = Vec3(0, 0, 1); - // In our SimpleRenderer, the "renderstate" itself just a pointer to - // the ShaderGlobals. - // sg.renderstate = &sg; } diff --git a/src/testrender/cuda/optix_raytracer.cu b/src/testrender/cuda/optix_raytracer.cu index a2880c730..c102d25dc 100644 --- a/src/testrender/cuda/optix_raytracer.cu +++ b/src/testrender/cuda/optix_raytracer.cu @@ -48,16 +48,19 @@ __device__ __constant__ RenderParams render_params; static inline __device__ void -execute_shader(ShaderGlobalsType& sg, const int shader_id, char* closure_pool) +execute_shader(ShaderGlobalsType& sg, const int shader_id, + StackClosurePool& closure_pool) { if (shader_id < 0) { // TODO: should probably never get here ... return; } - // Pack the "closure pool" into one of the ShaderGlobals pointers - *(int*)&closure_pool[0] = 0; - sg.renderstate = &closure_pool[0]; + closure_pool.reset(); + RenderState renderState; + // TODO: renderState.context = ... + renderState.closure_pool = &closure_pool; + sg.renderstate = &renderState; // Pack the pointers to the options structs in a faux "context", // which is a rough stand-in for the host ShadingContext. diff --git a/src/testrender/cuda/rend_lib.cu b/src/testrender/cuda/rend_lib.cu index eb7cce56e..75de931cc 100644 --- a/src/testrender/cuda/rend_lib.cu +++ b/src/testrender/cuda/rend_lib.cu @@ -30,17 +30,6 @@ extern __device__ CUdeviceptr xform_buffer; } // namespace pvt OSL_NAMESPACE_END - -// Taken from the SimplePool class -__device__ static inline size_t -alignment_offset_calc(void* ptr, size_t alignment) -{ - uintptr_t ptrbits = reinterpret_cast(ptr); - uintptr_t offset = ((ptrbits + alignment - 1) & -alignment) - ptrbits; - return offset; -} - - // These functions are declared extern to prevent name mangling. extern "C" { @@ -51,199 +40,120 @@ __direct_callable__dummy_rend_lib() } -__device__ void* -closure_component_allot(void* pool, int id, size_t prim_size, - const OSL::Color3& w) -{ - ((OSL::ClosureComponent*)pool)->id = id; - ((OSL::ClosureComponent*)pool)->w = w; - - size_t needed = (sizeof(OSL::ClosureComponent) + prim_size - + (alignof(OSL::ClosureComponent) - 1)) - & ~(alignof(OSL::ClosureComponent) - 1); - char* char_ptr = (char*)pool; - - return (void*)&char_ptr[needed]; -} - - -__device__ void* -closure_mul_allot(void* pool, const OSL::Color3& w, OSL::ClosureColor* c) -{ - ((OSL::ClosureMul*)pool)->id = OSL::ClosureColor::MUL; - ((OSL::ClosureMul*)pool)->weight = w; - ((OSL::ClosureMul*)pool)->closure = c; - - size_t needed = (sizeof(OSL::ClosureMul) - + (alignof(OSL::ClosureComponent) - 1)) - & ~(alignof(OSL::ClosureComponent) - 1); - char* char_ptr = (char*)pool; - - return &char_ptr[needed]; -} - - -__device__ void* -closure_mul_float_allot(void* pool, const float& w, OSL::ClosureColor* c) -{ - ((OSL::ClosureMul*)pool)->id = OSL::ClosureColor::MUL; - ((OSL::ClosureMul*)pool)->weight.x = w; - ((OSL::ClosureMul*)pool)->weight.y = w; - ((OSL::ClosureMul*)pool)->weight.z = w; - ((OSL::ClosureMul*)pool)->closure = c; - - size_t needed = (sizeof(OSL::ClosureMul) - + (alignof(OSL::ClosureComponent) - 1)) - & ~(alignof(OSL::ClosureComponent) - 1); - char* char_ptr = (char*)pool; - - return &char_ptr[needed]; -} - - -__device__ void* -closure_add_allot(void* pool, OSL::ClosureColor* a, OSL::ClosureColor* b) -{ - ((OSL::ClosureAdd*)pool)->id = OSL::ClosureColor::ADD; - ((OSL::ClosureAdd*)pool)->closureA = a; - ((OSL::ClosureAdd*)pool)->closureB = b; - - size_t needed = (sizeof(OSL::ClosureAdd) - + (alignof(OSL::ClosureComponent) - 1)) - & ~(alignof(OSL::ClosureComponent) - 1); - char* char_ptr = (char*)pool; - - return &char_ptr[needed]; -} - - -__device__ void* -osl_allocate_closure_component(void* sg_, int id, int size) +__device__ const void* +osl_add_closure_closure(void* sg_, const void* a_, const void* b_) { - OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; - - OSL::Color3 w = OSL::Color3(1, 1, 1); - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - - size = max(4, size); - - sg_ptr->renderstate = closure_component_allot(ret, id, size, w); - - return ret; + a_ = __builtin_assume_aligned(a_, alignof(float)); + b_ = __builtin_assume_aligned(b_, alignof(float)); + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + const OSL::ClosureColor* a = (const OSL::ClosureColor*)a_; + const OSL::ClosureColor* b = (const OSL::ClosureColor*)b_; + if (a == NULL) + return b; + if (b == NULL) + return a; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + OSL::ClosureAdd* add + = (OSL::ClosureAdd*)closure_pool->allocate(sizeof(OSL::ClosureAdd), + alignof(OSL::ClosureAdd)); + if (add) { + add->id = OSL::ClosureColor::ADD; + add->closureA = a; + add->closureB = b; + } + return add; } - - -__device__ void* -osl_allocate_weighted_closure_component(void* sg_, int id, int size, - const void* w) +__device__ const void* +osl_mul_closure_color(void* sg_, const void* a_, const void* w_) { - OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; + a_ = __builtin_assume_aligned(a_, alignof(float)); + w_ = __builtin_assume_aligned(w_, alignof(float)); - const OSL::Color3* wc - = (const OSL::Color3*)__builtin_assume_aligned(w, alignof(float)); - - if (wc->x == 0.0f && wc->y == 0.0f && wc->z == 0.0f) { + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + const OSL::ClosureColor* a = (const OSL::ClosureColor*)a_; + const OSL::Color3* w = (const OSL::Color3*)w_; + if (a == NULL) + return NULL; + if (w->x == 0.0f && w->y == 0.0f && w->z == 0.0f) return NULL; + if (w->x == 1.0f && w->y == 1.0f && w->z == 1.0f) + return a; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + OSL::ClosureMul* mul + = (OSL::ClosureMul*)closure_pool->allocate(sizeof(OSL::ClosureMul), + alignof(OSL::ClosureMul)); + if (mul) { + mul->id = OSL::ClosureColor::MUL; + mul->weight = *w; + mul->closure = a; } - - size = max(4, size); - - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - sg_ptr->renderstate = closure_component_allot(ret, id, size, *wc); - - return ret; + return mul; } - - -__device__ void* -osl_mul_closure_color(void* sg_, void* a, const void* w) +__device__ const void* +osl_mul_closure_float(void* sg_, const void* a_, float w) { - OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; - const OSL::Color3* wc - = (const OSL::Color3*)__builtin_assume_aligned(w, alignof(float)); + a_ = __builtin_assume_aligned(a_, alignof(float)); - if (a == NULL) { + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + const OSL::ClosureColor* a = (const OSL::ClosureColor*)a_; + if (a == NULL) return NULL; - } - - if (wc->x == 0.0f && wc->y == 0.0f && wc->z == 0.0f) { + if (w == 0.0f) return NULL; - } - - if (wc->x == 1.0f && wc->y == 1.0f && wc->z == 1.0f) { + if (w == 1.0f) return a; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + OSL::ClosureMul* mul + = (OSL::ClosureMul*)closure_pool->allocate(sizeof(OSL::ClosureMul), + alignof(OSL::ClosureMul)); + if (mul) { + mul->id = OSL::ClosureColor::MUL; + mul->weight = OSL::Color3(w); + mul->closure = a; } - - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - sg_ptr->renderstate = closure_mul_allot(ret, *wc, (OSL::ClosureColor*)a); - - return ret; + return mul; } - - __device__ void* -osl_mul_closure_float(void* sg_, void* a, float w) +osl_allocate_closure_component(void* sg_, int id, int size) { - a = __builtin_assume_aligned(a, alignof(float)); - - OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; - - if (a == NULL || w == 0.0f) { - return NULL; - } - - if (w == 1.0f) { - return a; + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + // Allocate the component and the mul back to back + const size_t needed = sizeof(OSL::ClosureComponent) + size; + OSL::ClosureComponent* comp + = (OSL::ClosureComponent*) + closure_pool->allocate(needed, alignof(OSL::ClosureComponent)); + if (comp) { + comp->id = id; + comp->w = OSL::Color3(1.0f); } - - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - sg_ptr->renderstate = closure_mul_float_allot(ret, w, - (OSL::ClosureColor*)a); - - return ret; + return comp; } - - __device__ void* -osl_add_closure_closure(void* sg_, void* a, void* b) +osl_allocate_weighted_closure_component(void* sg_, int id, int size, + const void* w_) { - a = __builtin_assume_aligned(a, alignof(float)); - b = __builtin_assume_aligned(b, alignof(float)); - - OSL_CUDA::ShaderGlobals* sg_ptr = (OSL_CUDA::ShaderGlobals*)sg_; - - if (a == NULL) { - return b; - } + w_ = __builtin_assume_aligned(w_, alignof(float)); - if (b == NULL) { - return a; + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + const OSL::Color3* w = (const OSL::Color3*)w_; + if (w->x == 0.0f && w->y == 0.0f && w->z == 0.0f) + return NULL; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + // Allocate the component and the mul back to back + const size_t needed = sizeof(OSL::ClosureComponent) + size; + OSL::ClosureComponent* comp + = (OSL::ClosureComponent*) + closure_pool->allocate(needed, alignof(OSL::ClosureComponent)); + if (comp) { + comp->id = id; + comp->w = *w; } - - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - sg_ptr->renderstate = closure_add_allot(ret, (OSL::ClosureColor*)a, - (OSL::ClosureColor*)b); - - return ret; + return comp; } diff --git a/src/testrender/raytracer.h b/src/testrender/raytracer.h index 4be865678..73dcad562 100644 --- a/src/testrender/raytracer.h +++ b/src/testrender/raytracer.h @@ -9,13 +9,13 @@ #include +#include "../testshade/render_state.h" #include "optix_compat.h" #include "render_params.h" #include #include #include "bvh.h" - #if OSL_USE_OPTIX # include # include // from CUDA diff --git a/src/testrender/simpleraytracer.cpp b/src/testrender/simpleraytracer.cpp index e86d968be..7508410a8 100644 --- a/src/testrender/simpleraytracer.cpp +++ b/src/testrender/simpleraytracer.cpp @@ -921,12 +921,6 @@ SimpleRaytracer::globals_from_hit(ShaderGlobalsType& sg, const Ray& r, } sg.raytype = r.raytype; sg.flipHandedness = sg.dPdx.cross(sg.dPdy).dot(sg.N) < 0; - -#ifndef __CUDACC__ - // In our SimpleRaytracer, the "renderstate" itself just a pointer to - // the ShaderGlobals. - sg.renderstate = &sg; -#endif } @@ -945,7 +939,7 @@ SimpleRaytracer::eval_background(const Dual2& dir, ShadingContext* ctx, #ifndef __CUDACC__ shadingsys->execute(*ctx, *m_shaders[backgroundShaderID].surf, sg); #else - alignas(8) char closure_pool[256]; + StackClosurePool closure_pool; execute_shader(sg, render_params.bg_id, closure_pool); #endif return process_background_closure((const ClosureColor*)sg.Ci); @@ -957,8 +951,8 @@ SimpleRaytracer::subpixel_radiance(float x, float y, Sampler& sampler, { #ifdef __CUDACC__ // Scratch space for the output closures - alignas(8) char closure_pool[256]; - alignas(8) char light_closure_pool[256]; + StackClosurePool closure_pool; + StackClosurePool light_closure_pool; #endif constexpr float inf = std::numeric_limits::infinity(); @@ -1349,7 +1343,6 @@ SimpleRaytracer::prepare_geometry() sg.v = uv[i].y; sg.I = (p[i] - camera.eye).normalize(); sg.surfacearea = area; - sg.renderstate = &sg; shadingsys->execute(*ctx, *m_shaders[shaderID].disp, sg); diff --git a/src/testshade/cuda/optix_grid_renderer.cu b/src/testshade/cuda/optix_grid_renderer.cu index f3657efcd..fe5d45a5e 100644 --- a/src/testshade/cuda/optix_grid_renderer.cu +++ b/src/testshade/cuda/optix_grid_renderer.cu @@ -98,7 +98,7 @@ __raygen__() // networks, so there should be (at least) some mechanism to issue a // warning or error if the closure or param storage can possibly be // exceeded. - alignas(8) char closure_pool[256]; + StackClosurePool closure_pool; alignas(8) char params[256]; OSL_CUDA::ShaderGlobals sg; @@ -137,8 +137,11 @@ __raygen__() sg.object2common = reinterpret_cast(render_params.object2common); // Pack the "closure pool" into one of the ShaderGlobals pointers - *(int*)&closure_pool[0] = 0; - sg.renderstate = &closure_pool[0]; + closure_pool.reset(); + RenderState renderState; + // TODO: renderState.context = ... + renderState.closure_pool = &closure_pool; + sg.renderstate = &renderState; // Run the OSL group and init functions if (render_params.fused_callable) diff --git a/src/testshade/render_state.h b/src/testshade/render_state.h index 8aec4954f..29079f324 100644 --- a/src/testshade/render_state.h +++ b/src/testshade/render_state.h @@ -10,7 +10,7 @@ // All the the state free functions in rs_simplerend.cpp will need to do their job // NOTE: Additional data is here that will be used by rs_simplerend.cpp in future PR's // procedurally generating ShaderGlobals. -struct RenderState { +struct RenderContext { int xres; int yres; OSL::Matrix44 world_to_camera; @@ -24,17 +24,36 @@ struct RenderState { void* journal_buffer; }; +class StackClosurePool { + static constexpr size_t capacity = 1024; + alignas(8) char buffer[capacity]; + void* ptr; + +public: + OSL_HOSTDEVICE + StackClosurePool() { reset(); } -// Create constexpr hashes for all strings used by the free function renderer services. -// NOTE: Actually ustring's should also be instantiated in host code someplace as well -// to allow the reverse mapping of hash->string to work when processing messages -namespace RS { -namespace { -namespace Hashes { -#define RS_STRDECL(str, var_name) \ - constexpr OSL::ustringhash var_name(OSL::strhash(str)); -#include "rs_strdecls.h" -#undef RS_STRDECL -}; //namespace Hashes -} // unnamed namespace -}; //namespace RS + OSL_HOSTDEVICE + void reset() + { + ptr = &buffer[0]; + *(int*)ptr = 0; + } + + OSL_HOSTDEVICE + void* allocate(size_t size, size_t alignment) + { + uintptr_t p = OIIO::round_to_multiple_of_pow2((uintptr_t)ptr, + alignment); + ptr = (void*)(p + size); + if (ptr <= &buffer[capacity]) + return (void*)p; + assert(false); + return nullptr; + } +}; + +struct RenderState { + RenderContext* context; + StackClosurePool* closure_pool; +}; diff --git a/src/testshade/rs_simplerend.cpp b/src/testshade/rs_simplerend.cpp index 93c9c960d..b8c69597e 100644 --- a/src/testshade/rs_simplerend.cpp +++ b/src/testshade/rs_simplerend.cpp @@ -17,6 +17,22 @@ #include "render_state.h" + +// Create constexpr hashes for all strings used by the free function renderer services. +// NOTE: Actually ustring's should also be instantiated in host code someplace as well +// to allow the reverse mapping of hash->string to work when processing messages +namespace RS { +namespace { +namespace Hashes { +#define RS_STRDECL(str, var_name) \ + constexpr OSL::ustringhash var_name(OSL::strhash(str)); +#include "rs_strdecls.h" +#undef RS_STRDECL +}; //namespace Hashes +} // unnamed namespace +}; //namespace RS + + // Keep free functions in sync with virtual function based SimpleRenderer. OSL_RSOP OSL_HOSTDEVICE bool @@ -66,24 +82,24 @@ rs_get_inverse_matrix_space_time(OSL::OpaqueExecContextPtr ec, using OSL::Matrix44; - auto rs = OSL::get_rs(ec); + auto rc = OSL::get_rs(ec)->context; if (to == OSL::Hashes::camera || to == OSL::Hashes::screen || to == OSL::Hashes::NDC || to == RS::Hashes::raster) { - Matrix44 M { rs->world_to_camera }; + Matrix44 M { rc->world_to_camera }; if (to == OSL::Hashes::screen || to == OSL::Hashes::NDC || to == RS::Hashes::raster) { - float depthrange = (double)rs->yon - (double)rs->hither; - OSL::ustringhash proj = rs->projection; + float depthrange = (double)rc->yon - (double)rc->hither; + OSL::ustringhash proj = rc->projection; if (proj == RS::Hashes::perspective) { - float tanhalffov = OIIO::fast_tan(0.5f * rs->fov * M_PI + float tanhalffov = OIIO::fast_tan(0.5f * rc->fov * M_PI / 180.0); // clang-format off Matrix44 camera_to_screen (1/tanhalffov, 0, 0, 0, 0, 1/tanhalffov, 0, 0, - 0, 0, rs->yon/depthrange, 1, - 0, 0, -(rs->yon*rs->hither)/depthrange, 0); + 0, 0, rc->yon/depthrange, 1, + 0, 0, -(rc->yon*rc->hither)/depthrange, 0); // clang-format on M = M * camera_to_screen; } else { @@ -91,7 +107,7 @@ rs_get_inverse_matrix_space_time(OSL::OpaqueExecContextPtr ec, Matrix44 camera_to_screen (1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1/depthrange, 0, - 0, 0, -(rs->hither)/depthrange, 1); + 0, 0, -(rc->hither)/depthrange, 1); // clang-format on M = M * camera_to_screen; } @@ -107,8 +123,8 @@ rs_get_inverse_matrix_space_time(OSL::OpaqueExecContextPtr ec, M = M * screen_to_ndc; if (to == RS::Hashes::raster) { // clang-format off - Matrix44 ndc_to_raster (rs->xres, 0, 0, 0, - 0, rs->yres, 0, 0, + Matrix44 ndc_to_raster (rc->xres, 0, 0, 0, + 0, rc->yres, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1); M = M * ndc_to_raster; @@ -368,6 +384,13 @@ rs_trace_get(OSL::OpaqueExecContextPtr ec, OSL::ustringhash name, #endif } +OSL_RSOP OSL_HOSTDEVICE void* +rs_allocate_closure(OSL::OpaqueExecContextPtr ec, size_t size, size_t alignment) +{ + auto rs = OSL::get_rs(ec); + return rs->closure_pool->allocate(size, alignment); +} + OSL_RSOP OSL_HOSTDEVICE bool rs_get_attribute_constant_string(OSL::ustringhash value, void* result) { @@ -491,7 +514,7 @@ rs_get_attribute(OSL::OpaqueExecContextPtr oec, OSL::ustringhash_pod object_, auto object = OSL::ustringhash_from(object_); auto name = OSL::ustringhash_from(name_); const OSL::TypeDesc type = OSL::TypeDesc_from(_type); - auto rs = OSL::get_rs(oec); + auto rc = OSL::get_rs(oec)->context; // The many branches in the code below handle the case where we don't know // the attribute name at compile time. In the case it is known, dead-code @@ -500,38 +523,38 @@ rs_get_attribute(OSL::OpaqueExecContextPtr oec, OSL::ustringhash_pod object_, return rs_get_attribute_constant_int(OSL_VERSION, result); if (name == RS::Hashes::camera_resolution && type == OSL::TypeDesc(OSL::TypeDesc::INT, 2)) - return rs_get_attribute_constant_int2(rs->xres, rs->yres, result); + return rs_get_attribute_constant_int2(rc->xres, rc->yres, result); if (name == RS::Hashes::camera_projection && type == OSL::TypeString) - return rs_get_attribute_constant_string(rs->projection, result); + return rs_get_attribute_constant_string(rc->projection, result); if (name == RS::Hashes::camera_pixelaspect && type == OSL::TypeFloat) - return rs_get_attribute_constant_float(rs->pixelaspect, derivatives, + return rs_get_attribute_constant_float(rc->pixelaspect, derivatives, result); if (name == RS::Hashes::camera_screen_window && type == OSL::TypeDesc(OSL::TypeDesc::FLOAT, 4)) - return rs_get_attribute_constant_float4(rs->screen_window[0], - rs->screen_window[1], - rs->screen_window[2], - rs->screen_window[3], + return rs_get_attribute_constant_float4(rc->screen_window[0], + rc->screen_window[1], + rc->screen_window[2], + rc->screen_window[3], derivatives, result); if (name == RS::Hashes::camera_fov && type == OSL::TypeFloat) - return rs_get_attribute_constant_float(rs->fov, derivatives, result); + return rs_get_attribute_constant_float(rc->fov, derivatives, result); if (name == RS::Hashes::camera_clip && type == OSL::TypeDesc(OSL::TypeDesc::FLOAT, 2)) - return rs_get_attribute_constant_float2(rs->hither, rs->yon, + return rs_get_attribute_constant_float2(rc->hither, rc->yon, derivatives, result); if (name == RS::Hashes::camera_clip_near && type == OSL::TypeFloat) - return rs_get_attribute_constant_float(rs->hither, derivatives, result); + return rs_get_attribute_constant_float(rc->hither, derivatives, result); if (name == RS::Hashes::camera_clip_far && type == OSL::TypeFloat) - return rs_get_attribute_constant_float(rs->yon, derivatives, result); + return rs_get_attribute_constant_float(rc->yon, derivatives, result); if (name == RS::Hashes::camera_shutter && type == OSL::TypeDesc(OSL::TypeDesc::FLOAT, 2)) - return rs_get_attribute_constant_float2(rs->shutter[0], rs->shutter[1], + return rs_get_attribute_constant_float2(rc->shutter[0], rc->shutter[1], derivatives, result); if (name == RS::Hashes::camera_shutter_open && type == OSL::TypeFloat) - return rs_get_attribute_constant_float(rs->shutter[0], derivatives, + return rs_get_attribute_constant_float(rc->shutter[0], derivatives, result); if (name == RS::Hashes::camera_shutter_close && type == OSL::TypeFloat) - return rs_get_attribute_constant_float(rs->shutter[1], derivatives, + return rs_get_attribute_constant_float(rc->shutter[1], derivatives, result); if (name == RS::Hashes::shading_index && type == OSL::TypeInt) @@ -643,9 +666,9 @@ rs_errorfmt(OSL::OpaqueExecContextPtr ec, OSL::ustringhash fmt_specification, int32_t arg_count, const OSL::EncodedType* argTypes, uint32_t argValuesSize, uint8_t* argValues) { - auto rs = OSL::get_rs(ec); + auto rc = OSL::get_rs(ec)->context; - OSL::journal::Writer jw { rs->journal_buffer }; + OSL::journal::Writer jw { rc->journal_buffer }; jw.record_errorfmt(OSL::get_thread_index(ec), OSL::get_shade_index(ec), fmt_specification, arg_count, argTypes, argValuesSize, argValues); @@ -656,9 +679,9 @@ rs_warningfmt(OSL::OpaqueExecContextPtr ec, OSL::ustringhash fmt_specification, int32_t arg_count, const OSL::EncodedType* argTypes, uint32_t argValuesSize, uint8_t* argValues) { - auto rs = OSL::get_rs(ec); + auto rc = OSL::get_rs(ec)->context; - OSL::journal::Writer jw { rs->journal_buffer }; + OSL::journal::Writer jw { rc->journal_buffer }; jw.record_warningfmt(OSL::get_max_warnings_per_thread(ec), OSL::get_thread_index(ec), OSL::get_shade_index(ec), fmt_specification, arg_count, argTypes, argValuesSize, @@ -671,9 +694,9 @@ rs_printfmt(OSL::OpaqueExecContextPtr ec, OSL::ustringhash fmt_specification, int32_t arg_count, const OSL::EncodedType* argTypes, uint32_t argValuesSize, uint8_t* argValues) { - auto rs = OSL::get_rs(ec); + auto rc = OSL::get_rs(ec)->context; - OSL::journal::Writer jw { rs->journal_buffer }; + OSL::journal::Writer jw { rc->journal_buffer }; jw.record_printfmt(OSL::get_thread_index(ec), OSL::get_shade_index(ec), fmt_specification, arg_count, argTypes, argValuesSize, argValues); @@ -686,9 +709,9 @@ rs_filefmt(OSL::OpaqueExecContextPtr ec, OSL::ustringhash filename_hash, const OSL::EncodedType* argTypes, uint32_t argValuesSize, uint8_t* argValues) { - auto rs = OSL::get_rs(ec); + auto rc = OSL::get_rs(ec)->context; - OSL::journal::Writer jw { rs->journal_buffer }; + OSL::journal::Writer jw { rc->journal_buffer }; jw.record_filefmt(OSL::get_thread_index(ec), OSL::get_shade_index(ec), filename_hash, fmt_specification, arg_count, argTypes, argValuesSize, argValues); diff --git a/src/testshade/simplerend.cpp b/src/testshade/simplerend.cpp index 191678a6d..1563caa6d 100644 --- a/src/testshade/simplerend.cpp +++ b/src/testshade/simplerend.cpp @@ -25,6 +25,21 @@ namespace Strings { } // namespace RS +// Create constexpr hashes for all strings used by the free function renderer services. +// NOTE: Actually ustring's should also be instantiated in host code someplace as well +// to allow the reverse mapping of hash->string to work when processing messages +namespace RS { +namespace { +namespace Hashes { +#define RS_STRDECL(str, var_name) \ + constexpr OSL::ustringhash var_name(OSL::strhash(str)); +#include "rs_strdecls.h" +#undef RS_STRDECL +}; //namespace Hashes +} // unnamed namespace +}; //namespace RS + + using namespace OSL; @@ -1058,22 +1073,22 @@ SimpleRenderer::add_output(string_view varname_, string_view filename, void -SimpleRenderer::export_state(RenderState& state) const +SimpleRenderer::export_context(RenderContext& context) const { - state.xres = m_xres; - state.yres = m_yres; - state.fov = m_fov; - state.hither = m_hither; - state.yon = m_yon; - - state.world_to_camera = OSL::Matrix44(1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, - 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, - 0.0, 1.0); + context.xres = m_xres; + context.yres = m_yres; + context.fov = m_fov; + context.hither = m_hither; + context.yon = m_yon; + + context.world_to_camera = OSL::Matrix44(1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, + 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, + 0.0, 1.0); //perspective is not a member of StringParams (i.e not in strdecls.h) - state.projection = RS::Hashes::perspective; - state.pixelaspect = m_pixelaspect; - std::copy_n(m_screen_window, 4, state.screen_window); - std::copy_n(m_shutter, 2, state.shutter); + context.projection = RS::Hashes::perspective; + context.pixelaspect = m_pixelaspect; + std::copy_n(m_screen_window, 4, context.screen_window); + std::copy_n(m_shutter, 2, context.shutter); } void @@ -1082,8 +1097,8 @@ SimpleRenderer::errorfmt(OSL::ShaderGlobals* sg, const EncodedType* arg_types, uint32_t arg_values_size, uint8_t* argValues) { - RenderState* rs = reinterpret_cast(sg->renderstate); - OSL::journal::Writer jw { rs->journal_buffer }; + RenderContext* rc = reinterpret_cast(sg->renderstate)->context; + OSL::journal::Writer jw { rc->journal_buffer }; jw.record_errorfmt(OSL::get_thread_index(sg), OSL::get_shade_index(sg), fmt_specification, arg_count, arg_types, arg_values_size, argValues); @@ -1095,8 +1110,8 @@ SimpleRenderer::warningfmt(OSL::ShaderGlobals* sg, int32_t arg_count, const EncodedType* arg_types, uint32_t arg_values_size, uint8_t* argValues) { - RenderState* rs = reinterpret_cast(sg->renderstate); - OSL::journal::Writer jw { rs->journal_buffer }; + RenderContext* rc = reinterpret_cast(sg->renderstate)->context; + OSL::journal::Writer jw { rc->journal_buffer }; jw.record_warningfmt(OSL::get_max_warnings_per_thread(sg), OSL::get_thread_index(sg), OSL::get_shade_index(sg), fmt_specification, arg_count, arg_types, @@ -1111,8 +1126,8 @@ SimpleRenderer::printfmt(OSL::ShaderGlobals* sg, const EncodedType* arg_types, uint32_t arg_values_size, uint8_t* argValues) { - RenderState* rs = reinterpret_cast(sg->renderstate); - OSL::journal::Writer jw { rs->journal_buffer }; + RenderContext* rc = reinterpret_cast(sg->renderstate)->context; + OSL::journal::Writer jw { rc->journal_buffer }; jw.record_printfmt(OSL::get_thread_index(sg), OSL::get_shade_index(sg), fmt_specification, arg_count, arg_types, arg_values_size, argValues); @@ -1124,8 +1139,8 @@ SimpleRenderer::filefmt(OSL::ShaderGlobals* sg, OSL::ustringhash filename_hash, const EncodedType* arg_types, uint32_t arg_values_size, uint8_t* argValues) { - RenderState* rs = reinterpret_cast(sg->renderstate); - OSL::journal::Writer jw { rs->journal_buffer }; + RenderContext* rc = reinterpret_cast(sg->renderstate)->context; + OSL::journal::Writer jw { rc->journal_buffer }; jw.record_filefmt(OSL::get_thread_index(sg), OSL::get_shade_index(sg), filename_hash, fmt_specification, arg_count, arg_types, arg_values_size, argValues); diff --git a/src/testshade/simplerend.h b/src/testshade/simplerend.h index bb6a0e426..39ff3f5a9 100644 --- a/src/testshade/simplerend.h +++ b/src/testshade/simplerend.h @@ -145,7 +145,7 @@ class SimpleRenderer : public RendererServices { size_t noutputs() const { return m_outputbufs.size(); } virtual void init_shadingsys(ShadingSystem* ss) { shadingsys = ss; } - virtual void export_state(RenderState&) const; + virtual void export_context(RenderContext&) const; virtual void prepare_render() {} virtual void warmup() {} virtual void render(int /*xres*/, int /*yres*/) {} diff --git a/src/testshade/testshade.cpp b/src/testshade/testshade.cpp index 51f3e001b..5309d892d 100644 --- a/src/testshade/testshade.cpp +++ b/src/testshade/testshade.cpp @@ -945,21 +945,26 @@ setup_transformations(SimpleRenderer& rend, OSL::Matrix44& Mshad, rend.name_transform("myspace", Mmyspace); } -// NOTE: each host thread could end up with its own RenderState. -// Starting simple with a single instance for now -static RenderState theRenderState; +// A single render context shared by all render threads. +static RenderContext theRenderContext; // Set up the ShaderGlobals fields for pixel (x,y). static void -setup_shaderglobals(ShaderGlobals& sg, ShadingSystem* shadingsys, int x, int y) +setup_shaderglobals(ShaderGlobals& sg, ShadingSystem* shadingsys, + RenderState& renderState, StackClosurePool* closure_pool, + int x, int y) { // Just zero the whole thing out to start memset((char*)&sg, 0, sizeof(ShaderGlobals)); // Any state data needed by SimpleRenderer or its free function equivalent // will need to be passed here the ShaderGlobals. - sg.renderstate = &theRenderState; + renderState.context = &theRenderContext; + renderState.closure_pool = closure_pool; + sg.renderstate = &renderState; + if (closure_pool) + closure_pool->reset(); // Set "shader" space to be Mshad. In a real renderer, this may be // different for each shader group. @@ -1182,7 +1187,9 @@ setup_output_images(SimpleRenderer* rend, ShadingSystem* shadingsys, ShadingContext* ctx = shadingsys->get_context(thread_info); raytype_bit = shadingsys->raytype_bit(ustring(raytype_name)); ShaderGlobals sg; - setup_shaderglobals(sg, shadingsys, 0, 0); + RenderState renderState; + StackClosurePool closure_pool; + setup_shaderglobals(sg, shadingsys, renderState, &closure_pool, 0, 0); #if OSL_USE_BATCHED if (batched) { @@ -1586,6 +1593,8 @@ shade_region(SimpleRenderer* rend, ShaderGroup* shadergroup, OIIO::ROI roi, // Set up shader globals and a little test grid of points to shade. ShaderGlobals shaderglobals; + RenderState renderState; + StackClosurePool closure_pool; raytype_bit = shadingsys->raytype_bit(ustring(raytype_name)); @@ -1606,7 +1615,8 @@ shade_region(SimpleRenderer* rend, ShaderGroup* shadergroup, OIIO::ROI roi, // set it up rigged to look like we're rendering a single // quadrilateral that exactly fills the viewport, and that // setup is done in the following function call: - setup_shaderglobals(shaderglobals, shadingsys, x, y); + setup_shaderglobals(shaderglobals, shadingsys, renderState, + &closure_pool, x, y); if (this_threads_index == uninitialized_thread_index) { this_threads_index = next_thread_index.fetch_add(1u); @@ -2139,7 +2149,7 @@ test_shade(int argc, const char* argv[]) rend->prepare_render(); if (use_rs_bitcode) { // SimpleRend to supply the required state for render service free functions - rend->export_state(theRenderState); + rend->export_context(theRenderContext); } double setuptime = timer.lap(); @@ -2167,7 +2177,7 @@ test_shade(int argc, const char* argv[]) //Send the populated Journal Buffer to the renderer - theRenderState.journal_buffer = jbuffer.get(); + theRenderContext.journal_buffer = jbuffer.get(); // Allow a settable number of iterations to "render" the whole image, diff --git a/testsuite/example-cuda/cuda_grid_renderer.cu b/testsuite/example-cuda/cuda_grid_renderer.cu index cb53427f8..d2d4426b3 100644 --- a/testsuite/example-cuda/cuda_grid_renderer.cu +++ b/testsuite/example-cuda/cuda_grid_renderer.cu @@ -74,7 +74,7 @@ shade(float3* Cout, int w, int h) // networks, so there should be (at least) some mechanism to issue a // warning or error if the closure or param storage can possibly be // exceeded. - alignas(8) char closure_pool[256]; + StackClosurePool closure_pool; alignas(8) char params[256]; const float invw = 1.0 / w; @@ -115,8 +115,11 @@ shade(float3* Cout, int w, int h) sg.flipHandedness = 0; // Pack the "closure pool" into one of the ShaderGlobals pointers - *(int*)&closure_pool[0] = 0; - sg.renderstate = &closure_pool[0]; + closure_pool.reset(); + RenderState renderState; + // TODO: renderState.context = ... + renderState.closure_pool = &closure_pool; + sg.renderstate = &renderState; // Run the shader __osl__init(&sg, params); diff --git a/testsuite/example-cuda/rend_lib.cu b/testsuite/example-cuda/rend_lib.cu index 2c78e6d86..2d235dc19 100644 --- a/testsuite/example-cuda/rend_lib.cu +++ b/testsuite/example-cuda/rend_lib.cu @@ -14,197 +14,127 @@ extern __device__ char* s_color_system; } OSL_NAMESPACE_END -// Taken from the SimplePool class -__device__ static inline size_t -alignment_offset_calc(void* ptr, size_t alignment) -{ - uintptr_t ptrbits = reinterpret_cast(ptr); - uintptr_t offset = ((ptrbits + alignment - 1) & -alignment) - ptrbits; - return offset; -} - // These functions are declared extern to prevent name mangling. extern "C" { -__device__ void* -closure_component_allot(void* pool, int id, size_t prim_size, - const OSL::Color3& w) -{ - ((OSL::ClosureComponent*)pool)->id = id; - ((OSL::ClosureComponent*)pool)->w = w; - - size_t needed = (sizeof(OSL::ClosureComponent) - sizeof(void*) + prim_size - + (alignof(OSL::ClosureComponent) - 1)) - & ~(alignof(OSL::ClosureComponent) - 1); - char* char_ptr = (char*)pool; - - return (void*)&char_ptr[needed]; -} - - -__device__ void* -closure_mul_allot(void* pool, const OSL::Color3& w, OSL::ClosureColor* c) -{ - ((OSL::ClosureMul*)pool)->id = OSL::ClosureColor::MUL; - ((OSL::ClosureMul*)pool)->weight = w; - ((OSL::ClosureMul*)pool)->closure = c; - - size_t needed = (sizeof(OSL::ClosureMul) - + (alignof(OSL::ClosureComponent) - 1)) - & ~(alignof(OSL::ClosureComponent) - 1); - char* char_ptr = (char*)pool; - - return &char_ptr[needed]; -} - - -__device__ void* -closure_mul_float_allot(void* pool, const float& w, OSL::ClosureColor* c) -{ - ((OSL::ClosureMul*)pool)->id = OSL::ClosureColor::MUL; - ((OSL::ClosureMul*)pool)->weight.x = w; - ((OSL::ClosureMul*)pool)->weight.y = w; - ((OSL::ClosureMul*)pool)->weight.z = w; - ((OSL::ClosureMul*)pool)->closure = c; - - size_t needed = (sizeof(OSL::ClosureMul) - + (alignof(OSL::ClosureComponent) - 1)) - & ~(alignof(OSL::ClosureComponent) - 1); - char* char_ptr = (char*)pool; - - return &char_ptr[needed]; -} - - -__device__ void* -closure_add_allot(void* pool, OSL::ClosureColor* a, OSL::ClosureColor* b) -{ - ((OSL::ClosureAdd*)pool)->id = OSL::ClosureColor::ADD; - ((OSL::ClosureAdd*)pool)->closureA = a; - ((OSL::ClosureAdd*)pool)->closureB = b; - - size_t needed = (sizeof(OSL::ClosureAdd) - + (alignof(OSL::ClosureComponent) - 1)) - & ~(alignof(OSL::ClosureComponent) - 1); - char* char_ptr = (char*)pool; - - return &char_ptr[needed]; -} - -__device__ void* -osl_allocate_closure_component(void* sg_, int id, int size) +__device__ const void* +osl_add_closure_closure(void* sg_, const void* a_, const void* b_) { - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; - - OSL::Color3 w = OSL::Color3(1, 1, 1); - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - - size = max(4, size); - - sg_ptr->renderstate = closure_component_allot(ret, id, size, w); - - return ret; + a_ = __builtin_assume_aligned(a_, alignof(float)); + b_ = __builtin_assume_aligned(b_, alignof(float)); + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + const OSL::ClosureColor* a = (const OSL::ClosureColor*)a_; + const OSL::ClosureColor* b = (const OSL::ClosureColor*)b_; + if (a == NULL) + return b; + if (b == NULL) + return a; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + OSL::ClosureAdd* add + = (OSL::ClosureAdd*)closure_pool->allocate(sizeof(OSL::ClosureAdd), + alignof(OSL::ClosureAdd)); + if (add) { + add->id = OSL::ClosureColor::ADD; + add->closureA = a; + add->closureB = b; + } + return add; } - -__device__ void* -osl_allocate_weighted_closure_component(void* sg_, int id, int size, - const OSL::Color3* w) +__device__ const void* +osl_mul_closure_color(void* sg_, const void* a_, const void* w_) { - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; + a_ = __builtin_assume_aligned(a_, alignof(float)); + w_ = __builtin_assume_aligned(w_, alignof(float)); - if (w->x == 0.0f && w->y == 0.0f && w->z == 0.0f) { + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + const OSL::ClosureColor* a = (const OSL::ClosureColor*)a_; + const OSL::Color3* w = (const OSL::Color3*)w_; + if (a == NULL) + return NULL; + if (w->x == 0.0f && w->y == 0.0f && w->z == 0.0f) return NULL; + if (w->x == 1.0f && w->y == 1.0f && w->z == 1.0f) + return a; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + OSL::ClosureMul* mul + = (OSL::ClosureMul*)closure_pool->allocate(sizeof(OSL::ClosureMul), + alignof(OSL::ClosureMul)); + if (mul) { + mul->id = OSL::ClosureColor::MUL; + mul->weight = *w; + mul->closure = a; } - - size = max(4, size); - - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - sg_ptr->renderstate = closure_component_allot(ret, id, size, *w); - - return ret; + return mul; } - -__device__ void* -osl_mul_closure_color(void* sg_, OSL::ClosureColor* a, const OSL::Color3* w) +__device__ const void* +osl_mul_closure_float(void* sg_, const void* a_, float w) { - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; + a_ = __builtin_assume_aligned(a_, alignof(float)); - if (a == NULL) { + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + const OSL::ClosureColor* a = (const OSL::ClosureColor*)a_; + if (a == NULL) return NULL; - } - - if (w->x == 0.0f && w->y == 0.0f && w->z == 0.0f) { + if (w == 0.0f) return NULL; - } - - if (w->x == 1.0f && w->y == 1.0f && w->z == 1.0f) { + if (w == 1.0f) return a; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + OSL::ClosureMul* mul + = (OSL::ClosureMul*)closure_pool->allocate(sizeof(OSL::ClosureMul), + alignof(OSL::ClosureMul)); + if (mul) { + mul->id = OSL::ClosureColor::MUL; + mul->weight = OSL::Color3(w); + mul->closure = a; } - - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - sg_ptr->renderstate = closure_mul_allot(ret, *w, a); - - return ret; + return mul; } - __device__ void* -osl_mul_closure_float(void* sg_, OSL::ClosureColor* a, float w) +osl_allocate_closure_component(void* sg_, int id, int size) { - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; - - if (a == NULL || w == 0.0f) { - return NULL; + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + // Allocate the component and the mul back to back + const size_t needed = sizeof(OSL::ClosureComponent) + size; + OSL::ClosureComponent* comp + = (OSL::ClosureComponent*) + closure_pool->allocate(needed, alignof(OSL::ClosureComponent)); + if (comp) { + comp->id = id; + comp->w = OSL::Color3(1.0f); } - - if (w == 1.0f) { - return a; - } - - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - sg_ptr->renderstate = closure_mul_float_allot(ret, w, a); - - return ret; + return comp; } - __device__ void* -osl_add_closure_closure(void* sg_, OSL::ClosureColor* a, OSL::ClosureColor* b) +osl_allocate_weighted_closure_component(void* sg_, int id, int size, + const void* w_) { - ShaderGlobals* sg_ptr = (ShaderGlobals*)sg_; + w_ = __builtin_assume_aligned(w_, alignof(float)); - if (a == NULL) { - return b; - } - - if (b == NULL) { - return a; + OSL_CUDA::ShaderGlobals* sg = (OSL_CUDA::ShaderGlobals*)sg_; + const OSL::Color3* w = (const OSL::Color3*)w_; + if (w->x == 0.0f && w->y == 0.0f && w->z == 0.0f) + return NULL; + auto* closure_pool = ((RenderState*)sg->renderstate)->closure_pool; + // Allocate the component and the mul back to back + const size_t needed = sizeof(OSL::ClosureComponent) + size; + OSL::ClosureComponent* comp + = (OSL::ClosureComponent*) + closure_pool->allocate(needed, alignof(OSL::ClosureComponent)); + if (comp) { + comp->id = id; + comp->w = *w; } - - // Fix up the alignment - void* ret = ((char*)sg_ptr->renderstate) - + alignment_offset_calc(sg_ptr->renderstate, - alignof(OSL::ClosureComponent)); - sg_ptr->renderstate = closure_add_allot(ret, a, b); - - return ret; + return comp; } + #define IS_STRING(type) (type.basetype == OSL::TypeDesc::STRING) #define IS_PTR(type) (type.basetype == OSL::TypeDesc::PTR) #define IS_COLOR(type) (type.vecsemantics == OSL::TypeDesc::COLOR)