From 0e380024cbbbb16c8e1719a09bca36afaf91d0b1 Mon Sep 17 00:00:00 2001
From: Mihai Bujanca
Date: Tue, 29 Aug 2017 04:28:50 +0100
Subject: [PATCH 1/4] Updated modules README. Added code for dynamicfusion

---
 modules/README.md | 2 +
 modules/dynamicfusion/CMakeLists.txt | 82 +
 modules/dynamicfusion/LICENSE.md | 27 +
 modules/dynamicfusion/README.md | 134 ++
 modules/dynamicfusion/apps/CMakeLists.txt | 25 +
 modules/dynamicfusion/apps/demo.cpp | 168 ++
 .../apps/dynamicfusion_kinect.cpp | 158 ++
 .../cmake/Modules/FindOpenNI.cmake | 78 +
 modules/dynamicfusion/cmake/Targets.cmake | 140 ++
 modules/dynamicfusion/cmake/Utils.cmake | 102 ++
 modules/dynamicfusion/download_data.sh | 14 +
 modules/dynamicfusion/kfusion/CMakeLists.txt | 6 +
 .../kfusion/include/io/capture.hpp | 42 +
 .../include/kfusion/cuda/device_array.hpp | 303 ++++
 .../include/kfusion/cuda/device_memory.hpp | 258 +++
 .../kfusion/include/kfusion/cuda/imgproc.hpp | 35 +
 .../kfusion/cuda/kernel_containers.hpp | 75 +
 .../include/kfusion/cuda/projective_icp.hpp | 48 +
 .../include/kfusion/cuda/tsdf_volume.hpp | 102 ++
 .../kfusion/include/kfusion/exports.hpp | 7 +
 .../kfusion/include/kfusion/kinfu.hpp | 107 ++
 .../kfusion/include/kfusion/optimisation.hpp | 168 ++
 .../kfusion/include/kfusion/types.hpp | 100 ++
 .../kfusion/include/kfusion/warp_field.hpp | 102 ++
 .../kfusion/include/nanoflann/nanoflann.hpp | 1398 +++++++++++++++++
 modules/dynamicfusion/kfusion/src/capture.cpp | 339 ++++
 modules/dynamicfusion/kfusion/src/core.cpp | 231 +++
 .../dynamicfusion/kfusion/src/cuda/device.hpp | 127 ++
 .../dynamicfusion/kfusion/src/cuda/imgproc.cu | 622 ++++++++
 .../kfusion/src/cuda/proj_icp.cu | 476 ++++++
 .../kfusion/src/cuda/temp_utils.hpp | 606 +++++++
 .../kfusion/src/cuda/texture_binder.hpp | 60 +
 .../kfusion/src/cuda/tsdf_volume.cu | 831 ++++++++++
 .../kfusion/src/device_memory.cpp | 252 +++
 modules/dynamicfusion/kfusion/src/imgproc.cpp | 202 +++
 .../dynamicfusion/kfusion/src/internal.hpp | 149 ++
 modules/dynamicfusion/kfusion/src/kinfu.cpp | 445 ++++++
 .../kfusion/src/optimisation.cpp | 0
 modules/dynamicfusion/kfusion/src/precomp.cpp | 70 +
 modules/dynamicfusion/kfusion/src/precomp.hpp | 31 +
 .../kfusion/src/projective_icp.cpp | 214 +++
 .../dynamicfusion/kfusion/src/safe_call.hpp | 36 +
 .../dynamicfusion/kfusion/src/tsdf_volume.cpp | 334 ++++
 .../kfusion/src/utils/dual_quaternion.hpp | 289 ++++
 .../kfusion/src/utils/knn_point_cloud.hpp | 56 +
 .../kfusion/src/utils/quaternion.hpp | 246 +++
 .../dynamicfusion/kfusion/src/warp_field.cpp | 383 +++++
 modules/dynamicfusion/tests/CMakeLists.txt | 15 +
 modules/dynamicfusion/tests/ceres_test.cpp | 167 ++
 .../dynamicfusion/tests/ceres_test_warp.cpp | 49 +
 .../dynamicfusion/tests/test_warp_field.cpp | 0
 .../dynamicfusion/tests/utils/CMakeLists.txt | 17 +
 .../tests/utils/test_dual_quaternion.cc | 79 +
 .../tests/utils/test_quaternion.cc | 88 ++
 54 files changed, 10095 insertions(+)
 create mode 100644 modules/dynamicfusion/CMakeLists.txt
 create mode 100644 modules/dynamicfusion/LICENSE.md
 create mode 100644 modules/dynamicfusion/README.md
 create mode 100644 modules/dynamicfusion/apps/CMakeLists.txt
 create mode 100644 modules/dynamicfusion/apps/demo.cpp
 create mode 100644 modules/dynamicfusion/apps/dynamicfusion_kinect.cpp
 create mode 100644 modules/dynamicfusion/cmake/Modules/FindOpenNI.cmake
 create mode 100644 modules/dynamicfusion/cmake/Targets.cmake
 create mode 100644 modules/dynamicfusion/cmake/Utils.cmake
 create mode 100755 modules/dynamicfusion/download_data.sh
 create mode 100644 modules/dynamicfusion/kfusion/CMakeLists.txt
 create mode 100644 modules/dynamicfusion/kfusion/include/io/capture.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/cuda/device_array.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/cuda/device_memory.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/cuda/imgproc.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/cuda/kernel_containers.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/cuda/projective_icp.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/cuda/tsdf_volume.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/exports.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/kinfu.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/optimisation.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/types.hpp
 create mode 100644 modules/dynamicfusion/kfusion/include/kfusion/warp_field.hpp
 create mode 100755 modules/dynamicfusion/kfusion/include/nanoflann/nanoflann.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/capture.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/core.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/cuda/device.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/cuda/imgproc.cu
 create mode 100644 modules/dynamicfusion/kfusion/src/cuda/proj_icp.cu
 create mode 100644 modules/dynamicfusion/kfusion/src/cuda/temp_utils.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/cuda/texture_binder.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/cuda/tsdf_volume.cu
 create mode 100644 modules/dynamicfusion/kfusion/src/device_memory.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/imgproc.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/internal.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/kinfu.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/optimisation.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/precomp.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/precomp.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/projective_icp.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/safe_call.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/tsdf_volume.cpp
 create mode 100644 modules/dynamicfusion/kfusion/src/utils/dual_quaternion.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/utils/knn_point_cloud.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/utils/quaternion.hpp
 create mode 100644 modules/dynamicfusion/kfusion/src/warp_field.cpp
 create mode 100644 modules/dynamicfusion/tests/CMakeLists.txt
 create mode 100644 modules/dynamicfusion/tests/ceres_test.cpp
 create mode 100644 modules/dynamicfusion/tests/ceres_test_warp.cpp
 create mode 100644 modules/dynamicfusion/tests/test_warp_field.cpp
 create mode 100644 modules/dynamicfusion/tests/utils/CMakeLists.txt
 create mode 100644 modules/dynamicfusion/tests/utils/test_dual_quaternion.cc
 create mode 100644 modules/dynamicfusion/tests/utils/test_quaternion.cc

diff --git a/modules/README.md b/modules/README.md
index d0cde4b8315..870809074eb 100644
--- a/modules/README.md
+++ b/modules/README.md
@@ -28,6 +28,8 @@ $ cmake -D OPENCV_EXTRA_MODULES_PATH=/modules -D BUILD_opencv_/data/umbrella`
+
+### Windows
+[Download the dataset](http://lgdv.cs.fau.de/uploads/publications/data/innmann2016deform/umbrella_data.zip).\
+Create a `data` folder inside the project root directory. \
+Unzip the archive into `data` and remove any files that are not .png. \
+Inside `data`, create directories `color` and `depth`, and move the color and depth frames into their corresponding folders.
+
+
+
+To run on .oni captures or directly from a Kinect device, use `./build/bin/dynamicfusion_kinect ` or `./build/bin/dynamicfusion_kinect `
+
+---
+Note: the frame rate is currently too low (5-6 fps) to keep up with live input, so it is advisable to capture your input first.
+
+## References
+[DynamicFusion project page](http://grail.cs.washington.edu/projects/dynamicfusion/)
+
+```
+@InProceedings{Newcombe_2015_CVPR,
+author = {Newcombe, Richard A. and Fox, Dieter and Seitz, Steven M.},
+title = {DynamicFusion: Reconstruction and Tracking of Non-Rigid Scenes in Real-Time},
+booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)},
+month = {June},
+year = {2015}
+}
+```
+
+The example dataset is taken from the [VolumeDeform project](http://lgdv.cs.fau.de/publications/publication/Pub.2016.tech.IMMD.IMMD9.volume_6/).
+```
+@inbook{innmann2016volume,
+author = "Innmann, Matthias and Zollh{\"o}fer, Michael and Nie{\ss}ner, Matthias and Theobalt, Christian
+ and Stamminger, Marc",
+editor = "Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max",
+title = "VolumeDeform: Real-Time Volumetric Non-rigid Reconstruction",
+bookTitle = "Computer Vision -- ECCV 2016: 14th European Conference, Amsterdam, The Netherlands,
+ October 11-14, 2016, Proceedings, Part VIII",
+year = "2016",
+publisher = "Springer International Publishing",
+address = "Cham",
+pages = "362--379",
+isbn = "978-3-319-46484-8",
+doi = "10.1007/978-3-319-46484-8_22",
+url = "http://dx.doi.org/10.1007/978-3-319-46484-8_22"
+}
+```
\ No newline at end of file
diff --git a/modules/dynamicfusion/apps/CMakeLists.txt b/modules/dynamicfusion/apps/CMakeLists.txt
new file mode 100644
index 00000000000..559eacd728b
--- /dev/null
+++ b/modules/dynamicfusion/apps/CMakeLists.txt
@@ -0,0 +1,25 @@
+include_directories(${CMAKE_SOURCE_DIR}/kfusion/include)
+find_package(Boost REQUIRED COMPONENTS system filesystem program_options)
+
+add_executable(dynamicfusion demo.cpp)
+target_link_libraries(dynamicfusion ${OpenCV_LIBS} ${Boost_LIBRARIES} kfusion)
+
+set_target_properties(dynamicfusion PROPERTIES
+        DEBUG_POSTFIX "d"
+        ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+
+
+if(OPENNI_FOUND)
+    add_executable(dynamicfusion_kinect dynamicfusion_kinect.cpp)
+    target_link_libraries(dynamicfusion_kinect ${OPENNI_LIBRARY} ${OpenCV_LIBS} ${Boost_LIBRARIES} kfusion)
+
+    set_target_properties(dynamicfusion_kinect PROPERTIES
+            DEBUG_POSTFIX "d"
+            ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
+            RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+    install(TARGETS dynamicfusion_kinect RUNTIME DESTINATION bin COMPONENT main)
+endif()
+
+install(TARGETS dynamicfusion RUNTIME DESTINATION bin COMPONENT main)
+#install(FILES ${srcs} DESTINATION app COMPONENT main)
diff --git a/modules/dynamicfusion/apps/demo.cpp b/modules/dynamicfusion/apps/demo.cpp
new file mode 100644
index 00000000000..3429fe1533a
--- /dev/null
+++ b/modules/dynamicfusion/apps/demo.cpp
@@ -0,0 +1,168 @@
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+using namespace kfusion;
+
+struct DynamicFusionApp
+{
+    static void KeyboardCallback(const cv::viz::KeyboardEvent& event, void* pthis)
+    {
+        DynamicFusionApp& kinfu = *static_cast<DynamicFusionApp*>(pthis);
+
if(event.action != cv::viz::KeyboardEvent::KEY_DOWN) + return; + + if(event.code == 't' || event.code == 'T') + kinfu.show_warp(*kinfu.kinfu_); + + if(event.code == 'i' || event.code == 'I') + kinfu.interactive_mode_ = !kinfu.interactive_mode_; + } + + DynamicFusionApp(std::string dir) : exit_ (false), interactive_mode_(false), pause_(false), directory(true), dir_name(dir) + { + KinFuParams params = KinFuParams::default_params_dynamicfusion(); + kinfu_ = KinFu::Ptr( new KinFu(params) ); + + + cv::viz::WCube cube(cv::Vec3d::all(0), cv::Vec3d(params.volume_size), true, cv::viz::Color::apricot()); + viz.showWidget("cube", cube, params.volume_pose); + viz.showWidget("coor", cv::viz::WCoordinateSystem(0.1)); + viz.registerKeyboardCallback(KeyboardCallback, this); + + } + static void show_depth(const cv::Mat& depth) + { + cv::Mat display; + depth.convertTo(display, CV_8U, 255.0/4000); + cv::imshow("Depth", display); + } + + void show_raycasted(KinFu& kinfu) + { + const int mode = 3; + if (interactive_mode_) + kinfu.renderImage(view_device_, viz.getViewerPose(), mode); + else + kinfu.renderImage(view_device_, mode); + + view_host_.create(view_device_.rows(), view_device_.cols(), CV_8UC4); + view_device_.download(view_host_.ptr(), view_host_.step); + cv::imshow("Scene", view_host_); + } + + void show_warp(KinFu &kinfu) + { + cv::Mat warp_host = kinfu.getWarp().getNodesAsMat(); + viz1.showWidget("warp_field", cv::viz::WCloud(warp_host)); + } + + bool execute() + { + KinFu& dfusion = *kinfu_; + cv::Mat depth, image; + double time_ms = 0; + bool has_image = false; + std::vector depths; // store paths, + std::vector images; // store paths, + + copy(boost::filesystem::directory_iterator(dir_name + "/depth"), boost::filesystem::directory_iterator(), + back_inserter(depths)); + copy(boost::filesystem::directory_iterator(dir_name + "/color"), boost::filesystem::directory_iterator(), + back_inserter(images)); + + std::sort(depths.begin(), depths.end()); + std::sort(images.begin(), images.end()); + + for (int i = 0; i < depths.size() && !exit_ && !viz.wasStopped(); i++) { + image = cv::imread(images[i].string(), CV_LOAD_IMAGE_COLOR); + depth = cv::imread(depths[i].string(), CV_LOAD_IMAGE_ANYDEPTH); + depth_device_.upload(depth.data, depth.step, depth.rows, depth.cols); + + { + SampledScopeTime fps(time_ms); + (void) fps; + has_image = dfusion(depth_device_); + } + + if (has_image) + show_raycasted(dfusion); + + show_depth(depth); + cv::imshow("Image", image); + + if (!interactive_mode_) { + viz.setViewerPose(dfusion.getCameraPose()); + viz1.setViewerPose(dfusion.getCameraPose()); + } + + int key = cv::waitKey(pause_ ? 
0 : 3); + show_warp(dfusion); + switch (key) { + case 't': + case 'T' : + show_warp(dfusion); + break; + case 'i': + case 'I' : + interactive_mode_ = !interactive_mode_; + break; + case 27: + exit_ = true; + break; + case 32: + pause_ = !pause_; + break; + } + + //exit_ = exit_ || i > 100; + viz.spinOnce(3, true); + viz1.spinOnce(3, true); + } + return true; + } + + bool pause_ /*= false*/; + bool exit_, interactive_mode_, directory; + std::string dir_name; + KinFu::Ptr kinfu_; + cv::viz::Viz3d viz; + cv::viz::Viz3d viz1; + + cv::Mat view_host_; + cuda::Image view_device_; + cuda::Depth depth_device_; + + +}; + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +int main (int argc, char* argv[]) +{ + int device = 0; + cuda::setDevice (device); + cuda::printShortCudaDeviceInfo (device); + + if(cuda::checkIfPreFermiGPU(device)) + return std::cout << std::endl << "Kinfu is not supported for pre-Fermi GPU architectures, and not built for them by default. Exiting..." << std::endl, -1; + + DynamicFusionApp *app; + if(boost::filesystem::is_directory(argv[1])) + app = new DynamicFusionApp(argv[1]); + + // executing + try { app->execute (); } + catch (const std::bad_alloc& /*e*/) { std::cout << "Bad alloc" << std::endl; } + catch (const std::exception& /*e*/) { std::cout << "Exception" << std::endl; } + + delete app; + return 0; +} diff --git a/modules/dynamicfusion/apps/dynamicfusion_kinect.cpp b/modules/dynamicfusion/apps/dynamicfusion_kinect.cpp new file mode 100644 index 00000000000..62dcacf2c8d --- /dev/null +++ b/modules/dynamicfusion/apps/dynamicfusion_kinect.cpp @@ -0,0 +1,158 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +using namespace kfusion; + +struct KinFuApp +{ + static void KeyboardCallback(const cv::viz::KeyboardEvent& event, void* pthis) + { + KinFuApp& kinfu = *static_cast(pthis); + + if(event.action != cv::viz::KeyboardEvent::KEY_DOWN) + return; + + if(event.code == 't' || event.code == 'T') + kinfu.take_cloud(*kinfu.kinfu_); + + if(event.code == 'i' || event.code == 'I') + kinfu.interactive_mode_ = !kinfu.interactive_mode_; + } + + KinFuApp(OpenNISource& source) : exit_ (false), interactive_mode_(false), capture_ (source), pause_(false) + { + KinFuParams params = KinFuParams::default_params(); + kinfu_ = KinFu::Ptr( new KinFu(params) ); + + capture_.setRegistration(true); + + cv::viz::WCube cube(cv::Vec3d::all(0), cv::Vec3d(params.volume_size), true, cv::viz::Color::apricot()); + viz.showWidget("cube", cube, params.volume_pose); + viz.showWidget("coor", cv::viz::WCoordinateSystem(0.1)); + viz.registerKeyboardCallback(KeyboardCallback, this); + } + + static void show_depth(const cv::Mat& depth) + { + cv::Mat display; + //cv::normalize(depth, display, 0, 255, cv::NORM_MINMAX, CV_8U); + depth.convertTo(display, CV_8U, 255.0/4000); + cv::imshow("Depth", display); + } + + void show_raycasted(KinFu& kinfu) + { + const int mode = 3; + if (interactive_mode_) + kinfu.renderImage(view_device_, viz.getViewerPose(), mode); + else + kinfu.renderImage(view_device_, mode); + + view_host_.create(view_device_.rows(), view_device_.cols(), CV_8UC4); + view_device_.download(view_host_.ptr(), view_host_.step); + cv::imshow("Scene", view_host_); + } + + void take_cloud(KinFu& kinfu) + { +// cv::Mat cloud_host = kinfu.tsdf().get_cloud_host(); + cv::Mat normal_host = kinfu.tsdf().get_normal_host(); + cv::Mat warp_host = kinfu.getWarp().getNodesAsMat(); + +// viz.showWidget("cloud", 
cv::viz::WCloud(cloud_host)); +// viz.showWidget("cloud_normals", cv::viz::WCloudNormals(cloud_host, normal_host, 64, 0.05, cv::viz::Color::blue())); + viz1.showWidget("warp_field", cv::viz::WCloud(warp_host)); + } + + bool execute() + { + KinFu& kinfu = *kinfu_; + cv::Mat depth, image; + double time_ms = 0; + bool has_image = false; + for (int i = 0; !exit_ && !viz.wasStopped(); ++i) + { + bool has_frame = capture_.grab(depth, image); + if (!has_frame) + return std::cout << "Can't grab" << std::endl, false; + depth_device_.upload(depth.data, depth.step, depth.rows, depth.cols); + + { + SampledScopeTime fps(time_ms); (void)fps; + has_image = kinfu(depth_device_); + } + + if (has_image) + show_raycasted(kinfu); + + show_depth(depth); + cv::imshow("Image", image); + + if (!interactive_mode_) + { + viz.setViewerPose(kinfu.getCameraPose()); + viz1.setViewerPose(kinfu.getCameraPose()); + } + + int key = cv::waitKey(pause_ ? 0 : 3); + take_cloud(kinfu); + switch(key) + { + case 't': case 'T' : take_cloud(kinfu); break; + case 'i': case 'I' : interactive_mode_ = !interactive_mode_; break; + case 27: exit_ = true; break; + case 32: pause_ = !pause_; break; + } + + //exit_ = exit_ || i > 100; + viz.spinOnce(3, true); + viz1.spinOnce(3, true); + } + return true; + } + + bool pause_ /*= false*/; + bool exit_, interactive_mode_; + KinFu::Ptr kinfu_; + cv::viz::Viz3d viz; + cv::viz::Viz3d viz1; + + cv::Mat view_host_; + cuda::Image view_device_; + cuda::Depth depth_device_; + OpenNISource& capture_; + +}; + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + +int main (int argc, char* argv[]) +{ + int device = 0; + cuda::setDevice (device); + cuda::printShortCudaDeviceInfo (device); + + if(cuda::checkIfPreFermiGPU(device)) + return std::cout << std::endl << "Kinfu is not supported for pre-Fermi GPU architectures, and not built for them by default. Exiting..." << std::endl, -1; + + KinFuApp *app; + + OpenNISource capture; + capture.open(argv[1]); + app = new KinFuApp(capture); + + // executing + try { app->execute (); } + catch (const std::bad_alloc& /*e*/) { std::cout << "Bad alloc" << std::endl; } + catch (const std::exception& /*e*/) { std::cout << "Exception" << std::endl; } + + delete app; + return 0; +} diff --git a/modules/dynamicfusion/cmake/Modules/FindOpenNI.cmake b/modules/dynamicfusion/cmake/Modules/FindOpenNI.cmake new file mode 100644 index 00000000000..a8e6f51f915 --- /dev/null +++ b/modules/dynamicfusion/cmake/Modules/FindOpenNI.cmake @@ -0,0 +1,78 @@ +############################################################################### +# Find OpenNI +# +# This sets the following variables: +# OPENNI_FOUND - True if OPENNI was found. +# OPENNI_INCLUDE_DIRS - Directories containing the OPENNI include files. +# OPENNI_LIBRARIES - Libraries needed to use OPENNI. +# OPENNI_DEFINITIONS - Compiler flags for OPENNI. 
+# +# For libusb-1.0, add USB_10_ROOT if not found + +find_package(PkgConfig QUIET) + +# Find LibUSB +if(NOT WIN32) + pkg_check_modules(PC_USB_10 libusb-1.0) + find_path(USB_10_INCLUDE_DIR libusb-1.0/libusb.h + HINTS ${PC_USB_10_INCLUDEDIR} ${PC_USB_10_INCLUDE_DIRS} "${USB_10_ROOT}" "$ENV{USB_10_ROOT}" + PATH_SUFFIXES libusb-1.0) + + find_library(USB_10_LIBRARY + NAMES usb-1.0 + HINTS ${PC_USB_10_LIBDIR} ${PC_USB_10_LIBRARY_DIRS} "${USB_10_ROOT}" "$ENV{USB_10_ROOT}" + PATH_SUFFIXES lib) + + include(FindPackageHandleStandardArgs) + find_package_handle_standard_args(USB_10 DEFAULT_MSG USB_10_LIBRARY USB_10_INCLUDE_DIR) + + if(NOT USB_10_FOUND) + message(STATUS "OpenNI disabled because libusb-1.0 not found.") + return() + else() + include_directories(SYSTEM ${USB_10_INCLUDE_DIR}) + endif() +endif(NOT WIN32) + +if(${CMAKE_VERSION} VERSION_LESS 2.8.2) + pkg_check_modules(PC_OPENNI openni-dev) +else() + pkg_check_modules(PC_OPENNI QUIET openni-dev) +endif() + +set(OPENNI_DEFINITIONS ${PC_OPENNI_CFLAGS_OTHER}) + +set(OPENNI_SUFFIX) +if(WIN32 AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(OPENNI_SUFFIX 64) +endif(WIN32 AND CMAKE_SIZEOF_VOID_P EQUAL 8) + +#add a hint so that it can find it without the pkg-config +find_path(OPENNI_INCLUDE_DIR XnStatus.h + HINTS ${PC_OPENNI_INCLUDEDIR} ${PC_OPENNI_INCLUDE_DIRS} /usr/include/openni /usr/include/ni "${OPENNI_ROOT}" "$ENV{OPENNI_ROOT}" + PATHS "$ENV{OPEN_NI_INSTALL_PATH${OPENNI_SUFFIX}}/Include" + PATH_SUFFIXES openni include Include) +#add a hint so that it can find it without the pkg-config +find_library(OPENNI_LIBRARY + NAMES OpenNI${OPENNI_SUFFIX} + HINTS ${PC_OPENNI_LIBDIR} ${PC_OPENNI_LIBRARY_DIRS} /usr/lib "${OPENNI_ROOT}" "$ENV{OPENNI_ROOT}" + PATHS "$ENV{OPEN_NI_LIB${OPENNI_SUFFIX}}" + PATH_SUFFIXES lib Lib Lib64) + +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(OPENNI_LIBRARIES ${OPENNI_LIBRARY} ${LIBUSB_1_LIBRARIES}) +else() + set(OPENNI_LIBRARIES ${OPENNI_LIBRARY}) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OpenNI DEFAULT_MSG OPENNI_LIBRARY OPENNI_INCLUDE_DIR) + +mark_as_advanced(OPENNI_LIBRARY OPENNI_INCLUDE_DIR) + +if(OPENNI_FOUND) + # Add the include directories + set(OPENNI_INCLUDE_DIRS ${OPENNI_INCLUDE_DIR}) + message(STATUS "OpenNI found (include: ${OPENNI_INCLUDE_DIRS}, lib: ${OPENNI_LIBRARY})") +endif(OPENNI_FOUND) + diff --git a/modules/dynamicfusion/cmake/Targets.cmake b/modules/dynamicfusion/cmake/Targets.cmake new file mode 100644 index 00000000000..2c66ece1969 --- /dev/null +++ b/modules/dynamicfusion/cmake/Targets.cmake @@ -0,0 +1,140 @@ +################################################################################################ +# short command to setup source group +function(kf_source_group group) + cmake_parse_arguments(VW_SOURCE_GROUP "" "" "GLOB" ${ARGN}) + file(GLOB srcs ${VW_SOURCE_GROUP_GLOB}) + #list(LENGTH ${srcs} ___size) + #if (___size GREATER 0) + source_group(${group} FILES ${srcs}) + #endif() +endfunction() + + +################################################################################################ +# short command getting sources from standard directores +macro(pickup_std_sources) + kf_source_group("Include" GLOB "include/${module_name}/*.h*") + kf_source_group("Include\\cuda" GLOB "include/${module_name}/cuda/*.h*") + kf_source_group("Source" GLOB "src/*.cpp" "src/*.h*") + kf_source_group("Source\\utils" GLOB "src/utils/*.cpp" "src/utils/*.h*") + kf_source_group("Source\\cuda" GLOB "src/cuda/*.c*" "src/cuda/*.h*") + FILE(GLOB_RECURSE sources 
include/${module_name}/*.h* src/*.cpp src/*.h* src/cuda/*.h* src/cuda/*.c*) +endmacro() + + +################################################################################################ +# short command for declaring includes from other modules +macro(declare_deps_includes) + foreach(__arg ${ARGN}) + get_filename_component(___path "${CMAKE_SOURCE_DIR}/modules/${__arg}/include" ABSOLUTE) + if (EXISTS ${___path}) + include_directories(${___path}) + endif() + endforeach() + + unset(___path) + unset(__arg) +endmacro() + + +################################################################################################ +# short command for setting defeault target properties +function(default_properties target) + set_target_properties(${target} PROPERTIES + DEBUG_POSTFIX "d" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") + + if (NOT ${target} MATCHES "^test_") + install(TARGETS ${the_target} RUNTIME DESTINATION ".") + endif() +endfunction() + +function(test_props target) + #os_project_label(${target} "[test]") + if(USE_PROJECT_FOLDERS) + set_target_properties(${target} PROPERTIES FOLDER "Tests") + endif() +endfunction() + +function(app_props target) + #os_project_label(${target} "[app]") + if(USE_PROJECT_FOLDERS) + set_target_properties(${target} PROPERTIES FOLDER "Apps") + endif() +endfunction() + + +################################################################################################ +# short command for setting defeault target properties +function(default_properties target) + set_target_properties(${target} PROPERTIES + DEBUG_POSTFIX "d" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") + + if (NOT ${target} MATCHES "^test_") + install(TARGETS ${the_target} RUNTIME DESTINATION ".") + endif() +endfunction() + + +################################################################################################ +# short command for adding library module +macro(add_module_library name) + set(module_name ${name}) + pickup_std_sources() + include_directories(include src src/cuda src/utils) + + set(__has_cuda OFF) + check_cuda(__has_cuda) + + set(__lib_type STATIC) + if (${ARGV1} MATCHES "SHARED|STATIC") + set(__lib_type ${ARGV1}) + endif() + + if (__has_cuda) + cuda_add_library(${module_name} ${__lib_type} ${sources}) + else() + add_library(${module_name} ${__lib_type} ${sources}) + endif() + + if(MSVC) + set_target_properties(${module_name} PROPERTIES DEFINE_SYMBOL KFUSION_API_EXPORTS) + else() + add_definitions(-DKFUSION_API_EXPORTS) + endif() + + default_properties(${module_name}) + + if(USE_PROJECT_FOLDERS) + set_target_properties(${module_name} PROPERTIES FOLDER "Libraries") + endif() + + set_target_properties(${module_name} PROPERTIES INSTALL_NAME_DIR lib) + + install(TARGETS ${module_name} + RUNTIME DESTINATION bin COMPONENT main + LIBRARY DESTINATION lib COMPONENT main + ARCHIVE DESTINATION lib COMPONENT main) + + install(DIRECTORY include/ DESTINATION include/ FILES_MATCHING PATTERN "*.h*") +endmacro() + +################################################################################################ +# short command for adding application module +macro(add_application target sources) + add_executable(${target} ${sources}) + default_properties(${target}) + app_props(${target}) +endmacro() + + +################################################################################################ +# short command for adding test target +macro(add_test the_target) + 
add_executable(${the_target} ${ARGN}) + default_properties(${the_target}) + test_props(${the_target}) +endmacro() diff --git a/modules/dynamicfusion/cmake/Utils.cmake b/modules/dynamicfusion/cmake/Utils.cmake new file mode 100644 index 00000000000..c0e4bd02774 --- /dev/null +++ b/modules/dynamicfusion/cmake/Utils.cmake @@ -0,0 +1,102 @@ +################################################################################################ +# short alias for CMkae loggign +function(dmsg) + message(STATUS "<<${ARGN}") +endfunction() + +################################################################################################ +# Command checking if current module has cuda souces to compile +macro(check_cuda var) + file(GLOB cuda src/cuda/*.cu) + list(LENGTH cuda ___size) + if (HAVE_CUDA AND ___size GREATER 0) + set(${var} ON) + else() + set(${var} OFF) + endif() + unset(___size) + unset(cuda) +endmacro() + +################################################################################################ +# short command for adding library module +macro(warnings_disable) + if(NOT ENABLE_NOISY_WARNINGS) + set(_flag_vars "") + set(_msvc_warnings "") + set(_gxx_warnings "") + foreach(arg ${ARGN}) + if(arg MATCHES "^CMAKE_") + list(APPEND _flag_vars ${arg}) + elseif(arg MATCHES "^/wd") + list(APPEND _msvc_warnings ${arg}) + elseif(arg MATCHES "^-W") + list(APPEND _gxx_warnings ${arg}) + endif() + endforeach() + if(MSVC AND _msvc_warnings AND _flag_vars) + foreach(var ${_flag_vars}) + foreach(warning ${_msvc_warnings}) + set(${var} "${${var}} ${warning}") + endforeach() + endforeach() + elseif(CV_COMPILER_IS_GNU AND _gxx_warnings AND _flag_vars) + foreach(var ${_flag_vars}) + foreach(warning ${_gxx_warnings}) + if(NOT warning MATCHES "^-Wno-") + string(REPLACE "${warning}" "" ${var} "${${var}}") + string(REPLACE "-W" "-Wno-" warning "${warning}") + endif() + ocv_check_flag_support(${var} "${warning}" _varname) + if(${_varname}) + set(${var} "${${var}} ${warning}") + endif() + endforeach() + endforeach() + endif() + unset(_flag_vars) + unset(_msvc_warnings) + unset(_gxx_warnings) + endif(NOT ENABLE_NOISY_WARNINGS) +endmacro() + +################################################################################################ +# Command for asstion options with some preconditions +macro(kf_option variable description value) + set(__value ${value}) + set(__condition "") + set(__varname "__value") + foreach(arg ${ARGN}) + if(arg STREQUAL "IF" OR arg STREQUAL "if") + set(__varname "__condition") + else() + list(APPEND ${__varname} ${arg}) + endif() + endforeach() + unset(__varname) + if("${__condition}" STREQUAL "") + set(__condition 2 GREATER 1) + endif() + + if(${__condition}) + if("${__value}" MATCHES ";") + if(${__value}) + option(${variable} "${description}" ON) + else() + option(${variable} "${description}" OFF) + endif() + elseif(DEFINED ${__value}) + if(${__value}) + option(${variable} "${description}" ON) + else() + option(${variable} "${description}" OFF) + endif() + else() + option(${variable} "${description}" ${__value}) + endif() + else() + unset(${variable} CACHE) + endif() + unset(__condition) + unset(__value) +endmacro() diff --git a/modules/dynamicfusion/download_data.sh b/modules/dynamicfusion/download_data.sh new file mode 100755 index 00000000000..87e57d2018a --- /dev/null +++ b/modules/dynamicfusion/download_data.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +wget http://lgdv.cs.fau.de/uploads/publications/data/innmann2016deform/umbrella_data.zip +mkdir -p data/umbrella/depth +mkdir -p 
data/umbrella/color + +mv umbrella_data.zip data/umbrella +cd data/umbrella +unzip umbrella_data.zip +rm *.txt +mv *color*.png color/ +mv *depth*.png depth/ +rm umbrella_data.zip + diff --git a/modules/dynamicfusion/kfusion/CMakeLists.txt b/modules/dynamicfusion/kfusion/CMakeLists.txt new file mode 100644 index 00000000000..c74af700487 --- /dev/null +++ b/modules/dynamicfusion/kfusion/CMakeLists.txt @@ -0,0 +1,6 @@ +set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--ftz=true;--prec-div=false;--prec-sqrt=false") +add_module_library(kfusion) +if(OPENNI_FOUND) + target_compile_definitions(kfusion PRIVATE OPENNI_FOUND=1) +endif() +target_link_libraries(kfusion ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${OpenCV_LIBS} ${OPENNI_LIBRARY} ${CERES_LIBRARIES}) diff --git a/modules/dynamicfusion/kfusion/include/io/capture.hpp b/modules/dynamicfusion/kfusion/include/io/capture.hpp new file mode 100644 index 00000000000..dc8f786273d --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/io/capture.hpp @@ -0,0 +1,42 @@ +#pragma once + +#include +#include +#include + +namespace kfusion +{ + class KF_EXPORTS OpenNISource + { + public: + typedef kfusion::PixelRGB RGB24; + + enum { PROP_OPENNI_REGISTRATION_ON = 104 }; + + OpenNISource(); + OpenNISource(int device); + OpenNISource(const std::string& oni_filename, bool repeat = false); + + void open(int device); + void open(const std::string& oni_filename, bool repeat = false); + void release(); + + ~OpenNISource(); + + bool grab(cv::Mat &depth, cv::Mat &image); + + //parameters taken from camera/oni + int shadow_value, no_sample_value; + float depth_focal_length_VGA; + float baseline; // mm + double pixelSize; // mm + unsigned short max_depth; // mm + + bool setRegistration (bool value = false); + private: + struct Impl; + cv::Ptr impl_; + void getParams (); + + }; +} diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/device_array.hpp b/modules/dynamicfusion/kfusion/include/kfusion/cuda/device_array.hpp new file mode 100644 index 00000000000..2186f51e724 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/cuda/device_array.hpp @@ -0,0 +1,303 @@ +#pragma once + +#include +#include + +#include + +namespace kfusion +{ + namespace cuda + { + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** \brief @b DeviceArray class + * + * \note Typed container for GPU memory with reference counting. + * + * \author Anatoly Baksheev + */ + template + class KF_EXPORTS DeviceArray : public DeviceMemory + { + public: + /** \brief Element type. */ + typedef T type; + + /** \brief Element size. */ + enum { elem_size = sizeof(T) }; + + /** \brief Empty constructor. */ + DeviceArray(); + + /** \brief Allocates internal buffer in GPU memory + * \param size_t: number of elements to allocate + * */ + DeviceArray(size_t size); + + /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. + * \param ptr: pointer to buffer + * \param size: element number + * */ + DeviceArray(T *ptr, size_t size); + + /** \brief Copy constructor. Just increments reference counter. */ + DeviceArray(const DeviceArray& other); + + /** \brief Assigment operator. Just increments reference counter. */ + DeviceArray& operator = (const DeviceArray& other); + + /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. 
+ * \param size: elemens number + * */ + void create(size_t size); + + /** \brief Decrements reference counter and releases internal buffer if needed. */ + void release(); + + /** \brief Performs data copying. If destination size differs it will be reallocated. + * \param other_arg: destination container + * */ + void copyTo(DeviceArray& other) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param host_ptr_arg: pointer to buffer to upload + * \param size: elemens number + * */ + void upload(const T *host_ptr, size_t size); + + /** \brief Downloads data from internal buffer to CPU memory + * \param host_ptr_arg: pointer to buffer to download + * */ + void download(T *host_ptr) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param data: host vector to upload from + * */ + template + void upload(const std::vector& data); + + /** \brief Downloads data from internal buffer to CPU memory + * \param data: host vector to download to + * */ + template + void download(std::vector& data) const; + + /** \brief Performs swap of data pointed with another device array. + * \param other: device array to swap with + * */ + void swap(DeviceArray& other_arg); + + /** \brief Returns pointer for internal buffer in GPU memory. */ + T* ptr(); + + /** \brief Returns const pointer for internal buffer in GPU memory. */ + const T* ptr() const; + + //using DeviceMemory::ptr; + + /** \brief Returns pointer for internal buffer in GPU memory. */ + operator T*(); + + /** \brief Returns const pointer for internal buffer in GPU memory. */ + operator const T*() const; + + /** \brief Returns size in elements. */ + size_t size() const; + }; + + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** \brief @b DeviceArray2D class + * + * \note Typed container for pitched GPU memory with reference counting. + * + * \author Anatoly Baksheev + */ + template + class KF_EXPORTS DeviceArray2D : public DeviceMemory2D + { + public: + /** \brief Element type. */ + typedef T type; + + /** \brief Element size. */ + enum { elem_size = sizeof(T) }; + + /** \brief Empty constructor. */ + DeviceArray2D(); + + /** \brief Allocates internal buffer in GPU memory + * \param rows: number of rows to allocate + * \param cols: number of elements in each row + * */ + DeviceArray2D(int rows, int cols); + + /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. + * \param rows: number of rows + * \param cols: number of elements in each row + * \param data: pointer to buffer + * \param stepBytes: stride between two consecutive rows in bytes + * */ + DeviceArray2D(int rows, int cols, void *data, size_t stepBytes); + + /** \brief Copy constructor. Just increments reference counter. */ + DeviceArray2D(const DeviceArray2D& other); + + /** \brief Assigment operator. Just increments reference counter. */ + DeviceArray2D& operator = (const DeviceArray2D& other); + + /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. + * \param rows: number of rows to allocate + * \param cols: number of elements in each row + * */ + void create(int rows, int cols); + + /** \brief Decrements reference counter and releases internal buffer if needed. 
*/ + void release(); + + /** \brief Performs data copying. If destination size differs it will be reallocated. + * \param other: destination container + * */ + void copyTo(DeviceArray2D& other) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param host_ptr: pointer to host buffer to upload + * \param host_step: stride between two consecutive rows in bytes for host buffer + * \param rows: number of rows to upload + * \param cols: number of elements in each row + * */ + void upload(const void *host_ptr, size_t host_step, int rows, int cols); + + /** \brief Downloads data from internal buffer to CPU memory. User is resposible for correct host buffer size. + * \param host_ptr: pointer to host buffer to download + * \param host_step: stride between two consecutive rows in bytes for host buffer + * */ + void download(void *host_ptr, size_t host_step) const; + + /** \brief Performs swap of data pointed with another device array. + * \param other: device array to swap with + * */ + void swap(DeviceArray2D& other_arg); + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param data: host vector to upload from + * \param cols: stride in elements between two consecutive rows for host buffer + * */ + template + void upload(const std::vector& data, int cols); + + /** \brief Downloads data from internal buffer to CPU memory + * \param data: host vector to download to + * \param cols: Output stride in elements between two consecutive rows for host vector. + * */ + template + void download(std::vector& data, int& cols) const; + + /** \brief Returns pointer to given row in internal buffer. + * \param y_arg: row index + * */ + T* ptr(int y = 0); + + /** \brief Returns const pointer to given row in internal buffer. + * \param y_arg: row index + * */ + const T* ptr(int y = 0) const; + + //using DeviceMemory2D::ptr; + + /** \brief Returns pointer for internal buffer in GPU memory. */ + operator T*(); + + /** \brief Returns const pointer for internal buffer in GPU memory. */ + operator const T*() const; + + /** \brief Returns number of elements in each row. */ + int cols() const; + + /** \brief Returns number of rows. */ + int rows() const; + + /** \brief Returns step in elements. 
*/ + size_t elem_step() const; + }; + } + + namespace device + { + using kfusion::cuda::DeviceArray; + using kfusion::cuda::DeviceArray2D; + } +} + +///////////////////// Inline implementations of DeviceArray //////////////////////////////////////////// + +template inline kfusion::cuda::DeviceArray::DeviceArray() {} +template inline kfusion::cuda::DeviceArray::DeviceArray(size_t size) : DeviceMemory(size * elem_size) {} +template inline kfusion::cuda::DeviceArray::DeviceArray(T *ptr, size_t size) : DeviceMemory(ptr, size * elem_size) {} +template inline kfusion::cuda::DeviceArray::DeviceArray(const DeviceArray& other) : DeviceMemory(other) {} +template inline kfusion::cuda::DeviceArray& kfusion::cuda::DeviceArray::operator=(const DeviceArray& other) +{ DeviceMemory::operator=(other); return *this; } + +template inline void kfusion::cuda::DeviceArray::create(size_t size) +{ DeviceMemory::create(size * elem_size); } +template inline void kfusion::cuda::DeviceArray::release() +{ DeviceMemory::release(); } + +template inline void kfusion::cuda::DeviceArray::copyTo(DeviceArray& other) const +{ DeviceMemory::copyTo(other); } +template inline void kfusion::cuda::DeviceArray::upload(const T *host_ptr, size_t size) +{ DeviceMemory::upload(host_ptr, size * elem_size); } +template inline void kfusion::cuda::DeviceArray::download(T *host_ptr) const +{ DeviceMemory::download( host_ptr ); } + +template void kfusion::cuda::DeviceArray::swap(DeviceArray& other_arg) { DeviceMemory::swap(other_arg); } + +template inline kfusion::cuda::DeviceArray::operator T*() { return ptr(); } +template inline kfusion::cuda::DeviceArray::operator const T*() const { return ptr(); } +template inline size_t kfusion::cuda::DeviceArray::size() const { return sizeBytes() / elem_size; } + +template inline T* kfusion::cuda::DeviceArray::ptr() { return DeviceMemory::ptr(); } +template inline const T* kfusion::cuda::DeviceArray::ptr() const { return DeviceMemory::ptr(); } + +template template inline void kfusion::cuda::DeviceArray::upload(const std::vector& data) { upload(&data[0], data.size()); } +template template inline void kfusion::cuda::DeviceArray::download(std::vector& data) const { data.resize(size()); if (!data.empty()) download(&data[0]); } + +///////////////////// Inline implementations of DeviceArray2D //////////////////////////////////////////// + +template inline kfusion::cuda::DeviceArray2D::DeviceArray2D() {} +template inline kfusion::cuda::DeviceArray2D::DeviceArray2D(int rows, int cols) : DeviceMemory2D(rows, cols * elem_size) {} +template inline kfusion::cuda::DeviceArray2D::DeviceArray2D(int rows, int cols, void *data, size_t stepBytes) : DeviceMemory2D(rows, cols * elem_size, data, stepBytes) {} +template inline kfusion::cuda::DeviceArray2D::DeviceArray2D(const DeviceArray2D& other) : DeviceMemory2D(other) {} +template inline kfusion::cuda::DeviceArray2D& kfusion::cuda::DeviceArray2D::operator=(const DeviceArray2D& other) +{ DeviceMemory2D::operator=(other); return *this; } + +template inline void kfusion::cuda::DeviceArray2D::create(int rows, int cols) +{ DeviceMemory2D::create(rows, cols * elem_size); } +template inline void kfusion::cuda::DeviceArray2D::release() +{ DeviceMemory2D::release(); } + +template inline void kfusion::cuda::DeviceArray2D::copyTo(DeviceArray2D& other) const +{ DeviceMemory2D::copyTo(other); } +template inline void kfusion::cuda::DeviceArray2D::upload(const void *host_ptr, size_t host_step, int rows, int cols) +{ DeviceMemory2D::upload(host_ptr, host_step, rows, cols * elem_size); } 
+template inline void kfusion::cuda::DeviceArray2D::download(void *host_ptr, size_t host_step) const +{ DeviceMemory2D::download( host_ptr, host_step ); } + +template template inline void kfusion::cuda::DeviceArray2D::upload(const std::vector& data, int cols) +{ upload(&data[0], cols * elem_size, data.size()/cols, cols); } + +template template inline void kfusion::cuda::DeviceArray2D::download(std::vector& data, int& elem_step) const +{ elem_step = cols(); data.resize(cols() * rows()); if (!data.empty()) download(&data[0], colsBytes()); } + +template void kfusion::cuda::DeviceArray2D::swap(DeviceArray2D& other_arg) { DeviceMemory2D::swap(other_arg); } + +template inline T* kfusion::cuda::DeviceArray2D::ptr(int y) { return DeviceMemory2D::ptr(y); } +template inline const T* kfusion::cuda::DeviceArray2D::ptr(int y) const { return DeviceMemory2D::ptr(y); } + +template inline kfusion::cuda::DeviceArray2D::operator T*() { return ptr(); } +template inline kfusion::cuda::DeviceArray2D::operator const T*() const { return ptr(); } + +template inline int kfusion::cuda::DeviceArray2D::cols() const { return DeviceMemory2D::colsBytes()/elem_size; } +template inline int kfusion::cuda::DeviceArray2D::rows() const { return DeviceMemory2D::rows(); } + +template inline size_t kfusion::cuda::DeviceArray2D::elem_step() const { return DeviceMemory2D::step()/elem_size; } diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/device_memory.hpp b/modules/dynamicfusion/kfusion/include/kfusion/cuda/device_memory.hpp new file mode 100644 index 00000000000..c444baf304c --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/cuda/device_memory.hpp @@ -0,0 +1,258 @@ +#pragma once + +#include +#include + +namespace kfusion +{ + namespace cuda + { + /** \brief Error handler. All GPU functions from this subsystem call the function to report an error. For internal use only */ + KF_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func = ""); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** \brief @b DeviceMemory class + * + * \note This is a BLOB container class with reference counting for GPU memory. + * + * \author Anatoly Baksheev + */ + + class KF_EXPORTS DeviceMemory + { + public: + /** \brief Empty constructor. */ + DeviceMemory(); + + /** \brief Destructor. */ + ~DeviceMemory(); + + /** \brief Allocates internal buffer in GPU memory + * \param sizeBytes_arg: amount of memory to allocate + * */ + DeviceMemory(size_t sizeBytes_arg); + + /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. + * \param ptr_arg: pointer to buffer + * \param sizeBytes_arg: buffer size + * */ + DeviceMemory(void *ptr_arg, size_t sizeBytes_arg); + + /** \brief Copy constructor. Just increments reference counter. */ + DeviceMemory(const DeviceMemory& other_arg); + + /** \brief Assigment operator. Just increments reference counter. */ + DeviceMemory& operator=(const DeviceMemory& other_arg); + + /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. + * \param sizeBytes_arg: buffer size + * */ + void create(size_t sizeBytes_arg); + + /** \brief Decrements reference counter and releases internal buffer if needed. */ + void release(); + + /** \brief Performs data copying. If destination size differs it will be reallocated. 
+ * \param other_arg: destination container + * */ + void copyTo(DeviceMemory& other) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param host_ptr_arg: pointer to buffer to upload + * \param sizeBytes_arg: buffer size + * */ + void upload(const void *host_ptr_arg, size_t sizeBytes_arg); + + /** \brief Downloads data from internal buffer to CPU memory + * \param host_ptr_arg: pointer to buffer to download + * */ + void download(void *host_ptr_arg) const; + + /** \brief Performs swap of data pointed with another device memory. + * \param other: device memory to swap with + * */ + void swap(DeviceMemory& other_arg); + + /** \brief Returns pointer for internal buffer in GPU memory. */ + template T* ptr(); + + /** \brief Returns constant pointer for internal buffer in GPU memory. */ + template const T* ptr() const; + + /** \brief Conversion to PtrSz for passing to kernel functions. */ + template operator PtrSz() const; + + /** \brief Returns true if unallocated otherwise false. */ + bool empty() const; + + size_t sizeBytes() const; + + private: + /** \brief Device pointer. */ + void *data_; + + /** \brief Allocated size in bytes. */ + size_t sizeBytes_; + + /** \brief Pointer to reference counter in CPU memory. */ + int* refcount_; + }; + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** \brief @b DeviceMemory2D class + * + * \note This is a BLOB container class with reference counting for pitched GPU memory. + * + * \author Anatoly Baksheev + */ + + class KF_EXPORTS DeviceMemory2D + { + public: + /** \brief Empty constructor. */ + DeviceMemory2D(); + + /** \brief Destructor. */ + ~DeviceMemory2D(); + + /** \brief Allocates internal buffer in GPU memory + * \param rows_arg: number of rows to allocate + * \param colsBytes_arg: width of the buffer in bytes + * */ + DeviceMemory2D(int rows_arg, int colsBytes_arg); + + + /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. + * \param rows_arg: number of rows + * \param colsBytes_arg: width of the buffer in bytes + * \param data_arg: pointer to buffer + * \param stepBytes_arg: stride between two consecutive rows in bytes + * */ + DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg); + + /** \brief Copy constructor. Just increments reference counter. */ + DeviceMemory2D(const DeviceMemory2D& other_arg); + + /** \brief Assigment operator. Just increments reference counter. */ + DeviceMemory2D& operator=(const DeviceMemory2D& other_arg); + + /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. + * \param ptr_arg: number of rows to allocate + * \param sizeBytes_arg: width of the buffer in bytes + * */ + void create(int rows_arg, int colsBytes_arg); + + /** \brief Decrements reference counter and releases internal buffer if needed. */ + void release(); + + /** \brief Performs data copying. If destination size differs it will be reallocated. + * \param other_arg: destination container + * */ + void copyTo(DeviceMemory2D& other) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 
+ * \param host_ptr_arg: pointer to host buffer to upload + * \param host_step_arg: stride between two consecutive rows in bytes for host buffer + * \param rows_arg: number of rows to upload + * \param sizeBytes_arg: width of host buffer in bytes + * */ + void upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg); + + /** \brief Downloads data from internal buffer to CPU memory. User is resposible for correct host buffer size. + * \param host_ptr_arg: pointer to host buffer to download + * \param host_step_arg: stride between two consecutive rows in bytes for host buffer + * */ + void download(void *host_ptr_arg, size_t host_step_arg) const; + + /** \brief Performs swap of data pointed with another device memory. + * \param other: device memory to swap with + * */ + void swap(DeviceMemory2D& other_arg); + + /** \brief Returns pointer to given row in internal buffer. + * \param y_arg: row index + * */ + template T* ptr(int y_arg = 0); + + /** \brief Returns constant pointer to given row in internal buffer. + * \param y_arg: row index + * */ + template const T* ptr(int y_arg = 0) const; + + /** \brief Conversion to PtrStep for passing to kernel functions. */ + template operator PtrStep() const; + + /** \brief Conversion to PtrStepSz for passing to kernel functions. */ + template operator PtrStepSz() const; + + /** \brief Returns true if unallocated otherwise false. */ + bool empty() const; + + /** \brief Returns number of bytes in each row. */ + int colsBytes() const; + + /** \brief Returns number of rows. */ + int rows() const; + + /** \brief Returns stride between two consecutive rows in bytes for internal buffer. Step is stored always and everywhere in bytes!!! */ + size_t step() const; + private: + /** \brief Device pointer. */ + void *data_; + + /** \brief Stride between two consecutive rows in bytes for internal buffer. Step is stored always and everywhere in bytes!!! */ + size_t step_; + + /** \brief Width of the buffer in bytes. */ + int colsBytes_; + + /** \brief Number of rows. */ + int rows_; + + /** \brief Pointer to reference counter in CPU memory. 
*/ + int* refcount_; + }; + } + + namespace device + { + using kfusion::cuda::DeviceMemory; + using kfusion::cuda::DeviceMemory2D; + } +} + +///////////////////// Inline implementations of DeviceMemory //////////////////////////////////////////// + +template inline T* kfusion::cuda::DeviceMemory::ptr() { return ( T*)data_; } +template inline const T* kfusion::cuda::DeviceMemory::ptr() const { return (const T*)data_; } + +template inline kfusion::cuda::DeviceMemory::operator kfusion::cuda::PtrSz() const +{ + PtrSz result; + result.data = (U*)ptr(); + result.size = sizeBytes_/sizeof(U); + return result; +} + +///////////////////// Inline implementations of DeviceMemory2D //////////////////////////////////////////// + +template T* kfusion::cuda::DeviceMemory2D::ptr(int y_arg) { return ( T*)(( char*)data_ + y_arg * step_); } +template const T* kfusion::cuda::DeviceMemory2D::ptr(int y_arg) const { return (const T*)((const char*)data_ + y_arg * step_); } + +template kfusion::cuda::DeviceMemory2D::operator kfusion::cuda::PtrStep() const +{ + PtrStep result; + result.data = (U*)ptr(); + result.step = step_; + return result; +} + +template kfusion::cuda::DeviceMemory2D::operator kfusion::cuda::PtrStepSz() const +{ + PtrStepSz result; + result.data = (U*)ptr(); + result.step = step_; + result.cols = colsBytes_/sizeof(U); + result.rows = rows_; + return result; +} diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/imgproc.hpp b/modules/dynamicfusion/kfusion/include/kfusion/cuda/imgproc.hpp new file mode 100644 index 00000000000..69f21f2d7d3 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/cuda/imgproc.hpp @@ -0,0 +1,35 @@ +#pragma once + +#include + +namespace kfusion +{ + namespace cuda + { + KF_EXPORTS void depthBilateralFilter(const Depth& in, Depth& out, int ksz, float sigma_spatial, float sigma_depth); + + KF_EXPORTS void depthTruncation(Depth& depth, float threshold); + + KF_EXPORTS void depthBuildPyramid(const Depth& depth, Depth& pyramid, float sigma_depth); + + KF_EXPORTS void computeNormalsAndMaskDepth(const Intr& intr, Depth& depth, Normals& normals); + + KF_EXPORTS void computePointNormals(const Intr& intr, const Depth& depth, Cloud& points, Normals& normals); + + KF_EXPORTS void computeDists(const Depth& depth, Dists& dists, const Intr& intr); + + KF_EXPORTS void cloudToDepth(const Cloud& cloud, Depth& depth); + + KF_EXPORTS void resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out); + + KF_EXPORTS void resizePointsNormals(const Cloud& points, const Normals& normals, Cloud& points_out, Normals& normals_out); + + KF_EXPORTS void waitAllDefaultStream(); + + KF_EXPORTS void renderTangentColors(const Normals& normals, Image& image); + + KF_EXPORTS void renderImage(const Depth& depth, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); + + KF_EXPORTS void renderImage(const Cloud& points, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); + } +} diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/kernel_containers.hpp b/modules/dynamicfusion/kfusion/include/kfusion/cuda/kernel_containers.hpp new file mode 100644 index 00000000000..485f93bc0e8 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/cuda/kernel_containers.hpp @@ -0,0 +1,75 @@ +#pragma once + +#if defined(__CUDACC__) + #define __kf_hdevice__ __host__ __device__ __forceinline__ + #define __kf_device__ __device__ __forceinline__ +#else + #define __kf_hdevice__ + #define 
__kf_device__ +#endif + +#include + +namespace kfusion +{ + namespace cuda + { + template struct DevPtr + { + typedef T elem_type; + const static size_t elem_size = sizeof(elem_type); + + T* data; + + __kf_hdevice__ DevPtr() : data(0) {} + __kf_hdevice__ DevPtr(T* data_arg) : data(data_arg) {} + + __kf_hdevice__ size_t elemSize() const { return elem_size; } + __kf_hdevice__ operator T*() { return data; } + __kf_hdevice__ operator const T*() const { return data; } + }; + + template struct PtrSz : public DevPtr + { + __kf_hdevice__ PtrSz() : size(0) {} + __kf_hdevice__ PtrSz(T* data_arg, size_t size_arg) : DevPtr(data_arg), size(size_arg) {} + + size_t size; + }; + + template struct PtrStep : public DevPtr + { + __kf_hdevice__ PtrStep() : step(0) {} + __kf_hdevice__ PtrStep(T* data_arg, size_t step_arg) : DevPtr(data_arg), step(step_arg) {} + + /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */ + size_t step; + + __kf_hdevice__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr::data + y * step); } + __kf_hdevice__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr::data + y * step); } + + __kf_hdevice__ T& operator()(int y, int x) { return ptr(y)[x]; } + __kf_hdevice__ const T& operator()(int y, int x) const { return ptr(y)[x]; } + }; + + template struct PtrStepSz : public PtrStep + { + __kf_hdevice__ PtrStepSz() : cols(0), rows(0) {} + __kf_hdevice__ PtrStepSz(int rows_arg, int cols_arg, T* data_arg, size_t step_arg) + : PtrStep(data_arg, step_arg), cols(cols_arg), rows(rows_arg) {} + + int cols; + int rows; + }; + } + + namespace device + { + using kfusion::cuda::PtrSz; + using kfusion::cuda::PtrStep; + using kfusion::cuda::PtrStepSz; + } +} + +namespace kf = kfusion; + diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/projective_icp.hpp b/modules/dynamicfusion/kfusion/include/kfusion/cuda/projective_icp.hpp new file mode 100644 index 00000000000..1b71d8c436b --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/cuda/projective_icp.hpp @@ -0,0 +1,48 @@ +#pragma once + +#include + +namespace kfusion +{ + namespace cuda + { + class ProjectiveICP + { + public: + enum { MAX_PYRAMID_LEVELS = 4 }; + + typedef std::vector DepthPyr; + typedef std::vector PointsPyr; + typedef std::vector NormalsPyr; + + ProjectiveICP(); + virtual ~ProjectiveICP(); + + float getDistThreshold() const; + void setDistThreshold(float distance); + + float getAngleThreshold() const; + void setAngleThreshold(float angle); + + void setIterationsNum(const std::vector& iters); + int getUsedLevelsNum() const; + + virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const Frame& curr, const Frame& prev); + + /** The function takes masked depth, i.e. 
it assumes for performance reasons that + * "if depth(y,x) is not zero, then normals(y,x) surely is not qnan" */ + virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const DepthPyr& dcurr, const NormalsPyr ncurr, const DepthPyr dprev, const NormalsPyr nprev); + virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const PointsPyr& vcurr, const NormalsPyr ncurr, const PointsPyr vprev, const NormalsPyr nprev); + + //static Vec3f rodrigues2(const Mat3f& matrix); + private: + std::vector iters_; + float angle_thres_; + float dist_thres_; + DeviceArray2D buffer_; + + struct StreamHelper; + cv::Ptr shelp_; + }; + } +} diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/tsdf_volume.hpp b/modules/dynamicfusion/kfusion/include/kfusion/cuda/tsdf_volume.hpp new file mode 100644 index 00000000000..fc74ee86681 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/cuda/tsdf_volume.hpp @@ -0,0 +1,102 @@ +#pragma once + +#include +#include + +namespace kfusion +{ + class WarpField; + namespace cuda + { + class KF_EXPORTS TsdfVolume + { + public: + TsdfVolume(const cv::Vec3i& dims); + virtual ~TsdfVolume(); + + void create(const Vec3i& dims); + + Vec3i getDims() const; + Vec3f getVoxelSize() const; + + const CudaData data() const; + CudaData data(); + + cv::Mat get_cloud_host() const; + cv::Mat get_normal_host() const; + + cv::Mat* get_cloud_host_ptr() const; + cv::Mat* get_normal_host_ptr() const; + + Vec3f getSize() const; + void setSize(const Vec3f& size); + + float getTruncDist() const; + void setTruncDist(float distance); + + int getMaxWeight() const; + void setMaxWeight(int weight); + + Affine3f getPose() const; + void setPose(const Affine3f& pose); + + float getRaycastStepFactor() const; + void setRaycastStepFactor(float factor); + + float getGradientDeltaFactor() const; + void setGradientDeltaFactor(float factor); + + Vec3i getGridOrigin() const; + void setGridOrigin(const Vec3i& origin); + + std::vector psdf(const std::vector& warped, Dists& depth_img, const Intr& intr); +// float psdf(const std::vector& warped, Dists& dists, const Intr& intr); + float weighting(const std::vector& dist_sqr, int k) const; + void surface_fusion(const WarpField& warp_field, + std::vector warped, + std::vector canonical, + cuda::Depth &depth, + const Affine3f& camera_pose, + const Intr& intr); + + virtual void clear(); + virtual void applyAffine(const Affine3f& affine); + virtual void integrate(const Dists& dists, const Affine3f& camera_pose, const Intr& intr); + virtual void raycast(const Affine3f& camera_pose, const Intr& intr, Depth& depth, Normals& normals); + virtual void raycast(const Affine3f& camera_pose, const Intr& intr, Cloud& points, Normals& normals); + + void swap(CudaData& data); + + DeviceArray fetchCloud(DeviceArray& cloud_buffer) const; + void fetchNormals(const DeviceArray& cloud, DeviceArray& normals) const; + void compute_points(); + void compute_normals(); + + + private: + CudaData data_; +// need to make this smart pointers + cuda::DeviceArray *cloud_buffer; + cuda::DeviceArray *cloud; + cuda::DeviceArray *normal_buffer; + cv::Mat *cloud_host; + cv::Mat *normal_host; + + float trunc_dist_; + float max_weight_; + Vec3i dims_; + Vec3f size_; + Affine3f pose_; + float gradient_delta_factor_; + float raycast_step_factor_; + // TODO: remember to add entry when adding a new node + struct Entry + { + float tsdf_value; + float tsdf_weight; + }; + + std::vector tsdf_entries; + }; + } +} diff --git 
a/modules/dynamicfusion/kfusion/include/kfusion/exports.hpp b/modules/dynamicfusion/kfusion/include/kfusion/exports.hpp new file mode 100644 index 00000000000..43fdf142d90 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/exports.hpp @@ -0,0 +1,7 @@ +#pragma once + +#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined KFUSION_API_EXPORTS + #define KF_EXPORTS __declspec(dllexport) +#else + #define KF_EXPORTS +#endif diff --git a/modules/dynamicfusion/kfusion/include/kfusion/kinfu.hpp b/modules/dynamicfusion/kfusion/include/kfusion/kinfu.hpp new file mode 100644 index 00000000000..9c2abdc3c42 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/kinfu.hpp @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace kfusion +{ + namespace cuda + { + KF_EXPORTS int getCudaEnabledDeviceCount(); + KF_EXPORTS void setDevice(int device); + KF_EXPORTS std::string getDeviceName(int device); + KF_EXPORTS bool checkIfPreFermiGPU(int device); + KF_EXPORTS void printCudaDeviceInfo(int device); + KF_EXPORTS void printShortCudaDeviceInfo(int device); + } + + struct KF_EXPORTS KinFuParams + { + static KinFuParams default_params(); + static KinFuParams default_params_dynamicfusion(); + + int cols; //pixels + int rows; //pixels + + Intr intr; //Camera parameters + + Vec3i volume_dims; //number of voxels + Vec3f volume_size; //meters + Affine3f volume_pose; //meters, inital pose + + float bilateral_sigma_depth; //meters + float bilateral_sigma_spatial; //pixels + int bilateral_kernel_size; //pixels + + float icp_truncate_depth_dist; //meters + float icp_dist_thres; //meters + float icp_angle_thres; //radians + std::vector icp_iter_num; //iterations for level index 0,1,..,3 + + float tsdf_min_camera_movement; //meters, integrate only if exceedes + float tsdf_trunc_dist; //meters; + int tsdf_max_weight; //frames + + float raycast_step_factor; // in voxel sizes + float gradient_delta_factor; // in voxel sizes + + Vec3f light_pose; //meters + + }; + + class KF_EXPORTS KinFu + { + public: + typedef cv::Ptr Ptr; + + KinFu(const KinFuParams& params); + + const KinFuParams& params() const; + KinFuParams& params(); + + const cuda::TsdfVolume& tsdf() const; + cuda::TsdfVolume& tsdf(); + + const cuda::ProjectiveICP& icp() const; + cuda::ProjectiveICP& icp(); + + const WarpField& getWarp() const; + WarpField& getWarp(); + + void reset(); + + bool operator()(const cuda::Depth& depth, const cuda::Image& image = cuda::Image()); + + void renderImage(cuda::Image& image, int flags = 0); + void dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame, cuda::Normals current_normals); + void renderImage(cuda::Image& image, const Affine3f& pose, int flags = 0); + void reprojectToDepth(); + + Affine3f getCameraPose (int time = -1) const; + private: + void allocate_buffers(); + + int frame_counter_; + KinFuParams params_; + + std::vector poses_; + + cuda::Dists dists_; + cuda::Frame curr_, prev_, first_; + + cuda::Cloud points_; + cuda::Normals normals_; + cuda::Depth depths_; + + cv::Ptr volume_; + cv::Ptr icp_; + cv::Ptr warp_; + std::vector, utils::DualQuaternion>> edges; + }; +} diff --git a/modules/dynamicfusion/kfusion/include/kfusion/optimisation.hpp b/modules/dynamicfusion/kfusion/include/kfusion/optimisation.hpp new file mode 100644 index 00000000000..7f088d8f7c1 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/optimisation.hpp @@ -0,0 +1,168 @@ +#ifndef KFUSION_OPTIMISATION_H +#define 
KFUSION_OPTIMISATION_H +#include "ceres/ceres.h" +#include "ceres/rotation.h" +#include + +typedef Eigen::Vector3d Vec3; +struct DynamicFusionDataEnergy +{ + DynamicFusionDataEnergy(cv::Vec3d live_vertex, + cv::Vec3f live_normal, + cv::Vec3f canonical_vertex, + cv::Vec3f canonical_normal, + kfusion::WarpField* warpField) + : live_vertex_(live_vertex), + live_normal_(live_normal), + canonical_vertex_(canonical_vertex), + canonical_normal_(canonical_normal), + warpField_(warpField) {} + template + bool operator()(T const * const * epsilon_, T* residuals) const + { + T const * epsilon = epsilon_[0]; + float weights[KNN_NEIGHBOURS]; + warpField_->getWeightsAndUpdateKNN(canonical_vertex_, weights); + auto nodes = warpField_->getNodes(); + T total_quaternion[4] = {T(0), T(0), T(0), T(0)}; + T total_translation[3] = {T(0), T(0), T(0)}; + for(int i = 0; i < KNN_NEIGHBOURS; i++) + { + int ret_index_i = warpField_->ret_index[i]; + auto quat = weights[i] * nodes->at(ret_index_i).transform; + + T eps_r[3] = {epsilon[ret_index_i],epsilon[ret_index_i + 1],epsilon[ret_index_i + 2]}; + T eps_t[3] = {epsilon[ret_index_i + 3],epsilon[ret_index_i + 4],epsilon[ret_index_i + 5]}; + float temp[3]; + auto r_quat = quat.getRotation(); + T r[4] = { T(r_quat.w_), T(r_quat.x_), T(r_quat.y_), T(r_quat.z_)}; + quat.getTranslation(temp[0], temp[1], temp[2]); + + + T eps_quaternion[4]; + ceres::AngleAxisToQuaternion(eps_r, eps_quaternion); + T product[4]; + + ceres::QuaternionProduct(eps_quaternion, r, product); + + total_quaternion[0] += product[0]; + total_quaternion[1] += product[1]; + total_quaternion[2] += product[2]; + total_quaternion[3] += product[3]; + + //probably wrong, should do like in quaternion multiplication. + total_translation[0] += T(temp[0]) + eps_t[0]; + total_translation[1] += T(temp[1]) + eps_t[1]; + total_translation[2] += T(temp[2]) + eps_t[2]; + + } + + + T predicted_x, predicted_y, predicted_z; + T point[3]; + T predicted[3]; + ceres::QuaternionRotatePoint(total_quaternion, point, predicted); + predicted_x = predicted[0] + total_translation[0]; + predicted_y = predicted[1] + total_translation[1]; + predicted_z = predicted[2] + total_translation[2]; + +// T normal[3] = {T(canonical_normal_[0]),T(canonical_normal_[1]),T(canonical_normal_[2])}; +// T result[3]; + // The error is the difference between the predicted and observed position. + residuals[0] = predicted_x - live_vertex_[0]; + residuals[1] = predicted_y - live_vertex_[1]; + residuals[2] = predicted_z - live_vertex_[2]; +// T dotProd = ceres::DotProduct(residuals, normal); +// +// residuals[0] = tukeyPenalty(dotProd); +// result[0] = predicted_x - live_vertex_[0]; +// result[1] = predicted_y - live_vertex_[1]; +// result[2] = predicted_z - live_vertex_[2]; +// T dotProd = ceres::DotProduct(residuals, normal); +// +// residuals[0] = tukeyPenalty(dotProd); + return true; + } + +/** + * Tukey loss function as described in http://web.as.uky.edu/statistics/users/pbreheny/764-F11/notes/12-1.pdf + * \param x + * \param c + * \return + * + * \note + * The value c = 4.685 is usually used for this loss function, and + * it provides an asymptotic efficiency 95% that of linear + * regression for the normal distribution + * + * In the paper, a value of 0.01 is suggested for c + */ + template + T tukeyPenalty(T x, T c = T(0.01)) const + { + return ceres::abs(x) <= c ? x * ceres::pow((T(1.0) - (x * x) / (c * c)), 2) : T(0.0); + } + + // Factory to hide the construction of the CostFunction object from + // the client code. 
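+//
+// A minimal usage sketch of the Create() factory below (illustrative only; it assumes the
+// caller already holds a kfusion::WarpField* warp and matched live/canonical vertices and
+// normals, e.g. the data gathered by WarpField::energy_data):
+//
+//     ceres::Problem problem;
+//     WarpProblem warp_problem(warp);
+//     ceres::CostFunction* cost = DynamicFusionDataEnergy::Create(live_vertex, live_normal,
+//                                                                 canonical_vertex, canonical_normal,
+//                                                                 warp);
+//     // single parameter block: 6 values (3 rotation + 3 translation) per deformation node
+//     problem.AddResidualBlock(cost, nullptr, warp_problem.mutable_params());
+//     ceres::Solver::Options options;
+//     ceres::Solver::Summary summary;
+//     ceres::Solve(options, &problem, &summary);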
+// TODO: this will only have one residual at the end, remember to change +// TODO: find out how to sum residuals + static ceres::CostFunction* Create(const cv::Vec3d live_vertex, + const cv::Vec3d live_normal, + const cv::Vec3f canonical_vertex, + const cv::Vec3f canonical_normal, + kfusion::WarpField* warpField) { + { + auto cost_function = new ceres::DynamicAutoDiffCostFunction( + new DynamicFusionDataEnergy(live_vertex, live_normal, canonical_vertex, canonical_normal, warpField)); + cost_function->AddParameterBlock(warpField->getNodes()->size() * 6); + cost_function->SetNumResiduals(3); + return cost_function; + } + } + const cv::Vec3d live_vertex_; + const cv::Vec3d live_normal_; + const cv::Vec3f canonical_vertex_; + const cv::Vec3f canonical_normal_; + kfusion::WarpField* warpField_; +}; + +class WarpProblem { +public: + WarpProblem(kfusion::WarpField* warp) : warpField_(warp) + { + parameters_ = new double[warpField_->getNodes()->size() * 6]; + mutable_epsilon_ = new double*[KNN_NEIGHBOURS * 6]; + }; + ~WarpProblem() { + delete[] parameters_; + for(int i = 0; i < KNN_NEIGHBOURS * 6; i++) + delete[] mutable_epsilon_[i]; + delete[] mutable_epsilon_; + } + double **mutable_epsilon(int *index_list) + { + for(int i = 0; i < KNN_NEIGHBOURS; i++) + for(int j = 0; j < 6; j++) + mutable_epsilon_[i * 6 + j] = &(parameters_[index_list[i] + j]); + return mutable_epsilon_; + } + double *mutable_params() + { + return parameters_; + } + + +private: + double **mutable_epsilon_; + double *parameters_; + + kfusion::WarpField* warpField_; +}; + +class Optimisation { + +}; + + +#endif //KFUSION_OPTIMISATION_H diff --git a/modules/dynamicfusion/kfusion/include/kfusion/types.hpp b/modules/dynamicfusion/kfusion/include/kfusion/types.hpp new file mode 100644 index 00000000000..b99585dc105 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/types.hpp @@ -0,0 +1,100 @@ +#pragma once + +#include +#include +#include +#include +#include + +struct CUevent_st; + +namespace kfusion +{ + typedef cv::Matx33f Mat3f; + typedef cv::Matx44f Mat4f; + typedef cv::Vec3f Vec3f; + typedef cv::Vec4f Vec4f; + typedef cv::Vec3i Vec3i; + typedef cv::Affine3f Affine3f; + + struct KF_EXPORTS Intr + { + float fx, fy, cx, cy; + + Intr (); + Intr (float fx, float fy, float cx, float cy); + Intr operator()(int level_index) const; + }; + + KF_EXPORTS std::ostream& operator << (std::ostream& os, const Intr& intr); + + struct Point + { + union + { + float data[4]; + struct { float x, y, z; }; + }; + }; + + typedef Point Normal; + + struct RGB + { + union + { + struct { unsigned char b, g, r; }; + int bgra; + }; + }; + + struct PixelRGB + { + unsigned char r, g, b; + }; + + namespace cuda + { + typedef cuda::DeviceMemory CudaData; + typedef cuda::DeviceArray2D Depth; + typedef cuda::DeviceArray2D Dists; + typedef cuda::DeviceArray2D Image; + typedef cuda::DeviceArray2D Normals; + typedef cuda::DeviceArray2D Cloud; + + struct Frame + { + bool use_points; + + std::vector depth_pyr; + std::vector points_pyr; + std::vector normals_pyr; + }; + } + + inline float deg2rad (float alpha) { return alpha * 0.017453293f; } + + struct KF_EXPORTS ScopeTime + { + const char* name; + double start; + ScopeTime(const char *name); + ~ScopeTime(); + }; + + struct KF_EXPORTS SampledScopeTime + { + public: + enum { EACH = 33 }; + SampledScopeTime(double& time_ms); + ~SampledScopeTime(); + private: + double getTime(); + SampledScopeTime(const SampledScopeTime&); + SampledScopeTime& operator=(const SampledScopeTime&); + + double& time_ms_; + 
double start; + }; + +} diff --git a/modules/dynamicfusion/kfusion/include/kfusion/warp_field.hpp b/modules/dynamicfusion/kfusion/include/kfusion/warp_field.hpp new file mode 100644 index 00000000000..63faa1f23c8 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/kfusion/warp_field.hpp @@ -0,0 +1,102 @@ +#ifndef KFUSION_WARP_FIELD_HPP +#define KFUSION_WARP_FIELD_HPP + +/** + * \brief + * \details + */ +#include +#include +#include +#include +#include +#define KNN_NEIGHBOURS 8 + +namespace kfusion +{ + typedef nanoflann::KDTreeSingleIndexAdaptor< + nanoflann::L2_Simple_Adaptor, + utils::PointCloud, + 3 /* dim */ + > kd_tree_t; + + + // TODO: remember to rewrite this with proper doxygen formatting (e.g rather than _ + /*! + * \struct node + * \brief A node of the warp field + * \details The state of the warp field Wt at time t is defined by the values of a set of n + * deformation nodes Nt_warp = {dg_v, dg_w, dg_se3}_t. Here, this is represented as follows + * + * \var node::index + * Index of the node in the canonical frame. Equivalent to dg_v + * + * \var node::transform + * Transform from canonical point to warped point, equivalent to dg_se in the paper. + * + * \var node::weight + * Equivalent to dg_w + */ + struct deformation_node + { + Vec3f vertex; + kfusion::utils::DualQuaternion transform; + float weight = 0; + }; + class WarpField + { + public: + WarpField(); + ~WarpField(); + + void init(const cv::Mat& first_frame, const cv::Mat& normals); + void init(const std::vector& first_frame, const std::vector& normals); + void energy(const cuda::Cloud &frame, + const cuda::Normals &normals, + const Affine3f &pose, + const cuda::TsdfVolume &tsdfVolume, + const std::vector, + kfusion::utils::DualQuaternion>> &edges + ); + + float energy_data(const std::vector &canonical_vertices, + const std::vector &canonical_normals, + const std::vector &live_vertices, + const std::vector &live_normals); + void energy_reg(const std::vector, + kfusion::utils::DualQuaternion>> &edges); + + float tukeyPenalty(float x, float c = 0.01) const; + + float huberPenalty(float a, float delta) const; + + void warp(std::vector& points, std::vector& normals) const; + void warp(cuda::Cloud& points) const; + + utils::DualQuaternion DQB(const Vec3f& vertex) const; + utils::DualQuaternion DQB(const Vec3f& vertex, double epsilon[KNN_NEIGHBOURS * 6]) const; + + void getWeightsAndUpdateKNN(const Vec3f& vertex, float weights[KNN_NEIGHBOURS]); + + float weighting(float squared_dist, float weight) const; + void KNN(Vec3f point) const; + + void clear(); + + const std::vector* getNodes() const; + const cv::Mat getNodesAsMat() const; + void setWarpToLive(const Affine3f &pose); + + + std::vector out_dist_sqr; + std::vector ret_index; + + private: + std::vector* nodes; + kd_tree_t* index; + nanoflann::KNNResultSet *resultSet; + Affine3f warp_to_live; + void buildKDTree(); + }; +} +#endif //KFUSION_WARP_FIELD_HPP diff --git a/modules/dynamicfusion/kfusion/include/nanoflann/nanoflann.hpp b/modules/dynamicfusion/kfusion/include/nanoflann/nanoflann.hpp new file mode 100755 index 00000000000..445aa2998c3 --- /dev/null +++ b/modules/dynamicfusion/kfusion/include/nanoflann/nanoflann.hpp @@ -0,0 +1,1398 @@ +/*********************************************************************** + * Software License Agreement (BSD License) + * + * Copyright 2008-2009 Marius Muja (mariusm@cs.ubc.ca). All rights reserved. + * Copyright 2008-2009 David G. Lowe (lowe@cs.ubc.ca). All rights reserved. 
+ * Copyright 2011-2016 Jose Luis Blanco (joseluisblancoc@gmail.com). + * All rights reserved. + * + * THE BSD LICENSE + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + *************************************************************************/ + +/** \mainpage nanoflann C++ API documentation + * nanoflann is a C++ header-only library for building KD-Trees, mostly + * optimized for 2D or 3D point clouds. + * + * nanoflann does not require compiling or installing, just an + * #include in your code. + * + * See: + * - C++ API organized by modules + * - Online README + * - Doxygen documentation + */ + +#ifndef NANOFLANN_HPP_ +#define NANOFLANN_HPP_ + +#include +#include +#include +#include +#include // for fwrite() +#include // for abs() +#include // for abs() +#include + +// Avoid conflicting declaration of min/max macros in windows headers +#if !defined(NOMINMAX) && (defined(_WIN32) || defined(_WIN32_) || defined(WIN32) || defined(_WIN64)) +# define NOMINMAX +# ifdef max +# undef max +# undef min +# endif +#endif + +namespace nanoflann +{ +/** @addtogroup nanoflann_grp nanoflann C++ library for ANN + * @{ */ + + /** Library version: 0xMmP (M=Major,m=minor,P=patch) */ + #define NANOFLANN_VERSION 0x123 + + /** @addtogroup result_sets_grp Result set classes + * @{ */ + template + class KNNResultSet + { + IndexType * indices; + DistanceType* dists; + CountType capacity; + CountType count; + + public: + inline KNNResultSet(CountType capacity_) : indices(0), dists(0), capacity(capacity_), count(0) + { + } + + inline void init(IndexType* indices_, DistanceType* dists_) + { + indices = indices_; + dists = dists_; + count = 0; + if (capacity) + dists[capacity-1] = (std::numeric_limits::max)(); + } + + inline CountType size() const + { + return count; + } + + inline bool full() const + { + return count == capacity; + } + + + inline void addPoint(DistanceType dist, IndexType index) + { + CountType i; + for (i=count; i>0; --i) { +#ifdef NANOFLANN_FIRST_MATCH // If defined and two points have the same distance, the one with the lowest-index will be returned first. 
+ if ( (dists[i-1]>dist) || ((dist==dists[i-1])&&(indices[i-1]>index)) ) { +#else + if (dists[i-1]>dist) { +#endif + if (i + class RadiusResultSet + { + public: + const DistanceType radius; + + std::vector >& m_indices_dists; + + inline RadiusResultSet(DistanceType radius_, std::vector >& indices_dists) : radius(radius_), m_indices_dists(indices_dists) + { + init(); + } + + inline ~RadiusResultSet() { } + + inline void init() { clear(); } + inline void clear() { m_indices_dists.clear(); } + + inline size_t size() const { return m_indices_dists.size(); } + + inline bool full() const { return true; } + + inline void addPoint(DistanceType dist, IndexType index) + { + if (dist 0 + */ + std::pair worst_item() const + { + if (m_indices_dists.empty()) throw std::runtime_error("Cannot invoke RadiusResultSet::worst_item() on an empty list of results."); + typedef typename std::vector >::const_iterator DistIt; + DistIt it = std::max_element(m_indices_dists.begin(), m_indices_dists.end()); + return *it; + } + }; + + /** operator "<" for std::sort() */ + struct IndexDist_Sorter + { + /** PairType will be typically: std::pair */ + template + inline bool operator()(const PairType &p1, const PairType &p2) const { + return p1.second < p2.second; + } + }; + + /** @} */ + + + /** @addtogroup loadsave_grp Load/save auxiliary functions + * @{ */ + template + void save_value(FILE* stream, const T& value, size_t count = 1) + { + fwrite(&value, sizeof(value),count, stream); + } + + template + void save_value(FILE* stream, const std::vector& value) + { + size_t size = value.size(); + fwrite(&size, sizeof(size_t), 1, stream); + fwrite(&value[0], sizeof(T), size, stream); + } + + template + void load_value(FILE* stream, T& value, size_t count = 1) + { + size_t read_cnt = fread(&value, sizeof(value), count, stream); + if (read_cnt != count) { + throw std::runtime_error("Cannot read from file"); + } + } + + + template + void load_value(FILE* stream, std::vector& value) + { + size_t size; + size_t read_cnt = fread(&size, sizeof(size_t), 1, stream); + if (read_cnt!=1) { + throw std::runtime_error("Cannot read from file"); + } + value.resize(size); + read_cnt = fread(&value[0], sizeof(T), size, stream); + if (read_cnt!=size) { + throw std::runtime_error("Cannot read from file"); + } + } + /** @} */ + + + /** @addtogroup metric_grp Metric (distance) classes + * @{ */ + + /** Manhattan distance functor (generic version, optimized for high-dimensionality data sets). + * Corresponding distance traits: nanoflann::metric_L1 + * \tparam T Type of the elements (e.g. double, float, uint8_t) + * \tparam _DistanceType Type of distance variables (must be signed) (e.g. float, double, int64_t) + */ + template + struct L1_Adaptor + { + typedef T ElementType; + typedef _DistanceType DistanceType; + + const DataSource &data_source; + + L1_Adaptor(const DataSource &_data_source) : data_source(_data_source) { } + + inline DistanceType operator()(const T* a, const size_t b_idx, size_t size, DistanceType worst_dist = -1) const + { + DistanceType result = DistanceType(); + const T* last = a + size; + const T* lastgroup = last - 3; + size_t d = 0; + + /* Process 4 items with each loop for efficiency. 
*/ + while (a < lastgroup) { + const DistanceType diff0 = std::abs(a[0] - data_source.kdtree_get_pt(b_idx,d++)); + const DistanceType diff1 = std::abs(a[1] - data_source.kdtree_get_pt(b_idx,d++)); + const DistanceType diff2 = std::abs(a[2] - data_source.kdtree_get_pt(b_idx,d++)); + const DistanceType diff3 = std::abs(a[3] - data_source.kdtree_get_pt(b_idx,d++)); + result += diff0 + diff1 + diff2 + diff3; + a += 4; + if ((worst_dist>0)&&(result>worst_dist)) { + return result; + } + } + /* Process last 0-3 components. Not needed for standard vector lengths. */ + while (a < last) { + result += std::abs( *a++ - data_source.kdtree_get_pt(b_idx,d++) ); + } + return result; + } + + template + inline DistanceType accum_dist(const U a, const V b, int ) const + { + return std::abs(a-b); + } + }; + + /** Squared Euclidean distance functor (generic version, optimized for high-dimensionality data sets). + * Corresponding distance traits: nanoflann::metric_L2 + * \tparam T Type of the elements (e.g. double, float, uint8_t) + * \tparam _DistanceType Type of distance variables (must be signed) (e.g. float, double, int64_t) + */ + template + struct L2_Adaptor + { + typedef T ElementType; + typedef _DistanceType DistanceType; + + const DataSource &data_source; + + L2_Adaptor(const DataSource &_data_source) : data_source(_data_source) { } + + inline DistanceType operator()(const T* a, const size_t b_idx, size_t size, DistanceType worst_dist = -1) const + { + DistanceType result = DistanceType(); + const T* last = a + size; + const T* lastgroup = last - 3; + size_t d = 0; + + /* Process 4 items with each loop for efficiency. */ + while (a < lastgroup) { + const DistanceType diff0 = a[0] - data_source.kdtree_get_pt(b_idx,d++); + const DistanceType diff1 = a[1] - data_source.kdtree_get_pt(b_idx,d++); + const DistanceType diff2 = a[2] - data_source.kdtree_get_pt(b_idx,d++); + const DistanceType diff3 = a[3] - data_source.kdtree_get_pt(b_idx,d++); + result += diff0 * diff0 + diff1 * diff1 + diff2 * diff2 + diff3 * diff3; + a += 4; + if ((worst_dist>0)&&(result>worst_dist)) { + return result; + } + } + /* Process last 0-3 components. Not needed for standard vector lengths. */ + while (a < last) { + const DistanceType diff0 = *a++ - data_source.kdtree_get_pt(b_idx,d++); + result += diff0 * diff0; + } + return result; + } + + template + inline DistanceType accum_dist(const U a, const V b, int ) const + { + return (a-b)*(a-b); + } + }; + + /** Squared Euclidean (L2) distance functor (suitable for low-dimensionality datasets, like 2D or 3D point clouds) + * Corresponding distance traits: nanoflann::metric_L2_Simple + * \tparam T Type of the elements (e.g. double, float, uint8_t) + * \tparam _DistanceType Type of distance variables (must be signed) (e.g. 
float, double, int64_t) + */ + template + struct L2_Simple_Adaptor + { + typedef T ElementType; + typedef _DistanceType DistanceType; + + const DataSource &data_source; + + L2_Simple_Adaptor(const DataSource &_data_source) : data_source(_data_source) { } + + inline DistanceType operator()(const T* a, const size_t b_idx, size_t size) const { + return data_source.kdtree_distance(a,b_idx,size); + } + + template + inline DistanceType accum_dist(const U a, const V b, int ) const + { + return (a-b)*(a-b); + } + }; + + /** Metaprogramming helper traits class for the L1 (Manhattan) metric */ + struct metric_L1 { + template + struct traits { + typedef L1_Adaptor distance_t; + }; + }; + /** Metaprogramming helper traits class for the L2 (Euclidean) metric */ + struct metric_L2 { + template + struct traits { + typedef L2_Adaptor distance_t; + }; + }; + /** Metaprogramming helper traits class for the L2_simple (Euclidean) metric */ + struct metric_L2_Simple { + template + struct traits { + typedef L2_Simple_Adaptor distance_t; + }; + }; + + /** @} */ + + /** @addtogroup param_grp Parameter structs + * @{ */ + + /** Parameters (see README.md) */ + struct KDTreeSingleIndexAdaptorParams + { + KDTreeSingleIndexAdaptorParams(size_t _leaf_max_size = 10) : + leaf_max_size(_leaf_max_size) + {} + + size_t leaf_max_size; + }; + + /** Search options for KDTreeSingleIndexAdaptor::findNeighbors() */ + struct SearchParams + { + /** Note: The first argument (checks_IGNORED_) is ignored, but kept for compatibility with the FLANN interface */ + SearchParams(int checks_IGNORED_ = 32, float eps_ = 0, bool sorted_ = true ) : + checks(checks_IGNORED_), eps(eps_), sorted(sorted_) {} + + int checks; //!< Ignored parameter (Kept for compatibility with the FLANN interface). + float eps; //!< search for eps-approximate neighbours (default: 0) + bool sorted; //!< only for radius search, require neighbours sorted by distance (default: true) + }; + /** @} */ + + + /** @addtogroup memalloc_grp Memory allocation + * @{ */ + + /** + * Allocates (using C's malloc) a generic type T. + * + * Params: + * count = number of instances to allocate. + * Returns: pointer (of type T*) to memory buffer + */ + template + inline T* allocate(size_t count = 1) + { + T* mem = static_cast( ::malloc(sizeof(T)*count)); + return mem; + } + + + /** + * Pooled storage allocator + * + * The following routines allow for the efficient allocation of storage in + * small chunks from a specified pool. Rather than allowing each structure + * to be freed individually, an entire pool of storage is freed at once. + * This method has two advantages over just using malloc() and free(). First, + * it is far more efficient for allocating small objects, as there is + * no overhead for remembering all the information needed to free each + * object or consolidating fragmented memory. Second, the decision about + * how long to keep an object is made at the time of allocation, and there + * is no need to track down all the objects to free them. + * + */ + + const size_t WORDSIZE=16; + const size_t BLOCKSIZE=8192; + + class PooledAllocator + { + /* We maintain memory alignment to word boundaries by requiring that all + allocations be in multiples of the machine wordsize. */ + /* Size of machine word in bytes. Must be power of 2. */ + /* Minimum number of bytes requested at a time from the system. Must be multiple of WORDSIZE. */ + + + size_t remaining; /* Number of bytes left in current block of storage. */ + void* base; /* Pointer to base of current block of storage. 
*/ + void* loc; /* Current location in block to next allocate memory. */ + + void internal_init() + { + remaining = 0; + base = NULL; + usedMemory = 0; + wastedMemory = 0; + } + + public: + size_t usedMemory; + size_t wastedMemory; + + /** + Default constructor. Initializes a new pool. + */ + PooledAllocator() { + internal_init(); + } + + /** + * Destructor. Frees all the memory allocated in this pool. + */ + ~PooledAllocator() { + free_all(); + } + + /** Frees all allocated memory chunks */ + void free_all() + { + while (base != NULL) { + void *prev = *(static_cast( base)); /* Get pointer to prev block. */ + ::free(base); + base = prev; + } + internal_init(); + } + + /** + * Returns a pointer to a piece of new memory of the given size in bytes + * allocated from the pool. + */ + void* malloc(const size_t req_size) + { + /* Round size up to a multiple of wordsize. The following expression + only works for WORDSIZE that is a power of 2, by masking last bits of + incremented size to zero. + */ + const size_t size = (req_size + (WORDSIZE - 1)) & ~(WORDSIZE - 1); + + /* Check whether a new block must be allocated. Note that the first word + of a block is reserved for a pointer to the previous block. + */ + if (size > remaining) { + + wastedMemory += remaining; + + /* Allocate new storage. */ + const size_t blocksize = (size + sizeof(void*) + (WORDSIZE-1) > BLOCKSIZE) ? + size + sizeof(void*) + (WORDSIZE-1) : BLOCKSIZE; + + // use the standard C malloc to allocate memory + void* m = ::malloc(blocksize); + if (!m) { + fprintf(stderr,"Failed to allocate memory.\n"); + return NULL; + } + + /* Fill first word of new block with pointer to previous block. */ + static_cast(m)[0] = base; + base = m; + + size_t shift = 0; + //int size_t = (WORDSIZE - ( (((size_t)m) + sizeof(void*)) & (WORDSIZE-1))) & (WORDSIZE-1); + + remaining = blocksize - sizeof(void*) - shift; + loc = (static_cast(m) + sizeof(void*) + shift); + } + void* rloc = loc; + loc = static_cast(loc) + size; + remaining -= size; + + usedMemory += size; + + return rloc; + } + + /** + * Allocates (using this pool) a generic type T. + * + * Params: + * count = number of instances to allocate. + * Returns: pointer (of type T*) to memory buffer + */ + template + T* allocate(const size_t count = 1) + { + T* mem = static_cast(this->malloc(sizeof(T)*count)); + return mem; + } + + }; + /** @} */ + + /** @addtogroup nanoflann_metaprog_grp Auxiliary metaprogramming stuff + * @{ */ + + // ---------------- CArray ------------------------- + /** A STL container (as wrapper) for arrays of constant size defined at compile time (class imported from the MRPT project) + * This code is an adapted version from Boost, modifed for its integration + * within MRPT (JLBC, Dec/2009) (Renamed array -> CArray to avoid possible potential conflicts). + * See + * http://www.josuttis.com/cppcode + * for details and the latest version. + * See + * http://www.boost.org/libs/array for Documentation. + * for documentation. + * + * (C) Copyright Nicolai M. Josuttis 2001. + * Permission to copy, use, modify, sell and distribute this software + * is granted provided this copyright notice appears in all copies. + * This software is provided "as is" without express or implied + * warranty, and with no claim as to its suitability for any purpose. + * + * 29 Jan 2004 - minor fixes (Nico Josuttis) + * 04 Dec 2003 - update to synch with library TR1 (Alisdair Meredith) + * 23 Aug 2002 - fix for Non-MSVC compilers combined with MSVC libraries. 
+ * 05 Aug 2001 - minor update (Nico Josuttis) + * 20 Jan 2001 - STLport fix (Beman Dawes) + * 29 Sep 2000 - Initial Revision (Nico Josuttis) + * + * Jan 30, 2004 + */ + template + class CArray { + public: + T elems[N]; // fixed-size array of elements of type T + + public: + // type definitions + typedef T value_type; + typedef T* iterator; + typedef const T* const_iterator; + typedef T& reference; + typedef const T& const_reference; + typedef std::size_t size_type; + typedef std::ptrdiff_t difference_type; + + // iterator support + inline iterator begin() { return elems; } + inline const_iterator begin() const { return elems; } + inline iterator end() { return elems+N; } + inline const_iterator end() const { return elems+N; } + + // reverse iterator support +#if !defined(BOOST_NO_TEMPLATE_PARTIAL_SPECIALIZATION) && !defined(BOOST_MSVC_STD_ITERATOR) && !defined(BOOST_NO_STD_ITERATOR_TRAITS) + typedef std::reverse_iterator reverse_iterator; + typedef std::reverse_iterator const_reverse_iterator; +#elif defined(_MSC_VER) && (_MSC_VER == 1300) && defined(BOOST_DINKUMWARE_STDLIB) && (BOOST_DINKUMWARE_STDLIB == 310) + // workaround for broken reverse_iterator in VC7 + typedef std::reverse_iterator > reverse_iterator; + typedef std::reverse_iterator > const_reverse_iterator; +#else + // workaround for broken reverse_iterator implementations + typedef std::reverse_iterator reverse_iterator; + typedef std::reverse_iterator const_reverse_iterator; +#endif + + reverse_iterator rbegin() { return reverse_iterator(end()); } + const_reverse_iterator rbegin() const { return const_reverse_iterator(end()); } + reverse_iterator rend() { return reverse_iterator(begin()); } + const_reverse_iterator rend() const { return const_reverse_iterator(begin()); } + // operator[] + inline reference operator[](size_type i) { return elems[i]; } + inline const_reference operator[](size_type i) const { return elems[i]; } + // at() with range check + reference at(size_type i) { rangecheck(i); return elems[i]; } + const_reference at(size_type i) const { rangecheck(i); return elems[i]; } + // front() and back() + reference front() { return elems[0]; } + const_reference front() const { return elems[0]; } + reference back() { return elems[N-1]; } + const_reference back() const { return elems[N-1]; } + // size is constant + static inline size_type size() { return N; } + static bool empty() { return false; } + static size_type max_size() { return N; } + enum { static_size = N }; + /** This method has no effects in this class, but raises an exception if the expected size does not match */ + inline void resize(const size_t nElements) { if (nElements!=N) throw std::logic_error("Try to change the size of a CArray."); } + // swap (note: linear complexity in N, constant for given instantiation) + void swap (CArray& y) { std::swap_ranges(begin(),end(),y.begin()); } + // direct access to data (read-only) + const T* data() const { return elems; } + // use array as C array (direct read/write access to data) + T* data() { return elems; } + // assignment with type conversion + template CArray& operator= (const CArray& rhs) { + std::copy(rhs.begin(),rhs.end(), begin()); + return *this; + } + // assign one value to all elements + inline void assign (const T& value) { for (size_t i=0;i= size()) { throw std::out_of_range("CArray<>: index out of range"); } } + }; // end of CArray + + /** Used to declare fixed-size arrays when DIM>0, dynamically-allocated vectors when DIM=-1. 
+ * Fixed size version for a generic DIM: + */ + template + struct array_or_vector_selector + { + typedef CArray container_t; + }; + /** Dynamic size version */ + template + struct array_or_vector_selector<-1,T> { + typedef std::vector container_t; + }; + /** @} */ + + /** @addtogroup kdtrees_grp KD-tree classes and adaptors + * @{ */ + + /** kd-tree index + * + * Contains the k-d trees and other information for indexing a set of points + * for nearest-neighbor matching. + * + * The class "DatasetAdaptor" must provide the following interface (can be non-virtual, inlined methods): + * + * \code + * // Must return the number of data poins + * inline size_t kdtree_get_point_count() const { ... } + * + * // [Only if using the metric_L2_Simple type] Must return the Euclidean (L2) distance between the vector "p1[0:size-1]" and the data point with index "idx_p2" stored in the class: + * inline DistanceType kdtree_distance(const T *p1, const size_t idx_p2,size_t size) const { ... } + * + * // Must return the dim'th component of the idx'th point in the class: + * inline T kdtree_get_pt(const size_t idx, int dim) const { ... } + * + * // Optional bounding-box computation: return false to default to a standard bbox computation loop. + * // Return true if the BBOX was already computed by the class and returned in "bb" so it can be avoided to redo it again. + * // Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3 for point clouds) + * template + * bool kdtree_get_bbox(BBOX &bb) const + * { + * bb[0].low = ...; bb[0].high = ...; // 0th dimension limits + * bb[1].low = ...; bb[1].high = ...; // 1st dimension limits + * ... + * return true; + * } + * + * \endcode + * + * \tparam DatasetAdaptor The user-provided adaptor (see comments above). + * \tparam Distance The distance metric to use: nanoflann::metric_L1, nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc. + * \tparam DIM Dimensionality of data points (e.g. 3 for 3D points) + * \tparam IndexType Will be typically size_t or int + */ + template + class KDTreeSingleIndexAdaptor + { + private: + /** Hidden copy constructor, to disallow copying indices (Not implemented) */ + KDTreeSingleIndexAdaptor(const KDTreeSingleIndexAdaptor&); + public: + typedef typename Distance::ElementType ElementType; + typedef typename Distance::DistanceType DistanceType; + protected: + + /** + * Array of indices to vectors in the dataset. + */ + std::vector vind; + + size_t m_leaf_max_size; + + + /** + * The dataset used by this index + */ + const DatasetAdaptor &dataset; //!< The source of our data + + const KDTreeSingleIndexAdaptorParams index_params; + + size_t m_size; //!< Number of current poins in the dataset + size_t m_size_at_index_build; //!< Number of points in the dataset when the index was built + int dim; //!< Dimensionality of each data point + + + /*--------------------- Internal Data Structures --------------------------*/ + struct Node + { + /** Union used because a node can be either a LEAF node or a non-leaf node, so both data fields are never used simultaneously */ + union { + struct leaf + { + IndexType left, right; //!< Indices of points in leaf node + } lr; + struct nonleaf + { + int divfeat; //!< Dimension used for subdivision. + DistanceType divlow, divhigh; //!< The values used for subdivision. 
+ } sub; + } node_type; + Node* child1, * child2; //!< Child nodes (both=NULL mean its a leaf node) + }; + typedef Node* NodePtr; + + + struct Interval + { + ElementType low, high; + }; + + /** Define "BoundingBox" as a fixed-size or variable-size container depending on "DIM" */ + typedef typename array_or_vector_selector::container_t BoundingBox; + + /** Define "distance_vector_t" as a fixed-size or variable-size container depending on "DIM" */ + typedef typename array_or_vector_selector::container_t distance_vector_t; + + /** The KD-tree used to find neighbours */ + NodePtr root_node; + BoundingBox root_bbox; + + /** + * Pooled memory allocator. + * + * Using a pooled memory allocator is more efficient + * than allocating memory directly when there is a large + * number small of memory allocations. + */ + PooledAllocator pool; + + public: + + Distance distance; + + /** + * KDTree constructor + * + * Refer to docs in README.md or online in https://github.com/jlblancoc/nanoflann + * + * The KD-Tree point dimension (the length of each point in the datase, e.g. 3 for 3D points) + * is determined by means of: + * - The \a DIM template parameter if >0 (highest priority) + * - Otherwise, the \a dimensionality parameter of this constructor. + * + * @param inputData Dataset with the input features + * @param params Basically, the maximum leaf node size + */ + KDTreeSingleIndexAdaptor(const int dimensionality, const DatasetAdaptor& inputData, const KDTreeSingleIndexAdaptorParams& params = KDTreeSingleIndexAdaptorParams() ) : + dataset(inputData), index_params(params), root_node(NULL), distance(inputData) + { + m_size = dataset.kdtree_get_point_count(); + m_size_at_index_build = m_size; + dim = dimensionality; + if (DIM>0) dim=DIM; + m_leaf_max_size = params.leaf_max_size; + + // Create a permutable array of indices to the input vectors. + init_vind(); + } + + /** Standard destructor */ + ~KDTreeSingleIndexAdaptor() { } + + /** Frees the previously-built index. Automatically called within buildIndex(). */ + void freeIndex() + { + pool.free_all(); + root_node=NULL; + m_size_at_index_build = 0; + } + + /** + * Builds the index + */ + void buildIndex() + { + init_vind(); + freeIndex(); + m_size_at_index_build = m_size; + if(m_size == 0) return; + computeBoundingBox(root_bbox); + root_node = divideTree(0, m_size, root_bbox ); // construct the tree + } + + /** Returns number of points in dataset */ + size_t size() const { return m_size; } + + /** Returns the length of each point in the dataset */ + size_t veclen() const { + return static_cast(DIM>0 ? DIM : dim); + } + + /** + * Computes the inde memory usage + * Returns: memory used by the index + */ + size_t usedMemory() const + { + return pool.usedMemory+pool.wastedMemory+dataset.kdtree_get_point_count()*sizeof(IndexType); // pool memory and vind array memory + } + + /** \name Query methods + * @{ */ + + /** + * Find set of nearest neighbors to vec[0:dim-1]. Their indices are stored inside + * the result object. + * + * Params: + * result = the result object in which the indices of the nearest-neighbors are stored + * vec = the vector for which to search the nearest neighbors + * + * \tparam RESULTSET Should be any ResultSet + * \return True if the requested neighbors could be found. 
+ * \sa knnSearch, radiusSearch + */ + template + bool findNeighbors(RESULTSET& result, const ElementType* vec, const SearchParams& searchParams) const + { + assert(vec); + if (size() == 0) + return false; + if (!root_node) + throw std::runtime_error("[nanoflann] findNeighbors() called before building the index."); + float epsError = 1+searchParams.eps; + + distance_vector_t dists; // fixed or variable-sized container (depending on DIM) + dists.assign((DIM>0 ? DIM : dim) ,0); // Fill it with zeros. + DistanceType distsq = computeInitialDistances(vec, dists); + searchLevel(result, vec, root_node, distsq, dists, epsError); // "count_leaf" parameter removed since was neither used nor returned to the user. + return result.full(); + } + + /** + * Find the "num_closest" nearest neighbors to the \a query_point[0:dim-1]. Their indices are stored inside + * the result object. + * \sa radiusSearch, findNeighbors + * \note nChecks_IGNORED is ignored but kept for compatibility with the original FLANN interface. + * \return Number `N` of valid points in the result set. Only the first `N` entries in `out_indices` and `out_distances_sq` will be valid. + * Return may be less than `num_closest` only if the number of elements in the tree is less than `num_closest`. + */ + size_t knnSearch(const ElementType *query_point, const size_t num_closest, IndexType *out_indices, DistanceType *out_distances_sq, const int /* nChecks_IGNORED */ = 10) const + { + nanoflann::KNNResultSet resultSet(num_closest); + resultSet.init(out_indices, out_distances_sq); + this->findNeighbors(resultSet, query_point, nanoflann::SearchParams()); + return resultSet.size(); + } + + /** + * Find all the neighbors to \a query_point[0:dim-1] within a maximum radius. + * The output is given as a vector of pairs, of which the first element is a point index and the second the corresponding distance. + * Previous contents of \a IndicesDists are cleared. + * + * If searchParams.sorted==true, the output list is sorted by ascending distances. + * + * For a better performance, it is advisable to do a .reserve() on the vector if you have any wild guess about the number of expected matches. + * + * \sa knnSearch, findNeighbors, radiusSearchCustomCallback + * \return The number of points within the given radius (i.e. indices.size() or dists.size() ) + */ + size_t radiusSearch(const ElementType *query_point,const DistanceType &radius, std::vector >& IndicesDists, const SearchParams& searchParams) const + { + RadiusResultSet resultSet(radius,IndicesDists); + const size_t nFound = radiusSearchCustomCallback(query_point,resultSet,searchParams); + if (searchParams.sorted) + std::sort(IndicesDists.begin(),IndicesDists.end(), IndexDist_Sorter() ); + return nFound; + } + + /** + * Just like radiusSearch() but with a custom callback class for each point found in the radius of the query. + * See the source of RadiusResultSet<> as a start point for your own classes. + * \sa radiusSearch + */ + template + size_t radiusSearchCustomCallback(const ElementType *query_point,SEARCH_CALLBACK &resultSet, const SearchParams& searchParams = SearchParams() ) const + { + this->findNeighbors(resultSet, query_point, searchParams); + return resultSet.size(); + } + + /** @} */ + + private: + /** Make sure the auxiliary list \a vind has the same size than the current dataset, and re-generate if size has changed. */ + void init_vind() + { + // Create a permutable array of indices to the input vectors. 
+ m_size = dataset.kdtree_get_point_count(); + if (vind.size()!=m_size) vind.resize(m_size); + for (size_t i = 0; i < m_size; i++) vind[i] = i; + } + + /// Helper accessor to the dataset points: + inline ElementType dataset_get(size_t idx, int component) const { + return dataset.kdtree_get_pt(idx,component); + } + + + void save_tree(FILE* stream, NodePtr tree) + { + save_value(stream, *tree); + if (tree->child1!=NULL) { + save_tree(stream, tree->child1); + } + if (tree->child2!=NULL) { + save_tree(stream, tree->child2); + } + } + + + void load_tree(FILE* stream, NodePtr& tree) + { + tree = pool.allocate(); + load_value(stream, *tree); + if (tree->child1!=NULL) { + load_tree(stream, tree->child1); + } + if (tree->child2!=NULL) { + load_tree(stream, tree->child2); + } + } + + + void computeBoundingBox(BoundingBox& bbox) + { + bbox.resize((DIM>0 ? DIM : dim)); + if (dataset.kdtree_get_bbox(bbox)) + { + // Done! It was implemented in derived class + } + else + { + const size_t N = dataset.kdtree_get_point_count(); + if (!N) throw std::runtime_error("[nanoflann] computeBoundingBox() called but no data points found."); + for (int i=0; i<(DIM>0 ? DIM : dim); ++i) { + bbox[i].low = + bbox[i].high = dataset_get(0,i); + } + for (size_t k=1; k0 ? DIM : dim); ++i) { + if (dataset_get(k,i)bbox[i].high) bbox[i].high = dataset_get(k,i); + } + } + } + } + + + /** + * Create a tree node that subdivides the list of vecs from vind[first] + * to vind[last]. The routine is called recursively on each sublist. + * + * @param left index of the first vector + * @param right index of the last vector + */ + NodePtr divideTree(const IndexType left, const IndexType right, BoundingBox& bbox) + { + NodePtr node = pool.allocate(); // allocate memory + + /* If too few exemplars remain, then make this a leaf node. */ + if ( (right-left) <= static_cast(m_leaf_max_size) ) { + node->child1 = node->child2 = NULL; /* Mark as leaf node. */ + node->node_type.lr.left = left; + node->node_type.lr.right = right; + + // compute bounding-box of leaf points + for (int i=0; i<(DIM>0 ? DIM : dim); ++i) { + bbox[i].low = dataset_get(vind[left],i); + bbox[i].high = dataset_get(vind[left],i); + } + for (IndexType k=left+1; k0 ? DIM : dim); ++i) { + if (bbox[i].low>dataset_get(vind[k],i)) bbox[i].low=dataset_get(vind[k],i); + if (bbox[i].highnode_type.sub.divfeat = cutfeat; + + BoundingBox left_bbox(bbox); + left_bbox[cutfeat].high = cutval; + node->child1 = divideTree(left, left+idx, left_bbox); + + BoundingBox right_bbox(bbox); + right_bbox[cutfeat].low = cutval; + node->child2 = divideTree(left+idx, right, right_bbox); + + node->node_type.sub.divlow = left_bbox[cutfeat].high; + node->node_type.sub.divhigh = right_bbox[cutfeat].low; + + for (int i=0; i<(DIM>0 ? DIM : dim); ++i) { + bbox[i].low = std::min(left_bbox[i].low, right_bbox[i].low); + bbox[i].high = std::max(left_bbox[i].high, right_bbox[i].high); + } + } + + return node; + } + + + void computeMinMax(IndexType* ind, IndexType count, int element, ElementType& min_elem, ElementType& max_elem) + { + min_elem = dataset_get(ind[0],element); + max_elem = dataset_get(ind[0],element); + for (IndexType i=1; imax_elem) max_elem = val; + } + } + + void middleSplit_(IndexType* ind, IndexType count, IndexType& index, int& cutfeat, DistanceType& cutval, const BoundingBox& bbox) + { + const DistanceType EPS=static_cast(0.00001); + ElementType max_span = bbox[0].high-bbox[0].low; + for (int i=1; i<(DIM>0 ? 
DIM : dim); ++i) { + ElementType span = bbox[i].high-bbox[i].low; + if (span>max_span) { + max_span = span; + } + } + ElementType max_spread = -1; + cutfeat = 0; + for (int i=0; i<(DIM>0 ? DIM : dim); ++i) { + ElementType span = bbox[i].high-bbox[i].low; + if (span>(1-EPS)*max_span) { + ElementType min_elem, max_elem; + computeMinMax(ind, count, i, min_elem, max_elem); + ElementType spread = max_elem-min_elem;; + if (spread>max_spread) { + cutfeat = i; + max_spread = spread; + } + } + } + // split in the middle + DistanceType split_val = (bbox[cutfeat].low+bbox[cutfeat].high)/2; + ElementType min_elem, max_elem; + computeMinMax(ind, count, cutfeat, min_elem, max_elem); + + if (split_valmax_elem) cutval = max_elem; + else cutval = split_val; + + IndexType lim1, lim2; + planeSplit(ind, count, cutfeat, cutval, lim1, lim2); + + if (lim1>count/2) index = lim1; + else if (lim2cutval + */ + void planeSplit(IndexType* ind, const IndexType count, int cutfeat, DistanceType &cutval, IndexType& lim1, IndexType& lim2) + { + /* Move vector indices for left subtree to front of list. */ + IndexType left = 0; + IndexType right = count-1; + for (;; ) { + while (left<=right && dataset_get(ind[left],cutfeat)=cutval) --right; + if (left>right || !right) break; // "!right" was added to support unsigned Index types + std::swap(ind[left], ind[right]); + ++left; + --right; + } + /* If either list is empty, it means that all remaining features + * are identical. Split in the middle to maintain a balanced tree. + */ + lim1 = left; + right = count-1; + for (;; ) { + while (left<=right && dataset_get(ind[left],cutfeat)<=cutval) ++left; + while (right && left<=right && dataset_get(ind[right],cutfeat)>cutval) --right; + if (left>right || !right) break; // "!right" was added to support unsigned Index types + std::swap(ind[left], ind[right]); + ++left; + --right; + } + lim2 = left; + } + + DistanceType computeInitialDistances(const ElementType* vec, distance_vector_t& dists) const + { + assert(vec); + DistanceType distsq = DistanceType(); + + for (int i = 0; i < (DIM>0 ? DIM : dim); ++i) { + if (vec[i] < root_bbox[i].low) { + dists[i] = distance.accum_dist(vec[i], root_bbox[i].low, i); + distsq += dists[i]; + } + if (vec[i] > root_bbox[i].high) { + dists[i] = distance.accum_dist(vec[i], root_bbox[i].high, i); + distsq += dists[i]; + } + } + + return distsq; + } + + /** + * Performs an exact search in the tree starting from a node. + * \tparam RESULTSET Should be any ResultSet + */ + template + void searchLevel(RESULTSET& result_set, const ElementType* vec, const NodePtr node, DistanceType mindistsq, + distance_vector_t& dists, const float epsError) const + { + /* If this is a leaf node, then do check and return. */ + if ((node->child1 == NULL)&&(node->child2 == NULL)) { + //count_leaf += (node->lr.right-node->lr.left); // Removed since was neither used nor returned to the user. + DistanceType worst_dist = result_set.worstDist(); + for (IndexType i=node->node_type.lr.left; inode_type.lr.right; ++i) { + const IndexType index = vind[i];// reorder... : i; + DistanceType dist = distance(vec, index, (DIM>0 ? 
DIM : dim)); + if (distnode_type.sub.divfeat; + ElementType val = vec[idx]; + DistanceType diff1 = val - node->node_type.sub.divlow; + DistanceType diff2 = val - node->node_type.sub.divhigh; + + NodePtr bestChild; + NodePtr otherChild; + DistanceType cut_dist; + if ((diff1+diff2)<0) { + bestChild = node->child1; + otherChild = node->child2; + cut_dist = distance.accum_dist(val, node->node_type.sub.divhigh, idx); + } + else { + bestChild = node->child2; + otherChild = node->child1; + cut_dist = distance.accum_dist( val, node->node_type.sub.divlow, idx); + } + + /* Call recursively to search next level down. */ + searchLevel(result_set, vec, bestChild, mindistsq, dists, epsError); + + DistanceType dst = dists[idx]; + mindistsq = mindistsq + cut_dist - dst; + dists[idx] = cut_dist; + if (mindistsq*epsError<=result_set.worstDist()) { + searchLevel(result_set, vec, otherChild, mindistsq, dists, epsError); + } + dists[idx] = dst; + } + + public: + /** Stores the index in a binary file. + * IMPORTANT NOTE: The set of data points is NOT stored in the file, so when loading the index object it must be constructed associated to the same source of data points used while building it. + * See the example: examples/saveload_example.cpp + * \sa loadIndex */ + void saveIndex(FILE* stream) + { + save_value(stream, m_size); + save_value(stream, dim); + save_value(stream, root_bbox); + save_value(stream, m_leaf_max_size); + save_value(stream, vind); + save_tree(stream, root_node); + } + + /** Loads a previous index from a binary file. + * IMPORTANT NOTE: The set of data points is NOT stored in the file, so the index object must be constructed associated to the same source of data points used while building the index. + * See the example: examples/saveload_example.cpp + * \sa loadIndex */ + void loadIndex(FILE* stream) + { + load_value(stream, m_size); + load_value(stream, dim); + load_value(stream, root_bbox); + load_value(stream, m_leaf_max_size); + load_value(stream, vind); + load_tree(stream, root_node); + } + + }; // class KDTree + + + /** An L2-metric KD-tree adaptor for working with data directly stored in an Eigen Matrix, without duplicating the data storage. + * Each row in the matrix represents a point in the state space. + * + * Example of usage: + * \code + * Eigen::Matrix mat; + * // Fill out "mat"... + * + * typedef KDTreeEigenMatrixAdaptor< Eigen::Matrix > my_kd_tree_t; + * const int max_leaf = 10; + * my_kd_tree_t mat_index(dimdim, mat, max_leaf ); + * mat_index.index->buildIndex(); + * mat_index.index->... + * \endcode + * + * \tparam DIM If set to >0, it specifies a compile-time fixed dimensionality for the points in the data set, allowing more compiler optimizations. + * \tparam Distance The distance metric to use: nanoflann::metric_L1, nanoflann::metric_L2, nanoflann::metric_L2_Simple, etc. + */ + template + struct KDTreeEigenMatrixAdaptor + { + typedef KDTreeEigenMatrixAdaptor self_t; + typedef typename MatrixType::Scalar num_t; + typedef typename MatrixType::Index IndexType; + typedef typename Distance::template traits::distance_t metric_t; + typedef KDTreeSingleIndexAdaptor< metric_t,self_t,DIM,IndexType> index_t; + + index_t* index; //! The kd-tree index for the user to call its methods as usual with any other FLANN index. 
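+
+        // Illustrative query sketch (hypothetical names; `mat_index` is an adaptor built as in
+        // the class comment above). The constructor already calls buildIndex(), so a 1-NN lookup is:
+        //     num_t query_pt[3] = { 0.5, 0.5, 0.5 };
+        //     my_kd_tree_t::IndexType ret_index;
+        //     num_t out_dist_sqr;
+        //     mat_index.query(&query_pt[0], 1, &ret_index, &out_dist_sqr);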
+ + /// Constructor: takes a const ref to the matrix object with the data points + KDTreeEigenMatrixAdaptor(const int dimensionality, const MatrixType &mat, const int leaf_max_size = 10) : m_data_matrix(mat) + { + const IndexType dims = mat.cols(); + if (dims!=dimensionality) throw std::runtime_error("Error: 'dimensionality' must match column count in data matrix"); + if (DIM>0 && static_cast(dims)!=DIM) + throw std::runtime_error("Data set dimensionality does not match the 'DIM' template argument"); + index = new index_t( dims, *this /* adaptor */, nanoflann::KDTreeSingleIndexAdaptorParams(leaf_max_size ) ); + index->buildIndex(); + } + private: + /** Hidden copy constructor, to disallow copying this class (Not implemented) */ + KDTreeEigenMatrixAdaptor(const self_t&); + public: + + ~KDTreeEigenMatrixAdaptor() { + delete index; + } + + const MatrixType &m_data_matrix; + + /** Query for the \a num_closest closest points to a given point (entered as query_point[0:dim-1]). + * Note that this is a short-cut method for index->findNeighbors(). + * The user can also call index->... methods as desired. + * \note nChecks_IGNORED is ignored but kept for compatibility with the original FLANN interface. + */ + inline void query(const num_t *query_point, const size_t num_closest, IndexType *out_indices, num_t *out_distances_sq, const int /* nChecks_IGNORED */ = 10) const + { + nanoflann::KNNResultSet resultSet(num_closest); + resultSet.init(out_indices, out_distances_sq); + index->findNeighbors(resultSet, query_point, nanoflann::SearchParams()); + } + + /** @name Interface expected by KDTreeSingleIndexAdaptor + * @{ */ + + const self_t & derived() const { + return *this; + } + self_t & derived() { + return *this; + } + + // Must return the number of data points + inline size_t kdtree_get_point_count() const { + return m_data_matrix.rows(); + } + + // Returns the L2 distance between the vector "p1[0:size-1]" and the data point with index "idx_p2" stored in the class: + inline num_t kdtree_distance(const num_t *p1, const IndexType idx_p2,IndexType size) const + { + num_t s=0; + for (IndexType i=0; i + bool kdtree_get_bbox(BBOX& /*bb*/) const { + return false; + } + + /** @} */ + + }; // end of KDTreeEigenMatrixAdaptor + /** @} */ + +/** @} */ // end of grouping +} // end of NS + + +#endif /* NANOFLANN_HPP_ */ diff --git a/modules/dynamicfusion/kfusion/src/capture.cpp b/modules/dynamicfusion/kfusion/src/capture.cpp new file mode 100644 index 00000000000..070f3910095 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/capture.cpp @@ -0,0 +1,339 @@ +#ifdef OPENNI_FOUND +#pragma warning (disable :4996) +#undef _CRT_SECURE_NO_DEPRECATE +#include "XnCppWrapper.h" +#include + +using namespace std; +using namespace xn; + +//const std::string XMLConfig = +//"" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// "" +// " " +// "" +// "" +// "" +// "" +// "" +// "" +// "" +//""; + +#define REPORT_ERROR(msg) kfusion::cuda::error ((msg), __FILE__, __LINE__) + +struct kfusion::OpenNISource::Impl +{ + Context context; + ScriptNode scriptNode; + DepthGenerator depth; + ImageGenerator image; + ProductionNode node; + DepthMetaData depthMD; + ImageMetaData imageMD; + XnChar strError[1024]; + Player player_; + + bool has_depth; + bool has_image; +}; + +kfusion::OpenNISource::OpenNISource() : depth_focal_length_VGA (0.f), baseline (0.f), + shadow_value (0), no_sample_value (0), pixelSize (0.0), max_depth (0) {} + +kfusion::OpenNISource::OpenNISource(int 
device) {open (device); } +kfusion::OpenNISource::OpenNISource(const string& filename, bool repeat /*= false*/) {open (filename, repeat); } +kfusion::OpenNISource::~OpenNISource() { release (); } + +void kfusion::OpenNISource::open (int device) +{ + impl_ = cv::Ptr( new Impl () ); + + XnMapOutputMode mode; + mode.nXRes = XN_VGA_X_RES; + mode.nYRes = XN_VGA_Y_RES; + mode.nFPS = 30; + + XnStatus rc; + rc = impl_->context.Init (); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "Init failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + xn::NodeInfoList devicesList; + rc = impl_->context.EnumerateProductionTrees ( XN_NODE_TYPE_DEVICE, NULL, devicesList, 0 ); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "Init failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + xn::NodeInfoList::Iterator it = devicesList.Begin (); + for (int i = 0; i < device; ++i) + it++; + + NodeInfo node = *it; + rc = impl_->context.CreateProductionTree ( node, impl_->node ); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "Init failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + XnLicense license; + const char* vendor = "PrimeSense"; + const char* key = "0KOIk2JeIBYClPWVnMoRKn5cdY4="; + sprintf ("%s, %s", license.strKey, key); + sprintf ("%s, %s", license.strVendor, vendor); + + rc = impl_->context.AddLicense (license); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "licence failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + rc = impl_->depth.Create (impl_->context); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "Depth generator failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + //rc = impl_->depth.SetIntProperty("HoleFilter", 1); + rc = impl_->depth.SetMapOutputMode (mode); + impl_->has_depth = true; + + rc = impl_->image.Create (impl_->context); + if (rc != XN_STATUS_OK) + { + printf ("Image generator creation failed: %s\n", xnGetStatusString (rc)); + impl_->has_image = false; + } + else + { + impl_->has_image = true; + rc = impl_->image.SetMapOutputMode (mode); + } + + getParams (); + + rc = impl_->context.StartGeneratingAll (); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "Start failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } +} + +void kfusion::OpenNISource::open(const std::string& filename, bool repeat /*= false*/) +{ + impl_ = cv::Ptr ( new Impl () ); + + XnStatus rc; + + rc = impl_->context.Init (); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "Init failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + //rc = impl_->context.OpenFileRecording (filename.c_str (), impl_->node); + rc = impl_->context.OpenFileRecording (filename.c_str (), impl_->player_); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "Open failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + impl_->player_.SetRepeat(repeat); + + rc = impl_->context.FindExistingNode (XN_NODE_TYPE_DEPTH, impl_->depth); + impl_->has_depth = (rc == XN_STATUS_OK); + + rc = impl_->context.FindExistingNode (XN_NODE_TYPE_IMAGE, impl_->image); + impl_->has_image = (rc == XN_STATUS_OK); + + if (!impl_->has_depth) + REPORT_ERROR ("No depth nodes. Check your configuration"); + + if (impl_->has_depth) + impl_->depth.GetMetaData (impl_->depthMD); + + if (impl_->has_image) + impl_->image.GetMetaData (impl_->imageMD); + + // RGB is the only image format supported. 
+ if (impl_->imageMD.PixelFormat () != XN_PIXEL_FORMAT_RGB24) + REPORT_ERROR ("Image format must be RGB24\n"); + + getParams (); +} + +void kfusion::OpenNISource::release () +{ + if (impl_) + { + impl_->context.StopGeneratingAll (); + impl_->context.Release (); + } + + impl_.release();; + depth_focal_length_VGA = 0; + baseline = 0.f; + shadow_value = 0; + no_sample_value = 0; + pixelSize = 0.0; +} + +bool kfusion::OpenNISource::grab(cv::Mat& depth, cv::Mat& image) +{ + XnStatus rc = XN_STATUS_OK; + + rc = impl_->context.WaitAndUpdateAll (); + if (rc != XN_STATUS_OK) + return printf ("Read failed: %s\n", xnGetStatusString (rc)), false; + + if (impl_->has_depth) + { + impl_->depth.GetMetaData (impl_->depthMD); + const XnDepthPixel* pDepth = impl_->depthMD.Data (); + int x = impl_->depthMD.FullXRes (); + int y = impl_->depthMD.FullYRes (); + cv::Mat(y, x, CV_16U, (void*)pDepth).copyTo(depth); + } + else + { + depth.release(); + printf ("no depth\n"); + } + + if (impl_->has_image) + { + impl_->image.GetMetaData (impl_->imageMD); + const XnRGB24Pixel* pImage = impl_->imageMD.RGB24Data (); + int x = impl_->imageMD.FullXRes (); + int y = impl_->imageMD.FullYRes (); + image.create(y, x, CV_8UC3); + + cv::Vec3b *dptr = image.ptr(); + for(size_t i = 0; i < image.total(); ++i) + dptr[i] = cv::Vec3b(pImage[i].nBlue, pImage[i].nGreen, pImage[i].nRed); + } + else + { + image.release(); + printf ("no image\n"); + } + + return impl_->has_image || impl_->has_depth; +} + +void kfusion::OpenNISource::getParams () +{ + XnStatus rc = XN_STATUS_OK; + + max_depth = impl_->depth.GetDeviceMaxDepth (); + + rc = impl_->depth.GetRealProperty ( "ZPPS", pixelSize ); // in mm + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "ZPPS failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + XnUInt64 depth_focal_length_SXGA_mm; //in mm + rc = impl_->depth.GetIntProperty ("ZPD", depth_focal_length_SXGA_mm); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "ZPD failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + XnDouble baseline_local; + rc = impl_->depth.GetRealProperty ("LDDIS", baseline_local); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "ZPD failed: %s\n", xnGetStatusString (rc)); + REPORT_ERROR (impl_->strError); + } + + XnUInt64 shadow_value_local; + rc = impl_->depth.GetIntProperty ("ShadowValue", shadow_value_local); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "ShadowValue failed: %s\n", xnGetStatusString (rc)); +// REPORT_ERROR (impl_->strError); + } + shadow_value = (int)shadow_value_local; + + XnUInt64 no_sample_value_local; + rc = impl_->depth.GetIntProperty ("NoSampleValue", no_sample_value_local); + if (rc != XN_STATUS_OK) + { + sprintf (impl_->strError, "NoSampleValue failed: %s\n", xnGetStatusString (rc)); +// REPORT_ERROR (impl_->strError); + } + no_sample_value = (int)no_sample_value_local; + + + // baseline from cm -> mm + baseline = (float)(baseline_local * 10); + + //focal length from mm -> pixels (valid for 1280x1024) + float depth_focal_length_SXGA = static_cast(depth_focal_length_SXGA_mm / pixelSize); + depth_focal_length_VGA = depth_focal_length_SXGA / 2; +} + +bool kfusion::OpenNISource::setRegistration (bool value) +{ + XnStatus rc = XN_STATUS_OK; + + if (value) + { + if (!impl_->has_image) + return false; + + if (impl_->depth.GetAlternativeViewPointCap ().IsViewPointAs (impl_->image) ) + return true; + + if (!impl_->depth.GetAlternativeViewPointCap ().IsViewPointSupported (impl_->image) ) + { + 
printf ("SetRegistration failed: Unsupported viewpoint.\n"); + return false; + } + + rc = impl_->depth.GetAlternativeViewPointCap ().SetViewPoint (impl_->image); + if (rc != XN_STATUS_OK) + printf ("SetRegistration failed: %s\n", xnGetStatusString (rc)); + + } + else // "off" + { + rc = impl_->depth.GetAlternativeViewPointCap ().ResetViewPoint (); + if (rc != XN_STATUS_OK) + printf ("SetRegistration failed: %s\n", xnGetStatusString (rc)); + } + + getParams (); + return rc == XN_STATUS_OK; +} +#endif \ No newline at end of file diff --git a/modules/dynamicfusion/kfusion/src/core.cpp b/modules/dynamicfusion/kfusion/src/core.cpp new file mode 100644 index 00000000000..a9fe78521fe --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/core.cpp @@ -0,0 +1,231 @@ +#include +#include "safe_call.hpp" + +#include +#include +#include + +int kf::cuda::getCudaEnabledDeviceCount() +{ + int count; + cudaError_t error = cudaGetDeviceCount( &count ); + + if (error == cudaErrorInsufficientDriver) + return -1; + + if (error == cudaErrorNoDevice) + return 0; + + cudaSafeCall(error); + return count; +} + +void kf::cuda::setDevice(int device) +{ + cudaSafeCall( cudaSetDevice( device ) ); +} + +std::string kf::cuda::getDeviceName(int device) +{ + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, device) ); + + return prop.name; +} + +bool kf::cuda::checkIfPreFermiGPU(int device) +{ + if (device < 0) + cudaSafeCall( cudaGetDevice(&device) ); + + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, device) ); + return prop.major < 2; // CC == 1.x +} + +namespace +{ + template inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute, int device) + { + *attribute = T(); + CUresult error = cuDeviceGetAttribute( attribute, device_attribute, device ); + if( CUDA_SUCCESS == error ) + return; + + printf("Driver API error = %04d\n", error); + kfusion::cuda::error("driver API error", __FILE__, __LINE__); + } + + inline int convertSMVer2Cores(int major, int minor) + { + // Defines for GPU Architecture types (using the SM version to determine the # of cores per SM + typedef struct { + int SM; // 0xMm (hexidecimal notation), M = SM Major version, and m = SM minor version + int Cores; + } SMtoCores; + + SMtoCores gpuArchCoresPerSM[] = { { 0x10, 8 }, { 0x11, 8 }, { 0x12, 8 }, { 0x13, 8 }, { 0x20, 32 }, { 0x21, 48 }, {0x30, 192}, {0x35, 192}, {0x50, 128}, {0x52, 128}, {0x61, 128}, { -1, -1 } }; + + int index = 0; + while (gpuArchCoresPerSM[index].SM != -1) + { + if (gpuArchCoresPerSM[index].SM == ((major << 4) + minor) ) + return gpuArchCoresPerSM[index].Cores; + index++; + } + printf("\nCan't determine number of cores. Unknown SM version %d.%d!\n", major, minor); + return 0; + } +} + +void kf::cuda::printCudaDeviceInfo(int device) +{ + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? 
device+1 : count; + + printf("*** CUDA Device Query (Runtime API) version (CUDART static linking) *** \n\n"); + printf("Device count: %d\n", count); + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + const char *computeMode[] = { + "Default (multiple host threads can use ::cudaSetDevice() with device simultaneously)", + "Exclusive (only one host thread in one process is able to use ::cudaSetDevice() with this device)", + "Prohibited (no host thread can use ::cudaSetDevice() with this device)", + "Exclusive Process (many threads in one process is able to use ::cudaSetDevice() with this device)", + "Unknown", + NULL + }; + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + int sm_cores = convertSMVer2Cores(prop.major, prop.minor); + + printf("\nDevice %d: \"%s\"\n", dev, prop.name); + printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + printf(" CUDA Capability Major/Minor version number: %d.%d\n", prop.major, prop.minor); + printf(" Total amount of global memory: %.0f MBytes (%llu bytes)\n", (float)prop.totalGlobalMem/1048576.0f, (unsigned long long) prop.totalGlobalMem); + printf(" (%2d) Multiprocessors x (%2d) CUDA Cores/MP: %d CUDA Cores\n", prop.multiProcessorCount, sm_cores, sm_cores * prop.multiProcessorCount); + printf(" GPU Clock Speed: %.2f GHz\n", prop.clockRate * 1e-6f); + +#if (CUDART_VERSION >= 4000) + // This is not available in the CUDA Runtime API, so we make the necessary calls the driver API to support this for output + int memoryClock, memBusWidth, L2CacheSize; + getCudaAttribute( &memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE, dev ); + getCudaAttribute( &memBusWidth, CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev ); + getCudaAttribute( &L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev ); + + printf(" Memory Clock rate: %.2f Mhz\n", memoryClock * 1e-3f); + printf(" Memory Bus Width: %d-bit\n", memBusWidth); + if (L2CacheSize) + printf(" L2 Cache Size: %d bytes\n", L2CacheSize); + + printf(" Max Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d,%d), 3D=(%d,%d,%d)\n", + prop.maxTexture1D, prop.maxTexture2D[0], prop.maxTexture2D[1], + prop.maxTexture3D[0], prop.maxTexture3D[1], prop.maxTexture3D[2]); + printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n", + prop.maxTexture1DLayered[0], prop.maxTexture1DLayered[1], + prop.maxTexture2DLayered[0], prop.maxTexture2DLayered[1], prop.maxTexture2DLayered[2]); +#endif + printf(" Total amount of constant memory: %u bytes\n", (int)prop.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", (int)prop.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", prop.regsPerBlock); + printf(" Warp size: %d\n", prop.warpSize); + printf(" Maximum number of threads per block: %d\n", prop.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", (int)prop.memPitch); + printf(" Texture alignment: %u bytes\n", (int)prop.textureAlignment); + +#if CUDART_VERSION >= 4000 + printf(" Concurrent copy and 
execution: %s with %d copy engine(s)\n", (prop.deviceOverlap ? "Yes" : "No"), prop.asyncEngineCount); +#else + printf(" Concurrent copy and execution: %s\n", prop.deviceOverlap ? "Yes" : "No"); +#endif + printf(" Run time limit on kernels: %s\n", prop.kernelExecTimeoutEnabled ? "Yes" : "No"); + printf(" Integrated GPU sharing Host Memory: %s\n", prop.integrated ? "Yes" : "No"); + printf(" Support host page-locked memory mapping: %s\n", prop.canMapHostMemory ? "Yes" : "No"); + + printf(" Concurrent kernel execution: %s\n", prop.concurrentKernels ? "Yes" : "No"); + printf(" Alignment requirement for Surfaces: %s\n", prop.surfaceAlignment ? "Yes" : "No"); + printf(" Device has ECC support enabled: %s\n", prop.ECCEnabled ? "Yes" : "No"); + printf(" Device is using TCC driver mode: %s\n", prop.tccDriver ? "Yes" : "No"); +#if CUDART_VERSION >= 4000 + printf(" Device supports Unified Addressing (UVA): %s\n", prop.unifiedAddressing ? "Yes" : "No"); + printf(" Device PCI Bus ID / PCI location ID: %d / %d\n", prop.pciBusID, prop.pciDeviceID ); +#endif + printf(" Compute Mode:\n"); + printf(" %s \n", computeMode[prop.computeMode]); + } + + printf("\n"); + printf("deviceQuery, CUDA Driver = CUDART"); + printf(", CUDA Driver Version = %d.%d", driverVersion / 1000, driverVersion % 100); + printf(", CUDA Runtime Version = %d.%d", runtimeVersion/1000, runtimeVersion%100); + printf(", NumDevs = %d\n\n", count); + fflush(stdout); +} + +void kf::cuda::printShortCudaDeviceInfo(int device) +{ + int count = getCudaEnabledDeviceCount(); + bool valid = (device >= 0) && (device < count); + + int beg = valid ? device : 0; + int end = valid ? device+1 : count; + + int driverVersion = 0, runtimeVersion = 0; + cudaSafeCall( cudaDriverGetVersion(&driverVersion) ); + cudaSafeCall( cudaRuntimeGetVersion(&runtimeVersion) ); + + for(int dev = beg; dev < end; ++dev) + { + cudaDeviceProp prop; + cudaSafeCall( cudaGetDeviceProperties(&prop, dev) ); + + const char *arch_str = prop.major < 2 ? 
" (pre-Fermi)" : ""; + printf("Device %d: \"%s\" %.0fMb", dev, prop.name, (float)prop.totalGlobalMem/1048576.0f); + printf(", sm_%d%d%s, %d cores", prop.major, prop.minor, arch_str, convertSMVer2Cores(prop.major, prop.minor) * prop.multiProcessorCount); + printf(", Driver/Runtime ver.%d.%d/%d.%d\n", driverVersion/1000, driverVersion%100, runtimeVersion/1000, runtimeVersion%100); + } + fflush(stdout); +} + +kf::SampledScopeTime::SampledScopeTime(double& time_ms) : time_ms_(time_ms) +{ + start = (double)cv::getTickCount(); +} +kf::SampledScopeTime::~SampledScopeTime() +{ + static int i_ = 0; + time_ms_ += getTime (); + if (i_ % EACH == 0 && i_) + { + std::cout << "Average frame time = " << time_ms_ / EACH << "ms ( " << 1000.f * EACH / time_ms_ << "fps )" << std::endl; + time_ms_ = 0.0; + } + ++i_; +} + +double kf::SampledScopeTime::getTime() +{ + return ((double)cv::getTickCount() - start)*1000.0/cv::getTickFrequency(); +} + +kf::ScopeTime::ScopeTime(const char *name_) : name(name_) +{ + start = (double)cv::getTickCount(); +} +kf::ScopeTime::~ScopeTime() +{ + double time_ms = ((double)cv::getTickCount() - start)*1000.0/cv::getTickFrequency(); + std::cout << "Time(" << name << ") = " << time_ms << "ms" << std::endl; +} diff --git a/modules/dynamicfusion/kfusion/src/cuda/device.hpp b/modules/dynamicfusion/kfusion/src/cuda/device.hpp new file mode 100644 index 00000000000..497c8f84cf8 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/cuda/device.hpp @@ -0,0 +1,127 @@ +#pragma once + +#include "internal.hpp" +#include "temp_utils.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// TsdfVolume + +//__kf_device__ +//kfusion::device::TsdfVolume::TsdfVolume(elem_type* _data, int3 _dims, float3 _voxel_size, float _trunc_dist, int _max_weight) +// : data(_data), dims(_dims), voxel_size(_voxel_size), trunc_dist(_trunc_dist), max_weight(_max_weight) {} + +//__kf_device__ +//kfusion::device::TsdfVolume::TsdfVolume(const TsdfVolume& other) +// : data(other.data), dims(other.dims), voxel_size(other.voxel_size), trunc_dist(other.trunc_dist), max_weight(other.max_weight) {} + +__kf_device__ kfusion::device::TsdfVolume::elem_type* kfusion::device::TsdfVolume::operator()(int x, int y, int z) +{ return data + x + y*dims.x + z*dims.y*dims.x; } + +__kf_device__ const kfusion::device::TsdfVolume::elem_type* kfusion::device::TsdfVolume::operator() (int x, int y, int z) const +{ return data + x + y*dims.x + z*dims.y*dims.x; } + +__kf_device__ kfusion::device::TsdfVolume::elem_type* kfusion::device::TsdfVolume::beg(int x, int y) const +{ return data + x + dims.x * y; } + +__kf_device__ kfusion::device::TsdfVolume::elem_type* kfusion::device::TsdfVolume::zstep(elem_type *const ptr) const +{ return ptr + dims.x * dims.y; } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Projector + +__kf_device__ float2 kfusion::device::Projector::operator()(const float3& p) const +{ + float2 coo; + coo.x = __fmaf_rn(f.x, __fdividef(p.x, p.z), c.x); + coo.y = __fmaf_rn(f.y, __fdividef(p.y, p.z), c.y); + return coo; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Reprojector + +__kf_device__ float3 kfusion::device::Reprojector::operator()(int u, int v, float z) const +{ + float x = z * (u - c.x) * finv.x; + float y = z * (v - c.y) * finv.y; + return make_float3(x, y, z); +} + 
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// packing/unpacking tsdf volume element + +__kf_device__ ushort2 kfusion::device::pack_tsdf (float tsdf, int weight) +{ return make_ushort2 (__float2half_rn (tsdf), weight); } + +__kf_device__ float kfusion::device::unpack_tsdf(ushort2 value, int& weight) +{ + weight = value.y; + return __half2float (value.x); +} +__kf_device__ float kfusion::device::unpack_tsdf (ushort2 value) { return __half2float (value.x); } + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Utility + +namespace kfusion +{ + namespace device + { + __kf_device__ Vec3f operator*(const Mat3f& m, const Vec3f& v) + { return make_float3(dot(m.data[0], v), dot (m.data[1], v), dot (m.data[2], v)); } + + __kf_device__ Vec3f operator*(const Aff3f& a, const Vec3f& v) { return a.R * v + a.t; } + + __kf_device__ Vec3f tr(const float4& v) { return make_float3(v.x, v.y, v.z); } + + struct plus + { + __kf_device__ float operator () (float l, float r) const { return l + r; } + __kf_device__ double operator () (double l, double r) const { return l + r; } + }; + + struct gmem + { + template __kf_device__ static T LdCs(T *ptr); + template __kf_device__ static void StCs(const T& val, T *ptr); + }; + + template<> __kf_device__ ushort2 gmem::LdCs(ushort2* ptr); + template<> __kf_device__ void gmem::StCs(const ushort2& val, ushort2* ptr); + } +} + + +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 200 + + #if defined(_WIN64) || defined(__LP64__) + #define _ASM_PTR_ "l" + #else + #define _ASM_PTR_ "r" + #endif + + template<> __kf_device__ ushort2 kfusion::device::gmem::LdCs(ushort2* ptr) + { + ushort2 val; + asm("ld.global.cs.v2.u16 {%0, %1}, [%2];" : "=h"(reinterpret_cast(val.x)), "=h"(reinterpret_cast(val.y)) : _ASM_PTR_(ptr)); + return val; + } + + template<> __kf_device__ void kfusion::device::gmem::StCs(const ushort2& val, ushort2* ptr) + { + short cx = val.x, cy = val.y; + asm("st.global.cs.v2.u16 [%0], {%1, %2};" : : _ASM_PTR_(ptr), "h"(reinterpret_cast(cx)), "h"(reinterpret_cast(cy))); + } + #undef _ASM_PTR_ + +#else + template<> __kf_device__ ushort2 kfusion::device::gmem::LdCs(ushort2* ptr) { return *ptr; } + template<> __kf_device__ void kfusion::device::gmem::StCs(const ushort2& val, ushort2* ptr) { *ptr = val; } +#endif + + + + + + diff --git a/modules/dynamicfusion/kfusion/src/cuda/imgproc.cu b/modules/dynamicfusion/kfusion/src/cuda/imgproc.cu new file mode 100644 index 00000000000..511b629f09b --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/cuda/imgproc.cu @@ -0,0 +1,622 @@ +#include +#include "device.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Depth bilateral filter + +namespace kfusion +{ + namespace device + { + __global__ void bilateral_kernel(const PtrStepSz src, PtrStep dst, const int ksz, const float sigma_spatial2_inv_half, const float sigma_depth2_inv_half) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x >= src.cols || y >= src.rows) + return; + + int value = src(y, x); + + int tx = min (x - ksz / 2 + ksz, src.cols - 1); + int ty = min (y - ksz / 2 + ksz, src.rows - 1); + + float sum1 = 0; + float sum2 = 0; + + for (int cy = max (y - ksz / 2, 0); cy < ty; ++cy) + { + for (int cx = max (x - ksz / 2, 0); cx < tx; ++cx) + { + int depth = src(cy, cx); + + float space2 = (x - cx) * (x - cx) + 
(y - cy) * (y - cy); + float color2 = (value - depth) * (value - depth); + + float weight = __expf (-(space2 * sigma_spatial2_inv_half + color2 * sigma_depth2_inv_half)); + + sum1 += depth * weight; + sum2 += weight; + } + } + dst(y, x) = __float2int_rn (sum1 / sum2); + } + } +} + +void kfusion::device::bilateralFilter (const Depth& src, Depth& dst, int kernel_size, float sigma_spatial, float sigma_depth) +{ + sigma_depth *= 1000; // meters -> mm + + dim3 block (32, 8); + dim3 grid (divUp (src.cols (), block.x), divUp (src.rows (), block.y)); + + cudaSafeCall( cudaFuncSetCacheConfig (bilateral_kernel, cudaFuncCachePreferL1) ); + bilateral_kernel<<>>(src, dst, kernel_size, 0.5f / (sigma_spatial * sigma_spatial), 0.5f / (sigma_depth * sigma_depth)); + cudaSafeCall ( cudaGetLastError () ); +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Depth truncation + +namespace kfusion +{ + namespace device + { + __global__ void truncate_depth_kernel(PtrStepSz depth, ushort max_dist /*mm*/) + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x < depth.cols && y < depth.rows) + if(depth(y, x) > max_dist) + depth(y, x) = 0; + } + } +} + +void kfusion::device::truncateDepth(Depth& depth, float max_dist /*meters*/) +{ + dim3 block (32, 8); + dim3 grid (divUp (depth.cols (), block.x), divUp (depth.rows (), block.y)); + + truncate_depth_kernel<<>>(depth, static_cast(max_dist * 1000.f)); + cudaSafeCall ( cudaGetLastError() ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Build depth pyramid + +namespace kfusion +{ + namespace device + { + __global__ void pyramid_kernel(const PtrStepSz src, PtrStepSz dst, float sigma_depth_mult3) + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= dst.cols || y >= dst.rows) + return; + + const int D = 5; + int center = src(2 * y, 2 * x); + + int tx = min (2 * x - D / 2 + D, src.cols - 1); + int ty = min (2 * y - D / 2 + D, src.rows - 1); + int cy = max (0, 2 * y - D / 2); + + int sum = 0; + int count = 0; + + for (; cy < ty; ++cy) + for (int cx = max (0, 2 * x - D / 2); cx < tx; ++cx) + { + int val = src(cy, cx); + if (abs (val - center) < sigma_depth_mult3) + { + sum += val; + ++count; + } + } + dst(y, x) = (count == 0) ? 
0 : sum / count; + } + } +} + +void kfusion::device::depthPyr(const Depth& source, Depth& pyramid, float sigma_depth) +{ + sigma_depth *= 1000; // meters -> mm + + dim3 block (32, 8); + dim3 grid (divUp(pyramid.cols(), block.x), divUp(pyramid.rows(), block.y)); + + pyramid_kernel<<>>(source, pyramid, sigma_depth * 3); + cudaSafeCall ( cudaGetLastError () ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Compute normals + +namespace kfusion +{ + namespace device + { + __global__ void compute_normals_kernel(const PtrStepSz depth, const Reprojector reproj, PtrStep normals) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x >= depth.cols || y >= depth.rows) + return; + + const float qnan = numeric_limits::quiet_NaN (); + + Normal n_out = make_float4(qnan, qnan, qnan, 0.f); + + if (x < depth.cols - 1 && y < depth.rows - 1) + { + //mm -> meters + float z00 = depth(y, x) * 0.001f; + float z01 = depth(y, x+1) * 0.001f; + float z10 = depth(y+1, x) * 0.001f; + + if (z00 * z01 * z10 != 0) + { + float3 v00 = reproj(x, y, z00); + float3 v01 = reproj(x+1, y, z01); + float3 v10 = reproj(x, y+1, z10); + + float3 n = normalized( cross (v01 - v00, v10 - v00) ); + n_out = make_float4(-n.x, -n.y, -n.z, 0.f); + } + } + normals(y, x) = n_out; + } + + __global__ void mask_depth_kernel(const PtrStep normals, PtrStepSz depth) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x < depth.cols || y < depth.rows) + { + float4 n = normals(y, x); + if (isnan(n.x)) + depth(y, x) = 0; + } + } + } +} + +void kfusion::device::computeNormalsAndMaskDepth(const Reprojector& reproj, Depth& depth, Normals& normals) +{ + dim3 block (32, 8); + dim3 grid (divUp (depth.cols (), block.x), divUp (depth.rows (), block.y)); + + compute_normals_kernel<<>>(depth, reproj, normals); + cudaSafeCall ( cudaGetLastError () ); + + mask_depth_kernel<<>>(normals, depth); + cudaSafeCall ( cudaGetLastError () ); +} +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Compute computePointNormals + +namespace kfusion +{ + namespace device + { + __global__ void points_normals_kernel(const Reprojector reproj, const PtrStepSz depth, PtrStep points, PtrStep normals) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x >= depth.cols || y >= depth.rows) + return; + + const float qnan = numeric_limits::quiet_NaN (); + points(y, x) = normals(y, x) = make_float4(qnan, qnan, qnan, qnan); + + if (x >= depth.cols - 1 || y >= depth.rows - 1) + return; + + //mm -> meters + float z00 = depth(y, x) * 0.001f; + float z01 = depth(y, x+1) * 0.001f; + float z10 = depth(y+1, x) * 0.001f; + + if (z00 * z01 * z10 != 0) + { + float3 v00 = reproj(x, y, z00); + float3 v01 = reproj(x+1, y, z01); + float3 v10 = reproj(x, y+1, z10); + + float3 n = normalized( cross (v01 - v00, v10 - v00) ); + normals(y, x) = make_float4(-n.x, -n.y, -n.z, 0.f); + points(y, x) = make_float4(v00.x, v00.y, v00.z, 0.f); + } + } + } +} + +void kfusion::device::computePointNormals(const Reprojector& reproj, const Depth& depth, Points& points, Normals& normals) +{ + dim3 block (32, 8); + dim3 grid (divUp (depth.cols (), block.x), divUp (depth.rows (), block.y)); + + points_normals_kernel<<>>(reproj, depth, points, normals); + cudaSafeCall ( cudaGetLastError () ); +} + 
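
Both normal-estimation kernels above follow the same recipe: reproject the pixel and its right and bottom neighbours to 3-D, take the cross product of the two finite-difference vectors, normalise, and flip the sign so the normal points back towards the camera. A CPU reference of that per-pixel step, as a sketch under the assumption that the three neighbouring depths are valid and already reprojected (helper names are illustrative, not part of the module's API):

```cpp
#include <cmath>

struct Vec3 { float x, y, z; };

inline Vec3 sub(const Vec3& a, const Vec3& b)   { return {a.x - b.x, a.y - b.y, a.z - b.z}; }
inline Vec3 cross(const Vec3& a, const Vec3& b) { return {a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x}; }
inline Vec3 normalized(const Vec3& v)
{
    float inv = 1.f / std::sqrt(v.x*v.x + v.y*v.y + v.z*v.z);
    return {v.x*inv, v.y*inv, v.z*inv};
}

// Surface normal at pixel (x, y) from the reprojected positions of the pixel itself (v00),
// its right neighbour (v01) and its bottom neighbour (v10), mirroring points_normals_kernel.
// Callers should check the three depths are non-zero before calling.
inline Vec3 normalFromNeighbours(const Vec3& v00, const Vec3& v01, const Vec3& v10)
{
    Vec3 n = normalized(cross(sub(v01, v00), sub(v10, v00)));
    return {-n.x, -n.y, -n.z};   // flip so the normal faces the sensor
}
```

Pixels whose own depth or whose neighbours' depths are zero keep quiet-NaN normals, which is exactly what mask_depth_kernel uses afterwards to zero out unreliable depth samples.
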
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Compute dists + +namespace kfusion +{ + namespace device + { + __global__ void compute_dists_kernel(const PtrStepSz depth, Dists dists, float2 finv, float2 c) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x < depth.cols || y < depth.rows) + { + float xl = (x - c.x) * finv.x; + float yl = (y - c.y) * finv.y; + float lambda = sqrtf (xl * xl + yl * yl + 1); + + dists(y, x) = __float2half_rn(depth(y, x) * lambda * 0.001f); //meters + } + } + + __global__ void cloud_to_depth_kernel(const PtrStep cloud, PtrStepSz depth) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x < depth.cols || y < depth.rows) + { + depth(y, x) = cloud(y, x).z * 1000; //meters + } + } + } +} + +void kfusion::device::compute_dists(const Depth& depth, Dists dists, float2 f, float2 c) +{ + dim3 block (32, 8); + dim3 grid (divUp (depth.cols (), block.x), divUp (depth.rows (), block.y)); + + compute_dists_kernel<<>>(depth, dists, make_float2(1.f/f.x, 1.f/f.y), c); + cudaSafeCall ( cudaGetLastError () ); +} + +void kfusion::device::cloud_to_depth(const Points& cloud, Depth depth) +{ + dim3 block (32, 8); + dim3 grid (divUp (cloud.cols (), block.x), divUp (cloud.rows (), block.y)); + + cloud_to_depth_kernel<<>>(cloud, depth); + cudaSafeCall ( cudaGetLastError () ); +} + +namespace kfusion +{ + namespace device + { + __global__ void resize_depth_normals_kernel(const PtrStep dsrc, const PtrStep nsrc, PtrStepSz ddst, PtrStep ndst) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x >= ddst.cols || y >= ddst.rows) + return; + + const float qnan = numeric_limits::quiet_NaN (); + + ushort d = 0; + float4 n = make_float4(qnan, qnan, qnan, qnan); + + int xs = x * 2; + int ys = y * 2; + + int d00 = dsrc(ys+0, xs+0); + int d01 = dsrc(ys+0, xs+1); + int d10 = dsrc(ys+1, xs+0); + int d11 = dsrc(ys+1, xs+1); + + if (d00 * d01 != 0 && d10 * d11 != 0) + { + d = (d00 + d01 + d10 + d11)/4; + + float4 n00 = nsrc(ys+0, xs+0); + float4 n01 = nsrc(ys+0, xs+1); + float4 n10 = nsrc(ys+1, xs+0); + float4 n11 = nsrc(ys+1, xs+1); + + n.x = (n00.x + n01.x + n10.x + n11.x)*0.25; + n.y = (n00.y + n01.y + n10.y + n11.y)*0.25; + n.z = (n00.z + n01.z + n10.z + n11.z)*0.25; + } + ddst(y, x) = d; + ndst(y, x) = n; + } + } +} + +void kfusion::device::resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out) +{ + int in_cols = depth.cols (); + int in_rows = depth.rows (); + + int out_cols = in_cols / 2; + int out_rows = in_rows / 2; + + dim3 block (32, 8); + dim3 grid (divUp (out_cols, block.x), divUp (out_rows, block.y)); + + resize_depth_normals_kernel<<>>(depth, normals, depth_out, normals_out); + cudaSafeCall ( cudaGetLastError () ); +} + +namespace kfusion +{ + namespace device + { + __global__ void resize_points_normals_kernel(const PtrStep vsrc, const PtrStep nsrc, PtrStepSz vdst, PtrStep ndst) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x >= vdst.cols || y >= vdst.rows) + return; + + const float qnan = numeric_limits::quiet_NaN (); + vdst(y, x) = ndst(y, x) = make_float4(qnan, qnan, qnan, 0.f); + + int xs = x * 2; + int ys = y * 2; + + float3 d00 = tr(vsrc(ys+0, xs+0)); + float3 d01 = tr(vsrc(ys+0, xs+1)); + float3 d10 = tr(vsrc(ys+1, xs+0)); + 
float3 d11 = tr(vsrc(ys+1, xs+1)); + + if (!isnan(d00.x * d01.x * d10.x * d11.x)) + { + float3 d = (d00 + d01 + d10 + d11) * 0.25f; + vdst(y, x) = make_float4(d.x, d.y, d.z, 0.f); + + float3 n00 = tr(nsrc(ys+0, xs+0)); + float3 n01 = tr(nsrc(ys+0, xs+1)); + float3 n10 = tr(nsrc(ys+1, xs+0)); + float3 n11 = tr(nsrc(ys+1, xs+1)); + + float3 n = (n00 + n01 + n10 + n11)*0.25f; + ndst(y, x) = make_float4(n.x, n.y, n.z, 0.f); + } + } + } +} + +void kfusion::device::resizePointsNormals(const Points& points, const Normals& normals, Points& points_out, Normals& normals_out) +{ + int out_cols = points.cols () / 2; + int out_rows = points.rows () / 2; + + dim3 block (32, 8); + dim3 grid (divUp (out_cols, block.x), divUp (out_rows, block.y)); + + resize_points_normals_kernel<<>>(points, normals, points_out, normals_out); + cudaSafeCall ( cudaGetLastError () ); +} + +namespace kfusion +{ + namespace device + { + __global__ void render_image_kernel(const PtrStep depth, const PtrStep normals, + const Reprojector reproj, const float3 light_pose, PtrStepSz dst) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x >= dst.cols || y >= dst.rows) + return; + + float3 color; + + int d = depth(y,x); + + if (d == 0) + { + const float3 bgr1 = make_float3(4.f/255.f, 2.f/255.f, 2.f/255.f); + const float3 bgr2 = make_float3(236.f/255.f, 120.f/255.f, 120.f/255.f); + + float w = static_cast(y) / dst.rows; + color = bgr1 * (1 - w) + bgr2 * w; + } + else + { + float3 P = reproj(x, y, d * 0.001f); + float3 N = tr(normals(y,x)); + + const float Ka = 0.3f; //ambient coeff + const float Kd = 0.5f; //diffuse coeff + const float Ks = 0.2f; //specular coeff + const float n = 20.f; //specular power + + const float Ax = 1.f; //ambient color, can be RGB + const float Dx = 1.f; //diffuse color, can be RGB + const float Sx = 1.f; //specular color, can be RGB + const float Lx = 1.f; //light color + + //Ix = Ax*Ka*Dx + Att*Lx [Kd*Dx*(N dot L) + Ks*Sx*(R dot V)^n] + + float3 L = normalized(light_pose - P); + float3 V = normalized(make_float3(0.f, 0.f, 0.f) - P); + float3 R = normalized(2 * N * dot(N, L) - L); + + float Ix = Ax*Ka*Dx + Lx * Kd * Dx * fmax(0.f, dot(N, L)) + Lx * Ks * Sx * __powf(fmax(0.f, dot(R, V)), n); + color = make_float3(Ix, Ix, Ix); + } + + uchar4 out; + out.x = static_cast(__saturatef(color.x) * 255.f); + out.y = static_cast(__saturatef(color.y) * 255.f); + out.z = static_cast(__saturatef(color.z) * 255.f); + out.w = 0; + dst(y, x) = out; + } + + __global__ void render_image_kernel(const PtrStep points, const PtrStep normals, + const Reprojector reproj, const float3 light_pose, PtrStepSz dst) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x >= dst.cols || y >= dst.rows) + return; + + float3 color; + + float3 p = tr(points(y,x)); + + if (isnan(p.x)) + { + const float3 bgr1 = make_float3(4.f/255.f, 2.f/255.f, 2.f/255.f); + const float3 bgr2 = make_float3(236.f/255.f, 120.f/255.f, 120.f/255.f); + + float w = static_cast(y) / dst.rows; + color = bgr1 * (1 - w) + bgr2 * w; + } + else + { + float3 P = p; + float3 N = tr(normals(y,x)); + + const float Ka = 0.3f; //ambient coeff + const float Kd = 0.5f; //diffuse coeff + const float Ks = 0.2f; //specular coeff + const float n = 20.f; //specular power + + const float Ax = 1.f; //ambient color, can be RGB + const float Dx = 1.f; //diffuse color, can be RGB + const float Sx = 1.f; //specular color, can be RGB + const float Lx = 1.f; //light color + + //Ix 
= Ax*Ka*Dx + Att*Lx [Kd*Dx*(N dot L) + Ks*Sx*(R dot V)^n] + + float3 L = normalized(light_pose - P); + float3 V = normalized(make_float3(0.f, 0.f, 0.f) - P); + float3 R = normalized(2 * N * dot(N, L) - L); + + float Ix = Ax*Ka*Dx + Lx * Kd * Dx * fmax(0.f, dot(N, L)) + Lx * Ks * Sx * __powf(fmax(0.f, dot(R, V)), n); + color = make_float3(Ix, Ix, Ix); + } + + uchar4 out; + out.x = static_cast(__saturatef(color.x) * 255.f); + out.y = static_cast(__saturatef(color.y) * 255.f); + out.z = static_cast(__saturatef(color.z) * 255.f); + out.w = 0; + dst(y, x) = out; + } + } +} + +void kfusion::device::renderImage(const Depth& depth, const Normals& normals, const Reprojector& reproj, const float3& light_pose, Image& image) +{ + dim3 block (32, 8); + dim3 grid (divUp (depth.cols(), block.x), divUp (depth.rows(), block.y)); + + render_image_kernel<<>>((PtrStep)depth, normals, reproj, light_pose, image); + cudaSafeCall ( cudaGetLastError () ); +} + +void kfusion::device::renderImage(const Points& points, const Normals& normals, const Reprojector& reproj, const Vec3f& light_pose, Image& image) +{ + dim3 block (32, 8); + dim3 grid (divUp (points.cols(), block.x), divUp (points.rows(), block.y)); + + render_image_kernel<<>>((PtrStep)points, normals, reproj, light_pose, image); + cudaSafeCall ( cudaGetLastError () ); +} + +namespace kfusion +{ + namespace device + { + __global__ void tangent_colors_kernel(PtrStepSz normals, PtrStep colors) + { + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x >= normals.cols || y >= normals.rows) + return; + + float4 n = normals(y, x); + + #if 0 + unsigned char r = static_cast(__saturatef((-n.x + 1.f)/2.f) * 255.f); + unsigned char g = static_cast(__saturatef((-n.y + 1.f)/2.f) * 255.f); + unsigned char b = static_cast(__saturatef((-n.z + 1.f)/2.f) * 255.f); + #else + unsigned char r = static_cast((5.f - n.x * 3.5f) * 25.5f); + unsigned char g = static_cast((5.f - n.y * 2.5f) * 25.5f); + unsigned char b = static_cast((5.f - n.z * 3.5f) * 25.5f); + #endif + colors(y, x) = make_uchar4(b, g, r, 0); + } + } +} + +void kfusion::device::renderTangentColors(const Normals& normals, Image& image) +{ + dim3 block (32, 8); + dim3 grid (divUp (normals.cols(), block.x), divUp (normals.rows(), block.y)); + + tangent_colors_kernel<<>>(normals, image); + cudaSafeCall ( cudaGetLastError () ); +} + + +namespace kfusion +{ + namespace device + { + __global__ void mergePointNormalKernel (const Point* cloud, const float8* normals, PtrSz output) + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx < output.size) + { + float4 p = cloud[idx]; + float8 n = normals[idx]; + + float12 o; + o.x = p.x; + o.y = p.y; + o.z = p.z; + + o.normal_x = n.x; + o.normal_y = n.y; + o.normal_z = n.z; + + output.data[idx] = o; + } + } + } +} + +void kfusion::device::mergePointNormal (const DeviceArray& cloud, const DeviceArray& normals, const DeviceArray& output) +{ + const int block = 256; + int total = (int)output.size (); + + mergePointNormalKernel<<>>(cloud, normals, output); + cudaSafeCall ( cudaGetLastError () ); + cudaSafeCall (cudaDeviceSynchronize ()); +} diff --git a/modules/dynamicfusion/kfusion/src/cuda/proj_icp.cu b/modules/dynamicfusion/kfusion/src/cuda/proj_icp.cu new file mode 100644 index 00000000000..159ab4e3b04 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/cuda/proj_icp.cu @@ -0,0 +1,476 @@ +#include "device.hpp" +#include "texture_binder.hpp" + + +namespace kfusion +{ + namespace device + { + texture dprev_tex; + 
texture nprev_tex; + texture vprev_tex; + + struct ComputeIcpHelper::Policy + { + enum + { + CTA_SIZE_X = 32, + CTA_SIZE_Y = 8, + CTA_SIZE = CTA_SIZE_X * CTA_SIZE_Y, + + B = 6, COLS = 6, ROWS = 6, DIAG = 6, + UPPER_DIAG_MAT = (COLS * ROWS - DIAG) / 2 + DIAG, + TOTAL = UPPER_DIAG_MAT + B, + + FINAL_REDUCE_CTA_SIZE = 256, + FINAL_REDUCE_STRIDE = FINAL_REDUCE_CTA_SIZE + }; + }; + + __kf_device__ + float2 ComputeIcpHelper::proj(const float3& p) const + { + float2 coo; + coo.x = __fmaf_rn(f.x, __fdividef(p.x, p.z), c.x); + coo.y = __fmaf_rn(f.y, __fdividef(p.y, p.z), c.y); + return coo; + } + + __kf_device__ + float3 ComputeIcpHelper::reproj(float u, float v, float z) const + { + float x = z * (u - c.x) * finv.x; + float y = z * (v - c.y) * finv.y; + return make_float3(x, y, z); + } + +#if defined USE_DEPTH + __kf_device__ + int ComputeIcpHelper::find_coresp(int x, int y, float3& nd, float3& d, float3& s) const + { + int src_z = dcurr(y, x); + if (src_z == 0) + return 40; + + s = aff * reproj(x, y, src_z * 0.001f); + + float2 coo = proj(s); + if (s.z <= 0 || coo.x < 0 || coo.y < 0 || coo.x >= cols || coo.y >= rows) + return 80; + + int dst_z = tex2D(dprev_tex, coo.x, coo.y); + if (dst_z == 0) + return 120; + + d = reproj(coo.x, coo.y, dst_z * 0.001f); + + float dist2 = norm_sqr(s - d); + if (dist2 > dist2_thres) + return 160; + + float3 ns = aff.R * tr(ncurr(y, x)); + nd = tr(tex2D(nprev_tex, coo.x, coo.y)); + + float cosine = fabs(dot(ns, nd)); + if (cosine < min_cosine) + return 200; + return 0; + } +#else + __kf_device__ + int ComputeIcpHelper::find_coresp(int x, int y, float3& nd, float3& d, float3& s) const + { + s = tr(vcurr(y, x)); + if (isnan(s.x)) + return 40; + + s = aff * s; + + float2 coo = proj(s); + if (s.z <= 0 || coo.x < 0 || coo.y < 0 || coo.x >= cols || coo.y >= rows) + return 80; + + d = tr(tex2D(vprev_tex, coo.x, coo.y)); + if (isnan(d.x)) + return 120; + + float dist2 = norm_sqr(s - d); + if (dist2 > dist2_thres) + return 160; + + float3 ns = aff.R * tr(ncurr(y, x)); + nd = tr(tex2D(nprev_tex, coo.x, coo.y)); + + float cosine = fabs(dot(ns, nd)); + if (cosine < min_cosine) + return 200; + return 0; + } +#endif + + __kf_device__ + void ComputeIcpHelper::partial_reduce(const float row[7], PtrStep& partial_buf) const + { + volatile __shared__ float smem[Policy::CTA_SIZE]; + int tid = Block::flattenedThreadId (); + + float *pos = partial_buf.data + blockIdx.x + gridDim.x * blockIdx.y; + size_t step = partial_buf.step / sizeof(float); + +#define STOR \ + if (tid == 0) \ + { \ + *pos = smem[0]; \ + pos += step; \ + } + + + __syncthreads (); + smem[tid] = row[0] * row[0]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[0] * row[1]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[0] * row[2]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[0] * row[3]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[0] * row[4]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + __syncthreads (); + smem[tid] = row[0] * row[5]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[0] * row[6]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR +//////////////////////////////// + + __syncthreads (); + smem[tid] = row[1] * row[1]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads 
(); + smem[tid] = row[1] * row[2]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[1] * row[3]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[1] * row[4]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[1] * row[5]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[1] * row[6]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR +//////////////////////////////// + + __syncthreads (); + smem[tid] = row[2] * row[2]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[2] * row[3]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[2] * row[4]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[2] * row[5]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[2] * row[6]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR +//////////////////////////////// + + __syncthreads (); + smem[tid] = row[3] * row[3]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[3] * row[4]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[3] * row[5]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[3] * row[6]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR +/////////////////////////////////////////////////// + + __syncthreads (); + smem[tid] = row[4] * row[4]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[4] * row[5]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[4] * row[6]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + +/////////////////////////////////////////////////// + + __syncthreads (); + smem[tid] = row[5] * row[5]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + + __syncthreads (); + smem[tid] = row[5] * row[6]; + __syncthreads (); + + Block::reduce(smem, plus ()); + + STOR + } + + __global__ void icp_helper_kernel(const ComputeIcpHelper helper, PtrStep partial_buf) + { + int x = threadIdx.x + blockIdx.x * ComputeIcpHelper::Policy::CTA_SIZE_X; + int y = threadIdx.y + blockIdx.y * ComputeIcpHelper::Policy::CTA_SIZE_Y; + + float3 n, d, s; + int filtered = (x < helper.cols && y < helper.rows) ? 
helper.find_coresp (x, y, n, d, s) : 1; + //if (x < helper.cols && y < helper.rows) mask(y, x) = filtered; + + float row[7]; + + if (!filtered) + { + *(float3*)&row[0] = cross (s, n); + *(float3*)&row[3] = n; + row[6] = dot (n, d - s); + } + else + row[0] = row[1] = row[2] = row[3] = row[4] = row[5] = row[6] = 0.f; + + helper.partial_reduce(row, partial_buf); + } + + __global__ void icp_final_reduce_kernel(const PtrStep partial_buf, const int length, float* final_buf) + { + const float *beg = partial_buf.ptr(blockIdx.x); + const float *end = beg + length; + + int tid = threadIdx.x; + + float sum = 0.f; + for (const float *t = beg + tid; t < end; t += ComputeIcpHelper::Policy::FINAL_REDUCE_STRIDE) + sum += *t; + + __shared__ float smem[ComputeIcpHelper::Policy::FINAL_REDUCE_CTA_SIZE]; + + smem[tid] = sum; + + __syncthreads(); + + Block::reduce(smem, plus()); + + if (tid == 0) + final_buf[blockIdx.x] = smem[0]; + } + } +} + +void kfusion::device::ComputeIcpHelper::operator()(const Depth& dprev, const Normals& nprev, DeviceArray2D& buffer, float* data, cudaStream_t s) +{ + dprev_tex.filterMode = cudaFilterModePoint; + nprev_tex.filterMode = cudaFilterModePoint; + TextureBinder dprev_binder(dprev, dprev_tex); + TextureBinder nprev_binder(nprev, nprev_tex); + + dim3 block(Policy::CTA_SIZE_X, Policy::CTA_SIZE_Y); + dim3 grid(divUp ((int)cols, block.x), divUp ((int)rows, block.y)); + + int partials_count = (int)(grid.x * grid.y); + allocate_buffer(buffer, partials_count); + + icp_helper_kernel<<>>(*this, buffer); + cudaSafeCall ( cudaGetLastError () ); + + int b = Policy::FINAL_REDUCE_CTA_SIZE; + int g = Policy::TOTAL; + icp_final_reduce_kernel<<>>(buffer, partials_count, buffer.ptr(Policy::TOTAL)); + cudaSafeCall ( cudaGetLastError () ); + + cudaSafeCall ( cudaMemcpyAsync(data, buffer.ptr(Policy::TOTAL), Policy::TOTAL * sizeof(float), cudaMemcpyDeviceToHost, s) ); + cudaSafeCall ( cudaGetLastError () ); +} + +void kfusion::device::ComputeIcpHelper::operator()(const Points& vprev, const Normals& nprev, DeviceArray2D& buffer, float* data, cudaStream_t s) +{ + dprev_tex.filterMode = cudaFilterModePoint; + nprev_tex.filterMode = cudaFilterModePoint; + TextureBinder vprev_binder(vprev, vprev_tex); + TextureBinder nprev_binder(nprev, nprev_tex); + + dim3 block(Policy::CTA_SIZE_X, Policy::CTA_SIZE_Y); + dim3 grid(divUp ((int)cols, block.x), divUp ((int)rows, block.y)); + + int partials_count = (int)(grid.x * grid.y); + allocate_buffer(buffer, partials_count); + + icp_helper_kernel<<>>(*this, buffer); + cudaSafeCall ( cudaGetLastError () ); + + int b = Policy::FINAL_REDUCE_CTA_SIZE; + int g = Policy::TOTAL; + icp_final_reduce_kernel<<>>(buffer, partials_count, buffer.ptr(Policy::TOTAL)); + cudaSafeCall ( cudaGetLastError () ); + + cudaSafeCall ( cudaMemcpyAsync(data, buffer.ptr(Policy::TOTAL), Policy::TOTAL * sizeof(float), cudaMemcpyDeviceToHost, s) ); + cudaSafeCall ( cudaGetLastError () ); +} + + +void kfusion::device::ComputeIcpHelper::allocate_buffer(DeviceArray2D& buffer, int partials_count) +{ + if (partials_count < 0) + { + const int input_cols = 640; + const int input_rows = 480; + + int gx = divUp (input_cols, Policy::CTA_SIZE_X); + int gy = divUp (input_rows, Policy::CTA_SIZE_Y); + + partials_count = gx * gy; + } + + int min_rows = Policy::TOTAL + 1; + int min_cols = max(partials_count, Policy::TOTAL); + + if (buffer.rows() < min_rows || buffer.cols() < min_cols) + buffer.create (min_rows, min_cols); +} + 
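
The reduction above accumulates, per ICP iteration, the standard small-angle linearisation of point-to-plane ICP: each valid correspondence contributes a row [cross(s, n), n | n·(d − s)], and partial_reduce sums row[i]·row[j] for 0 ≤ i ≤ 5, i ≤ j ≤ 6, i.e. the 27 values of Policy::TOTAL (21 for the upper triangle of the 6×6 matrix plus 6 right-hand-side entries). Reading the order of the STOR blocks, the host side can rebuild the symmetric system A·x = b and solve for the 6-DoF update roughly as follows; this is a sketch under the assumption that the buffer layout matches that loop order (the real unpacking lives elsewhere in the module, and the function name here is hypothetical):

```cpp
#include <opencv2/core.hpp>

// data holds the 27 floats copied back by ComputeIcpHelper::operator(): for
// i = 0..5, j = i..6 it stores sum(row[i] * row[j]); the j == 6 column is the
// point-to-plane residual n . (d - s). This layout is inferred from the STOR
// order in partial_reduce, not a documented interface.
inline bool solveIcpStep(const float data[27], cv::Mat& twist /* 6x1, CV_64F */)
{
    cv::Mat A(6, 6, CV_64F), b(6, 1, CV_64F);
    int k = 0;
    for (int i = 0; i < 6; ++i)
        for (int j = i; j < 7; ++j, ++k)
        {
            if (j < 6)
            {
                A.at<double>(i, j) = data[k];   // upper triangle ...
                A.at<double>(j, i) = data[k];   // ... mirrored to keep A symmetric
            }
            else
                b.at<double>(i, 0) = data[k];   // right-hand-side entry
        }
    // Solve A * twist = b for the incremental motion (3 rotation + 3 translation
    // parameters); a near-singular A means this ICP iteration has too few constraints.
    return cv::solve(A, b, twist, cv::DECOMP_CHOLESKY);
}
```
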
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// ComputeIcpHelper::PageLockHelper + +kfusion::device::ComputeIcpHelper::PageLockHelper::PageLockHelper() : data(0) +{ cudaSafeCall( cudaMallocHost((void **)&data, Policy::TOTAL * sizeof(float)) ); } + +kfusion::device::ComputeIcpHelper::PageLockHelper::~PageLockHelper() +{ cudaSafeCall( cudaFreeHost(data) ); data = 0; } diff --git a/modules/dynamicfusion/kfusion/src/cuda/temp_utils.hpp b/modules/dynamicfusion/kfusion/src/cuda/temp_utils.hpp new file mode 100644 index 00000000000..993c3a25524 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/cuda/temp_utils.hpp @@ -0,0 +1,606 @@ +#pragma once + +#include "cuda.h" +#include + +namespace kfusion +{ + namespace device + { + template __kf_hdevice__ void swap(T& a, T& b) { T c(a); a=b; b=c; } + + template struct numeric_limits; + + template<> struct numeric_limits + { + __kf_device__ static float quiet_NaN() { return __int_as_float(0x7fffffff); /*CUDART_NAN_F*/ }; + __kf_device__ static float epsilon() { return 1.192092896e-07f/*FLT_EPSILON*/; }; + __kf_device__ static float min() { return 1.175494351e-38f/*FLT_MIN*/; }; + __kf_device__ static float max() { return 3.402823466e+38f/*FLT_MAX*/; }; + }; + + template<> struct numeric_limits + { + __kf_device__ static unsigned short max() { return USHRT_MAX; }; + }; + + __kf_device__ float dot(const float3& v1, const float3& v2) + { + return __fmaf_rn(v1.x, v2.x, __fmaf_rn(v1.y, v2.y, v1.z*v2.z)); + } + + __kf_device__ float3& operator+=(float3& vec, const float& v) + { + vec.x += v; vec.y += v; vec.z += v; return vec; + } + + __kf_device__ float3& operator+=(float3& v1, const float3& v2) + { + v1.x += v2.x; v1.y += v2.y; v1.z += v2.z; return v1; + } + + __kf_device__ float3 operator+(const float3& v1, const float3& v2) + { + return make_float3(v1.x + v2.x, v1.y + v2.y, v1.z + v2.z); + } + + __kf_device__ float3 operator*(const float3& v1, const float3& v2) + { + return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z); + } + + __kf_hdevice__ float3 operator*(const float3& v1, const int3& v2) + { + return make_float3(v1.x * v2.x, v1.y * v2.y, v1.z * v2.z); + } + + __kf_device__ float3 operator/(const float3& v1, const float3& v2) + { + return make_float3(v1.x / v2.x, v1.y / v2.y, v1.z / v2.z); + } + + __kf_hdevice__ float3 operator/(const float& v, const float3& vec) + { + return make_float3(v / vec.x, v / vec.y, v / vec.z); + } + + __kf_device__ float3& operator*=(float3& vec, const float& v) + { + vec.x *= v; vec.y *= v; vec.z *= v; return vec; + } + + __kf_device__ float3 operator-(const float3& v1, const float3& v2) + { + return make_float3(v1.x - v2.x, v1.y - v2.y, v1.z - v2.z); + } + + __kf_hdevice__ float3 operator*(const float3& v1, const float& v) + { + return make_float3(v1.x * v, v1.y * v, v1.z * v); + } + + __kf_hdevice__ float3 operator*(const float& v, const float3& v1) + { + return make_float3(v1.x * v, v1.y * v, v1.z * v); + } + + __kf_device__ float norm(const float3& v) + { + return sqrt(dot(v, v)); + } + + __kf_device__ float norm_sqr(const float3& v) + { + return dot(v, v); + } + + __kf_device__ float3 normalized(const float3& v) + { + return v * rsqrt(dot(v, v)); + } + + __kf_hdevice__ float3 cross(const float3& v1, const float3& v2) + { + return make_float3(v1.y * v2.z - v1.z * v2.y, v1.z * v2.x - v1.x * v2.z, v1.x * v2.y - v1.y * v2.x); + } + + __kf_device__ void computeRoots2(const float& b, const float& c, float3& roots) + { + roots.x = 0.f; + float d = b 
* b - 4.f * c; + if (d < 0.f) // no real roots!!!! THIS SHOULD NOT HAPPEN! + d = 0.f; + + float sd = sqrtf(d); + + roots.z = 0.5f * (b + sd); + roots.y = 0.5f * (b - sd); + } + + __kf_device__ void computeRoots3(float c0, float c1, float c2, float3& roots) + { + if ( fabsf(c0) < numeric_limits::epsilon())// one root is 0 -> quadratic equation + { + computeRoots2 (c2, c1, roots); + } + else + { + const float s_inv3 = 1.f/3.f; + const float s_sqrt3 = sqrtf(3.f); + // Construct the parameters used in classifying the roots of the equation + // and in solving the equation for the roots in closed form. + float c2_over_3 = c2 * s_inv3; + float a_over_3 = (c1 - c2*c2_over_3)*s_inv3; + if (a_over_3 > 0.f) + a_over_3 = 0.f; + + float half_b = 0.5f * (c0 + c2_over_3 * (2.f * c2_over_3 * c2_over_3 - c1)); + + float q = half_b * half_b + a_over_3 * a_over_3 * a_over_3; + if (q > 0.f) + q = 0.f; + + // Compute the eigenvalues by solving for the roots of the polynomial. + float rho = sqrtf(-a_over_3); + float theta = atan2f (sqrtf (-q), half_b)*s_inv3; + float cos_theta = __cosf (theta); + float sin_theta = __sinf (theta); + roots.x = c2_over_3 + 2.f * rho * cos_theta; + roots.y = c2_over_3 - rho * (cos_theta + s_sqrt3 * sin_theta); + roots.z = c2_over_3 - rho * (cos_theta - s_sqrt3 * sin_theta); + + // Sort in increasing order. + if (roots.x >= roots.y) + swap(roots.x, roots.y); + + if (roots.y >= roots.z) + { + swap(roots.y, roots.z); + + if (roots.x >= roots.y) + swap (roots.x, roots.y); + } + if (roots.x <= 0) // eigenval for symetric positive semi-definite matrix can not be negative! Set it to 0 + computeRoots2 (c2, c1, roots); + } + } + + struct Eigen33 + { + public: + template struct MiniMat + { + float3 data[Rows]; + __kf_hdevice__ float3& operator[](int i) { return data[i]; } + __kf_hdevice__ const float3& operator[](int i) const { return data[i]; } + }; + typedef MiniMat<3> Mat33; + typedef MiniMat<4> Mat43; + + + static __kf_device__ float3 unitOrthogonal (const float3& src) + { + float3 perp; + /* Let us compute the crossed product of *this with a vector + * that is not too close to being colinear to *this. + */ + + /* unless the x and y coords are both close to zero, we can + * simply take ( -y, x, 0 ) and normalize it. + */ + if(!isMuchSmallerThan(src.x, src.z) || !isMuchSmallerThan(src.y, src.z)) + { + float invnm = rsqrtf(src.x*src.x + src.y*src.y); + perp.x = -src.y * invnm; + perp.y = src.x * invnm; + perp.z = 0.0f; + } + /* if both x and y are close to zero, then the vector is close + * to the z-axis, so it's far from colinear to the x-axis for instance. + * So we take the crossed product with (1,0,0) and normalize it. + */ + else + { + float invnm = rsqrtf(src.z * src.z + src.y * src.y); + perp.x = 0.0f; + perp.y = -src.z * invnm; + perp.z = src.y * invnm; + } + + return perp; + } + + __kf_device__ Eigen33(volatile float* mat_pkg_arg) : mat_pkg(mat_pkg_arg) {} + __kf_device__ void compute(Mat33& tmp, Mat33& vec_tmp, Mat33& evecs, float3& evals) + { + // Scale the matrix so its entries are in [-1,1]. The scaling is applied + // only when at least one matrix entry has magnitude larger than 1. 
+ + float max01 = fmaxf( fabsf(mat_pkg[0]), fabsf(mat_pkg[1]) ); + float max23 = fmaxf( fabsf(mat_pkg[2]), fabsf(mat_pkg[3]) ); + float max45 = fmaxf( fabsf(mat_pkg[4]), fabsf(mat_pkg[5]) ); + float m0123 = fmaxf( max01, max23); + float scale = fmaxf( max45, m0123); + + if (scale <= numeric_limits::min()) + scale = 1.f; + + mat_pkg[0] /= scale; + mat_pkg[1] /= scale; + mat_pkg[2] /= scale; + mat_pkg[3] /= scale; + mat_pkg[4] /= scale; + mat_pkg[5] /= scale; + + // The characteristic equation is x^3 - c2*x^2 + c1*x - c0 = 0. The + // eigenvalues are the roots to this equation, all guaranteed to be + // real-valued, because the matrix is symmetric. + float c0 = m00() * m11() * m22() + + 2.f * m01() * m02() * m12() + - m00() * m12() * m12() + - m11() * m02() * m02() + - m22() * m01() * m01(); + float c1 = m00() * m11() - + m01() * m01() + + m00() * m22() - + m02() * m02() + + m11() * m22() - + m12() * m12(); + float c2 = m00() + m11() + m22(); + + computeRoots3(c0, c1, c2, evals); + + if(evals.z - evals.x <= numeric_limits::epsilon()) + { + evecs[0] = make_float3(1.f, 0.f, 0.f); + evecs[1] = make_float3(0.f, 1.f, 0.f); + evecs[2] = make_float3(0.f, 0.f, 1.f); + } + else if (evals.y - evals.x <= numeric_limits::epsilon() ) + { + // first and second equal + tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2(); + tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z; + + vec_tmp[0] = cross(tmp[0], tmp[1]); + vec_tmp[1] = cross(tmp[0], tmp[2]); + vec_tmp[2] = cross(tmp[1], tmp[2]); + + float len1 = dot (vec_tmp[0], vec_tmp[0]); + float len2 = dot (vec_tmp[1], vec_tmp[1]); + float len3 = dot (vec_tmp[2], vec_tmp[2]); + + if (len1 >= len2 && len1 >= len3) + { + evecs[2] = vec_tmp[0] * rsqrtf (len1); + } + else if (len2 >= len1 && len2 >= len3) + { + evecs[2] = vec_tmp[1] * rsqrtf (len2); + } + else + { + evecs[2] = vec_tmp[2] * rsqrtf (len3); + } + + evecs[1] = unitOrthogonal(evecs[2]); + evecs[0] = cross(evecs[1], evecs[2]); + } + else if (evals.z - evals.y <= numeric_limits::epsilon() ) + { + // second and third equal + tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2(); + tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x; + + vec_tmp[0] = cross(tmp[0], tmp[1]); + vec_tmp[1] = cross(tmp[0], tmp[2]); + vec_tmp[2] = cross(tmp[1], tmp[2]); + + float len1 = dot(vec_tmp[0], vec_tmp[0]); + float len2 = dot(vec_tmp[1], vec_tmp[1]); + float len3 = dot(vec_tmp[2], vec_tmp[2]); + + if (len1 >= len2 && len1 >= len3) + { + evecs[0] = vec_tmp[0] * rsqrtf(len1); + } + else if (len2 >= len1 && len2 >= len3) + { + evecs[0] = vec_tmp[1] * rsqrtf(len2); + } + else + { + evecs[0] = vec_tmp[2] * rsqrtf(len3); + } + + evecs[1] = unitOrthogonal( evecs[0] ); + evecs[2] = cross(evecs[0], evecs[1]); + } + else + { + + tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2(); + tmp[0].x -= evals.z; tmp[1].y -= evals.z; tmp[2].z -= evals.z; + + vec_tmp[0] = cross(tmp[0], tmp[1]); + vec_tmp[1] = cross(tmp[0], tmp[2]); + vec_tmp[2] = cross(tmp[1], tmp[2]); + + float len1 = dot(vec_tmp[0], vec_tmp[0]); + float len2 = dot(vec_tmp[1], vec_tmp[1]); + float len3 = dot(vec_tmp[2], vec_tmp[2]); + + float mmax[3]; + + unsigned int min_el = 2; + unsigned int max_el = 2; + if (len1 >= len2 && len1 >= len3) + { + mmax[2] = len1; + evecs[2] = vec_tmp[0] * rsqrtf (len1); + } + else if (len2 >= len1 && len2 >= len3) + { + mmax[2] = len2; + evecs[2] = vec_tmp[1] * rsqrtf (len2); + } + else + { + mmax[2] = len3; + evecs[2] = vec_tmp[2] * rsqrtf (len3); + } + + tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2(); + tmp[0].x -= 
evals.y; tmp[1].y -= evals.y; tmp[2].z -= evals.y; + + vec_tmp[0] = cross(tmp[0], tmp[1]); + vec_tmp[1] = cross(tmp[0], tmp[2]); + vec_tmp[2] = cross(tmp[1], tmp[2]); + + len1 = dot(vec_tmp[0], vec_tmp[0]); + len2 = dot(vec_tmp[1], vec_tmp[1]); + len3 = dot(vec_tmp[2], vec_tmp[2]); + + if (len1 >= len2 && len1 >= len3) + { + mmax[1] = len1; + evecs[1] = vec_tmp[0] * rsqrtf (len1); + min_el = len1 <= mmax[min_el] ? 1 : min_el; + max_el = len1 > mmax[max_el] ? 1 : max_el; + } + else if (len2 >= len1 && len2 >= len3) + { + mmax[1] = len2; + evecs[1] = vec_tmp[1] * rsqrtf (len2); + min_el = len2 <= mmax[min_el] ? 1 : min_el; + max_el = len2 > mmax[max_el] ? 1 : max_el; + } + else + { + mmax[1] = len3; + evecs[1] = vec_tmp[2] * rsqrtf (len3); + min_el = len3 <= mmax[min_el] ? 1 : min_el; + max_el = len3 > mmax[max_el] ? 1 : max_el; + } + + tmp[0] = row0(); tmp[1] = row1(); tmp[2] = row2(); + tmp[0].x -= evals.x; tmp[1].y -= evals.x; tmp[2].z -= evals.x; + + vec_tmp[0] = cross(tmp[0], tmp[1]); + vec_tmp[1] = cross(tmp[0], tmp[2]); + vec_tmp[2] = cross(tmp[1], tmp[2]); + + len1 = dot (vec_tmp[0], vec_tmp[0]); + len2 = dot (vec_tmp[1], vec_tmp[1]); + len3 = dot (vec_tmp[2], vec_tmp[2]); + + + if (len1 >= len2 && len1 >= len3) + { + mmax[0] = len1; + evecs[0] = vec_tmp[0] * rsqrtf (len1); + min_el = len3 <= mmax[min_el] ? 0 : min_el; + max_el = len3 > mmax[max_el] ? 0 : max_el; + } + else if (len2 >= len1 && len2 >= len3) + { + mmax[0] = len2; + evecs[0] = vec_tmp[1] * rsqrtf (len2); + min_el = len3 <= mmax[min_el] ? 0 : min_el; + max_el = len3 > mmax[max_el] ? 0 : max_el; + } + else + { + mmax[0] = len3; + evecs[0] = vec_tmp[2] * rsqrtf (len3); + min_el = len3 <= mmax[min_el] ? 0 : min_el; + max_el = len3 > mmax[max_el] ? 0 : max_el; + } + + unsigned mid_el = 3 - min_el - max_el; + evecs[min_el] = normalized( cross( evecs[(min_el+1) % 3], evecs[(min_el+2) % 3] ) ); + evecs[mid_el] = normalized( cross( evecs[(mid_el+1) % 3], evecs[(mid_el+2) % 3] ) ); + } + // Rescale back to the original size. + evals *= scale; + } + private: + volatile float* mat_pkg; + + __kf_device__ float m00() const { return mat_pkg[0]; } + __kf_device__ float m01() const { return mat_pkg[1]; } + __kf_device__ float m02() const { return mat_pkg[2]; } + __kf_device__ float m10() const { return mat_pkg[1]; } + __kf_device__ float m11() const { return mat_pkg[3]; } + __kf_device__ float m12() const { return mat_pkg[4]; } + __kf_device__ float m20() const { return mat_pkg[2]; } + __kf_device__ float m21() const { return mat_pkg[4]; } + __kf_device__ float m22() const { return mat_pkg[5]; } + + __kf_device__ float3 row0() const { return make_float3( m00(), m01(), m02() ); } + __kf_device__ float3 row1() const { return make_float3( m10(), m11(), m12() ); } + __kf_device__ float3 row2() const { return make_float3( m20(), m21(), m22() ); } + + __kf_device__ static bool isMuchSmallerThan (float x, float y) + { + // copied from /include/Eigen/src/Core/NumTraits.h + const float prec_sqr = numeric_limits::epsilon() * numeric_limits::epsilon(); + return x * x <= prec_sqr * y * y; + } + }; + + struct Warp + { + enum + { + LOG_WARP_SIZE = 5, + WARP_SIZE = 1 << LOG_WARP_SIZE, + STRIDE = WARP_SIZE + }; + + /** \brief Returns the warp lane ID of the calling thread. 
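+              *  Implemented with an inline PTX read of the %laneid special register, so the result lies in [0, 31].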
*/ + static __kf_device__ unsigned int laneId() + { + unsigned int ret; + asm("mov.u32 %0, %laneid;" : "=r"(ret) ); + return ret; + } + + static __kf_device__ unsigned int id() + { + int tid = threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; + return tid >> LOG_WARP_SIZE; + } + + static __kf_device__ int laneMaskLt() + { +#if (__CUDA_ARCH__ >= 200) + unsigned int ret; + asm("mov.u32 %0, %lanemask_lt;" : "=r"(ret) ); + return ret; +#else + return 0xFFFFFFFF >> (32 - laneId()); +#endif + } + static __kf_device__ int binaryExclScan(int ballot_mask) + { + return __popc(Warp::laneMaskLt() & ballot_mask); + } + }; + + struct Block + { + static __kf_device__ unsigned int stride() + { + return blockDim.x * blockDim.y * blockDim.z; + } + + static __kf_device__ int flattenedThreadId() + { + return threadIdx.z * blockDim.x * blockDim.y + threadIdx.y * blockDim.x + threadIdx.x; + } + + template + static __kf_device__ void reduce(volatile T* buffer, BinOp op) + { + int tid = flattenedThreadId(); + T val = buffer[tid]; + + if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); } + if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); } + if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); } + if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); } + + if (tid < 32) + { + if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); } + if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); } + if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); } + if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); } + if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); } + if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); } + } + } + + template + static __kf_device__ T reduce(volatile T* buffer, T init, BinOp op) + { + int tid = flattenedThreadId(); + T val = buffer[tid] = init; + __syncthreads(); + + if (CTA_SIZE >= 1024) { if (tid < 512) buffer[tid] = val = op(val, buffer[tid + 512]); __syncthreads(); } + if (CTA_SIZE >= 512) { if (tid < 256) buffer[tid] = val = op(val, buffer[tid + 256]); __syncthreads(); } + if (CTA_SIZE >= 256) { if (tid < 128) buffer[tid] = val = op(val, buffer[tid + 128]); __syncthreads(); } + if (CTA_SIZE >= 128) { if (tid < 64) buffer[tid] = val = op(val, buffer[tid + 64]); __syncthreads(); } + + if (tid < 32) + { +#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 && 0 + if (CTA_SIZE >= 64) val = op(val, buffer[tid + 32]); + if (CTA_SIZE >= 32) val = op(val, __shfl_xor(val, 16)); + if (CTA_SIZE >= 16) val = op(val, __shfl_xor(val, 8)); + if (CTA_SIZE >= 8) val = op(val, __shfl_xor(val, 4)); + if (CTA_SIZE >= 4) val = op(val, __shfl_xor(val, 2)); + if (CTA_SIZE >= 2) buffer[tid] = op(val, __shfl_xor(val, 1)); +#else + if (CTA_SIZE >= 64) { buffer[tid] = val = op(val, buffer[tid + 32]); } + if (CTA_SIZE >= 32) { buffer[tid] = val = op(val, buffer[tid + 16]); } + if (CTA_SIZE >= 16) { buffer[tid] = val = op(val, buffer[tid + 8]); } + if (CTA_SIZE >= 8) { buffer[tid] = val = op(val, buffer[tid + 4]); } + if (CTA_SIZE >= 4) { buffer[tid] = val = op(val, buffer[tid + 2]); } + if (CTA_SIZE >= 2) { buffer[tid] = val = op(val, buffer[tid + 1]); } +#endif + } + __syncthreads(); + return buffer[0]; + } + }; + + + + struct Emulation + { + static __kf_device__ int warp_reduce ( volatile int *ptr , 
const unsigned int tid) + { + const unsigned int lane = tid & 31; // index of thread in warp (0..31) + + if (lane < 16) + { + int partial = ptr[tid]; + + ptr[tid] = partial = partial + ptr[tid + 16]; + ptr[tid] = partial = partial + ptr[tid + 8]; + ptr[tid] = partial = partial + ptr[tid + 4]; + ptr[tid] = partial = partial + ptr[tid + 2]; + ptr[tid] = partial = partial + ptr[tid + 1]; + } + return ptr[tid - lane]; + } + + static __kf_device__ int Ballot(int predicate, volatile int* cta_buffer) + { +#if __CUDA_ARCH__ >= 200 + (void)cta_buffer; + return __ballot(predicate); +#else + int tid = Block::flattenedThreadId(); + cta_buffer[tid] = predicate ? (1 << (tid & 31)) : 0; + return warp_reduce(cta_buffer, tid); +#endif + } + + static __kf_device__ bool All(int predicate, volatile int* cta_buffer) + { +#if __CUDA_ARCH__ >= 200 + (void)cta_buffer; + return __all(predicate); +#else + int tid = Block::flattenedThreadId(); + cta_buffer[tid] = predicate ? 1 : 0; + return warp_reduce(cta_buffer, tid) == 32; +#endif + } + }; + } +} diff --git a/modules/dynamicfusion/kfusion/src/cuda/texture_binder.hpp b/modules/dynamicfusion/kfusion/src/cuda/texture_binder.hpp new file mode 100644 index 00000000000..10258c12026 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/cuda/texture_binder.hpp @@ -0,0 +1,60 @@ +#pragma once + +#include +#include + +namespace kfusion +{ + namespace cuda + { + class TextureBinder + { + public: + template + TextureBinder(const DeviceArray2D& arr, const struct texture& tex) : texref(&tex) + { + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, tex, arr.ptr(), desc, arr.cols(), arr.rows(), arr.step()) ); + } + + template + TextureBinder(const DeviceArray& arr, const struct texture &tex) : texref(&tex) + { + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture(0, tex, arr.ptr(), desc, arr.sizeBytes()) ); + } + + template + TextureBinder(const PtrStepSz& arr, const struct texture& tex) : texref(&tex) + { + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) ); + } + + template + TextureBinder(const A& arr, const struct texture& tex, const cudaChannelFormatDesc& desc) : texref(&tex) + { + cudaSafeCall( cudaBindTexture2D(0, tex, arr.data, desc, arr.cols, arr.rows, arr.step) ); + } + + template + TextureBinder(const PtrSz& arr, const struct texture &tex) : texref(&tex) + { + cudaChannelFormatDesc desc = cudaCreateChannelDesc(); + cudaSafeCall( cudaBindTexture(0, tex, arr.data, desc, arr.size * arr.elemSize()) ); + } + + ~TextureBinder() + { + cudaSafeCall( cudaUnbindTexture(texref) ); + } + private: + const struct textureReference *texref; + }; + } + + namespace device + { + using kfusion::cuda::TextureBinder; + } +} diff --git a/modules/dynamicfusion/kfusion/src/cuda/tsdf_volume.cu b/modules/dynamicfusion/kfusion/src/cuda/tsdf_volume.cu new file mode 100644 index 00000000000..bf41284a78d --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/cuda/tsdf_volume.cu @@ -0,0 +1,831 @@ +#include +#include "device.hpp" +#include "texture_binder.hpp" +#include "../internal.hpp" +#include "math_constants.h" +using namespace kfusion::device; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Volume initialization + +namespace kfusion +{ + namespace device + { + __global__ void clear_volume_kernel(TsdfVolume tsdf) + { + int x = threadIdx.x + blockIdx.x * 
blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + + if (x < tsdf.dims.x && y < tsdf.dims.y) + { + ushort2 *beg = tsdf.beg(x, y); + ushort2 *end = beg + tsdf.dims.x * tsdf.dims.y * tsdf.dims.z; + + for(ushort2* pos = beg; pos != end; pos = tsdf.zstep(pos)) + *pos = pack_tsdf (0.f, 0); + } + } + } +} + +void kfusion::device::clear_volume(TsdfVolume volume) +{ + dim3 block (32, 8); + dim3 grid (1, 1, 1); + grid.x = divUp (volume.dims.x, block.x); + grid.y = divUp (volume.dims.y, block.y); + + clear_volume_kernel<<>>(volume); + cudaSafeCall ( cudaGetLastError () ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Volume integration +float3 test; +namespace kfusion +{ + namespace device + { + texture dists_tex(0, cudaFilterModePoint, cudaAddressModeBorder, cudaCreateChannelDescHalf()); + struct TsdfIntegrator + { + Aff3f vol2cam; + Projector proj; + int2 dists_size; + + float tranc_dist_inv; + + __kf_device__ + void operator()(TsdfVolume& volume) const + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= volume.dims.x || y >= volume.dims.y) + return; + + //float3 zstep = vol2cam.R * make_float3(0.f, 0.f, volume.voxel_size.z); + float3 zstep = make_float3(vol2cam.R.data[0].z, vol2cam.R.data[1].z, vol2cam.R.data[2].z) * volume.voxel_size.z; + + float3 vx = make_float3(x * volume.voxel_size.x, y * volume.voxel_size.y, 0); + float3 vc = vol2cam * vx; //tranform from volume coo frame to camera one + + TsdfVolume::elem_type* vptr = volume.beg(x, y); + for(int i = 0; i < volume.dims.z; ++i, vc += zstep, vptr = volume.zstep(vptr)) + { + float2 coo = proj(vc); + + //#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 300 + // this is actually workaround for kepler. 
it doesn't return 0.f for texture + // fetches for out-of-border coordinates even for cudaaddressmodeborder mode + if (coo.x < 0 || coo.y < 0 || coo.x >= dists_size.x || coo.y >= dists_size.y) + continue; + //#endif + float Dp = tex2D(dists_tex, coo.x, coo.y); + if(Dp == 0 || vc.z <= 0) + continue; + + float sdf = Dp - __fsqrt_rn(dot(vc, vc)); //Dp - norm(v) + + if (sdf >= -volume.trunc_dist) + { + float tsdf = fmin(1.f, sdf * tranc_dist_inv); + + //read and unpack + int weight_prev; + float tsdf_prev = unpack_tsdf (gmem::LdCs(vptr), weight_prev); + + float tsdf_new = __fdividef(__fmaf_rn(tsdf_prev, weight_prev, tsdf), weight_prev + 1); + int weight_new = min (weight_prev + 1, volume.max_weight); + + //pack and write + gmem::StCs(pack_tsdf (tsdf_new, weight_new), vptr); + } + } // for(;;) + } + + + }; + + + __global__ void integrate_kernel( const TsdfIntegrator integrator, TsdfVolume volume) { integrator(volume); }; + + __global__ + void project_kernel(const Projector proj, PtrStep points, PtrStepSz depth, int rows, int cols) + { + + int x = threadIdx.x + blockIdx.x * blockDim.x; + int y = threadIdx.y + blockIdx.y * blockDim.y; + float qnan = numeric_limits::quiet_NaN (); + if (x < cols || y < rows) { + Point pt = points(y, x); + if(isnan(pt.x) || isnan(pt.y) || isnan(pt.z)) + return; + float3 point = make_float3(pt.x, pt.y, pt.z); + float2 coo = proj(point); + if (coo.x < 0 || coo.y < 0 || coo.y >= rows || coo.x >= cols) + { + points(y, x) = make_float4(qnan, qnan, qnan, 0.f); + return; + } + + float Dp = tex2D(dists_tex, coo.x, coo.y); + depth(coo.y, coo.x) = 0; + points(y, x) = make_float4(coo.x * Dp, coo.y * Dp, Dp, 0.f); + } + } + } +} + +void kfusion::device::integrate(const PtrStepSz& dists, TsdfVolume& volume, const Aff3f& aff, const Projector& proj) +{ + TsdfIntegrator ti; + ti.dists_size = make_int2(dists.cols, dists.rows); + ti.vol2cam = aff; + ti.proj = proj; + ti.tranc_dist_inv = 1.f/volume.trunc_dist; + + dists_tex.filterMode = cudaFilterModePoint; + dists_tex.addressMode[0] = cudaAddressModeBorder; + dists_tex.addressMode[1] = cudaAddressModeBorder; + dists_tex.addressMode[2] = cudaAddressModeBorder; + TextureBinder binder(dists, dists_tex, cudaCreateChannelDescHalf()); (void)binder; + + dim3 block(32, 8); + dim3 grid(divUp(volume.dims.x, block.x), divUp(volume.dims.y, block.y)); + + integrate_kernel<<>>(ti, volume); + cudaSafeCall ( cudaGetLastError () ); + cudaSafeCall ( cudaDeviceSynchronize() ); +} + +//TODO: rename as now projecting + removing from depth +void kfusion::device::project_and_remove(const PtrStepSz& dists, Points &vertices, const Projector &proj) +{ + dists_tex.filterMode = cudaFilterModePoint; + dists_tex.addressMode[0] = cudaAddressModeBorder; + dists_tex.addressMode[1] = cudaAddressModeBorder; + dists_tex.addressMode[2] = cudaAddressModeBorder; + TextureBinder binder(dists, dists_tex, cudaCreateChannelDescHalf()); (void)binder; + + dim3 block(32, 8); + dim3 grid(divUp(vertices.cols(), block.x), divUp(vertices.rows(), block.y)); + + project_kernel <<>>(proj, vertices, dists, dists.rows, dists.cols); + cudaSafeCall ( cudaGetLastError () ); +} + +//TODO: rename as now projecting + removing from depth +void kfusion::device::project(const PtrStepSz &dists, Points &vertices, const Projector &proj) +{ + dists_tex.filterMode = cudaFilterModePoint; + dists_tex.addressMode[0] = cudaAddressModeBorder; + dists_tex.addressMode[1] = cudaAddressModeBorder; + dists_tex.addressMode[2] = cudaAddressModeBorder; + TextureBinder binder(dists, dists_tex, 
cudaCreateChannelDescHalf()); (void)binder; + + dim3 block(32, 8); + dim3 grid(divUp(vertices.cols(), block.x), divUp(vertices.rows(), block.y)); + + project_kernel <<>>(proj, vertices, dists, dists.rows, dists.cols); + cudaSafeCall ( cudaGetLastError () ); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Volume ray casting + +namespace kfusion +{ + namespace device + { + __kf_device__ void intersect(float3 ray_org, float3 ray_dir, /*float3 box_min,*/ float3 box_max, float &tnear, float &tfar) + { + const float3 box_min = make_float3(0.f, 0.f, 0.f); + + // compute intersection of ray with all six bbox planes + float3 invR = make_float3(1.f/ray_dir.x, 1.f/ray_dir.y, 1.f/ray_dir.z); + float3 tbot = invR * (box_min - ray_org); + float3 ttop = invR * (box_max - ray_org); + + // re-order intersections to find smallest and largest on each axis + float3 tmin = make_float3(fminf(ttop.x, tbot.x), fminf(ttop.y, tbot.y), fminf(ttop.z, tbot.z)); + float3 tmax = make_float3(fmaxf(ttop.x, tbot.x), fmaxf(ttop.y, tbot.y), fmaxf(ttop.z, tbot.z)); + + // find the largest tmin and the smallest tmax + tnear = fmaxf(fmaxf(tmin.x, tmin.y), fmaxf(tmin.x, tmin.z)); + tfar = fminf(fminf(tmax.x, tmax.y), fminf(tmax.x, tmax.z)); + } + + template + __kf_device__ float interpolate(const Vol& volume, const float3& p_voxels) + { + float3 cf = p_voxels; + + //rounding to negative infinity + int3 g = make_int3(__float2int_rd (cf.x), __float2int_rd (cf.y), __float2int_rd (cf.z)); + + if (g.x < 0 || g.x >= volume.dims.x - 1 || g.y < 0 || g.y >= volume.dims.y - 1 || g.z < 0 || g.z >= volume.dims.z - 1) + return numeric_limits::quiet_NaN(); + + float a = cf.x - g.x; + float b = cf.y - g.y; + float c = cf.z - g.z; + + float tsdf = 0.f; + tsdf += unpack_tsdf(*volume(g.x + 0, g.y + 0, g.z + 0)) * (1 - a) * (1 - b) * (1 - c); + tsdf += unpack_tsdf(*volume(g.x + 0, g.y + 0, g.z + 1)) * (1 - a) * (1 - b) * c; + tsdf += unpack_tsdf(*volume(g.x + 0, g.y + 1, g.z + 0)) * (1 - a) * b * (1 - c); + tsdf += unpack_tsdf(*volume(g.x + 0, g.y + 1, g.z + 1)) * (1 - a) * b * c; + tsdf += unpack_tsdf(*volume(g.x + 1, g.y + 0, g.z + 0)) * a * (1 - b) * (1 - c); + tsdf += unpack_tsdf(*volume(g.x + 1, g.y + 0, g.z + 1)) * a * (1 - b) * c; + tsdf += unpack_tsdf(*volume(g.x + 1, g.y + 1, g.z + 0)) * a * b * (1 - c); + tsdf += unpack_tsdf(*volume(g.x + 1, g.y + 1, g.z + 1)) * a * b * c; + return tsdf; + } + + struct TsdfRaycaster + { + TsdfVolume volume; + + Aff3f aff; + Mat3f Rinv; + + Vec3f volume_size; + Reprojector reproj; + float time_step; + float3 gradient_delta; + float3 voxel_size_inv; + + TsdfRaycaster(const TsdfVolume& volume, const Aff3f& aff, const Mat3f& Rinv, const Reprojector& _reproj); + + __kf_device__ + float fetch_tsdf(const float3& p) const + { + //rounding to nearest even + int x = __float2int_rn (p.x * voxel_size_inv.x); + int y = __float2int_rn (p.y * voxel_size_inv.y); + int z = __float2int_rn (p.z * voxel_size_inv.z); + return unpack_tsdf(*volume(x, y, z)); + } + + __kf_device__ + void operator()(PtrStepSz depth, PtrStep normals) const + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= depth.cols || y >= depth.rows) + return; + + const float qnan = numeric_limits::quiet_NaN(); + + depth(y, x) = 0; + normals(y, x) = make_float4(qnan, qnan, qnan, qnan); + + float3 ray_org = aff.t; + float3 ray_dir = normalized( aff.R * reproj(x, y, 1.f) ); + + // We do subtract voxel size to minimize checks 
after + // Note: origin of volume coordinate is placed + // in the center of voxel (0,0,0), not in the corner of the voxel! + float3 box_max = volume_size - volume.voxel_size; + + float tmin, tmax; + intersect(ray_org, ray_dir, box_max, tmin, tmax); + + const float min_dist = 0.f; + tmin = fmax(min_dist, tmin); + if (tmin >= tmax) + return; + + tmax -= time_step; + float3 vstep = ray_dir * time_step; + float3 next = ray_org + ray_dir * tmin; + + float tsdf_next = fetch_tsdf(next); + for (float tcurr = tmin; tcurr < tmax; tcurr += time_step) + { + float tsdf_curr = tsdf_next; + float3 curr = next; + next += vstep; + + tsdf_next = fetch_tsdf(next); + if (tsdf_curr < 0.f && tsdf_next > 0.f) + break; + + if (tsdf_curr > 0.f && tsdf_next < 0.f) + { + float Ft = interpolate(volume, curr * voxel_size_inv); + float Ftdt = interpolate(volume, next * voxel_size_inv); + + float Ts = tcurr - __fdividef(time_step * Ft, Ftdt - Ft); + + float3 vertex = ray_org + ray_dir * Ts; + float3 normal = compute_normal(vertex); + + if (!isnan(normal.x * normal.y * normal.z)) + { + normal = Rinv * normal; + vertex = Rinv * (vertex - aff.t); + + normals(y, x) = make_float4(normal.x, normal.y, normal.z, 0); + depth(y, x) = static_cast(vertex.z * 1000); + } + break; + } + } /* for (;;) */ + } + + __kf_device__ + void operator()(PtrStepSz points, PtrStep normals) const + { + int x = blockIdx.x * blockDim.x + threadIdx.x; + int y = blockIdx.y * blockDim.y + threadIdx.y; + + if (x >= points.cols || y >= points.rows) + return; + + const float qnan = numeric_limits::quiet_NaN(); + + points(y, x) = normals(y, x) = make_float4(qnan, qnan, qnan, qnan); + + float3 ray_org = aff.t; + float3 ray_dir = normalized( aff.R * reproj(x, y, 1.f) ); + + // We do subtract voxel size to minimize checks after + // Note: origin of volume coordinate is placeed + // in the center of voxel (0,0,0), not in the corener of the voxel! 
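+                // Ray marching: clamp the ray to the volume's axis-aligned bounds, then step by time_step
+                // until the interpolated TSDF changes sign from positive to negative; the surface point is
+                // found by linear interpolation between the two samples and its normal from the TSDF gradient.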
+ float3 box_max = volume_size - volume.voxel_size; + + float tmin, tmax; + intersect(ray_org, ray_dir, box_max, tmin, tmax); + + const float min_dist = 0.f; + tmin = fmax(min_dist, tmin); + if (tmin >= tmax) + return; + + tmax -= time_step; + float3 vstep = ray_dir * time_step; + float3 next = ray_org + ray_dir * tmin; + + float tsdf_next = fetch_tsdf(next); + for (float tcurr = tmin; tcurr < tmax; tcurr += time_step) + { + float tsdf_curr = tsdf_next; + float3 curr = next; + next += vstep; + + tsdf_next = fetch_tsdf(next); + if (tsdf_curr < 0.f && tsdf_next > 0.f) + break; + + if (tsdf_curr > 0.f && tsdf_next < 0.f) + { + float Ft = interpolate(volume, curr * voxel_size_inv); + float Ftdt = interpolate(volume, next * voxel_size_inv); + + float Ts = tcurr - __fdividef(time_step * Ft, Ftdt - Ft); + + float3 vertex = ray_org + ray_dir * Ts; + float3 normal = compute_normal(vertex); + + if (!isnan(normal.x * normal.y * normal.z)) + { + normal = Rinv * normal; + vertex = Rinv * (vertex - aff.t); + + normals(y, x) = make_float4(normal.x, normal.y, normal.z, 0.f); + points(y, x) = make_float4(vertex.x, vertex.y, vertex.z, 0.f); + } + break; + } + } /* for (;;) */ + } + + + __kf_device__ + float3 compute_normal(const float3& p) const + { + float3 n; + + float Fx1 = interpolate(volume, make_float3(p.x + gradient_delta.x, p.y, p.z) * voxel_size_inv); + float Fx2 = interpolate(volume, make_float3(p.x - gradient_delta.x, p.y, p.z) * voxel_size_inv); + n.x = __fdividef(Fx1 - Fx2, gradient_delta.x); + + float Fy1 = interpolate(volume, make_float3(p.x, p.y + gradient_delta.y, p.z) * voxel_size_inv); + float Fy2 = interpolate(volume, make_float3(p.x, p.y - gradient_delta.y, p.z) * voxel_size_inv); + n.y = __fdividef(Fy1 - Fy2, gradient_delta.y); + + float Fz1 = interpolate(volume, make_float3(p.x, p.y, p.z + gradient_delta.z) * voxel_size_inv); + float Fz2 = interpolate(volume, make_float3(p.x, p.y, p.z - gradient_delta.z) * voxel_size_inv); + n.z = __fdividef(Fz1 - Fz2, gradient_delta.z); + + return normalized (n); + } + }; + + inline TsdfRaycaster::TsdfRaycaster(const TsdfVolume& _volume, const Aff3f& _aff, const Mat3f& _Rinv, const Reprojector& _reproj) + : volume(_volume), aff(_aff), Rinv(_Rinv), reproj(_reproj) {} + + __global__ void raycast_kernel(const TsdfRaycaster raycaster, PtrStepSz depth, PtrStep normals) + { raycaster(depth, normals); }; + + __global__ void raycast_kernel(const TsdfRaycaster raycaster, PtrStepSz points, PtrStep normals) + { raycaster(points, normals); }; + + } +} + +void kfusion::device::raycast(const TsdfVolume& volume, const Aff3f& aff, const Mat3f& Rinv, const Reprojector& reproj, + Depth& depth, Normals& normals, float raycaster_step_factor, float gradient_delta_factor) +{ + TsdfRaycaster rc(volume, aff, Rinv, reproj); + + rc.volume_size = volume.voxel_size * volume.dims; + rc.time_step = volume.trunc_dist * raycaster_step_factor; + rc.gradient_delta = volume.voxel_size * gradient_delta_factor; + rc.voxel_size_inv = 1.f/volume.voxel_size; + + dim3 block(32, 8); + dim3 grid (divUp (depth.cols(), block.x), divUp (depth.rows(), block.y)); + + raycast_kernel<<>>(rc, (PtrStepSz)depth, normals); + cudaSafeCall (cudaGetLastError ()); +} + + +void kfusion::device::raycast(const TsdfVolume& volume, const Aff3f& aff, const Mat3f& Rinv, const Reprojector& reproj, + Points& points, Normals& normals, float raycaster_step_factor, float gradient_delta_factor) +{ + TsdfRaycaster rc(volume, aff, Rinv, reproj); + + rc.volume_size = volume.voxel_size * volume.dims; + rc.time_step = 
volume.trunc_dist * raycaster_step_factor; + rc.gradient_delta = volume.voxel_size * gradient_delta_factor; + rc.voxel_size_inv = 1.f/volume.voxel_size; + + dim3 block(32, 8); + dim3 grid (divUp (points.cols(), block.x), divUp (points.rows(), block.y)); + + raycast_kernel<<>>(rc, (PtrStepSz)points, normals); + cudaSafeCall (cudaGetLastError ()); +} + +//////////////////////////////////////////////////////////////////////////////////////// +/// Volume cloud exctraction + +namespace kfusion +{ + namespace device + { + //////////////////////////////////////////////////////////////////////////////////////// + ///// Prefix Scan utility + + enum ScanKind { exclusive, inclusive }; + + template + __kf_device__ T scan_warp ( volatile T *ptr, const unsigned int idx = threadIdx.x ) + { + const unsigned int lane = idx & 31; // index of thread in warp (0..31) + + if (lane >= 1) ptr[idx] = ptr[idx - 1] + ptr[idx]; + if (lane >= 2) ptr[idx] = ptr[idx - 2] + ptr[idx]; + if (lane >= 4) ptr[idx] = ptr[idx - 4] + ptr[idx]; + if (lane >= 8) ptr[idx] = ptr[idx - 8] + ptr[idx]; + if (lane >= 16) ptr[idx] = ptr[idx - 16] + ptr[idx]; + + if (Kind == inclusive) + return ptr[idx]; + else + return (lane > 0) ? ptr[idx - 1] : 0; + } + + + __device__ int global_count = 0; + __device__ int output_count; + __device__ unsigned int blocks_done = 0; + + + struct FullScan6 + { + enum + { + CTA_SIZE_X = 32, + CTA_SIZE_Y = 6, + CTA_SIZE = CTA_SIZE_X * CTA_SIZE_Y, + + MAX_LOCAL_POINTS = 3 + }; + + TsdfVolume volume; + Aff3f aff; + + FullScan6(const TsdfVolume& vol) : volume(vol) {} + + __kf_device__ float fetch(int x, int y, int z, int& weight) const + { + return unpack_tsdf(*volume(x, y, z), weight); + } + + __kf_device__ void operator () (PtrSz output) const + { + int x = threadIdx.x + blockIdx.x * CTA_SIZE_X; + int y = threadIdx.y + blockIdx.y * CTA_SIZE_Y; +#if __CUDA_ARCH__ < 200 + __shared__ int cta_buffer[CTA_SIZE]; +#endif + +#if __CUDA_ARCH__ >= 120 + if (__all (x >= volume.dims.x) || __all (y >= volume.dims.y)) + return; +#else + if (Emulation::All(x >= volume.dims.x, cta_buffer) || Emulation::All(y >= volume.dims.y, cta_buffer)) + return; +#endif + + float3 V; + V.x = (x + 0.5f) * volume.voxel_size.x; + V.y = (y + 0.5f) * volume.voxel_size.y; + + int ftid = Block::flattenedThreadId (); + + for (int z = 0; z < volume.dims.z - 1; ++z) + { + float3 points[MAX_LOCAL_POINTS]; + int local_count = 0; + + if (x < volume.dims.x && y < volume.dims.y) + { + int W; + float F = fetch(x, y, z, W); + + if (W != 0 && F != 1.f) + { + V.z = (z + 0.5f) * volume.voxel_size.z; + + //process dx + if (x + 1 < volume.dims.x) + { + int Wn; + float Fn = fetch(x + 1, y, z, Wn); + + if (Wn != 0 && Fn != 1.f) + if ((F > 0 && Fn < 0) || (F < 0 && Fn > 0)) + { + float3 p; + p.y = V.y; + p.z = V.z; + + float Vnx = V.x + volume.voxel_size.x; + + float d_inv = 1.f / (fabs (F) + fabs (Fn)); + p.x = (V.x * fabs (Fn) + Vnx * fabs (F)) * d_inv; + + points[local_count++] = aff * p; + } + } /* if (x + 1 < volume.dims.x) */ + + //process dy + if (y + 1 < volume.dims.y) + { + int Wn; + float Fn = fetch (x, y + 1, z, Wn); + + if (Wn != 0 && Fn != 1.f) + if ((F > 0 && Fn < 0) || (F < 0 && Fn > 0)) + { + float3 p; + p.x = V.x; + p.z = V.z; + + float Vny = V.y + volume.voxel_size.y; + + float d_inv = 1.f / (fabs (F) + fabs (Fn)); + p.y = (V.y * fabs (Fn) + Vny * fabs (F)) * d_inv; + + points[local_count++] = aff * p; + } + } /* if (y + 1 < volume.dims.y) */ + + //process dz + //if (z + 1 < volume.dims.z) // guaranteed by loop + { + int Wn; + float Fn = fetch 
(x, y, z + 1, Wn); + + if (Wn != 0 && Fn != 1.f) + if ((F > 0 && Fn < 0) || (F < 0 && Fn > 0)) + { + float3 p; + p.x = V.x; + p.y = V.y; + + float Vnz = V.z + volume.voxel_size.z; + + float d_inv = 1.f / (fabs (F) + fabs (Fn)); + p.z = (V.z * fabs (Fn) + Vnz * fabs (F)) * d_inv; + + points[local_count++] = aff * p; + } + } /* if (z + 1 < volume.dims.z) */ + } /* if (W != 0 && F != 1.f) */ + } /* if (x < volume.dims.x && y < volume.dims.y) */ + +#if __CUDA_ARCH__ >= 200 + ///not we fulfilled points array at current iteration + int total_warp = __popc (__ballot (local_count > 0)) + __popc (__ballot (local_count > 1)) + __popc (__ballot (local_count > 2)); +#else + int tid = Block::flattenedThreadId(); + cta_buffer[tid] = local_count; + int total_warp = Emulation::warp_reduce(cta_buffer, tid); +#endif + __shared__ float storage_X[CTA_SIZE * MAX_LOCAL_POINTS]; + __shared__ float storage_Y[CTA_SIZE * MAX_LOCAL_POINTS]; + __shared__ float storage_Z[CTA_SIZE * MAX_LOCAL_POINTS]; + + if (total_warp > 0) + { + int lane = Warp::laneId (); + int storage_index = (ftid >> Warp::LOG_WARP_SIZE) * Warp::WARP_SIZE * MAX_LOCAL_POINTS; + + volatile int* cta_buffer = (int*)(storage_X + storage_index); + + cta_buffer[lane] = local_count; + int offset = scan_warp(cta_buffer, lane); + + if (lane == 0) + { + int old_global_count = atomicAdd (&global_count, total_warp); + cta_buffer[0] = old_global_count; + } + int old_global_count = cta_buffer[0]; + + for (int l = 0; l < local_count; ++l) + { + storage_X[storage_index + offset + l] = points[l].x; + storage_Y[storage_index + offset + l] = points[l].y; + storage_Z[storage_index + offset + l] = points[l].z; + } + + Point *pos = output.data + old_global_count + lane; + for (int idx = lane; idx < total_warp; idx += Warp::STRIDE, pos += Warp::STRIDE) + { + float x = storage_X[storage_index + idx]; + float y = storage_Y[storage_index + idx]; + float z = storage_Z[storage_index + idx]; + *pos = make_float4(x, y, z, 0.f); + } + + bool full = (old_global_count + total_warp) >= output.size; + + if (full) + break; + } + + } /* for(int z = 0; z < volume.dims.z - 1; ++z) */ + + + /////////////////////////// + // prepare for future scans + if (ftid == 0) + { + unsigned int total_blocks = gridDim.x * gridDim.y * gridDim.z; + unsigned int value = atomicInc (&blocks_done, total_blocks); + + //last block + if (value == total_blocks - 1) + { + output_count = min ((int)output.size, global_count); + blocks_done = 0; + global_count = 0; + } + } + } + }; + + + + __global__ void extract_kernel(const FullScan6 fs, PtrSz output) { fs(output); } + + + + struct ExtractNormals + { + typedef float8 float8; + + TsdfVolume volume; + PtrSz points; + float3 voxel_size_inv; + float3 gradient_delta; + Aff3f aff; + Mat3f Rinv; + + ExtractNormals(const TsdfVolume& vol) : volume(vol) + { + voxel_size_inv.x = 1.f/volume.voxel_size.x; + voxel_size_inv.y = 1.f/volume.voxel_size.y; + voxel_size_inv.z = 1.f/volume.voxel_size.z; + } + + __kf_device__ int3 getVoxel (const float3& p) const + { + //rounding to nearest even + int x = __float2int_rn (p.x * voxel_size_inv.x); + int y = __float2int_rn (p.y * voxel_size_inv.y); + int z = __float2int_rn (p.z * voxel_size_inv.z); + return make_int3 (x, y, z); + } + + __kf_device__ void operator () (float4* output) const + { + int idx = threadIdx.x + blockIdx.x * blockDim.x; + + if (idx >= points.size) + return; + + const float qnan = numeric_limits::quiet_NaN (); + float3 n = make_float3 (qnan, qnan, qnan); + + float3 point = Rinv * (tr(points.data[idx]) - aff.t); + 
int3 g = getVoxel (point); + + if (g.x > 1 && g.y > 1 && g.z > 1 && g.x < volume.dims.x - 2 && g.y < volume.dims.y - 2 && g.z < volume.dims.z - 2) + { + float3 t; + + t = point; + t.x += gradient_delta.x;; + float Fx1 = interpolate(volume, t * voxel_size_inv); + + t = point; + t.x -= gradient_delta.x; + float Fx2 = interpolate(volume, t * voxel_size_inv); + + n.x = __fdividef(Fx1 - Fx2, gradient_delta.x); + + t = point; + t.y += gradient_delta.y; + float Fy1 = interpolate(volume, t * voxel_size_inv); + + t = point; + t.y -= gradient_delta.y; + float Fy2 = interpolate(volume, t * voxel_size_inv); + + n.y = __fdividef(Fy1 - Fy2, gradient_delta.y); + + t = point; + t.z += gradient_delta.z; + float Fz1 = interpolate(volume, t * voxel_size_inv); + + t = point; + t.z -= gradient_delta.z; + float Fz2 = interpolate(volume, t * voxel_size_inv); + + n.z = __fdividef(Fz1 - Fz2, gradient_delta.z); + + n = normalized (aff.R * n); + } + + output[idx] = make_float4(n.x, n.y, n.z, 0); + } + }; + + __global__ void extract_normals_kernel (const ExtractNormals en, float4* output) { en(output); } + } +} + +size_t kfusion::device::extractCloud (const TsdfVolume& volume, const Aff3f& aff, PtrSz output) +{ + typedef FullScan6 FS; + FS fs(volume); + fs.aff = aff; + + dim3 block (FS::CTA_SIZE_X, FS::CTA_SIZE_Y); + dim3 grid (divUp (volume.dims.x, block.x), divUp (volume.dims.y, block.y)); + + extract_kernel<<>>(fs, output); + cudaSafeCall ( cudaGetLastError () ); + cudaSafeCall (cudaDeviceSynchronize ()); + + int size; + cudaSafeCall ( cudaMemcpyFromSymbol (&size, output_count, sizeof(size)) ); + return (size_t)size; +} + +void kfusion::device::extractNormals (const TsdfVolume& volume, const PtrSz& points, const Aff3f& aff, const Mat3f& Rinv, float gradient_delta_factor, float4* output) +{ + ExtractNormals en(volume); + en.points = points; + en.gradient_delta = volume.voxel_size * gradient_delta_factor; + en.aff = aff; + en.Rinv = Rinv; + + dim3 block (256); + dim3 grid (divUp ((int)points.size, block.x)); + + extract_normals_kernel<<>>(en, output); + cudaSafeCall ( cudaGetLastError () ); + cudaSafeCall (cudaDeviceSynchronize ()); +} diff --git a/modules/dynamicfusion/kfusion/src/device_memory.cpp b/modules/dynamicfusion/kfusion/src/device_memory.cpp new file mode 100644 index 00000000000..6a7f4565f2f --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/device_memory.cpp @@ -0,0 +1,252 @@ +#include +#include "safe_call.hpp" +#include +#include +#include + +void kfusion::cuda::error(const char *error_string, const char *file, const int line, const char *func) +{ + std::cout << "KinFu2 error: " << error_string << "\t" << file << ":" << line << std::endl; + exit(0); +} + +////////////////////////// XADD /////////////////////////////// + +#ifdef __GNUC__ + + #if __GNUC__*10 + __GNUC_MINOR__ >= 42 + + #if !defined WIN32 && (defined __i486__ || defined __i586__ || defined __i686__ || defined __MMX__ || defined __SSE__ || defined __ppc__) + #define CV_XADD __sync_fetch_and_add + #else + #include + #define CV_XADD __gnu_cxx::__exchange_and_add + #endif + #else + #include + #if __GNUC__*10 + __GNUC_MINOR__ >= 34 + #define CV_XADD __gnu_cxx::__exchange_and_add + #else + #define CV_XADD __exchange_and_add + #endif + #endif + +#elif defined WIN32 || defined _WIN32 + #include + #define CV_XADD(addr,delta) _InterlockedExchangeAdd((long volatile*)(addr), (delta)) +#else + + template static inline _Tp CV_XADD(_Tp* addr, _Tp delta) + { int tmp = *addr; *addr += delta; return tmp; } + +#endif + +//////////////////////// 
DeviceArray ///////////////////////////// + +kfusion::cuda::DeviceMemory::DeviceMemory() : data_(0), sizeBytes_(0), refcount_(0) {} +kfusion::cuda::DeviceMemory::DeviceMemory(void *ptr_arg, size_t sizeBytes_arg) : data_(ptr_arg), sizeBytes_(sizeBytes_arg), refcount_(0){} +kfusion::cuda::DeviceMemory::DeviceMemory(size_t sizeBtes_arg) : data_(0), sizeBytes_(0), refcount_(0) { create(sizeBtes_arg); } +kfusion::cuda::DeviceMemory::~DeviceMemory() { release(); } + +kfusion::cuda::DeviceMemory::DeviceMemory(const DeviceMemory& other_arg) + : data_(other_arg.data_), sizeBytes_(other_arg.sizeBytes_), refcount_(other_arg.refcount_) +{ + if( refcount_ ) + CV_XADD(refcount_, 1); +} + +kfusion::cuda::DeviceMemory& kfusion::cuda::DeviceMemory::operator = (const kfusion::cuda::DeviceMemory& other_arg) +{ + if( this != &other_arg ) + { + if( other_arg.refcount_ ) + CV_XADD(other_arg.refcount_, 1); + release(); + + data_ = other_arg.data_; + sizeBytes_ = other_arg.sizeBytes_; + refcount_ = other_arg.refcount_; + } + return *this; +} + +void kfusion::cuda::DeviceMemory::create(size_t sizeBytes_arg) +{ + if (sizeBytes_arg == sizeBytes_) + return; + + if( sizeBytes_arg > 0) + { + if( data_ ) + release(); + + sizeBytes_ = sizeBytes_arg; + + cudaSafeCall( cudaMalloc(&data_, sizeBytes_) ); + + //refcount_ = (int*)cv::fastMalloc(sizeof(*refcount_)); + refcount_ = new int; + *refcount_ = 1; + } +} + +void kfusion::cuda::DeviceMemory::copyTo(DeviceMemory& other) const +{ + if (empty()) + other.release(); + else + { + other.create(sizeBytes_); + cudaSafeCall( cudaMemcpy(other.data_, data_, sizeBytes_, cudaMemcpyDeviceToDevice) ); + } +} + +void kfusion::cuda::DeviceMemory::release() +{ + if( refcount_ && CV_XADD(refcount_, -1) == 1 ) + { + //cv::fastFree(refcount); + delete refcount_; + cudaSafeCall( cudaFree(data_) ); + } + data_ = 0; + sizeBytes_ = 0; + refcount_ = 0; +} + +void kfusion::cuda::DeviceMemory::upload(const void *host_ptr_arg, size_t sizeBytes_arg) +{ + create(sizeBytes_arg); + cudaSafeCall( cudaMemcpy(data_, host_ptr_arg, sizeBytes_, cudaMemcpyHostToDevice) ); +} + +void kfusion::cuda::DeviceMemory::download(void *host_ptr_arg) const +{ + cudaSafeCall( cudaMemcpy(host_ptr_arg, data_, sizeBytes_, cudaMemcpyDeviceToHost) ); +} + +void kfusion::cuda::DeviceMemory::swap(DeviceMemory& other_arg) +{ + std::swap(data_, other_arg.data_); + std::swap(sizeBytes_, other_arg.sizeBytes_); + std::swap(refcount_, other_arg.refcount_); +} + +bool kfusion::cuda::DeviceMemory::empty() const { return !data_; } +size_t kfusion::cuda::DeviceMemory::sizeBytes() const { return sizeBytes_; } + + +//////////////////////// DeviceArray2D ///////////////////////////// + +kfusion::cuda::DeviceMemory2D::DeviceMemory2D() : data_(0), step_(0), colsBytes_(0), rows_(0), refcount_(0) {} + +kfusion::cuda::DeviceMemory2D::DeviceMemory2D(int rows_arg, int colsBytes_arg) + : data_(0), step_(0), colsBytes_(0), rows_(0), refcount_(0) +{ + create(rows_arg, colsBytes_arg); +} + +kfusion::cuda::DeviceMemory2D::DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg) + : data_(data_arg), step_(step_arg), colsBytes_(colsBytes_arg), rows_(rows_arg), refcount_(0) {} + +kfusion::cuda::DeviceMemory2D::~DeviceMemory2D() { release(); } + + +kfusion::cuda::DeviceMemory2D::DeviceMemory2D(const DeviceMemory2D& other_arg) : + data_(other_arg.data_), step_(other_arg.step_), colsBytes_(other_arg.colsBytes_), rows_(other_arg.rows_), refcount_(other_arg.refcount_) +{ + if( refcount_ ) + CV_XADD(refcount_, 1); +} + 
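+// Copies of DeviceMemory2D are shallow: they share the device pointer and an intrusive reference
+// count updated atomically via CV_XADD, and the pitched allocation is freed only when the last
+// owner releases it.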
+kfusion::cuda::DeviceMemory2D& kfusion::cuda::DeviceMemory2D::operator = (const kfusion::cuda::DeviceMemory2D& other_arg) +{ + if( this != &other_arg ) + { + if( other_arg.refcount_ ) + CV_XADD(other_arg.refcount_, 1); + release(); + + colsBytes_ = other_arg.colsBytes_; + rows_ = other_arg.rows_; + data_ = other_arg.data_; + step_ = other_arg.step_; + + refcount_ = other_arg.refcount_; + } + return *this; +} + +void kfusion::cuda::DeviceMemory2D::create(int rows_arg, int colsBytes_arg) +{ + if (colsBytes_ == colsBytes_arg && rows_ == rows_arg) + return; + + if( rows_arg > 0 && colsBytes_arg > 0) + { + if( data_ ) + release(); + + colsBytes_ = colsBytes_arg; + rows_ = rows_arg; + + cudaSafeCall( cudaMallocPitch( (void**)&data_, &step_, colsBytes_, rows_) ); + + //refcount = (int*)cv::fastMalloc(sizeof(*refcount)); + refcount_ = new int; + *refcount_ = 1; + } +} + +void kfusion::cuda::DeviceMemory2D::release() +{ + if( refcount_ && CV_XADD(refcount_, -1) == 1 ) + { + //cv::fastFree(refcount); + delete refcount_; + cudaSafeCall( cudaFree(data_) ); + } + + colsBytes_ = 0; + rows_ = 0; + data_ = 0; + step_ = 0; + refcount_ = 0; +} + +void kfusion::cuda::DeviceMemory2D::copyTo(DeviceMemory2D& other) const +{ + if (empty()) + other.release(); + else + { + other.create(rows_, colsBytes_); + cudaSafeCall( cudaMemcpy2D(other.data_, other.step_, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToDevice) ); + } +} + +void kfusion::cuda::DeviceMemory2D::upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg) +{ + create(rows_arg, colsBytes_arg); + cudaSafeCall( cudaMemcpy2D(data_, step_, host_ptr_arg, host_step_arg, colsBytes_, rows_, cudaMemcpyHostToDevice) ); +} + +void kfusion::cuda::DeviceMemory2D::download(void *host_ptr_arg, size_t host_step_arg) const +{ + cudaSafeCall( cudaMemcpy2D(host_ptr_arg, host_step_arg, data_, step_, colsBytes_, rows_, cudaMemcpyDeviceToHost) ); +} + +void kfusion::cuda::DeviceMemory2D::swap(DeviceMemory2D& other_arg) +{ + std::swap(data_, other_arg.data_); + std::swap(step_, other_arg.step_); + + std::swap(colsBytes_, other_arg.colsBytes_); + std::swap(rows_, other_arg.rows_); + std::swap(refcount_, other_arg.refcount_); +} + +bool kfusion::cuda::DeviceMemory2D::empty() const { return !data_; } +int kfusion::cuda::DeviceMemory2D::colsBytes() const { return colsBytes_; } +int kfusion::cuda::DeviceMemory2D::rows() const { return rows_; } +size_t kfusion::cuda::DeviceMemory2D::step() const { return step_; } diff --git a/modules/dynamicfusion/kfusion/src/imgproc.cpp b/modules/dynamicfusion/kfusion/src/imgproc.cpp new file mode 100644 index 00000000000..7f70126600a --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/imgproc.cpp @@ -0,0 +1,202 @@ +#include "precomp.hpp" +#include +/** + * + * \param in + * \param out + * \param kernel_size + * \param sigma_spatial + * \param sigma_depth + */ +void kfusion::cuda::depthBilateralFilter(const Depth& in, Depth& out, int kernel_size, float sigma_spatial, float sigma_depth) +{ + out.create(in.rows(), in.cols()); + device::bilateralFilter(in, out, kernel_size, sigma_spatial, sigma_depth); +} + +/** + * + * \param depth + * \param threshold + */ +void kfusion::cuda::depthTruncation(Depth& depth, float threshold) +{ + device::truncateDepth(depth, threshold); +} + +/** + * + * \param depth + * \param pyramid + * \param sigma_depth + */ +void kfusion::cuda::depthBuildPyramid(const Depth& depth, Depth& pyramid, float sigma_depth) +{ + pyramid.create(depth.rows () / 2, depth.cols () / 2); + 
device::depthPyr(depth, pyramid, sigma_depth); +} + +/** + * + */ +void kfusion::cuda::waitAllDefaultStream() +{ + cudaSafeCall(cudaDeviceSynchronize()); +} + +/** + * + * \param intr + * \param depth + * \param normals + */ +void kfusion::cuda::computeNormalsAndMaskDepth(const Intr& intr, Depth& depth, Normals& normals) +{ + normals.create(depth.rows(), depth.cols()); + + device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy); + + device::Normals& n = (device::Normals&)normals; + device::computeNormalsAndMaskDepth(reproj, depth, n); +} + +/** + * + * \param intr + * \param depth + * \param points + * \param normals + */ +void kfusion::cuda::computePointNormals(const Intr& intr, const Depth& depth, Cloud& points, Normals& normals) +{ + points.create(depth.rows(), depth.cols()); + normals.create(depth.rows(), depth.cols()); + + device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy); + + device::Points& p = (device::Points&)points; + device::Normals& n = (device::Normals&)normals; + device::computePointNormals(reproj, depth, p, n); +} + +/** + * + * \param depth + * \param dists + * \param intr + */ +void kfusion::cuda::computeDists(const Depth& depth, Dists& dists, const Intr& intr) +{ + dists.create(depth.rows(), depth.cols()); + device::compute_dists(depth, dists, make_float2(intr.fx, intr.fy), make_float2(intr.cx, intr.cy)); +} + +/** + * + * \param cloud + * \param depth + */ +void kfusion::cuda::cloudToDepth(const Cloud& cloud, Depth& depth) +{ + depth.create(cloud.rows(), cloud.cols()); + device::Points points = (device::Points&)cloud; + device::cloud_to_depth(points, depth); +} + +/** + * + * \param depth + * \param normals + * \param depth_out + * \param normals_out + */ +void kfusion::cuda::resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out) +{ + depth_out.create(depth.rows() / 2, depth.cols() / 2); + normals_out.create(normals.rows() / 2, normals.cols() / 2); + + device::Normals& nsrc = (device::Normals&)normals; + device::Normals& ndst = (device::Normals&)normals_out; + + device::resizeDepthNormals(depth, nsrc, depth_out, ndst); +} + +/** + * + * \param points + * \param normals + * \param points_out + * \param normals_out + */ +void kfusion::cuda::resizePointsNormals(const Cloud& points, const Normals& normals, Cloud& points_out, Normals& normals_out) +{ + points_out.create(points.rows() / 2, points.cols() / 2); + normals_out.create(normals.rows() / 2, normals.cols() / 2); + + device::Points& pi = (device::Points&)points; + device::Normals& ni= (device::Normals&)normals; + + device::Points& po = (device::Points&)points_out; + device::Normals& no = (device::Normals&)normals_out; + + device::resizePointsNormals(pi, ni, po, no); +} + +/** + * + * \param depth + * \param normals + * \param intr + * \param light_pose + * \param image + */ +void kfusion::cuda::renderImage(const Depth& depth, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image) +{ + image.create(depth.rows(), depth.cols()); + + const device::Depth& d = (const device::Depth&)depth; + const device::Normals& n = (const device::Normals&)normals; + device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy); + device::Vec3f light = device_cast(light_pose); + + device::Image& i = (device::Image&)image; + device::renderImage(d, n, reproj, light, i); + waitAllDefaultStream(); +} + +/** + * + * \param points + * \param normals + * \param intr + * \param light_pose + * \param image + */ +void kfusion::cuda::renderImage(const Cloud& 
points, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image) +{ + image.create(points.rows(), points.cols()); + + const device::Points& p = (const device::Points&)points; + const device::Normals& n = (const device::Normals&)normals; + device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy); + device::Vec3f light = device_cast(light_pose); + + device::Image& i = (device::Image&)image; + device::renderImage(p, n, reproj, light, i); + waitAllDefaultStream(); +} + +/** + * \brief + * \param normals + * \param image + */ +void kfusion::cuda::renderTangentColors(const Normals& normals, Image& image) +{ + image.create(normals.rows(), normals.cols()); + const device::Normals& n = (const device::Normals&)normals; + device::Image& i = (device::Image&)image; + + device::renderTangentColors(n, i); + waitAllDefaultStream(); +} diff --git a/modules/dynamicfusion/kfusion/src/internal.hpp b/modules/dynamicfusion/kfusion/src/internal.hpp new file mode 100644 index 00000000000..612c5cb89e9 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/internal.hpp @@ -0,0 +1,149 @@ +#pragma once + +#include +#include "safe_call.hpp" +#include +//#define USE_DEPTH + +namespace kfusion +{ + namespace device + { + typedef float4 Normal; + typedef float4 Point; + + typedef unsigned short ushort; + typedef unsigned char uchar; + + typedef PtrStepSz Dists; + typedef DeviceArray2D Depth; + typedef DeviceArray2D Normals; + typedef DeviceArray2D Points; + typedef DeviceArray2D Image; + + typedef int3 Vec3i; + typedef float3 Vec3f; + struct Mat3f { float3 data[3]; }; + struct Aff3f { Mat3f R; Vec3f t; }; + + struct TsdfVolume + { + public: + typedef ushort2 elem_type; + + elem_type *const data; + const int3 dims; + const float3 voxel_size; + const float trunc_dist; + const int max_weight; + + TsdfVolume(elem_type* data, int3 dims, float3 voxel_size, float trunc_dist, int max_weight); + //TsdfVolume(const TsdfVolume&); + + __kf_device__ elem_type* operator()(int x, int y, int z); + __kf_device__ const elem_type* operator() (int x, int y, int z) const ; + __kf_device__ elem_type* beg(int x, int y) const; + __kf_device__ elem_type* zstep(elem_type *const ptr) const; + private: + TsdfVolume& operator=(const TsdfVolume&); + }; + + struct Projector + { + float2 f, c; + Projector(){} + Projector(float fx, float fy, float cx, float cy); + __kf_device__ float2 operator()(const float3& p) const; + }; + + struct Reprojector + { + Reprojector() {} + Reprojector(float fx, float fy, float cx, float cy); + float2 finv, c; + __kf_device__ float3 operator()(int x, int y, float z) const; + }; + + struct ComputeIcpHelper + { + struct Policy; + struct PageLockHelper + { + float* data; + PageLockHelper(); + ~PageLockHelper(); + }; + + float min_cosine; + float dist2_thres; + + Aff3f aff; + + float rows, cols; + float2 f, c, finv; + + PtrStep dcurr; + PtrStep ncurr; + PtrStep vcurr; + + ComputeIcpHelper(float dist_thres, float angle_thres); + void setLevelIntr(int level_index, float fx, float fy, float cx, float cy); + + void operator()(const Depth& dprev, const Normals& nprev, DeviceArray2D& buffer, float* data, cudaStream_t stream); + void operator()(const Points& vprev, const Normals& nprev, DeviceArray2D& buffer, float* data, cudaStream_t stream); + + static void allocate_buffer(DeviceArray2D& buffer, int partials_count = -1); + + //private: + __kf_device__ int find_coresp(int x, int y, float3& n, float3& d, float3& s) const; + __kf_device__ void partial_reduce(const float row[7], PtrStep& partial_buffer) 
const; + __kf_device__ float2 proj(const float3& p) const; + __kf_device__ float3 reproj(float x, float y, float z) const; + }; + + //tsdf volume functions + void clear_volume(TsdfVolume volume); + void integrate(const Dists& depth, TsdfVolume& volume, const Aff3f& aff, const Projector& proj); + void project(const Dists& depth, Points& vertices, const Projector& proj); + void project_and_remove(PtrStepSz& dists, Points &vertices, const Projector &proj); + void project_and_remove(const PtrStepSz& dists, Points &vertices, const Projector &proj); + + void raycast(const TsdfVolume& volume, const Aff3f& aff, const Mat3f& Rinv, + const Reprojector& reproj, Depth& depth, Normals& normals, float step_factor, float delta_factor); + + void raycast(const TsdfVolume& volume, const Aff3f& aff, const Mat3f& Rinv, + const Reprojector& reproj, Points& points, Normals& normals, float step_factor, float delta_factor); + + __kf_device__ ushort2 pack_tsdf(float tsdf, int weight); + __kf_device__ float unpack_tsdf(ushort2 value, int& weight); + __kf_device__ float unpack_tsdf(ushort2 value); + + + //image proc functions + void compute_dists(const Depth& depth, Dists dists, float2 f, float2 c); + void cloud_to_depth(const Points& cloud, Depth depth); + + void truncateDepth(Depth& depth, float max_dist /*meters*/); + void bilateralFilter(const Depth& src, Depth& dst, int kernel_size, float sigma_spatial, float sigma_depth); + void depthPyr(const Depth& source, Depth& pyramid, float sigma_depth); + + void resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out); + void resizePointsNormals(const Points& points, const Normals& normals, Points& points_out, Normals& normals_out); + + void computeNormalsAndMaskDepth(const Reprojector& reproj, Depth& depth, Normals& normals); + void computePointNormals(const Reprojector& reproj, const Depth& depth, Points& points, Normals& normals); + + void renderImage(const Depth& depth, const Normals& normals, const Reprojector& reproj, const Vec3f& light_pose, Image& image); + void renderImage(const Points& points, const Normals& normals, const Reprojector& reproj, const Vec3f& light_pose, Image& image); + void renderTangentColors(const Normals& normals, Image& image); + + + //exctraction functionality + size_t extractCloud(const TsdfVolume& volume, const Aff3f& aff, PtrSz output); + void extractNormals(const TsdfVolume& volume, const PtrSz& points, const Aff3f& aff, const Mat3f& Rinv, float gradient_delta_factor, float4* output); + + struct float8 { float x, y, z, w, c1, c2, c3, c4; }; + struct float12 { float x, y, z, w, normal_x, normal_y, normal_z, n4, c1, c2, c3, c4; }; + void mergePointNormal(const DeviceArray& cloud, const DeviceArray& normals, const DeviceArray& output); + } +} diff --git a/modules/dynamicfusion/kfusion/src/kinfu.cpp b/modules/dynamicfusion/kfusion/src/kinfu.cpp new file mode 100644 index 00000000000..7a057c62b84 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/kinfu.cpp @@ -0,0 +1,445 @@ +#include "precomp.hpp" +#include "internal.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; +using namespace kfusion; +using namespace kfusion::cuda; + +static inline float deg2rad (float alpha) { return alpha * 0.017453293f; } + +/** + * \brief + * \return + */ +kfusion::KinFuParams kfusion::KinFuParams::default_params_dynamicfusion() +{ + const int iters[] = {10, 5, 4, 0}; + const int levels = sizeof(iters)/sizeof(iters[0]); + + KinFuParams 
p; +// TODO: this should be coming from a calibration file / shouldn't be hardcoded + p.cols = 640; //pixels + p.rows = 480; //pixels + p.intr = Intr(570.342f, 570.342f, 320.f, 240.f); + + p.volume_dims = Vec3i::all(512); //number of voxels + p.volume_size = Vec3f::all(1.f); //meters + p.volume_pose = Affine3f().translate(Vec3f(-p.volume_size[0]/2, -p.volume_size[1]/2, 0.5f)); + + p.bilateral_sigma_depth = 0.04f; //meter + p.bilateral_sigma_spatial = 4.5; //pixels + p.bilateral_kernel_size = 7; //pixels + + p.icp_truncate_depth_dist = 0.f; //meters, disabled + p.icp_dist_thres = 0.1f; //meters + p.icp_angle_thres = deg2rad(30.f); //radians + p.icp_iter_num.assign(iters, iters + levels); + + p.tsdf_min_camera_movement = 0.f; //meters, disabled + p.tsdf_trunc_dist = 0.04f; //meters; + p.tsdf_max_weight = 64; //frames + + p.raycast_step_factor = 0.75f; //in voxel sizes + p.gradient_delta_factor = 0.5f; //in voxel sizes + + //p.light_pose = p.volume_pose.translation()/4; //meters + p.light_pose = Vec3f::all(0.f); //meters + + return p; +} + +/** + * \brief + * \return + */ +kfusion::KinFuParams kfusion::KinFuParams::default_params() +{ + const int iters[] = {10, 5, 4, 0}; + const int levels = sizeof(iters)/sizeof(iters[0]); + + KinFuParams p; +// TODO: this should be coming from a calibration file / shouldn't be hardcoded + p.cols = 640; //pixels + p.rows = 480; //pixels + p.intr = Intr(525.f, 525.f, p.cols/2 - 0.5f, p.rows/2 - 0.5f); + + p.volume_dims = Vec3i::all(512); //number of voxels + p.volume_size = Vec3f::all(3.f); //meters + p.volume_pose = Affine3f().translate(Vec3f(-p.volume_size[0]/2, -p.volume_size[1]/2, 0.5f)); + + p.bilateral_sigma_depth = 0.04f; //meter + p.bilateral_sigma_spatial = 4.5; //pixels + p.bilateral_kernel_size = 7; //pixels + + p.icp_truncate_depth_dist = 0.f; //meters, disabled + p.icp_dist_thres = 0.1f; //meters + p.icp_angle_thres = deg2rad(30.f); //radians + p.icp_iter_num.assign(iters, iters + levels); + + p.tsdf_min_camera_movement = 0.f; //meters, disabled + p.tsdf_trunc_dist = 0.04f; //meters; + p.tsdf_max_weight = 64; //frames + + p.raycast_step_factor = 0.75f; //in voxel sizes + p.gradient_delta_factor = 0.5f; //in voxel sizes + + //p.light_pose = p.volume_pose.translation()/4; //meters + p.light_pose = Vec3f::all(0.f); //meters + + return p; +} + +/** + * \brief + * \param params + */ +kfusion::KinFu::KinFu(const KinFuParams& params) : frame_counter_(0), params_(params) +{ + CV_Assert(params.volume_dims[0] % 32 == 0); + + volume_ = cv::Ptr(new cuda::TsdfVolume(params_.volume_dims)); + warp_ = cv::Ptr(new WarpField()); + + volume_->setTruncDist(params_.tsdf_trunc_dist); + volume_->setMaxWeight(params_.tsdf_max_weight); + volume_->setSize(params_.volume_size); + volume_->setPose(params_.volume_pose); + volume_->setRaycastStepFactor(params_.raycast_step_factor); + volume_->setGradientDeltaFactor(params_.gradient_delta_factor); + + icp_ = cv::Ptr(new cuda::ProjectiveICP()); + icp_->setDistThreshold(params_.icp_dist_thres); + icp_->setAngleThreshold(params_.icp_angle_thres); + icp_->setIterationsNum(params_.icp_iter_num); + + allocate_buffers(); + reset(); +} + +const kfusion::KinFuParams& kfusion::KinFu::params() const +{ return params_; } + +kfusion::KinFuParams& kfusion::KinFu::params() +{ return params_; } + +const kfusion::cuda::TsdfVolume& kfusion::KinFu::tsdf() const +{ return *volume_; } + +kfusion::cuda::TsdfVolume& kfusion::KinFu::tsdf() +{ return *volume_; } + +const kfusion::cuda::ProjectiveICP& kfusion::KinFu::icp() const +{ return *icp_; } + 
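+// The non-const overloads below hand out mutable references, letting callers reconfigure the
+// ICP solver and the warp field after the pipeline has been constructed.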
+kfusion::cuda::ProjectiveICP& kfusion::KinFu::icp() +{ return *icp_; } + +const kfusion::WarpField& kfusion::KinFu::getWarp() const +{ return *warp_; } + +kfusion::WarpField& kfusion::KinFu::getWarp() +{ return *warp_; } + +void kfusion::KinFu::allocate_buffers() +{ + const int LEVELS = cuda::ProjectiveICP::MAX_PYRAMID_LEVELS; + + int cols = params_.cols; + int rows = params_.rows; + + dists_.create(rows, cols); + + curr_.depth_pyr.resize(LEVELS); + curr_.normals_pyr.resize(LEVELS); + first_.normals_pyr.resize(LEVELS); + first_.depth_pyr.resize(LEVELS); + prev_.depth_pyr.resize(LEVELS); + prev_.normals_pyr.resize(LEVELS); + first_.normals_pyr.resize(LEVELS); + + curr_.points_pyr.resize(LEVELS); + prev_.points_pyr.resize(LEVELS); + first_.points_pyr.resize(LEVELS); + + for(int i = 0; i < LEVELS; ++i) + { + curr_.depth_pyr[i].create(rows, cols); + curr_.normals_pyr[i].create(rows, cols); + + prev_.depth_pyr[i].create(rows, cols); + prev_.normals_pyr[i].create(rows, cols); + + first_.depth_pyr[i].create(rows, cols); + first_.normals_pyr[i].create(rows, cols); + + curr_.points_pyr[i].create(rows, cols); + prev_.points_pyr[i].create(rows, cols); + first_.points_pyr[i].create(rows, cols); + + cols /= 2; + rows /= 2; + } + + depths_.create(params_.rows, params_.cols); + normals_.create(params_.rows, params_.cols); + points_.create(params_.rows, params_.cols); +} + +void kfusion::KinFu::reset() +{ + if (frame_counter_) + cout << "Reset" << endl; + + frame_counter_ = 0; + poses_.clear(); + poses_.reserve(30000); + poses_.push_back(Affine3f::Identity()); + volume_->clear(); + warp_->clear(); +} + +/** + * \brief + * \param time + * \return + */ +kfusion::Affine3f kfusion::KinFu::getCameraPose (int time) const +{ + if (time > (int)poses_.size () || time < 0) + time = (int)poses_.size () - 1; + return poses_[time]; +} + +bool kfusion::KinFu::operator()(const kfusion::cuda::Depth& depth, const kfusion::cuda::Image& /*image*/) +{ + const KinFuParams& p = params_; + const int LEVELS = icp_->getUsedLevelsNum(); + + cuda::computeDists(depth, dists_, p.intr); + cuda::depthBilateralFilter(depth, curr_.depth_pyr[0], p.bilateral_kernel_size, p.bilateral_sigma_spatial, p.bilateral_sigma_depth); + + if (p.icp_truncate_depth_dist > 0) + kfusion::cuda::depthTruncation(curr_.depth_pyr[0], p.icp_truncate_depth_dist); + + for (int i = 1; i < LEVELS; ++i) + cuda::depthBuildPyramid(curr_.depth_pyr[i-1], curr_.depth_pyr[i], p.bilateral_sigma_depth); + + for (int i = 0; i < LEVELS; ++i) +#if defined USE_DEPTH + cuda::computeNormalsAndMaskDepth(p.intr(i), curr_.depth_pyr[i], curr_.normals_pyr[i]); +#else + cuda::computePointNormals(p.intr(i), curr_.depth_pyr[i], curr_.points_pyr[i], curr_.normals_pyr[i]); +#endif + + cuda::waitAllDefaultStream(); + + //can't perform more on first frame + if (frame_counter_ == 0) + { + + volume_->integrate(dists_, poses_.back(), p.intr); + volume_->compute_points(); + volume_->compute_normals(); + + warp_->init(volume_->get_cloud_host(), volume_->get_normal_host()); + + #if defined USE_DEPTH + curr_.depth_pyr.swap(prev_.depth_pyr); + curr_.depth_pyr.swap(first_.depth_pyr); +#else + curr_.points_pyr.swap(prev_.points_pyr); + curr_.points_pyr.swap(first_.points_pyr); +#endif + curr_.normals_pyr.swap(prev_.normals_pyr); + curr_.normals_pyr.swap(first_.normals_pyr); + return ++frame_counter_, false; + } + + /////////////////////////////////////////////////////////////////////////////////////////// + // ICP + Affine3f affine; // curr -> prev + { + //ScopeTime time("icp"); +#if defined 
USE_DEPTH + bool ok = icp_->estimateTransform(affine, p.intr, curr_.depth_pyr, curr_.normals_pyr, prev_.depth_pyr, prev_.normals_pyr); +#else + bool ok = icp_->estimateTransform(affine, p.intr, curr_.points_pyr, curr_.normals_pyr, prev_.points_pyr, prev_.normals_pyr); +#endif + if (!ok) + return reset(), false; + } + + poses_.push_back(poses_.back() * affine); // curr -> global +// auto d = depth; + auto d = curr_.depth_pyr[0]; + auto pts = curr_.points_pyr[0]; + auto n = curr_.normals_pyr[0]; + dynamicfusion(d, pts, n); + + + /////////////////////////////////////////////////////////////////////////////////////////// + // Ray casting + { + //ScopeTime time("ray-cast-all"); +#if defined USE_DEPTH + volume_->raycast(poses_.back(), p.intr, prev_.depth_pyr[0], prev_.normals_pyr[0]); + for (int i = 1; i < LEVELS; ++i) + resizeDepthNormals(prev_.depth_pyr[i-1], prev_.normals_pyr[i-1], prev_.depth_pyr[i], prev_.normals_pyr[i]); +#else + volume_->raycast(poses_.back(), p.intr, prev_.points_pyr[0], prev_.normals_pyr[0]); + for (int i = 1; i < LEVELS; ++i) + resizePointsNormals(prev_.points_pyr[i-1], prev_.normals_pyr[i-1], prev_.points_pyr[i], prev_.normals_pyr[i]); +#endif + cuda::waitAllDefaultStream(); + } + + return ++frame_counter_, true; +} + +/** + * \brief + * \param image + * \param flag + */ +void kfusion::KinFu::renderImage(cuda::Image& image, int flag) +{ + const KinFuParams& p = params_; + image.create(p.rows, flag != 3 ? p.cols : p.cols * 2); + +#if defined USE_DEPTH + #define PASS1 prev_.depth_pyr +#else + #define PASS1 prev_.points_pyr +#endif + + if (flag < 1 || flag > 3) + cuda::renderImage(PASS1[0], prev_.normals_pyr[0], params_.intr, params_.light_pose, image); + else if (flag == 2) + cuda::renderTangentColors(prev_.normals_pyr[0], image); + else /* if (flag == 3) */ + { + DeviceArray2D i1(p.rows, p.cols, image.ptr(), image.step()); + DeviceArray2D i2(p.rows, p.cols, image.ptr() + p.cols, image.step()); + + cuda::renderImage(PASS1[0], prev_.normals_pyr[0], params_.intr, params_.light_pose, i1); + cuda::renderTangentColors(prev_.normals_pyr[0], i2); + + } +#undef PASS1 +} + +/** + * \brief + * \param image + * \param flag + */ +void kfusion::KinFu::dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame, cuda::Normals current_normals) +{ + cuda::Cloud cloud; + cuda::Normals normals; + cloud.create(depth.rows(), depth.cols()); + normals.create(depth.rows(), depth.cols()); + auto camera_pose = poses_.back(); + tsdf().raycast(camera_pose, params_.intr, cloud, normals); + + + cv::Mat cloud_host(depth.rows(), depth.cols(), CV_32FC4); + cloud.download(cloud_host.ptr(), cloud_host.step); + std::vector warped(cloud_host.rows * cloud_host.cols); + auto inverse_pose = camera_pose.inv(cv::DECOMP_SVD); + for (int i = 0; i < cloud_host.rows; i++) + for (int j = 0; j < cloud_host.cols; j++) { + Point point = cloud_host.at(i, j); + warped[i * cloud_host.cols + j][0] = point.x; + warped[i * cloud_host.cols + j][1] = point.y; + warped[i * cloud_host.cols + j][2] = point.z; + warped[i * cloud_host.cols + j] = inverse_pose * warped[i * cloud_host.cols + j]; + } + + cv::Mat normal_host(cloud_host.rows, cloud_host.cols, CV_32FC4); + normals.download(normal_host.ptr(), normal_host.step); + + std::vector warped_normals(normal_host.rows * normal_host.cols); + for (int i = 0; i < normal_host.rows; i++) + for (int j = 0; j < normal_host.cols; j++) { + auto point = normal_host.at(i, j); + warped_normals[i * normal_host.cols + j][0] = point.x; + warped_normals[i * normal_host.cols + j][1] = point.y; + 
warped_normals[i * normal_host.cols + j][2] = point.z; + } + + +// current_frame.download(cloud_host.ptr(), cloud_host.step); +// std::vector live(cloud_host.rows * cloud_host.cols); +// for (int i = 0; i < cloud_host.rows; i++) +// for (int j = 0; j < cloud_host.cols; j++) { +// Point point = cloud_host.at(i, j); +// live[i * cloud_host.cols + j][0] = point.x; +// live[i * cloud_host.cols + j][1] = point.y; +// live[i * cloud_host.cols + j][2] = point.z; +// live[i * cloud_host.cols + j] = inverse_pose * warped[i * cloud_host.cols + j]; +// } + + + + std::vector canonical_visible(warped); +// getWarp().energy_data(warped, warped_normals, warped, warped_normals); //crashes, leave out for now + +// getWarp().warp(warped, warped_normals); +// //ScopeTime time("fusion"); + tsdf().surface_fusion(getWarp(), warped, canonical_visible, depth, camera_pose, params_.intr); + + cv::Mat depth_cloud(depth.rows(),depth.cols(), CV_16U); + depth.download(depth_cloud.ptr(), depth_cloud.step); + cv::Mat display; + depth_cloud.convertTo(display, CV_8U, 255.0/4000); + cv::imshow("Depth diff", display); + volume_->compute_points(); + volume_->compute_normals(); +} + +/** + * \brief + * \param image + * \param pose + * \param flag + */ +void kfusion::KinFu::renderImage(cuda::Image& image, const Affine3f& pose, int flag) { + const KinFuParams &p = params_; + image.create(p.rows, flag != 3 ? p.cols : p.cols * 2); + depths_.create(p.rows, p.cols); + normals_.create(p.rows, p.cols); + points_.create(p.rows, p.cols); + +#if defined USE_DEPTH +#define PASS1 depths_ +#else +#define PASS1 points_ +#endif + + volume_->raycast(pose, p.intr, PASS1, normals_); + + if (flag < 1 || flag > 3) + cuda::renderImage(PASS1, normals_, params_.intr, params_.light_pose, image); + else if (flag == 2) + cuda::renderTangentColors(normals_, image); + else /* if (flag == 3) */ + { + DeviceArray2D i1(p.rows, p.cols, image.ptr(), image.step()); + DeviceArray2D i2(p.rows, p.cols, image.ptr() + p.cols, image.step()); + + cuda::renderImage(PASS1, normals_, params_.intr, params_.light_pose, i1); + cuda::renderTangentColors(normals_, i2); + } +#undef PASS1 +} diff --git a/modules/dynamicfusion/kfusion/src/optimisation.cpp b/modules/dynamicfusion/kfusion/src/optimisation.cpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/modules/dynamicfusion/kfusion/src/precomp.cpp b/modules/dynamicfusion/kfusion/src/precomp.cpp new file mode 100644 index 00000000000..080169ba91f --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/precomp.cpp @@ -0,0 +1,70 @@ +#include "precomp.hpp" +#include "internal.hpp" + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Kinfu/types implementation + +kfusion::Intr::Intr () {} +kfusion::Intr::Intr (float fx_, float fy_, float cx_, float cy_) : fx(fx_), fy(fy_), cx(cx_), cy(cy_) {} + +kfusion::Intr kfusion::Intr::operator()(int level_index) const +{ + int div = 1 << level_index; + return (Intr (fx / div, fy / div, cx / div, cy / div)); +} + +std::ostream& operator << (std::ostream& os, const kfusion::Intr& intr) +{ + return os << "([f = " << intr.fx << ", " << intr.fy << "] [cp = " << intr.cx << ", " << intr.cy << "])"; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// TsdfVolume host implementation + +kfusion::device::TsdfVolume::TsdfVolume(elem_type* _data, int3 _dims, float3 _voxel_size, float _trunc_dist, int _max_weight) +: data(_data), dims(_dims), 
voxel_size(_voxel_size), trunc_dist(_trunc_dist), max_weight(_max_weight) {} + +//kfusion::device::TsdfVolume::elem_type* kfusionl::device::TsdfVolume::operator()(int x, int y, int z) +//{ return data + x + y*dims.x + z*dims.y*dims.x; } +// +//const kfusion::device::TsdfVolume::elem_type* kfusionl::device::TsdfVolume::operator() (int x, int y, int z) const +//{ return data + x + y*dims.x + z*dims.y*dims.x; } +// +//kfusion::device::TsdfVolume::elem_type* kfusionl::device::TsdfVolume::beg(int x, int y) const +//{ return data + x + dims.x * y; } +// +//kfusion::device::TsdfVolume::elem_type* kfusionl::device::TsdfVolume::zstep(elem_type *const ptr) const +//{ return data + dims.x * dims.y; } + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Projector host implementation + +kfusion::device::Projector::Projector(float fx, float fy, float cx, float cy) : f(make_float2(fx, fy)), c(make_float2(cx, cy)) {} + +//float2 kfusion::device::Projector::operator()(const float3& p) const +//{ +// float2 coo; +// coo.x = p.x * f.x / p.z + c.x; +// coo.y = p.y * f.y / p.z + c.y; +// return coo; +//} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Reprojector host implementation + +kfusion::device::Reprojector::Reprojector(float fx, float fy, float cx, float cy) : finv(make_float2(1.f/fx, 1.f/fy)), c(make_float2(cx, cy)) {} + +//float3 kfusion::device::Reprojector::operator()(int u, int v, float z) const +//{ +// float x = z * (u - c.x) * finv.x; +// float y = z * (v - c.y) * finv.y; +// return make_float3(x, y, z); +//} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// Host implementation of packing/unpacking tsdf volume element + +//ushort2 kfusion::device::pack_tsdf(float tsdf, int weight) { throw "Not implemented"; return ushort2(); } +//float kfusion::device::unpack_tsdf(ushort2 value, int& weight) { throw "Not implemented"; return 0; } +//float kfusion::device::unpack_tsdf(ushort2 value) { throw "Not implemented"; return 0; } + diff --git a/modules/dynamicfusion/kfusion/src/precomp.hpp b/modules/dynamicfusion/kfusion/src/precomp.hpp new file mode 100644 index 00000000000..52b054ce7be --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/precomp.hpp @@ -0,0 +1,31 @@ +#pragma once + +#include +#include +#include +#include +#include +//#include +#include "internal.hpp" +#include +#include "vector_functions.h" + +namespace kfusion +{ + template + inline D device_cast(const S& source) + { + return *reinterpret_cast(source.val); + } + + template<> + inline device::Aff3f device_cast(const Affine3f& source) + { + device::Aff3f aff; + Mat3f R = source.rotation(); + Vec3f t = source.translation(); + aff.R = device_cast(R); + aff.t = device_cast(t); + return aff; + } +} diff --git a/modules/dynamicfusion/kfusion/src/projective_icp.cpp b/modules/dynamicfusion/kfusion/src/projective_icp.cpp new file mode 100644 index 00000000000..3554f00becc --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/projective_icp.cpp @@ -0,0 +1,214 @@ +#include "precomp.hpp" + + +using namespace kfusion; +using namespace std; +using namespace kfusion::cuda; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// ComputeIcpHelper + +kfusion::device::ComputeIcpHelper::ComputeIcpHelper(float dist_thres, float angle_thres) +{ + min_cosine = cos(angle_thres); + 
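+    // Both thresholds are pre-converted here so that the per-correspondence tests can
+    // compare cosines and squared distances directly.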
dist2_thres = dist_thres * dist_thres; +} + +void kfusion::device::ComputeIcpHelper::setLevelIntr(int level_index, float fx, float fy, float cx, float cy) +{ + int div = 1 << level_index; + f = make_float2(fx/div, fy/div); + c = make_float2(cx/div, cy/div); + finv = make_float2(1.f/f.x, 1.f/f.y); +} + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// ProjectiveICP::StreamHelper + +struct kfusion::cuda::ProjectiveICP::StreamHelper +{ + typedef device::ComputeIcpHelper::PageLockHelper PageLockHelper; + typedef cv::Matx66f Mat6f; + typedef cv::Vec6f Vec6f; + + cudaStream_t stream; + PageLockHelper locked_buffer; + + StreamHelper() { cudaSafeCall( cudaStreamCreate(&stream) ); } + ~StreamHelper() { cudaSafeCall( cudaStreamDestroy(stream) ); } + + operator float*() { return locked_buffer.data; } + operator cudaStream_t() { return stream; } + + Mat6f get(Vec6f& b) + { + cudaSafeCall( cudaStreamSynchronize(stream) ); + + Mat6f A; + float *data_A = A.val; + float *data_b = b.val; + + int shift = 0; + for (int i = 0; i < 6; ++i) //rows + for (int j = i; j < 7; ++j) // cols + b + { + float value = locked_buffer.data[shift++]; + if (j == 6) // vector b + data_b[i] = value; + else + data_A[j * 6 + i] = data_A[i * 6 + j] = value; + } + return A; + } +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/// ProjectiveICP + +kfusion::cuda::ProjectiveICP::ProjectiveICP() : angle_thres_(deg2rad(20.f)), dist_thres_(0.1f) +{ + const int iters[] = {10, 5, 4, 0}; + std::vector vector_iters(iters, iters + 4); + setIterationsNum(vector_iters); + device::ComputeIcpHelper::allocate_buffer(buffer_); + + shelp_ = cv::Ptr(new StreamHelper()); +} + +kfusion::cuda::ProjectiveICP::~ProjectiveICP() {} + +float kfusion::cuda::ProjectiveICP::getDistThreshold() const +{ return dist_thres_; } + +void kfusion::cuda::ProjectiveICP::setDistThreshold(float distance) +{ dist_thres_ = distance; } + +float kfusion::cuda::ProjectiveICP::getAngleThreshold() const +{ return angle_thres_; } + +void kfusion::cuda::ProjectiveICP::setAngleThreshold(float angle) +{ angle_thres_ = angle; } + +void kfusion::cuda::ProjectiveICP::setIterationsNum(const std::vector& iters) +{ + if (iters.size() >= MAX_PYRAMID_LEVELS) + iters_.assign(iters.begin(), iters.begin() + MAX_PYRAMID_LEVELS); + else + { + iters_ = vector(MAX_PYRAMID_LEVELS, 0); + copy(iters.begin(), iters.end(),iters_.begin()); + } +} + +int kfusion::cuda::ProjectiveICP::getUsedLevelsNum() const +{ + int i = MAX_PYRAMID_LEVELS - 1; + for(; i >= 0 && !iters_[i]; --i); + return i + 1; +} + +bool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f& /*affine*/, const Intr& /*intr*/, const Frame& /*curr*/, const Frame& /*prev*/) +{ +// bool has_depth = !curr.depth_pyr.empty() && !prev.depth_pyr.empty(); +// bool has_points = !curr.points_pyr.empty() && !prev.points_pyr.empty(); + +// if (has_depth) +// return estimateTransform(affine, intr, curr.depth_pyr, curr.normals_pyr, prev.depth_pyr, prev.normals_pyr); +// else if(has_points) +// return estimateTransform(affine, intr, curr.points_pyr, curr.normals_pyr, prev.points_pyr, prev.normals_pyr); +// else +// CV_Assert(!"Wrong parameters passed to estimateTransform"); + CV_Assert(!"Not implemented"); + return false; +} + +bool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f& affine, const Intr& intr, const DepthPyr& dcurr, const NormalsPyr ncurr, const DepthPyr dprev, const NormalsPyr nprev) +{ + 
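+    // Coarse-to-fine projective ICP: iterate from the coarsest pyramid level down to
+    // full resolution. Each inner iteration accumulates a 6x6 normal-equation system
+    // A*x = b on the GPU, solves it on the host for a twist increment (3 rotation +
+    // 3 translation parameters), and composes that increment with the running pose
+    // estimate; a (near-)singular A aborts the alignment.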
const int LEVELS = getUsedLevelsNum(); + StreamHelper& sh = *shelp_; + + device::ComputeIcpHelper helper(dist_thres_, angle_thres_); + affine = Affine3f::Identity(); + + for(int level_index = LEVELS - 1; level_index >= 0; --level_index) + { + const device::Normals& n = (const device::Normals& )nprev[level_index]; + + helper.rows = (float)n.rows(); + helper.cols = (float)n.cols(); + helper.setLevelIntr(level_index, intr.fx, intr.fy, intr.cx, intr.cy); + helper.dcurr = dcurr[level_index]; + helper.ncurr = ncurr[level_index]; + + for(int iter = 0; iter < iters_[level_index]; ++iter) + { + helper.aff = device_cast(affine); + helper(dprev[level_index], n, buffer_, sh, sh); + + StreamHelper::Vec6f b; + StreamHelper::Mat6f A = sh.get(b); + + //checking nullspace + double det = cv::determinant(A); + + if (fabs (det) < 1e-15 || cv::viz::isNan(det)) + { + if (cv::viz::isNan(det)) cout << "qnan" << endl; + return false; + } + + StreamHelper::Vec6f r; + cv::solve(A, b, r, cv::DECOMP_SVD); + Affine3f Tinc(Vec3f(r.val), Vec3f(r.val+3)); + affine = Tinc * affine; + } + } + return true; +} + +bool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f& affine, const Intr& intr, const PointsPyr& vcurr, const NormalsPyr ncurr, const PointsPyr vprev, const NormalsPyr nprev) +{ + const int LEVELS = getUsedLevelsNum(); + StreamHelper& sh = *shelp_; + + device::ComputeIcpHelper helper(dist_thres_, angle_thres_); + affine = Affine3f::Identity(); + + for(int level_index = LEVELS - 1; level_index >= 0; --level_index) + { + const device::Normals& n = (const device::Normals& )nprev[level_index]; + const device::Points& v = (const device::Points& )vprev[level_index]; + + helper.rows = (float)n.rows(); + helper.cols = (float)n.cols(); + helper.setLevelIntr(level_index, intr.fx, intr.fy, intr.cx, intr.cy); + helper.vcurr = vcurr[level_index]; + helper.ncurr = ncurr[level_index]; + + for(int iter = 0; iter < iters_[level_index]; ++iter) + { + helper.aff = device_cast(affine); + helper(v, n, buffer_, sh, sh); + + StreamHelper::Vec6f b; + StreamHelper::Mat6f A = sh.get(b); + + //checking nullspace + double det = cv::determinant(A); + + if (fabs (det) < 1e-15 || cv::viz::isNan (det)) + { + if (cv::viz::isNan (det)) cout << "qnan" << endl; + return false; + } + + StreamHelper::Vec6f r; + cv::solve(A, b, r, cv::DECOMP_SVD); + + Affine3f Tinc(Vec3f(r.val), Vec3f(r.val+3)); + affine = Tinc * affine; + } + } + return true; +} + diff --git a/modules/dynamicfusion/kfusion/src/safe_call.hpp b/modules/dynamicfusion/kfusion/src/safe_call.hpp new file mode 100644 index 00000000000..d0bb9919b9a --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/safe_call.hpp @@ -0,0 +1,36 @@ +#pragma once + +#include "cuda_runtime_api.h" + +namespace kfusion +{ + namespace cuda + { + void error(const char *error_string, const char *file, const int line, const char *func); + } +} + +#if defined(__GNUC__) + #define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__, __func__) +#else /* defined(__CUDACC__) || defined(__MSVC__) */ + #define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__) +#endif + +namespace kfusion +{ + namespace cuda + { + static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + if (cudaSuccess != err) + error(cudaGetErrorString(err), file, line, func); + } + + static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; } + } + + namespace device + { + using kfusion::cuda::divUp; + } +} diff --git 
a/modules/dynamicfusion/kfusion/src/tsdf_volume.cpp b/modules/dynamicfusion/kfusion/src/tsdf_volume.cpp new file mode 100644 index 00000000000..2451bc6ba25 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/tsdf_volume.cpp @@ -0,0 +1,334 @@ +#include "precomp.hpp" +#include "dual_quaternion.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +using namespace kfusion; +using namespace kfusion::cuda; + + +kfusion::cuda::TsdfVolume::TsdfVolume(const Vec3i& dims) : data_(), + trunc_dist_(0.03f), + max_weight_(128), + dims_(dims), + size_(Vec3f::all(3.f)), + pose_(Affine3f::Identity()), + gradient_delta_factor_(0.75f), + raycast_step_factor_(0.75f) +{ + create(dims_); +} + +kfusion::cuda::TsdfVolume::~TsdfVolume() +{ + delete cloud_host; + delete cloud_buffer; + delete cloud; + delete normal_host; + delete normal_buffer; +} + +/** + * \brief + * \param dims + */ +void kfusion::cuda::TsdfVolume::create(const Vec3i& dims) +{ + dims_ = dims; + int voxels_number = dims_[0] * dims_[1] * dims_[2]; + data_.create(voxels_number * sizeof(int)); + setTruncDist(trunc_dist_); + clear(); +} + +/** + * \brief + * \return + */ +Vec3i kfusion::cuda::TsdfVolume::getDims() const +{ + return dims_; +} + +/** + * \brief + * \return + */ +Vec3f kfusion::cuda::TsdfVolume::getVoxelSize() const +{ + return Vec3f(size_[0] / dims_[0], size_[1] / dims_[1], size_[2] / dims_[2]); +} + +const CudaData kfusion::cuda::TsdfVolume::data() const { return data_; } +CudaData kfusion::cuda::TsdfVolume::data() { return data_; } +Vec3f kfusion::cuda::TsdfVolume::getSize() const { return size_; } + +void kfusion::cuda::TsdfVolume::setSize(const Vec3f& size) +{ size_ = size; setTruncDist(trunc_dist_); } + +float kfusion::cuda::TsdfVolume::getTruncDist() const { return trunc_dist_; } + +void kfusion::cuda::TsdfVolume::setTruncDist(float distance) +{ + Vec3f vsz = getVoxelSize(); + float max_coeff = std::max(std::max(vsz[0], vsz[1]), vsz[2]); + trunc_dist_ = std::max (distance, 2.1f * max_coeff); +} +cv::Mat kfusion::cuda::TsdfVolume::get_cloud_host() const {return *cloud_host;}; +cv::Mat kfusion::cuda::TsdfVolume::get_normal_host() const {return *normal_host;}; +cv::Mat* kfusion::cuda::TsdfVolume::get_cloud_host_ptr() const {return cloud_host;}; +cv::Mat* kfusion::cuda::TsdfVolume::get_normal_host_ptr() const {return normal_host;}; + +int kfusion::cuda::TsdfVolume::getMaxWeight() const { return max_weight_; } +void kfusion::cuda::TsdfVolume::setMaxWeight(int weight) { max_weight_ = weight; } +Affine3f kfusion::cuda::TsdfVolume::getPose() const { return pose_; } +void kfusion::cuda::TsdfVolume::setPose(const Affine3f& pose) { pose_ = pose; } +float kfusion::cuda::TsdfVolume::getRaycastStepFactor() const { return raycast_step_factor_; } +void kfusion::cuda::TsdfVolume::setRaycastStepFactor(float factor) { raycast_step_factor_ = factor; } +float kfusion::cuda::TsdfVolume::getGradientDeltaFactor() const { return gradient_delta_factor_; } +void kfusion::cuda::TsdfVolume::setGradientDeltaFactor(float factor) { gradient_delta_factor_ = factor; } +void kfusion::cuda::TsdfVolume::swap(CudaData& data) { data_.swap(data); } +void kfusion::cuda::TsdfVolume::applyAffine(const Affine3f& affine) { pose_ = affine * pose_; } +void kfusion::cuda::TsdfVolume::clear() +{ + cloud_buffer = new cuda::DeviceArray(); + cloud = new cuda::DeviceArray(); + normal_buffer = new cuda::DeviceArray(); + cloud_host = new cv::Mat(); + normal_host = new cv::Mat(); + + device::Vec3i dims = device_cast(dims_); + device::Vec3f vsz = 
device_cast(getVoxelSize()); + + device::TsdfVolume volume(data_.ptr(), dims, vsz, trunc_dist_, max_weight_); + device::clear_volume(volume); +} + +/** + * \brief + * \param dists + * \param camera_pose + * \param intr + */ +void kfusion::cuda::TsdfVolume::integrate(const Dists& dists, const Affine3f& camera_pose, const Intr& intr) +{ + Affine3f vol2cam = camera_pose.inv() * pose_; + + device::Projector proj(intr.fx, intr.fy, intr.cx, intr.cy); + + device::Vec3i dims = device_cast(dims_); + device::Vec3f vsz = device_cast(getVoxelSize()); + device::Aff3f aff = device_cast(vol2cam); + + device::TsdfVolume volume(data_.ptr(), dims, vsz, trunc_dist_, max_weight_); + device::integrate(dists, volume, aff, proj); +} + +/** + * \brief + * \param camera_pose + * \param intr + * \param depth + * \param normals + */ +void kfusion::cuda::TsdfVolume::raycast(const Affine3f& camera_pose, const Intr& intr, Depth& depth, Normals& normals) +{ + DeviceArray2D& n = (DeviceArray2D&)normals; + + Affine3f cam2vol = pose_.inv() * camera_pose; + + device::Aff3f aff = device_cast(cam2vol); + device::Mat3f Rinv = device_cast(cam2vol.rotation().inv(cv::DECOMP_SVD)); + + device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy); + + device::Vec3i dims = device_cast(dims_); + device::Vec3f vsz = device_cast(getVoxelSize()); + + device::TsdfVolume volume(data_.ptr(), dims, vsz, trunc_dist_, max_weight_); + device::raycast(volume, aff, Rinv, reproj, depth, n, raycast_step_factor_, gradient_delta_factor_); + +} + +/** + * \brief + * \param camera_pose + * \param intr + * \param points + * \param normals + */ +void kfusion::cuda::TsdfVolume::raycast(const Affine3f& camera_pose, const Intr& intr, Cloud& points, Normals& normals) +{ + device::Normals& n = (device::Normals&)normals; + device::Points& p = (device::Points&)points; + + Affine3f cam2vol = pose_.inv() * camera_pose; + + device::Aff3f aff = device_cast(cam2vol); + device::Mat3f Rinv = device_cast(cam2vol.rotation().inv(cv::DECOMP_SVD)); + + device::Reprojector reproj(intr.fx, intr.fy, intr.cx, intr.cy); + + device::Vec3i dims = device_cast(dims_); + device::Vec3f vsz = device_cast(getVoxelSize()); + + device::TsdfVolume volume(data_.ptr(), dims, vsz, trunc_dist_, max_weight_); + device::raycast(volume, aff, Rinv, reproj, p, n, raycast_step_factor_, gradient_delta_factor_); +} + +/** + * \brief + * \param cloud_buffer + * \return + */ +DeviceArray kfusion::cuda::TsdfVolume::fetchCloud(DeviceArray& cloud_buffer) const +{ + // enum { DEFAULT_CLOUD_BUFFER_SIZE = 10 * 1000 * 1000 }; + enum { DEFAULT_CLOUD_BUFFER_SIZE = 256 * 256 * 256 }; + + if (cloud_buffer.empty ()) + cloud_buffer.create (DEFAULT_CLOUD_BUFFER_SIZE); + + DeviceArray& b = (DeviceArray&)cloud_buffer; + + device::Vec3i dims = device_cast(dims_); + device::Vec3f vsz = device_cast(getVoxelSize()); + device::Aff3f aff = device_cast(pose_); + + device::TsdfVolume volume((ushort2*)data_.ptr(), dims, vsz, trunc_dist_, max_weight_); + size_t size = extractCloud(volume, aff, b); + + return DeviceArray((Point*)cloud_buffer.ptr(), size); +} + +/** + * + * @param cloud + * @param normals + */ +void kfusion::cuda::TsdfVolume::fetchNormals(const DeviceArray& cloud, DeviceArray& normals) const +{ + normals.create(cloud.size()); + DeviceArray& c = (DeviceArray&)cloud; + + device::Vec3i dims = device_cast(dims_); + device::Vec3f vsz = device_cast(getVoxelSize()); + device::Aff3f aff = device_cast(pose_); + device::Mat3f Rinv = device_cast(pose_.rotation().inv(cv::DECOMP_SVD)); + + device::TsdfVolume 
volume((ushort2*)data_.ptr(), dims, vsz, trunc_dist_, max_weight_); + device::extractNormals(volume, c, aff, Rinv, gradient_delta_factor_, (float4*)normals.ptr()); +} + +//TODO: in order to make this more efficient, we can just pass in the already warped canonical points (x_t) +// and the canonical points +/** + * \brief + * \param warp_field + * \param depth_img + * \param camera_pose + * \param intr + */ +void kfusion::cuda::TsdfVolume::surface_fusion(const WarpField& warp_field, + std::vector warped, + std::vector canonical, + cuda::Depth& depth, + const Affine3f& camera_pose, + const Intr& intr) +{ + std::vector ro = psdf(warped, depth, intr); + + cuda::Dists dists; + cuda::computeDists(depth, dists, intr); + integrate(dists, camera_pose, intr); + + for(size_t i = 0; i < ro.size(); i++) + { + if(ro[i] > -trunc_dist_) + { + warp_field.KNN(canonical[i]); + float weight = weighting(warp_field.out_dist_sqr, KNN_NEIGHBOURS); + float coeff = std::min(ro[i], trunc_dist_); + +//// tsdf_entries[i].tsdf_value = tsdf_entries[i].tsdf_value * tsdf_entries[i].tsdf_weight + coeff * weight; +//// tsdf_entries[i].tsdf_value = tsdf_entries[i].tsdf_weight + weight; +//// +//// tsdf_entries[i].tsdf_weight = std::min(tsdf_entries[i].tsdf_weight + weight, W_MAX); + } + } +} + +/** + * \fn TSDF::psdf (Mat3f K, Depth& depth, Vec3f voxel_center) + * \brief return a quaternion that is the spherical linear interpolation between q1 and q2 + * where percentage (from 0 to 1) defines the amount of interpolation + * \param K: camera matrix + * \param depth: a depth frame + * \param voxel_center + * + */ +std::vector kfusion::cuda::TsdfVolume::psdf(const std::vector& warped, + Dists& dists, + const Intr& intr) +{ + device::Projector proj(intr.fx, intr.fy, intr.cx, intr.cy); + std::vector> point_type(warped.size()); + for(int i = 0; i < warped.size(); i++) + { + point_type[i].x = warped[i][0]; + point_type[i].y = warped[i][1]; + point_type[i].z = warped[i][2]; + point_type[i].w = 0.f; + } + device::Points points; + points.upload(point_type, dists.cols()); + device::project_and_remove(dists, points, proj); + int size; + points.download(point_type, size); + Mat3f K = Mat3f(intr.fx, 0, intr.cx, + 0, intr.fy, intr.cy, + 0, 0, 1).inv(); + + std::vector distances(warped.size()); + for(int i = 0; i < warped.size(); i++) + distances[i] = (K * Vec3f(point_type[i].x, point_type[i].y, point_type[i].z))[2] - warped[i][2]; + return distances; +} + +/** + * \brief + * \param dist_sqr + * \param k + * \return + */ +float kfusion::cuda::TsdfVolume::weighting(const std::vector& dist_sqr, int k) const +{ + float distances = 0; + for(auto distance : dist_sqr) + distances += sqrt(distance); + return distances / k; +} +/** + * \brief + * \param dist_sqr + * \param k + * \return + */ +void kfusion::cuda::TsdfVolume::compute_points() +{ + *cloud = fetchCloud(*cloud_buffer); + *cloud_host = cv::Mat(1, (int)cloud->size(), CV_32FC4); + cloud->download(cloud_host->ptr()); +} + +void kfusion::cuda::TsdfVolume::compute_normals() +{ + fetchNormals(*cloud, *normal_buffer); + *normal_host = cv::Mat(1, (int)cloud->size(), CV_32FC4); + normal_buffer->download(normal_host->ptr()); +} diff --git a/modules/dynamicfusion/kfusion/src/utils/dual_quaternion.hpp b/modules/dynamicfusion/kfusion/src/utils/dual_quaternion.hpp new file mode 100644 index 00000000000..f47bce057c0 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/utils/dual_quaternion.hpp @@ -0,0 +1,289 @@ +#ifndef DYNAMIC_FUSION_DUAL_QUATERNION_HPP +#define DYNAMIC_FUSION_DUAL_QUATERNION_HPP 
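+//Usage sketch (illustrative, mirroring warp_field.cpp): each warp node stores its
+//transform as DualQuaternion(t, r), where t = Quaternion(0, x, y, z) encodes the
+//node position and r is a unit rotation quaternion; blended node transforms are
+//then applied to points with transform(Vec3f&).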
+#include +#include + +//Adapted from https://github.com/Poofjunior/QPose +/** + * \brief a dual quaternion class for encoding transformations. + * \details transformations are stored as first a translation; then a + * rotation. It is possible to switch the order. See this paper: + * https://www.thinkmind.org/download.php?articleid=intsys_v6_n12_2013_5 + */ +namespace kfusion { + namespace utils { + static float epsilon() + { + return 1e-6; + } + template + class DualQuaternion { + public: + /** + * \brief default constructor. + */ + DualQuaternion(){}; + + /** + * \brief constructor that takes cartesian coordinates and Euler angles as + * arguments. + */ +// FIXME: always use Rodrigues angles, not Euler + DualQuaternion(T x, T y, T z, T roll, T pitch, T yaw) + { + // convert here. + rotation_.w_ = cos(roll / 2) * cos(pitch / 2) * cos(yaw / 2) + + sin(roll / 2) * sin(pitch / 2) * sin(yaw / 2); + rotation_.x_ = sin(roll / 2) * cos(pitch / 2) * cos(yaw / 2) - + cos(roll / 2) * sin(pitch / 2) * sin(yaw / 2); + rotation_.y_ = cos(roll / 2) * sin(pitch / 2) * cos(yaw / 2) + + sin(roll / 2) * cos(pitch / 2) * sin(yaw / 2); + rotation_.z_ = cos(roll / 2) * cos(pitch / 2) * sin(yaw / 2) - + sin(roll / 2) * cos(pitch / 2) * cos(yaw / 2); + + translation_ = 0.5 * Quaternion(0, x, y, z) * rotation_; + } + + /** + * \brief constructor that takes two quaternions as arguments. + * \details The rotation + * quaternion has the conventional encoding for a rotation as a + * quaternion. The translation quaternion is a quaternion with + * cartesian coordinates encoded as (0, x, y, z) + */ + DualQuaternion(Quaternion translation, Quaternion rotation) + { + rotation_ = rotation; + translation_ = 0.5 * translation * rotation; + } + + ~DualQuaternion(){}; + + /** + * \brief store a rotation + * \param angle is in radians + */ + void encodeRotation(T angle, T x, T y, T z) + { + rotation_.encodeRotation(angle, x, y, z); + } + /** + * \brief store a rotation + * \param angle is in radians + */ + void encodeRotation(T x, T y, T z) + { + rotation_.encodeRotation(sqrt(x*x+y*y+z*z), x, y, z); + } + + void encodeTranslation(T x, T y, T z) + { + translation_ = 0.5 * Quaternion(0, x, y, z) * rotation_; + } + + /// handle accumulating error. + void normalize() + { + T x, y, z; + getTranslation(x, y, z); + + rotation_.normalize(); + + encodeTranslation(x, y, z); + } + + /** + * \brief a reference-based method for acquiring the latest + * translation data. + */ + void getTranslation(T &x, T &y, T &z) const + { + auto rot = rotation_; + rot.normalize(); + Quaternion result = 2 * translation_ * rot.conjugate(); + /// note: inverse of a quaternion is the same as the conjugate. + x = result.x_; + y = result.y_; + z = result.z_; + } + + /** + * \brief a reference-based method for acquiring the latest + * translation data. + */ +//FIXME: need to make sure rotation is normalized in all getTranslation functions. + void getTranslation(Vec3f& vec3f) const + { + + auto rot = rotation_; + rot.normalize(); + Quaternion result = 2 * translation_ * rot.conjugate(); + vec3f = Vec3f(result.x_, result.y_, result.z_); + } + + Quaternion getTranslation() const + { + return 2 * translation_ * rotation_.conjugate(); + } + + + /** + * \brief a reference-based method for acquiring the latest rotation data. + */ + void getEuler(T &roll, T &pitch, T &yaw) + { + // FIXME: breaks for some value around PI. 
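+            // Likely cause of the FIXME above: getPitch() goes through asin(), so the
+            // conversion is singular at pitch = +/- pi/2 (gimbal lock) and the recovered
+            // roll/yaw become unreliable near that configuration.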
+ roll = getRoll(); + pitch = getPitch(); + yaw = getYaw(); + } + + Quaternion getRotation() const + { + return rotation_; + } + + + /** + * \brief Extraction everything (in a nice format) + */ + void get6DOF(T &x, T &y, T &z, T &roll, T &pitch, T &yaw) + { + getTranslation(x, y, z); + getEuler(roll, pitch, yaw); + } + + DualQuaternion operator+(const DualQuaternion &other) + { + DualQuaternion result; + result.rotation_ = rotation_ + other.rotation_; + result.translation_ = translation_ + other.translation_; + return result; + } + + DualQuaternion operator-(const DualQuaternion &other) + { + DualQuaternion result; + result.rotation_ = rotation_ - other.rotation_; + result.translation_ = translation_ - other.translation_; + return result; + } + + DualQuaternion operator*(const DualQuaternion &other) + { + DualQuaternion result; + result.rotation_ = rotation_ * other.rotation_; + result.translation_ = (rotation_ * other.translation_) + (translation_ * other.rotation_); + return result; + } + + DualQuaternion operator/(const std::pair divisor) + { + DualQuaternion result; + result.rotation_ = 1 / divisor.first * rotation_; + result.translation_ = 1 / divisor.second * translation_; + return result; + } + + /// (left) Scalar Multiplication + /** + * \fn template friend Quaternion operator*(const U scalar, + * \brief implements scalar multiplication for arbitrary scalar types + */ + template + friend DualQuaternion operator*(const U scalar, const DualQuaternion &q) + { + DualQuaternion result; + result.rotation_ = scalar * q.rotation_; + result.translation_ = scalar * q.translation_; + return result; + } + + DualQuaternion conjugate() + { + DualQuaternion result; + result.rotation_ = rotation_.conjugate(); + result.translation_ = translation_.conjugate(); + return result; + } + + inline DualQuaternion identity() + { + return DualQuaternion(Quaternion(0, 0, 0, 0),Quaternion(0, 1, 0, 0)); + } + + void transform(Vec3f& point) // TODO: this should be a lot more generic + { + Vec3f translation; + getTranslation(translation); + rotation_.rotate(point); + point += translation; + } + + void from_twist(const float &r0, const float &r1, const float &r2, + const float &x, const float &y, const float &z) + { + Vec3f r(r0,r1,r2), t(x,y,z); + float norm = sqrt(r0*r0 + r1 * r1 + r2 * r2); + Quaternion rotation; + if (norm > epsilon()) + { + float cosNorm = cos(norm); + float sign = (cosNorm > 0.f) - (cosNorm < 0.f); + cosNorm *= sign; + float sinNorm_norm = sign * sin(norm) / norm; + rotation = Quaternion(cosNorm, r0 * sinNorm_norm, r1 * sinNorm_norm, r2 * sinNorm_norm); + } + else + rotation = Quaternion(); + + translation_ = Quaternion(0, x, y, z); + rotation_ = rotation; + } + + std::pair magnitude() + { + DualQuaternion result = (*this) * (*this).conjugate(); + return std::make_pair(result.rotation_.w_, result.translation_.w_); + } + + private: + Quaternion rotation_; + Quaternion translation_; + + T position_[3] = {}; /// default initialize vector to zeros. + + T rotAxis_[3] = {}; /// default initialize vector to zeros. + T rotAngle_; + + + T getRoll() + { + // TODO: test this! + return atan2(2*((rotation_.w_ * rotation_.x_) + (rotation_.y_ * rotation_.z_)), + (1 - 2*((rotation_.x_*rotation_.x_) + (rotation_.y_*rotation_.y_)))); + } + + T getPitch() + { + // TODO: test this! + return asin(2*(rotation_.w_ * rotation_.y_ - rotation_.z_ * rotation_.x_)); + } + + T getYaw() + { + // TODO: test this! 
+ return atan2(2*((rotation_.w_ * rotation_.z_) + (rotation_.x_ * rotation_.y_)), + (1 - 2*((rotation_.y_*rotation_.y_) + (rotation_.z_*rotation_.z_)))); + } + }; + + template + std::ostream &operator<<(std::ostream &os, const DualQuaternion &q) + { + os << "[" << q.getRotation() << ", " << q.getTranslation()<< ", " << "]" << std::endl; + return os; + } + } +} +#endif //DYNAMIC_FUSION_DUAL_QUATERNION_HPP \ No newline at end of file diff --git a/modules/dynamicfusion/kfusion/src/utils/knn_point_cloud.hpp b/modules/dynamicfusion/kfusion/src/utils/knn_point_cloud.hpp new file mode 100644 index 00000000000..fd4034abfc1 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/utils/knn_point_cloud.hpp @@ -0,0 +1,56 @@ +#ifndef KFUSION_KNN_POINfloat_CLOUD_HPP +#define KFUSION_KNN_POINfloat_CLOUD_HPP + +#include +namespace kfusion +{ + namespace utils{ + + // floatODO: Adapt this and nanoflann to work with Quaternions. Probably needs an adaptor class + // Check https://github.com/jlblancoc/nanoflann/blob/master/examples/pointcloud_adaptor_example.cpp + struct PointCloud + { + std::vector pts; + + // Must return the number of data points + inline size_t kdtree_get_point_count() const { return pts.size(); } + + // Returns the distance between the vector "p1[0:size-1]" and the data point with index "idx_p2" stored in the class: + inline float kdtree_distance(const Vec3f p1, const size_t idx_p2,size_t /*size*/) const + { + const float d0=p1[0] - pts[idx_p2][0]; + const float d1=p1[1] - pts[idx_p2][1]; + const float d2=p1[2] - pts[idx_p2][2]; + return d0*d0 + d1*d1 + d2*d2; + } + + // Returns the distance between the vector "p1[0:size-1]" and the data point with index "idx_p2" stored in the class: + inline float kdtree_distance(const float *p1, const size_t idx_p2,size_t /*size*/) const + { + const float d0=p1[0]-pts[idx_p2][0]; + const float d1=p1[1]-pts[idx_p2][1]; + const float d2=p1[2]-pts[idx_p2][2]; + return d0*d0+d1*d1+d2*d2; + } + + // Returns the dim'th component of the idx'th point in the class: + // Since this is inlined and the "dim" argument is typically an immediate value, the + // "if/else's" are actually solved at compile time. + inline float kdtree_get_pt(const size_t idx, int dim) const + { + if (dim==0) return pts[idx][0]; + else if (dim==1) return pts[idx][1]; + else return pts[idx][2]; + } + + // Optional bounding-box computation: return false to default to a standard bbox computation loop. + // Return true if the BBOX was already computed by the class and returned in "bb" so it can be avoided to redo it again. + // Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3 for point clouds) + template + bool kdtree_get_bbox(BBOX& /* bb */) const { return false; } + + }; + + } +} +#endif //KFUSION_KNN_POINfloat_CLOUD_HPP diff --git a/modules/dynamicfusion/kfusion/src/utils/quaternion.hpp b/modules/dynamicfusion/kfusion/src/utils/quaternion.hpp new file mode 100644 index 00000000000..7864ebf14e3 --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/utils/quaternion.hpp @@ -0,0 +1,246 @@ +#ifndef DYNAMIC_FUSION_QUATERNION_HPP +#define DYNAMIC_FUSION_QUATERNION_HPP +#pragma once +#include +#include +//Adapted from https://github.com/Poofjunior/QPose +namespace kfusion{ + namespace utils{ + + + /** + * \class Quaternion + * \brief a templated quaternion class that also enables quick storage and + * retrieval of rotations encoded as a vector3 and angle. + * \details All angles are in radians. 
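+ * \note Components are stored in (w, x, y, z) order, matching the four-argument constructor.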
+ * \warning This template is intended to be instantiated with a floating point + * data type. + */ + template class Quaternion + { + public: + Quaternion() : w_(1), x_(0), y_(0), z_(0) + {} + + Quaternion(T w, T x, T y, T z) : w_(w), x_(x), y_(y), z_(z) + {} + + /** + * Encodes rotation from a normal + * @param normal + */ + Quaternion(const Vec3f& normal) + { + Vec3f a(1, 0, 0); + Vec3f b(0, 1, 0); + + Vec3f t0 = normal.cross(a); + + if (t0.dot(t0) < 0.001f) + t0 = normal.cross(b); + t0 = cv::normalize(t0); + + Vec3f t1 = normal.cross(t0); + t1 = cv::normalize(t1); + + cv::Mat3f matrix; + matrix.push_back(t0); + matrix.push_back(t1); + matrix.push_back(normal); + w_ = sqrt(1.0 + matrix.at(0,0) + matrix.at(1,1) + matrix.at(2,2)) / 2.0; +// FIXME: this breaks when w_ = 0; + x_ = (matrix.at(2,1) - matrix.at(1,2)) / (w_ * 4); + y_ = (matrix.at(0,2) - matrix.at(2,0)) / (w_ * 4); + z_ = (matrix.at(1,0) - matrix.at(2,1)) / (w_ * 4); + } + + ~Quaternion() + {} + + + /** + * Quaternion Rotation Properties for straightforward usage of quaternions + * to store rotations. + */ + + /** + * \fn void encodeRotation( T theta, T x, T y, T z) + * \brief Store a normalized rotation in the quaternion encoded as a rotation + * of theta about the vector (x,y,z). + */ + void encodeRotation(T theta, T x, T y, T z) + { + auto sin_half = sin(theta / 2); + w_ = cos(theta / 2); + x_ = x * sin_half; + y_ = y * sin_half; + z_ = z * sin_half; + normalize(); + } + + /** + * \fn void encodeRotation( T theta, T x, T y, T z) + * \brief Store a normalized rotation in the quaternion encoded as a rotation + * of theta about the vector (x,y,z). + */ + void getRodrigues(T& x, T& y, T& z) + { + if(w_ == 1) + { + x = y = z = 0; + return; + } + T half_theta = acos(w_); + T k = sin(half_theta) * tan(half_theta); + x = x_ / k; + y = y_ / k; + z = z_ / k; + } + + + /** + * \fn void rotate( T& x, T& y, T& z) + * \brief rotate a vector3 (x,y,z) by the angle theta about the axis + * (U_x, U_y, U_z) stored in the quaternion. + */ + void rotate(T& x, T& y, T& z) + { + Quaternion q = (*this); + Quaternion qStar = (*this).conjugate(); + Quaternion rotatedVal = q * Quaternion(0, x, y, z) * qStar; + + x = rotatedVal.x_; + y = rotatedVal.y_; + z = rotatedVal.z_; + } + + /** + /** + * \fn void rotate( T& x, T& y, T& z) + * \brief rotate a vector3 (x,y,z) by the angle theta about the axis + * (U_x, U_y, U_z) stored in the quaternion. + */ + void rotate(Vec3f& v) const + { + auto rot= *this; + rot.normalize(); + Vec3f q_vec(rot.x_, rot.y_, rot.z_); + v += (q_vec*2.f).cross( q_vec.cross(v) + v*rot.w_ ); + } + + /** + * Quaternion Mathematical Properties + * implemented below + **/ + + Quaternion operator+(const Quaternion& other) + { + return Quaternion( (w_ + other.w_), + (x_ + other.x_), + (y_ + other.y_), + (z_ + other.z_)); + } + + Quaternion operator-(const Quaternion& other) + { + return Quaternion((w_ - other.w_), + (x_ - other.x_), + (y_ - other.y_), + (z_ - other.z_)); + } + + Quaternion operator-() + { + return Quaternion(-w_, -x_, -y_, -z_); + } + + bool operator==(const Quaternion& other) const + { + return (w_ == other.w_) && (x_ == other.x_) && (y_ == other.y_) && (z_ == other.z_); + } + + /** + * \fn template friend Quaternion operator*(const U scalar, + * const Quaternion& q) + * \brief implements scalar multiplication for arbitrary scalar types. 
+ */ + template friend Quaternion operator*(const U scalar, const Quaternion& other) + { + return Quaternion((scalar * other.w_), + (scalar * other.x_), + (scalar * other.y_), + (scalar * other.z_)); + } + + template friend Quaternion operator/(const Quaternion& q, const U scalar) + { + return (1 / scalar) * q; + } + + /// Quaternion Product + Quaternion operator*(const Quaternion& other) + { + return Quaternion( + ((w_*other.w_) - (x_*other.x_) - (y_*other.y_) - (z_*other.z_)), + ((w_*other.x_) + (x_*other.w_) + (y_*other.z_) - (z_*other.y_)), + ((w_*other.y_) - (x_*other.z_) + (y_*other.w_) + (z_*other.x_)), + ((w_*other.z_) + (x_*other.y_) - (y_*other.x_) + (z_*other.w_)) + ); + } + + /** + * \fn static T dotProduct(Quaternion q1, Quaternion q2) + * \brief returns the dot product of two quaternions. + */ + T dotProduct(Quaternion other) + { + return 0.5 * ((conjugate() * other) + (*this) * other.conjugate()).w_; + } + + /// Conjugate + Quaternion conjugate() const + { + return Quaternion(w_, -x_, -y_, -z_); + } + + T norm() + { + return sqrt((w_ * w_) + (x_ * x_) + (y_ * y_) + (z_ * z_)); + } + + /** + * \fn void normalize() + * \brief normalizes the quaternion to magnitude 1 + */ + void normalize() + { + // should never happen unless the Quaternion wasn't initialized + // correctly. + assert( !((w_ == 0) && (x_ == 0) && (y_ == 0) && (z_ == 0))); + T theNorm = norm(); + assert(theNorm > 0); + (*this) = (1.0/theNorm) * (*this); + } + + /** + * \fn template friend std::ostream& operator << + * (std::ostream& os, const Quaternion& q); + * \brief a templated friend function for printing quaternions. + * \details T cannot be used as dummy parameter since it would be shared by + * the class, and this function is not a member function. + */ + template friend std::ostream& operator << (std::ostream& os, const Quaternion& q) + { + os << "(" << q.w_ << ", " << q.x_ << ", " << q.y_ << ", " << q.z_ << ")"; + return os; + } + //TODO: shouldn't have Vec3f but rather Vec3. 
Not sure how to determine this later + + T w_; + T x_; + T y_; + T z_; + }; + } +} +#endif // DYNAMIC_FUSION_QUATERNION_HPP \ No newline at end of file diff --git a/modules/dynamicfusion/kfusion/src/warp_field.cpp b/modules/dynamicfusion/kfusion/src/warp_field.cpp new file mode 100644 index 00000000000..de570f03f1c --- /dev/null +++ b/modules/dynamicfusion/kfusion/src/warp_field.cpp @@ -0,0 +1,383 @@ +#include +#include +#include +#include +#include "kfusion/warp_field.hpp" +#include "internal.hpp" +#include "precomp.hpp" +#include +#include + +using namespace kfusion; +std::vector> neighbours; //THIS SHOULD BE SOMEWHERE ELSE BUT TOO SLOW TO REINITIALISE +utils::PointCloud cloud; + +WarpField::WarpField() +{ + nodes = new std::vector(); + index = new kd_tree_t(3, cloud, nanoflann::KDTreeSingleIndexAdaptorParams(10)); + ret_index = std::vector(KNN_NEIGHBOURS); + out_dist_sqr = std::vector(KNN_NEIGHBOURS); + resultSet = new nanoflann::KNNResultSet(KNN_NEIGHBOURS); + resultSet->init(&ret_index[0], &out_dist_sqr[0]); + neighbours = std::vector>(KNN_NEIGHBOURS); + +} + +WarpField::~WarpField() +{ + delete[] nodes; + delete resultSet; + delete index; +} + +/** + * + * @param first_frame + * @param normals + */ +void WarpField::init(const cv::Mat& first_frame, const cv::Mat& normals) +{ + assert(first_frame.rows == normals.rows); + assert(first_frame.cols == normals.cols); + nodes->resize(first_frame.cols * first_frame.rows); + auto voxel_size = kfusion::KinFuParams::default_params_dynamicfusion().volume_size[0] / + kfusion::KinFuParams::default_params_dynamicfusion().volume_dims[0]; + for(int i = 0; i < first_frame.rows; i++) + for(int j = 0; j < first_frame.cols; j++) + { + auto point = first_frame.at(i,j); + auto norm = normals.at(i,j); + if(!std::isnan(point.x)) + { + utils::Quaternion r(Vec3f(norm.x,norm.y,norm.z)); + if(std::isnan(r.w_) || std::isnan(r.x_) ||std::isnan(r.y_) ||std::isnan(r.z_)) + continue; + + utils::Quaternion t(0,point.x, point.y, point.z); + nodes->at(i*first_frame.cols+j).transform = utils::DualQuaternion(t, r); + + nodes->at(i*first_frame.cols+j).vertex = Vec3f(point.x,point.y,point.z); + nodes->at(i*first_frame.cols+j).weight = voxel_size; + } + } + buildKDTree(); +} + +/** + * + * @param first_frame + * @param normals + */ +void WarpField::init(const std::vector& first_frame, const std::vector& normals) +{ + nodes->resize(first_frame.size()); + auto voxel_size = kfusion::KinFuParams::default_params_dynamicfusion().volume_size[0] / + kfusion::KinFuParams::default_params_dynamicfusion().volume_dims[0]; + for (int i = 0; i < first_frame.size(); i++) + { + auto point = first_frame[i]; + auto norm = normals[i]; + if (!std::isnan(point[0])) + { + utils::Quaternion t(0.f, point[0], point[1], point[2]); + utils::Quaternion r(norm); + nodes->at(i).transform = utils::DualQuaternion(t,r); + + nodes->at(i).vertex = point; + nodes->at(i).weight = voxel_size; + } + } + buildKDTree(); +} + +/** + * \brief + * \param frame + * \param normals + * \param pose + * \param tsdfVolume + * \param edges + */ +void WarpField::energy(const cuda::Cloud &frame, + const cuda::Normals &normals, + const Affine3f &pose, + const cuda::TsdfVolume &tsdfVolume, + const std::vector, utils::DualQuaternion>> &edges +) +{ + assert(normals.cols()==frame.cols()); + assert(normals.rows()==frame.rows()); +} + +/** + * + * @param canonical_vertices + * @param canonical_normals + * @param live_vertices + * @param live_normals + * @return + */ +float WarpField::energy_data(const std::vector &canonical_vertices, + 
const std::vector &canonical_normals, + const std::vector &live_vertices, + const std::vector &live_normals +) +{ + + ceres::Problem problem; + int i = 0; + + auto parameters = new double[nodes->size() * 6]; + std::vector double_vertices; + for(auto v : canonical_vertices) + { + if(std::isnan(v[0])) + continue; + ceres::CostFunction* cost_function = DynamicFusionDataEnergy::Create(live_vertices[i], Vec3f(1,0,0), v, Vec3f(1,0,0), this); + problem.AddResidualBlock(cost_function, + NULL /* squared loss */, + parameters); + i++; + } + ceres::Solver::Options options; + options.linear_solver_type = ceres::DENSE_SCHUR; + options.minimizer_progress_to_stdout = true; + ceres::Solver::Summary summary; + ceres::Solve(options, &problem, &summary); + std::cout << summary.FullReport() << std::endl; + + for(int i = 0; i < nodes->size() * 6; i++) + { + std::cout< rotation(0,0,0,0); + Vec3f translation(0,0,0); + getWeightsAndUpdateKNN(v, weights); + for(int i = 0; i < KNN_NEIGHBOURS; i++) + { + auto block_position = ret_index[i]; + std::cout< rotation1(Vec3f(parameters[block_position], + parameters[block_position+1], + parameters[block_position+2])); + rotation = rotation + weights[i] * rotation1 * nodes->at(block_position).transform.getRotation(); + + Vec3f translation1(parameters[block_position+3], + parameters[block_position+4], + parameters[block_position+5]); + Vec3f t; + nodes->at(block_position).transform.getTranslation(t[0],t[1],t[2]); + translation += weights[i]*t + translation1; + } + rotation.rotate(v); + v += translation; + std::cout< final_quat = DQB(v, parameters); + final_quat.transform(v); + std::cout<<"Value of v[ "<, + kfusion::utils::DualQuaternion>> &edges) +{ + +} + +/** + * Tukey loss function as described in http://web.as.uky.edu/statistics/users/pbreheny/764-F11/notes/12-1.pdf + * \param x + * \param c + * \return + * + * \note + * The value c = 4.685 is usually used for this loss function, and + * it provides an asymptotic efficiency 95% that of linear + * regression for the normal distribution + * + * In the paper, a value of 0.01 is suggested for c + */ +float WarpField::tukeyPenalty(float x, float c) const +{ + return std::abs(x) <= c ? x * std::pow((1 - (x * x) / (c * c)), 2) : 0.0f; +} + +/** + * Huber penalty function, implemented as described in https://en.wikipedia.org/wiki/Huber_loss + * In the paper, a value of 0.0001 is suggested for delta + * \param a + * \param delta + * \return + */ +float WarpField::huberPenalty(float a, float delta) const +{ + return std::abs(a) <= delta ? 
a * a / 2 : delta * std::abs(a) - delta * delta / 2; +} + +/** + * + * @param points + * @param normals + */ +void WarpField::warp(std::vector& points, std::vector& normals) const +{ + int i = 0; + for (auto& point : points) + { + if(std::isnan(point[0]) || std::isnan(normals[i][0])) + continue; + KNN(point); + utils::DualQuaternion dqb = DQB(point); + dqb.transform(point); + point = warp_to_live * point; + + dqb.transform(normals[i]); + normals[i] = warp_to_live * normals[i]; + i++; + } +} + +/** + * \brief + * \param vertex + * \param weight + * \return + */ +utils::DualQuaternion WarpField::DQB(const Vec3f& vertex) const +{ + utils::DualQuaternion quaternion_sum; + for (size_t i = 0; i < KNN_NEIGHBOURS; i++) + quaternion_sum = quaternion_sum + weighting(out_dist_sqr[ret_index[i]], nodes->at(ret_index[i]).weight) * + nodes->at(ret_index[i]).transform; + + auto norm = quaternion_sum.magnitude(); + + return utils::DualQuaternion(quaternion_sum.getRotation() / norm.first, + quaternion_sum.getTranslation() / norm.second); +} + + +/** + * \brief + * \param vertex + * \param weight + * \return + */ +utils::DualQuaternion WarpField::DQB(const Vec3f& vertex, double epsilon[KNN_NEIGHBOURS * 6]) const +{ + if(epsilon == NULL) + { + std::cerr<<"Invalid pointer in DQB"< quaternion_sum; + utils::DualQuaternion eps; + for (size_t i = 0; i < KNN_NEIGHBOURS; i++) + { + // epsilon [0:2] is rotation [3:5] is translation + eps.from_twist(epsilon[i*6],epsilon[i*6 + 1],epsilon[i*6 + 2],epsilon[i*6 + 3],epsilon[i*6 + 4],epsilon[i*6 + 5]); + quaternion_sum = quaternion_sum + weighting(out_dist_sqr[ret_index[i]], nodes->at(ret_index[i]).weight) * + nodes->at(ret_index[i]).transform * eps; + } + + auto norm = quaternion_sum.magnitude(); + + return utils::DualQuaternion(quaternion_sum.getRotation() / norm.first, + quaternion_sum.getTranslation() / norm.second); +} + +/** + * \brief + * \param vertex + * \param weight + * \return + */ +void WarpField::getWeightsAndUpdateKNN(const Vec3f& vertex, float weights[KNN_NEIGHBOURS]) +{ + KNN(vertex); + for (size_t i = 0; i < KNN_NEIGHBOURS; i++) + // epsilon [0:2] is rotation [3:5] is translation + weights[i] = weighting(out_dist_sqr[i], nodes->at(ret_index[i]).weight); +} + +/** + * \brief + * \param squared_dist + * \param weight + * \return + */ +float WarpField::weighting(float squared_dist, float weight) const +{ + return (float) exp(-squared_dist / (2 * weight * weight)); +} + +/** + * \brief + * \return + */ +void WarpField::KNN(Vec3f point) const +{ +// resultSet->init(&ret_index[0], &out_dist_sqr[0]); + index->findNeighbors(*resultSet, point.val, nanoflann::SearchParams(10)); +} + +/** + * \brief + * \return + */ +const std::vector* WarpField::getNodes() const +{ + return nodes; +} + +/** + * \brief + * \return + */ +void WarpField::buildKDTree() +{ + // Build kd-tree with current warp nodes. 
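+    // The nanoflann index is rebuilt over the current node positions so that KNN()
+    // can return the KNN_NEIGHBOURS deformation nodes closest to a query point
+    // (indices in ret_index, squared distances in out_dist_sqr).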
+ cloud.pts.resize(nodes->size()); + for(size_t i = 0; i < nodes->size(); i++) + cloud.pts[i] = nodes->at(i).vertex; + index->buildIndex(); +} + +const cv::Mat WarpField::getNodesAsMat() const +{ + cv::Mat matrix(1, nodes->size(), CV_32FC3); + for(int i = 0; i < nodes->size(); i++) + matrix.at(i) = nodes->at(i).vertex; + return matrix; +} + +/** + * \brief + */ +void WarpField::clear() +{ + +} +void WarpField::setWarpToLive(const Affine3f &pose) +{ + warp_to_live = pose; +} \ No newline at end of file diff --git a/modules/dynamicfusion/tests/CMakeLists.txt b/modules/dynamicfusion/tests/CMakeLists.txt new file mode 100644 index 00000000000..cad4efd398f --- /dev/null +++ b/modules/dynamicfusion/tests/CMakeLists.txt @@ -0,0 +1,15 @@ +project(tests) +add_subdirectory(utils) +include_directories(${CMAKE_SOURCE_DIR}/kfusion/include) + +add_executable(ceres_test ceres_test_warp.cpp) +target_link_libraries( + ceres_test + ${CERES_LIBRARIES} + kfusion +) + +target_include_directories( + ceres_test PUBLIC + ${CERES_INCLUDE_DIRS} +) diff --git a/modules/dynamicfusion/tests/ceres_test.cpp b/modules/dynamicfusion/tests/ceres_test.cpp new file mode 100644 index 00000000000..cfdb2363f18 --- /dev/null +++ b/modules/dynamicfusion/tests/ceres_test.cpp @@ -0,0 +1,167 @@ +#include +#include +#include +#include +#include "ceres/ceres.h" +#include "ceres/rotation.h" +// Read a Bundle Adjustment in the Large dataset. +class BALProblem { +public: + BALProblem(dynamicfusion::WarpField warp) : warpField_(&warp){}; + ~BALProblem() { + delete[] point_index_; + delete[] camera_index_; + delete[] observations_; + delete[] parameters_; + } + int num_observations() const { return num_observations_; } + const double* observations() const { return observations_; } + const cv::Vec3d* observations_vector() const { return observations_vector_; } + double* mutable_cameras() { return parameters_; } + double* mutable_points() { return parameters_ + 9 * num_cameras_; } + double* mutable_camera_for_observation(int i) { + return mutable_cameras() + camera_index_[i] * 9; + } + double* mutable_point_for_observation(int i) { + return mutable_points() + point_index_[i] * 3; + } + bool LoadFile(const char* filename) { + FILE* fptr = fopen(filename, "r"); + if (fptr == NULL) { + return false; + }; + FscanfOrDie(fptr, "%d", &num_cameras_); + FscanfOrDie(fptr, "%d", &num_points_); + FscanfOrDie(fptr, "%d", &num_observations_); + point_index_ = new int[num_observations_]; + camera_index_ = new int[num_observations_]; + observations_ = new double[2 * num_observations_]; + num_parameters_ = 9 * num_cameras_ + 3 * num_points_; + parameters_ = new double[num_parameters_]; + observations_vector_ = new cv::Vec3d[num_observations_]; + for (int i = 0; i < num_observations_; ++i) { + FscanfOrDie(fptr, "%d", camera_index_ + i); + FscanfOrDie(fptr, "%d", point_index_ + i); + for (int j = 0; j < 2; ++j) { + FscanfOrDie(fptr, "%lf", observations_ + 2*i + j); + } + observations_vector_[i] = cv::Vec3d(observations_[i],observations_[i+1],observations_[i+2]); + } + for (int i = 0; i < num_parameters_; ++i) { + FscanfOrDie(fptr, "%lf", parameters_ + i); + } + return true; + } +private: + template + void FscanfOrDie(FILE *fptr, const char *format, T *value) { + int num_scanned = fscanf(fptr, format, value); + if (num_scanned != 1) { + LOG(FATAL) << "Invalid UW data file."; + } + } + int num_cameras_; + int num_points_; + int num_observations_; + int num_parameters_; + + int* point_index_; + int* camera_index_; + + double* observations_; + cv::Vec3d* 
observations_vector_; + double* parameters_; + + dynamicfusion::WarpField* warpField_; +}; +// Templated pinhole camera model for used with Ceres. The camera is +// parameterized using 9 parameters: 3 for rotation, 3 for translation, 1 for +// focal length and 2 for radial distortion. The principal point is not modeled +// (i.e. it is assumed be located at the image center). +struct SnavelyReprojectionError { +// SnavelyReprojectionError(double observed_x, double observed_y, double observed_z) +// : observed_x(observed_x), observed_y(observed_y) {} + SnavelyReprojectionError(cv::Vec3d observed) + : observed_(observed) {} + template + bool operator()(const T* const camera, + const T* const point, + T* residuals) const { + // camera[0,1,2] are the angle-axis rotation. + T p[3]; + ceres::AngleAxisRotatePoint(camera, point, p); + // camera[3,4,5] are the translation. + p[0] += camera[3]; + p[1] += camera[4]; + p[2] += camera[5]; + // Compute the center of distortion. The sign change comes from + // the camera model that Noah Snavely's Bundler assumes, whereby + // the camera coordinate system has a negative z axis. + T xp = - p[0] / p[2]; + T yp = - p[1] / p[2]; + // Apply second and fourth order radial distortion. + const T& l1 = camera[7]; + const T& l2 = camera[8]; + T r2 = xp*xp + yp*yp; + T distortion = 1.0 + r2 * (l1 + l2 * r2); + // Compute final projected point position. + const T& focal = camera[6]; + T predicted_x = focal * distortion * xp; + T predicted_y = focal * distortion * yp; + T predicted_z = focal * distortion * yp; + + // The error is the difference between the predicted and observed position. + residuals[0] = predicted_x - observed_[0]; + residuals[1] = predicted_y - observed_[1]; + residuals[2] = predicted_z - observed_[2]; + return true; + } + // Factory to hide the construction of the CostFunction object from + // the client code.{ + static ceres::CostFunction* Create(const cv::Vec3d observed) { + return (new ceres::AutoDiffCostFunction( + new SnavelyReprojectionError(observed))); + } + cv::Vec3d observed_; +}; + + +int main(int argc, char** argv) { + google::InitGoogleLogging(argv[0]); + if (argc != 2) { + std::cerr << "usage: simple_bundle_adjuster \n"; + return 1; + } + dynamicfusion::WarpField warpField; + BALProblem bal_problem(warpField); + if (!bal_problem.LoadFile(argv[1])) { + std::cerr << "ERROR: unable to open file " << argv[1] << "\n"; + return 1; + } + const double* observations = bal_problem.observations(); + const auto observations_vector = bal_problem.observations_vector(); + // Create residuals for each observation in the bundle adjustment problem. The + // parameters for cameras and points are added automatically. + ceres::Problem problem; + for (int i = 0; i < bal_problem.num_observations(); ++i) { + // Each Residual block takes a point and a camera as input and outputs a 2 + // dimensional residual. Internally, the cost function stores the observed + // image location and compares the reprojection against the observation. + ceres::CostFunction* cost_function = + SnavelyReprojectionError::Create(observations[i]); + problem.AddResidualBlock(cost_function, + NULL /* squared loss */, + bal_problem.mutable_camera_for_observation(i), + bal_problem.mutable_point_for_observation(i)); + } + // Make Ceres automatically detect the bundle structure. Note that the + // standard solver, SPARSE_NORMAL_CHOLESKY, also works fine but it is slower + // for standard bundle adjustment problems. 
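+    // (Roughly speaking, DENSE_SCHUR first eliminates the 3-parameter point blocks via
+    // the Schur complement and then solves the much smaller reduced camera system,
+    // which is why it tends to win on bundle-adjustment-shaped problems.)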
+ ceres::Solver::Options options; + options.linear_solver_type = ceres::DENSE_SCHUR; + options.minimizer_progress_to_stdout = true; + ceres::Solver::Summary summary; + ceres::Solve(options, &problem, &summary); + std::cout << summary.FullReport() << "\n"; + return 0; +} diff --git a/modules/dynamicfusion/tests/ceres_test_warp.cpp b/modules/dynamicfusion/tests/ceres_test_warp.cpp new file mode 100644 index 00000000000..4173e6f279a --- /dev/null +++ b/modules/dynamicfusion/tests/ceres_test_warp.cpp @@ -0,0 +1,49 @@ +#include +#include +#include +#include +#include "ceres/ceres.h" +#include "ceres/rotation.h" +#include + +int main(int argc, char** argv) { + google::InitGoogleLogging(argv[0]); + + kfusion::WarpField warpField; + std::vector warp_init; + std::vector warp_normals; + for(int i=0; i < KNN_NEIGHBOURS; i++) + warp_normals.push_back(cv::Vec3f(0,0,1)); + + warp_init.push_back(cv::Vec3f(1,1,1)); + warp_init.push_back(cv::Vec3f(1,1,-1)); + warp_init.push_back(cv::Vec3f(1,-1,1)); + warp_init.push_back(cv::Vec3f(1,-1,-1)); + warp_init.push_back(cv::Vec3f(-1,1,1)); + warp_init.push_back(cv::Vec3f(-1,1,-1)); + warp_init.push_back(cv::Vec3f(-1,-1,1)); + warp_init.push_back(cv::Vec3f(-1,-1,-1)); + + warpField.init(warp_init, warp_normals); + float weights[KNN_NEIGHBOURS]; + warpField.getWeightsAndUpdateKNN(cv::Vec3f(0,0,0), weights); + + std::vector canonical_vertices; + canonical_vertices.push_back(cv::Vec3f(0,0,0)); + canonical_vertices.push_back(cv::Vec3f(2,2,2)); + + std::vector canonical_normals; + canonical_normals.push_back(cv::Vec3f(0,0,1)); + canonical_normals.push_back(cv::Vec3f(0,0,1)); + + std::vector live_vertices; + live_vertices.push_back(cv::Vec3f(0.01,0.01,0.01)); + live_vertices.push_back(cv::Vec3f(2.01,2.01,2.01)); + + std::vector live_normals; + live_normals.push_back(cv::Vec3f(0,0,1)); + live_normals.push_back(cv::Vec3f(0,0,1)); + + warpField.energy_data(canonical_vertices, canonical_normals,live_vertices, live_normals); + return 0; +} diff --git a/modules/dynamicfusion/tests/test_warp_field.cpp b/modules/dynamicfusion/tests/test_warp_field.cpp new file mode 100644 index 00000000000..e69de29bb2d diff --git a/modules/dynamicfusion/tests/utils/CMakeLists.txt b/modules/dynamicfusion/tests/utils/CMakeLists.txt new file mode 100644 index 00000000000..38265205f76 --- /dev/null +++ b/modules/dynamicfusion/tests/utils/CMakeLists.txt @@ -0,0 +1,17 @@ +include_directories(${CMAKE_SOURCE_DIR}/kfusion/include) +file(GLOB SourceFiles + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.cc) +add_executable(utils_tests ${SourceFiles}) +target_link_libraries( + utils_tests + ${GTEST_BOTH_LIBRARIES} + ${OpenCV_LIBS} + kfusion +) + +target_include_directories( + utils_tests PUBLIC + ${CMAKE_SOURCE_DIR}/kfusion/include + ${OpenCV_INCLUDE_DIRS} +) \ No newline at end of file diff --git a/modules/dynamicfusion/tests/utils/test_dual_quaternion.cc b/modules/dynamicfusion/tests/utils/test_dual_quaternion.cc new file mode 100644 index 00000000000..df4f16729a0 --- /dev/null +++ b/modules/dynamicfusion/tests/utils/test_dual_quaternion.cc @@ -0,0 +1,79 @@ +#include +#include + + +using namespace kfusion::utils; +TEST(DualQuaternionTest, DualQuaternionConstructor) +{ + DualQuaternion dualQuaternion(1, 2, 3, 1, 2, 3); + Quaternion translation = dualQuaternion.getTranslation(); + Quaternion rotation = dualQuaternion.getRotation(); + + EXPECT_NEAR(translation.w_, 0, 0.001); + EXPECT_NEAR(translation.x_, 1, 0.1); + EXPECT_NEAR(translation.y_, 2, 0.1); + EXPECT_NEAR(translation.z_, 3, 0.1); + 
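+    // The expected rotation below is presumably encodeRotation()'s quaternion for the
+    // Euler angles (roll, pitch, yaw) = (1, 2, 3) given to the constructor, i.e. roughly
+    //     q = q_z(yaw) * q_y(pitch) * q_x(roll)
+    // built from half-angle sines and cosines, up to the exact axis-order convention
+    // used internally.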
+ EXPECT_NEAR(rotation.w_, 0.435953, 0.01); + EXPECT_NEAR(rotation.x_, -0.718287, 0.01); + EXPECT_NEAR(rotation.y_, 0.310622, 0.01); + EXPECT_NEAR(rotation.z_, 0.454649, 0.01); +} + +TEST(DualQuaternionTest, get6DOF) +{ + DualQuaternion dualQuaternion(1.0f, 2.0f, 3.0f, 1.0f, 2.0f, 3.0f); + float x, y, z, roll, pitch, yaw; + dualQuaternion.get6DOF(x, y, z, roll, pitch, yaw); + EXPECT_FLOAT_EQ(x, 1.0f); + EXPECT_FLOAT_EQ(y, 2.0f); + EXPECT_FLOAT_EQ(z, 3.0f); + EXPECT_FLOAT_EQ(roll, 1.0f); + EXPECT_FLOAT_EQ(pitch, 2.0f); + EXPECT_FLOAT_EQ(yaw, 3.0f); +} + +TEST(DualQuaternionTest, isAssociative) +{ + DualQuaternion dualQuaternion(1, 2, 3, 1, 2, 3); + DualQuaternion dualQuaternion1(3, 4, 5, 3, 4, 5); + dualQuaternion.encodeRotation(1,2,3); + dualQuaternion1.encodeRotation(3,4,5); + cv::Vec3f test11(1,0,0); + cv::Vec3f test12(1,0,0); + cv::Vec3f test2(1,0,0); + auto cumul = dualQuaternion1 + dualQuaternion; + cumul.normalize(); + dualQuaternion.transform(test11); + dualQuaternion1.transform(test12); + cumul.transform(test2); + auto result = test11 + test12; + + EXPECT_NE(test2, result); +} +TEST(DualQuaternionTest, canSplitOperations) +{ + DualQuaternion dualQuaternion(1, 2, 3, 1, 2, 3); + DualQuaternion dualQuaternion1(3, 4, 5, 3, 4, 5); + dualQuaternion.encodeRotation(1,2,3); + dualQuaternion1.encodeRotation(3,4,5); + cv::Vec3f test1(1,0,0); + cv::Vec3f test2(1,0,0); + cv::Vec3f t1, t2; + + auto cumul = dualQuaternion1 + dualQuaternion; + auto quat1 = dualQuaternion.getRotation() + dualQuaternion1.getRotation(); + dualQuaternion.getTranslation(t1); + dualQuaternion1.getTranslation(t2); + auto trans = t1 + t2; + + + cumul.normalize(); + cumul.transform(test1); +// auto result = test1 + test12; + quat1.rotate(test2); + auto result = test2 + trans; + std::cout< +#include +#include +#include +using namespace kfusion::utils; +TEST(QuaternionTest, encodeRotation) +{ + Quaternion quaternion; + quaternion.encodeRotation(M_PI_4, 0, 0, 1); + + ASSERT_FLOAT_EQ(0.9238795, quaternion.w_); + ASSERT_FLOAT_EQ(0, quaternion.x_); + ASSERT_FLOAT_EQ(0, quaternion.y_); + ASSERT_FLOAT_EQ(0.38268346, quaternion.z_); +} + +TEST(QuaternionTest, rotate) +{ + Quaternion quaternion(0,0,1,1); + float vector[] = {0,0,1}; + quaternion.rotate(vector[0], vector[1], vector[2]); + EXPECT_EQ(vector[0], 0); + EXPECT_EQ(vector[1], 2); + EXPECT_EQ(vector[2], 0); +} + +TEST(QuaternionTest, quat_product) +{ + Quaternion quaternion(1,1,2,2); + Quaternion quaternion1(0,0,1,1); + Quaternion product = quaternion * quaternion1; + EXPECT_EQ(product.w_, -4); + EXPECT_EQ(product.x_, 0); + EXPECT_EQ(product.y_, 0); + EXPECT_EQ(product.z_, 2); +} + +TEST(QuaternionTest, power) +{ + Quaternion quaternion(1,1,2,2); + EXPECT_EQ(quaternion.power(2), quaternion*quaternion); +} + +TEST(QuaternionTest, dotProduct) +{ + Quaternion quaternion(1,1,2,2); + Quaternion quaternion1(0,0,1,1); + EXPECT_EQ(quaternion.dotProduct(quaternion1), 4); +} + +TEST(QuaternionTest, normalize) +{ + Quaternion quaternion(10,10,10,10); + quaternion.normalize(); + EXPECT_EQ(quaternion, Quaternion(0.5, 0.5, 0.5, 0.5)); +} + +TEST(QuaternionTest, slerp) +{ + Quaternion quaternion(1,1,2,2); + Quaternion quaternion1(0,0,1,1); + Quaternion result = quaternion.slerp(quaternion1, 0.5); + + ASSERT_FLOAT_EQ(result.w_, 0.16245984); + ASSERT_FLOAT_EQ(result.x_, 0.16245984); + ASSERT_FLOAT_EQ(result.y_, 0.688191); + ASSERT_FLOAT_EQ(result.z_, 0.688191); +} + +TEST(QuaternionTest, rodrigues) +{ + Quaternion quaternion; + quaternion.encodeRotation(3,1,1,1); + float x, y, z; + 
quaternion.getRodrigues(x,y,z); + std::cout< quaternion(normal); + + std::cout<<"Quaternion:" << quaternion; +} From 11bbe6ee478470f11c102870e8ee2b9b3cc23382 Mon Sep 17 00:00:00 2001 From: Mihai Bujanca Date: Fri, 1 Sep 2017 01:59:57 +0100 Subject: [PATCH 2/4] Linking errors but got rid of compilation errors. Still have warnings to solve --- modules/dynamicfusion/CMakeLists.txt | 100 +++++++++--------- .../{kfusion => }/include/io/capture.hpp | 1 - .../include/kfusion/cuda/device_array.hpp | 0 .../include/kfusion/cuda/device_memory.hpp | 0 .../include/kfusion/cuda/imgproc.hpp | 0 .../kfusion/cuda/kernel_containers.hpp | 0 .../include/kfusion/cuda/projective_icp.hpp | 0 .../include/kfusion/cuda/tsdf_volume.hpp | 2 +- .../{kfusion => }/include/kfusion/exports.hpp | 0 .../{kfusion => }/include/kfusion/kinfu.hpp | 4 +- .../include/kfusion/optimisation.hpp | 0 .../{kfusion => }/include/kfusion/types.hpp | 2 - .../include/kfusion/warp_field.hpp | 4 +- .../include/nanoflann/nanoflann.hpp | 0 .../src => include}/utils/dual_quaternion.hpp | 2 +- .../src => include}/utils/knn_point_cloud.hpp | 0 .../src => include}/utils/quaternion.hpp | 4 +- modules/dynamicfusion/kfusion/CMakeLists.txt | 6 -- .../{apps => samples}/CMakeLists.txt | 2 +- .../dynamicfusion/{apps => samples}/demo.cpp | 12 +-- .../dynamicfusion_kinect.cpp | 6 +- .../{kfusion => }/src/capture.cpp | 0 .../dynamicfusion/{kfusion => }/src/core.cpp | 0 .../{kfusion => }/src/cuda/device.hpp | 0 .../{kfusion => }/src/cuda/imgproc.cu | 0 .../{kfusion => }/src/cuda/proj_icp.cu | 0 .../{kfusion => }/src/cuda/temp_utils.hpp | 0 .../{kfusion => }/src/cuda/texture_binder.hpp | 0 .../{kfusion => }/src/cuda/tsdf_volume.cu | 0 .../{kfusion => }/src/device_memory.cpp | 0 .../{kfusion => }/src/imgproc.cpp | 0 .../{kfusion => }/src/internal.hpp | 1 - .../dynamicfusion/{kfusion => }/src/kinfu.cpp | 14 ++- .../{kfusion => }/src/optimisation.cpp | 0 .../{kfusion => }/src/precomp.cpp | 0 .../{kfusion => }/src/precomp.hpp | 1 - .../{kfusion => }/src/projective_icp.cpp | 2 +- .../{kfusion => }/src/safe_call.hpp | 0 .../{kfusion => }/src/tsdf_volume.cpp | 7 +- .../{kfusion => }/src/warp_field.cpp | 15 ++- 40 files changed, 84 insertions(+), 101 deletions(-) rename modules/dynamicfusion/{kfusion => }/include/io/capture.hpp (96%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/cuda/device_array.hpp (100%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/cuda/device_memory.hpp (100%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/cuda/imgproc.hpp (100%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/cuda/kernel_containers.hpp (100%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/cuda/projective_icp.hpp (100%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/cuda/tsdf_volume.hpp (98%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/exports.hpp (100%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/kinfu.hpp (97%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/optimisation.hpp (100%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/types.hpp (96%) rename modules/dynamicfusion/{kfusion => }/include/kfusion/warp_field.hpp (97%) rename modules/dynamicfusion/{kfusion => }/include/nanoflann/nanoflann.hpp (100%) rename modules/dynamicfusion/{kfusion/src => include}/utils/dual_quaternion.hpp (99%) rename modules/dynamicfusion/{kfusion/src => include}/utils/knn_point_cloud.hpp (100%) rename modules/dynamicfusion/{kfusion/src => include}/utils/quaternion.hpp (98%) 
delete mode 100644 modules/dynamicfusion/kfusion/CMakeLists.txt rename modules/dynamicfusion/{apps => samples}/CMakeLists.txt (95%) rename modules/dynamicfusion/{apps => samples}/demo.cpp (95%) rename modules/dynamicfusion/{apps => samples}/dynamicfusion_kinect.cpp (98%) rename modules/dynamicfusion/{kfusion => }/src/capture.cpp (100%) rename modules/dynamicfusion/{kfusion => }/src/core.cpp (100%) rename modules/dynamicfusion/{kfusion => }/src/cuda/device.hpp (100%) rename modules/dynamicfusion/{kfusion => }/src/cuda/imgproc.cu (100%) rename modules/dynamicfusion/{kfusion => }/src/cuda/proj_icp.cu (100%) rename modules/dynamicfusion/{kfusion => }/src/cuda/temp_utils.hpp (100%) rename modules/dynamicfusion/{kfusion => }/src/cuda/texture_binder.hpp (100%) rename modules/dynamicfusion/{kfusion => }/src/cuda/tsdf_volume.cu (100%) rename modules/dynamicfusion/{kfusion => }/src/device_memory.cpp (100%) rename modules/dynamicfusion/{kfusion => }/src/imgproc.cpp (100%) rename modules/dynamicfusion/{kfusion => }/src/internal.hpp (99%) rename modules/dynamicfusion/{kfusion => }/src/kinfu.cpp (98%) rename modules/dynamicfusion/{kfusion => }/src/optimisation.cpp (100%) rename modules/dynamicfusion/{kfusion => }/src/precomp.cpp (100%) rename modules/dynamicfusion/{kfusion => }/src/precomp.hpp (94%) rename modules/dynamicfusion/{kfusion => }/src/projective_icp.cpp (99%) rename modules/dynamicfusion/{kfusion => }/src/safe_call.hpp (100%) rename modules/dynamicfusion/{kfusion => }/src/tsdf_volume.cpp (98%) rename modules/dynamicfusion/{kfusion => }/src/warp_field.cpp (97%) diff --git a/modules/dynamicfusion/CMakeLists.txt b/modules/dynamicfusion/CMakeLists.txt index d14b636d3fa..8fea6a63e3e 100644 --- a/modules/dynamicfusion/CMakeLists.txt +++ b/modules/dynamicfusion/CMakeLists.txt @@ -1,11 +1,7 @@ -cmake_minimum_required(VERSION 3.5.0) - -# ---[ Configurations types -set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Possible configurations" FORCE) +set(the_description "Non-rigid 3D reconstruction") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DFORCE_INLINERS -D_MWAITXINTRIN_H_INCLUDED") -if (DEFINED CMAKE_BUILD_TYPE) - set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES}) -endif() + + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile) find_package(Doxygen) if(DOXYGEN_FOUND) @@ -33,50 +29,56 @@ if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile) endif(DOXYGEN_FOUND) endif() - -# ---[ Solution name -project(kfusion C CXX) - -# ---[ utility +# +## ---[ Solution name +# +## ---[ utility list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/Modules/") include(cmake/Utils.cmake) -include(cmake/Targets.cmake) - -# ---[ find dependencies -find_package(OpenCV REQUIRED COMPONENTS core viz highgui calib3d) +#include(cmake/Targets.cmake) +# +## ---[ find dependencies find_package(CUDA REQUIRED) -find_package(OpenNI) +#find_package(OpenNI) find_package(Ceres REQUIRED) -include_directories(${OpenCV_INCLUDE_DIRS} ${CERES_INCLUDE_DIRS} ${Boost_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS} ${OPENNI_INCLUDE_DIR} "kfusion/src/utils" "kfusion/include/nanoflann") -if(OPENNI_FOUND) - message("FOUND OPENNI AT: ${OPENNI_INCLUDE_DIR}") -endif() -# ---[ misc settings -if(USE_PROJECT_FOLDERS) - set_property(GLOBAL PROPERTY USE_FOLDERS ON) - set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMakeTargets") -endif() - -# ---[ cuda settings -set(HAVE_CUDA 1) -list(APPEND CUDA_NVCC_FLAGS 
"-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_61,code=sm_61") - -if(UNIX OR APPLE) - list(APPEND CUDA_NVCC_FLAGS "-Xcompiler;-fPIC;") -endif() - -warnings_disable(CMAKE_CXX_FLAGS /wd4985) - -add_subdirectory(kfusion) -add_subdirectory(apps) +include_directories(${CERES_INCLUDE_DIRS} ${Boost_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS} ${OPENNI_INCLUDE_DIR} "kfusion/src/utils" "kfusion/include/nanoflann") +#if(OPENNI_FOUND) +# message("FOUND OPENNI AT: ${OPENNI_INCLUDE_DIR}") +#endif() +## ---[ misc settings +#if(USE_PROJECT_FOLDERS) +# set_property(GLOBAL PROPERTY USE_FOLDERS ON) +# set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMakeTargets") +#endif() +# +## ---[ cuda settings +#set(HAVE_CUDA 1) +#list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_61,code=sm_61") +# +#if(UNIX OR APPLE) +# list(APPEND CUDA_NVCC_FLAGS "-Xcompiler;-fPIC;") +#endif() +# +#warnings_disable(CMAKE_CXX_FLAGS /wd4985) +# +#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--ftz=true;--prec-div=false;--prec-sqrt=false") +#add_module_library(kfusion) +#if(OPENNI_FOUND) +# target_compile_definitions(kfusion PRIVATE OPENNI_FOUND=1) +#endif() +#target_link_libraries(kfusion ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${OPENNI_LIBRARY} ${CERES_LIBRARIES}) +#add_subdirectory(apps) +# +#if(BUILD_TESTS) +# find_package(GTest REQUIRED) +# if( GTEST_FOUND ) +# message( "Found Gtest at ${GTEST_ROOT}") +# message( "GTest Libs: ${GTEST_BOTH_LIBRARIES}") +# message( "GTest Include: ${GTEST_INCLUDE_DIRS}") +# include_directories(${GTEST_INCLUDE_DIRS}) +# add_subdirectory(tests) +# endif() +#endif() -if(BUILD_TESTS) - find_package(GTest REQUIRED) - if( GTEST_FOUND ) - message( "Found Gtest at ${GTEST_ROOT}") - message( "GTest Libs: ${GTEST_BOTH_LIBRARIES}") - message( "GTest Include: ${GTEST_INCLUDE_DIRS}") - include_directories(${GTEST_INCLUDE_DIRS}) - add_subdirectory(tests) - endif() -endif() \ No newline at end of file +ocv_define_module(dynamicfusion opencv_core opencv_calib3d opencv_viz opencv_highgui) +target_compile_options(opencv_dynamicfusion PRIVATE "-Wall") diff --git a/modules/dynamicfusion/kfusion/include/io/capture.hpp b/modules/dynamicfusion/include/io/capture.hpp similarity index 96% rename from modules/dynamicfusion/kfusion/include/io/capture.hpp rename to modules/dynamicfusion/include/io/capture.hpp index dc8f786273d..b56531d185f 100644 --- a/modules/dynamicfusion/kfusion/include/io/capture.hpp +++ b/modules/dynamicfusion/include/io/capture.hpp @@ -1,7 +1,6 @@ #pragma once #include -#include #include namespace kfusion diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/device_array.hpp b/modules/dynamicfusion/include/kfusion/cuda/device_array.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/include/kfusion/cuda/device_array.hpp rename to modules/dynamicfusion/include/kfusion/cuda/device_array.hpp diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/device_memory.hpp b/modules/dynamicfusion/include/kfusion/cuda/device_memory.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/include/kfusion/cuda/device_memory.hpp rename to modules/dynamicfusion/include/kfusion/cuda/device_memory.hpp diff --git 
a/modules/dynamicfusion/kfusion/include/kfusion/cuda/imgproc.hpp b/modules/dynamicfusion/include/kfusion/cuda/imgproc.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/include/kfusion/cuda/imgproc.hpp rename to modules/dynamicfusion/include/kfusion/cuda/imgproc.hpp diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/kernel_containers.hpp b/modules/dynamicfusion/include/kfusion/cuda/kernel_containers.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/include/kfusion/cuda/kernel_containers.hpp rename to modules/dynamicfusion/include/kfusion/cuda/kernel_containers.hpp diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/projective_icp.hpp b/modules/dynamicfusion/include/kfusion/cuda/projective_icp.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/include/kfusion/cuda/projective_icp.hpp rename to modules/dynamicfusion/include/kfusion/cuda/projective_icp.hpp diff --git a/modules/dynamicfusion/kfusion/include/kfusion/cuda/tsdf_volume.hpp b/modules/dynamicfusion/include/kfusion/cuda/tsdf_volume.hpp similarity index 98% rename from modules/dynamicfusion/kfusion/include/kfusion/cuda/tsdf_volume.hpp rename to modules/dynamicfusion/include/kfusion/cuda/tsdf_volume.hpp index fc74ee86681..ba1dd4c1853 100644 --- a/modules/dynamicfusion/kfusion/include/kfusion/cuda/tsdf_volume.hpp +++ b/modules/dynamicfusion/include/kfusion/cuda/tsdf_volume.hpp @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace kfusion { diff --git a/modules/dynamicfusion/kfusion/include/kfusion/exports.hpp b/modules/dynamicfusion/include/kfusion/exports.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/include/kfusion/exports.hpp rename to modules/dynamicfusion/include/kfusion/exports.hpp diff --git a/modules/dynamicfusion/kfusion/include/kfusion/kinfu.hpp b/modules/dynamicfusion/include/kfusion/kinfu.hpp similarity index 97% rename from modules/dynamicfusion/kfusion/include/kfusion/kinfu.hpp rename to modules/dynamicfusion/include/kfusion/kinfu.hpp index 9c2abdc3c42..3be08e84914 100644 --- a/modules/dynamicfusion/kfusion/include/kfusion/kinfu.hpp +++ b/modules/dynamicfusion/include/kfusion/kinfu.hpp @@ -3,8 +3,8 @@ #include #include #include -#include -#include +#include "utils/dual_quaternion.hpp" +#include "utils/quaternion.hpp" #include #include #include diff --git a/modules/dynamicfusion/kfusion/include/kfusion/optimisation.hpp b/modules/dynamicfusion/include/kfusion/optimisation.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/include/kfusion/optimisation.hpp rename to modules/dynamicfusion/include/kfusion/optimisation.hpp diff --git a/modules/dynamicfusion/kfusion/include/kfusion/types.hpp b/modules/dynamicfusion/include/kfusion/types.hpp similarity index 96% rename from modules/dynamicfusion/kfusion/include/kfusion/types.hpp rename to modules/dynamicfusion/include/kfusion/types.hpp index b99585dc105..cc3f33f490c 100644 --- a/modules/dynamicfusion/kfusion/include/kfusion/types.hpp +++ b/modules/dynamicfusion/include/kfusion/types.hpp @@ -1,8 +1,6 @@ #pragma once #include -#include -#include #include #include diff --git a/modules/dynamicfusion/kfusion/include/kfusion/warp_field.hpp b/modules/dynamicfusion/include/kfusion/warp_field.hpp similarity index 97% rename from modules/dynamicfusion/kfusion/include/kfusion/warp_field.hpp rename to modules/dynamicfusion/include/kfusion/warp_field.hpp index 63faa1f23c8..bfd122d829d 100644 --- a/modules/dynamicfusion/kfusion/include/kfusion/warp_field.hpp +++ 
b/modules/dynamicfusion/include/kfusion/warp_field.hpp @@ -5,10 +5,10 @@ * \brief * \details */ -#include +#include #include #include -#include +#include #include #define KNN_NEIGHBOURS 8 diff --git a/modules/dynamicfusion/kfusion/include/nanoflann/nanoflann.hpp b/modules/dynamicfusion/include/nanoflann/nanoflann.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/include/nanoflann/nanoflann.hpp rename to modules/dynamicfusion/include/nanoflann/nanoflann.hpp diff --git a/modules/dynamicfusion/kfusion/src/utils/dual_quaternion.hpp b/modules/dynamicfusion/include/utils/dual_quaternion.hpp similarity index 99% rename from modules/dynamicfusion/kfusion/src/utils/dual_quaternion.hpp rename to modules/dynamicfusion/include/utils/dual_quaternion.hpp index f47bce057c0..f3dcd5bb097 100644 --- a/modules/dynamicfusion/kfusion/src/utils/dual_quaternion.hpp +++ b/modules/dynamicfusion/include/utils/dual_quaternion.hpp @@ -1,7 +1,7 @@ #ifndef DYNAMIC_FUSION_DUAL_QUATERNION_HPP #define DYNAMIC_FUSION_DUAL_QUATERNION_HPP #include -#include +#include"quaternion.hpp" //Adapted from https://github.com/Poofjunior/QPose /** diff --git a/modules/dynamicfusion/kfusion/src/utils/knn_point_cloud.hpp b/modules/dynamicfusion/include/utils/knn_point_cloud.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/utils/knn_point_cloud.hpp rename to modules/dynamicfusion/include/utils/knn_point_cloud.hpp diff --git a/modules/dynamicfusion/kfusion/src/utils/quaternion.hpp b/modules/dynamicfusion/include/utils/quaternion.hpp similarity index 98% rename from modules/dynamicfusion/kfusion/src/utils/quaternion.hpp rename to modules/dynamicfusion/include/utils/quaternion.hpp index 7864ebf14e3..dbe922d54fb 100644 --- a/modules/dynamicfusion/kfusion/src/utils/quaternion.hpp +++ b/modules/dynamicfusion/include/utils/quaternion.hpp @@ -216,9 +216,9 @@ namespace kfusion{ { // should never happen unless the Quaternion wasn't initialized // correctly. 
- assert( !((w_ == 0) && (x_ == 0) && (y_ == 0) && (z_ == 0))); +// CV_Assert( !((w_ == 0) && (x_ == 0) && (y_ == 0) && (z_ == 0))); T theNorm = norm(); - assert(theNorm > 0); +// CV_Assert(theNorm > 0); (*this) = (1.0/theNorm) * (*this); } diff --git a/modules/dynamicfusion/kfusion/CMakeLists.txt b/modules/dynamicfusion/kfusion/CMakeLists.txt deleted file mode 100644 index c74af700487..00000000000 --- a/modules/dynamicfusion/kfusion/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--ftz=true;--prec-div=false;--prec-sqrt=false") -add_module_library(kfusion) -if(OPENNI_FOUND) - target_compile_definitions(kfusion PRIVATE OPENNI_FOUND=1) -endif() -target_link_libraries(kfusion ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${OpenCV_LIBS} ${OPENNI_LIBRARY} ${CERES_LIBRARIES}) diff --git a/modules/dynamicfusion/apps/CMakeLists.txt b/modules/dynamicfusion/samples/CMakeLists.txt similarity index 95% rename from modules/dynamicfusion/apps/CMakeLists.txt rename to modules/dynamicfusion/samples/CMakeLists.txt index 559eacd728b..218e6889124 100644 --- a/modules/dynamicfusion/apps/CMakeLists.txt +++ b/modules/dynamicfusion/samples/CMakeLists.txt @@ -8,7 +8,7 @@ set_target_properties(dynamicfusion PROPERTIES DEBUG_POSTFIX "d" ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") - +target_compile_options(dynamicfusion PRIVATE "-Wall") if(OPENNI_FOUND) add_executable(dynamicfusion_kinect dynamicfusion_kinect.cpp) diff --git a/modules/dynamicfusion/apps/demo.cpp b/modules/dynamicfusion/samples/demo.cpp similarity index 95% rename from modules/dynamicfusion/apps/demo.cpp rename to modules/dynamicfusion/samples/demo.cpp index 3429fe1533a..6c7dd0cf7b4 100644 --- a/modules/dynamicfusion/apps/demo.cpp +++ b/modules/dynamicfusion/samples/demo.cpp @@ -1,12 +1,10 @@ #include -#include -#include -#include #include #include #include #include - +#include +#include using namespace kfusion; struct DynamicFusionApp @@ -41,7 +39,7 @@ struct DynamicFusionApp { cv::Mat display; depth.convertTo(display, CV_8U, 255.0/4000); - cv::imshow("Depth", display); + cv::viz::imshow("Depth", display); } void show_raycasted(KinFu& kinfu) @@ -54,7 +52,7 @@ struct DynamicFusionApp view_host_.create(view_device_.rows(), view_device_.cols(), CV_8UC4); view_device_.download(view_host_.ptr(), view_host_.step); - cv::imshow("Scene", view_host_); + cv::viz::imshow("Scene", view_host_); } void show_warp(KinFu &kinfu) @@ -95,7 +93,7 @@ struct DynamicFusionApp show_raycasted(dfusion); show_depth(depth); - cv::imshow("Image", image); + cv::viz::imshow("Image", image); if (!interactive_mode_) { viz.setViewerPose(dfusion.getCameraPose()); diff --git a/modules/dynamicfusion/apps/dynamicfusion_kinect.cpp b/modules/dynamicfusion/samples/dynamicfusion_kinect.cpp similarity index 98% rename from modules/dynamicfusion/apps/dynamicfusion_kinect.cpp rename to modules/dynamicfusion/samples/dynamicfusion_kinect.cpp index 62dcacf2c8d..e32c0dc5273 100644 --- a/modules/dynamicfusion/apps/dynamicfusion_kinect.cpp +++ b/modules/dynamicfusion/samples/dynamicfusion_kinect.cpp @@ -41,8 +41,8 @@ struct KinFuApp static void show_depth(const cv::Mat& depth) { cv::Mat display; - //cv::normalize(depth, display, 0, 255, cv::NORM_MINMAX, CV_8U); - depth.convertTo(display, CV_8U, 255.0/4000); + //cv::normalize(depth, display, 0, 255, cv::NORM_MINMAX, cv::CV_8U); + depth.convertTo(display, cv::CV_8U, 255.0/4000); cv::imshow("Depth", display); } @@ -54,7 +54,7 @@ struct KinFuApp else 
kinfu.renderImage(view_device_, mode); - view_host_.create(view_device_.rows(), view_device_.cols(), CV_8UC4); + view_host_.create(view_device_.rows(), view_device_.cols(), cv::CV_8UC4); view_device_.download(view_host_.ptr(), view_host_.step); cv::imshow("Scene", view_host_); } diff --git a/modules/dynamicfusion/kfusion/src/capture.cpp b/modules/dynamicfusion/src/capture.cpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/capture.cpp rename to modules/dynamicfusion/src/capture.cpp diff --git a/modules/dynamicfusion/kfusion/src/core.cpp b/modules/dynamicfusion/src/core.cpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/core.cpp rename to modules/dynamicfusion/src/core.cpp diff --git a/modules/dynamicfusion/kfusion/src/cuda/device.hpp b/modules/dynamicfusion/src/cuda/device.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/cuda/device.hpp rename to modules/dynamicfusion/src/cuda/device.hpp diff --git a/modules/dynamicfusion/kfusion/src/cuda/imgproc.cu b/modules/dynamicfusion/src/cuda/imgproc.cu similarity index 100% rename from modules/dynamicfusion/kfusion/src/cuda/imgproc.cu rename to modules/dynamicfusion/src/cuda/imgproc.cu diff --git a/modules/dynamicfusion/kfusion/src/cuda/proj_icp.cu b/modules/dynamicfusion/src/cuda/proj_icp.cu similarity index 100% rename from modules/dynamicfusion/kfusion/src/cuda/proj_icp.cu rename to modules/dynamicfusion/src/cuda/proj_icp.cu diff --git a/modules/dynamicfusion/kfusion/src/cuda/temp_utils.hpp b/modules/dynamicfusion/src/cuda/temp_utils.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/cuda/temp_utils.hpp rename to modules/dynamicfusion/src/cuda/temp_utils.hpp diff --git a/modules/dynamicfusion/kfusion/src/cuda/texture_binder.hpp b/modules/dynamicfusion/src/cuda/texture_binder.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/cuda/texture_binder.hpp rename to modules/dynamicfusion/src/cuda/texture_binder.hpp diff --git a/modules/dynamicfusion/kfusion/src/cuda/tsdf_volume.cu b/modules/dynamicfusion/src/cuda/tsdf_volume.cu similarity index 100% rename from modules/dynamicfusion/kfusion/src/cuda/tsdf_volume.cu rename to modules/dynamicfusion/src/cuda/tsdf_volume.cu diff --git a/modules/dynamicfusion/kfusion/src/device_memory.cpp b/modules/dynamicfusion/src/device_memory.cpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/device_memory.cpp rename to modules/dynamicfusion/src/device_memory.cpp diff --git a/modules/dynamicfusion/kfusion/src/imgproc.cpp b/modules/dynamicfusion/src/imgproc.cpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/imgproc.cpp rename to modules/dynamicfusion/src/imgproc.cpp diff --git a/modules/dynamicfusion/kfusion/src/internal.hpp b/modules/dynamicfusion/src/internal.hpp similarity index 99% rename from modules/dynamicfusion/kfusion/src/internal.hpp rename to modules/dynamicfusion/src/internal.hpp index 612c5cb89e9..ceffc128d6a 100644 --- a/modules/dynamicfusion/kfusion/src/internal.hpp +++ b/modules/dynamicfusion/src/internal.hpp @@ -2,7 +2,6 @@ #include #include "safe_call.hpp" -#include //#define USE_DEPTH namespace kfusion diff --git a/modules/dynamicfusion/kfusion/src/kinfu.cpp b/modules/dynamicfusion/src/kinfu.cpp similarity index 98% rename from modules/dynamicfusion/kfusion/src/kinfu.cpp rename to modules/dynamicfusion/src/kinfu.cpp index 7a057c62b84..ea1576b1b2c 100644 --- a/modules/dynamicfusion/kfusion/src/kinfu.cpp +++ b/modules/dynamicfusion/src/kinfu.cpp @@ -1,14 +1,12 
@@ #include "precomp.hpp" #include "internal.hpp" #include -#include -#include -#include -#include +#include "utils/dual_quaternion.hpp" +#include "nanoflann/nanoflann.hpp" +#include "utils/quaternion.hpp" +#include #include #include -#include -#include #include using namespace std; @@ -105,7 +103,7 @@ kfusion::KinFuParams kfusion::KinFuParams::default_params() */ kfusion::KinFu::KinFu(const KinFuParams& params) : frame_counter_(0), params_(params) { - CV_Assert(params.volume_dims[0] % 32 == 0); +// cv::CV_Assert(params.volume_dims[0] % 32 == 0); volume_ = cv::Ptr(new cuda::TsdfVolume(params_.volume_dims)); warp_ = cv::Ptr(new WarpField()); @@ -403,7 +401,7 @@ void kfusion::KinFu::dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame depth.download(depth_cloud.ptr(), depth_cloud.step); cv::Mat display; depth_cloud.convertTo(display, CV_8U, 255.0/4000); - cv::imshow("Depth diff", display); + cv::viz::imshow("Depth diff", display); volume_->compute_points(); volume_->compute_normals(); } diff --git a/modules/dynamicfusion/kfusion/src/optimisation.cpp b/modules/dynamicfusion/src/optimisation.cpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/optimisation.cpp rename to modules/dynamicfusion/src/optimisation.cpp diff --git a/modules/dynamicfusion/kfusion/src/precomp.cpp b/modules/dynamicfusion/src/precomp.cpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/precomp.cpp rename to modules/dynamicfusion/src/precomp.cpp diff --git a/modules/dynamicfusion/kfusion/src/precomp.hpp b/modules/dynamicfusion/src/precomp.hpp similarity index 94% rename from modules/dynamicfusion/kfusion/src/precomp.hpp rename to modules/dynamicfusion/src/precomp.hpp index 52b054ce7be..41e92d8c4e2 100644 --- a/modules/dynamicfusion/kfusion/src/precomp.hpp +++ b/modules/dynamicfusion/src/precomp.hpp @@ -5,7 +5,6 @@ #include #include #include -//#include #include "internal.hpp" #include #include "vector_functions.h" diff --git a/modules/dynamicfusion/kfusion/src/projective_icp.cpp b/modules/dynamicfusion/src/projective_icp.cpp similarity index 99% rename from modules/dynamicfusion/kfusion/src/projective_icp.cpp rename to modules/dynamicfusion/src/projective_icp.cpp index 3554f00becc..4eba75f55a5 100644 --- a/modules/dynamicfusion/kfusion/src/projective_icp.cpp +++ b/modules/dynamicfusion/src/projective_icp.cpp @@ -118,7 +118,7 @@ bool kfusion::cuda::ProjectiveICP::estimateTransform(Affine3f& /*affine*/, const // return estimateTransform(affine, intr, curr.points_pyr, curr.normals_pyr, prev.points_pyr, prev.normals_pyr); // else // CV_Assert(!"Wrong parameters passed to estimateTransform"); - CV_Assert(!"Not implemented"); +// CV_Assert(!"Not implemented"); return false; } diff --git a/modules/dynamicfusion/kfusion/src/safe_call.hpp b/modules/dynamicfusion/src/safe_call.hpp similarity index 100% rename from modules/dynamicfusion/kfusion/src/safe_call.hpp rename to modules/dynamicfusion/src/safe_call.hpp diff --git a/modules/dynamicfusion/kfusion/src/tsdf_volume.cpp b/modules/dynamicfusion/src/tsdf_volume.cpp similarity index 98% rename from modules/dynamicfusion/kfusion/src/tsdf_volume.cpp rename to modules/dynamicfusion/src/tsdf_volume.cpp index 2451bc6ba25..149108c76ae 100644 --- a/modules/dynamicfusion/kfusion/src/tsdf_volume.cpp +++ b/modules/dynamicfusion/src/tsdf_volume.cpp @@ -1,13 +1,10 @@ #include "precomp.hpp" -#include "dual_quaternion.hpp" +#include "utils/dual_quaternion.hpp" #include -#include #include -#include +#include #include -#include #include -#include using 
namespace kfusion; using namespace kfusion::cuda; diff --git a/modules/dynamicfusion/kfusion/src/warp_field.cpp b/modules/dynamicfusion/src/warp_field.cpp similarity index 97% rename from modules/dynamicfusion/kfusion/src/warp_field.cpp rename to modules/dynamicfusion/src/warp_field.cpp index de570f03f1c..25dc0fdeaa9 100644 --- a/modules/dynamicfusion/kfusion/src/warp_field.cpp +++ b/modules/dynamicfusion/src/warp_field.cpp @@ -1,11 +1,10 @@ -#include -#include +#include +#include #include -#include +#include #include "kfusion/warp_field.hpp" #include "internal.hpp" #include "precomp.hpp" -#include #include using namespace kfusion; @@ -38,8 +37,8 @@ WarpField::~WarpField() */ void WarpField::init(const cv::Mat& first_frame, const cv::Mat& normals) { - assert(first_frame.rows == normals.rows); - assert(first_frame.cols == normals.cols); +// CV_Assert(first_frame.rows == normals.rows); +// CV_Assert(first_frame.cols == normals.cols); nodes->resize(first_frame.cols * first_frame.rows); auto voxel_size = kfusion::KinFuParams::default_params_dynamicfusion().volume_size[0] / kfusion::KinFuParams::default_params_dynamicfusion().volume_dims[0]; @@ -106,8 +105,8 @@ void WarpField::energy(const cuda::Cloud &frame, const std::vector, utils::DualQuaternion>> &edges ) { - assert(normals.cols()==frame.cols()); - assert(normals.rows()==frame.rows()); +//CV_Assert(normals.cols()==frame.cols()); +//CV_Assert(normals.rows()==frame.rows()); } /** From 7c12c247d57aa4b2b349ff6a29686dc82f6badcc Mon Sep 17 00:00:00 2001 From: Mihai Bujanca Date: Thu, 7 Sep 2017 10:27:51 +0100 Subject: [PATCH 3/4] Integrate with cv namespace, change directory structure --- modules/dynamicfusion/CMakeLists.txt | 41 +-- modules/dynamicfusion/doc/dynamicfusion.bib | 23 ++ modules/dynamicfusion/include/io/capture.hpp | 41 --- .../include/kfusion/cuda/device_array.hpp | 303 ----------------- .../include/kfusion/cuda/device_memory.hpp | 258 --------------- .../include/kfusion/cuda/imgproc.hpp | 35 -- .../kfusion/cuda/kernel_containers.hpp | 75 ----- .../include/kfusion/cuda/projective_icp.hpp | 48 --- .../include/kfusion/cuda/tsdf_volume.hpp | 102 ------ .../dynamicfusion/include/kfusion/kinfu.hpp | 107 ------ .../include/kfusion/optimisation.hpp | 168 ---------- .../dynamicfusion/include/kfusion/types.hpp | 98 ------ .../include/kfusion/warp_field.hpp | 102 ------ .../include/opencv2/dynamicfusion.hpp | 5 + .../include/opencv2/io/capture.hpp | 43 +++ .../opencv2/kfusion/cuda/device_array.hpp | 306 +++++++++++++++++ .../opencv2/kfusion/cuda/device_memory.hpp | 260 +++++++++++++++ .../include/opencv2/kfusion/cuda/imgproc.hpp | 38 +++ .../kfusion/cuda/kernel_containers.hpp | 78 +++++ .../opencv2/kfusion/cuda/projective_icp.hpp | 51 +++ .../opencv2/kfusion/cuda/tsdf_volume.hpp | 104 ++++++ .../include/{ => opencv2}/kfusion/exports.hpp | 0 .../include/opencv2/kfusion/kinfu.hpp | 113 +++++++ .../include/opencv2/kfusion/optimisation.hpp | 174 ++++++++++ .../include/opencv2/kfusion/types.hpp | 101 ++++++ .../include/opencv2/kfusion/warp_field.hpp | 105 ++++++ .../include/opencv2/utils/dual_quaternion.hpp | 265 +++++++++++++++ .../include/opencv2/utils/knn_point_cloud.hpp | 59 ++++ .../include/opencv2/utils/quaternion.hpp | 249 ++++++++++++++ .../include/utils/dual_quaternion.hpp | 289 ---------------- .../include/utils/knn_point_cloud.hpp | 56 ---- .../include/utils/quaternion.hpp | 246 -------------- modules/dynamicfusion/samples/CMakeLists.txt | 25 -- modules/dynamicfusion/samples/demo.cpp | 81 +++-- .../samples/dynamicfusion_kinect.cpp | 
158 --------- modules/dynamicfusion/src/capture.cpp | 2 +- modules/dynamicfusion/src/core.cpp | 4 +- modules/dynamicfusion/src/cuda/temp_utils.hpp | 2 +- .../dynamicfusion/src/cuda/texture_binder.hpp | 2 +- modules/dynamicfusion/src/device_memory.cpp | 4 +- modules/dynamicfusion/src/imgproc.cpp | 3 +- modules/dynamicfusion/src/internal.hpp | 310 +++++++++++------- modules/dynamicfusion/src/kinfu.cpp | 102 +++--- modules/dynamicfusion/src/optimisation.cpp | 0 modules/dynamicfusion/src/precomp.cpp | 2 +- modules/dynamicfusion/src/precomp.hpp | 41 +-- modules/dynamicfusion/src/projective_icp.cpp | 6 +- modules/dynamicfusion/src/safe_call.hpp | 40 +-- modules/dynamicfusion/src/tsdf_volume.cpp | 90 ++--- modules/dynamicfusion/src/warp_field.cpp | 13 +- 50 files changed, 2370 insertions(+), 2458 deletions(-) create mode 100644 modules/dynamicfusion/doc/dynamicfusion.bib delete mode 100644 modules/dynamicfusion/include/io/capture.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/cuda/device_array.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/cuda/device_memory.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/cuda/imgproc.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/cuda/kernel_containers.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/cuda/projective_icp.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/cuda/tsdf_volume.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/kinfu.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/optimisation.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/types.hpp delete mode 100644 modules/dynamicfusion/include/kfusion/warp_field.hpp create mode 100644 modules/dynamicfusion/include/opencv2/dynamicfusion.hpp create mode 100644 modules/dynamicfusion/include/opencv2/io/capture.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/cuda/device_array.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/cuda/device_memory.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/cuda/imgproc.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/cuda/kernel_containers.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/cuda/projective_icp.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/cuda/tsdf_volume.hpp rename modules/dynamicfusion/include/{ => opencv2}/kfusion/exports.hpp (100%) create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/kinfu.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/optimisation.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/types.hpp create mode 100644 modules/dynamicfusion/include/opencv2/kfusion/warp_field.hpp create mode 100644 modules/dynamicfusion/include/opencv2/utils/dual_quaternion.hpp create mode 100644 modules/dynamicfusion/include/opencv2/utils/knn_point_cloud.hpp create mode 100644 modules/dynamicfusion/include/opencv2/utils/quaternion.hpp delete mode 100644 modules/dynamicfusion/include/utils/dual_quaternion.hpp delete mode 100644 modules/dynamicfusion/include/utils/knn_point_cloud.hpp delete mode 100644 modules/dynamicfusion/include/utils/quaternion.hpp delete mode 100644 modules/dynamicfusion/samples/CMakeLists.txt delete mode 100644 modules/dynamicfusion/samples/dynamicfusion_kinect.cpp delete mode 100644 modules/dynamicfusion/src/optimisation.cpp diff --git a/modules/dynamicfusion/CMakeLists.txt b/modules/dynamicfusion/CMakeLists.txt index 8fea6a63e3e..a901d87dfd8 100644 --- 
a/modules/dynamicfusion/CMakeLists.txt +++ b/modules/dynamicfusion/CMakeLists.txt @@ -1,34 +1,6 @@ set(the_description "Non-rigid 3D reconstruction") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DFORCE_INLINERS -D_MWAITXINTRIN_H_INCLUDED") - -if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile) - find_package(Doxygen) - if(DOXYGEN_FOUND) - - set(DOXYGEN_INPUT ${SOURCE_FILES}) - set(DOXYGEN_OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) - - add_custom_command( - OUTPUT ${DOXYGEN_OUTPUT} - COMMAND ${CMAKE_COMMAND} -E echo_append "Building API Documentation..." - COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_INPUT} - COMMAND ${CMAKE_COMMAND} -E echo "Done." - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - DEPENDS ${DOXYGEN_INPUT} - ) - - add_custom_target(apidoc ALL DEPENDS ${DOXYGEN_OUTPUT}) - - add_custom_target(apidoc_forced - COMMAND ${CMAKE_COMMAND} -E echo_append "Building API Documentation..." - COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_INPUT} - COMMAND ${CMAKE_COMMAND} -E echo "Done." - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} - ) - - endif(DOXYGEN_FOUND) -endif() # ## ---[ Solution name # @@ -79,6 +51,15 @@ include_directories(${CERES_INCLUDE_DIRS} ${Boost_INCLUDE_DIR} ${CUDA_INCLUDE_DI # add_subdirectory(tests) # endif() #endif() - +find_package(Boost REQUIRED COMPONENTS system filesystem program_options) ocv_define_module(dynamicfusion opencv_core opencv_calib3d opencv_viz opencv_highgui) -target_compile_options(opencv_dynamicfusion PRIVATE "-Wall") + +ocv_warnings_disable(CMAKE_CXX_FLAGS + -Wundef + -Wshadow + -Wsign-compare + -Wmissing-declarations + -Wunused-but-set-variable + -Wunused-parameter + -Wunused-function + ) diff --git a/modules/dynamicfusion/doc/dynamicfusion.bib b/modules/dynamicfusion/doc/dynamicfusion.bib new file mode 100644 index 00000000000..e50d6fe3f90 --- /dev/null +++ b/modules/dynamicfusion/doc/dynamicfusion.bib @@ -0,0 +1,23 @@ +@InProceedings{Newcombe_2015_CVPR, +author = {Newcombe, Richard A. 
and Fox, Dieter and Seitz, Steven M.}, +title = {DynamicFusion: Reconstruction and Tracking of Non-Rigid Scenes in Real-Time}, +booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, +month = {June}, +year = {2015} +} + +@inbook{innmann2016volume, +author = "Innmann, Matthias and Zollh{\"o}fer, Michael and Nie{\ss}ner, Matthias and Theobalt, Christian + and Stamminger, Marc", +editor = "Leibe, Bastian and Matas, Jiri and Sebe, Nicu and Welling, Max", +title = "VolumeDeform: Real-Time Volumetric Non-rigid Reconstruction", +bookTitle = "Computer Vision -- ECCV 2016: 14th European Conference, Amsterdam, The Netherlands, + October 11-14, 2016, Proceedings, Part VIII", +year = "2016", +publisher = "Springer International Publishing", +address = "Cham", +pages = "362--379", +isbn = "978-3-319-46484-8", +doi = "10.1007/978-3-319-46484-8_22", +url = "http://dx.doi.org/10.1007/978-3-319-46484-8_22" +} \ No newline at end of file diff --git a/modules/dynamicfusion/include/io/capture.hpp b/modules/dynamicfusion/include/io/capture.hpp deleted file mode 100644 index b56531d185f..00000000000 --- a/modules/dynamicfusion/include/io/capture.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once - -#include -#include - -namespace kfusion -{ - class KF_EXPORTS OpenNISource - { - public: - typedef kfusion::PixelRGB RGB24; - - enum { PROP_OPENNI_REGISTRATION_ON = 104 }; - - OpenNISource(); - OpenNISource(int device); - OpenNISource(const std::string& oni_filename, bool repeat = false); - - void open(int device); - void open(const std::string& oni_filename, bool repeat = false); - void release(); - - ~OpenNISource(); - - bool grab(cv::Mat &depth, cv::Mat &image); - - //parameters taken from camera/oni - int shadow_value, no_sample_value; - float depth_focal_length_VGA; - float baseline; // mm - double pixelSize; // mm - unsigned short max_depth; // mm - - bool setRegistration (bool value = false); - private: - struct Impl; - cv::Ptr impl_; - void getParams (); - - }; -} diff --git a/modules/dynamicfusion/include/kfusion/cuda/device_array.hpp b/modules/dynamicfusion/include/kfusion/cuda/device_array.hpp deleted file mode 100644 index 2186f51e724..00000000000 --- a/modules/dynamicfusion/include/kfusion/cuda/device_array.hpp +++ /dev/null @@ -1,303 +0,0 @@ -#pragma once - -#include -#include - -#include - -namespace kfusion -{ - namespace cuda - { - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /** \brief @b DeviceArray class - * - * \note Typed container for GPU memory with reference counting. - * - * \author Anatoly Baksheev - */ - template - class KF_EXPORTS DeviceArray : public DeviceMemory - { - public: - /** \brief Element type. */ - typedef T type; - - /** \brief Element size. */ - enum { elem_size = sizeof(T) }; - - /** \brief Empty constructor. */ - DeviceArray(); - - /** \brief Allocates internal buffer in GPU memory - * \param size_t: number of elements to allocate - * */ - DeviceArray(size_t size); - - /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. - * \param ptr: pointer to buffer - * \param size: element number - * */ - DeviceArray(T *ptr, size_t size); - - /** \brief Copy constructor. Just increments reference counter. */ - DeviceArray(const DeviceArray& other); - - /** \brief Assigment operator. Just increments reference counter. */ - DeviceArray& operator = (const DeviceArray& other); - - /** \brief Allocates internal buffer in GPU memory. 
If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. - * \param size: elemens number - * */ - void create(size_t size); - - /** \brief Decrements reference counter and releases internal buffer if needed. */ - void release(); - - /** \brief Performs data copying. If destination size differs it will be reallocated. - * \param other_arg: destination container - * */ - void copyTo(DeviceArray& other) const; - - /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. - * \param host_ptr_arg: pointer to buffer to upload - * \param size: elemens number - * */ - void upload(const T *host_ptr, size_t size); - - /** \brief Downloads data from internal buffer to CPU memory - * \param host_ptr_arg: pointer to buffer to download - * */ - void download(T *host_ptr) const; - - /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. - * \param data: host vector to upload from - * */ - template - void upload(const std::vector& data); - - /** \brief Downloads data from internal buffer to CPU memory - * \param data: host vector to download to - * */ - template - void download(std::vector& data) const; - - /** \brief Performs swap of data pointed with another device array. - * \param other: device array to swap with - * */ - void swap(DeviceArray& other_arg); - - /** \brief Returns pointer for internal buffer in GPU memory. */ - T* ptr(); - - /** \brief Returns const pointer for internal buffer in GPU memory. */ - const T* ptr() const; - - //using DeviceMemory::ptr; - - /** \brief Returns pointer for internal buffer in GPU memory. */ - operator T*(); - - /** \brief Returns const pointer for internal buffer in GPU memory. */ - operator const T*() const; - - /** \brief Returns size in elements. */ - size_t size() const; - }; - - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /** \brief @b DeviceArray2D class - * - * \note Typed container for pitched GPU memory with reference counting. - * - * \author Anatoly Baksheev - */ - template - class KF_EXPORTS DeviceArray2D : public DeviceMemory2D - { - public: - /** \brief Element type. */ - typedef T type; - - /** \brief Element size. */ - enum { elem_size = sizeof(T) }; - - /** \brief Empty constructor. */ - DeviceArray2D(); - - /** \brief Allocates internal buffer in GPU memory - * \param rows: number of rows to allocate - * \param cols: number of elements in each row - * */ - DeviceArray2D(int rows, int cols); - - /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. - * \param rows: number of rows - * \param cols: number of elements in each row - * \param data: pointer to buffer - * \param stepBytes: stride between two consecutive rows in bytes - * */ - DeviceArray2D(int rows, int cols, void *data, size_t stepBytes); - - /** \brief Copy constructor. Just increments reference counter. */ - DeviceArray2D(const DeviceArray2D& other); - - /** \brief Assigment operator. Just increments reference counter. */ - DeviceArray2D& operator = (const DeviceArray2D& other); - - /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. 
- * \param rows: number of rows to allocate - * \param cols: number of elements in each row - * */ - void create(int rows, int cols); - - /** \brief Decrements reference counter and releases internal buffer if needed. */ - void release(); - - /** \brief Performs data copying. If destination size differs it will be reallocated. - * \param other: destination container - * */ - void copyTo(DeviceArray2D& other) const; - - /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. - * \param host_ptr: pointer to host buffer to upload - * \param host_step: stride between two consecutive rows in bytes for host buffer - * \param rows: number of rows to upload - * \param cols: number of elements in each row - * */ - void upload(const void *host_ptr, size_t host_step, int rows, int cols); - - /** \brief Downloads data from internal buffer to CPU memory. User is resposible for correct host buffer size. - * \param host_ptr: pointer to host buffer to download - * \param host_step: stride between two consecutive rows in bytes for host buffer - * */ - void download(void *host_ptr, size_t host_step) const; - - /** \brief Performs swap of data pointed with another device array. - * \param other: device array to swap with - * */ - void swap(DeviceArray2D& other_arg); - - /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. - * \param data: host vector to upload from - * \param cols: stride in elements between two consecutive rows for host buffer - * */ - template - void upload(const std::vector& data, int cols); - - /** \brief Downloads data from internal buffer to CPU memory - * \param data: host vector to download to - * \param cols: Output stride in elements between two consecutive rows for host vector. - * */ - template - void download(std::vector& data, int& cols) const; - - /** \brief Returns pointer to given row in internal buffer. - * \param y_arg: row index - * */ - T* ptr(int y = 0); - - /** \brief Returns const pointer to given row in internal buffer. - * \param y_arg: row index - * */ - const T* ptr(int y = 0) const; - - //using DeviceMemory2D::ptr; - - /** \brief Returns pointer for internal buffer in GPU memory. */ - operator T*(); - - /** \brief Returns const pointer for internal buffer in GPU memory. */ - operator const T*() const; - - /** \brief Returns number of elements in each row. */ - int cols() const; - - /** \brief Returns number of rows. */ - int rows() const; - - /** \brief Returns step in elements. 
*/ - size_t elem_step() const; - }; - } - - namespace device - { - using kfusion::cuda::DeviceArray; - using kfusion::cuda::DeviceArray2D; - } -} - -///////////////////// Inline implementations of DeviceArray //////////////////////////////////////////// - -template inline kfusion::cuda::DeviceArray::DeviceArray() {} -template inline kfusion::cuda::DeviceArray::DeviceArray(size_t size) : DeviceMemory(size * elem_size) {} -template inline kfusion::cuda::DeviceArray::DeviceArray(T *ptr, size_t size) : DeviceMemory(ptr, size * elem_size) {} -template inline kfusion::cuda::DeviceArray::DeviceArray(const DeviceArray& other) : DeviceMemory(other) {} -template inline kfusion::cuda::DeviceArray& kfusion::cuda::DeviceArray::operator=(const DeviceArray& other) -{ DeviceMemory::operator=(other); return *this; } - -template inline void kfusion::cuda::DeviceArray::create(size_t size) -{ DeviceMemory::create(size * elem_size); } -template inline void kfusion::cuda::DeviceArray::release() -{ DeviceMemory::release(); } - -template inline void kfusion::cuda::DeviceArray::copyTo(DeviceArray& other) const -{ DeviceMemory::copyTo(other); } -template inline void kfusion::cuda::DeviceArray::upload(const T *host_ptr, size_t size) -{ DeviceMemory::upload(host_ptr, size * elem_size); } -template inline void kfusion::cuda::DeviceArray::download(T *host_ptr) const -{ DeviceMemory::download( host_ptr ); } - -template void kfusion::cuda::DeviceArray::swap(DeviceArray& other_arg) { DeviceMemory::swap(other_arg); } - -template inline kfusion::cuda::DeviceArray::operator T*() { return ptr(); } -template inline kfusion::cuda::DeviceArray::operator const T*() const { return ptr(); } -template inline size_t kfusion::cuda::DeviceArray::size() const { return sizeBytes() / elem_size; } - -template inline T* kfusion::cuda::DeviceArray::ptr() { return DeviceMemory::ptr(); } -template inline const T* kfusion::cuda::DeviceArray::ptr() const { return DeviceMemory::ptr(); } - -template template inline void kfusion::cuda::DeviceArray::upload(const std::vector& data) { upload(&data[0], data.size()); } -template template inline void kfusion::cuda::DeviceArray::download(std::vector& data) const { data.resize(size()); if (!data.empty()) download(&data[0]); } - -///////////////////// Inline implementations of DeviceArray2D //////////////////////////////////////////// - -template inline kfusion::cuda::DeviceArray2D::DeviceArray2D() {} -template inline kfusion::cuda::DeviceArray2D::DeviceArray2D(int rows, int cols) : DeviceMemory2D(rows, cols * elem_size) {} -template inline kfusion::cuda::DeviceArray2D::DeviceArray2D(int rows, int cols, void *data, size_t stepBytes) : DeviceMemory2D(rows, cols * elem_size, data, stepBytes) {} -template inline kfusion::cuda::DeviceArray2D::DeviceArray2D(const DeviceArray2D& other) : DeviceMemory2D(other) {} -template inline kfusion::cuda::DeviceArray2D& kfusion::cuda::DeviceArray2D::operator=(const DeviceArray2D& other) -{ DeviceMemory2D::operator=(other); return *this; } - -template inline void kfusion::cuda::DeviceArray2D::create(int rows, int cols) -{ DeviceMemory2D::create(rows, cols * elem_size); } -template inline void kfusion::cuda::DeviceArray2D::release() -{ DeviceMemory2D::release(); } - -template inline void kfusion::cuda::DeviceArray2D::copyTo(DeviceArray2D& other) const -{ DeviceMemory2D::copyTo(other); } -template inline void kfusion::cuda::DeviceArray2D::upload(const void *host_ptr, size_t host_step, int rows, int cols) -{ DeviceMemory2D::upload(host_ptr, host_step, rows, cols * elem_size); } 
-template inline void kfusion::cuda::DeviceArray2D::download(void *host_ptr, size_t host_step) const -{ DeviceMemory2D::download( host_ptr, host_step ); } - -template template inline void kfusion::cuda::DeviceArray2D::upload(const std::vector& data, int cols) -{ upload(&data[0], cols * elem_size, data.size()/cols, cols); } - -template template inline void kfusion::cuda::DeviceArray2D::download(std::vector& data, int& elem_step) const -{ elem_step = cols(); data.resize(cols() * rows()); if (!data.empty()) download(&data[0], colsBytes()); } - -template void kfusion::cuda::DeviceArray2D::swap(DeviceArray2D& other_arg) { DeviceMemory2D::swap(other_arg); } - -template inline T* kfusion::cuda::DeviceArray2D::ptr(int y) { return DeviceMemory2D::ptr(y); } -template inline const T* kfusion::cuda::DeviceArray2D::ptr(int y) const { return DeviceMemory2D::ptr(y); } - -template inline kfusion::cuda::DeviceArray2D::operator T*() { return ptr(); } -template inline kfusion::cuda::DeviceArray2D::operator const T*() const { return ptr(); } - -template inline int kfusion::cuda::DeviceArray2D::cols() const { return DeviceMemory2D::colsBytes()/elem_size; } -template inline int kfusion::cuda::DeviceArray2D::rows() const { return DeviceMemory2D::rows(); } - -template inline size_t kfusion::cuda::DeviceArray2D::elem_step() const { return DeviceMemory2D::step()/elem_size; } diff --git a/modules/dynamicfusion/include/kfusion/cuda/device_memory.hpp b/modules/dynamicfusion/include/kfusion/cuda/device_memory.hpp deleted file mode 100644 index c444baf304c..00000000000 --- a/modules/dynamicfusion/include/kfusion/cuda/device_memory.hpp +++ /dev/null @@ -1,258 +0,0 @@ -#pragma once - -#include -#include - -namespace kfusion -{ - namespace cuda - { - /** \brief Error handler. All GPU functions from this subsystem call the function to report an error. For internal use only */ - KF_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func = ""); - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /** \brief @b DeviceMemory class - * - * \note This is a BLOB container class with reference counting for GPU memory. - * - * \author Anatoly Baksheev - */ - - class KF_EXPORTS DeviceMemory - { - public: - /** \brief Empty constructor. */ - DeviceMemory(); - - /** \brief Destructor. */ - ~DeviceMemory(); - - /** \brief Allocates internal buffer in GPU memory - * \param sizeBytes_arg: amount of memory to allocate - * */ - DeviceMemory(size_t sizeBytes_arg); - - /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. - * \param ptr_arg: pointer to buffer - * \param sizeBytes_arg: buffer size - * */ - DeviceMemory(void *ptr_arg, size_t sizeBytes_arg); - - /** \brief Copy constructor. Just increments reference counter. */ - DeviceMemory(const DeviceMemory& other_arg); - - /** \brief Assigment operator. Just increments reference counter. */ - DeviceMemory& operator=(const DeviceMemory& other_arg); - - /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. - * \param sizeBytes_arg: buffer size - * */ - void create(size_t sizeBytes_arg); - - /** \brief Decrements reference counter and releases internal buffer if needed. */ - void release(); - - /** \brief Performs data copying. If destination size differs it will be reallocated. 
- * \param other_arg: destination container - * */ - void copyTo(DeviceMemory& other) const; - - /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. - * \param host_ptr_arg: pointer to buffer to upload - * \param sizeBytes_arg: buffer size - * */ - void upload(const void *host_ptr_arg, size_t sizeBytes_arg); - - /** \brief Downloads data from internal buffer to CPU memory - * \param host_ptr_arg: pointer to buffer to download - * */ - void download(void *host_ptr_arg) const; - - /** \brief Performs swap of data pointed with another device memory. - * \param other: device memory to swap with - * */ - void swap(DeviceMemory& other_arg); - - /** \brief Returns pointer for internal buffer in GPU memory. */ - template T* ptr(); - - /** \brief Returns constant pointer for internal buffer in GPU memory. */ - template const T* ptr() const; - - /** \brief Conversion to PtrSz for passing to kernel functions. */ - template operator PtrSz() const; - - /** \brief Returns true if unallocated otherwise false. */ - bool empty() const; - - size_t sizeBytes() const; - - private: - /** \brief Device pointer. */ - void *data_; - - /** \brief Allocated size in bytes. */ - size_t sizeBytes_; - - /** \brief Pointer to reference counter in CPU memory. */ - int* refcount_; - }; - - ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - /** \brief @b DeviceMemory2D class - * - * \note This is a BLOB container class with reference counting for pitched GPU memory. - * - * \author Anatoly Baksheev - */ - - class KF_EXPORTS DeviceMemory2D - { - public: - /** \brief Empty constructor. */ - DeviceMemory2D(); - - /** \brief Destructor. */ - ~DeviceMemory2D(); - - /** \brief Allocates internal buffer in GPU memory - * \param rows_arg: number of rows to allocate - * \param colsBytes_arg: width of the buffer in bytes - * */ - DeviceMemory2D(int rows_arg, int colsBytes_arg); - - - /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. - * \param rows_arg: number of rows - * \param colsBytes_arg: width of the buffer in bytes - * \param data_arg: pointer to buffer - * \param stepBytes_arg: stride between two consecutive rows in bytes - * */ - DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg); - - /** \brief Copy constructor. Just increments reference counter. */ - DeviceMemory2D(const DeviceMemory2D& other_arg); - - /** \brief Assigment operator. Just increments reference counter. */ - DeviceMemory2D& operator=(const DeviceMemory2D& other_arg); - - /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. - * \param ptr_arg: number of rows to allocate - * \param sizeBytes_arg: width of the buffer in bytes - * */ - void create(int rows_arg, int colsBytes_arg); - - /** \brief Decrements reference counter and releases internal buffer if needed. */ - void release(); - - /** \brief Performs data copying. If destination size differs it will be reallocated. - * \param other_arg: destination container - * */ - void copyTo(DeviceMemory2D& other) const; - - /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 
- * \param host_ptr_arg: pointer to host buffer to upload - * \param host_step_arg: stride between two consecutive rows in bytes for host buffer - * \param rows_arg: number of rows to upload - * \param sizeBytes_arg: width of host buffer in bytes - * */ - void upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg); - - /** \brief Downloads data from internal buffer to CPU memory. User is resposible for correct host buffer size. - * \param host_ptr_arg: pointer to host buffer to download - * \param host_step_arg: stride between two consecutive rows in bytes for host buffer - * */ - void download(void *host_ptr_arg, size_t host_step_arg) const; - - /** \brief Performs swap of data pointed with another device memory. - * \param other: device memory to swap with - * */ - void swap(DeviceMemory2D& other_arg); - - /** \brief Returns pointer to given row in internal buffer. - * \param y_arg: row index - * */ - template T* ptr(int y_arg = 0); - - /** \brief Returns constant pointer to given row in internal buffer. - * \param y_arg: row index - * */ - template const T* ptr(int y_arg = 0) const; - - /** \brief Conversion to PtrStep for passing to kernel functions. */ - template operator PtrStep() const; - - /** \brief Conversion to PtrStepSz for passing to kernel functions. */ - template operator PtrStepSz() const; - - /** \brief Returns true if unallocated otherwise false. */ - bool empty() const; - - /** \brief Returns number of bytes in each row. */ - int colsBytes() const; - - /** \brief Returns number of rows. */ - int rows() const; - - /** \brief Returns stride between two consecutive rows in bytes for internal buffer. Step is stored always and everywhere in bytes!!! */ - size_t step() const; - private: - /** \brief Device pointer. */ - void *data_; - - /** \brief Stride between two consecutive rows in bytes for internal buffer. Step is stored always and everywhere in bytes!!! */ - size_t step_; - - /** \brief Width of the buffer in bytes. */ - int colsBytes_; - - /** \brief Number of rows. */ - int rows_; - - /** \brief Pointer to reference counter in CPU memory. 
*/ - int* refcount_; - }; - } - - namespace device - { - using kfusion::cuda::DeviceMemory; - using kfusion::cuda::DeviceMemory2D; - } -} - -///////////////////// Inline implementations of DeviceMemory //////////////////////////////////////////// - -template inline T* kfusion::cuda::DeviceMemory::ptr() { return ( T*)data_; } -template inline const T* kfusion::cuda::DeviceMemory::ptr() const { return (const T*)data_; } - -template inline kfusion::cuda::DeviceMemory::operator kfusion::cuda::PtrSz() const -{ - PtrSz result; - result.data = (U*)ptr(); - result.size = sizeBytes_/sizeof(U); - return result; -} - -///////////////////// Inline implementations of DeviceMemory2D //////////////////////////////////////////// - -template T* kfusion::cuda::DeviceMemory2D::ptr(int y_arg) { return ( T*)(( char*)data_ + y_arg * step_); } -template const T* kfusion::cuda::DeviceMemory2D::ptr(int y_arg) const { return (const T*)((const char*)data_ + y_arg * step_); } - -template kfusion::cuda::DeviceMemory2D::operator kfusion::cuda::PtrStep() const -{ - PtrStep result; - result.data = (U*)ptr(); - result.step = step_; - return result; -} - -template kfusion::cuda::DeviceMemory2D::operator kfusion::cuda::PtrStepSz() const -{ - PtrStepSz result; - result.data = (U*)ptr(); - result.step = step_; - result.cols = colsBytes_/sizeof(U); - result.rows = rows_; - return result; -} diff --git a/modules/dynamicfusion/include/kfusion/cuda/imgproc.hpp b/modules/dynamicfusion/include/kfusion/cuda/imgproc.hpp deleted file mode 100644 index 69f21f2d7d3..00000000000 --- a/modules/dynamicfusion/include/kfusion/cuda/imgproc.hpp +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include - -namespace kfusion -{ - namespace cuda - { - KF_EXPORTS void depthBilateralFilter(const Depth& in, Depth& out, int ksz, float sigma_spatial, float sigma_depth); - - KF_EXPORTS void depthTruncation(Depth& depth, float threshold); - - KF_EXPORTS void depthBuildPyramid(const Depth& depth, Depth& pyramid, float sigma_depth); - - KF_EXPORTS void computeNormalsAndMaskDepth(const Intr& intr, Depth& depth, Normals& normals); - - KF_EXPORTS void computePointNormals(const Intr& intr, const Depth& depth, Cloud& points, Normals& normals); - - KF_EXPORTS void computeDists(const Depth& depth, Dists& dists, const Intr& intr); - - KF_EXPORTS void cloudToDepth(const Cloud& cloud, Depth& depth); - - KF_EXPORTS void resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out); - - KF_EXPORTS void resizePointsNormals(const Cloud& points, const Normals& normals, Cloud& points_out, Normals& normals_out); - - KF_EXPORTS void waitAllDefaultStream(); - - KF_EXPORTS void renderTangentColors(const Normals& normals, Image& image); - - KF_EXPORTS void renderImage(const Depth& depth, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); - - KF_EXPORTS void renderImage(const Cloud& points, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); - } -} diff --git a/modules/dynamicfusion/include/kfusion/cuda/kernel_containers.hpp b/modules/dynamicfusion/include/kfusion/cuda/kernel_containers.hpp deleted file mode 100644 index 485f93bc0e8..00000000000 --- a/modules/dynamicfusion/include/kfusion/cuda/kernel_containers.hpp +++ /dev/null @@ -1,75 +0,0 @@ -#pragma once - -#if defined(__CUDACC__) - #define __kf_hdevice__ __host__ __device__ __forceinline__ - #define __kf_device__ __device__ __forceinline__ -#else - #define __kf_hdevice__ - #define __kf_device__ -#endif - -#include - 
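// [Editor's sketch -- not part of the original patch] How the free functions declared in
// kfusion/cuda/imgproc.hpp above are commonly chained to preprocess a raw depth map before
// ICP and TSDF integration. The helper name and the numeric parameters are illustrative
// assumptions; the types (Depth, Cloud, Normals, Intr) and function signatures are the ones
// declared in this module's headers.
void preprocess_depth(const kfusion::Intr& intr,
                      const kfusion::cuda::Depth& raw,
                      kfusion::cuda::Depth& filtered,
                      kfusion::cuda::Cloud& points,
                      kfusion::cuda::Normals& normals)
{
    // Edge-preserving smoothing of the raw depth map.
    kfusion::cuda::depthBilateralFilter(raw, filtered, /*ksz=*/7,
                                        /*sigma_spatial=*/4.5f, /*sigma_depth=*/0.04f);
    // Discard measurements beyond the working range (metres).
    kfusion::cuda::depthTruncation(filtered, /*threshold=*/4.0f);
    // Back-project to a point cloud and estimate per-pixel normals.
    kfusion::cuda::computePointNormals(intr, filtered, points, normals);
    // Ensure the asynchronous CUDA work has finished before the results are consumed.
    kfusion::cuda::waitAllDefaultStream();
}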
-namespace kfusion -{ - namespace cuda - { - template struct DevPtr - { - typedef T elem_type; - const static size_t elem_size = sizeof(elem_type); - - T* data; - - __kf_hdevice__ DevPtr() : data(0) {} - __kf_hdevice__ DevPtr(T* data_arg) : data(data_arg) {} - - __kf_hdevice__ size_t elemSize() const { return elem_size; } - __kf_hdevice__ operator T*() { return data; } - __kf_hdevice__ operator const T*() const { return data; } - }; - - template struct PtrSz : public DevPtr - { - __kf_hdevice__ PtrSz() : size(0) {} - __kf_hdevice__ PtrSz(T* data_arg, size_t size_arg) : DevPtr(data_arg), size(size_arg) {} - - size_t size; - }; - - template struct PtrStep : public DevPtr - { - __kf_hdevice__ PtrStep() : step(0) {} - __kf_hdevice__ PtrStep(T* data_arg, size_t step_arg) : DevPtr(data_arg), step(step_arg) {} - - /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */ - size_t step; - - __kf_hdevice__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr::data + y * step); } - __kf_hdevice__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr::data + y * step); } - - __kf_hdevice__ T& operator()(int y, int x) { return ptr(y)[x]; } - __kf_hdevice__ const T& operator()(int y, int x) const { return ptr(y)[x]; } - }; - - template struct PtrStepSz : public PtrStep - { - __kf_hdevice__ PtrStepSz() : cols(0), rows(0) {} - __kf_hdevice__ PtrStepSz(int rows_arg, int cols_arg, T* data_arg, size_t step_arg) - : PtrStep(data_arg, step_arg), cols(cols_arg), rows(rows_arg) {} - - int cols; - int rows; - }; - } - - namespace device - { - using kfusion::cuda::PtrSz; - using kfusion::cuda::PtrStep; - using kfusion::cuda::PtrStepSz; - } -} - -namespace kf = kfusion; - diff --git a/modules/dynamicfusion/include/kfusion/cuda/projective_icp.hpp b/modules/dynamicfusion/include/kfusion/cuda/projective_icp.hpp deleted file mode 100644 index 1b71d8c436b..00000000000 --- a/modules/dynamicfusion/include/kfusion/cuda/projective_icp.hpp +++ /dev/null @@ -1,48 +0,0 @@ -#pragma once - -#include - -namespace kfusion -{ - namespace cuda - { - class ProjectiveICP - { - public: - enum { MAX_PYRAMID_LEVELS = 4 }; - - typedef std::vector DepthPyr; - typedef std::vector PointsPyr; - typedef std::vector NormalsPyr; - - ProjectiveICP(); - virtual ~ProjectiveICP(); - - float getDistThreshold() const; - void setDistThreshold(float distance); - - float getAngleThreshold() const; - void setAngleThreshold(float angle); - - void setIterationsNum(const std::vector& iters); - int getUsedLevelsNum() const; - - virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const Frame& curr, const Frame& prev); - - /** The function takes masked depth, i.e. 
it assumes for performance reasons that - * "if depth(y,x) is not zero, then normals(y,x) surely is not qnan" */ - virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const DepthPyr& dcurr, const NormalsPyr ncurr, const DepthPyr dprev, const NormalsPyr nprev); - virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const PointsPyr& vcurr, const NormalsPyr ncurr, const PointsPyr vprev, const NormalsPyr nprev); - - //static Vec3f rodrigues2(const Mat3f& matrix); - private: - std::vector iters_; - float angle_thres_; - float dist_thres_; - DeviceArray2D buffer_; - - struct StreamHelper; - cv::Ptr shelp_; - }; - } -} diff --git a/modules/dynamicfusion/include/kfusion/cuda/tsdf_volume.hpp b/modules/dynamicfusion/include/kfusion/cuda/tsdf_volume.hpp deleted file mode 100644 index ba1dd4c1853..00000000000 --- a/modules/dynamicfusion/include/kfusion/cuda/tsdf_volume.hpp +++ /dev/null @@ -1,102 +0,0 @@ -#pragma once - -#include -#include - -namespace kfusion -{ - class WarpField; - namespace cuda - { - class KF_EXPORTS TsdfVolume - { - public: - TsdfVolume(const cv::Vec3i& dims); - virtual ~TsdfVolume(); - - void create(const Vec3i& dims); - - Vec3i getDims() const; - Vec3f getVoxelSize() const; - - const CudaData data() const; - CudaData data(); - - cv::Mat get_cloud_host() const; - cv::Mat get_normal_host() const; - - cv::Mat* get_cloud_host_ptr() const; - cv::Mat* get_normal_host_ptr() const; - - Vec3f getSize() const; - void setSize(const Vec3f& size); - - float getTruncDist() const; - void setTruncDist(float distance); - - int getMaxWeight() const; - void setMaxWeight(int weight); - - Affine3f getPose() const; - void setPose(const Affine3f& pose); - - float getRaycastStepFactor() const; - void setRaycastStepFactor(float factor); - - float getGradientDeltaFactor() const; - void setGradientDeltaFactor(float factor); - - Vec3i getGridOrigin() const; - void setGridOrigin(const Vec3i& origin); - - std::vector psdf(const std::vector& warped, Dists& depth_img, const Intr& intr); -// float psdf(const std::vector& warped, Dists& dists, const Intr& intr); - float weighting(const std::vector& dist_sqr, int k) const; - void surface_fusion(const WarpField& warp_field, - std::vector warped, - std::vector canonical, - cuda::Depth &depth, - const Affine3f& camera_pose, - const Intr& intr); - - virtual void clear(); - virtual void applyAffine(const Affine3f& affine); - virtual void integrate(const Dists& dists, const Affine3f& camera_pose, const Intr& intr); - virtual void raycast(const Affine3f& camera_pose, const Intr& intr, Depth& depth, Normals& normals); - virtual void raycast(const Affine3f& camera_pose, const Intr& intr, Cloud& points, Normals& normals); - - void swap(CudaData& data); - - DeviceArray fetchCloud(DeviceArray& cloud_buffer) const; - void fetchNormals(const DeviceArray& cloud, DeviceArray& normals) const; - void compute_points(); - void compute_normals(); - - - private: - CudaData data_; -// need to make this smart pointers - cuda::DeviceArray *cloud_buffer; - cuda::DeviceArray *cloud; - cuda::DeviceArray *normal_buffer; - cv::Mat *cloud_host; - cv::Mat *normal_host; - - float trunc_dist_; - float max_weight_; - Vec3i dims_; - Vec3f size_; - Affine3f pose_; - float gradient_delta_factor_; - float raycast_step_factor_; - // TODO: remember to add entry when adding a new node - struct Entry - { - float tsdf_value; - float tsdf_weight; - }; - - std::vector tsdf_entries; - }; - } -} diff --git a/modules/dynamicfusion/include/kfusion/kinfu.hpp 
b/modules/dynamicfusion/include/kfusion/kinfu.hpp deleted file mode 100644 index 3be08e84914..00000000000 --- a/modules/dynamicfusion/include/kfusion/kinfu.hpp +++ /dev/null @@ -1,107 +0,0 @@ -#pragma once - -#include -#include -#include -#include "utils/dual_quaternion.hpp" -#include "utils/quaternion.hpp" -#include -#include -#include - -namespace kfusion -{ - namespace cuda - { - KF_EXPORTS int getCudaEnabledDeviceCount(); - KF_EXPORTS void setDevice(int device); - KF_EXPORTS std::string getDeviceName(int device); - KF_EXPORTS bool checkIfPreFermiGPU(int device); - KF_EXPORTS void printCudaDeviceInfo(int device); - KF_EXPORTS void printShortCudaDeviceInfo(int device); - } - - struct KF_EXPORTS KinFuParams - { - static KinFuParams default_params(); - static KinFuParams default_params_dynamicfusion(); - - int cols; //pixels - int rows; //pixels - - Intr intr; //Camera parameters - - Vec3i volume_dims; //number of voxels - Vec3f volume_size; //meters - Affine3f volume_pose; //meters, inital pose - - float bilateral_sigma_depth; //meters - float bilateral_sigma_spatial; //pixels - int bilateral_kernel_size; //pixels - - float icp_truncate_depth_dist; //meters - float icp_dist_thres; //meters - float icp_angle_thres; //radians - std::vector icp_iter_num; //iterations for level index 0,1,..,3 - - float tsdf_min_camera_movement; //meters, integrate only if exceedes - float tsdf_trunc_dist; //meters; - int tsdf_max_weight; //frames - - float raycast_step_factor; // in voxel sizes - float gradient_delta_factor; // in voxel sizes - - Vec3f light_pose; //meters - - }; - - class KF_EXPORTS KinFu - { - public: - typedef cv::Ptr Ptr; - - KinFu(const KinFuParams& params); - - const KinFuParams& params() const; - KinFuParams& params(); - - const cuda::TsdfVolume& tsdf() const; - cuda::TsdfVolume& tsdf(); - - const cuda::ProjectiveICP& icp() const; - cuda::ProjectiveICP& icp(); - - const WarpField& getWarp() const; - WarpField& getWarp(); - - void reset(); - - bool operator()(const cuda::Depth& depth, const cuda::Image& image = cuda::Image()); - - void renderImage(cuda::Image& image, int flags = 0); - void dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame, cuda::Normals current_normals); - void renderImage(cuda::Image& image, const Affine3f& pose, int flags = 0); - void reprojectToDepth(); - - Affine3f getCameraPose (int time = -1) const; - private: - void allocate_buffers(); - - int frame_counter_; - KinFuParams params_; - - std::vector poses_; - - cuda::Dists dists_; - cuda::Frame curr_, prev_, first_; - - cuda::Cloud points_; - cuda::Normals normals_; - cuda::Depth depths_; - - cv::Ptr volume_; - cv::Ptr icp_; - cv::Ptr warp_; - std::vector, utils::DualQuaternion>> edges; - }; -} diff --git a/modules/dynamicfusion/include/kfusion/optimisation.hpp b/modules/dynamicfusion/include/kfusion/optimisation.hpp deleted file mode 100644 index 7f088d8f7c1..00000000000 --- a/modules/dynamicfusion/include/kfusion/optimisation.hpp +++ /dev/null @@ -1,168 +0,0 @@ -#ifndef KFUSION_OPTIMISATION_H -#define KFUSION_OPTIMISATION_H -#include "ceres/ceres.h" -#include "ceres/rotation.h" -#include - -typedef Eigen::Vector3d Vec3; -struct DynamicFusionDataEnergy -{ - DynamicFusionDataEnergy(cv::Vec3d live_vertex, - cv::Vec3f live_normal, - cv::Vec3f canonical_vertex, - cv::Vec3f canonical_normal, - kfusion::WarpField* warpField) - : live_vertex_(live_vertex), - live_normal_(live_normal), - canonical_vertex_(canonical_vertex), - canonical_normal_(canonical_normal), - warpField_(warpField) {} - template - bool 
operator()(T const * const * epsilon_, T* residuals) const - { - T const * epsilon = epsilon_[0]; - float weights[KNN_NEIGHBOURS]; - warpField_->getWeightsAndUpdateKNN(canonical_vertex_, weights); - auto nodes = warpField_->getNodes(); - T total_quaternion[4] = {T(0), T(0), T(0), T(0)}; - T total_translation[3] = {T(0), T(0), T(0)}; - for(int i = 0; i < KNN_NEIGHBOURS; i++) - { - int ret_index_i = warpField_->ret_index[i]; - auto quat = weights[i] * nodes->at(ret_index_i).transform; - - T eps_r[3] = {epsilon[ret_index_i],epsilon[ret_index_i + 1],epsilon[ret_index_i + 2]}; - T eps_t[3] = {epsilon[ret_index_i + 3],epsilon[ret_index_i + 4],epsilon[ret_index_i + 5]}; - float temp[3]; - auto r_quat = quat.getRotation(); - T r[4] = { T(r_quat.w_), T(r_quat.x_), T(r_quat.y_), T(r_quat.z_)}; - quat.getTranslation(temp[0], temp[1], temp[2]); - - - T eps_quaternion[4]; - ceres::AngleAxisToQuaternion(eps_r, eps_quaternion); - T product[4]; - - ceres::QuaternionProduct(eps_quaternion, r, product); - - total_quaternion[0] += product[0]; - total_quaternion[1] += product[1]; - total_quaternion[2] += product[2]; - total_quaternion[3] += product[3]; - - //probably wrong, should do like in quaternion multiplication. - total_translation[0] += T(temp[0]) + eps_t[0]; - total_translation[1] += T(temp[1]) + eps_t[1]; - total_translation[2] += T(temp[2]) + eps_t[2]; - - } - - - T predicted_x, predicted_y, predicted_z; - T point[3]; - T predicted[3]; - ceres::QuaternionRotatePoint(total_quaternion, point, predicted); - predicted_x = predicted[0] + total_translation[0]; - predicted_y = predicted[1] + total_translation[1]; - predicted_z = predicted[2] + total_translation[2]; - -// T normal[3] = {T(canonical_normal_[0]),T(canonical_normal_[1]),T(canonical_normal_[2])}; -// T result[3]; - // The error is the difference between the predicted and observed position. - residuals[0] = predicted_x - live_vertex_[0]; - residuals[1] = predicted_y - live_vertex_[1]; - residuals[2] = predicted_z - live_vertex_[2]; -// T dotProd = ceres::DotProduct(residuals, normal); -// -// residuals[0] = tukeyPenalty(dotProd); -// result[0] = predicted_x - live_vertex_[0]; -// result[1] = predicted_y - live_vertex_[1]; -// result[2] = predicted_z - live_vertex_[2]; -// T dotProd = ceres::DotProduct(residuals, normal); -// -// residuals[0] = tukeyPenalty(dotProd); - return true; - } - -/** - * Tukey loss function as described in http://web.as.uky.edu/statistics/users/pbreheny/764-F11/notes/12-1.pdf - * \param x - * \param c - * \return - * - * \note - * The value c = 4.685 is usually used for this loss function, and - * it provides an asymptotic efficiency 95% that of linear - * regression for the normal distribution - * - * In the paper, a value of 0.01 is suggested for c - */ - template - T tukeyPenalty(T x, T c = T(0.01)) const - { - return ceres::abs(x) <= c ? x * ceres::pow((T(1.0) - (x * x) / (c * c)), 2) : T(0.0); - } - - // Factory to hide the construction of the CostFunction object from - // the client code. 
-// TODO: this will only have one residual at the end, remember to change -// TODO: find out how to sum residuals - static ceres::CostFunction* Create(const cv::Vec3d live_vertex, - const cv::Vec3d live_normal, - const cv::Vec3f canonical_vertex, - const cv::Vec3f canonical_normal, - kfusion::WarpField* warpField) { - { - auto cost_function = new ceres::DynamicAutoDiffCostFunction( - new DynamicFusionDataEnergy(live_vertex, live_normal, canonical_vertex, canonical_normal, warpField)); - cost_function->AddParameterBlock(warpField->getNodes()->size() * 6); - cost_function->SetNumResiduals(3); - return cost_function; - } - } - const cv::Vec3d live_vertex_; - const cv::Vec3d live_normal_; - const cv::Vec3f canonical_vertex_; - const cv::Vec3f canonical_normal_; - kfusion::WarpField* warpField_; -}; - -class WarpProblem { -public: - WarpProblem(kfusion::WarpField* warp) : warpField_(warp) - { - parameters_ = new double[warpField_->getNodes()->size() * 6]; - mutable_epsilon_ = new double*[KNN_NEIGHBOURS * 6]; - }; - ~WarpProblem() { - delete[] parameters_; - for(int i = 0; i < KNN_NEIGHBOURS * 6; i++) - delete[] mutable_epsilon_[i]; - delete[] mutable_epsilon_; - } - double **mutable_epsilon(int *index_list) - { - for(int i = 0; i < KNN_NEIGHBOURS; i++) - for(int j = 0; j < 6; j++) - mutable_epsilon_[i * 6 + j] = &(parameters_[index_list[i] + j]); - return mutable_epsilon_; - } - double *mutable_params() - { - return parameters_; - } - - -private: - double **mutable_epsilon_; - double *parameters_; - - kfusion::WarpField* warpField_; -}; - -class Optimisation { - -}; - - -#endif //KFUSION_OPTIMISATION_H diff --git a/modules/dynamicfusion/include/kfusion/types.hpp b/modules/dynamicfusion/include/kfusion/types.hpp deleted file mode 100644 index cc3f33f490c..00000000000 --- a/modules/dynamicfusion/include/kfusion/types.hpp +++ /dev/null @@ -1,98 +0,0 @@ -#pragma once - -#include -#include -#include - -struct CUevent_st; - -namespace kfusion -{ - typedef cv::Matx33f Mat3f; - typedef cv::Matx44f Mat4f; - typedef cv::Vec3f Vec3f; - typedef cv::Vec4f Vec4f; - typedef cv::Vec3i Vec3i; - typedef cv::Affine3f Affine3f; - - struct KF_EXPORTS Intr - { - float fx, fy, cx, cy; - - Intr (); - Intr (float fx, float fy, float cx, float cy); - Intr operator()(int level_index) const; - }; - - KF_EXPORTS std::ostream& operator << (std::ostream& os, const Intr& intr); - - struct Point - { - union - { - float data[4]; - struct { float x, y, z; }; - }; - }; - - typedef Point Normal; - - struct RGB - { - union - { - struct { unsigned char b, g, r; }; - int bgra; - }; - }; - - struct PixelRGB - { - unsigned char r, g, b; - }; - - namespace cuda - { - typedef cuda::DeviceMemory CudaData; - typedef cuda::DeviceArray2D Depth; - typedef cuda::DeviceArray2D Dists; - typedef cuda::DeviceArray2D Image; - typedef cuda::DeviceArray2D Normals; - typedef cuda::DeviceArray2D Cloud; - - struct Frame - { - bool use_points; - - std::vector depth_pyr; - std::vector points_pyr; - std::vector normals_pyr; - }; - } - - inline float deg2rad (float alpha) { return alpha * 0.017453293f; } - - struct KF_EXPORTS ScopeTime - { - const char* name; - double start; - ScopeTime(const char *name); - ~ScopeTime(); - }; - - struct KF_EXPORTS SampledScopeTime - { - public: - enum { EACH = 33 }; - SampledScopeTime(double& time_ms); - ~SampledScopeTime(); - private: - double getTime(); - SampledScopeTime(const SampledScopeTime&); - SampledScopeTime& operator=(const SampledScopeTime&); - - double& time_ms_; - double start; - }; - -} diff --git 
a/modules/dynamicfusion/include/kfusion/warp_field.hpp b/modules/dynamicfusion/include/kfusion/warp_field.hpp deleted file mode 100644 index bfd122d829d..00000000000 --- a/modules/dynamicfusion/include/kfusion/warp_field.hpp +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef KFUSION_WARP_FIELD_HPP -#define KFUSION_WARP_FIELD_HPP - -/** - * \brief - * \details - */ -#include -#include -#include -#include -#include -#define KNN_NEIGHBOURS 8 - -namespace kfusion -{ - typedef nanoflann::KDTreeSingleIndexAdaptor< - nanoflann::L2_Simple_Adaptor, - utils::PointCloud, - 3 /* dim */ - > kd_tree_t; - - - // TODO: remember to rewrite this with proper doxygen formatting (e.g rather than _ - /*! - * \struct node - * \brief A node of the warp field - * \details The state of the warp field Wt at time t is defined by the values of a set of n - * deformation nodes Nt_warp = {dg_v, dg_w, dg_se3}_t. Here, this is represented as follows - * - * \var node::index - * Index of the node in the canonical frame. Equivalent to dg_v - * - * \var node::transform - * Transform from canonical point to warped point, equivalent to dg_se in the paper. - * - * \var node::weight - * Equivalent to dg_w - */ - struct deformation_node - { - Vec3f vertex; - kfusion::utils::DualQuaternion transform; - float weight = 0; - }; - class WarpField - { - public: - WarpField(); - ~WarpField(); - - void init(const cv::Mat& first_frame, const cv::Mat& normals); - void init(const std::vector& first_frame, const std::vector& normals); - void energy(const cuda::Cloud &frame, - const cuda::Normals &normals, - const Affine3f &pose, - const cuda::TsdfVolume &tsdfVolume, - const std::vector, - kfusion::utils::DualQuaternion>> &edges - ); - - float energy_data(const std::vector &canonical_vertices, - const std::vector &canonical_normals, - const std::vector &live_vertices, - const std::vector &live_normals); - void energy_reg(const std::vector, - kfusion::utils::DualQuaternion>> &edges); - - float tukeyPenalty(float x, float c = 0.01) const; - - float huberPenalty(float a, float delta) const; - - void warp(std::vector& points, std::vector& normals) const; - void warp(cuda::Cloud& points) const; - - utils::DualQuaternion DQB(const Vec3f& vertex) const; - utils::DualQuaternion DQB(const Vec3f& vertex, double epsilon[KNN_NEIGHBOURS * 6]) const; - - void getWeightsAndUpdateKNN(const Vec3f& vertex, float weights[KNN_NEIGHBOURS]); - - float weighting(float squared_dist, float weight) const; - void KNN(Vec3f point) const; - - void clear(); - - const std::vector* getNodes() const; - const cv::Mat getNodesAsMat() const; - void setWarpToLive(const Affine3f &pose); - - - std::vector out_dist_sqr; - std::vector ret_index; - - private: - std::vector* nodes; - kd_tree_t* index; - nanoflann::KNNResultSet *resultSet; - Affine3f warp_to_live; - void buildKDTree(); - }; -} -#endif //KFUSION_WARP_FIELD_HPP diff --git a/modules/dynamicfusion/include/opencv2/dynamicfusion.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion.hpp new file mode 100644 index 00000000000..72b7cbd6dce --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion.hpp @@ -0,0 +1,5 @@ +#ifndef OPENCV_DYNAMICFUSION_HPP +#define OPENCV_DYNAMICFUSION_HPP +#include +#include +#endif //OPENCV_DYNAMICFUSION_HPP diff --git a/modules/dynamicfusion/include/opencv2/io/capture.hpp b/modules/dynamicfusion/include/opencv2/io/capture.hpp new file mode 100644 index 00000000000..4e6d6b5f8c2 --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/io/capture.hpp @@ -0,0 +1,43 @@ +#pragma once + +#include 
+#include + +namespace cv +{ + namespace kfusion + { + class KF_EXPORTS OpenNISource + { + public: + typedef kfusion::PixelRGB RGB24; + + enum { PROP_OPENNI_REGISTRATION_ON = 104 }; + + OpenNISource(); + OpenNISource(int device); + OpenNISource(const std::string& oni_filename, bool repeat = false); + + void open(int device); + void open(const std::string& oni_filename, bool repeat = false); + void release(); + + ~OpenNISource(); + + bool grab(cv::Mat &depth, cv::Mat &image); + + //parameters taken from camera/oni + int shadow_value, no_sample_value; + float depth_focal_length_VGA; + float baseline; // mm + double pixelSize; // mm + unsigned short max_depth; // mm + + bool setRegistration (bool value = false); + private: + struct Impl; + cv::Ptr impl_; + void getParams (); + }; + } +} diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_array.hpp b/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_array.hpp new file mode 100644 index 00000000000..83b7b6788fc --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_array.hpp @@ -0,0 +1,306 @@ +#pragma once + +#include +#include + +#include + +namespace cv +{ + namespace kfusion + { + namespace cuda + { + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** \brief @b DeviceArray class + * + * \note Typed container for GPU memory with reference counting. + * + * \author Anatoly Baksheev + */ + template + class KF_EXPORTS DeviceArray : public DeviceMemory + { + public: + /** \brief Element type. */ + typedef T type; + + /** \brief Element size. */ + enum { elem_size = sizeof(T) }; + + /** \brief Empty constructor. */ + DeviceArray(); + + /** \brief Allocates internal buffer in GPU memory + * \param size_t: number of elements to allocate + * */ + DeviceArray(size_t size); + + /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. + * \param ptr: pointer to buffer + * \param size: element number + * */ + DeviceArray(T *ptr, size_t size); + + /** \brief Copy constructor. Just increments reference counter. */ + DeviceArray(const DeviceArray& other); + + /** \brief Assigment operator. Just increments reference counter. */ + DeviceArray& operator = (const DeviceArray& other); + + /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. + * \param size: elemens number + * */ + void create(size_t size); + + /** \brief Decrements reference counter and releases internal buffer if needed. */ + void release(); + + /** \brief Performs data copying. If destination size differs it will be reallocated. + * \param other_arg: destination container + * */ + void copyTo(DeviceArray& other) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param host_ptr_arg: pointer to buffer to upload + * \param size: elemens number + * */ + void upload(const T *host_ptr, size_t size); + + /** \brief Downloads data from internal buffer to CPU memory + * \param host_ptr_arg: pointer to buffer to download + * */ + void download(T *host_ptr) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. 
+ * \param data: host vector to upload from + * */ + template + void upload(const std::vector& data); + + /** \brief Downloads data from internal buffer to CPU memory + * \param data: host vector to download to + * */ + template + void download(std::vector& data) const; + + /** \brief Performs swap of data pointed with another device array. + * \param other: device array to swap with + * */ + void swap(DeviceArray& other_arg); + + /** \brief Returns pointer for internal buffer in GPU memory. */ + T* ptr(); + + /** \brief Returns const pointer for internal buffer in GPU memory. */ + const T* ptr() const; + + //using DeviceMemory::ptr; + + /** \brief Returns pointer for internal buffer in GPU memory. */ + operator T*(); + + /** \brief Returns const pointer for internal buffer in GPU memory. */ + operator const T*() const; + + /** \brief Returns size in elements. */ + size_t size() const; + }; + + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** \brief @b DeviceArray2D class + * + * \note Typed container for pitched GPU memory with reference counting. + * + * \author Anatoly Baksheev + */ + template + class KF_EXPORTS DeviceArray2D : public DeviceMemory2D + { + public: + /** \brief Element type. */ + typedef T type; + + /** \brief Element size. */ + enum { elem_size = sizeof(T) }; + + /** \brief Empty constructor. */ + DeviceArray2D(); + + /** \brief Allocates internal buffer in GPU memory + * \param rows: number of rows to allocate + * \param cols: number of elements in each row + * */ + DeviceArray2D(int rows, int cols); + + /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. + * \param rows: number of rows + * \param cols: number of elements in each row + * \param data: pointer to buffer + * \param stepBytes: stride between two consecutive rows in bytes + * */ + DeviceArray2D(int rows, int cols, void *data, size_t stepBytes); + + /** \brief Copy constructor. Just increments reference counter. */ + DeviceArray2D(const DeviceArray2D& other); + + /** \brief Assigment operator. Just increments reference counter. */ + DeviceArray2D& operator = (const DeviceArray2D& other); + + /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. + * \param rows: number of rows to allocate + * \param cols: number of elements in each row + * */ + void create(int rows, int cols); + + /** \brief Decrements reference counter and releases internal buffer if needed. */ + void release(); + + /** \brief Performs data copying. If destination size differs it will be reallocated. + * \param other: destination container + * */ + void copyTo(DeviceArray2D& other) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param host_ptr: pointer to host buffer to upload + * \param host_step: stride between two consecutive rows in bytes for host buffer + * \param rows: number of rows to upload + * \param cols: number of elements in each row + * */ + void upload(const void *host_ptr, size_t host_step, int rows, int cols); + + /** \brief Downloads data from internal buffer to CPU memory. User is resposible for correct host buffer size. 
+ * \param host_ptr: pointer to host buffer to download + * \param host_step: stride between two consecutive rows in bytes for host buffer + * */ + void download(void *host_ptr, size_t host_step) const; + + /** \brief Performs swap of data pointed with another device array. + * \param other: device array to swap with + * */ + void swap(DeviceArray2D& other_arg); + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param data: host vector to upload from + * \param cols: stride in elements between two consecutive rows for host buffer + * */ + template + void upload(const std::vector& data, int cols); + + /** \brief Downloads data from internal buffer to CPU memory + * \param data: host vector to download to + * \param cols: Output stride in elements between two consecutive rows for host vector. + * */ + template + void download(std::vector& data, int& cols) const; + + /** \brief Returns pointer to given row in internal buffer. + * \param y_arg: row index + * */ + T* ptr(int y = 0); + + /** \brief Returns const pointer to given row in internal buffer. + * \param y_arg: row index + * */ + const T* ptr(int y = 0) const; + + //using DeviceMemory2D::ptr; + + /** \brief Returns pointer for internal buffer in GPU memory. */ + operator T*(); + + /** \brief Returns const pointer for internal buffer in GPU memory. */ + operator const T*() const; + + /** \brief Returns number of elements in each row. */ + int cols() const; + + /** \brief Returns number of rows. */ + int rows() const; + + /** \brief Returns step in elements. */ + size_t elem_step() const; + }; +}; + +namespace device +{ + using cv::kfusion::cuda::DeviceArray; + using cv::kfusion::cuda::DeviceArray2D; +} +} +} + +///////////////////// Inline implementations of DeviceArray //////////////////////////////////////////// + +template inline cv::kfusion::cuda::DeviceArray::DeviceArray() {} +template inline cv::kfusion::cuda::DeviceArray::DeviceArray(size_t size) : DeviceMemory(size * elem_size) {} +template inline cv::kfusion::cuda::DeviceArray::DeviceArray(T *ptr, size_t size) : DeviceMemory(ptr, size * elem_size) {} +template inline cv::kfusion::cuda::DeviceArray::DeviceArray(const DeviceArray& other) : DeviceMemory(other) {} +template inline cv::kfusion::cuda::DeviceArray& cv::kfusion::cuda::DeviceArray::operator=(const DeviceArray& other) +{ DeviceMemory::operator=(other); return *this; } + +template inline void cv::kfusion::cuda::DeviceArray::create(size_t size) +{ DeviceMemory::create(size * elem_size); } +template inline void cv::kfusion::cuda::DeviceArray::release() +{ DeviceMemory::release(); } + +template inline void cv::kfusion::cuda::DeviceArray::copyTo(DeviceArray& other) const +{ DeviceMemory::copyTo(other); } +template inline void cv::kfusion::cuda::DeviceArray::upload(const T *host_ptr, size_t size) +{ DeviceMemory::upload(host_ptr, size * elem_size); } +template inline void cv::kfusion::cuda::DeviceArray::download(T *host_ptr) const +{ DeviceMemory::download( host_ptr ); } + +template void cv::kfusion::cuda::DeviceArray::swap(DeviceArray& other_arg) { DeviceMemory::swap(other_arg); } + +template inline cv::kfusion::cuda::DeviceArray::operator T*() { return ptr(); } +template inline cv::kfusion::cuda::DeviceArray::operator const T*() const { return ptr(); } +template inline size_t cv::kfusion::cuda::DeviceArray::size() const { return sizeBytes() / elem_size; } + +template inline T* cv::kfusion::cuda::DeviceArray::ptr() { return 
DeviceMemory::ptr(); } +template inline const T* cv::kfusion::cuda::DeviceArray::ptr() const { return DeviceMemory::ptr(); } + +template template inline void cv::kfusion::cuda::DeviceArray::upload(const std::vector& data) { upload(&data[0], data.size()); } +template template inline void cv::kfusion::cuda::DeviceArray::download(std::vector& data) const { data.resize(size()); if (!data.empty()) download(&data[0]); } + +///////////////////// Inline implementations of DeviceArray2D //////////////////////////////////////////// + +template inline cv::kfusion::cuda::DeviceArray2D::DeviceArray2D() {} +template inline cv::kfusion::cuda::DeviceArray2D::DeviceArray2D(int rows, int cols) : DeviceMemory2D(rows, cols * elem_size) {} +template inline cv::kfusion::cuda::DeviceArray2D::DeviceArray2D(int rows, int cols, void *data, size_t stepBytes) : DeviceMemory2D(rows, cols * elem_size, data, stepBytes) {} +template inline cv::kfusion::cuda::DeviceArray2D::DeviceArray2D(const DeviceArray2D& other) : DeviceMemory2D(other) {} +template inline cv::kfusion::cuda::DeviceArray2D& cv::kfusion::cuda::DeviceArray2D::operator=(const DeviceArray2D& other) +{ DeviceMemory2D::operator=(other); return *this; } + +template inline void cv::kfusion::cuda::DeviceArray2D::create(int rows, int cols) +{ DeviceMemory2D::create(rows, cols * elem_size); } +template inline void cv::kfusion::cuda::DeviceArray2D::release() +{ DeviceMemory2D::release(); } + +template inline void cv::kfusion::cuda::DeviceArray2D::copyTo(DeviceArray2D& other) const +{ DeviceMemory2D::copyTo(other); } +template inline void cv::kfusion::cuda::DeviceArray2D::upload(const void *host_ptr, size_t host_step, int rows, int cols) +{ DeviceMemory2D::upload(host_ptr, host_step, rows, cols * elem_size); } +template inline void cv::kfusion::cuda::DeviceArray2D::download(void *host_ptr, size_t host_step) const +{ DeviceMemory2D::download( host_ptr, host_step ); } + +template template inline void cv::kfusion::cuda::DeviceArray2D::upload(const std::vector& data, int cols) +{ upload(&data[0], cols * elem_size, data.size()/cols, cols); } + +template template inline void cv::kfusion::cuda::DeviceArray2D::download(std::vector& data, int& elem_step) const +{ elem_step = cols(); data.resize(cols() * rows()); if (!data.empty()) download(&data[0], colsBytes()); } + +template void cv::kfusion::cuda::DeviceArray2D::swap(DeviceArray2D& other_arg) { DeviceMemory2D::swap(other_arg); } + +template inline T* cv::kfusion::cuda::DeviceArray2D::ptr(int y) { return DeviceMemory2D::ptr(y); } +template inline const T* cv::kfusion::cuda::DeviceArray2D::ptr(int y) const { return DeviceMemory2D::ptr(y); } + +template inline cv::kfusion::cuda::DeviceArray2D::operator T*() { return ptr(); } +template inline cv::kfusion::cuda::DeviceArray2D::operator const T*() const { return ptr(); } + +template inline int cv::kfusion::cuda::DeviceArray2D::cols() const { return DeviceMemory2D::colsBytes()/elem_size; } +template inline int cv::kfusion::cuda::DeviceArray2D::rows() const { return DeviceMemory2D::rows(); } + +template inline size_t cv::kfusion::cuda::DeviceArray2D::elem_step() const { return DeviceMemory2D::step()/elem_size; } diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_memory.hpp b/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_memory.hpp new file mode 100644 index 00000000000..c7dde9ac190 --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_memory.hpp @@ -0,0 +1,260 @@ +#pragma once + +#include +#include + +namespace cv +{ + 
namespace kfusion + { + namespace cuda + { + /** \brief Error handler. All GPU functions from this subsystem call the function to report an error. For internal use only */ + KF_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func = ""); + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** \brief @b DeviceMemory class + * + * \note This is a BLOB container class with reference counting for GPU memory. + * + * \author Anatoly Baksheev + */ + + class KF_EXPORTS DeviceMemory + { + public: + /** \brief Empty constructor. */ + DeviceMemory(); + + /** \brief Destructor. */ + ~DeviceMemory(); + + /** \brief Allocates internal buffer in GPU memory + * \param sizeBytes_arg: amount of memory to allocate + * */ + DeviceMemory(size_t sizeBytes_arg); + + /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. + * \param ptr_arg: pointer to buffer + * \param sizeBytes_arg: buffer size + * */ + DeviceMemory(void *ptr_arg, size_t sizeBytes_arg); + + /** \brief Copy constructor. Just increments reference counter. */ + DeviceMemory(const DeviceMemory& other_arg); + + /** \brief Assigment operator. Just increments reference counter. */ + DeviceMemory& operator=(const DeviceMemory& other_arg); + + /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. + * \param sizeBytes_arg: buffer size + * */ + void create(size_t sizeBytes_arg); + + /** \brief Decrements reference counter and releases internal buffer if needed. */ + void release(); + + /** \brief Performs data copying. If destination size differs it will be reallocated. + * \param other_arg: destination container + * */ + void copyTo(DeviceMemory& other) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param host_ptr_arg: pointer to buffer to upload + * \param sizeBytes_arg: buffer size + * */ + void upload(const void *host_ptr_arg, size_t sizeBytes_arg); + + /** \brief Downloads data from internal buffer to CPU memory + * \param host_ptr_arg: pointer to buffer to download + * */ + void download(void *host_ptr_arg) const; + + /** \brief Performs swap of data pointed with another device memory. + * \param other: device memory to swap with + * */ + void swap(DeviceMemory& other_arg); + + /** \brief Returns pointer for internal buffer in GPU memory. */ + template T* ptr(); + + /** \brief Returns constant pointer for internal buffer in GPU memory. */ + template const T* ptr() const; + + /** \brief Conversion to PtrSz for passing to kernel functions. */ + template operator PtrSz() const; + + /** \brief Returns true if unallocated otherwise false. */ + bool empty() const; + + size_t sizeBytes() const; + + private: + /** \brief Device pointer. */ + void *data_; + + /** \brief Allocated size in bytes. */ + size_t sizeBytes_; + + /** \brief Pointer to reference counter in CPU memory. */ + int* refcount_; + }; + + ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + /** \brief @b DeviceMemory2D class + * + * \note This is a BLOB container class with reference counting for pitched GPU memory. + * + * \author Anatoly Baksheev + */ + + class KF_EXPORTS DeviceMemory2D + { + public: + /** \brief Empty constructor. 
*/ + DeviceMemory2D(); + + /** \brief Destructor. */ + ~DeviceMemory2D(); + + /** \brief Allocates internal buffer in GPU memory + * \param rows_arg: number of rows to allocate + * \param colsBytes_arg: width of the buffer in bytes + * */ + DeviceMemory2D(int rows_arg, int colsBytes_arg); + + + /** \brief Initializes with user allocated buffer. Reference counting is disabled in this case. + * \param rows_arg: number of rows + * \param colsBytes_arg: width of the buffer in bytes + * \param data_arg: pointer to buffer + * \param stepBytes_arg: stride between two consecutive rows in bytes + * */ + DeviceMemory2D(int rows_arg, int colsBytes_arg, void *data_arg, size_t step_arg); + + /** \brief Copy constructor. Just increments reference counter. */ + DeviceMemory2D(const DeviceMemory2D& other_arg); + + /** \brief Assigment operator. Just increments reference counter. */ + DeviceMemory2D& operator=(const DeviceMemory2D& other_arg); + + /** \brief Allocates internal buffer in GPU memory. If internal buffer was created before the function recreates it with new size. If new and old sizes are equal it does nothing. + * \param ptr_arg: number of rows to allocate + * \param sizeBytes_arg: width of the buffer in bytes + * */ + void create(int rows_arg, int colsBytes_arg); + + /** \brief Decrements reference counter and releases internal buffer if needed. */ + void release(); + + /** \brief Performs data copying. If destination size differs it will be reallocated. + * \param other_arg: destination container + * */ + void copyTo(DeviceMemory2D& other) const; + + /** \brief Uploads data to internal buffer in GPU memory. It calls create() inside to ensure that intenal buffer size is enough. + * \param host_ptr_arg: pointer to host buffer to upload + * \param host_step_arg: stride between two consecutive rows in bytes for host buffer + * \param rows_arg: number of rows to upload + * \param sizeBytes_arg: width of host buffer in bytes + * */ + void upload(const void *host_ptr_arg, size_t host_step_arg, int rows_arg, int colsBytes_arg); + + /** \brief Downloads data from internal buffer to CPU memory. User is resposible for correct host buffer size. + * \param host_ptr_arg: pointer to host buffer to download + * \param host_step_arg: stride between two consecutive rows in bytes for host buffer + * */ + void download(void *host_ptr_arg, size_t host_step_arg) const; + + /** \brief Performs swap of data pointed with another device memory. + * \param other: device memory to swap with + * */ + void swap(DeviceMemory2D& other_arg); + + /** \brief Returns pointer to given row in internal buffer. + * \param y_arg: row index + * */ + template T* ptr(int y_arg = 0); + + /** \brief Returns constant pointer to given row in internal buffer. + * \param y_arg: row index + * */ + template const T* ptr(int y_arg = 0) const; + + /** \brief Conversion to PtrStep for passing to kernel functions. */ + template operator PtrStep() const; + + /** \brief Conversion to PtrStepSz for passing to kernel functions. */ + template operator PtrStepSz() const; + + /** \brief Returns true if unallocated otherwise false. */ + bool empty() const; + + /** \brief Returns number of bytes in each row. */ + int colsBytes() const; + + /** \brief Returns number of rows. */ + int rows() const; + + /** \brief Returns stride between two consecutive rows in bytes for internal buffer. Step is stored always and everywhere in bytes!!! */ + size_t step() const; + private: + /** \brief Device pointer. 
*/ + void *data_; + + /** \brief Stride between two consecutive rows in bytes for internal buffer. Step is stored always and everywhere in bytes!!! */ + size_t step_; + + /** \brief Width of the buffer in bytes. */ + int colsBytes_; + + /** \brief Number of rows. */ + int rows_; + + /** \brief Pointer to reference counter in CPU memory. */ + int* refcount_; + }; + } + + namespace device + { + using cv::kfusion::cuda::DeviceMemory; + using cv::kfusion::cuda::DeviceMemory2D; + } + } +} +///////////////////// Inline implementations of DeviceMemory //////////////////////////////////////////// + +template inline T* cv::kfusion::cuda::DeviceMemory::ptr() { return ( T*)data_; } +template inline const T* cv::kfusion::cuda::DeviceMemory::ptr() const { return (const T*)data_; } + +template inline cv::kfusion::cuda::DeviceMemory::operator cv::kfusion::cuda::PtrSz() const +{ + PtrSz result; + result.data = (U*)ptr(); + result.size = sizeBytes_/sizeof(U); + return result; +} + +///////////////////// Inline implementations of DeviceMemory2D //////////////////////////////////////////// + +template T* cv::kfusion::cuda::DeviceMemory2D::ptr(int y_arg) { return ( T*)(( char*)data_ + y_arg * step_); } +template const T* cv::kfusion::cuda::DeviceMemory2D::ptr(int y_arg) const { return (const T*)((const char*)data_ + y_arg * step_); } + +template cv::kfusion::cuda::DeviceMemory2D::operator cv::kfusion::cuda::PtrStep() const +{ + PtrStep result; + result.data = (U*)ptr(); + result.step = step_; + return result; +} + +template cv::kfusion::cuda::DeviceMemory2D::operator cv::kfusion::cuda::PtrStepSz() const +{ + PtrStepSz result; + result.data = (U*)ptr(); + result.step = step_; + result.cols = colsBytes_/sizeof(U); + result.rows = rows_; + return result; +} diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/imgproc.hpp b/modules/dynamicfusion/include/opencv2/kfusion/cuda/imgproc.hpp new file mode 100644 index 00000000000..7b2f5535ca6 --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/cuda/imgproc.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include + +namespace cv +{ + namespace kfusion + { + namespace cuda + { + KF_EXPORTS void depthBilateralFilter(const Depth& in, Depth& out, int ksz, float sigma_spatial, float sigma_depth); + + KF_EXPORTS void depthTruncation(Depth& depth, float threshold); + + KF_EXPORTS void depthBuildPyramid(const Depth& depth, Depth& pyramid, float sigma_depth); + + KF_EXPORTS void computeNormalsAndMaskDepth(const Intr& intr, Depth& depth, Normals& normals); + + KF_EXPORTS void computePointNormals(const Intr& intr, const Depth& depth, Cloud& points, Normals& normals); + + KF_EXPORTS void computeDists(const Depth& depth, Dists& dists, const Intr& intr); + + KF_EXPORTS void cloudToDepth(const Cloud& cloud, Depth& depth); + + KF_EXPORTS void resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out); + + KF_EXPORTS void resizePointsNormals(const Cloud& points, const Normals& normals, Cloud& points_out, Normals& normals_out); + + KF_EXPORTS void waitAllDefaultStream(); + + KF_EXPORTS void renderTangentColors(const Normals& normals, Image& image); + + KF_EXPORTS void renderImage(const Depth& depth, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); + + KF_EXPORTS void renderImage(const Cloud& points, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); + } + } +} diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/kernel_containers.hpp 
b/modules/dynamicfusion/include/opencv2/kfusion/cuda/kernel_containers.hpp new file mode 100644 index 00000000000..7aa76aedd75 --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/cuda/kernel_containers.hpp @@ -0,0 +1,78 @@ +#pragma once + +#if defined(__CUDACC__) + #define __kf_hdevice__ __host__ __device__ __forceinline__ + #define __kf_device__ __device__ __forceinline__ +#else + #define __kf_hdevice__ + #define __kf_device__ +#endif + +#include + +namespace cv +{ + namespace kfusion + { + namespace cuda + { + template struct DevPtr + { + typedef T elem_type; + const static size_t elem_size = sizeof(elem_type); + + T* data; + + __kf_hdevice__ DevPtr() : data(0) {} + __kf_hdevice__ DevPtr(T* data_arg) : data(data_arg) {} + + __kf_hdevice__ size_t elemSize() const { return elem_size; } + __kf_hdevice__ operator T*() { return data; } + __kf_hdevice__ operator const T*() const { return data; } + }; + + template struct PtrSz : public DevPtr + { + __kf_hdevice__ PtrSz() : size(0) {} + __kf_hdevice__ PtrSz(T* data_arg, size_t size_arg) : DevPtr(data_arg), size(size_arg) {} + + size_t size; + }; + + template struct PtrStep : public DevPtr + { + __kf_hdevice__ PtrStep() : step(0) {} + __kf_hdevice__ PtrStep(T* data_arg, size_t step_arg) : DevPtr(data_arg), step(step_arg) {} + + /** \brief stride between two consecutive rows in bytes. Step is stored always and everywhere in bytes!!! */ + size_t step; + + __kf_hdevice__ T* ptr(int y = 0) { return ( T*)( ( char*)DevPtr::data + y * step); } + __kf_hdevice__ const T* ptr(int y = 0) const { return (const T*)( (const char*)DevPtr::data + y * step); } + + __kf_hdevice__ T& operator()(int y, int x) { return ptr(y)[x]; } + __kf_hdevice__ const T& operator()(int y, int x) const { return ptr(y)[x]; } + }; + + template struct PtrStepSz : public PtrStep + { + __kf_hdevice__ PtrStepSz() : cols(0), rows(0) {} + __kf_hdevice__ PtrStepSz(int rows_arg, int cols_arg, T* data_arg, size_t step_arg) + : PtrStep(data_arg, step_arg), cols(cols_arg), rows(rows_arg) {} + + int cols; + int rows; + }; + } + + namespace device + { + using kfusion::cuda::PtrSz; + using kfusion::cuda::PtrStep; + using kfusion::cuda::PtrStepSz; + } + } +} + +namespace kf = cv::kfusion; + diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/projective_icp.hpp b/modules/dynamicfusion/include/opencv2/kfusion/cuda/projective_icp.hpp new file mode 100644 index 00000000000..33854a5238f --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/cuda/projective_icp.hpp @@ -0,0 +1,51 @@ +#pragma once + +#include + +namespace cv +{ + namespace kfusion + { + namespace cuda + { + class ProjectiveICP + { + public: + enum { MAX_PYRAMID_LEVELS = 4 }; + + typedef std::vector DepthPyr; + typedef std::vector PointsPyr; + typedef std::vector NormalsPyr; + + ProjectiveICP(); + virtual ~ProjectiveICP(); + + float getDistThreshold() const; + void setDistThreshold(float distance); + + float getAngleThreshold() const; + void setAngleThreshold(float angle); + + void setIterationsNum(const std::vector& iters); + int getUsedLevelsNum() const; + + virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const Frame& curr, const Frame& prev); + + /** The function takes masked depth, i.e. 
it assumes for performance reasons that + * "if depth(y,x) is not zero, then normals(y,x) surely is not qnan" */ + virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const DepthPyr& dcurr, const NormalsPyr ncurr, const DepthPyr dprev, const NormalsPyr nprev); + virtual bool estimateTransform(Affine3f& affine, const Intr& intr, const PointsPyr& vcurr, const NormalsPyr ncurr, const PointsPyr vprev, const NormalsPyr nprev); + + //static Vec3f rodrigues2(const Mat3f& matrix); + private: + std::vector iters_; + float angle_thres_; + float dist_thres_; + DeviceArray2D buffer_; + + struct StreamHelper; + cv::Ptr shelp_; + }; + } + } +} diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/tsdf_volume.hpp b/modules/dynamicfusion/include/opencv2/kfusion/cuda/tsdf_volume.hpp new file mode 100644 index 00000000000..8a415c2a2ed --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/cuda/tsdf_volume.hpp @@ -0,0 +1,104 @@ +#pragma once + +#include +#include + +namespace cv +{ + namespace kfusion + { + class WarpField; + namespace cuda + { + class KF_EXPORTS TsdfVolume + { + public: + TsdfVolume(const cv::Vec3i& dims); + virtual ~TsdfVolume(); + + void create(const Vec3i& dims); + + Vec3i getDims() const; + Vec3f getVoxelSize() const; + + const CudaData data() const; + CudaData data(); + + cv::Mat get_cloud_host() const; + cv::Mat get_normal_host() const; + + cv::Mat* get_cloud_host_ptr() const; + cv::Mat* get_normal_host_ptr() const; + + Vec3f getSize() const; + void setSize(const Vec3f& size); + + float getTruncDist() const; + void setTruncDist(float distance); + + int getMaxWeight() const; + void setMaxWeight(int weight); + + Affine3f getPose() const; + void setPose(const Affine3f& pose); + + float getRaycastStepFactor() const; + void setRaycastStepFactor(float factor); + + float getGradientDeltaFactor() const; + void setGradientDeltaFactor(float factor); + + Vec3i getGridOrigin() const; + void setGridOrigin(const Vec3i& origin); + + std::vector psdf(const std::vector& warped, Dists& depth_img, const Intr& intr); +// float psdf(const std::vector& warped, Dists& dists, const Intr& intr); + float weighting(const std::vector& dist_sqr, int k) const; + void surface_fusion(const WarpField& warp_field, + std::vector warped, + std::vector canonical, + cuda::Depth &depth, + const Affine3f& camera_pose, + const Intr& intr); + + virtual void clear(); + virtual void applyAffine(const Affine3f& affine); + virtual void integrate(const Dists& dists, const Affine3f& camera_pose, const Intr& intr); + virtual void raycast(const Affine3f& camera_pose, const Intr& intr, Depth& depth, Normals& normals); + virtual void raycast(const Affine3f& camera_pose, const Intr& intr, Cloud& points, Normals& normals); + + void swap(CudaData& data); + cv::kfusion::cuda::DeviceArray fetchCloud(cv::kfusion::cuda::DeviceArray& cloud_buffer) const; + void fetchNormals(const DeviceArray& cloud, DeviceArray& normals) const; + void compute_points(); + void compute_normals(); + + + private: + CudaData data_; +// need to make this smart pointers + cuda::DeviceArray *cloud_buffer; + cuda::DeviceArray *cloud; + cuda::DeviceArray *normal_buffer; + cv::Mat *cloud_host; + cv::Mat *normal_host; + + float trunc_dist_; + float max_weight_; + Vec3i dims_; + Vec3f size_; + Affine3f pose_; + float gradient_delta_factor_; + float raycast_step_factor_; + // TODO: remember to add entry when adding a new node + struct Entry + { + float tsdf_value; + float tsdf_weight; + }; + + std::vector tsdf_entries; + }; + } + } +} 
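As a hedged sketch of how the volume declared above might be configured before fusion: the numeric values below are illustrative examples, not defaults taken from this patch, and the include path follows the layout this patch introduces.

#include <opencv2/core/affine.hpp>
#include <opencv2/kfusion/cuda/tsdf_volume.hpp>

cv::Ptr<cv::kfusion::cuda::TsdfVolume> make_volume()
{
    cv::Ptr<cv::kfusion::cuda::TsdfVolume> volume(
        new cv::kfusion::cuda::TsdfVolume(cv::Vec3i::all(512)));  // 512^3 voxels
    volume->setSize(cv::Vec3f::all(3.f));        // 3 m cube, placed by the volume pose
    volume->setPose(cv::Affine3f::Identity());
    volume->setTruncDist(0.04f);                 // TSDF truncation band, metres
    volume->setMaxWeight(64);                    // cap on the running-average weight
    volume->setRaycastStepFactor(0.75f);         // both factors are expressed in voxel sizes
    volume->setGradientDeltaFactor(0.5f);
    return volume;
}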
diff --git a/modules/dynamicfusion/include/kfusion/exports.hpp b/modules/dynamicfusion/include/opencv2/kfusion/exports.hpp similarity index 100% rename from modules/dynamicfusion/include/kfusion/exports.hpp rename to modules/dynamicfusion/include/opencv2/kfusion/exports.hpp diff --git a/modules/dynamicfusion/include/opencv2/kfusion/kinfu.hpp b/modules/dynamicfusion/include/opencv2/kfusion/kinfu.hpp new file mode 100644 index 00000000000..bf3cda7438a --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/kinfu.hpp @@ -0,0 +1,113 @@ +#ifndef DYNAMIC_FUSION_KINFU_HPP +#define DYNAMIC_FUSION_KINFU_HPP +//#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cv +{ + namespace kfusion + { + namespace cuda + { + KF_EXPORTS int getCudaEnabledDeviceCount(); + KF_EXPORTS void setDevice(int device); + KF_EXPORTS std::string getDeviceName(int device); + KF_EXPORTS bool checkIfPreFermiGPU(int device); + KF_EXPORTS void printCudaDeviceInfo(int device); + KF_EXPORTS void printShortCudaDeviceInfo(int device); + } + + struct KF_EXPORTS KinFuParams + { + static KinFuParams default_params(); + static KinFuParams default_params_dynamicfusion(); + + int cols; //pixels + int rows; //pixels + + Intr intr; //Camera parameters + + Vec3i volume_dims; //number of voxels + Vec3f volume_size; //meters + Affine3f volume_pose; //meters, inital pose + + float bilateral_sigma_depth; //meters + float bilateral_sigma_spatial; //pixels + int bilateral_kernel_size; //pixels + + float icp_truncate_depth_dist; //meters + float icp_dist_thres; //meters + float icp_angle_thres; //radians + std::vector icp_iter_num; //iterations for level index 0,1,..,3 + + float tsdf_min_camera_movement; //meters, integrate only if exceedes + float tsdf_trunc_dist; //meters; + int tsdf_max_weight; //frames + + float raycast_step_factor; // in voxel sizes + float gradient_delta_factor; // in voxel sizes + + Vec3f light_pose; //meters + + }; + + class KF_EXPORTS KinFu + { + public: + typedef cv::Ptr Ptr; + + KinFu(const KinFuParams& params); + + const KinFuParams& params() const; + KinFuParams& params(); + + const cuda::TsdfVolume& tsdf() const; + cuda::TsdfVolume& tsdf(); + + const cuda::ProjectiveICP& icp() const; + cuda::ProjectiveICP& icp(); + + const WarpField& getWarp() const; + WarpField& getWarp(); + + void reset(); + + bool operator()(const cuda::Depth& depth, const cuda::Image& image = cuda::Image()); + + void renderImage(cuda::Image& image, int flags = 0); + void dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame, cuda::Normals current_normals); + void renderImage(cuda::Image& image, const Affine3f& pose, int flags = 0); + void reprojectToDepth(); + + Affine3f getCameraPose (int time = -1) const; + private: + void allocate_buffers(); + + int frame_counter_; + KinFuParams params_; + + std::vector poses_; + + cuda::Dists dists_; + cuda::Frame curr_, prev_, first_; + + cuda::Cloud points_; + cuda::Normals normals_; + cuda::Depth depths_; + + cv::Ptr volume_; + cv::Ptr icp_; + cv::Ptr warp_; + std::vector, utils::DualQuaternion>> edges; + }; + } +} +#endif \ No newline at end of file diff --git a/modules/dynamicfusion/include/opencv2/kfusion/optimisation.hpp b/modules/dynamicfusion/include/opencv2/kfusion/optimisation.hpp new file mode 100644 index 00000000000..5877dc14e0d --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/optimisation.hpp @@ -0,0 +1,174 @@ +#ifndef KFUSION_OPTIMISATION_H +#define KFUSION_OPTIMISATION_H +#include "ceres/ceres.h" 
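A rough, hedged sketch of the per-frame loop the KinFu interface above implies; the function name and the frame container are illustrative, and the full sample with visualisation lives in samples/demo.cpp further down in this patch.

#include <vector>
#include <opencv2/core.hpp>
#include <opencv2/kfusion/kinfu.hpp>

void track_sequence(const std::vector<cv::Mat>& depth_frames)   // CV_16U depth maps
{
    cv::kfusion::KinFuParams params = cv::kfusion::KinFuParams::default_params_dynamicfusion();
    cv::kfusion::KinFu kinfu(params);

    cv::kfusion::cuda::Depth depth_device;
    cv::kfusion::cuda::Image view_device;

    for (const cv::Mat& depth : depth_frames)
    {
        depth_device.upload(depth.data, depth.step, depth.rows, depth.cols);
        if (kinfu(depth_device))                 // projective ICP + fusion for this frame
            kinfu.renderImage(view_device);      // raycast the fused model from the current pose
    }
}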
+#include "ceres/rotation.h" +#include + +namespace cv +{ + namespace kfusion{ + + typedef Eigen::Vector3d Vec3; + struct DynamicFusionDataEnergy + { + DynamicFusionDataEnergy(cv::Vec3d live_vertex, + cv::Vec3f live_normal, + cv::Vec3f canonical_vertex, + cv::Vec3f canonical_normal, + kfusion::WarpField* warpField) + : live_vertex_(live_vertex), + live_normal_(live_normal), + canonical_vertex_(canonical_vertex), + canonical_normal_(canonical_normal), + warpField_(warpField) {} + template + bool operator()(T const * const * epsilon_, T* residuals) const + { + T const * epsilon = epsilon_[0]; + float weights[KNN_NEIGHBOURS]; + warpField_->getWeightsAndUpdateKNN(canonical_vertex_, weights); + auto nodes = warpField_->getNodes(); + T total_quaternion[4] = {T(0), T(0), T(0), T(0)}; + T total_translation[3] = {T(0), T(0), T(0)}; + for(int i = 0; i < KNN_NEIGHBOURS; i++) + { + int ret_index_i = warpField_->ret_index[i]; + auto quat = weights[i] * nodes->at(ret_index_i).transform; + + T eps_r[3] = {epsilon[ret_index_i],epsilon[ret_index_i + 1],epsilon[ret_index_i + 2]}; + T eps_t[3] = {epsilon[ret_index_i + 3],epsilon[ret_index_i + 4],epsilon[ret_index_i + 5]}; + float temp[3]; + auto r_quat = quat.getRotation(); + T r[4] = { T(r_quat.w_), T(r_quat.x_), T(r_quat.y_), T(r_quat.z_)}; + quat.getTranslation(temp[0], temp[1], temp[2]); + + + T eps_quaternion[4]; + ceres::AngleAxisToQuaternion(eps_r, eps_quaternion); + T product[4]; + + ceres::QuaternionProduct(eps_quaternion, r, product); + + total_quaternion[0] += product[0]; + total_quaternion[1] += product[1]; + total_quaternion[2] += product[2]; + total_quaternion[3] += product[3]; + + //probably wrong, should do like in quaternion multiplication. + total_translation[0] += T(temp[0]) + eps_t[0]; + total_translation[1] += T(temp[1]) + eps_t[1]; + total_translation[2] += T(temp[2]) + eps_t[2]; + + } + + + T predicted_x, predicted_y, predicted_z; + T point[3]; + T predicted[3]; + ceres::QuaternionRotatePoint(total_quaternion, point, predicted); + predicted_x = predicted[0] + total_translation[0]; + predicted_y = predicted[1] + total_translation[1]; + predicted_z = predicted[2] + total_translation[2]; + +// T normal[3] = {T(canonical_normal_[0]),T(canonical_normal_[1]),T(canonical_normal_[2])}; +// T result[3]; + // The error is the difference between the predicted and observed position. + residuals[0] = predicted_x - live_vertex_[0]; + residuals[1] = predicted_y - live_vertex_[1]; + residuals[2] = predicted_z - live_vertex_[2]; +// T dotProd = ceres::DotProduct(residuals, normal); +// +// residuals[0] = tukeyPenalty(dotProd); +// result[0] = predicted_x - live_vertex_[0]; +// result[1] = predicted_y - live_vertex_[1]; +// result[2] = predicted_z - live_vertex_[2]; +// T dotProd = ceres::DotProduct(residuals, normal); +// +// residuals[0] = tukeyPenalty(dotProd); + return true; + } + +/** + * Tukey loss function as described in http://web.as.uky.edu/statistics/users/pbreheny/764-F11/notes/12-1.pdf + * \param x + * \param c + * \return + * + * \note + * The value c = 4.685 is usually used for this loss function, and + * it provides an asymptotic efficiency 95% that of linear + * regression for the normal distribution + * + * In the paper, a value of 0.01 is suggested for c + */ + template + T tukeyPenalty(T x, T c = T(0.01)) const + { + return ceres::abs(x) <= c ? x * ceres::pow((T(1.0) - (x * x) / (c * c)), 2) : T(0.0); + } + + // Factory to hide the construction of the CostFunction object from + // the client code. 
+// TODO: this will only have one residual at the end, remember to change +// TODO: find out how to sum residuals + static ceres::CostFunction* Create(const cv::Vec3d live_vertex, + const cv::Vec3d live_normal, + const cv::Vec3f canonical_vertex, + const cv::Vec3f canonical_normal, + kfusion::WarpField* warpField) { + { + auto cost_function = new ceres::DynamicAutoDiffCostFunction( + new DynamicFusionDataEnergy(live_vertex, live_normal, canonical_vertex, canonical_normal, warpField)); + cost_function->AddParameterBlock(warpField->getNodes()->size() * 6); + cost_function->SetNumResiduals(3); + return cost_function; + } + } + const cv::Vec3d live_vertex_; + const cv::Vec3d live_normal_; + const cv::Vec3f canonical_vertex_; + const cv::Vec3f canonical_normal_; + kfusion::WarpField* warpField_; + }; + + class WarpProblem { + public: + WarpProblem(kfusion::WarpField* warp) : warpField_(warp) + { + parameters_ = new double[warpField_->getNodes()->size() * 6]; + mutable_epsilon_ = new double*[KNN_NEIGHBOURS * 6]; + }; + ~WarpProblem() { + delete[] parameters_; + for(int i = 0; i < KNN_NEIGHBOURS * 6; i++) + delete[] mutable_epsilon_[i]; + delete[] mutable_epsilon_; + } + double **mutable_epsilon(int *index_list) + { + for(int i = 0; i < KNN_NEIGHBOURS; i++) + for(int j = 0; j < 6; j++) + mutable_epsilon_[i * 6 + j] = &(parameters_[index_list[i] + j]); + return mutable_epsilon_; + } + double *mutable_params() + { + return parameters_; + } + + + private: + double **mutable_epsilon_; + double *parameters_; + + kfusion::WarpField* warpField_; + }; + + class Optimisation { + + }; + + + } +} +#endif //KFUSION_OPTIMISATION_H diff --git a/modules/dynamicfusion/include/opencv2/kfusion/types.hpp b/modules/dynamicfusion/include/opencv2/kfusion/types.hpp new file mode 100644 index 00000000000..07b6a12093a --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/types.hpp @@ -0,0 +1,101 @@ +#pragma once + +#include +#include +#include + +struct CUevent_st; + +namespace cv +{ + namespace kfusion + { + typedef cv::Matx33f Mat3f; + typedef cv::Matx44f Mat4f; + typedef cv::Vec3f Vec3f; + typedef cv::Vec4f Vec4f; + typedef cv::Vec3i Vec3i; + typedef cv::Affine3f Affine3f; + + struct KF_EXPORTS Intr + { + float fx, fy, cx, cy; + + Intr (); + Intr (float fx, float fy, float cx, float cy); + Intr operator()(int level_index) const; + }; + + KF_EXPORTS std::ostream& operator << (std::ostream& os, const Intr& intr); + + struct Point + { + union + { + float data[4]; + struct { float x, y, z; }; + }; + }; + + typedef Point Normal; + + struct RGB + { + union + { + struct { unsigned char b, g, r; }; + int bgra; + }; + }; + + struct PixelRGB + { + unsigned char r, g, b; + }; + + namespace cuda + { + typedef cuda::DeviceMemory CudaData; + typedef cuda::DeviceArray2D Depth; + typedef cuda::DeviceArray2D Dists; + typedef cuda::DeviceArray2D Image; + typedef cuda::DeviceArray2D Normals; + typedef cuda::DeviceArray2D Cloud; + + struct Frame + { + bool use_points; + + std::vector depth_pyr; + std::vector points_pyr; + std::vector normals_pyr; + }; + } + + inline float deg2rad (float alpha) { return alpha * 0.017453293f; } + + struct KF_EXPORTS ScopeTime + { + const char* name; + double start; + ScopeTime(const char *name); + ~ScopeTime(); + }; + + struct KF_EXPORTS SampledScopeTime + { + public: + enum { EACH = 33 }; + SampledScopeTime(double& time_ms); + ~SampledScopeTime(); + private: + double getTime(); + SampledScopeTime(const SampledScopeTime&); + SampledScopeTime& operator=(const SampledScopeTime&); + + double& 
time_ms_; + double start; + }; + + } +} diff --git a/modules/dynamicfusion/include/opencv2/kfusion/warp_field.hpp b/modules/dynamicfusion/include/opencv2/kfusion/warp_field.hpp new file mode 100644 index 00000000000..0779953c93f --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/kfusion/warp_field.hpp @@ -0,0 +1,105 @@ +#ifndef KFUSION_WARP_FIELD_HPP +#define KFUSION_WARP_FIELD_HPP + +/** + * \brief + * \details + */ +#include +#include +#include +#include +#include +#define KNN_NEIGHBOURS 8 + +namespace cv +{ + namespace kfusion + { + typedef nanoflann::KDTreeSingleIndexAdaptor< + nanoflann::L2_Simple_Adaptor, + utils::PointCloud, + 3 /* dim */ + > kd_tree_t; + + + // TODO: remember to rewrite this with proper doxygen formatting (e.g rather than _ + /*! + * \struct node + * \brief A node of the warp field + * \details The state of the warp field Wt at time t is defined by the values of a set of n + * deformation nodes Nt_warp = {dg_v, dg_w, dg_se3}_t. Here, this is represented as follows + * + * \var node::index + * Index of the node in the canonical frame. Equivalent to dg_v + * + * \var node::transform + * Transform from canonical point to warped point, equivalent to dg_se in the paper. + * + * \var node::weight + * Equivalent to dg_w + */ + struct deformation_node + { + Vec3f vertex; + kfusion::utils::DualQuaternion transform; + float weight = 0; + }; + class WarpField + { + public: + WarpField(); + ~WarpField(); + + void init(const cv::Mat& first_frame, const cv::Mat& normals); + void init(const std::vector& first_frame, const std::vector& normals); + void energy(const cuda::Cloud &frame, + const cuda::Normals &normals, + const Affine3f &pose, + const cuda::TsdfVolume &tsdfVolume, + const std::vector, + kfusion::utils::DualQuaternion>> &edges + ); + + float energy_data(const std::vector &canonical_vertices, + const std::vector &canonical_normals, + const std::vector &live_vertices, + const std::vector &live_normals); + void energy_reg(const std::vector, + kfusion::utils::DualQuaternion>> &edges); + + float tukeyPenalty(float x, float c = 0.01) const; + + float huberPenalty(float a, float delta) const; + + void warp(std::vector& points, std::vector& normals) const; + void warp(cuda::Cloud& points) const; + + utils::DualQuaternion DQB(const Vec3f& vertex) const; + utils::DualQuaternion DQB(const Vec3f& vertex, double epsilon[KNN_NEIGHBOURS * 6]) const; + + void getWeightsAndUpdateKNN(const Vec3f& vertex, float weights[KNN_NEIGHBOURS]); + + float weighting(float squared_dist, float weight) const; + void KNN(Vec3f point) const; + + void clear(); + + const std::vector* getNodes() const; + const cv::Mat getNodesAsMat() const; + void setWarpToLive(const Affine3f &pose); + + + std::vector out_dist_sqr; + std::vector ret_index; + + private: + std::vector* nodes; + kd_tree_t* index; + nanoflann::KNNResultSet *resultSet; + Affine3f warp_to_live; + void buildKDTree(); + }; + } +} +#endif //KFUSION_WARP_FIELD_HPP diff --git a/modules/dynamicfusion/include/opencv2/utils/dual_quaternion.hpp b/modules/dynamicfusion/include/opencv2/utils/dual_quaternion.hpp new file mode 100644 index 00000000000..83ed53c413a --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/utils/dual_quaternion.hpp @@ -0,0 +1,265 @@ +#ifndef DYNAMIC_FUSION_DUAL_QUATERNION_HPP +#define DYNAMIC_FUSION_DUAL_QUATERNION_HPP +#include +#include + +//Adapted from https://github.com/Poofjunior/QPose +/** + * \brief a dual quaternion class for encoding transformations. 
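A small, hedged sketch of querying the warp field declared above for a single canonical-frame point; the function name is illustrative, and `auto` is used for the blended transform so the sketch does not commit to a particular instantiation.

#include <opencv2/kfusion/warp_field.hpp>

cv::Vec3f warp_one_point(cv::kfusion::WarpField& warp, cv::Vec3f canonical)
{
    float weights[KNN_NEIGHBOURS];
    warp.getWeightsAndUpdateKNN(canonical, weights); // runs the KNN search and fills the blend weights

    auto dq = warp.DQB(canonical);   // dual-quaternion blend of the K nearest deformation nodes
    dq.transform(canonical);         // rotate, then translate, the point in place
    return canonical;
}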
+ * \details transformations are stored as first a translation; then a + * rotation. It is possible to switch the order. See this paper: + * https://www.thinkmind.org/download.php?articleid=intsys_v6_n12_2013_5 + */ +namespace cv { + namespace kfusion { + namespace utils { + static float epsilon() { + return 1e-6; + } + + template + class DualQuaternion { + public: + /** + * \brief default constructor. + */ + DualQuaternion() {}; + + /** + * \brief constructor that takes cartesian coordinates and Euler angles as + * arguments. + */ +// FIXME: always use Rodrigues angles, not Euler + DualQuaternion(T x, T y, T z, T roll, T pitch, T yaw) { + // convert here. + rotation_.w_ = cos(roll / 2) * cos(pitch / 2) * cos(yaw / 2) + + sin(roll / 2) * sin(pitch / 2) * sin(yaw / 2); + rotation_.x_ = sin(roll / 2) * cos(pitch / 2) * cos(yaw / 2) - + cos(roll / 2) * sin(pitch / 2) * sin(yaw / 2); + rotation_.y_ = cos(roll / 2) * sin(pitch / 2) * cos(yaw / 2) + + sin(roll / 2) * cos(pitch / 2) * sin(yaw / 2); + rotation_.z_ = cos(roll / 2) * cos(pitch / 2) * sin(yaw / 2) - + sin(roll / 2) * cos(pitch / 2) * cos(yaw / 2); + + translation_ = 0.5 * Quaternion(0, x, y, z) * rotation_; + } + + /** + * \brief constructor that takes two quaternions as arguments. + * \details The rotation + * quaternion has the conventional encoding for a rotation as a + * quaternion. The translation quaternion is a quaternion with + * cartesian coordinates encoded as (0, x, y, z) + */ + DualQuaternion(Quaternion translation, Quaternion rotation) { + rotation_ = rotation; + translation_ = 0.5 * translation * rotation; + } + + ~DualQuaternion() {}; + + /** + * \brief store a rotation + * \param angle is in radians + */ + void encodeRotation(T angle, T x, T y, T z) { + rotation_.encodeRotation(angle, x, y, z); + } + + /** + * \brief store a rotation + * \param angle is in radians + */ + void encodeRotation(T x, T y, T z) { + rotation_.encodeRotation(sqrt(x * x + y * y + z * z), x, y, z); + } + + void encodeTranslation(T x, T y, T z) { + translation_ = 0.5 * Quaternion(0, x, y, z) * rotation_; + } + + /// handle accumulating error. + void normalize() { + T x, y, z; + getTranslation(x, y, z); + + rotation_.normalize(); + + encodeTranslation(x, y, z); + } + + /** + * \brief a reference-based method for acquiring the latest + * translation data. + */ + void getTranslation(T &x, T &y, T &z) const { + auto rot = rotation_; + rot.normalize(); + Quaternion result = 2 * translation_ * rot.conjugate(); + /// note: inverse of a quaternion is the same as the conjugate. + x = result.x_; + y = result.y_; + z = result.z_; + } + + /** + * \brief a reference-based method for acquiring the latest + * translation data. + */ +//FIXME: need to make sure rotation is normalized in all getTranslation functions. + void getTranslation(Vec3f &vec3f) const { + + auto rot = rotation_; + rot.normalize(); + Quaternion result = 2 * translation_ * rot.conjugate(); + vec3f = Vec3f(result.x_, result.y_, result.z_); + } + + Quaternion getTranslation() const { + return 2 * translation_ * rotation_.conjugate(); + } + + + /** + * \brief a reference-based method for acquiring the latest rotation data. + */ + void getEuler(T &roll, T &pitch, T &yaw) { + // FIXME: breaks for some value around PI. 
+ roll = getRoll(); + pitch = getPitch(); + yaw = getYaw(); + } + + Quaternion getRotation() const { + return rotation_; + } + + + /** + * \brief Extraction everything (in a nice format) + */ + void get6DOF(T &x, T &y, T &z, T &roll, T &pitch, T &yaw) { + getTranslation(x, y, z); + getEuler(roll, pitch, yaw); + } + + DualQuaternion operator+(const DualQuaternion &other) { + DualQuaternion result; + result.rotation_ = rotation_ + other.rotation_; + result.translation_ = translation_ + other.translation_; + return result; + } + + DualQuaternion operator-(const DualQuaternion &other) { + DualQuaternion result; + result.rotation_ = rotation_ - other.rotation_; + result.translation_ = translation_ - other.translation_; + return result; + } + + DualQuaternion operator*(const DualQuaternion &other) { + DualQuaternion result; + result.rotation_ = rotation_ * other.rotation_; + result.translation_ = (rotation_ * other.translation_) + (translation_ * other.rotation_); + return result; + } + + DualQuaternion operator/(const std::pair divisor) { + DualQuaternion result; + result.rotation_ = 1 / divisor.first * rotation_; + result.translation_ = 1 / divisor.second * translation_; + return result; + } + + /// (left) Scalar Multiplication + /** + * \fn template friend Quaternion operator*(const U scalar, + * \brief implements scalar multiplication for arbitrary scalar types + */ + template + friend DualQuaternion operator*(const U scalar, const DualQuaternion &q) { + DualQuaternion result; + result.rotation_ = scalar * q.rotation_; + result.translation_ = scalar * q.translation_; + return result; + } + + DualQuaternion conjugate() { + DualQuaternion result; + result.rotation_ = rotation_.conjugate(); + result.translation_ = translation_.conjugate(); + return result; + } + + inline DualQuaternion identity() { + return DualQuaternion(Quaternion(0, 0, 0, 0), Quaternion(0, 1, 0, 0)); + } + + void transform(Vec3f &point) // TODO: this should be a lot more generic + { + Vec3f translation; + getTranslation(translation); + rotation_.rotate(point); + point += translation; + } + + void from_twist(const float &r0, const float &r1, const float &r2, + const float &x, const float &y, const float &z) { + Vec3f r(r0, r1, r2), t(x, y, z); + float norm = sqrt(r0 * r0 + r1 * r1 + r2 * r2); + Quaternion rotation; + if (norm > epsilon()) { + float cosNorm = cos(norm); + float sign = (cosNorm > 0.f) - (cosNorm < 0.f); + cosNorm *= sign; + float sinNorm_norm = sign * sin(norm) / norm; + rotation = Quaternion(cosNorm, r0 * sinNorm_norm, r1 * sinNorm_norm, r2 * sinNorm_norm); + } else + rotation = Quaternion(); + + translation_ = Quaternion(0, x, y, z); + rotation_ = rotation; + } + + std::pair magnitude() { + DualQuaternion result = (*this) * (*this).conjugate(); + return std::make_pair(result.rotation_.w_, result.translation_.w_); + } + + private: + Quaternion rotation_; + Quaternion translation_; + + T position_[3] = {}; /// default initialize vector to zeros. + + T rotAxis_[3] = {}; /// default initialize vector to zeros. + T rotAngle_; + + + T getRoll() { + // TODO: test this! + return atan2(2 * ((rotation_.w_ * rotation_.x_) + (rotation_.y_ * rotation_.z_)), + (1 - 2 * ((rotation_.x_ * rotation_.x_) + (rotation_.y_ * rotation_.y_)))); + } + + T getPitch() { + // TODO: test this! + return asin(2 * (rotation_.w_ * rotation_.y_ - rotation_.z_ * rotation_.x_)); + } + + T getYaw() { + // TODO: test this! 
+ return atan2(2 * ((rotation_.w_ * rotation_.z_) + (rotation_.x_ * rotation_.y_)), + (1 - 2 * ((rotation_.y_ * rotation_.y_) + (rotation_.z_ * rotation_.z_)))); + } + }; + + template + std::ostream &operator<<(std::ostream &os, const DualQuaternion &q) { + os << "[" << q.getRotation() << ", " << q.getTranslation() << ", " << "]" << std::endl; + return os; + } + } + } +} +#endif //DYNAMIC_FUSION_DUAL_QUATERNION_HPP \ No newline at end of file diff --git a/modules/dynamicfusion/include/opencv2/utils/knn_point_cloud.hpp b/modules/dynamicfusion/include/opencv2/utils/knn_point_cloud.hpp new file mode 100644 index 00000000000..b72c51943b8 --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/utils/knn_point_cloud.hpp @@ -0,0 +1,59 @@ +#ifndef KFUSION_KNN_POINfloat_CLOUD_HPP +#define KFUSION_KNN_POINfloat_CLOUD_HPP + +#include +namespace cv +{ + namespace kfusion + { + namespace utils{ + + // floatODO: Adapt this and nanoflann to work with Quaternions. Probably needs an adaptor class + // Check https://github.com/jlblancoc/nanoflann/blob/master/examples/pointcloud_adaptor_example.cpp + struct PointCloud + { + std::vector pts; + + // Must return the number of data points + inline size_t kdtree_get_point_count() const { return pts.size(); } + + // Returns the distance between the vector "p1[0:size-1]" and the data point with index "idx_p2" stored in the class: + inline float kdtree_distance(const Vec3f p1, const size_t idx_p2,size_t /*size*/) const + { + const float d0=p1[0] - pts[idx_p2][0]; + const float d1=p1[1] - pts[idx_p2][1]; + const float d2=p1[2] - pts[idx_p2][2]; + return d0*d0 + d1*d1 + d2*d2; + } + + // Returns the distance between the vector "p1[0:size-1]" and the data point with index "idx_p2" stored in the class: + inline float kdtree_distance(const float *p1, const size_t idx_p2,size_t /*size*/) const + { + const float d0=p1[0]-pts[idx_p2][0]; + const float d1=p1[1]-pts[idx_p2][1]; + const float d2=p1[2]-pts[idx_p2][2]; + return d0*d0+d1*d1+d2*d2; + } + + // Returns the dim'th component of the idx'th point in the class: + // Since this is inlined and the "dim" argument is typically an immediate value, the + // "if/else's" are actually solved at compile time. + inline float kdtree_get_pt(const size_t idx, int dim) const + { + if (dim==0) return pts[idx][0]; + else if (dim==1) return pts[idx][1]; + else return pts[idx][2]; + } + + // Optional bounding-box computation: return false to default to a standard bbox computation loop. + // Return true if the BBOX was already computed by the class and returned in "bb" so it can be avoided to redo it again. + // Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3 for point clouds) + template + bool kdtree_get_bbox(BBOX& /* bb */) const { return false; } + + }; + + } + } +} +#endif //KFUSION_KNN_POINfloat_CLOUD_HPP diff --git a/modules/dynamicfusion/include/opencv2/utils/quaternion.hpp b/modules/dynamicfusion/include/opencv2/utils/quaternion.hpp new file mode 100644 index 00000000000..75784454ed1 --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/utils/quaternion.hpp @@ -0,0 +1,249 @@ +#ifndef DYNAMIC_FUSION_QUATERNION_HPP +#define DYNAMIC_FUSION_QUATERNION_HPP +#pragma once +#include +#include +//Adapted from https://github.com/Poofjunior/QPose +namespace cv +{ + namespace kfusion{ + namespace utils{ + + + /** + * \class Quaternion + * \brief a templated quaternion class that also enables quick storage and + * retrieval of rotations encoded as a vector3 and angle. + * \details All angles are in radians. 
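A quick, hedged usage sketch for the dual quaternion class above, which is the rigid-transform representation the warp field stores per node: encode a rotation and a translation, apply them to a point, and read the translation back. The function name is illustrative and the include path follows this patch's layout.

#include <opencv2/core.hpp>
#include <opencv2/utils/dual_quaternion.hpp>

void dual_quaternion_demo()
{
    cv::kfusion::utils::DualQuaternion<float> dq;
    dq.encodeRotation(static_cast<float>(CV_PI / 2), 0.f, 0.f, 1.f); // 90 degrees about +z
    dq.encodeTranslation(0.1f, 0.f, 0.f);                            // then a 10 cm shift along x

    cv::Vec3f p(1.f, 0.f, 0.f);
    dq.transform(p);                 // rotate, then translate, the point in place

    float tx, ty, tz;
    dq.getTranslation(tx, ty, tz);   // recover the stored translation
}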
+ * \warning This template is intended to be instantiated with a floating point + * data type. + */ + template class Quaternion + { + public: + Quaternion() : w_(1), x_(0), y_(0), z_(0) + {} + + Quaternion(T w, T x, T y, T z) : w_(w), x_(x), y_(y), z_(z) + {} + + /** + * Encodes rotation from a normal + * @param normal + */ + Quaternion(const Vec3f& normal) + { + Vec3f a(1, 0, 0); + Vec3f b(0, 1, 0); + + Vec3f t0 = normal.cross(a); + + if (t0.dot(t0) < 0.001f) + t0 = normal.cross(b); + t0 = cv::normalize(t0); + + Vec3f t1 = normal.cross(t0); + t1 = cv::normalize(t1); + + cv::Mat3f matrix; + matrix.push_back(t0); + matrix.push_back(t1); + matrix.push_back(normal); + w_ = sqrt(1.0 + matrix.at(0,0) + matrix.at(1,1) + matrix.at(2,2)) / 2.0; +// FIXME: this breaks when w_ = 0; + x_ = (matrix.at(2,1) - matrix.at(1,2)) / (w_ * 4); + y_ = (matrix.at(0,2) - matrix.at(2,0)) / (w_ * 4); + z_ = (matrix.at(1,0) - matrix.at(2,1)) / (w_ * 4); + } + + ~Quaternion() + {} + + + /** + * Quaternion Rotation Properties for straightforward usage of quaternions + * to store rotations. + */ + + /** + * \fn void encodeRotation( T theta, T x, T y, T z) + * \brief Store a normalized rotation in the quaternion encoded as a rotation + * of theta about the vector (x,y,z). + */ + void encodeRotation(T theta, T x, T y, T z) + { + auto sin_half = sin(theta / 2); + w_ = cos(theta / 2); + x_ = x * sin_half; + y_ = y * sin_half; + z_ = z * sin_half; + normalize(); + } + + /** + * \fn void encodeRotation( T theta, T x, T y, T z) + * \brief Store a normalized rotation in the quaternion encoded as a rotation + * of theta about the vector (x,y,z). + */ + void getRodrigues(T& x, T& y, T& z) + { + if(w_ == 1) + { + x = y = z = 0; + return; + } + T half_theta = acos(w_); + T k = sin(half_theta) * tan(half_theta); + x = x_ / k; + y = y_ / k; + z = z_ / k; + } + + + /** + * \fn void rotate( T& x, T& y, T& z) + * \brief rotate a vector3 (x,y,z) by the angle theta about the axis + * (U_x, U_y, U_z) stored in the quaternion. + */ + void rotate(T& x, T& y, T& z) + { + Quaternion q = (*this); + Quaternion qStar = (*this).conjugate(); + Quaternion rotatedVal = q * Quaternion(0, x, y, z) * qStar; + + x = rotatedVal.x_; + y = rotatedVal.y_; + z = rotatedVal.z_; + } + + /** + /** + * \fn void rotate( T& x, T& y, T& z) + * \brief rotate a vector3 (x,y,z) by the angle theta about the axis + * (U_x, U_y, U_z) stored in the quaternion. + */ + void rotate(Vec3f& v) const + { + auto rot= *this; + rot.normalize(); + Vec3f q_vec(rot.x_, rot.y_, rot.z_); + v += (q_vec*2.f).cross( q_vec.cross(v) + v*rot.w_ ); + } + + /** + * Quaternion Mathematical Properties + * implemented below + **/ + + Quaternion operator+(const Quaternion& other) + { + return Quaternion( (w_ + other.w_), + (x_ + other.x_), + (y_ + other.y_), + (z_ + other.z_)); + } + + Quaternion operator-(const Quaternion& other) + { + return Quaternion((w_ - other.w_), + (x_ - other.x_), + (y_ - other.y_), + (z_ - other.z_)); + } + + Quaternion operator-() + { + return Quaternion(-w_, -x_, -y_, -z_); + } + + bool operator==(const Quaternion& other) const + { + return (w_ == other.w_) && (x_ == other.x_) && (y_ == other.y_) && (z_ == other.z_); + } + + /** + * \fn template friend Quaternion operator*(const U scalar, + * const Quaternion& q) + * \brief implements scalar multiplication for arbitrary scalar types. 
+ */ + template friend Quaternion operator*(const U scalar, const Quaternion& other) + { + return Quaternion((scalar * other.w_), + (scalar * other.x_), + (scalar * other.y_), + (scalar * other.z_)); + } + + template friend Quaternion operator/(const Quaternion& q, const U scalar) + { + return (1 / scalar) * q; + } + + /// Quaternion Product + Quaternion operator*(const Quaternion& other) + { + return Quaternion( + ((w_*other.w_) - (x_*other.x_) - (y_*other.y_) - (z_*other.z_)), + ((w_*other.x_) + (x_*other.w_) + (y_*other.z_) - (z_*other.y_)), + ((w_*other.y_) - (x_*other.z_) + (y_*other.w_) + (z_*other.x_)), + ((w_*other.z_) + (x_*other.y_) - (y_*other.x_) + (z_*other.w_)) + ); + } + + /** + * \fn static T dotProduct(Quaternion q1, Quaternion q2) + * \brief returns the dot product of two quaternions. + */ + T dotProduct(Quaternion other) + { + return 0.5 * ((conjugate() * other) + (*this) * other.conjugate()).w_; + } + + /// Conjugate + Quaternion conjugate() const + { + return Quaternion(w_, -x_, -y_, -z_); + } + + T norm() + { + return sqrt((w_ * w_) + (x_ * x_) + (y_ * y_) + (z_ * z_)); + } + + /** + * \fn void normalize() + * \brief normalizes the quaternion to magnitude 1 + */ + void normalize() + { + // should never happen unless the Quaternion wasn't initialized + // correctly. +// CV_Assert( !((w_ == 0) && (x_ == 0) && (y_ == 0) && (z_ == 0))); + T theNorm = norm(); +// CV_Assert(theNorm > 0); + (*this) = (1.0/theNorm) * (*this); + } + + /** + * \fn template friend std::ostream& operator << + * (std::ostream& os, const Quaternion& q); + * \brief a templated friend function for printing quaternions. + * \details T cannot be used as dummy parameter since it would be shared by + * the class, and this function is not a member function. + */ + template friend std::ostream& operator << (std::ostream& os, const Quaternion& q) + { + os << "(" << q.w_ << ", " << q.x_ << ", " << q.y_ << ", " << q.z_ << ")"; + return os; + } + //TODO: shouldn't have Vec3f but rather Vec3. Not sure how to determine this later + + T w_; + T x_; + T y_; + T z_; + }; + } + } +} +#endif // DYNAMIC_FUSION_QUATERNION_HPP \ No newline at end of file diff --git a/modules/dynamicfusion/include/utils/dual_quaternion.hpp b/modules/dynamicfusion/include/utils/dual_quaternion.hpp deleted file mode 100644 index f3dcd5bb097..00000000000 --- a/modules/dynamicfusion/include/utils/dual_quaternion.hpp +++ /dev/null @@ -1,289 +0,0 @@ -#ifndef DYNAMIC_FUSION_DUAL_QUATERNION_HPP -#define DYNAMIC_FUSION_DUAL_QUATERNION_HPP -#include -#include"quaternion.hpp" - -//Adapted from https://github.com/Poofjunior/QPose -/** - * \brief a dual quaternion class for encoding transformations. - * \details transformations are stored as first a translation; then a - * rotation. It is possible to switch the order. See this paper: - * https://www.thinkmind.org/download.php?articleid=intsys_v6_n12_2013_5 - */ -namespace kfusion { - namespace utils { - static float epsilon() - { - return 1e-6; - } - template - class DualQuaternion { - public: - /** - * \brief default constructor. - */ - DualQuaternion(){}; - - /** - * \brief constructor that takes cartesian coordinates and Euler angles as - * arguments. - */ -// FIXME: always use Rodrigues angles, not Euler - DualQuaternion(T x, T y, T z, T roll, T pitch, T yaw) - { - // convert here. 
- rotation_.w_ = cos(roll / 2) * cos(pitch / 2) * cos(yaw / 2) + - sin(roll / 2) * sin(pitch / 2) * sin(yaw / 2); - rotation_.x_ = sin(roll / 2) * cos(pitch / 2) * cos(yaw / 2) - - cos(roll / 2) * sin(pitch / 2) * sin(yaw / 2); - rotation_.y_ = cos(roll / 2) * sin(pitch / 2) * cos(yaw / 2) + - sin(roll / 2) * cos(pitch / 2) * sin(yaw / 2); - rotation_.z_ = cos(roll / 2) * cos(pitch / 2) * sin(yaw / 2) - - sin(roll / 2) * cos(pitch / 2) * cos(yaw / 2); - - translation_ = 0.5 * Quaternion(0, x, y, z) * rotation_; - } - - /** - * \brief constructor that takes two quaternions as arguments. - * \details The rotation - * quaternion has the conventional encoding for a rotation as a - * quaternion. The translation quaternion is a quaternion with - * cartesian coordinates encoded as (0, x, y, z) - */ - DualQuaternion(Quaternion translation, Quaternion rotation) - { - rotation_ = rotation; - translation_ = 0.5 * translation * rotation; - } - - ~DualQuaternion(){}; - - /** - * \brief store a rotation - * \param angle is in radians - */ - void encodeRotation(T angle, T x, T y, T z) - { - rotation_.encodeRotation(angle, x, y, z); - } - /** - * \brief store a rotation - * \param angle is in radians - */ - void encodeRotation(T x, T y, T z) - { - rotation_.encodeRotation(sqrt(x*x+y*y+z*z), x, y, z); - } - - void encodeTranslation(T x, T y, T z) - { - translation_ = 0.5 * Quaternion(0, x, y, z) * rotation_; - } - - /// handle accumulating error. - void normalize() - { - T x, y, z; - getTranslation(x, y, z); - - rotation_.normalize(); - - encodeTranslation(x, y, z); - } - - /** - * \brief a reference-based method for acquiring the latest - * translation data. - */ - void getTranslation(T &x, T &y, T &z) const - { - auto rot = rotation_; - rot.normalize(); - Quaternion result = 2 * translation_ * rot.conjugate(); - /// note: inverse of a quaternion is the same as the conjugate. - x = result.x_; - y = result.y_; - z = result.z_; - } - - /** - * \brief a reference-based method for acquiring the latest - * translation data. - */ -//FIXME: need to make sure rotation is normalized in all getTranslation functions. - void getTranslation(Vec3f& vec3f) const - { - - auto rot = rotation_; - rot.normalize(); - Quaternion result = 2 * translation_ * rot.conjugate(); - vec3f = Vec3f(result.x_, result.y_, result.z_); - } - - Quaternion getTranslation() const - { - return 2 * translation_ * rotation_.conjugate(); - } - - - /** - * \brief a reference-based method for acquiring the latest rotation data. - */ - void getEuler(T &roll, T &pitch, T &yaw) - { - // FIXME: breaks for some value around PI. 
- roll = getRoll(); - pitch = getPitch(); - yaw = getYaw(); - } - - Quaternion getRotation() const - { - return rotation_; - } - - - /** - * \brief Extraction everything (in a nice format) - */ - void get6DOF(T &x, T &y, T &z, T &roll, T &pitch, T &yaw) - { - getTranslation(x, y, z); - getEuler(roll, pitch, yaw); - } - - DualQuaternion operator+(const DualQuaternion &other) - { - DualQuaternion result; - result.rotation_ = rotation_ + other.rotation_; - result.translation_ = translation_ + other.translation_; - return result; - } - - DualQuaternion operator-(const DualQuaternion &other) - { - DualQuaternion result; - result.rotation_ = rotation_ - other.rotation_; - result.translation_ = translation_ - other.translation_; - return result; - } - - DualQuaternion operator*(const DualQuaternion &other) - { - DualQuaternion result; - result.rotation_ = rotation_ * other.rotation_; - result.translation_ = (rotation_ * other.translation_) + (translation_ * other.rotation_); - return result; - } - - DualQuaternion operator/(const std::pair divisor) - { - DualQuaternion result; - result.rotation_ = 1 / divisor.first * rotation_; - result.translation_ = 1 / divisor.second * translation_; - return result; - } - - /// (left) Scalar Multiplication - /** - * \fn template friend Quaternion operator*(const U scalar, - * \brief implements scalar multiplication for arbitrary scalar types - */ - template - friend DualQuaternion operator*(const U scalar, const DualQuaternion &q) - { - DualQuaternion result; - result.rotation_ = scalar * q.rotation_; - result.translation_ = scalar * q.translation_; - return result; - } - - DualQuaternion conjugate() - { - DualQuaternion result; - result.rotation_ = rotation_.conjugate(); - result.translation_ = translation_.conjugate(); - return result; - } - - inline DualQuaternion identity() - { - return DualQuaternion(Quaternion(0, 0, 0, 0),Quaternion(0, 1, 0, 0)); - } - - void transform(Vec3f& point) // TODO: this should be a lot more generic - { - Vec3f translation; - getTranslation(translation); - rotation_.rotate(point); - point += translation; - } - - void from_twist(const float &r0, const float &r1, const float &r2, - const float &x, const float &y, const float &z) - { - Vec3f r(r0,r1,r2), t(x,y,z); - float norm = sqrt(r0*r0 + r1 * r1 + r2 * r2); - Quaternion rotation; - if (norm > epsilon()) - { - float cosNorm = cos(norm); - float sign = (cosNorm > 0.f) - (cosNorm < 0.f); - cosNorm *= sign; - float sinNorm_norm = sign * sin(norm) / norm; - rotation = Quaternion(cosNorm, r0 * sinNorm_norm, r1 * sinNorm_norm, r2 * sinNorm_norm); - } - else - rotation = Quaternion(); - - translation_ = Quaternion(0, x, y, z); - rotation_ = rotation; - } - - std::pair magnitude() - { - DualQuaternion result = (*this) * (*this).conjugate(); - return std::make_pair(result.rotation_.w_, result.translation_.w_); - } - - private: - Quaternion rotation_; - Quaternion translation_; - - T position_[3] = {}; /// default initialize vector to zeros. - - T rotAxis_[3] = {}; /// default initialize vector to zeros. - T rotAngle_; - - - T getRoll() - { - // TODO: test this! - return atan2(2*((rotation_.w_ * rotation_.x_) + (rotation_.y_ * rotation_.z_)), - (1 - 2*((rotation_.x_*rotation_.x_) + (rotation_.y_*rotation_.y_)))); - } - - T getPitch() - { - // TODO: test this! - return asin(2*(rotation_.w_ * rotation_.y_ - rotation_.z_ * rotation_.x_)); - } - - T getYaw() - { - // TODO: test this! 
- return atan2(2*((rotation_.w_ * rotation_.z_) + (rotation_.x_ * rotation_.y_)), - (1 - 2*((rotation_.y_*rotation_.y_) + (rotation_.z_*rotation_.z_)))); - } - }; - - template - std::ostream &operator<<(std::ostream &os, const DualQuaternion &q) - { - os << "[" << q.getRotation() << ", " << q.getTranslation()<< ", " << "]" << std::endl; - return os; - } - } -} -#endif //DYNAMIC_FUSION_DUAL_QUATERNION_HPP \ No newline at end of file diff --git a/modules/dynamicfusion/include/utils/knn_point_cloud.hpp b/modules/dynamicfusion/include/utils/knn_point_cloud.hpp deleted file mode 100644 index fd4034abfc1..00000000000 --- a/modules/dynamicfusion/include/utils/knn_point_cloud.hpp +++ /dev/null @@ -1,56 +0,0 @@ -#ifndef KFUSION_KNN_POINfloat_CLOUD_HPP -#define KFUSION_KNN_POINfloat_CLOUD_HPP - -#include -namespace kfusion -{ - namespace utils{ - - // floatODO: Adapt this and nanoflann to work with Quaternions. Probably needs an adaptor class - // Check https://github.com/jlblancoc/nanoflann/blob/master/examples/pointcloud_adaptor_example.cpp - struct PointCloud - { - std::vector pts; - - // Must return the number of data points - inline size_t kdtree_get_point_count() const { return pts.size(); } - - // Returns the distance between the vector "p1[0:size-1]" and the data point with index "idx_p2" stored in the class: - inline float kdtree_distance(const Vec3f p1, const size_t idx_p2,size_t /*size*/) const - { - const float d0=p1[0] - pts[idx_p2][0]; - const float d1=p1[1] - pts[idx_p2][1]; - const float d2=p1[2] - pts[idx_p2][2]; - return d0*d0 + d1*d1 + d2*d2; - } - - // Returns the distance between the vector "p1[0:size-1]" and the data point with index "idx_p2" stored in the class: - inline float kdtree_distance(const float *p1, const size_t idx_p2,size_t /*size*/) const - { - const float d0=p1[0]-pts[idx_p2][0]; - const float d1=p1[1]-pts[idx_p2][1]; - const float d2=p1[2]-pts[idx_p2][2]; - return d0*d0+d1*d1+d2*d2; - } - - // Returns the dim'th component of the idx'th point in the class: - // Since this is inlined and the "dim" argument is typically an immediate value, the - // "if/else's" are actually solved at compile time. - inline float kdtree_get_pt(const size_t idx, int dim) const - { - if (dim==0) return pts[idx][0]; - else if (dim==1) return pts[idx][1]; - else return pts[idx][2]; - } - - // Optional bounding-box computation: return false to default to a standard bbox computation loop. - // Return true if the BBOX was already computed by the class and returned in "bb" so it can be avoided to redo it again. - // Look at bb.size() to find out the expected dimensionality (e.g. 2 or 3 for point clouds) - template - bool kdtree_get_bbox(BBOX& /* bb */) const { return false; } - - }; - - } -} -#endif //KFUSION_KNN_POINfloat_CLOUD_HPP diff --git a/modules/dynamicfusion/include/utils/quaternion.hpp b/modules/dynamicfusion/include/utils/quaternion.hpp deleted file mode 100644 index dbe922d54fb..00000000000 --- a/modules/dynamicfusion/include/utils/quaternion.hpp +++ /dev/null @@ -1,246 +0,0 @@ -#ifndef DYNAMIC_FUSION_QUATERNION_HPP -#define DYNAMIC_FUSION_QUATERNION_HPP -#pragma once -#include -#include -//Adapted from https://github.com/Poofjunior/QPose -namespace kfusion{ - namespace utils{ - - - /** - * \class Quaternion - * \brief a templated quaternion class that also enables quick storage and - * retrieval of rotations encoded as a vector3 and angle. - * \details All angles are in radians. - * \warning This template is intended to be instantiated with a floating point - * data type. 
- */ - template class Quaternion - { - public: - Quaternion() : w_(1), x_(0), y_(0), z_(0) - {} - - Quaternion(T w, T x, T y, T z) : w_(w), x_(x), y_(y), z_(z) - {} - - /** - * Encodes rotation from a normal - * @param normal - */ - Quaternion(const Vec3f& normal) - { - Vec3f a(1, 0, 0); - Vec3f b(0, 1, 0); - - Vec3f t0 = normal.cross(a); - - if (t0.dot(t0) < 0.001f) - t0 = normal.cross(b); - t0 = cv::normalize(t0); - - Vec3f t1 = normal.cross(t0); - t1 = cv::normalize(t1); - - cv::Mat3f matrix; - matrix.push_back(t0); - matrix.push_back(t1); - matrix.push_back(normal); - w_ = sqrt(1.0 + matrix.at(0,0) + matrix.at(1,1) + matrix.at(2,2)) / 2.0; -// FIXME: this breaks when w_ = 0; - x_ = (matrix.at(2,1) - matrix.at(1,2)) / (w_ * 4); - y_ = (matrix.at(0,2) - matrix.at(2,0)) / (w_ * 4); - z_ = (matrix.at(1,0) - matrix.at(2,1)) / (w_ * 4); - } - - ~Quaternion() - {} - - - /** - * Quaternion Rotation Properties for straightforward usage of quaternions - * to store rotations. - */ - - /** - * \fn void encodeRotation( T theta, T x, T y, T z) - * \brief Store a normalized rotation in the quaternion encoded as a rotation - * of theta about the vector (x,y,z). - */ - void encodeRotation(T theta, T x, T y, T z) - { - auto sin_half = sin(theta / 2); - w_ = cos(theta / 2); - x_ = x * sin_half; - y_ = y * sin_half; - z_ = z * sin_half; - normalize(); - } - - /** - * \fn void encodeRotation( T theta, T x, T y, T z) - * \brief Store a normalized rotation in the quaternion encoded as a rotation - * of theta about the vector (x,y,z). - */ - void getRodrigues(T& x, T& y, T& z) - { - if(w_ == 1) - { - x = y = z = 0; - return; - } - T half_theta = acos(w_); - T k = sin(half_theta) * tan(half_theta); - x = x_ / k; - y = y_ / k; - z = z_ / k; - } - - - /** - * \fn void rotate( T& x, T& y, T& z) - * \brief rotate a vector3 (x,y,z) by the angle theta about the axis - * (U_x, U_y, U_z) stored in the quaternion. - */ - void rotate(T& x, T& y, T& z) - { - Quaternion q = (*this); - Quaternion qStar = (*this).conjugate(); - Quaternion rotatedVal = q * Quaternion(0, x, y, z) * qStar; - - x = rotatedVal.x_; - y = rotatedVal.y_; - z = rotatedVal.z_; - } - - /** - /** - * \fn void rotate( T& x, T& y, T& z) - * \brief rotate a vector3 (x,y,z) by the angle theta about the axis - * (U_x, U_y, U_z) stored in the quaternion. - */ - void rotate(Vec3f& v) const - { - auto rot= *this; - rot.normalize(); - Vec3f q_vec(rot.x_, rot.y_, rot.z_); - v += (q_vec*2.f).cross( q_vec.cross(v) + v*rot.w_ ); - } - - /** - * Quaternion Mathematical Properties - * implemented below - **/ - - Quaternion operator+(const Quaternion& other) - { - return Quaternion( (w_ + other.w_), - (x_ + other.x_), - (y_ + other.y_), - (z_ + other.z_)); - } - - Quaternion operator-(const Quaternion& other) - { - return Quaternion((w_ - other.w_), - (x_ - other.x_), - (y_ - other.y_), - (z_ - other.z_)); - } - - Quaternion operator-() - { - return Quaternion(-w_, -x_, -y_, -z_); - } - - bool operator==(const Quaternion& other) const - { - return (w_ == other.w_) && (x_ == other.x_) && (y_ == other.y_) && (z_ == other.z_); - } - - /** - * \fn template friend Quaternion operator*(const U scalar, - * const Quaternion& q) - * \brief implements scalar multiplication for arbitrary scalar types. 
- */ - template friend Quaternion operator*(const U scalar, const Quaternion& other) - { - return Quaternion((scalar * other.w_), - (scalar * other.x_), - (scalar * other.y_), - (scalar * other.z_)); - } - - template friend Quaternion operator/(const Quaternion& q, const U scalar) - { - return (1 / scalar) * q; - } - - /// Quaternion Product - Quaternion operator*(const Quaternion& other) - { - return Quaternion( - ((w_*other.w_) - (x_*other.x_) - (y_*other.y_) - (z_*other.z_)), - ((w_*other.x_) + (x_*other.w_) + (y_*other.z_) - (z_*other.y_)), - ((w_*other.y_) - (x_*other.z_) + (y_*other.w_) + (z_*other.x_)), - ((w_*other.z_) + (x_*other.y_) - (y_*other.x_) + (z_*other.w_)) - ); - } - - /** - * \fn static T dotProduct(Quaternion q1, Quaternion q2) - * \brief returns the dot product of two quaternions. - */ - T dotProduct(Quaternion other) - { - return 0.5 * ((conjugate() * other) + (*this) * other.conjugate()).w_; - } - - /// Conjugate - Quaternion conjugate() const - { - return Quaternion(w_, -x_, -y_, -z_); - } - - T norm() - { - return sqrt((w_ * w_) + (x_ * x_) + (y_ * y_) + (z_ * z_)); - } - - /** - * \fn void normalize() - * \brief normalizes the quaternion to magnitude 1 - */ - void normalize() - { - // should never happen unless the Quaternion wasn't initialized - // correctly. -// CV_Assert( !((w_ == 0) && (x_ == 0) && (y_ == 0) && (z_ == 0))); - T theNorm = norm(); -// CV_Assert(theNorm > 0); - (*this) = (1.0/theNorm) * (*this); - } - - /** - * \fn template friend std::ostream& operator << - * (std::ostream& os, const Quaternion& q); - * \brief a templated friend function for printing quaternions. - * \details T cannot be used as dummy parameter since it would be shared by - * the class, and this function is not a member function. - */ - template friend std::ostream& operator << (std::ostream& os, const Quaternion& q) - { - os << "(" << q.w_ << ", " << q.x_ << ", " << q.y_ << ", " << q.z_ << ")"; - return os; - } - //TODO: shouldn't have Vec3f but rather Vec3. 
Not sure how to determine this later - - T w_; - T x_; - T y_; - T z_; - }; - } -} -#endif // DYNAMIC_FUSION_QUATERNION_HPP \ No newline at end of file diff --git a/modules/dynamicfusion/samples/CMakeLists.txt b/modules/dynamicfusion/samples/CMakeLists.txt deleted file mode 100644 index 218e6889124..00000000000 --- a/modules/dynamicfusion/samples/CMakeLists.txt +++ /dev/null @@ -1,25 +0,0 @@ -include_directories(${CMAKE_SOURCE_DIR}/kfusion/include) -find_package(Boost REQUIRED COMPONENTS system filesystem program_options) - -add_executable(dynamicfusion demo.cpp) -target_link_libraries(dynamicfusion ${OpenCV_LIBS} ${Boost_LIBRARIES} kfusion) - -set_target_properties(dynamicfusion PROPERTIES - DEBUG_POSTFIX "d" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") -target_compile_options(dynamicfusion PRIVATE "-Wall") - -if(OPENNI_FOUND) - add_executable(dynamicfusion_kinect dynamicfusion_kinect.cpp) - target_link_libraries(dynamicfusion_kinect ${OPENNI_LIBRARY} ${OpenCV_LIBS} ${Boost_LIBRARIES} kfusion) - - set_target_properties(dynamicfusion_kinect PROPERTIES - DEBUG_POSTFIX "d" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") - install(TARGETS dynamicfusion_kinect RUNTIME DESTINATION bin COMPONENT main) -endif() - -install(TARGETS dynamicfusion RUNTIME DESTINATION bin COMPONENT main) -#install(FILES ${srcs} DESTINATION app COMPONENT main) diff --git a/modules/dynamicfusion/samples/demo.cpp b/modules/dynamicfusion/samples/demo.cpp index 6c7dd0cf7b4..30b21d53bb2 100644 --- a/modules/dynamicfusion/samples/demo.cpp +++ b/modules/dynamicfusion/samples/demo.cpp @@ -1,19 +1,16 @@ #include -#include -#include -#include -#include #include #include -using namespace kfusion; +#include +using namespace cv; struct DynamicFusionApp { - static void KeyboardCallback(const cv::viz::KeyboardEvent& event, void* pthis) + static void KeyboardCallback(const viz::KeyboardEvent& event, void* pthis) { DynamicFusionApp& kinfu = *static_cast(pthis); - if(event.action != cv::viz::KeyboardEvent::KEY_DOWN) + if(event.action != viz::KeyboardEvent::KEY_DOWN) return; if(event.code == 't' || event.code == 'T') @@ -25,24 +22,24 @@ struct DynamicFusionApp DynamicFusionApp(std::string dir) : exit_ (false), interactive_mode_(false), pause_(false), directory(true), dir_name(dir) { - KinFuParams params = KinFuParams::default_params_dynamicfusion(); - kinfu_ = KinFu::Ptr( new KinFu(params) ); + kfusion::KinFuParams params = kfusion::KinFuParams::default_params_dynamicfusion(); + kinfu_ = kfusion::KinFu::Ptr( new kfusion::KinFu(params) ); - cv::viz::WCube cube(cv::Vec3d::all(0), cv::Vec3d(params.volume_size), true, cv::viz::Color::apricot()); + viz::WCube cube(Vec3d::all(0), Vec3d(params.volume_size), true, viz::Color::apricot()); viz.showWidget("cube", cube, params.volume_pose); - viz.showWidget("coor", cv::viz::WCoordinateSystem(0.1)); + viz.showWidget("coor", viz::WCoordinateSystem(0.1)); viz.registerKeyboardCallback(KeyboardCallback, this); } - static void show_depth(const cv::Mat& depth) + static void show_depth(const Mat& depth) { - cv::Mat display; + Mat display; depth.convertTo(display, CV_8U, 255.0/4000); - cv::viz::imshow("Depth", display); + viz::imshow("Depth", display); } - void show_raycasted(KinFu& kinfu) + void show_raycasted(kfusion::KinFu& kinfu) { const int mode = 3; if (interactive_mode_) @@ -52,39 +49,36 @@ struct DynamicFusionApp view_host_.create(view_device_.rows(), view_device_.cols(), CV_8UC4); 
view_device_.download(view_host_.ptr(), view_host_.step); - cv::viz::imshow("Scene", view_host_); + viz::imshow("Scene", view_host_); } - void show_warp(KinFu &kinfu) + void show_warp(kfusion::KinFu &kinfu) { - cv::Mat warp_host = kinfu.getWarp().getNodesAsMat(); - viz1.showWidget("warp_field", cv::viz::WCloud(warp_host)); + Mat warp_host = kinfu.getWarp().getNodesAsMat(); + viz1.showWidget("warp_field", viz::WCloud(warp_host)); } bool execute() { - KinFu& dfusion = *kinfu_; - cv::Mat depth, image; + kfusion::KinFu& dfusion = *kinfu_; + Mat depth, image; double time_ms = 0; bool has_image = false; - std::vector depths; // store paths, - std::vector images; // store paths, - - copy(boost::filesystem::directory_iterator(dir_name + "/depth"), boost::filesystem::directory_iterator(), - back_inserter(depths)); - copy(boost::filesystem::directory_iterator(dir_name + "/color"), boost::filesystem::directory_iterator(), - back_inserter(images)); + std::vector depths; // store paths, + std::vector images; // store paths, + glob(dir_name + "/depth", depths); + glob(dir_name + "/color", images); std::sort(depths.begin(), depths.end()); std::sort(images.begin(), images.end()); for (int i = 0; i < depths.size() && !exit_ && !viz.wasStopped(); i++) { - image = cv::imread(images[i].string(), CV_LOAD_IMAGE_COLOR); - depth = cv::imread(depths[i].string(), CV_LOAD_IMAGE_ANYDEPTH); + image = imread(images[i], CV_LOAD_IMAGE_COLOR); + depth = imread(depths[i], CV_LOAD_IMAGE_ANYDEPTH); depth_device_.upload(depth.data, depth.step, depth.rows, depth.cols); { - SampledScopeTime fps(time_ms); + kfusion::SampledScopeTime fps(time_ms); (void) fps; has_image = dfusion(depth_device_); } @@ -93,14 +87,14 @@ struct DynamicFusionApp show_raycasted(dfusion); show_depth(depth); - cv::viz::imshow("Image", image); + viz::imshow("Image", image); if (!interactive_mode_) { viz.setViewerPose(dfusion.getCameraPose()); viz1.setViewerPose(dfusion.getCameraPose()); } - int key = cv::waitKey(pause_ ? 0 : 3); + int key = waitKey(pause_ ? 0 : 3); show_warp(dfusion); switch (key) { case 't': @@ -129,13 +123,13 @@ struct DynamicFusionApp bool pause_ /*= false*/; bool exit_, interactive_mode_, directory; std::string dir_name; - KinFu::Ptr kinfu_; - cv::viz::Viz3d viz; - cv::viz::Viz3d viz1; + kfusion::KinFu::Ptr kinfu_; + viz::Viz3d viz; + viz::Viz3d viz1; - cv::Mat view_host_; - cuda::Image view_device_; - cuda::Depth depth_device_; + Mat view_host_; + kfusion::cuda::Image view_device_; + kfusion::cuda::Depth depth_device_; }; @@ -146,15 +140,14 @@ struct DynamicFusionApp int main (int argc, char* argv[]) { int device = 0; - cuda::setDevice (device); - cuda::printShortCudaDeviceInfo (device); + kfusion::cuda::setDevice (device); + kfusion::cuda::printShortCudaDeviceInfo(device); - if(cuda::checkIfPreFermiGPU(device)) + if(kfusion::cuda::checkIfPreFermiGPU(device)) return std::cout << std::endl << "Kinfu is not supported for pre-Fermi GPU architectures, and not built for them by default. Exiting..." 
<< std::endl, -1; DynamicFusionApp *app; - if(boost::filesystem::is_directory(argv[1])) - app = new DynamicFusionApp(argv[1]); + app = new DynamicFusionApp(argv[1]); // executing try { app->execute (); } diff --git a/modules/dynamicfusion/samples/dynamicfusion_kinect.cpp b/modules/dynamicfusion/samples/dynamicfusion_kinect.cpp deleted file mode 100644 index e32c0dc5273..00000000000 --- a/modules/dynamicfusion/samples/dynamicfusion_kinect.cpp +++ /dev/null @@ -1,158 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -using namespace kfusion; - -struct KinFuApp -{ - static void KeyboardCallback(const cv::viz::KeyboardEvent& event, void* pthis) - { - KinFuApp& kinfu = *static_cast(pthis); - - if(event.action != cv::viz::KeyboardEvent::KEY_DOWN) - return; - - if(event.code == 't' || event.code == 'T') - kinfu.take_cloud(*kinfu.kinfu_); - - if(event.code == 'i' || event.code == 'I') - kinfu.interactive_mode_ = !kinfu.interactive_mode_; - } - - KinFuApp(OpenNISource& source) : exit_ (false), interactive_mode_(false), capture_ (source), pause_(false) - { - KinFuParams params = KinFuParams::default_params(); - kinfu_ = KinFu::Ptr( new KinFu(params) ); - - capture_.setRegistration(true); - - cv::viz::WCube cube(cv::Vec3d::all(0), cv::Vec3d(params.volume_size), true, cv::viz::Color::apricot()); - viz.showWidget("cube", cube, params.volume_pose); - viz.showWidget("coor", cv::viz::WCoordinateSystem(0.1)); - viz.registerKeyboardCallback(KeyboardCallback, this); - } - - static void show_depth(const cv::Mat& depth) - { - cv::Mat display; - //cv::normalize(depth, display, 0, 255, cv::NORM_MINMAX, cv::CV_8U); - depth.convertTo(display, cv::CV_8U, 255.0/4000); - cv::imshow("Depth", display); - } - - void show_raycasted(KinFu& kinfu) - { - const int mode = 3; - if (interactive_mode_) - kinfu.renderImage(view_device_, viz.getViewerPose(), mode); - else - kinfu.renderImage(view_device_, mode); - - view_host_.create(view_device_.rows(), view_device_.cols(), cv::CV_8UC4); - view_device_.download(view_host_.ptr(), view_host_.step); - cv::imshow("Scene", view_host_); - } - - void take_cloud(KinFu& kinfu) - { -// cv::Mat cloud_host = kinfu.tsdf().get_cloud_host(); - cv::Mat normal_host = kinfu.tsdf().get_normal_host(); - cv::Mat warp_host = kinfu.getWarp().getNodesAsMat(); - -// viz.showWidget("cloud", cv::viz::WCloud(cloud_host)); -// viz.showWidget("cloud_normals", cv::viz::WCloudNormals(cloud_host, normal_host, 64, 0.05, cv::viz::Color::blue())); - viz1.showWidget("warp_field", cv::viz::WCloud(warp_host)); - } - - bool execute() - { - KinFu& kinfu = *kinfu_; - cv::Mat depth, image; - double time_ms = 0; - bool has_image = false; - for (int i = 0; !exit_ && !viz.wasStopped(); ++i) - { - bool has_frame = capture_.grab(depth, image); - if (!has_frame) - return std::cout << "Can't grab" << std::endl, false; - depth_device_.upload(depth.data, depth.step, depth.rows, depth.cols); - - { - SampledScopeTime fps(time_ms); (void)fps; - has_image = kinfu(depth_device_); - } - - if (has_image) - show_raycasted(kinfu); - - show_depth(depth); - cv::imshow("Image", image); - - if (!interactive_mode_) - { - viz.setViewerPose(kinfu.getCameraPose()); - viz1.setViewerPose(kinfu.getCameraPose()); - } - - int key = cv::waitKey(pause_ ? 
0 : 3); - take_cloud(kinfu); - switch(key) - { - case 't': case 'T' : take_cloud(kinfu); break; - case 'i': case 'I' : interactive_mode_ = !interactive_mode_; break; - case 27: exit_ = true; break; - case 32: pause_ = !pause_; break; - } - - //exit_ = exit_ || i > 100; - viz.spinOnce(3, true); - viz1.spinOnce(3, true); - } - return true; - } - - bool pause_ /*= false*/; - bool exit_, interactive_mode_; - KinFu::Ptr kinfu_; - cv::viz::Viz3d viz; - cv::viz::Viz3d viz1; - - cv::Mat view_host_; - cuda::Image view_device_; - cuda::Depth depth_device_; - OpenNISource& capture_; - -}; - - -/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - -int main (int argc, char* argv[]) -{ - int device = 0; - cuda::setDevice (device); - cuda::printShortCudaDeviceInfo (device); - - if(cuda::checkIfPreFermiGPU(device)) - return std::cout << std::endl << "Kinfu is not supported for pre-Fermi GPU architectures, and not built for them by default. Exiting..." << std::endl, -1; - - KinFuApp *app; - - OpenNISource capture; - capture.open(argv[1]); - app = new KinFuApp(capture); - - // executing - try { app->execute (); } - catch (const std::bad_alloc& /*e*/) { std::cout << "Bad alloc" << std::endl; } - catch (const std::exception& /*e*/) { std::cout << "Exception" << std::endl; } - - delete app; - return 0; -} diff --git a/modules/dynamicfusion/src/capture.cpp b/modules/dynamicfusion/src/capture.cpp index 070f3910095..b350d21723b 100644 --- a/modules/dynamicfusion/src/capture.cpp +++ b/modules/dynamicfusion/src/capture.cpp @@ -6,7 +6,7 @@ using namespace std; using namespace xn; - +using namespace cv; //const std::string XMLConfig = //"" // "" diff --git a/modules/dynamicfusion/src/core.cpp b/modules/dynamicfusion/src/core.cpp index a9fe78521fe..5a1a568cd22 100644 --- a/modules/dynamicfusion/src/core.cpp +++ b/modules/dynamicfusion/src/core.cpp @@ -1,10 +1,10 @@ -#include +#include #include "safe_call.hpp" #include #include #include - +using namespace cv; int kf::cuda::getCudaEnabledDeviceCount() { int count; diff --git a/modules/dynamicfusion/src/cuda/temp_utils.hpp b/modules/dynamicfusion/src/cuda/temp_utils.hpp index 993c3a25524..3709b658e7f 100644 --- a/modules/dynamicfusion/src/cuda/temp_utils.hpp +++ b/modules/dynamicfusion/src/cuda/temp_utils.hpp @@ -1,7 +1,7 @@ #pragma once #include "cuda.h" -#include +#include namespace kfusion { diff --git a/modules/dynamicfusion/src/cuda/texture_binder.hpp b/modules/dynamicfusion/src/cuda/texture_binder.hpp index 10258c12026..0d3702199ef 100644 --- a/modules/dynamicfusion/src/cuda/texture_binder.hpp +++ b/modules/dynamicfusion/src/cuda/texture_binder.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include #include namespace kfusion diff --git a/modules/dynamicfusion/src/device_memory.cpp b/modules/dynamicfusion/src/device_memory.cpp index 6a7f4565f2f..84e30fb3f91 100644 --- a/modules/dynamicfusion/src/device_memory.cpp +++ b/modules/dynamicfusion/src/device_memory.cpp @@ -1,9 +1,9 @@ -#include +#include #include "safe_call.hpp" #include #include #include - +using namespace cv; void kfusion::cuda::error(const char *error_string, const char *file, const int line, const char *func) { std::cout << "KinFu2 error: " << error_string << "\t" << file << ":" << line << std::endl; diff --git a/modules/dynamicfusion/src/imgproc.cpp b/modules/dynamicfusion/src/imgproc.cpp index 7f70126600a..d5fedb60aa0 100644 --- a/modules/dynamicfusion/src/imgproc.cpp +++ b/modules/dynamicfusion/src/imgproc.cpp @@ -1,5 +1,6 @@ #include 
"precomp.hpp" -#include +#include +using namespace cv; /** * * \param in diff --git a/modules/dynamicfusion/src/internal.hpp b/modules/dynamicfusion/src/internal.hpp index ceffc128d6a..f964ea8e6a5 100644 --- a/modules/dynamicfusion/src/internal.hpp +++ b/modules/dynamicfusion/src/internal.hpp @@ -1,148 +1,218 @@ #pragma once -#include +#include #include "safe_call.hpp" //#define USE_DEPTH +namespace cv { + namespace kfusion { + namespace device { + typedef float4 Normal; + typedef float4 Point; + + typedef unsigned short ushort; + typedef unsigned char uchar; + + typedef PtrStepSz Dists; + typedef DeviceArray2D Depth; + typedef DeviceArray2D Normals; + typedef DeviceArray2D Points; + typedef DeviceArray2D Image; + + typedef int3 Vec3i; + typedef float3 Vec3f; + struct Mat3f { + float3 data[3]; + }; + struct Aff3f { + Mat3f R; + Vec3f t; + }; + + struct TsdfVolume { + public: + typedef ushort2 elem_type; + + elem_type *const data; + const int3 dims; + const float3 voxel_size; + const float trunc_dist; + const int max_weight; + + TsdfVolume(elem_type *data, int3 dims, float3 voxel_size, float trunc_dist, int max_weight); + //TsdfVolume(const TsdfVolume&); + + __kf_device__ elem_type + * + + operator()(int x, int y, int z); + + __kf_device__ const elem_type + * + + operator()(int x, int y, int z) const; + + __kf_device__ elem_type + * + + beg(int x, int y) const; + + __kf_device__ elem_type + * + zstep(elem_type + *const ptr) const; + private: + TsdfVolume &operator=(const TsdfVolume &); + }; -namespace kfusion -{ - namespace device - { - typedef float4 Normal; - typedef float4 Point; - - typedef unsigned short ushort; - typedef unsigned char uchar; - - typedef PtrStepSz Dists; - typedef DeviceArray2D Depth; - typedef DeviceArray2D Normals; - typedef DeviceArray2D Points; - typedef DeviceArray2D Image; - - typedef int3 Vec3i; - typedef float3 Vec3f; - struct Mat3f { float3 data[3]; }; - struct Aff3f { Mat3f R; Vec3f t; }; - - struct TsdfVolume - { - public: - typedef ushort2 elem_type; - - elem_type *const data; - const int3 dims; - const float3 voxel_size; - const float trunc_dist; - const int max_weight; - - TsdfVolume(elem_type* data, int3 dims, float3 voxel_size, float trunc_dist, int max_weight); - //TsdfVolume(const TsdfVolume&); - - __kf_device__ elem_type* operator()(int x, int y, int z); - __kf_device__ const elem_type* operator() (int x, int y, int z) const ; - __kf_device__ elem_type* beg(int x, int y) const; - __kf_device__ elem_type* zstep(elem_type *const ptr) const; - private: - TsdfVolume& operator=(const TsdfVolume&); - }; - - struct Projector - { - float2 f, c; - Projector(){} - Projector(float fx, float fy, float cx, float cy); - __kf_device__ float2 operator()(const float3& p) const; - }; - - struct Reprojector - { - Reprojector() {} - Reprojector(float fx, float fy, float cx, float cy); - float2 finv, c; - __kf_device__ float3 operator()(int x, int y, float z) const; - }; - - struct ComputeIcpHelper - { - struct Policy; - struct PageLockHelper - { - float* data; - PageLockHelper(); - ~PageLockHelper(); + struct Projector { + float2 f, c; + + Projector() {} + + Projector(float fx, float fy, float cx, float cy); + + __kf_device__ float2 + + operator()(const float3 &p) const; }; - float min_cosine; - float dist2_thres; + struct Reprojector { + Reprojector() {} - Aff3f aff; + Reprojector(float fx, float fy, float cx, float cy); + + float2 finv, c; + __kf_device__ float3 + + operator()(int x, int y, float z) const; + }; - float rows, cols; - float2 f, c, finv; + struct 
ComputeIcpHelper { + struct Policy; - PtrStep dcurr; - PtrStep ncurr; - PtrStep vcurr; + struct PageLockHelper { + float *data; - ComputeIcpHelper(float dist_thres, float angle_thres); - void setLevelIntr(int level_index, float fx, float fy, float cx, float cy); + PageLockHelper(); - void operator()(const Depth& dprev, const Normals& nprev, DeviceArray2D& buffer, float* data, cudaStream_t stream); - void operator()(const Points& vprev, const Normals& nprev, DeviceArray2D& buffer, float* data, cudaStream_t stream); + ~PageLockHelper(); + }; - static void allocate_buffer(DeviceArray2D& buffer, int partials_count = -1); + float min_cosine; + float dist2_thres; - //private: - __kf_device__ int find_coresp(int x, int y, float3& n, float3& d, float3& s) const; - __kf_device__ void partial_reduce(const float row[7], PtrStep& partial_buffer) const; - __kf_device__ float2 proj(const float3& p) const; - __kf_device__ float3 reproj(float x, float y, float z) const; - }; + Aff3f aff; - //tsdf volume functions - void clear_volume(TsdfVolume volume); - void integrate(const Dists& depth, TsdfVolume& volume, const Aff3f& aff, const Projector& proj); - void project(const Dists& depth, Points& vertices, const Projector& proj); - void project_and_remove(PtrStepSz& dists, Points &vertices, const Projector &proj); - void project_and_remove(const PtrStepSz& dists, Points &vertices, const Projector &proj); + float rows, cols; + float2 f, c, finv; - void raycast(const TsdfVolume& volume, const Aff3f& aff, const Mat3f& Rinv, - const Reprojector& reproj, Depth& depth, Normals& normals, float step_factor, float delta_factor); + PtrStep dcurr; + PtrStep ncurr; + PtrStep vcurr; - void raycast(const TsdfVolume& volume, const Aff3f& aff, const Mat3f& Rinv, - const Reprojector& reproj, Points& points, Normals& normals, float step_factor, float delta_factor); + ComputeIcpHelper(float dist_thres, float angle_thres); - __kf_device__ ushort2 pack_tsdf(float tsdf, int weight); - __kf_device__ float unpack_tsdf(ushort2 value, int& weight); - __kf_device__ float unpack_tsdf(ushort2 value); + void setLevelIntr(int level_index, float fx, float fy, float cx, float cy); + void operator()(const Depth &dprev, const Normals &nprev, DeviceArray2D &buffer, float *data, + cudaStream_t stream); - //image proc functions - void compute_dists(const Depth& depth, Dists dists, float2 f, float2 c); - void cloud_to_depth(const Points& cloud, Depth depth); + void operator()(const Points &vprev, const Normals &nprev, DeviceArray2D &buffer, float *data, + cudaStream_t stream); - void truncateDepth(Depth& depth, float max_dist /*meters*/); - void bilateralFilter(const Depth& src, Depth& dst, int kernel_size, float sigma_spatial, float sigma_depth); - void depthPyr(const Depth& source, Depth& pyramid, float sigma_depth); + static void allocate_buffer(DeviceArray2D &buffer, int partials_count = -1); - void resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out); - void resizePointsNormals(const Points& points, const Normals& normals, Points& points_out, Normals& normals_out); + //private: + __kf_device__ int find_coresp(int x, int y, float3 &n, float3 &d, float3 &s) const; + + __kf_device__ void partial_reduce(const float row[7], PtrStep &partial_buffer) const; + + __kf_device__ float2 + + proj(const float3 &p) const; + + __kf_device__ float3 + + reproj(float x, float y, float z) const; + }; - void computeNormalsAndMaskDepth(const Reprojector& reproj, Depth& depth, Normals& normals); - void 
computePointNormals(const Reprojector& reproj, const Depth& depth, Points& points, Normals& normals); + //tsdf volume functions + void clear_volume(TsdfVolume volume); - void renderImage(const Depth& depth, const Normals& normals, const Reprojector& reproj, const Vec3f& light_pose, Image& image); - void renderImage(const Points& points, const Normals& normals, const Reprojector& reproj, const Vec3f& light_pose, Image& image); - void renderTangentColors(const Normals& normals, Image& image); + void integrate(const Dists &depth, TsdfVolume &volume, const Aff3f &aff, const Projector &proj); + void project(const Dists &depth, Points &vertices, const Projector &proj); - //exctraction functionality - size_t extractCloud(const TsdfVolume& volume, const Aff3f& aff, PtrSz output); - void extractNormals(const TsdfVolume& volume, const PtrSz& points, const Aff3f& aff, const Mat3f& Rinv, float gradient_delta_factor, float4* output); + void project_and_remove(PtrStepSz &dists, Points &vertices, const Projector &proj); + + void project_and_remove(const PtrStepSz &dists, Points &vertices, const Projector &proj); + + void raycast(const TsdfVolume &volume, const Aff3f &aff, const Mat3f &Rinv, + const Reprojector &reproj, Depth &depth, Normals &normals, float step_factor, + float delta_factor); + + void raycast(const TsdfVolume &volume, const Aff3f &aff, const Mat3f &Rinv, + const Reprojector &reproj, Points &points, Normals &normals, float step_factor, + float delta_factor); + + __kf_device__ ushort2 + + pack_tsdf(float tsdf, int weight); + + __kf_device__ float unpack_tsdf(ushort2 + value, + int &weight + ); + __kf_device__ float unpack_tsdf(ushort2 + value); + + + //image proc functions + void compute_dists(const Depth &depth, Dists dists, float2 f, float2 c); + + void cloud_to_depth(const Points &cloud, Depth depth); + + void truncateDepth(Depth &depth, float max_dist /*meters*/); + + void bilateralFilter(const Depth &src, Depth &dst, int kernel_size, float sigma_spatial, float sigma_depth); + + void depthPyr(const Depth &source, Depth &pyramid, float sigma_depth); + + void resizeDepthNormals(const Depth &depth, const Normals &normals, Depth &depth_out, Normals &normals_out); + + void + resizePointsNormals(const Points &points, const Normals &normals, Points &points_out, Normals &normals_out); + + void computeNormalsAndMaskDepth(const Reprojector &reproj, Depth &depth, Normals &normals); + + void computePointNormals(const Reprojector &reproj, const Depth &depth, Points &points, Normals &normals); + + void + renderImage(const Depth &depth, const Normals &normals, const Reprojector &reproj, const Vec3f &light_pose, + Image &image); + + void renderImage(const Points &points, const Normals &normals, const Reprojector &reproj, + const Vec3f &light_pose, Image &image); + + void renderTangentColors(const Normals &normals, Image &image); + + + //exctraction functionality + size_t extractCloud(const TsdfVolume &volume, const Aff3f &aff, PtrSz output); + + void + extractNormals(const TsdfVolume &volume, const PtrSz &points, const Aff3f &aff, const Mat3f &Rinv, + float gradient_delta_factor, float4 *output); + + struct float8 { + float x, y, z, w, c1, c2, c3, c4; + }; + struct float12 { + float x, y, z, w, normal_x, normal_y, normal_z, n4, c1, c2, c3, c4; + }; - struct float8 { float x, y, z, w, c1, c2, c3, c4; }; - struct float12 { float x, y, z, w, normal_x, normal_y, normal_z, n4, c1, c2, c3, c4; }; - void mergePointNormal(const DeviceArray& cloud, const DeviceArray& normals, const DeviceArray& output); + void 
mergePointNormal(const DeviceArray &cloud, const DeviceArray &normals, + const DeviceArray &output); + } } -} +} \ No newline at end of file diff --git a/modules/dynamicfusion/src/kinfu.cpp b/modules/dynamicfusion/src/kinfu.cpp index ea1576b1b2c..82491d9d3d0 100644 --- a/modules/dynamicfusion/src/kinfu.cpp +++ b/modules/dynamicfusion/src/kinfu.cpp @@ -1,25 +1,23 @@ #include "precomp.hpp" #include "internal.hpp" #include -#include "utils/dual_quaternion.hpp" -#include "nanoflann/nanoflann.hpp" -#include "utils/quaternion.hpp" -#include -#include -#include +#include +#include +#include +#include +#include +#include #include using namespace std; -using namespace kfusion; -using namespace kfusion::cuda; - +using namespace cv; static inline float deg2rad (float alpha) { return alpha * 0.017453293f; } /** * \brief * \return */ -kfusion::KinFuParams kfusion::KinFuParams::default_params_dynamicfusion() +cv::kfusion::KinFuParams cv::kfusion::KinFuParams::default_params_dynamicfusion() { const int iters[] = {10, 5, 4, 0}; const int levels = sizeof(iters)/sizeof(iters[0]); @@ -60,7 +58,7 @@ kfusion::KinFuParams kfusion::KinFuParams::default_params_dynamicfusion() * \brief * \return */ -kfusion::KinFuParams kfusion::KinFuParams::default_params() +cv::kfusion::KinFuParams cv::kfusion::KinFuParams::default_params() { const int iters[] = {10, 5, 4, 0}; const int levels = sizeof(iters)/sizeof(iters[0]); @@ -101,11 +99,11 @@ kfusion::KinFuParams kfusion::KinFuParams::default_params() * \brief * \param params */ -kfusion::KinFu::KinFu(const KinFuParams& params) : frame_counter_(0), params_(params) +cv::kfusion::KinFu::KinFu(const KinFuParams& params) : frame_counter_(0), params_(params) { // cv::CV_Assert(params.volume_dims[0] % 32 == 0); - volume_ = cv::Ptr(new cuda::TsdfVolume(params_.volume_dims)); + volume_ = cv::Ptr(new cv::kfusion::cuda::TsdfVolume(params_.volume_dims)); warp_ = cv::Ptr(new WarpField()); volume_->setTruncDist(params_.tsdf_trunc_dist); @@ -115,7 +113,7 @@ kfusion::KinFu::KinFu(const KinFuParams& params) : frame_counter_(0), params_(pa volume_->setRaycastStepFactor(params_.raycast_step_factor); volume_->setGradientDeltaFactor(params_.gradient_delta_factor); - icp_ = cv::Ptr(new cuda::ProjectiveICP()); + icp_ = cv::Ptr(new cv::kfusion::cuda::ProjectiveICP()); icp_->setDistThreshold(params_.icp_dist_thres); icp_->setAngleThreshold(params_.icp_angle_thres); icp_->setIterationsNum(params_.icp_iter_num); @@ -124,33 +122,33 @@ kfusion::KinFu::KinFu(const KinFuParams& params) : frame_counter_(0), params_(pa reset(); } -const kfusion::KinFuParams& kfusion::KinFu::params() const +const cv::kfusion::KinFuParams& cv::kfusion::KinFu::params() const { return params_; } -kfusion::KinFuParams& kfusion::KinFu::params() +cv::kfusion::KinFuParams& cv::kfusion::KinFu::params() { return params_; } -const kfusion::cuda::TsdfVolume& kfusion::KinFu::tsdf() const +const cv::kfusion::cuda::TsdfVolume& cv::kfusion::KinFu::tsdf() const { return *volume_; } -kfusion::cuda::TsdfVolume& kfusion::KinFu::tsdf() +cv::kfusion::cuda::TsdfVolume& cv::kfusion::KinFu::tsdf() { return *volume_; } -const kfusion::cuda::ProjectiveICP& kfusion::KinFu::icp() const +const cv::kfusion::cuda::ProjectiveICP& cv::kfusion::KinFu::icp() const { return *icp_; } -kfusion::cuda::ProjectiveICP& kfusion::KinFu::icp() +cv::kfusion::cuda::ProjectiveICP& cv::kfusion::KinFu::icp() { return *icp_; } -const kfusion::WarpField& kfusion::KinFu::getWarp() const +const cv::kfusion::WarpField& cv::kfusion::KinFu::getWarp() const { return *warp_; } 
-kfusion::WarpField& kfusion::KinFu::getWarp() +cv::kfusion::WarpField& cv::kfusion::KinFu::getWarp() { return *warp_; } -void kfusion::KinFu::allocate_buffers() +void cv::kfusion::KinFu::allocate_buffers() { - const int LEVELS = cuda::ProjectiveICP::MAX_PYRAMID_LEVELS; + const int LEVELS = cv::kfusion::cuda::ProjectiveICP::MAX_PYRAMID_LEVELS; int cols = params_.cols; int rows = params_.rows; @@ -193,7 +191,7 @@ void kfusion::KinFu::allocate_buffers() points_.create(params_.rows, params_.cols); } -void kfusion::KinFu::reset() +void cv::kfusion::KinFu::reset() { if (frame_counter_) cout << "Reset" << endl; @@ -211,35 +209,35 @@ void kfusion::KinFu::reset() * \param time * \return */ -kfusion::Affine3f kfusion::KinFu::getCameraPose (int time) const +cv::kfusion::Affine3f cv::kfusion::KinFu::getCameraPose (int time) const { if (time > (int)poses_.size () || time < 0) time = (int)poses_.size () - 1; return poses_[time]; } -bool kfusion::KinFu::operator()(const kfusion::cuda::Depth& depth, const kfusion::cuda::Image& /*image*/) +bool cv::kfusion::KinFu::operator()(const cv::kfusion::cuda::Depth& depth, const cv::kfusion::cuda::Image& /*image*/) { const KinFuParams& p = params_; const int LEVELS = icp_->getUsedLevelsNum(); - cuda::computeDists(depth, dists_, p.intr); - cuda::depthBilateralFilter(depth, curr_.depth_pyr[0], p.bilateral_kernel_size, p.bilateral_sigma_spatial, p.bilateral_sigma_depth); + cv::kfusion::cuda::computeDists(depth, dists_, p.intr); + cv::kfusion::cuda::depthBilateralFilter(depth, curr_.depth_pyr[0], p.bilateral_kernel_size, p.bilateral_sigma_spatial, p.bilateral_sigma_depth); if (p.icp_truncate_depth_dist > 0) - kfusion::cuda::depthTruncation(curr_.depth_pyr[0], p.icp_truncate_depth_dist); + cv::kfusion::cuda::depthTruncation(curr_.depth_pyr[0], p.icp_truncate_depth_dist); for (int i = 1; i < LEVELS; ++i) - cuda::depthBuildPyramid(curr_.depth_pyr[i-1], curr_.depth_pyr[i], p.bilateral_sigma_depth); + cv::kfusion::cuda::depthBuildPyramid(curr_.depth_pyr[i-1], curr_.depth_pyr[i], p.bilateral_sigma_depth); for (int i = 0; i < LEVELS; ++i) #if defined USE_DEPTH - cuda::computeNormalsAndMaskDepth(p.intr(i), curr_.depth_pyr[i], curr_.normals_pyr[i]); + cv::kfusion::cuda::computeNormalsAndMaskDepth(p.intr(i), curr_.depth_pyr[i], curr_.normals_pyr[i]); #else - cuda::computePointNormals(p.intr(i), curr_.depth_pyr[i], curr_.points_pyr[i], curr_.normals_pyr[i]); + cv::kfusion::cuda::computePointNormals(p.intr(i), curr_.depth_pyr[i], curr_.points_pyr[i], curr_.normals_pyr[i]); #endif - cuda::waitAllDefaultStream(); + cv::kfusion::cuda::waitAllDefaultStream(); //can't perform more on first frame if (frame_counter_ == 0) @@ -298,7 +296,7 @@ bool kfusion::KinFu::operator()(const kfusion::cuda::Depth& depth, const kfusion for (int i = 1; i < LEVELS; ++i) resizePointsNormals(prev_.points_pyr[i-1], prev_.normals_pyr[i-1], prev_.points_pyr[i], prev_.normals_pyr[i]); #endif - cuda::waitAllDefaultStream(); + cv::kfusion::cuda::waitAllDefaultStream(); } return ++frame_counter_, true; @@ -309,7 +307,7 @@ bool kfusion::KinFu::operator()(const kfusion::cuda::Depth& depth, const kfusion * \param image * \param flag */ -void kfusion::KinFu::renderImage(cuda::Image& image, int flag) +void cv::kfusion::KinFu::renderImage(cv::kfusion::cuda::Image& image, int flag) { const KinFuParams& p = params_; image.create(p.rows, flag != 3 ? 
p.cols : p.cols * 2); @@ -321,16 +319,16 @@ void kfusion::KinFu::renderImage(cuda::Image& image, int flag) #endif if (flag < 1 || flag > 3) - cuda::renderImage(PASS1[0], prev_.normals_pyr[0], params_.intr, params_.light_pose, image); + cv::kfusion::cuda::renderImage(PASS1[0], prev_.normals_pyr[0], params_.intr, params_.light_pose, image); else if (flag == 2) - cuda::renderTangentColors(prev_.normals_pyr[0], image); + cv::kfusion::cuda::renderTangentColors(prev_.normals_pyr[0], image); else /* if (flag == 3) */ { - DeviceArray2D i1(p.rows, p.cols, image.ptr(), image.step()); - DeviceArray2D i2(p.rows, p.cols, image.ptr() + p.cols, image.step()); + cv::kfusion::cuda::DeviceArray2D i1(p.rows, p.cols, image.ptr(), image.step()); + cv::kfusion::cuda::DeviceArray2D i2(p.rows, p.cols, image.ptr() + p.cols, image.step()); - cuda::renderImage(PASS1[0], prev_.normals_pyr[0], params_.intr, params_.light_pose, i1); - cuda::renderTangentColors(prev_.normals_pyr[0], i2); + cv::kfusion::cuda::renderImage(PASS1[0], prev_.normals_pyr[0], params_.intr, params_.light_pose, i1); + cv::kfusion::cuda::renderTangentColors(prev_.normals_pyr[0], i2); } #undef PASS1 @@ -341,10 +339,10 @@ void kfusion::KinFu::renderImage(cuda::Image& image, int flag) * \param image * \param flag */ -void kfusion::KinFu::dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame, cuda::Normals current_normals) +void cv::kfusion::KinFu::dynamicfusion(cv::kfusion::cuda::Depth& depth, cv::kfusion::cuda::Cloud current_frame, cv::kfusion::cuda::Normals current_normals) { - cuda::Cloud cloud; - cuda::Normals normals; + cv::kfusion::cuda::Cloud cloud; + cv::kfusion::cuda::Normals normals; cloud.create(depth.rows(), depth.cols()); normals.create(depth.rows(), depth.cols()); auto camera_pose = poses_.back(); @@ -412,7 +410,7 @@ void kfusion::KinFu::dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame * \param pose * \param flag */ -void kfusion::KinFu::renderImage(cuda::Image& image, const Affine3f& pose, int flag) { +void cv::kfusion::KinFu::renderImage(cv::kfusion::cuda::Image& image, const Affine3f& pose, int flag) { const KinFuParams &p = params_; image.create(p.rows, flag != 3 ? 
p.cols : p.cols * 2); depths_.create(p.rows, p.cols); @@ -428,16 +426,16 @@ void kfusion::KinFu::renderImage(cuda::Image& image, const Affine3f& pose, int f volume_->raycast(pose, p.intr, PASS1, normals_); if (flag < 1 || flag > 3) - cuda::renderImage(PASS1, normals_, params_.intr, params_.light_pose, image); + cv::kfusion::cuda::renderImage(PASS1, normals_, params_.intr, params_.light_pose, image); else if (flag == 2) - cuda::renderTangentColors(normals_, image); + cv::kfusion::cuda::renderTangentColors(normals_, image); else /* if (flag == 3) */ { - DeviceArray2D i1(p.rows, p.cols, image.ptr(), image.step()); - DeviceArray2D i2(p.rows, p.cols, image.ptr() + p.cols, image.step()); + cv::kfusion::cuda::DeviceArray2D i1(p.rows, p.cols, image.ptr(), image.step()); + cv::kfusion::cuda::DeviceArray2D i2(p.rows, p.cols, image.ptr() + p.cols, image.step()); - cuda::renderImage(PASS1, normals_, params_.intr, params_.light_pose, i1); - cuda::renderTangentColors(normals_, i2); + cv::kfusion::cuda::renderImage(PASS1, normals_, params_.intr, params_.light_pose, i1); + cv::kfusion::cuda::renderTangentColors(normals_, i2); } #undef PASS1 } diff --git a/modules/dynamicfusion/src/optimisation.cpp b/modules/dynamicfusion/src/optimisation.cpp deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/modules/dynamicfusion/src/precomp.cpp b/modules/dynamicfusion/src/precomp.cpp index 080169ba91f..05703fecf4a 100644 --- a/modules/dynamicfusion/src/precomp.cpp +++ b/modules/dynamicfusion/src/precomp.cpp @@ -1,6 +1,6 @@ #include "precomp.hpp" #include "internal.hpp" - +using namespace cv; //////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// Kinfu/types implementation diff --git a/modules/dynamicfusion/src/precomp.hpp b/modules/dynamicfusion/src/precomp.hpp index 41e92d8c4e2..b2c485e3f4c 100644 --- a/modules/dynamicfusion/src/precomp.hpp +++ b/modules/dynamicfusion/src/precomp.hpp @@ -1,30 +1,33 @@ #pragma once -#include -#include -#include -#include -#include +#include +#include +#include +#include +#include #include "internal.hpp" #include #include "vector_functions.h" -namespace kfusion +namespace cv { - template - inline D device_cast(const S& source) + namespace kfusion { - return *reinterpret_cast(source.val); - } + template + inline D device_cast(const S& source) + { + return *reinterpret_cast(source.val); + } - template<> - inline device::Aff3f device_cast(const Affine3f& source) - { - device::Aff3f aff; - Mat3f R = source.rotation(); - Vec3f t = source.translation(); - aff.R = device_cast(R); - aff.t = device_cast(t); - return aff; + template<> + inline device::Aff3f device_cast(const Affine3f& source) + { + device::Aff3f aff; + Mat3f R = source.rotation(); + Vec3f t = source.translation(); + aff.R = device_cast(R); + aff.t = device_cast(t); + return aff; + } } } diff --git a/modules/dynamicfusion/src/projective_icp.cpp b/modules/dynamicfusion/src/projective_icp.cpp index 4eba75f55a5..0e1e4f8f18c 100644 --- a/modules/dynamicfusion/src/projective_icp.cpp +++ b/modules/dynamicfusion/src/projective_icp.cpp @@ -1,10 +1,10 @@ #include "precomp.hpp" -using namespace kfusion; +using namespace cv::kfusion; using namespace std; -using namespace kfusion::cuda; - +using namespace cv::kfusion::cuda; +using namespace cv; //////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// ComputeIcpHelper diff --git a/modules/dynamicfusion/src/safe_call.hpp 
b/modules/dynamicfusion/src/safe_call.hpp index d0bb9919b9a..be340688ef9 100644 --- a/modules/dynamicfusion/src/safe_call.hpp +++ b/modules/dynamicfusion/src/safe_call.hpp @@ -2,35 +2,37 @@ #include "cuda_runtime_api.h" -namespace kfusion -{ - namespace cuda +namespace cv{ + namespace kfusion { - void error(const char *error_string, const char *file, const int line, const char *func); + namespace cuda + { + void error(const char *error_string, const char *file, const int line, const char *func); + } } -} #if defined(__GNUC__) - #define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__, __func__) +#define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__, __func__) #else /* defined(__CUDACC__) || defined(__MSVC__) */ - #define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__) +#define cudaSafeCall(expr) kfusion::cuda::___cudaSafeCall(expr, __FILE__, __LINE__) #endif -namespace kfusion -{ - namespace cuda + namespace kfusion { - static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + namespace cuda { - if (cudaSuccess != err) - error(cudaGetErrorString(err), file, line, func); - } + static inline void ___cudaSafeCall(cudaError_t err, const char *file, const int line, const char *func = "") + { + if (cudaSuccess != err) + error(cudaGetErrorString(err), file, line, func); + } - static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; } - } + static inline int divUp(int total, int grain) { return (total + grain - 1) / grain; } + } - namespace device - { - using kfusion::cuda::divUp; + namespace device + { + using kfusion::cuda::divUp; + } } } diff --git a/modules/dynamicfusion/src/tsdf_volume.cpp b/modules/dynamicfusion/src/tsdf_volume.cpp index 149108c76ae..eb0bac2f31d 100644 --- a/modules/dynamicfusion/src/tsdf_volume.cpp +++ b/modules/dynamicfusion/src/tsdf_volume.cpp @@ -1,15 +1,15 @@ +#include +#include +#include +#include +#include #include "precomp.hpp" -#include "utils/dual_quaternion.hpp" #include -#include -#include -#include -#include -using namespace kfusion; -using namespace kfusion::cuda; - +using namespace cv::kfusion; +using namespace cv::kfusion::cuda; +using namespace cv; -kfusion::cuda::TsdfVolume::TsdfVolume(const Vec3i& dims) : data_(), +cv::kfusion::cuda::TsdfVolume::TsdfVolume(const Vec3i& dims) : data_(), trunc_dist_(0.03f), max_weight_(128), dims_(dims), @@ -21,7 +21,7 @@ kfusion::cuda::TsdfVolume::TsdfVolume(const Vec3i& dims) : data_(), create(dims_); } -kfusion::cuda::TsdfVolume::~TsdfVolume() +cv::kfusion::cuda::TsdfVolume::~TsdfVolume() { delete cloud_host; delete cloud_buffer; @@ -34,7 +34,7 @@ kfusion::cuda::TsdfVolume::~TsdfVolume() * \brief * \param dims */ -void kfusion::cuda::TsdfVolume::create(const Vec3i& dims) +void cv::kfusion::cuda::TsdfVolume::create(const Vec3i& dims) { dims_ = dims; int voxels_number = dims_[0] * dims_[1] * dims_[2]; @@ -47,7 +47,7 @@ void kfusion::cuda::TsdfVolume::create(const Vec3i& dims) * \brief * \return */ -Vec3i kfusion::cuda::TsdfVolume::getDims() const +Vec3i cv::kfusion::cuda::TsdfVolume::getDims() const { return dims_; } @@ -56,42 +56,42 @@ Vec3i kfusion::cuda::TsdfVolume::getDims() const * \brief * \return */ -Vec3f kfusion::cuda::TsdfVolume::getVoxelSize() const +Vec3f cv::kfusion::cuda::TsdfVolume::getVoxelSize() const { return Vec3f(size_[0] / dims_[0], size_[1] / dims_[1], size_[2] / dims_[2]); } -const CudaData kfusion::cuda::TsdfVolume::data() const { return 
data_; } -CudaData kfusion::cuda::TsdfVolume::data() { return data_; } -Vec3f kfusion::cuda::TsdfVolume::getSize() const { return size_; } +const CudaData cv::kfusion::cuda::TsdfVolume::data() const { return data_; } +CudaData cv::kfusion::cuda::TsdfVolume::data() { return data_; } +Vec3f cv::kfusion::cuda::TsdfVolume::getSize() const { return size_; } -void kfusion::cuda::TsdfVolume::setSize(const Vec3f& size) +void cv::kfusion::cuda::TsdfVolume::setSize(const Vec3f& size) { size_ = size; setTruncDist(trunc_dist_); } -float kfusion::cuda::TsdfVolume::getTruncDist() const { return trunc_dist_; } +float cv::kfusion::cuda::TsdfVolume::getTruncDist() const { return trunc_dist_; } -void kfusion::cuda::TsdfVolume::setTruncDist(float distance) +void cv::kfusion::cuda::TsdfVolume::setTruncDist(float distance) { Vec3f vsz = getVoxelSize(); float max_coeff = std::max(std::max(vsz[0], vsz[1]), vsz[2]); trunc_dist_ = std::max (distance, 2.1f * max_coeff); } -cv::Mat kfusion::cuda::TsdfVolume::get_cloud_host() const {return *cloud_host;}; -cv::Mat kfusion::cuda::TsdfVolume::get_normal_host() const {return *normal_host;}; -cv::Mat* kfusion::cuda::TsdfVolume::get_cloud_host_ptr() const {return cloud_host;}; -cv::Mat* kfusion::cuda::TsdfVolume::get_normal_host_ptr() const {return normal_host;}; - -int kfusion::cuda::TsdfVolume::getMaxWeight() const { return max_weight_; } -void kfusion::cuda::TsdfVolume::setMaxWeight(int weight) { max_weight_ = weight; } -Affine3f kfusion::cuda::TsdfVolume::getPose() const { return pose_; } -void kfusion::cuda::TsdfVolume::setPose(const Affine3f& pose) { pose_ = pose; } -float kfusion::cuda::TsdfVolume::getRaycastStepFactor() const { return raycast_step_factor_; } -void kfusion::cuda::TsdfVolume::setRaycastStepFactor(float factor) { raycast_step_factor_ = factor; } -float kfusion::cuda::TsdfVolume::getGradientDeltaFactor() const { return gradient_delta_factor_; } -void kfusion::cuda::TsdfVolume::setGradientDeltaFactor(float factor) { gradient_delta_factor_ = factor; } -void kfusion::cuda::TsdfVolume::swap(CudaData& data) { data_.swap(data); } -void kfusion::cuda::TsdfVolume::applyAffine(const Affine3f& affine) { pose_ = affine * pose_; } -void kfusion::cuda::TsdfVolume::clear() +cv::Mat cv::kfusion::cuda::TsdfVolume::get_cloud_host() const {return *cloud_host;}; +cv::Mat cv::kfusion::cuda::TsdfVolume::get_normal_host() const {return *normal_host;}; +cv::Mat* cv::kfusion::cuda::TsdfVolume::get_cloud_host_ptr() const {return cloud_host;}; +cv::Mat* cv::kfusion::cuda::TsdfVolume::get_normal_host_ptr() const {return normal_host;}; + +int cv::kfusion::cuda::TsdfVolume::getMaxWeight() const { return max_weight_; } +void cv::kfusion::cuda::TsdfVolume::setMaxWeight(int weight) { max_weight_ = weight; } +Affine3f cv::kfusion::cuda::TsdfVolume::getPose() const { return pose_; } +void cv::kfusion::cuda::TsdfVolume::setPose(const Affine3f& pose) { pose_ = pose; } +float cv::kfusion::cuda::TsdfVolume::getRaycastStepFactor() const { return raycast_step_factor_; } +void cv::kfusion::cuda::TsdfVolume::setRaycastStepFactor(float factor) { raycast_step_factor_ = factor; } +float cv::kfusion::cuda::TsdfVolume::getGradientDeltaFactor() const { return gradient_delta_factor_; } +void cv::kfusion::cuda::TsdfVolume::setGradientDeltaFactor(float factor) { gradient_delta_factor_ = factor; } +void cv::kfusion::cuda::TsdfVolume::swap(CudaData& data) { data_.swap(data); } +void cv::kfusion::cuda::TsdfVolume::applyAffine(const Affine3f& affine) { pose_ = affine * pose_; } +void 
cv::kfusion::cuda::TsdfVolume::clear() { cloud_buffer = new cuda::DeviceArray(); cloud = new cuda::DeviceArray(); @@ -112,7 +112,7 @@ void kfusion::cuda::TsdfVolume::clear() * \param camera_pose * \param intr */ -void kfusion::cuda::TsdfVolume::integrate(const Dists& dists, const Affine3f& camera_pose, const Intr& intr) +void cv::kfusion::cuda::TsdfVolume::integrate(const Dists& dists, const Affine3f& camera_pose, const Intr& intr) { Affine3f vol2cam = camera_pose.inv() * pose_; @@ -133,7 +133,7 @@ void kfusion::cuda::TsdfVolume::integrate(const Dists& dists, const Affine3f& ca * \param depth * \param normals */ -void kfusion::cuda::TsdfVolume::raycast(const Affine3f& camera_pose, const Intr& intr, Depth& depth, Normals& normals) +void cv::kfusion::cuda::TsdfVolume::raycast(const Affine3f& camera_pose, const Intr& intr, Depth& depth, Normals& normals) { DeviceArray2D& n = (DeviceArray2D&)normals; @@ -159,7 +159,7 @@ void kfusion::cuda::TsdfVolume::raycast(const Affine3f& camera_pose, const Intr& * \param points * \param normals */ -void kfusion::cuda::TsdfVolume::raycast(const Affine3f& camera_pose, const Intr& intr, Cloud& points, Normals& normals) +void cv::kfusion::cuda::TsdfVolume::raycast(const Affine3f& camera_pose, const Intr& intr, Cloud& points, Normals& normals) { device::Normals& n = (device::Normals&)normals; device::Points& p = (device::Points&)points; @@ -183,7 +183,7 @@ void kfusion::cuda::TsdfVolume::raycast(const Affine3f& camera_pose, const Intr& * \param cloud_buffer * \return */ -DeviceArray kfusion::cuda::TsdfVolume::fetchCloud(DeviceArray& cloud_buffer) const +DeviceArray cv::kfusion::cuda::TsdfVolume::fetchCloud(DeviceArray& cloud_buffer) const { // enum { DEFAULT_CLOUD_BUFFER_SIZE = 10 * 1000 * 1000 }; enum { DEFAULT_CLOUD_BUFFER_SIZE = 256 * 256 * 256 }; @@ -208,7 +208,7 @@ DeviceArray kfusion::cuda::TsdfVolume::fetchCloud(DeviceArray& clo * @param cloud * @param normals */ -void kfusion::cuda::TsdfVolume::fetchNormals(const DeviceArray& cloud, DeviceArray& normals) const +void cv::kfusion::cuda::TsdfVolume::fetchNormals(const DeviceArray& cloud, DeviceArray& normals) const { normals.create(cloud.size()); DeviceArray& c = (DeviceArray&)cloud; @@ -231,7 +231,7 @@ void kfusion::cuda::TsdfVolume::fetchNormals(const DeviceArray& cloud, De * \param camera_pose * \param intr */ -void kfusion::cuda::TsdfVolume::surface_fusion(const WarpField& warp_field, +void cv::kfusion::cuda::TsdfVolume::surface_fusion(const WarpField& warp_field, std::vector warped, std::vector canonical, cuda::Depth& depth, @@ -269,7 +269,7 @@ void kfusion::cuda::TsdfVolume::surface_fusion(const WarpField& warp_field, * \param voxel_center * */ -std::vector kfusion::cuda::TsdfVolume::psdf(const std::vector& warped, +std::vector cv::kfusion::cuda::TsdfVolume::psdf(const std::vector& warped, Dists& dists, const Intr& intr) { @@ -303,7 +303,7 @@ std::vector kfusion::cuda::TsdfVolume::psdf(const std::vector& war * \param k * \return */ -float kfusion::cuda::TsdfVolume::weighting(const std::vector& dist_sqr, int k) const +float cv::kfusion::cuda::TsdfVolume::weighting(const std::vector& dist_sqr, int k) const { float distances = 0; for(auto distance : dist_sqr) @@ -316,14 +316,14 @@ float kfusion::cuda::TsdfVolume::weighting(const std::vector& dist_sqr, i * \param k * \return */ -void kfusion::cuda::TsdfVolume::compute_points() +void cv::kfusion::cuda::TsdfVolume::compute_points() { *cloud = fetchCloud(*cloud_buffer); *cloud_host = cv::Mat(1, (int)cloud->size(), CV_32FC4); 
cloud->download(cloud_host->ptr()); } -void kfusion::cuda::TsdfVolume::compute_normals() +void cv::kfusion::cuda::TsdfVolume::compute_normals() { fetchNormals(*cloud, *normal_buffer); *normal_host = cv::Mat(1, (int)cloud->size(), CV_32FC4); diff --git a/modules/dynamicfusion/src/warp_field.cpp b/modules/dynamicfusion/src/warp_field.cpp index 25dc0fdeaa9..7e388bae620 100644 --- a/modules/dynamicfusion/src/warp_field.cpp +++ b/modules/dynamicfusion/src/warp_field.cpp @@ -1,13 +1,14 @@ -#include -#include -#include +#include +#include +#include #include -#include "kfusion/warp_field.hpp" +#include #include "internal.hpp" #include "precomp.hpp" -#include +#include -using namespace kfusion; +using namespace cv::kfusion; +using namespace cv; std::vector> neighbours; //THIS SHOULD BE SOMEWHERE ELSE BUT TOO SLOW TO REINITIALISE utils::PointCloud cloud; From 26cbdcd00a412f134fcd4f3822c0907ffedde74a Mon Sep 17 00:00:00 2001 From: Mihai Bujanca Date: Sat, 9 Sep 2017 12:10:28 +0100 Subject: [PATCH 4/4] More directory restructuring, getting CUDA through opencv rather than explicit --- modules/dynamicfusion/CMakeLists.txt | 71 +++----- .../cmake/Modules/FindOpenNI.cmake | 78 --------- modules/dynamicfusion/cmake/Targets.cmake | 57 +++---- .../include/opencv2/dynamicfusion.hpp | 6 +- .../opencv2/dynamicfusion}/cuda/device.hpp | 9 +- .../cuda/device_array.hpp | 81 +-------- .../cuda/device_memory.hpp | 85 ++++++---- .../opencv2/dynamicfusion/cuda/imgproc.hpp | 38 +++++ .../opencv2/dynamicfusion/cuda}/internal.hpp | 2 +- .../cuda/kernel_containers.hpp | 0 .../opencv2/dynamicfusion/cuda}/precomp.hpp | 10 +- .../cuda/projective_icp.hpp | 2 +- .../opencv2/dynamicfusion/cuda}/safe_call.hpp | 0 .../dynamicfusion}/cuda/temp_utils.hpp | 2 +- .../dynamicfusion}/cuda/texture_binder.hpp | 4 +- .../cuda/tsdf_volume.hpp | 6 +- .../include/opencv2/dynamicfusion/kinfu.hpp | 113 +++++++++++++ .../optimisation.hpp | 2 +- .../{kfusion => dynamicfusion}/types.hpp | 10 +- .../utils/dual_quaternion.hpp | 2 +- .../utils/knn_point_cloud.hpp | 2 +- .../{ => dynamicfusion}/utils/quaternion.hpp | 2 +- .../{kfusion => dynamicfusion}/warp_field.hpp | 8 +- .../include/opencv2/io/capture.hpp | 43 ----- .../include/opencv2/kfusion/cuda/imgproc.hpp | 38 ----- .../include/opencv2/kfusion/exports.hpp | 7 - .../include/opencv2/kfusion/kinfu.hpp | 113 ------------- modules/dynamicfusion/samples/demo.cpp | 2 +- modules/dynamicfusion/src/core.cpp | 4 +- modules/dynamicfusion/src/device_array.cpp | 155 ++++++++++++++++++ modules/dynamicfusion/src/device_memory.cpp | 4 +- modules/dynamicfusion/src/imgproc.cpp | 4 +- .../dynamicfusion/src/{cuda => }/imgproc.cu | 2 +- modules/dynamicfusion/src/kinfu.cpp | 28 ++-- modules/dynamicfusion/src/precomp.cpp | 4 +- .../dynamicfusion/src/{cuda => }/proj_icp.cu | 4 +- modules/dynamicfusion/src/projective_icp.cpp | 2 +- modules/dynamicfusion/src/tsdf_volume.cpp | 8 +- .../src/{cuda => }/tsdf_volume.cu | 6 +- modules/dynamicfusion/src/warp_field.cpp | 14 +- 40 files changed, 475 insertions(+), 553 deletions(-) delete mode 100644 modules/dynamicfusion/cmake/Modules/FindOpenNI.cmake rename modules/dynamicfusion/{src => include/opencv2/dynamicfusion}/cuda/device.hpp (99%) rename modules/dynamicfusion/include/opencv2/{kfusion => dynamicfusion}/cuda/device_array.hpp (60%) rename modules/dynamicfusion/include/opencv2/{kfusion => dynamicfusion}/cuda/device_memory.hpp (87%) create mode 100644 modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/imgproc.hpp rename modules/dynamicfusion/{src => 
include/opencv2/dynamicfusion/cuda}/internal.hpp (99%) rename modules/dynamicfusion/include/opencv2/{kfusion => dynamicfusion}/cuda/kernel_containers.hpp (100%) rename modules/dynamicfusion/{src => include/opencv2/dynamicfusion/cuda}/precomp.hpp (75%) rename modules/dynamicfusion/include/opencv2/{kfusion => dynamicfusion}/cuda/projective_icp.hpp (97%) rename modules/dynamicfusion/{src => include/opencv2/dynamicfusion/cuda}/safe_call.hpp (100%) rename modules/dynamicfusion/{src => include/opencv2/dynamicfusion}/cuda/temp_utils.hpp (99%) rename modules/dynamicfusion/{src => include/opencv2/dynamicfusion}/cuda/texture_binder.hpp (96%) rename modules/dynamicfusion/include/opencv2/{kfusion => dynamicfusion}/cuda/tsdf_volume.hpp (96%) create mode 100644 modules/dynamicfusion/include/opencv2/dynamicfusion/kinfu.hpp rename modules/dynamicfusion/include/opencv2/{kfusion => dynamicfusion}/optimisation.hpp (99%) rename modules/dynamicfusion/include/opencv2/{kfusion => dynamicfusion}/types.hpp (90%) rename modules/dynamicfusion/include/opencv2/{ => dynamicfusion}/utils/dual_quaternion.hpp (99%) rename modules/dynamicfusion/include/opencv2/{ => dynamicfusion}/utils/knn_point_cloud.hpp (98%) rename modules/dynamicfusion/include/opencv2/{ => dynamicfusion}/utils/quaternion.hpp (99%) rename modules/dynamicfusion/include/opencv2/{kfusion => dynamicfusion}/warp_field.hpp (94%) delete mode 100644 modules/dynamicfusion/include/opencv2/io/capture.hpp delete mode 100644 modules/dynamicfusion/include/opencv2/kfusion/cuda/imgproc.hpp delete mode 100644 modules/dynamicfusion/include/opencv2/kfusion/exports.hpp delete mode 100644 modules/dynamicfusion/include/opencv2/kfusion/kinfu.hpp create mode 100644 modules/dynamicfusion/src/device_array.cpp rename modules/dynamicfusion/src/{cuda => }/imgproc.cu (99%) rename modules/dynamicfusion/src/{cuda => }/proj_icp.cu (99%) rename modules/dynamicfusion/src/{cuda => }/tsdf_volume.cu (99%) diff --git a/modules/dynamicfusion/CMakeLists.txt b/modules/dynamicfusion/CMakeLists.txt index a901d87dfd8..80950f3b660 100644 --- a/modules/dynamicfusion/CMakeLists.txt +++ b/modules/dynamicfusion/CMakeLists.txt @@ -1,58 +1,10 @@ set(the_description "Non-rigid 3D reconstruction") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -DFORCE_INLINERS -D_MWAITXINTRIN_H_INCLUDED") -# -## ---[ Solution name -# -## ---[ utility -list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake/Modules/") include(cmake/Utils.cmake) -#include(cmake/Targets.cmake) -# -## ---[ find dependencies -find_package(CUDA REQUIRED) -#find_package(OpenNI) -find_package(Ceres REQUIRED) -include_directories(${CERES_INCLUDE_DIRS} ${Boost_INCLUDE_DIR} ${CUDA_INCLUDE_DIRS} ${OPENNI_INCLUDE_DIR} "kfusion/src/utils" "kfusion/include/nanoflann") -#if(OPENNI_FOUND) -# message("FOUND OPENNI AT: ${OPENNI_INCLUDE_DIR}") -#endif() -## ---[ misc settings -#if(USE_PROJECT_FOLDERS) -# set_property(GLOBAL PROPERTY USE_FOLDERS ON) -# set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMakeTargets") -#endif() -# -## ---[ cuda settings -#set(HAVE_CUDA 1) -#list(APPEND CUDA_NVCC_FLAGS "-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_20,code=sm_21;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35;-gencode;arch=compute_50,code=sm_50;-gencode;arch=compute_61,code=sm_61") -# -#if(UNIX OR APPLE) -# list(APPEND CUDA_NVCC_FLAGS "-Xcompiler;-fPIC;") -#endif() -# -#warnings_disable(CMAKE_CXX_FLAGS /wd4985) -# -#set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "--ftz=true;--prec-div=false;--prec-sqrt=false") 
-#add_module_library(kfusion) -#if(OPENNI_FOUND) -# target_compile_definitions(kfusion PRIVATE OPENNI_FOUND=1) -#endif() -#target_link_libraries(kfusion ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY} ${OPENNI_LIBRARY} ${CERES_LIBRARIES}) -#add_subdirectory(apps) -# -#if(BUILD_TESTS) -# find_package(GTest REQUIRED) -# if( GTEST_FOUND ) -# message( "Found Gtest at ${GTEST_ROOT}") -# message( "GTest Libs: ${GTEST_BOTH_LIBRARIES}") -# message( "GTest Include: ${GTEST_INCLUDE_DIRS}") -# include_directories(${GTEST_INCLUDE_DIRS}) -# add_subdirectory(tests) -# endif() -#endif() -find_package(Boost REQUIRED COMPONENTS system filesystem program_options) -ocv_define_module(dynamicfusion opencv_core opencv_calib3d opencv_viz opencv_highgui) +include(cmake/Targets.cmake) + +ocv_define_module(dynamicfusion opencv_core opencv_viz opencv_imgproc) ocv_warnings_disable(CMAKE_CXX_FLAGS -Wundef @@ -63,3 +15,20 @@ ocv_warnings_disable(CMAKE_CXX_FLAGS -Wunused-parameter -Wunused-function ) + +find_package(Ceres QUIET) + +if (NOT Ceres_FOUND) + message(STATUS "${the_module} has been disabled because Ceres library has not been found") + ocv_module_disable(${the_module}) +else() + include_directories(${CERES_INCLUDE_DIRS}) + ocv_target_link_libraries(${the_module} ${CERES_LIBRARIES}) +endif() +if(NOT HAVE_CUDA) + message(STATUS "${the_module} currently requires CUDA to run") + ocv_module_disable(${the_module}) +else() + include_directories(${CUDA_INCLUDE_DIRS}) + ocv_target_link_libraries(${the_module} ${CUDA_LIBRARIES} ${CUDA_CUDA_LIBRARY}) +endif() diff --git a/modules/dynamicfusion/cmake/Modules/FindOpenNI.cmake b/modules/dynamicfusion/cmake/Modules/FindOpenNI.cmake deleted file mode 100644 index a8e6f51f915..00000000000 --- a/modules/dynamicfusion/cmake/Modules/FindOpenNI.cmake +++ /dev/null @@ -1,78 +0,0 @@ -############################################################################### -# Find OpenNI -# -# This sets the following variables: -# OPENNI_FOUND - True if OPENNI was found. -# OPENNI_INCLUDE_DIRS - Directories containing the OPENNI include files. -# OPENNI_LIBRARIES - Libraries needed to use OPENNI. -# OPENNI_DEFINITIONS - Compiler flags for OPENNI. 
-# -# For libusb-1.0, add USB_10_ROOT if not found - -find_package(PkgConfig QUIET) - -# Find LibUSB -if(NOT WIN32) - pkg_check_modules(PC_USB_10 libusb-1.0) - find_path(USB_10_INCLUDE_DIR libusb-1.0/libusb.h - HINTS ${PC_USB_10_INCLUDEDIR} ${PC_USB_10_INCLUDE_DIRS} "${USB_10_ROOT}" "$ENV{USB_10_ROOT}" - PATH_SUFFIXES libusb-1.0) - - find_library(USB_10_LIBRARY - NAMES usb-1.0 - HINTS ${PC_USB_10_LIBDIR} ${PC_USB_10_LIBRARY_DIRS} "${USB_10_ROOT}" "$ENV{USB_10_ROOT}" - PATH_SUFFIXES lib) - - include(FindPackageHandleStandardArgs) - find_package_handle_standard_args(USB_10 DEFAULT_MSG USB_10_LIBRARY USB_10_INCLUDE_DIR) - - if(NOT USB_10_FOUND) - message(STATUS "OpenNI disabled because libusb-1.0 not found.") - return() - else() - include_directories(SYSTEM ${USB_10_INCLUDE_DIR}) - endif() -endif(NOT WIN32) - -if(${CMAKE_VERSION} VERSION_LESS 2.8.2) - pkg_check_modules(PC_OPENNI openni-dev) -else() - pkg_check_modules(PC_OPENNI QUIET openni-dev) -endif() - -set(OPENNI_DEFINITIONS ${PC_OPENNI_CFLAGS_OTHER}) - -set(OPENNI_SUFFIX) -if(WIN32 AND CMAKE_SIZEOF_VOID_P EQUAL 8) - set(OPENNI_SUFFIX 64) -endif(WIN32 AND CMAKE_SIZEOF_VOID_P EQUAL 8) - -#add a hint so that it can find it without the pkg-config -find_path(OPENNI_INCLUDE_DIR XnStatus.h - HINTS ${PC_OPENNI_INCLUDEDIR} ${PC_OPENNI_INCLUDE_DIRS} /usr/include/openni /usr/include/ni "${OPENNI_ROOT}" "$ENV{OPENNI_ROOT}" - PATHS "$ENV{OPEN_NI_INSTALL_PATH${OPENNI_SUFFIX}}/Include" - PATH_SUFFIXES openni include Include) -#add a hint so that it can find it without the pkg-config -find_library(OPENNI_LIBRARY - NAMES OpenNI${OPENNI_SUFFIX} - HINTS ${PC_OPENNI_LIBDIR} ${PC_OPENNI_LIBRARY_DIRS} /usr/lib "${OPENNI_ROOT}" "$ENV{OPENNI_ROOT}" - PATHS "$ENV{OPEN_NI_LIB${OPENNI_SUFFIX}}" - PATH_SUFFIXES lib Lib Lib64) - -if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - set(OPENNI_LIBRARIES ${OPENNI_LIBRARY} ${LIBUSB_1_LIBRARIES}) -else() - set(OPENNI_LIBRARIES ${OPENNI_LIBRARY}) -endif() - -include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(OpenNI DEFAULT_MSG OPENNI_LIBRARY OPENNI_INCLUDE_DIR) - -mark_as_advanced(OPENNI_LIBRARY OPENNI_INCLUDE_DIR) - -if(OPENNI_FOUND) - # Add the include directories - set(OPENNI_INCLUDE_DIRS ${OPENNI_INCLUDE_DIR}) - message(STATUS "OpenNI found (include: ${OPENNI_INCLUDE_DIRS}, lib: ${OPENNI_LIBRARY})") -endif(OPENNI_FOUND) - diff --git a/modules/dynamicfusion/cmake/Targets.cmake b/modules/dynamicfusion/cmake/Targets.cmake index 2c66ece1969..3ad696e96e0 100644 --- a/modules/dynamicfusion/cmake/Targets.cmake +++ b/modules/dynamicfusion/cmake/Targets.cmake @@ -2,8 +2,8 @@ # short command to setup source group function(kf_source_group group) cmake_parse_arguments(VW_SOURCE_GROUP "" "" "GLOB" ${ARGN}) - file(GLOB srcs ${VW_SOURCE_GROUP_GLOB}) - #list(LENGTH ${srcs} ___size) + file(GLOB srcs ${VW_SOURCE_GROUP_GLOB}) + #list(LENGTH ${srcs} ___size) #if (___size GREATER 0) source_group(${group} FILES ${srcs}) #endif() @@ -12,7 +12,7 @@ endfunction() ################################################################################################ # short command getting sources from standard directores -macro(pickup_std_sources) +macro(pickup_std_sources) kf_source_group("Include" GLOB "include/${module_name}/*.h*") kf_source_group("Include\\cuda" GLOB "include/${module_name}/cuda/*.h*") kf_source_group("Source" GLOB "src/*.cpp" "src/*.h*") @@ -25,13 +25,13 @@ endmacro() ################################################################################################ # short command for declaring includes from other 
modules macro(declare_deps_includes) - foreach(__arg ${ARGN}) + foreach(__arg ${ARGN}) get_filename_component(___path "${CMAKE_SOURCE_DIR}/modules/${__arg}/include" ABSOLUTE) - if (EXISTS ${___path}) + if (EXISTS ${___path}) include_directories(${___path}) endif() endforeach() - + unset(___path) unset(__arg) endmacro() @@ -40,21 +40,21 @@ endmacro() ################################################################################################ # short command for setting defeault target properties function(default_properties target) - set_target_properties(${target} PROPERTIES + set_target_properties(${target} PROPERTIES DEBUG_POSTFIX "d" ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") - + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") + if (NOT ${target} MATCHES "^test_") install(TARGETS ${the_target} RUNTIME DESTINATION ".") endif() endfunction() function(test_props target) - #os_project_label(${target} "[test]") + #os_project_label(${target} "[test]") if(USE_PROJECT_FOLDERS) set_target_properties(${target} PROPERTIES FOLDER "Tests") - endif() + endif() endfunction() function(app_props target) @@ -68,11 +68,11 @@ endfunction() ################################################################################################ # short command for setting defeault target properties function(default_properties target) - set_target_properties(${target} PROPERTIES + set_target_properties(${target} PROPERTIES DEBUG_POSTFIX "d" ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") - + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin") + if (NOT ${target} MATCHES "^test_") install(TARGETS ${the_target} RUNTIME DESTINATION ".") endif() @@ -82,32 +82,32 @@ endfunction() ################################################################################################ # short command for adding library module macro(add_module_library name) - set(module_name ${name}) + set(module_name ${name}) pickup_std_sources() include_directories(include src src/cuda src/utils) - + set(__has_cuda OFF) - check_cuda(__has_cuda) - + check_cuda(__has_cuda) + set(__lib_type STATIC) if (${ARGV1} MATCHES "SHARED|STATIC") set(__lib_type ${ARGV1}) endif() - if (__has_cuda) + if (__has_cuda) cuda_add_library(${module_name} ${__lib_type} ${sources}) else() add_library(${module_name} ${__lib_type} ${sources}) endif() - + if(MSVC) set_target_properties(${module_name} PROPERTIES DEFINE_SYMBOL KFUSION_API_EXPORTS) else() add_definitions(-DKFUSION_API_EXPORTS) - endif() - + endif() + default_properties(${module_name}) - + if(USE_PROJECT_FOLDERS) set_target_properties(${module_name} PROPERTIES FOLDER "Libraries") endif() @@ -125,16 +125,7 @@ endmacro() ################################################################################################ # short command for adding application module macro(add_application target sources) - add_executable(${target} ${sources}) + add_executable(${target} ${sources}) default_properties(${target}) app_props(${target}) endmacro() - - -################################################################################################ -# short command for adding test target -macro(add_test the_target) - add_executable(${the_target} ${ARGN}) - default_properties(${the_target}) - test_props(${the_target}) -endmacro() diff --git a/modules/dynamicfusion/include/opencv2/dynamicfusion.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion.hpp index 72b7cbd6dce..eb9867a571d 100644 --- 
a/modules/dynamicfusion/include/opencv2/dynamicfusion.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion.hpp @@ -1,5 +1,7 @@ #ifndef OPENCV_DYNAMICFUSION_HPP #define OPENCV_DYNAMICFUSION_HPP -#include -#include +#include +#include +#include +#include #endif //OPENCV_DYNAMICFUSION_HPP diff --git a/modules/dynamicfusion/src/cuda/device.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device.hpp similarity index 99% rename from modules/dynamicfusion/src/cuda/device.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device.hpp index 497c8f84cf8..ee5948ff8da 100644 --- a/modules/dynamicfusion/src/cuda/device.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device.hpp @@ -13,7 +13,8 @@ //__kf_device__ //kfusion::device::TsdfVolume::TsdfVolume(const TsdfVolume& other) // : data(other.data), dims(other.dims), voxel_size(other.voxel_size), trunc_dist(other.trunc_dist), max_weight(other.max_weight) {} - +namespace cv +{ __kf_device__ kfusion::device::TsdfVolume::elem_type* kfusion::device::TsdfVolume::operator()(int x, int y, int z) { return data + x + y*dims.x + z*dims.y*dims.x; } @@ -120,8 +121,4 @@ namespace kfusion template<> __kf_device__ void kfusion::device::gmem::StCs(const ushort2& val, ushort2* ptr) { *ptr = val; } #endif - - - - - +} \ No newline at end of file diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_array.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device_array.hpp similarity index 60% rename from modules/dynamicfusion/include/opencv2/kfusion/cuda/device_array.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device_array.hpp index 83b7b6788fc..a39e295bb09 100644 --- a/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_array.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device_array.hpp @@ -1,7 +1,7 @@ #pragma once -#include -#include + +#include "device_memory.hpp" #include @@ -19,7 +19,7 @@ namespace cv * \author Anatoly Baksheev */ template - class KF_EXPORTS DeviceArray : public DeviceMemory + class DeviceArray : public DeviceMemory { public: /** \brief Element type. */ @@ -116,7 +116,7 @@ namespace cv * \author Anatoly Baksheev */ template - class KF_EXPORTS DeviceArray2D : public DeviceMemory2D + class DeviceArray2D : public DeviceMemory2D { public: /** \brief Element type. 
*/ @@ -231,76 +231,3 @@ namespace device } } } - -///////////////////// Inline implementations of DeviceArray //////////////////////////////////////////// - -template inline cv::kfusion::cuda::DeviceArray::DeviceArray() {} -template inline cv::kfusion::cuda::DeviceArray::DeviceArray(size_t size) : DeviceMemory(size * elem_size) {} -template inline cv::kfusion::cuda::DeviceArray::DeviceArray(T *ptr, size_t size) : DeviceMemory(ptr, size * elem_size) {} -template inline cv::kfusion::cuda::DeviceArray::DeviceArray(const DeviceArray& other) : DeviceMemory(other) {} -template inline cv::kfusion::cuda::DeviceArray& cv::kfusion::cuda::DeviceArray::operator=(const DeviceArray& other) -{ DeviceMemory::operator=(other); return *this; } - -template inline void cv::kfusion::cuda::DeviceArray::create(size_t size) -{ DeviceMemory::create(size * elem_size); } -template inline void cv::kfusion::cuda::DeviceArray::release() -{ DeviceMemory::release(); } - -template inline void cv::kfusion::cuda::DeviceArray::copyTo(DeviceArray& other) const -{ DeviceMemory::copyTo(other); } -template inline void cv::kfusion::cuda::DeviceArray::upload(const T *host_ptr, size_t size) -{ DeviceMemory::upload(host_ptr, size * elem_size); } -template inline void cv::kfusion::cuda::DeviceArray::download(T *host_ptr) const -{ DeviceMemory::download( host_ptr ); } - -template void cv::kfusion::cuda::DeviceArray::swap(DeviceArray& other_arg) { DeviceMemory::swap(other_arg); } - -template inline cv::kfusion::cuda::DeviceArray::operator T*() { return ptr(); } -template inline cv::kfusion::cuda::DeviceArray::operator const T*() const { return ptr(); } -template inline size_t cv::kfusion::cuda::DeviceArray::size() const { return sizeBytes() / elem_size; } - -template inline T* cv::kfusion::cuda::DeviceArray::ptr() { return DeviceMemory::ptr(); } -template inline const T* cv::kfusion::cuda::DeviceArray::ptr() const { return DeviceMemory::ptr(); } - -template template inline void cv::kfusion::cuda::DeviceArray::upload(const std::vector& data) { upload(&data[0], data.size()); } -template template inline void cv::kfusion::cuda::DeviceArray::download(std::vector& data) const { data.resize(size()); if (!data.empty()) download(&data[0]); } - -///////////////////// Inline implementations of DeviceArray2D //////////////////////////////////////////// - -template inline cv::kfusion::cuda::DeviceArray2D::DeviceArray2D() {} -template inline cv::kfusion::cuda::DeviceArray2D::DeviceArray2D(int rows, int cols) : DeviceMemory2D(rows, cols * elem_size) {} -template inline cv::kfusion::cuda::DeviceArray2D::DeviceArray2D(int rows, int cols, void *data, size_t stepBytes) : DeviceMemory2D(rows, cols * elem_size, data, stepBytes) {} -template inline cv::kfusion::cuda::DeviceArray2D::DeviceArray2D(const DeviceArray2D& other) : DeviceMemory2D(other) {} -template inline cv::kfusion::cuda::DeviceArray2D& cv::kfusion::cuda::DeviceArray2D::operator=(const DeviceArray2D& other) -{ DeviceMemory2D::operator=(other); return *this; } - -template inline void cv::kfusion::cuda::DeviceArray2D::create(int rows, int cols) -{ DeviceMemory2D::create(rows, cols * elem_size); } -template inline void cv::kfusion::cuda::DeviceArray2D::release() -{ DeviceMemory2D::release(); } - -template inline void cv::kfusion::cuda::DeviceArray2D::copyTo(DeviceArray2D& other) const -{ DeviceMemory2D::copyTo(other); } -template inline void cv::kfusion::cuda::DeviceArray2D::upload(const void *host_ptr, size_t host_step, int rows, int cols) -{ DeviceMemory2D::upload(host_ptr, host_step, rows, cols 
* elem_size); } -template inline void cv::kfusion::cuda::DeviceArray2D::download(void *host_ptr, size_t host_step) const -{ DeviceMemory2D::download( host_ptr, host_step ); } - -template template inline void cv::kfusion::cuda::DeviceArray2D::upload(const std::vector& data, int cols) -{ upload(&data[0], cols * elem_size, data.size()/cols, cols); } - -template template inline void cv::kfusion::cuda::DeviceArray2D::download(std::vector& data, int& elem_step) const -{ elem_step = cols(); data.resize(cols() * rows()); if (!data.empty()) download(&data[0], colsBytes()); } - -template void cv::kfusion::cuda::DeviceArray2D::swap(DeviceArray2D& other_arg) { DeviceMemory2D::swap(other_arg); } - -template inline T* cv::kfusion::cuda::DeviceArray2D::ptr(int y) { return DeviceMemory2D::ptr(y); } -template inline const T* cv::kfusion::cuda::DeviceArray2D::ptr(int y) const { return DeviceMemory2D::ptr(y); } - -template inline cv::kfusion::cuda::DeviceArray2D::operator T*() { return ptr(); } -template inline cv::kfusion::cuda::DeviceArray2D::operator const T*() const { return ptr(); } - -template inline int cv::kfusion::cuda::DeviceArray2D::cols() const { return DeviceMemory2D::colsBytes()/elem_size; } -template inline int cv::kfusion::cuda::DeviceArray2D::rows() const { return DeviceMemory2D::rows(); } - -template inline size_t cv::kfusion::cuda::DeviceArray2D::elem_step() const { return DeviceMemory2D::step()/elem_size; } diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_memory.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device_memory.hpp similarity index 87% rename from modules/dynamicfusion/include/opencv2/kfusion/cuda/device_memory.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device_memory.hpp index c7dde9ac190..7c7fdff6280 100644 --- a/modules/dynamicfusion/include/opencv2/kfusion/cuda/device_memory.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/device_memory.hpp @@ -1,7 +1,7 @@ #pragma once -#include -#include + +#include "kernel_containers.hpp" namespace cv { @@ -10,7 +10,7 @@ namespace cv namespace cuda { /** \brief Error handler. All GPU functions from this subsystem call the function to report an error. For internal use only */ - KF_EXPORTS void error(const char *error_string, const char *file, const int line, const char *func = ""); + void error(const char *error_string, const char *file, const int line, const char *func = ""); ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** \brief @b DeviceMemory class @@ -20,7 +20,7 @@ namespace cv * \author Anatoly Baksheev */ - class KF_EXPORTS DeviceMemory + class DeviceMemory { public: /** \brief Empty constructor. */ @@ -108,7 +108,7 @@ namespace cv * \author Anatoly Baksheev */ - class KF_EXPORTS DeviceMemory2D + class DeviceMemory2D { public: /** \brief Empty constructor. 
*/ @@ -224,37 +224,54 @@ namespace cv } } ///////////////////// Inline implementations of DeviceMemory //////////////////////////////////////////// - -template inline T* cv::kfusion::cuda::DeviceMemory::ptr() { return ( T*)data_; } -template inline const T* cv::kfusion::cuda::DeviceMemory::ptr() const { return (const T*)data_; } - -template inline cv::kfusion::cuda::DeviceMemory::operator cv::kfusion::cuda::PtrSz() const +namespace cv { - PtrSz result; - result.data = (U*)ptr(); - result.size = sizeBytes_/sizeof(U); - return result; -} + namespace kfusion + { + namespace cuda + { -///////////////////// Inline implementations of DeviceMemory2D //////////////////////////////////////////// + template + inline T *DeviceMemory::ptr() { return (T *) data_; } -template T* cv::kfusion::cuda::DeviceMemory2D::ptr(int y_arg) { return ( T*)(( char*)data_ + y_arg * step_); } -template const T* cv::kfusion::cuda::DeviceMemory2D::ptr(int y_arg) const { return (const T*)((const char*)data_ + y_arg * step_); } + template + inline const T *DeviceMemory::ptr() const { return (const T *) data_; } -template cv::kfusion::cuda::DeviceMemory2D::operator cv::kfusion::cuda::PtrStep() const -{ - PtrStep result; - result.data = (U*)ptr(); - result.step = step_; - return result; -} + template + inline DeviceMemory::operator PtrSz() const { + PtrSz result; + result.data = (U *) ptr(); + result.size = sizeBytes_ / sizeof(U); + return result; + } -template cv::kfusion::cuda::DeviceMemory2D::operator cv::kfusion::cuda::PtrStepSz() const -{ - PtrStepSz result; - result.data = (U*)ptr(); - result.step = step_; - result.cols = colsBytes_/sizeof(U); - result.rows = rows_; - return result; -} +///////////////////// Inline implementations of DeviceMemory2D //////////////////////////////////////////// + + template + T *DeviceMemory2D::ptr(int y_arg) { return (T *) ((char *) data_ + y_arg * step_); } + + template + const T *DeviceMemory2D::ptr(int y_arg) const { + return (const T *) ((const char *) data_ + y_arg * step_); + } + + template + DeviceMemory2D::operator PtrStep() const { + PtrStep result; + result.data = (U *) ptr(); + result.step = step_; + return result; + } + + template + DeviceMemory2D::operator PtrStepSz() const { + PtrStepSz result; + result.data = (U *) ptr(); + result.step = step_; + result.cols = colsBytes_ / sizeof(U); + result.rows = rows_; + return result; + } + } + } +} \ No newline at end of file diff --git a/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/imgproc.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/imgproc.hpp new file mode 100644 index 00000000000..b809b9655a2 --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/imgproc.hpp @@ -0,0 +1,38 @@ +#pragma once + +#include + +namespace cv +{ + namespace kfusion + { + namespace cuda + { + void depthBilateralFilter(const Depth& in, Depth& out, int ksz, float sigma_spatial, float sigma_depth); + + void depthTruncation(Depth& depth, float threshold); + + void depthBuildPyramid(const Depth& depth, Depth& pyramid, float sigma_depth); + + void computeNormalsAndMaskDepth(const Intr& intr, Depth& depth, Normals& normals); + + void computePointNormals(const Intr& intr, const Depth& depth, Cloud& points, Normals& normals); + + void computeDists(const Depth& depth, Dists& dists, const Intr& intr); + + void cloudToDepth(const Cloud& cloud, Depth& depth); + + void resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out); + + void resizePointsNormals(const Cloud& 
points, const Normals& normals, Cloud& points_out, Normals& normals_out); + + void waitAllDefaultStream(); + + void renderTangentColors(const Normals& normals, Image& image); + + void renderImage(const Depth& depth, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); + + void renderImage(const Cloud& points, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); + } + } +} diff --git a/modules/dynamicfusion/src/internal.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/internal.hpp similarity index 99% rename from modules/dynamicfusion/src/internal.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/internal.hpp index f964ea8e6a5..b1bb2f81658 100644 --- a/modules/dynamicfusion/src/internal.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/internal.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include "device_array.hpp" #include "safe_call.hpp" //#define USE_DEPTH namespace cv { diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/kernel_containers.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/kernel_containers.hpp similarity index 100% rename from modules/dynamicfusion/include/opencv2/kfusion/cuda/kernel_containers.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/kernel_containers.hpp diff --git a/modules/dynamicfusion/src/precomp.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/precomp.hpp similarity index 75% rename from modules/dynamicfusion/src/precomp.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/precomp.hpp index b2c485e3f4c..4fd760f19c8 100644 --- a/modules/dynamicfusion/src/precomp.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/precomp.hpp @@ -1,10 +1,10 @@ #pragma once -#include -#include -#include -#include -#include +#include +#include +#include "tsdf_volume.hpp" +#include "imgproc.hpp" +#include "projective_icp.hpp" #include "internal.hpp" #include #include "vector_functions.h" diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/projective_icp.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/projective_icp.hpp similarity index 97% rename from modules/dynamicfusion/include/opencv2/kfusion/cuda/projective_icp.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/projective_icp.hpp index 33854a5238f..b9b209ee482 100644 --- a/modules/dynamicfusion/include/opencv2/kfusion/cuda/projective_icp.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/projective_icp.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include namespace cv { diff --git a/modules/dynamicfusion/src/safe_call.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/safe_call.hpp similarity index 100% rename from modules/dynamicfusion/src/safe_call.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/safe_call.hpp diff --git a/modules/dynamicfusion/src/cuda/temp_utils.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/temp_utils.hpp similarity index 99% rename from modules/dynamicfusion/src/cuda/temp_utils.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/temp_utils.hpp index 3709b658e7f..bbf633b80cf 100644 --- a/modules/dynamicfusion/src/cuda/temp_utils.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/temp_utils.hpp @@ -1,7 +1,7 @@ #pragma once #include "cuda.h" -#include +#include "kernel_containers.hpp" namespace kfusion { diff --git 
a/modules/dynamicfusion/src/cuda/texture_binder.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/texture_binder.hpp similarity index 96% rename from modules/dynamicfusion/src/cuda/texture_binder.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/texture_binder.hpp index 0d3702199ef..1f438b02407 100644 --- a/modules/dynamicfusion/src/cuda/texture_binder.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/texture_binder.hpp @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include "device_array.hpp" +#include "safe_call.hpp" namespace kfusion { diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/tsdf_volume.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/tsdf_volume.hpp similarity index 96% rename from modules/dynamicfusion/include/opencv2/kfusion/cuda/tsdf_volume.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/tsdf_volume.hpp index 8a415c2a2ed..473cb2adb60 100644 --- a/modules/dynamicfusion/include/opencv2/kfusion/cuda/tsdf_volume.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/cuda/tsdf_volume.hpp @@ -1,7 +1,7 @@ #pragma once -#include -#include +#include +#include namespace cv { @@ -10,7 +10,7 @@ namespace cv class WarpField; namespace cuda { - class KF_EXPORTS TsdfVolume + class TsdfVolume { public: TsdfVolume(const cv::Vec3i& dims); diff --git a/modules/dynamicfusion/include/opencv2/dynamicfusion/kinfu.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/kinfu.hpp new file mode 100644 index 00000000000..3394ef486a5 --- /dev/null +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/kinfu.hpp @@ -0,0 +1,113 @@ +#ifndef DYNAMIC_FUSION_KINFU_HPP +#define DYNAMIC_FUSION_KINFU_HPP +//#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cv +{ + namespace kfusion + { + namespace cuda + { + int getCudaEnabledDeviceCount(); + void setDevice(int device); + std::string getDeviceName(int device); + bool checkIfPreFermiGPU(int device); + void printCudaDeviceInfo(int device); + void printShortCudaDeviceInfo(int device); + } + + struct KinFuParams + { + static KinFuParams default_params(); + static KinFuParams default_params_dynamicfusion(); + + int cols; //pixels + int rows; //pixels + + Intr intr; //Camera parameters + + Vec3i volume_dims; //number of voxels + Vec3f volume_size; //meters + Affine3f volume_pose; //meters, inital pose + + float bilateral_sigma_depth; //meters + float bilateral_sigma_spatial; //pixels + int bilateral_kernel_size; //pixels + + float icp_truncate_depth_dist; //meters + float icp_dist_thres; //meters + float icp_angle_thres; //radians + std::vector icp_iter_num; //iterations for level index 0,1,..,3 + + float tsdf_min_camera_movement; //meters, integrate only if exceedes + float tsdf_trunc_dist; //meters; + int tsdf_max_weight; //frames + + float raycast_step_factor; // in voxel sizes + float gradient_delta_factor; // in voxel sizes + + Vec3f light_pose; //meters + + }; + + class KinFu + { + public: + typedef cv::Ptr Ptr; + + KinFu(const KinFuParams& params); + + const KinFuParams& params() const; + KinFuParams& params(); + + const cuda::TsdfVolume& tsdf() const; + cuda::TsdfVolume& tsdf(); + + const cuda::ProjectiveICP& icp() const; + cuda::ProjectiveICP& icp(); + + const WarpField& getWarp() const; + WarpField& getWarp(); + + void reset(); + + bool operator()(const cuda::Depth& depth, const cuda::Image& image = cuda::Image()); + + void renderImage(cuda::Image& image, int flags = 0); + void 
dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame, cuda::Normals current_normals); + void renderImage(cuda::Image& image, const Affine3f& pose, int flags = 0); + void reprojectToDepth(); + + Affine3f getCameraPose (int time = -1) const; + private: + void allocate_buffers(); + + int frame_counter_; + KinFuParams params_; + + std::vector poses_; + + cuda::Dists dists_; + cuda::Frame curr_, prev_, first_; + + cuda::Cloud points_; + cuda::Normals normals_; + cuda::Depth depths_; + + cv::Ptr volume_; + cv::Ptr icp_; + cv::Ptr warp_; + std::vector, utils::DualQuaternion>> edges; + }; + } +} +#endif \ No newline at end of file diff --git a/modules/dynamicfusion/include/opencv2/kfusion/optimisation.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/optimisation.hpp similarity index 99% rename from modules/dynamicfusion/include/opencv2/kfusion/optimisation.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/optimisation.hpp index 5877dc14e0d..749aa46eb85 100644 --- a/modules/dynamicfusion/include/opencv2/kfusion/optimisation.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/optimisation.hpp @@ -2,7 +2,7 @@ #define KFUSION_OPTIMISATION_H #include "ceres/ceres.h" #include "ceres/rotation.h" -#include +#include namespace cv { diff --git a/modules/dynamicfusion/include/opencv2/kfusion/types.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/types.hpp similarity index 90% rename from modules/dynamicfusion/include/opencv2/kfusion/types.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/types.hpp index 07b6a12093a..915710619c6 100644 --- a/modules/dynamicfusion/include/opencv2/kfusion/types.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/types.hpp @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include @@ -17,7 +17,7 @@ namespace cv typedef cv::Vec3i Vec3i; typedef cv::Affine3f Affine3f; - struct KF_EXPORTS Intr + struct Intr { float fx, fy, cx, cy; @@ -26,7 +26,7 @@ namespace cv Intr operator()(int level_index) const; }; - KF_EXPORTS std::ostream& operator << (std::ostream& os, const Intr& intr); + std::ostream& operator << (std::ostream& os, const Intr& intr); struct Point { @@ -74,7 +74,7 @@ namespace cv inline float deg2rad (float alpha) { return alpha * 0.017453293f; } - struct KF_EXPORTS ScopeTime + struct ScopeTime { const char* name; double start; @@ -82,7 +82,7 @@ namespace cv ~ScopeTime(); }; - struct KF_EXPORTS SampledScopeTime + struct SampledScopeTime { public: enum { EACH = 33 }; diff --git a/modules/dynamicfusion/include/opencv2/utils/dual_quaternion.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/utils/dual_quaternion.hpp similarity index 99% rename from modules/dynamicfusion/include/opencv2/utils/dual_quaternion.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/utils/dual_quaternion.hpp index 83ed53c413a..96cc3ed7223 100644 --- a/modules/dynamicfusion/include/opencv2/utils/dual_quaternion.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/utils/dual_quaternion.hpp @@ -1,7 +1,7 @@ #ifndef DYNAMIC_FUSION_DUAL_QUATERNION_HPP #define DYNAMIC_FUSION_DUAL_QUATERNION_HPP #include -#include +#include //Adapted from https://github.com/Poofjunior/QPose /** diff --git a/modules/dynamicfusion/include/opencv2/utils/knn_point_cloud.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/utils/knn_point_cloud.hpp similarity index 98% rename from modules/dynamicfusion/include/opencv2/utils/knn_point_cloud.hpp rename to 
modules/dynamicfusion/include/opencv2/dynamicfusion/utils/knn_point_cloud.hpp index b72c51943b8..6bb3da24799 100644 --- a/modules/dynamicfusion/include/opencv2/utils/knn_point_cloud.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/utils/knn_point_cloud.hpp @@ -1,7 +1,7 @@ #ifndef KFUSION_KNN_POINfloat_CLOUD_HPP #define KFUSION_KNN_POINfloat_CLOUD_HPP -#include +#include namespace cv { namespace kfusion diff --git a/modules/dynamicfusion/include/opencv2/utils/quaternion.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/utils/quaternion.hpp similarity index 99% rename from modules/dynamicfusion/include/opencv2/utils/quaternion.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/utils/quaternion.hpp index 75784454ed1..8cf00a8b734 100644 --- a/modules/dynamicfusion/include/opencv2/utils/quaternion.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/utils/quaternion.hpp @@ -2,7 +2,7 @@ #define DYNAMIC_FUSION_QUATERNION_HPP #pragma once #include -#include +#include //Adapted from https://github.com/Poofjunior/QPose namespace cv { diff --git a/modules/dynamicfusion/include/opencv2/kfusion/warp_field.hpp b/modules/dynamicfusion/include/opencv2/dynamicfusion/warp_field.hpp similarity index 94% rename from modules/dynamicfusion/include/opencv2/kfusion/warp_field.hpp rename to modules/dynamicfusion/include/opencv2/dynamicfusion/warp_field.hpp index 0779953c93f..2218948056b 100644 --- a/modules/dynamicfusion/include/opencv2/kfusion/warp_field.hpp +++ b/modules/dynamicfusion/include/opencv2/dynamicfusion/warp_field.hpp @@ -5,11 +5,11 @@ * \brief * \details */ -#include -#include +#include +#include #include -#include -#include +#include +#include #define KNN_NEIGHBOURS 8 namespace cv diff --git a/modules/dynamicfusion/include/opencv2/io/capture.hpp b/modules/dynamicfusion/include/opencv2/io/capture.hpp deleted file mode 100644 index 4e6d6b5f8c2..00000000000 --- a/modules/dynamicfusion/include/opencv2/io/capture.hpp +++ /dev/null @@ -1,43 +0,0 @@ -#pragma once - -#include -#include - -namespace cv -{ - namespace kfusion - { - class KF_EXPORTS OpenNISource - { - public: - typedef kfusion::PixelRGB RGB24; - - enum { PROP_OPENNI_REGISTRATION_ON = 104 }; - - OpenNISource(); - OpenNISource(int device); - OpenNISource(const std::string& oni_filename, bool repeat = false); - - void open(int device); - void open(const std::string& oni_filename, bool repeat = false); - void release(); - - ~OpenNISource(); - - bool grab(cv::Mat &depth, cv::Mat &image); - - //parameters taken from camera/oni - int shadow_value, no_sample_value; - float depth_focal_length_VGA; - float baseline; // mm - double pixelSize; // mm - unsigned short max_depth; // mm - - bool setRegistration (bool value = false); - private: - struct Impl; - cv::Ptr impl_; - void getParams (); - }; - } -} diff --git a/modules/dynamicfusion/include/opencv2/kfusion/cuda/imgproc.hpp b/modules/dynamicfusion/include/opencv2/kfusion/cuda/imgproc.hpp deleted file mode 100644 index 7b2f5535ca6..00000000000 --- a/modules/dynamicfusion/include/opencv2/kfusion/cuda/imgproc.hpp +++ /dev/null @@ -1,38 +0,0 @@ -#pragma once - -#include - -namespace cv -{ - namespace kfusion - { - namespace cuda - { - KF_EXPORTS void depthBilateralFilter(const Depth& in, Depth& out, int ksz, float sigma_spatial, float sigma_depth); - - KF_EXPORTS void depthTruncation(Depth& depth, float threshold); - - KF_EXPORTS void depthBuildPyramid(const Depth& depth, Depth& pyramid, float sigma_depth); - - KF_EXPORTS void computeNormalsAndMaskDepth(const 
Intr& intr, Depth& depth, Normals& normals); - - KF_EXPORTS void computePointNormals(const Intr& intr, const Depth& depth, Cloud& points, Normals& normals); - - KF_EXPORTS void computeDists(const Depth& depth, Dists& dists, const Intr& intr); - - KF_EXPORTS void cloudToDepth(const Cloud& cloud, Depth& depth); - - KF_EXPORTS void resizeDepthNormals(const Depth& depth, const Normals& normals, Depth& depth_out, Normals& normals_out); - - KF_EXPORTS void resizePointsNormals(const Cloud& points, const Normals& normals, Cloud& points_out, Normals& normals_out); - - KF_EXPORTS void waitAllDefaultStream(); - - KF_EXPORTS void renderTangentColors(const Normals& normals, Image& image); - - KF_EXPORTS void renderImage(const Depth& depth, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); - - KF_EXPORTS void renderImage(const Cloud& points, const Normals& normals, const Intr& intr, const Vec3f& light_pose, Image& image); - } - } -} diff --git a/modules/dynamicfusion/include/opencv2/kfusion/exports.hpp b/modules/dynamicfusion/include/opencv2/kfusion/exports.hpp deleted file mode 100644 index 43fdf142d90..00000000000 --- a/modules/dynamicfusion/include/opencv2/kfusion/exports.hpp +++ /dev/null @@ -1,7 +0,0 @@ -#pragma once - -#if (defined WIN32 || defined _WIN32 || defined WINCE) && defined KFUSION_API_EXPORTS - #define KF_EXPORTS __declspec(dllexport) -#else - #define KF_EXPORTS -#endif diff --git a/modules/dynamicfusion/include/opencv2/kfusion/kinfu.hpp b/modules/dynamicfusion/include/opencv2/kfusion/kinfu.hpp deleted file mode 100644 index bf3cda7438a..00000000000 --- a/modules/dynamicfusion/include/opencv2/kfusion/kinfu.hpp +++ /dev/null @@ -1,113 +0,0 @@ -#ifndef DYNAMIC_FUSION_KINFU_HPP -#define DYNAMIC_FUSION_KINFU_HPP -//#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace cv -{ - namespace kfusion - { - namespace cuda - { - KF_EXPORTS int getCudaEnabledDeviceCount(); - KF_EXPORTS void setDevice(int device); - KF_EXPORTS std::string getDeviceName(int device); - KF_EXPORTS bool checkIfPreFermiGPU(int device); - KF_EXPORTS void printCudaDeviceInfo(int device); - KF_EXPORTS void printShortCudaDeviceInfo(int device); - } - - struct KF_EXPORTS KinFuParams - { - static KinFuParams default_params(); - static KinFuParams default_params_dynamicfusion(); - - int cols; //pixels - int rows; //pixels - - Intr intr; //Camera parameters - - Vec3i volume_dims; //number of voxels - Vec3f volume_size; //meters - Affine3f volume_pose; //meters, inital pose - - float bilateral_sigma_depth; //meters - float bilateral_sigma_spatial; //pixels - int bilateral_kernel_size; //pixels - - float icp_truncate_depth_dist; //meters - float icp_dist_thres; //meters - float icp_angle_thres; //radians - std::vector icp_iter_num; //iterations for level index 0,1,..,3 - - float tsdf_min_camera_movement; //meters, integrate only if exceedes - float tsdf_trunc_dist; //meters; - int tsdf_max_weight; //frames - - float raycast_step_factor; // in voxel sizes - float gradient_delta_factor; // in voxel sizes - - Vec3f light_pose; //meters - - }; - - class KF_EXPORTS KinFu - { - public: - typedef cv::Ptr Ptr; - - KinFu(const KinFuParams& params); - - const KinFuParams& params() const; - KinFuParams& params(); - - const cuda::TsdfVolume& tsdf() const; - cuda::TsdfVolume& tsdf(); - - const cuda::ProjectiveICP& icp() const; - cuda::ProjectiveICP& icp(); - - const WarpField& getWarp() const; - WarpField& getWarp(); - - void reset(); - - bool operator()(const 
cuda::Depth& depth, const cuda::Image& image = cuda::Image()); - - void renderImage(cuda::Image& image, int flags = 0); - void dynamicfusion(cuda::Depth& depth, cuda::Cloud current_frame, cuda::Normals current_normals); - void renderImage(cuda::Image& image, const Affine3f& pose, int flags = 0); - void reprojectToDepth(); - - Affine3f getCameraPose (int time = -1) const; - private: - void allocate_buffers(); - - int frame_counter_; - KinFuParams params_; - - std::vector poses_; - - cuda::Dists dists_; - cuda::Frame curr_, prev_, first_; - - cuda::Cloud points_; - cuda::Normals normals_; - cuda::Depth depths_; - - cv::Ptr volume_; - cv::Ptr icp_; - cv::Ptr warp_; - std::vector, utils::DualQuaternion>> edges; - }; - } -} -#endif \ No newline at end of file diff --git a/modules/dynamicfusion/samples/demo.cpp b/modules/dynamicfusion/samples/demo.cpp index 30b21d53bb2..4d8ab6cc0ba 100644 --- a/modules/dynamicfusion/samples/demo.cpp +++ b/modules/dynamicfusion/samples/demo.cpp @@ -2,7 +2,7 @@ #include #include #include - +#include using namespace cv; struct DynamicFusionApp { diff --git a/modules/dynamicfusion/src/core.cpp b/modules/dynamicfusion/src/core.cpp index 5a1a568cd22..7fefc23f212 100644 --- a/modules/dynamicfusion/src/core.cpp +++ b/modules/dynamicfusion/src/core.cpp @@ -1,5 +1,5 @@ -#include -#include "safe_call.hpp" +#include +#include #include #include diff --git a/modules/dynamicfusion/src/device_array.cpp b/modules/dynamicfusion/src/device_array.cpp new file mode 100644 index 00000000000..538fb9c56c3 --- /dev/null +++ b/modules/dynamicfusion/src/device_array.cpp @@ -0,0 +1,155 @@ +#include +///////////////////// implementations of DeviceArray //////////////////////////////////////////// +namespace cv +{ + namespace kfusion + { + namespace cuda + { + + + template + DeviceArray::DeviceArray() {} + + template + DeviceArray::DeviceArray(size_t size) : DeviceMemory(size * elem_size) {} + + template + DeviceArray::DeviceArray(T *ptr, size_t size) : DeviceMemory(ptr, size * elem_size) {} + + template + DeviceArray::DeviceArray(const DeviceArray &other) : DeviceMemory(other) {} + + template + DeviceArray &DeviceArray::operator=(const DeviceArray &other) { + DeviceMemory::operator=(other); + return *this; + } + + template + void DeviceArray::create(size_t size) { DeviceMemory::create(size * elem_size); } + + template + void DeviceArray::release() { DeviceMemory::release(); } + + template + void DeviceArray::copyTo(DeviceArray &other) const { DeviceMemory::copyTo(other); } + + template + void DeviceArray::upload(const T *host_ptr, size_t size) { DeviceMemory::upload(host_ptr, size * elem_size); } + + template + void DeviceArray::download(T *host_ptr) const { DeviceMemory::download(host_ptr); } + + template + void DeviceArray::swap(DeviceArray &other_arg) { DeviceMemory::swap(other_arg); } + + template + DeviceArray::operator T *() { return ptr(); } + + template + DeviceArray::operator const T *() const { return ptr(); } + + template + size_t DeviceArray::size() const { return sizeBytes() / elem_size; } + + template + T *DeviceArray::ptr() { return DeviceMemory::ptr(); } + + template + const T *DeviceArray::ptr() const { return DeviceMemory::ptr(); } + + template + template + void DeviceArray::upload(const std::vector &data) { upload(&data[0], data.size()); } + + template + template + void DeviceArray::download(std::vector &data) const { + data.resize(size()); + if (!data.empty()) download(&data[0]); + } + +///////////////////// implementations of DeviceArray2D 
//////////////////////////////////////////// + + template + DeviceArray2D::DeviceArray2D() {} + + template + DeviceArray2D::DeviceArray2D(int rows, int cols) : DeviceMemory2D(rows, cols * elem_size) {} + + template + DeviceArray2D::DeviceArray2D(int rows, int cols, void *data, size_t stepBytes) : DeviceMemory2D(rows, cols * + elem_size, + data, + stepBytes) {} + + template + DeviceArray2D::DeviceArray2D(const DeviceArray2D &other) : DeviceMemory2D(other) {} + + template + DeviceArray2D &DeviceArray2D::operator=(const DeviceArray2D &other) { + DeviceMemory2D::operator=(other); + return *this; + } + + template + void DeviceArray2D::create(int rows, int cols) { DeviceMemory2D::create(rows, cols * elem_size); } + + template + void DeviceArray2D::release() { DeviceMemory2D::release(); } + + template + void DeviceArray2D::copyTo(DeviceArray2D &other) const { DeviceMemory2D::copyTo(other); } + + template + void DeviceArray2D::upload(const void *host_ptr, size_t host_step, int rows, int cols) { + DeviceMemory2D::upload(host_ptr, host_step, rows, cols * elem_size); + } + + template + void DeviceArray2D::download(void *host_ptr, size_t host_step) const { + DeviceMemory2D::download(host_ptr, host_step); + } + + template + template + void DeviceArray2D::upload(const std::vector &data, int cols) { + upload(&data[0], cols * elem_size, data.size() / cols, cols); + } + + template + template + void DeviceArray2D::download(std::vector &data, int &elem_step) const { + elem_step = cols(); + data.resize(cols() * rows()); + if (!data.empty()) download(&data[0], colsBytes()); + } + + template + void DeviceArray2D::swap(DeviceArray2D &other_arg) { DeviceMemory2D::swap(other_arg); } + + template + T *DeviceArray2D::ptr(int y) { return DeviceMemory2D::ptr(y); } + + template + const T *DeviceArray2D::ptr(int y) const { return DeviceMemory2D::ptr(y); } + + template + DeviceArray2D::operator T *() { return ptr(); } + + template + DeviceArray2D::operator const T *() const { return ptr(); } + + template + int DeviceArray2D::cols() const { return DeviceMemory2D::colsBytes() / elem_size; } + + template + int DeviceArray2D::rows() const { return DeviceMemory2D::rows(); } + + template + size_t DeviceArray2D::elem_step() const { return DeviceMemory2D::step() / elem_size; } + + } + } +} \ No newline at end of file diff --git a/modules/dynamicfusion/src/device_memory.cpp b/modules/dynamicfusion/src/device_memory.cpp index 84e30fb3f91..abbed7465e1 100644 --- a/modules/dynamicfusion/src/device_memory.cpp +++ b/modules/dynamicfusion/src/device_memory.cpp @@ -1,5 +1,5 @@ -#include -#include "safe_call.hpp" +#include +#include #include #include #include diff --git a/modules/dynamicfusion/src/imgproc.cpp b/modules/dynamicfusion/src/imgproc.cpp index d5fedb60aa0..37867ff7575 100644 --- a/modules/dynamicfusion/src/imgproc.cpp +++ b/modules/dynamicfusion/src/imgproc.cpp @@ -1,5 +1,5 @@ -#include "precomp.hpp" -#include +#include +#include using namespace cv; /** * diff --git a/modules/dynamicfusion/src/cuda/imgproc.cu b/modules/dynamicfusion/src/imgproc.cu similarity index 99% rename from modules/dynamicfusion/src/cuda/imgproc.cu rename to modules/dynamicfusion/src/imgproc.cu index 511b629f09b..f512f3559ea 100644 --- a/modules/dynamicfusion/src/cuda/imgproc.cu +++ b/modules/dynamicfusion/src/imgproc.cu @@ -1,5 +1,5 @@ #include -#include "device.hpp" +#include //////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// Depth bilateral filter diff --git 
a/modules/dynamicfusion/src/kinfu.cpp b/modules/dynamicfusion/src/kinfu.cpp index 82491d9d3d0..3d44c6755bb 100644 --- a/modules/dynamicfusion/src/kinfu.cpp +++ b/modules/dynamicfusion/src/kinfu.cpp @@ -1,16 +1,13 @@ -#include "precomp.hpp" -#include "internal.hpp" +#include +#include #include -#include +#include #include -#include -#include -#include -#include -#include - -using namespace std; -using namespace cv; +#include +#include +#include +#include +#include static inline float deg2rad (float alpha) { return alpha * 0.017453293f; } /** @@ -194,7 +191,7 @@ void cv::kfusion::KinFu::allocate_buffers() void cv::kfusion::KinFu::reset() { if (frame_counter_) - cout << "Reset" << endl; + std::cout << "Reset" << std::endl; frame_counter_ = 0; poses_.clear(); @@ -393,13 +390,8 @@ void cv::kfusion::KinFu::dynamicfusion(cv::kfusion::cuda::Depth& depth, cv::kfus // getWarp().warp(warped, warped_normals); // //ScopeTime time("fusion"); - tsdf().surface_fusion(getWarp(), warped, canonical_visible, depth, camera_pose, params_.intr); + tsdf().surface_fusion(*warp_, warped, canonical_visible, depth, camera_pose, params_.intr); - cv::Mat depth_cloud(depth.rows(),depth.cols(), CV_16U); - depth.download(depth_cloud.ptr(), depth_cloud.step); - cv::Mat display; - depth_cloud.convertTo(display, CV_8U, 255.0/4000); - cv::viz::imshow("Depth diff", display); volume_->compute_points(); volume_->compute_normals(); } diff --git a/modules/dynamicfusion/src/precomp.cpp b/modules/dynamicfusion/src/precomp.cpp index 05703fecf4a..5b351f05825 100644 --- a/modules/dynamicfusion/src/precomp.cpp +++ b/modules/dynamicfusion/src/precomp.cpp @@ -1,5 +1,5 @@ -#include "precomp.hpp" -#include "internal.hpp" +#include +#include using namespace cv; //////////////////////////////////////////////////////////////////////////////////////////////////////////////// /// Kinfu/types implementation diff --git a/modules/dynamicfusion/src/cuda/proj_icp.cu b/modules/dynamicfusion/src/proj_icp.cu similarity index 99% rename from modules/dynamicfusion/src/cuda/proj_icp.cu rename to modules/dynamicfusion/src/proj_icp.cu index 159ab4e3b04..a07f7e0a277 100644 --- a/modules/dynamicfusion/src/cuda/proj_icp.cu +++ b/modules/dynamicfusion/src/proj_icp.cu @@ -1,5 +1,5 @@ -#include "device.hpp" -#include "texture_binder.hpp" +#include +#include namespace kfusion diff --git a/modules/dynamicfusion/src/projective_icp.cpp b/modules/dynamicfusion/src/projective_icp.cpp index 0e1e4f8f18c..0411133486e 100644 --- a/modules/dynamicfusion/src/projective_icp.cpp +++ b/modules/dynamicfusion/src/projective_icp.cpp @@ -1,4 +1,4 @@ -#include "precomp.hpp" +#include using namespace cv::kfusion; diff --git a/modules/dynamicfusion/src/tsdf_volume.cpp b/modules/dynamicfusion/src/tsdf_volume.cpp index eb0bac2f31d..a4e93de1faa 100644 --- a/modules/dynamicfusion/src/tsdf_volume.cpp +++ b/modules/dynamicfusion/src/tsdf_volume.cpp @@ -1,9 +1,9 @@ -#include -#include -#include +#include +#include +#include #include #include -#include "precomp.hpp" +#include #include using namespace cv::kfusion; using namespace cv::kfusion::cuda; diff --git a/modules/dynamicfusion/src/cuda/tsdf_volume.cu b/modules/dynamicfusion/src/tsdf_volume.cu similarity index 99% rename from modules/dynamicfusion/src/cuda/tsdf_volume.cu rename to modules/dynamicfusion/src/tsdf_volume.cu index bf41284a78d..a3cee59091c 100644 --- a/modules/dynamicfusion/src/cuda/tsdf_volume.cu +++ b/modules/dynamicfusion/src/tsdf_volume.cu @@ -1,7 +1,7 @@ #include -#include "device.hpp" -#include "texture_binder.hpp" 
-#include "../internal.hpp" +#include +#include +#include #include "math_constants.h" using namespace kfusion::device; diff --git a/modules/dynamicfusion/src/warp_field.cpp b/modules/dynamicfusion/src/warp_field.cpp index 7e388bae620..cf217dad449 100644 --- a/modules/dynamicfusion/src/warp_field.cpp +++ b/modules/dynamicfusion/src/warp_field.cpp @@ -1,11 +1,11 @@ -#include -#include -#include #include -#include -#include "internal.hpp" -#include "precomp.hpp" -#include +#include +#include +#include +#include +#include +#include +#include using namespace cv::kfusion; using namespace cv;