diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml index e1759847..18c4d945 100644 --- a/.github/workflows/CI.yml +++ b/.github/workflows/CI.yml @@ -134,7 +134,7 @@ jobs: - name: Checkout hpddm run: | git clone https://github.com/hpddm/hpddm.git hpddm - cd hpddm && git checkout 5890d5addf3962d539dc25c441ec3ff4af93b3ab + cd hpddm && git checkout 24aed69dbde7ef1526ae87ccf4f39ceb840bccea # uses: actions/checkout@v3 # with: # path: "hpddm" @@ -169,12 +169,12 @@ jobs: make doc - name: Check c++ format - uses: DoozyX/clang-format-lint-action@v0.16.2 + uses: DoozyX/clang-format-lint-action@v0.20 with: source: 'htool/include htool/tests' # exclude: './third_party ./external' extensions: 'hpp,cpp' - clangFormatVersion: 16 + clangFormatVersion: 20 style: file - name: Check cmake format @@ -216,9 +216,9 @@ jobs: fetch-depth: 0 - uses: actions/download-artifact@v4 - name: Upload coverage report - uses: codecov/codecov-action@v4 + uses: codecov/codecov-action@v5 with: fail_ci_if_error: true - file: ./coverage.info + files: ./coverage.info token: ${{ secrets.CODECOV_TOKEN }} verbose: true diff --git a/CHANGELOG.md b/CHANGELOG.md index 699dbfdc..f31ca301 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -26,9 +26,41 @@ All notable changes to this project will be documented in this file. ## Unreleased +### Added + +- `HMatrix` recompression with SVD. +- Generic recompressed low-rank compression with `RecompressedLowRankGenerator`. +- Checks about `UPLO` for hmatrix factorization. +- `HMatrixBuilder` for easier `HMatrix` creation (especially when using only the hmatrix part of Htool-DDM). +- `add_hmatrix_vector_product` and `add_hmatrix_matrix_product` for working in user numbering. For C++17 and onward, these functions have preliminary support for execution policies with default being sequential execution. This tries to some extent to follow `` API. +- `task_based_tree_builder.hpp` for miscellaneous functions used for task based approach. +- `hmatrix_output_dot.hpp` for L0 and block tree visualization. +- `task_based_compute_blocks` for task based alternative to `compute_blocks`. +- `task_based_internal_add_hmatrix_vector_product` for task based alternative to `internal_add_hmatrix_vector_product`. +- `task_based_internal_add_hmatrix_hmatrix_product` for task based alternative to `internal_add_hmatrix_hmatrix_product`. +- `task_based_internal_triangular_hmatrix_hmatrix_solve` for task based alternative to `internal_triangular_hmatrix_hmatrix_solve`. +- `task_based_lu_factorization` and `task_based_cholesky_factorization` for task based alternatives to `lu_factorization` and `cholesky_factorization`. +- `test_task_based_hmatrix_***.hpp` for testing various task based features. +- `internal_add_lrmat_hmatrix` is now overloaded to handle the case where the HMatrix is larger than the LowRankMatrix. +- `get_leaves_from` is overloaded to return non const arguments. +- `get_false_positive` and `get_L0` in a tree builder. +- `left_hmatrix_ancestor_of_right_hmatrix` and `left_hmatrix_descendant_of_right_hmatrix` for returning parent and children of a hmatrix. + +### Changed + +- `VirtualInternalLowRankGenerator` and `VirtualLowRankGenerator`'s `copy_low_rank_approximation` function takes a `LowRankMatrix` as input to populate it and returns a boolean. The return value is true if the compression succeded, false otherwise. +- `LowRankMatrix` constructors changed. It only takes sizes and an epsilon or a required rank. Then, it is expected to call a `VirtualInternalLowRankGenerator` to populate it. +- `ClusterTreeBuilder` has now one strategy as `VirtualPartitioning`. Usual implementations are still available, for example using `Partitioning`. +- When using `ClusterTreeBuilder` with `number_of_children=2^spatial_dimension`, it will do a binary/quad/octo-tree instead of `number_of_children` cut along the main direction. +- `ClusterTreeBuilder` parameter `minclustersize` was removed, and a parameter `maximal_leaf_size` has been added. +- `build` can now use `task_based_compute_blocks` + ### Fixed -- fix inline definition of `logging_level_to_string` +- Fix inline definition of `logging_level_to_string`. +- Fix error when resizing `Matrix`. +- Fix error due to using `int` instead of `size_t`, thanks to @vdubos. +- Fix warnings with `-Wold-style-cast`. ## [0.9.0] - 2024-09-19 diff --git a/CMakeLists.txt b/CMakeLists.txt index 206675a2..c54055f0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,16 +16,6 @@ else() LANGUAGES CXX) endif() -# To force c++14 -if(${CMAKE_VERSION} VERSION_LESS 3.1) - add_compile_options(-std=c++14) -elseif(${CMAKE_VERSION} VERSION_LESS 3.6.3 AND ${CMAKE_CXX_COMPILER_ID} STREQUAL "Intel") - add_compile_options(-std=c++14) -else() - set(CMAKE_CXX_STANDARD 14) - set(CMAKE_CXX_STANDARD_REQUIRED ON) -endif() - if(CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) # To set default CMAKE_BUILD_TYPE set(default_build_type "Release") @@ -94,23 +84,21 @@ option(HTOOL_WITH_STRICT_TESTS "Add -Werror to the tests" OFF) #=============================================================================# # MPI find_package(MPI REQUIRED) +message(STATUS "MPI libraries:" "${MPI_LIBRARIES}") +message(STATUS "MPI include files:" "${MPI_INCLUDE_PATH}") +message(STATUS "Tests run: ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ${MPIEXEC_PREFLAGS} EXECUTABLE ${MPIEXEC_POSTFLAGS} ARGS") separate_arguments(MPIEXEC_PREFLAGS) # to support multi flags -message(STATUS "Run: ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} ${MPIEXEC_MAX_NUMPROCS} ${MPIEXEC_PREFLAGS} EXECUTABLE ${MPIEXEC_POSTFLAGS} ARGS") # OPENMP find_package(OpenMP) # BLAS find_package(BLAS REQUIRED) -message("-- Found Blas implementation:" "${BLAS_LIBRARIES}") +message(STATUS "Blas implementation:" "${BLAS_LIBRARIES}") # LAPACK find_package(LAPACK) -message("-- Found Lapack:" "${LAPACK_LIBRARIES}") - -# # ARPACK -# find_package(ARPACK) -# message("-- Found Arpack:" "${ARPACK_LIBRARIES}") +message(STATUS "Lapack implementation:" "${LAPACK_LIBRARIES}") # HPDDM find_package(HPDDM) @@ -120,12 +108,12 @@ find_package(HPDDM) #=============================================================================# #=== HTOOL as header only library add_library(htool INTERFACE) -target_include_directories(htool INTERFACE $ $ ${MPI_INCLUDE_PATH} ${HPDDM_INCLUDE_DIRS} ${MKL_INC_DIR}) -target_link_libraries(htool INTERFACE MPI::MPI_CXX ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} ${ARPACK_LIBRARIES}) +target_include_directories(htool INTERFACE $ $ ${HPDDM_INCLUDE_DIRS} ${MKL_INC_DIR}) +target_link_libraries(htool INTERFACE MPI::MPI_CXX BLAS::BLAS LAPACK::LAPACK) if(OpenMP_CXX_FOUND) target_link_libraries(htool INTERFACE OpenMP::OpenMP_CXX) endif() -target_compile_features(htool INTERFACE cxx_std_11) +target_compile_features(htool INTERFACE cxx_std_17) if("${BLA_VENDOR}" STREQUAL "Intel10_32" OR "${BLA_VENDOR}" STREQUAL "Intel10_64lp" @@ -184,9 +172,18 @@ endif() # Add tests if((CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) AND BUILD_TESTING) - if(CODE_COVERAGE AND (CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU")) - target_compile_options(htool INTERFACE -fprofile-arcs -ftest-coverage) - target_link_libraries(htool INTERFACE gcov) + if(CODE_COVERAGE) + if(CMAKE_C_COMPILER_ID MATCHES "GNU" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") + message(STATUS "Code coverage enabled with GNU.") + target_compile_options(htool INTERFACE -fprofile-arcs -ftest-coverage -fno-elide-constructors -fno-default-inline) + target_link_libraries(htool INTERFACE gcov) + elseif(CMAKE_C_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "Clang") + message(STATUS "Code coverage enabled with LLVM.") + target_compile_options(htool INTERFACE -fprofile-instr-generate -fcoverage-mapping) + target_link_options(htool INTERFACE -fprofile-instr-generate -fcoverage-mapping) + else() + message(STATUS "Code coverage not available.") + endif() endif() if(HTOOL_WITH_STRICT_TESTS) target_compile_options(htool INTERFACE -Werror) @@ -198,7 +195,7 @@ if((CMAKE_PROJECT_NAME STREQUAL PROJECT_NAME) AND BUILD_TESTING) -Wshadow -Wnon-virtual-dtor -pedantic - # -Wold-style-cast + -Wold-style-cast -Wcast-align -Wunused -Woverloaded-virtual diff --git a/cmake/version.cmake b/cmake/version.cmake index ccf9a90b..e9064cab 100644 --- a/cmake/version.cmake +++ b/cmake/version.cmake @@ -36,7 +36,7 @@ function(check_version_number CODE_VERSION_FILE CODE_VARIABLE_MAJOR_VERSION CODE set(code_subminor_version_number ${CMAKE_MATCH_1}) set(code_version_number "${code_major_version_number}.${code_minor_version_number}.${code_subminor_version_number}") - message("Htool version: " "${code_version_number}") + message(STATUS "Htool version: " "${code_version_number}") # Check version number: error if code unconsistent if(NOT "${code_version_number}" STREQUAL "${CMAKE_PROJECT_VERSION}") message(FATAL_ERROR "Inconsistent version number:\n* Source code version number: ${code_version_number}\n* CMake version number: ${CMAKE_PROJECT_VERSION}\n") diff --git a/examples/compression_comparison.cpp b/examples/compression_comparison.cpp index f6161fa6..c13965ca 100644 --- a/examples/compression_comparison.cpp +++ b/examples/compression_comparison.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include using namespace std; @@ -96,43 +97,56 @@ int main(int argc, char *argv[]) { MyMatrix A(spatial_dimension, target_cluster.get_permutation(), source_cluster.get_permutation(), target_coordinates, source_coordinates); double norm_A = A.normFrob(); - // SVD with fixed rank - SVD compressor_SVD; - LowRankMatrix A_SVD(A, compressor_SVD, target_cluster, source_cluster, reqrank_max, epsilon); + // SVD + SVD compressor_SVD(A); + LowRankMatrix A_SVD(target_cluster.get_size(), source_cluster.get_size(), epsilon); + compressor_SVD.copy_low_rank_approximation(target_cluster.get_size(), source_cluster.get_size(), target_cluster.get_offset(), source_cluster.get_offset(), A_SVD); std::vector SVD_fixed_errors; for (int k = 0; k < A_SVD.rank_of() + 1; k++) { SVD_fixed_errors.push_back(Frobenius_absolute_error(target_cluster, source_cluster, A_SVD, A, k) / norm_A); } - // fullACA with fixed rank - fullACA compressor_fullACA; - LowRankMatrix A_fullACA_fixed(A, compressor_fullACA, target_cluster, source_cluster, reqrank_max, epsilon); + // fullACA + fullACA compressor_fullACA(A); + LowRankMatrix A_fullACA_fixed(target_cluster.get_size(), source_cluster.get_size(), epsilon); + compressor_fullACA.copy_low_rank_approximation(target_cluster.get_size(), source_cluster.get_size(), target_cluster.get_offset(), source_cluster.get_offset(), A_fullACA_fixed); std::vector fullACA_fixed_errors; for (int k = 0; k < A_fullACA_fixed.rank_of() + 1; k++) { fullACA_fixed_errors.push_back(Frobenius_absolute_error(target_cluster, source_cluster, A_fullACA_fixed, A, k) / norm_A); } - // partialACA with fixed rank - partialACA compressor_partialACA; - LowRankMatrix A_partialACA_fixed(A, compressor_partialACA, target_cluster, source_cluster, reqrank_max, epsilon); + // partialACA + partialACA compressor_partialACA(A); + LowRankMatrix A_partialACA_fixed(target_cluster.get_size(), source_cluster.get_size(), epsilon); + compressor_partialACA.copy_low_rank_approximation(target_cluster.get_size(), source_cluster.get_size(), target_cluster.get_offset(), source_cluster.get_offset(), A_partialACA_fixed); std::vector partialACA_fixed_errors; for (int k = 0; k < A_partialACA_fixed.rank_of() + 1; k++) { partialACA_fixed_errors.push_back(Frobenius_absolute_error(target_cluster, source_cluster, A_partialACA_fixed, A, k) / norm_A); } - // sympartialACA with fixed rank - sympartialACA compressor_sympartialACA; - LowRankMatrix A_sympartialACA_fixed(A, compressor_sympartialACA, target_cluster, source_cluster, reqrank_max, epsilon); + // sympartialACA + sympartialACA compressor_sympartialACA(A); + LowRankMatrix A_sympartialACA_fixed(target_cluster.get_size(), source_cluster.get_size(), epsilon); + compressor_sympartialACA.copy_low_rank_approximation(target_cluster.get_size(), source_cluster.get_size(), target_cluster.get_offset(), source_cluster.get_offset(), A_sympartialACA_fixed); std::vector sympartialACA_fixed_errors; for (int k = 0; k < A_sympartialACA_fixed.rank_of() + 1; k++) { sympartialACA_fixed_errors.push_back(Frobenius_absolute_error(target_cluster, source_cluster, A_sympartialACA_fixed, A, k) / norm_A); } + // sympartialACA with recompression + RecompressedLowRankGenerator compressor_recompressed_sympartialACA(compressor_sympartialACA, std::function &)>(SVD_recompression)); + LowRankMatrix A_recompressed_sympartialACA_fixed(target_cluster.get_size(), source_cluster.get_size(), epsilon); + compressor_recompressed_sympartialACA.copy_low_rank_approximation(target_cluster.get_size(), source_cluster.get_size(), target_cluster.get_offset(), source_cluster.get_offset(), A_recompressed_sympartialACA_fixed); + std::vector recompressed_sympartialACA_fixed_errors; + for (int k = 0; k < A_recompressed_sympartialACA_fixed.rank_of() + 1; k++) { + recompressed_sympartialACA_fixed_errors.push_back(Frobenius_absolute_error(target_cluster, source_cluster, A_recompressed_sympartialACA_fixed, A, k) / norm_A); + } + // Output ofstream file_fixed((outputpath + "/" + outputfile).c_str()); - file_fixed << "Rank,SVD,Full ACA,partial ACA,sym partial ACA" << endl; + file_fixed << "Rank,SVD,Full ACA,partial ACA,sym partial ACA,recompressed sym partial ACA" << endl; for (int i = 0; i < reqrank_max; i++) { - file_fixed << i << "," << SVD_fixed_errors[i] << "," << fullACA_fixed_errors[i] << "," << partialACA_fixed_errors[i] << "," << sympartialACA_fixed_errors[i] << endl; + file_fixed << i << "," << SVD_fixed_errors[i] << "," << fullACA_fixed_errors[i] << "," << partialACA_fixed_errors[i] << "," << sympartialACA_fixed_errors[i] << "," << recompressed_sympartialACA_fixed_errors[i] << endl; } // Finalize the MPI environment. diff --git a/examples/smallest_example.cpp b/examples/smallest_example.cpp index 2696c1c3..3a5092cb 100644 --- a/examples/smallest_example.cpp +++ b/examples/smallest_example.cpp @@ -91,7 +91,7 @@ int main(int argc, char *argv[]) { // Distributed operator char symmetry = 'S'; char UPLO = 'U'; - DefaultApproximationBuilder default_approximation_builder(A, cluster, cluster, epsilon, eta, symmetry, UPLO, MPI_COMM_WORLD); + DefaultApproximationBuilder default_approximation_builder(A, cluster, cluster, HMatrixTreeBuilder(epsilon, eta, symmetry, UPLO), MPI_COMM_WORLD); DistributedOperator &distributed_operator = default_approximation_builder.distributed_operator; // Matrix vector product diff --git a/examples/smallest_example.sh b/examples/smallest_example.sh index 2302171c..129cfede 100755 --- a/examples/smallest_example.sh +++ b/examples/smallest_example.sh @@ -18,4 +18,4 @@ mpirun -np 2 ./examples/smallest_example ${outputpath} -# python3 ../tools/plot_hmatrix.py --inputfile ../output/examples/smallest_example/smallest_example_plot --sizeWorld 2 +# python3 ../tools/plot_hmatrix.py --inputfile ../output/examples/smallest_example/local_hmatrix_0.csv diff --git a/examples/visucluster.cpp b/examples/visucluster.cpp index 7d061825..b981adfc 100644 --- a/examples/visucluster.cpp +++ b/examples/visucluster.cpp @@ -1,3 +1,4 @@ +#include #include #include @@ -28,7 +29,7 @@ int main(int argc, char *argv[]) { Cluster cluster = recursive_build_strategy.create_cluster_tree(size, spatial_dimension, p.data(), 2, 2); // Output - save_clustered_geometry(cluster, 3, p.data(), outputname + "/clustering_output", {1, 2, 3}); + save_clustered_geometry(cluster, spatial_dimension, p.data(), outputname + "/clustering_output", {1, 2, 3}); return 0; } diff --git a/include/htool/basic_types/vector.hpp b/include/htool/basic_types/vector.hpp index e19266d1..d27e5002 100644 --- a/include/htool/basic_types/vector.hpp +++ b/include/htool/basic_types/vector.hpp @@ -133,8 +133,8 @@ int vector_to_bytes(const std::vector vect, const std::string &file) { return 1; // LCOV_EXCL_LINE } int size = vect.size(); - out.write((char *)(&size), sizeof(int)); - out.write((char *)&(vect[0]), size * sizeof(T)); + out.write(reinterpret_cast(&size), sizeof(int)); + out.write(reinterpret_cast(vect.data()), size * sizeof(T)); out.close(); return 0; @@ -151,9 +151,9 @@ int bytes_to_vector(std::vector &vect, const std::string &file) { } int size = 0; - in.read((char *)(&size), sizeof(int)); + in.read(reinterpret_cast(&size), sizeof(int)); vect.resize(size); - in.read((char *)&(vect[0]), size * sizeof(T)); + in.read(reinterpret_cast(vect.data()), size * sizeof(T)); in.close(); return 0; diff --git a/include/htool/clustering/cluster_node.hpp b/include/htool/clustering/cluster_node.hpp index ae6adcb6..1e09a23c 100644 --- a/include/htool/clustering/cluster_node.hpp +++ b/include/htool/clustering/cluster_node.hpp @@ -62,7 +62,7 @@ class Cluster : public TreeNode, ClusterTreeDatam_tree_data->m_max_depth; } unsigned int get_minimal_depth() const { return this->m_tree_data->m_min_depth; } - unsigned int get_minclustersize() const { return this->m_tree_data->m_minclustersize; } + unsigned int get_maximal_leaf_size() const { return this->m_tree_data->m_maximal_leaf_size; } const std::vector *> &get_clusters_on_partition() const { return this->m_tree_data->m_clusters_on_partition; } const Cluster &get_cluster_on_partition(size_t index) const { return *this->m_tree_data->m_clusters_on_partition[index]; @@ -75,7 +75,7 @@ class Cluster : public TreeNode, ClusterTreeDatam_tree_data->m_is_permutation_local = is_permutation_local; } void set_minimal_depth(unsigned int minimal_depth) { this->m_tree_data->m_min_depth = minimal_depth; } void set_maximal_depth(unsigned int maximal_depth) { this->m_tree_data->m_max_depth = maximal_depth; } - void set_minclustersize(unsigned int minclustersize) { this->m_tree_data->m_minclustersize = minclustersize; } + void set_maximal_leaf_size(unsigned int maximal_leaf_size) { this->m_tree_data->m_maximal_leaf_size = maximal_leaf_size; } // Operator overloading bool operator==(const Cluster &rhs) const { return this->get_offset() == rhs.get_offset() && this->get_size() == rhs.get_size() && this->m_tree_data == rhs.m_tree_data && this->get_depth() == rhs.get_depth() && this->get_counter() == rhs.get_counter(); } diff --git a/include/htool/clustering/cluster_output.hpp b/include/htool/clustering/cluster_output.hpp index d1af4b4b..2b4b33a9 100644 --- a/include/htool/clustering/cluster_output.hpp +++ b/include/htool/clustering/cluster_output.hpp @@ -34,7 +34,7 @@ void save_cluster_tree(const Cluster &cluster, std::string // Cluster tree properties std::ofstream output_permutation(filename + "_cluster_tree_properties.csv"); - output_permutation << "minclustersize: " << cluster.get_minclustersize() << "\n"; + output_permutation << "maximal leaf size: " << cluster.get_maximal_leaf_size() << "\n"; output_permutation << "maximal depth: " << cluster.get_maximal_depth() << "\n"; output_permutation << "minimal depth: " << cluster.get_minimal_depth() << "\n"; output_permutation << "permutation: "; @@ -128,7 +128,7 @@ Cluster read_cluster_tree(std::string file_cluster_tree_pr std::getline(input_permutation, line); splitted_string = split(line, " "); - root_cluster.set_minclustersize(std::stoul(splitted_string.back())); + root_cluster.set_maximal_leaf_size(std::stoul(splitted_string.back())); std::getline(input_permutation, line); splitted_string = split(line, " "); root_cluster.set_maximal_depth(std::stoi(splitted_string.back())); diff --git a/include/htool/clustering/cluster_tree_data.hpp b/include/htool/clustering/cluster_tree_data.hpp index 3c3551bd..8fede045 100644 --- a/include/htool/clustering/cluster_tree_data.hpp +++ b/include/htool/clustering/cluster_tree_data.hpp @@ -13,7 +13,7 @@ class Cluster; template struct ClusterTreeData { // Parameters - unsigned int m_minclustersize{10}; // minimal number of geometric point in a cluster + unsigned int m_maximal_leaf_size{10}; // minimal number of geometric point in a cluster // Information unsigned int m_max_depth{std::numeric_limits::min()}; // maximum depth of the tree diff --git a/include/htool/clustering/clustering.hpp b/include/htool/clustering/clustering.hpp deleted file mode 100644 index a6a4f20c..00000000 --- a/include/htool/clustering/clustering.hpp +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef HTOOL_CLUSTERING_HPP -#define HTOOL_CLUSTERING_HPP - -#include "cluster_node.hpp" -#include "cluster_output.hpp" -#include "tree_builder/tree_builder.hpp" - -#endif diff --git a/include/htool/clustering/implementations/partitioning.hpp b/include/htool/clustering/implementations/partitioning.hpp new file mode 100644 index 00000000..d47a6a6c --- /dev/null +++ b/include/htool/clustering/implementations/partitioning.hpp @@ -0,0 +1,211 @@ +#ifndef HTOOL_CLUSTERING_PARTITIONING_HPP +#define HTOOL_CLUSTERING_PARTITIONING_HPP + +#include "../../basic_types/vector.hpp" // for Matrix +#include "../../matrix/matrix.hpp" // for vector operations +#include "../../misc/evp.hpp" // for solve_EVP_2, solve_EVP_3 +#include "../interfaces/virtual_partitioning.hpp" // for VirtualPartitioning +#include +namespace htool { + +template +class Partitioning : public VirtualPartitioning { + public: + std::vector> compute_partitioning(Cluster ¤t_cluster, int spatial_dimension, const CoordinatePrecision *coordinates, const CoordinatePrecision *const radii, const CoordinatePrecision *const weights, int number_of_partitions) override { + + std::vector &permutation = current_cluster.get_permutation(); + auto current_offset = current_cluster.get_offset(); + auto current_size = current_cluster.get_size(); + + // Compute directions + Matrix directions; + directions = ComputationDirectionPolicy::compute_direction(current_cluster, spatial_dimension, coordinates, radii, weights); + + // If number of partitions corresponds to 2^d, we do binary/quadtree/octree/... + if (number_of_partitions == pow(2, spatial_dimension)) { + std::vector dimensions(spatial_dimension); + std::iota(dimensions.begin(), dimensions.end(), int(0)); + std::stack> stack; + stack.push(std::make_tuple(current_cluster.get_offset(), current_cluster.get_size(), 0)); + std::vector> result; + while (!stack.empty()) { + auto tmp_partition = stack.top(); + stack.pop(); + auto tmp_offset = std::get<0>(tmp_partition); + auto tmp_size = std::get<1>(tmp_partition); + auto tmp_dimension = std::get<2>(tmp_partition); + + std::vector direction = directions.get_col(tmp_dimension); + + std::sort(permutation.begin() + tmp_offset, permutation.begin() + tmp_offset + tmp_size, [&](int a, int b) { + CoordinatePrecision c = std::inner_product(coordinates + spatial_dimension * a, coordinates + spatial_dimension * (1 + a), direction.data(), CoordinatePrecision(0)); + CoordinatePrecision d = std::inner_product(coordinates + spatial_dimension * b, coordinates + spatial_dimension * (1 + b), direction.data(), CoordinatePrecision(0)); + return c < d; + }); + + auto tmp = SplittingPolicy::splitting(tmp_offset, tmp_size, spatial_dimension, coordinates, current_cluster.get_permutation(), direction, 2); + + if ((tmp_dimension < spatial_dimension - 1) and tmp.size() == 2) { + stack.push(std::make_tuple(tmp[1].first, tmp[1].second, tmp_dimension + 1)); + stack.push(std::make_tuple(tmp[0].first, tmp[0].second, tmp_dimension + 1)); + } else if ((tmp_dimension == spatial_dimension - 1) and tmp.size() == 2) { + result.insert(result.end(), tmp.begin(), tmp.end()); + } else { + break; + } + } + + if (result.size() == number_of_partitions) { + std::sort(result.begin(), result.end(), [](auto a, auto b) { return a.first < b.first; }); + return result; + } + } + + // Sort along main direction + std::sort(permutation.begin() + current_offset, permutation.begin() + current_offset + current_size, [&](int a, int b) { + CoordinatePrecision c = std::inner_product(coordinates + spatial_dimension * a, coordinates + spatial_dimension * (1 + a), directions.data(), CoordinatePrecision(0)); + CoordinatePrecision d = std::inner_product(coordinates + spatial_dimension * b, coordinates + spatial_dimension * (1 + b), directions.data(), CoordinatePrecision(0)); + return c < d; + }); + + // Split along main direction + return SplittingPolicy::splitting(current_cluster.get_offset(), current_cluster.get_size(), spatial_dimension, coordinates, current_cluster.get_permutation(), directions.get_col(0), number_of_partitions); + } +}; + +template +class ComputeLargestExtent { + public: + static Matrix compute_direction(const Cluster &cluster, int spatial_dimension, const T *const coordinates, const T *const, const T *const weights) { + if (spatial_dimension != 2 && spatial_dimension != 3) { + htool::Logger::get_instance().log(LogLevel::ERROR, "clustering not define for spatial dimension !=2 and !=3"); // LCOV_EXCL_LINE + } + + const std::vector &permutation = cluster.get_permutation(); + Matrix cov(spatial_dimension, spatial_dimension); + Matrix directions(spatial_dimension, spatial_dimension); + + for (int j = 0; j < cluster.get_size(); j++) { + std::vector u(spatial_dimension, 0); + for (int p = 0; p < spatial_dimension; p++) { + u[p] = coordinates[spatial_dimension * permutation[j + cluster.get_offset()] + p] - cluster.get_center()[p]; + } + + for (int p = 0; p < spatial_dimension; p++) { + for (int q = 0; q < spatial_dimension; q++) { + cov(p, q) += weights[permutation[j + cluster.get_offset()]] * u[p] * u[q]; + } + } + } + if (spatial_dimension == 2) { + directions = solve_EVP_2(cov); + } else if (spatial_dimension == 3) { + directions = solve_EVP_3(cov); + } + return directions; + } +}; + +template +class ComputeBoundingBox { + public: + static Matrix compute_direction(const Cluster &cluster, int spatial_dimension, const T *const coordinates, const T *const, const T *const) { + + const std::vector &permutation = cluster.get_permutation(); + + // min max for each axis + std::vector min_point(spatial_dimension, std::numeric_limits::max()); + std::vector max_point(spatial_dimension, std::numeric_limits::min()); + for (int j = 0; j < cluster.get_size(); j++) { + std::vector u(spatial_dimension, 0); + for (int p = 0; p < spatial_dimension; p++) { + if (min_point[p] > coordinates[spatial_dimension * permutation[j + cluster.get_offset()] + p]) { + min_point[p] = coordinates[spatial_dimension * permutation[j + cluster.get_offset()] + p]; + } + if (max_point[p] < coordinates[spatial_dimension * permutation[j + cluster.get_offset()] + p]) { + max_point[p] = coordinates[spatial_dimension * permutation[j + cluster.get_offset()] + p]; + } + u[p] = coordinates[spatial_dimension * permutation[j + cluster.get_offset()] + p] - cluster.get_center()[p]; + } + } + + // Direction of largest extent + std::vector indexes(spatial_dimension); + std::iota(indexes.begin(), indexes.end(), int(0)); + std::sort(indexes.begin(), indexes.end(), [&max_point, &min_point](int a, int b) { return (max_point[a] - min_point[a]) < (max_point[b] - min_point[b]); }); + Matrix directions(spatial_dimension, spatial_dimension); + for (int dim = 0; dim < spatial_dimension; dim++) { + directions(indexes[spatial_dimension - 1 - dim], dim) = 1; + } + return directions; + } +}; + +template +class RegularSplitting { + public: + static std::vector> splitting(int offset, int size, int, const T *const, const std::vector &, const std::vector &, int number_of_partition) { + + std::vector> current_partition(number_of_partition); + + // Children + int children_size = int(size / number_of_partition); + + for (int p = 0; p < number_of_partition - 1; p++) { + current_partition[p] = std::pair(offset + children_size * p, children_size); + } + current_partition.back() = std::pair(offset + children_size * (number_of_partition - 1), size - children_size * (number_of_partition - 1)); + + return current_partition; + } +}; + +template +class GeometricSplitting { + public: + static std::vector> splitting(int offset, int size, int spatial_dimension, const T *const coordinates, const std::vector &permutation, const std::vector &direction, int number_of_partition) { + + // Geometry of current cluster + std::vector> numbering(number_of_partition); + std::vector> current_partition; + + if (size > number_of_partition) { // otherwise it won't be possible anyway + current_partition.resize(number_of_partition); + std::vector first_point = std::vector(coordinates + spatial_dimension * permutation[offset], coordinates + spatial_dimension * (1 + permutation[offset])); + std::vector last_point = std::vector(coordinates + spatial_dimension * permutation[offset + size - 1], coordinates + spatial_dimension * (1 + permutation[offset + size - 1])); + + T geometric_distance = dprod(direction, last_point - first_point); + T children_geometric_size = geometric_distance / number_of_partition; + + auto count = permutation.begin() + offset; + std::vector offsets(number_of_partition, 0); + std::vector sizes(number_of_partition, 0); + + for (int p = 0; p < number_of_partition - 1; p++) { + auto result = std::find_if(count, permutation.begin() + offset + size, [&](int a) { + return dprod(direction, std::vector(coordinates + spatial_dimension * a, coordinates + spatial_dimension * (1 + a)) - first_point) > children_geometric_size; + }); + if (result != permutation.end()) { + offsets[p] = count - permutation.begin(); + sizes[p] = (result - permutation.begin()) - (count - permutation.begin()); + count = result; + first_point = std::vector(coordinates + spatial_dimension * (*result), coordinates + spatial_dimension * (*result + 1)); + } else { + offsets[p] = 0; + sizes[p] = 0; + break; + } + } + offsets.back() = (count - permutation.begin()); + sizes.back() = size - std::accumulate(sizes.begin(), sizes.end() - 1, 0); + for (int p = 0; p < number_of_partition; p++) { + current_partition[p] = std::pair(offsets[p], sizes[p]); + } + } + return current_partition; + } +}; + +} // namespace htool + +#endif diff --git a/include/htool/clustering/interfaces/virtual_partitioning.hpp b/include/htool/clustering/interfaces/virtual_partitioning.hpp new file mode 100644 index 00000000..1b95491c --- /dev/null +++ b/include/htool/clustering/interfaces/virtual_partitioning.hpp @@ -0,0 +1,18 @@ +#ifndef HTOOL_CLUSTERING_VIRTUAL_PARTITIONING_HPP +#define HTOOL_CLUSTERING_VIRTUAL_PARTITIONING_HPP + +#include "../cluster_node.hpp" // for Cluster + +namespace htool { + +template +class VirtualPartitioning { + public: + virtual std::vector> compute_partitioning(Cluster ¤t_cluster, int spatial_dimension, const CoordinatePrecision *coordinates, const CoordinatePrecision *const radii, const CoordinatePrecision *const weights, int number_of_partitions) = 0; + + virtual ~VirtualPartitioning() {} +}; + +} // namespace htool + +#endif diff --git a/include/htool/clustering/tree_builder/direction_computation.hpp b/include/htool/clustering/tree_builder/direction_computation.hpp deleted file mode 100644 index 9ee1c8c2..00000000 --- a/include/htool/clustering/tree_builder/direction_computation.hpp +++ /dev/null @@ -1,97 +0,0 @@ -#ifndef HTOOL_CLUSTERING_TREE_BUILDER_DIRECTION_COMPUTATION_HPP -#define HTOOL_CLUSTERING_TREE_BUILDER_DIRECTION_COMPUTATION_HPP - -#include "../../misc/evp.hpp" // for solve_EVP_2, solve_EVP_3 -#include "../../misc/logger.hpp" // for Logger, LogLevel -#include "../cluster_node.hpp" // for Cluster -#include "htool/matrix/matrix.hpp" // for Matrix -#include // for numeric_limits -#include // for basic_string -#include // for vector - -namespace htool { - -template -class VirtualDirectionComputationStrategy { - public: - virtual std::vector compute_direction(const Cluster *cluster, const std::vector &permutation, int spatial_dimension, const T *const coordinates, const T *const, const T *const weights) = 0; - - virtual ~VirtualDirectionComputationStrategy() {} -}; - -template -class ComputeLargestExtent final : public VirtualDirectionComputationStrategy { - public: - using VirtualDirectionComputationStrategy::VirtualDirectionComputationStrategy; - - std::vector compute_direction(const Cluster *cluster, const std::vector &permutation, int spatial_dimension, const T *const coordinates, const T *const, const T *const weights) override { - if (spatial_dimension != 2 && spatial_dimension != 3) { - htool::Logger::get_instance().log(LogLevel::ERROR, "clustering not define for spatial dimension !=2 and !=3"); // LCOV_EXCL_LINE - // throw std::logic_error("[Htool error] clustering not define for spatial dimension !=2 and !=3"); // LCOV_EXCL_LINE - } - - Matrix cov(spatial_dimension, spatial_dimension); - std::vector direction(spatial_dimension, 0); - - for (int j = 0; j < cluster->get_size(); j++) { - std::vector u(spatial_dimension, 0); - for (int p = 0; p < spatial_dimension; p++) { - u[p] = coordinates[spatial_dimension * permutation[j + cluster->get_offset()] + p] - cluster->get_center()[p]; - } - - for (int p = 0; p < spatial_dimension; p++) { - for (int q = 0; q < spatial_dimension; q++) { - cov(p, q) += weights[permutation[j + cluster->get_offset()]] * u[p] * u[q]; - } - } - } - if (spatial_dimension == 2) { - direction = solve_EVP_2(cov); - } else if (spatial_dimension == 3) { - direction = solve_EVP_3(cov); - } - return direction; - } -}; - -template -class ComputeBoundingBox final : public VirtualDirectionComputationStrategy { - public: - using VirtualDirectionComputationStrategy::VirtualDirectionComputationStrategy; - - std::vector compute_direction(const Cluster *cluster, const std::vector &permutation, int spatial_dimension, const T *const coordinates, const T *const, const T *const) override { - - // min max for each axis - std::vector min_point(spatial_dimension, std::numeric_limits::max()); - std::vector max_point(spatial_dimension, std::numeric_limits::min()); - for (int j = 0; j < cluster->get_size(); j++) { - std::vector u(spatial_dimension, 0); - for (int p = 0; p < spatial_dimension; p++) { - if (min_point[p] > coordinates[spatial_dimension * permutation[j + cluster->get_offset()] + p]) { - min_point[p] = coordinates[spatial_dimension * permutation[j + cluster->get_offset()] + p]; - } - if (max_point[p] < coordinates[spatial_dimension * permutation[j + cluster->get_offset()] + p]) { - max_point[p] = coordinates[spatial_dimension * permutation[j + cluster->get_offset()] + p]; - } - u[p] = coordinates[spatial_dimension * permutation[j + cluster->get_offset()] + p] - cluster->get_center()[p]; - } - } - - // Direction of largest extent - T max_distance(std::numeric_limits::min()); - int dir_axis = 0; - for (int p = 0; p < spatial_dimension; p++) { - if (max_distance < max_point[p] - min_point[p]) { - max_distance = max_point[p] - min_point[p]; - dir_axis = p; - } - } - std::vector direction(spatial_dimension, 0); - direction[dir_axis] = 1; - return direction; - } -}; - -} // namespace htool - -#endif diff --git a/include/htool/clustering/tree_builder/recursive_build.hpp b/include/htool/clustering/tree_builder/recursive_build.hpp deleted file mode 100644 index 17d34423..00000000 --- a/include/htool/clustering/tree_builder/recursive_build.hpp +++ /dev/null @@ -1,222 +0,0 @@ -#ifndef HTOOL_CLUSTERING_TREE_BUILDER_HPP -#define HTOOL_CLUSTERING_TREE_BUILDER_HPP - -#include "../../basic_types/vector.hpp" // for norm2 -#include "../../misc/logger.hpp" // for Logger, LogLevel -#include "../cluster_node.hpp" // for Cluster -#include "direction_computation.hpp" -#include "splitting.hpp" -#include // for sort, transform -#include // for pow, log, floor -#include // for deque -#include // for _1, bind, multiplies -#include // for shared_ptr, make_shared -#include // for iota, inner_product -#include // for stack -#include // for basic_string -#include // for pair -#include // for vector - -namespace htool { - -template -class ClusterTreeBuilder { - - std::vector> m_partition{}; - int m_minclustersize{10}; - enum PartitionType { PowerOfNumberOfChildren, - Given, - Simple }; - PartitionType m_partition_type{Simple}; - - std::vector compute_center(int spatial_dimension, const T *coordinates, const T *weights, int offset, int size, const int *permutation = nullptr) const; - T compute_radius(int spatial_dimension, const T *coordinates, const T *radii, std::vector center, int offset, int size, const int *permutation = nullptr) const; - - std::shared_ptr> m_direction_computation_strategy = std::make_shared>(); - std::shared_ptr> m_splitting_strategy = std::make_shared>(); - - public: - ClusterTreeBuilder() {} - - void set_direction_computation_strategy(std::shared_ptr> direction_computation_strategy) { m_direction_computation_strategy = direction_computation_strategy; } - - void set_splitting_strategy(std::shared_ptr> splitting_strategy) { m_splitting_strategy = splitting_strategy; } - - void set_minclustersize(int minclustersize) { m_minclustersize = minclustersize; } - - Cluster create_cluster_tree(int number_of_points, int spatial_dimension, const T *coordinates, const T *radii, const T *weights, int number_of_children, int size_of_partition, const int *partition); - - Cluster create_cluster_tree(int number_of_points, int spatial_dimension, const T *coordinates, int number_of_children, int size_of_partition) { return create_cluster_tree(number_of_points, spatial_dimension, coordinates, nullptr, nullptr, number_of_children, size_of_partition, nullptr); } - - Cluster create_cluster_tree(int number_of_points, int spatial_dimension, const T *coordinates, int number_of_children, int size_of_partition, const int *partition) { return create_cluster_tree(number_of_points, spatial_dimension, coordinates, nullptr, nullptr, number_of_children, size_of_partition, partition); } -}; - -template -Cluster ClusterTreeBuilder::create_cluster_tree(int number_of_points, int spatial_dimension, const T *coordinates, const T *radii, const T *weights, int number_of_children, int size_partition, const int *partition) { - - // default values - std::vector default_radii{}; - std::vector default_weights{}; - if (radii == nullptr) { - default_radii.resize(number_of_points, 0); - radii = default_radii.data(); - } - if (weights == nullptr) { - default_weights.resize(number_of_points, 1); - weights = default_weights.data(); - } - - // Intialization of root - std::vector center = compute_center(spatial_dimension, coordinates, weights, 0, number_of_points); - T radius = compute_radius(spatial_dimension, coordinates, radii, center, 0, number_of_points); - - Cluster root_cluster(radius, center, -1, 0, number_of_points); - std::vector &permutation = root_cluster.get_permutation(); - - // Taking care of partition initialisation - std::stack *> cluster_stack(std::deque *>{&root_cluster}); - int depth_of_partition; - int number_of_children_on_partition_level = size_partition; - int additional_children_on_last_partition = 0; - if (partition != nullptr) { - m_partition_type = Given; - depth_of_partition = 1; - cluster_stack.pop(); - bool is_child_on_partition = true; - root_cluster.set_is_permutation_local(true); - for (int p = 0; p < size_partition; p++) { - center = compute_center(spatial_dimension, coordinates, weights, partition[2 * p], partition[2 * p + 1], permutation.data()); - radius = compute_radius(spatial_dimension, coordinates, radii, center, partition[2 * p], partition[2 * p + 1], permutation.data()); - Cluster *cluster_on_partition = root_cluster.add_child(radius, center, p, partition[2 * p], partition[2 * p + 1], p, is_child_on_partition); - cluster_stack.push(cluster_on_partition); - } - } else { - m_partition_type = Simple; - if (size_partition >= number_of_children) { - depth_of_partition = static_cast(floor(log(size_partition) / log(number_of_children))); - number_of_children_on_partition_level = number_of_children; - if (size_partition != std::pow(number_of_children, depth_of_partition)) { - Logger::get_instance().log(LogLevel::WARNING, "The given size for the partition is not a power of the number of children in the cluster tree."); - additional_children_on_last_partition = size_partition - std::pow(number_of_children, depth_of_partition); - } - } else { - depth_of_partition = 1; - } - } - - // Recursive build - std::vector> current_splitting(number_of_children); - - while (!cluster_stack.empty()) { - Cluster *current_cluster = cluster_stack.top(); - cluster_stack.pop(); - auto current_offset = current_cluster->get_offset(); - auto current_size = current_cluster->get_size(); - int current_number_of_children = ((current_cluster->get_depth() == depth_of_partition - 1) && (m_partition_type == Simple)) ? number_of_children_on_partition_level : number_of_children; - - if ((current_cluster->get_depth() == depth_of_partition - 1) && (m_partition_type == Simple) && current_cluster->get_counter() == std::pow(number_of_children, current_cluster->get_depth()) - 1) { - current_number_of_children += additional_children_on_last_partition; - } - - // Direction of largest extent - std::vector direction(spatial_dimension, 0); - direction = m_direction_computation_strategy->compute_direction(current_cluster, permutation, spatial_dimension, coordinates, radii, weights); - - // Sort along direction - std::sort(permutation.begin() + current_offset, permutation.begin() + current_offset + current_size, [&](int a, int b) { - T c = std::inner_product(coordinates + spatial_dimension * a, coordinates + spatial_dimension * (1 + a), direction.data(), T(0)); - T d = std::inner_product(coordinates + spatial_dimension * b, coordinates + spatial_dimension * (1 + b), direction.data(), T(0)); - return c < d; - }); - - // Compute numbering - m_splitting_strategy->splitting(current_cluster, permutation, spatial_dimension, coordinates, current_number_of_children, m_minclustersize, direction, current_splitting); - std::vector *> children; - - if (current_splitting.size() > 0) { - bool is_child_on_partition = false; - for (int p = 0; p < current_splitting.size(); p++) { - center = compute_center(spatial_dimension, coordinates, weights, current_splitting[p].first, current_splitting[p].second, permutation.data()); - radius = compute_radius(spatial_dimension, coordinates, radii, center, current_splitting[p].first, current_splitting[p].second, permutation.data()); - - int rank_of_child = current_cluster->get_rank(); - int counter_of_child = current_cluster->get_counter() * current_number_of_children + p; - if ((current_cluster->get_depth() == depth_of_partition - 1) && (m_partition_type == Simple)) { - rank_of_child = current_cluster->get_counter() * number_of_children_on_partition_level + p; - counter_of_child = rank_of_child; - is_child_on_partition = true; - } - - children.emplace_back(current_cluster->add_child(radius, center, rank_of_child, current_splitting[p].first, current_splitting[p].second, counter_of_child, is_child_on_partition)); - } - - // Recursivity - for (auto &child : children) { - cluster_stack.push(child); - } - } else if (current_cluster->get_rank() < 0) { - htool::Logger::get_instance().log(LogLevel::ERROR, "Cluster tree reached maximal depth, but not enough children to define a partition"); // LCOV_EXCL_LINE - } else { - current_cluster->set_maximal_depth(std::max(current_cluster->get_maximal_depth(), current_cluster->get_depth())); - current_cluster->set_minimal_depth(std::min(current_cluster->get_minimal_depth(), current_cluster->get_depth())); - } - - if (m_partition_type == PartitionType::Simple && current_cluster->get_depth() == depth_of_partition - 1) { - for (const auto &child : children) { - m_partition.emplace_back(child->get_offset(), child->get_size()); - } - } - } - - return root_cluster; -} - -template -std::vector ClusterTreeBuilder::compute_center(int spatial_dimension, const T *coordinates, const T *weights, int offset, int size, const int *current_permutation) const { - std::vector center(spatial_dimension, 0); - - bool is_first_permutation = (current_permutation == nullptr); - std::vector first_permutation(is_first_permutation ? size : 0, 0); - std::iota(first_permutation.begin(), first_permutation.end(), int(0)); - const int *permutation = is_first_permutation ? first_permutation.data() : current_permutation; - - // Mass of the cluster - T total_weight = 0; - for (int j = 0; j < size; j++) { - total_weight += weights[permutation[j + offset]]; - } - - // Center of the cluster - for (int j = 0; j < size; j++) { - for (int p = 0; p < spatial_dimension; p++) { - center[p] += weights[permutation[j + offset]] * coordinates[spatial_dimension * permutation[j + offset] + p]; - } - } - - std::transform(center.begin(), center.end(), center.begin(), std::bind(std::multiplies(), std::placeholders::_1, static_cast(1.) / total_weight)); - return center; -} - -template -T ClusterTreeBuilder::compute_radius(int spatial_dimension, const T *coordinates, const T *radii, std::vector center, int offset, int size, const int *current_permutation) const { - T radius = 0; - - bool is_first_permutation = (current_permutation == nullptr); - std::vector first_permutation(is_first_permutation ? size : 0, 0); - std::iota(first_permutation.begin(), first_permutation.end(), int(0)); - const int *permutation = is_first_permutation ? first_permutation.data() : current_permutation; - - for (int j = 0; j < size; j++) { - std::vector u(spatial_dimension, 0); - for (int p = 0; p < spatial_dimension; p++) { - u[p] = coordinates[spatial_dimension * permutation[j + offset] + p] - center[p]; - } - - radius = std::max(radius, norm2(u) + radii[permutation[j + offset]]); - } - return radius; -} - -} // namespace htool - -#endif diff --git a/include/htool/clustering/tree_builder/splitting.hpp b/include/htool/clustering/tree_builder/splitting.hpp deleted file mode 100644 index e430c99f..00000000 --- a/include/htool/clustering/tree_builder/splitting.hpp +++ /dev/null @@ -1,160 +0,0 @@ -#ifndef HTOOL_CLUSTERING_TREE_BUILDER_SPLITTING_HPP -#define HTOOL_CLUSTERING_TREE_BUILDER_SPLITTING_HPP - -#include "../cluster_node.hpp" // for Cluster -#include "htool/basic_types/vector.hpp" // for dprod -#include // for max, all_of, find_if -#include // for accumulate -#include // for pair -#include // for vector - -namespace htool { - -template -class VirtualSplittingStrategy { - public: - virtual void splitting(Cluster *curr_cluster, const std::vector &, int, const T *, int current_number_of_children, int minclustersize, const std::vector &, std::vector> ¤t_partition) = 0; - - virtual ~VirtualSplittingStrategy() {} -}; -template -class RegularSplitting final : public VirtualSplittingStrategy { - public: - using VirtualSplittingStrategy::VirtualSplittingStrategy; - - void splitting(Cluster *curr_cluster, const std::vector &, int, const T *, int current_number_of_children, int minclustersize, const std::vector &, std::vector> ¤t_partition) { - - int size = curr_cluster->get_size(); - current_partition.resize(current_number_of_children); - - // Children - int children_size = int(size / current_number_of_children); - if (children_size > minclustersize) { - for (int p = 0; p < current_number_of_children - 1; p++) { - current_partition[p] = std::pair(curr_cluster->get_offset() + children_size * p, children_size); - } - current_partition.back() = std::pair(curr_cluster->get_offset() + children_size * (current_number_of_children - 1), size - children_size * (current_number_of_children - 1)); - } else { - current_partition.clear(); - } - } -}; - -template -class GeometricSplitting final : public VirtualSplittingStrategy { - RegularSplitting regular_splitting; - - public: - using VirtualSplittingStrategy::VirtualSplittingStrategy; - void splitting(Cluster *curr_cluster, const std::vector &permutation, int spatial_dimension, const T *x, int current_number_of_children, int minclustersize, const std::vector &dir, std::vector> ¤t_partition) { - - // Geometry of current cluster - int offset = curr_cluster->get_offset(); - int size = curr_cluster->get_size(); - std::vector> numbering(current_number_of_children); - current_partition.resize(current_number_of_children); - - if (size > current_number_of_children) { // otherwise it won't be possible anyway - - std::vector first_point = std::vector(x + spatial_dimension * permutation[offset], x + spatial_dimension * (1 + permutation[offset])); - std::vector last_point = std::vector(x + spatial_dimension * permutation[offset + size - 1], x + spatial_dimension * (1 + permutation[offset + size - 1])); - - T geometric_distance = dprod(dir, last_point - first_point); - T children_geometric_size = geometric_distance / current_number_of_children; - - auto count = permutation.begin() + offset; - std::vector offsets(current_number_of_children, 0); - std::vector sizes(current_number_of_children, 0); - - for (int p = 0; p < current_number_of_children - 1; p++) { - auto result = std::find_if(count, permutation.begin() + offset + size, [&](int a) { - return dprod(dir, std::vector(x + spatial_dimension * a, x + spatial_dimension * (1 + a)) - first_point) > children_geometric_size; - }); - if (result != permutation.end()) { - offsets[p] = count - permutation.begin(); - sizes[p] = (result - permutation.begin()) - (count - permutation.begin()); - count = result; - first_point = std::vector(x + spatial_dimension * (*result), x + spatial_dimension * (*result + 1)); - } else { - offsets[p] = 0; - sizes[p] = 0; - break; - } - } - offsets.back() = (count - permutation.begin()); - sizes.back() = size - std::accumulate(sizes.begin(), sizes.end() - 1, 0); - - if (std::all_of(sizes.begin(), sizes.end(), [&](int a) { return a > minclustersize; })) { - for (int p = 0; p < current_number_of_children; p++) { - current_partition[p] = std::pair(offsets[p], sizes[p]); - } - } else { - regular_splitting.splitting(curr_cluster, permutation, spatial_dimension, x, current_number_of_children, minclustersize, dir, current_partition); - } - } else { - current_partition.clear(); - } - } -}; - -template -class GeometricSplittingTest { - RegularSplitting regular_splitting; - - public: - GeometricSplittingTest() {} - - void splitting(Cluster *curr_cluster, const std::vector &permutation, int spatial_dimension, const T *x, int current_number_of_children, int minclustersize, const std::vector &dir, std::vector> ¤t_partition) { - - // Geometry of current cluster - int offset = curr_cluster->get_offset(); - int size = curr_cluster->get_size(); - std::vector> numbering(current_number_of_children); - current_partition.resize(current_number_of_children); - - if (size > current_number_of_children) { // otherwise it won't be possible anyway - - std::vector first_point = std::vector(x + spatial_dimension * permutation[offset], x + spatial_dimension * (1 + permutation[offset])); - std::vector last_point = std::vector(x + spatial_dimension * permutation[offset + size - 1], x + spatial_dimension * (1 + permutation[offset + size - 1])); - - T geometric_distance = dprod(dir, last_point - first_point); - T children_geometric_size = geometric_distance / current_number_of_children; - - auto count = permutation.begin() + offset; - std::vector offsets(current_number_of_children, 0); - std::vector sizes(current_number_of_children, 0); - - for (int p = 0; p < current_number_of_children - 1; p++) { - auto result = std::find_if(count, permutation.begin() + offset + size, [&](int a) { - return dprod(dir, std::vector(x + spatial_dimension * a, x + spatial_dimension * (1 + a)) - first_point) > children_geometric_size; - }); - if (result != permutation.end()) { - offsets[p] = count - permutation.begin(); - sizes[p] = (result - permutation.begin()) - (count - permutation.begin()); - count = result; - first_point = std::vector(x + spatial_dimension * (*result), x + spatial_dimension * (*result + 1)); - } else { - offsets[p] = 0; - sizes[p] = 0; - break; - } - } - offsets.back() = (count - permutation.begin()); - sizes.back() = size - std::accumulate(sizes.begin(), sizes.end() - 1, 0); - - if (std::all_of(sizes.begin(), sizes.end(), [&](int a) { return a > minclustersize; })) { - for (int p = 0; p < current_number_of_children; p++) { - current_partition[p] = std::pair(offsets[p], sizes[p]); - } - } else { - regular_splitting.splitting(curr_cluster, permutation, spatial_dimension, x, current_number_of_children, minclustersize, dir, current_partition); - } - } else { - current_partition.clear(); - } - } -}; - -} // namespace htool - -#endif diff --git a/include/htool/clustering/tree_builder/tree_builder.hpp b/include/htool/clustering/tree_builder/tree_builder.hpp index 02c07b93..9d372969 100644 --- a/include/htool/clustering/tree_builder/tree_builder.hpp +++ b/include/htool/clustering/tree_builder/tree_builder.hpp @@ -1,8 +1,220 @@ -#ifndef HTOOL_CLUSTERING_CLUSTER_TREE_BUILDER_HPP -#define HTOOL_CLUSTERING_CLUSTER_TREE_BUILDER_HPP +#ifndef HTOOL_CLUSTERING_TREE_BUILDER_HPP +#define HTOOL_CLUSTERING_TREE_BUILDER_HPP -#include "direction_computation.hpp" -#include "recursive_build.hpp" -#include "splitting.hpp" +#include "../../basic_types/vector.hpp" // for norm2 +#include "../../misc/logger.hpp" // for Logger, LogLevel +#include "../cluster_node.hpp" // for Cluster +#include "../implementations/partitioning.hpp" // for Partitioning +#include "../interfaces/virtual_partitioning.hpp" // for VirtualPartitioning +#include // for sort, transform +#include // for pow, log, floor +#include // for deque +#include // for _1, bind, multiplies +#include // for shared_ptr, make_shared +#include // for iota, inner_product +#include // for stack +#include // for basic_string +#include // for pair +#include // for vector + +namespace htool { + +template +class ClusterTreeBuilder { + + int m_maximal_leaf_size{10}; + bool m_is_complete = false; + std::shared_ptr> m_partitioning_strategy = std::make_shared, RegularSplitting>>(); + enum PartitionType { PowerOfNumberOfChildren, + Given, + Simple }; + + std::vector compute_center(int spatial_dimension, const T *coordinates, const T *weights, int offset, int size, const int *permutation = nullptr) const; + T compute_radius(int spatial_dimension, const T *coordinates, const T *radii, std::vector center, int offset, int size, const int *permutation = nullptr) const; + + public: + // ClusterTreeBuilder(bool is_complete) {} + + void set_maximal_leaf_size(int maximal_leaf_size) { m_maximal_leaf_size = maximal_leaf_size; } + void set_is_complete(bool is_complete) { m_is_complete = is_complete; } + void set_partitioning_strategy(std::shared_ptr> partitioning_strategy) { m_partitioning_strategy = partitioning_strategy; } + + Cluster create_cluster_tree(int number_of_points, int spatial_dimension, const T *coordinates, const T *radii, const T *weights, int number_of_children, int size_of_partition, const int *partition) const; + + Cluster create_cluster_tree(int number_of_points, int spatial_dimension, const T *coordinates, int number_of_children, int size_of_partition) const { return create_cluster_tree(number_of_points, spatial_dimension, coordinates, nullptr, nullptr, number_of_children, size_of_partition, nullptr); } + + Cluster create_cluster_tree(int number_of_points, int spatial_dimension, const T *coordinates, int number_of_children, int size_of_partition, const int *partition) const { return create_cluster_tree(number_of_points, spatial_dimension, coordinates, nullptr, nullptr, number_of_children, size_of_partition, partition); } +}; + +template +Cluster ClusterTreeBuilder::create_cluster_tree(int number_of_points, int spatial_dimension, const T *coordinates, const T *radii, const T *weights, int number_of_children, int size_partition, const int *partition) const { + + std::vector> m_partition{}; + PartitionType partition_type{Simple}; + + // default values + std::vector default_radii{}; + std::vector default_weights{}; + if (radii == nullptr) { + default_radii.resize(number_of_points, 0); + radii = default_radii.data(); + } + if (weights == nullptr) { + default_weights.resize(number_of_points, 1); + weights = default_weights.data(); + } + + // Intialization of root + std::vector center = compute_center(spatial_dimension, coordinates, weights, 0, number_of_points); + T radius = compute_radius(spatial_dimension, coordinates, radii, center, 0, number_of_points); + + Cluster root_cluster(radius, center, -1, 0, number_of_points); + std::vector &permutation = root_cluster.get_permutation(); + root_cluster.set_maximal_leaf_size(m_maximal_leaf_size); + + // Taking care of partition initialisation + std::stack *> cluster_stack(std::deque *>{&root_cluster}); + int depth_of_partition; + int number_of_children_on_partition_level = size_partition; + int additional_children_on_last_partition = 0; + if (partition != nullptr) { + partition_type = Given; + depth_of_partition = 1; + cluster_stack.pop(); + bool is_child_on_partition = true; + root_cluster.set_is_permutation_local(true); + for (int p = 0; p < size_partition; p++) { + center = compute_center(spatial_dimension, coordinates, weights, partition[2 * p], partition[2 * p + 1], permutation.data()); + radius = compute_radius(spatial_dimension, coordinates, radii, center, partition[2 * p], partition[2 * p + 1], permutation.data()); + Cluster *cluster_on_partition = root_cluster.add_child(radius, center, p, partition[2 * p], partition[2 * p + 1], p, is_child_on_partition); + cluster_stack.push(cluster_on_partition); + } + } else { + partition_type = Simple; + if (size_partition >= number_of_children) { + depth_of_partition = static_cast(floor(log(size_partition) / log(number_of_children))); + number_of_children_on_partition_level = number_of_children; + if (size_partition != std::pow(number_of_children, depth_of_partition)) { + Logger::get_instance().log(LogLevel::WARNING, "The given size for the partition is not a power of the number of children in the cluster tree."); + additional_children_on_last_partition = size_partition - std::pow(number_of_children, depth_of_partition); + } + } else { + depth_of_partition = 1; + } + } + + // Recursive build + std::vector> current_splitting(number_of_children); + + while (!cluster_stack.empty()) { + Cluster *current_cluster = cluster_stack.top(); + cluster_stack.pop(); + int current_number_of_children = ((current_cluster->get_depth() == depth_of_partition - 1) && (partition_type == Simple)) ? number_of_children_on_partition_level : number_of_children; + + if ((current_cluster->get_depth() == depth_of_partition - 1) && (partition_type == Simple) && current_cluster->get_counter() == std::pow(number_of_children, current_cluster->get_depth()) - 1) { + current_number_of_children += additional_children_on_last_partition; + } + + current_splitting = m_partitioning_strategy->compute_partitioning(*current_cluster, spatial_dimension, coordinates, radii, weights, current_number_of_children); + + std::vector *> children; + if (current_splitting.size() == current_number_of_children and std::all_of(current_splitting.begin(), current_splitting.end(), [](auto a) { return a.second > 0; })) { + bool is_child_on_partition = false; + for (int p = 0; p < current_splitting.size(); p++) { + center = compute_center(spatial_dimension, coordinates, weights, current_splitting[p].first, current_splitting[p].second, permutation.data()); + radius = compute_radius(spatial_dimension, coordinates, radii, center, current_splitting[p].first, current_splitting[p].second, permutation.data()); + + int rank_of_child = current_cluster->get_rank(); + int counter_of_child = current_cluster->get_counter() * current_number_of_children + p; + if ((current_cluster->get_depth() == depth_of_partition - 1) && (partition_type == Simple)) { + rank_of_child = current_cluster->get_counter() * number_of_children_on_partition_level + p; + counter_of_child = rank_of_child; + is_child_on_partition = true; + } + + children.emplace_back(current_cluster->add_child(radius, center, rank_of_child, current_splitting[p].first, current_splitting[p].second, counter_of_child, is_child_on_partition)); + } + + // Recursivity + if (m_is_complete and std::any_of(children.begin(), children.end(), [&](auto a) { return a->get_size() > m_maximal_leaf_size; })) { + for (auto &child : children) { + cluster_stack.push(child); + } + } else if (m_is_complete) { + current_cluster->set_maximal_depth(std::max(current_cluster->get_maximal_depth(), current_cluster->get_depth())); + current_cluster->set_minimal_depth(std::min(current_cluster->get_minimal_depth(), current_cluster->get_depth())); + } else { + for (auto &child : children) { + if (child->get_size() > m_maximal_leaf_size) { + cluster_stack.push(child); + } else { + child->set_maximal_depth(std::max(child->get_maximal_depth(), child->get_depth())); + child->set_minimal_depth(std::min(child->get_minimal_depth(), child->get_depth())); + } + } + } + } else { + current_cluster->set_maximal_depth(std::max(current_cluster->get_maximal_depth(), current_cluster->get_depth())); + current_cluster->set_minimal_depth(std::min(current_cluster->get_minimal_depth(), current_cluster->get_depth())); + htool::Logger::get_instance().log(LogLevel::WARNING, "Partitioning of cluster (" + std::to_string(current_cluster->get_offset()) + "," + std::to_string(current_cluster->get_offset() + current_cluster->get_size()) + ") failed."); // LCOV_EXCL_LINE + } + + if (partition_type == PartitionType::Simple && current_cluster->get_depth() == depth_of_partition - 1) { + for (const auto &child : children) { + m_partition.emplace_back(child->get_offset(), child->get_size()); + } + } + } + + return root_cluster; +} + +template +std::vector ClusterTreeBuilder::compute_center(int spatial_dimension, const T *coordinates, const T *weights, int offset, int size, const int *current_permutation) const { + std::vector center(spatial_dimension, 0); + + bool is_first_permutation = (current_permutation == nullptr); + std::vector first_permutation(is_first_permutation ? size : 0, 0); + std::iota(first_permutation.begin(), first_permutation.end(), int(0)); + const int *permutation = is_first_permutation ? first_permutation.data() : current_permutation; + + // Mass of the cluster + T total_weight = 0; + for (int j = 0; j < size; j++) { + total_weight += weights[permutation[j + offset]]; + } + + // Center of the cluster + for (int j = 0; j < size; j++) { + for (int p = 0; p < spatial_dimension; p++) { + center[p] += weights[permutation[j + offset]] * coordinates[spatial_dimension * permutation[j + offset] + p]; + } + } + + std::transform(center.begin(), center.end(), center.begin(), std::bind(std::multiplies(), std::placeholders::_1, static_cast(1.) / total_weight)); + return center; +} + +template +T ClusterTreeBuilder::compute_radius(int spatial_dimension, const T *coordinates, const T *radii, std::vector center, int offset, int size, const int *current_permutation) const { + T radius = 0; + + bool is_first_permutation = (current_permutation == nullptr); + std::vector first_permutation(is_first_permutation ? size : 0, 0); + std::iota(first_permutation.begin(), first_permutation.end(), int(0)); + const int *permutation = is_first_permutation ? first_permutation.data() : current_permutation; + + for (int j = 0; j < size; j++) { + std::vector u(spatial_dimension, 0); + for (int p = 0; p < spatial_dimension; p++) { + u[p] = coordinates[spatial_dimension * permutation[j + offset] + p] - center[p]; + } + + radius = std::max(radius, norm2(u) + radii[permutation[j + offset]]); + } + return radius; +} + +} // namespace htool #endif diff --git a/include/htool/distributed_operator/distributed_operator.hpp b/include/htool/distributed_operator/distributed_operator.hpp index c3705c09..c3939e28 100644 --- a/include/htool/distributed_operator/distributed_operator.hpp +++ b/include/htool/distributed_operator/distributed_operator.hpp @@ -1,18 +1,18 @@ #ifndef HTOOL_DISTRIBUTED_OPERATOR_HPP #define HTOOL_DISTRIBUTED_OPERATOR_HPP -#include "../local_operators/virtual_local_operator.hpp" // for VirtualLocalOperator -#include "../misc/logger.hpp" // for Logger, LogLevel -#include "../misc/misc.hpp" // for conj_if_complex -#include "../misc/user.hpp" // for NbrToStr, StrToNbr -#include "../wrappers/wrapper_mpi.hpp" // for wrapper_mpi -#include "interfaces/partition.hpp" // for IPartition -#include // for fill, copy_n, fill_n, transform -#include // for plus -#include // for map -#include // for MPI_Comm_rank, MPI_Comm_size -#include // for basic_string, operator<, string -#include // for vector +#include "../misc/logger.hpp" // for Logger, LogLevel +#include "../misc/misc.hpp" // for conj_if_complex +#include "../misc/user.hpp" // for NbrToStr, StrToNbr +#include "../wrappers/wrapper_mpi.hpp" // for wrapper_mpi +#include "interfaces/virtual_local_operator.hpp" // for VirtualLocalOperator +#include "interfaces/virtual_partition.hpp" // for IPartition +#include // for fill, copy_n, fill_n, transform +#include // for plus +#include // for map +#include // for MPI_Comm_rank, MPI_Comm_size +#include // for basic_string, operator<, string +#include // for vector namespace htool { template @@ -20,8 +20,8 @@ class DistributedOperator { private: // - const IPartition &m_target_partition; - const IPartition &m_source_partition; + const VirtualPartition &m_target_partition; + const VirtualPartition &m_source_partition; // Local operators std::vector *> m_local_operators = {}; @@ -44,7 +44,7 @@ class DistributedOperator { virtual ~DistributedOperator() = default; // Constructor - explicit DistributedOperator(const IPartition &target_partition, const IPartition &source_partition, char symmetry, char UPLO, MPI_Comm comm) : m_target_partition(target_partition), m_source_partition(source_partition), m_symmetry(symmetry), m_UPLO(UPLO), m_comm(comm) {} + explicit DistributedOperator(const VirtualPartition &target_partition, const VirtualPartition &source_partition, char symmetry, char UPLO, MPI_Comm comm) : m_target_partition(target_partition), m_source_partition(source_partition), m_symmetry(symmetry), m_UPLO(UPLO), m_comm(comm) {} void add_local_operator(const VirtualLocalOperator *local_operator) { m_local_operators.push_back(local_operator); @@ -85,8 +85,8 @@ class DistributedOperator { char get_symmetry_type() const { return m_symmetry; } char get_storage_type() const { return m_UPLO; } MPI_Comm get_comm() const { return m_comm; } - const IPartition &get_target_partition() const { return m_target_partition; } - const IPartition &get_source_partition() const { return m_source_partition; } + const VirtualPartition &get_target_partition() const { return m_target_partition; } + const VirtualPartition &get_source_partition() const { return m_source_partition; } }; template diff --git a/include/htool/local_operators/local_dense_matrix.hpp b/include/htool/distributed_operator/implementations/local_operators/local_dense_matrix.hpp similarity index 88% rename from include/htool/local_operators/local_dense_matrix.hpp rename to include/htool/distributed_operator/implementations/local_operators/local_dense_matrix.hpp index 9c362588..4b64ce87 100644 --- a/include/htool/local_operators/local_dense_matrix.hpp +++ b/include/htool/distributed_operator/implementations/local_operators/local_dense_matrix.hpp @@ -1,14 +1,14 @@ -#ifndef HTOOL_TESTING_LOCAL_DENSE_MATRIX_HPP -#define HTOOL_TESTING_LOCAL_DENSE_MATRIX_HPP +#ifndef HTOOL_DISTRIBUTED_OPERATOR_LOCAL_DENSE_MATRIX_HPP +#define HTOOL_DISTRIBUTED_OPERATOR_LOCAL_DENSE_MATRIX_HPP -#include "../clustering/cluster_node.hpp" // for Cluster -#include "../matrix/matrix.hpp" // for Matrix -#include "local_operator.hpp" // for LocalOperator +#include "../../../clustering/cluster_node.hpp" // for Cluster +#include "../../../matrix/matrix.hpp" // for Matrix +#include "local_operator.hpp" // for LocalOperator namespace htool { -template +template > class LocalDenseMatrix final : public LocalOperator { const Matrix &m_data; diff --git a/include/htool/local_operators/local_hmatrix.hpp b/include/htool/distributed_operator/implementations/local_operators/local_hmatrix.hpp similarity index 86% rename from include/htool/local_operators/local_hmatrix.hpp rename to include/htool/distributed_operator/implementations/local_operators/local_hmatrix.hpp index fced428b..10ff1732 100644 --- a/include/htool/local_operators/local_hmatrix.hpp +++ b/include/htool/distributed_operator/implementations/local_operators/local_hmatrix.hpp @@ -1,12 +1,12 @@ -#ifndef HTOOL_TESTING_LOCAL_HMATRIX_HPP -#define HTOOL_TESTING_LOCAL_HMATRIX_HPP - -#include "../clustering/cluster_node.hpp" // for Cluster -#include "../hmatrix/hmatrix.hpp" -#include "../hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp" -#include "../hmatrix/linalg/add_hmatrix_vector_product.hpp" -#include "htool/misc/misc.hpp" +#ifndef HTOOL_DISTRIBUTED_OPERATOR_LOCAL_HMATRIX_HPP +#define HTOOL_DISTRIBUTED_OPERATOR_LOCAL_HMATRIX_HPP + +#include "../../../clustering/cluster_node.hpp" // for Cluster +#include "../../../hmatrix/hmatrix.hpp" +#include "../../../hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp" +#include "../../../hmatrix/linalg/add_hmatrix_vector_product.hpp" +#include "../../../misc/misc.hpp" #include "local_operator.hpp" namespace htool { diff --git a/include/htool/local_operators/local_operator.hpp b/include/htool/distributed_operator/implementations/local_operators/local_operator.hpp similarity index 95% rename from include/htool/local_operators/local_operator.hpp rename to include/htool/distributed_operator/implementations/local_operators/local_operator.hpp index 93104a30..d46ed044 100644 --- a/include/htool/local_operators/local_operator.hpp +++ b/include/htool/distributed_operator/implementations/local_operators/local_operator.hpp @@ -1,18 +1,18 @@ -#ifndef HTOOL_TESTING_LOCAL_OPERATOR_HPP -#define HTOOL_TESTING_LOCAL_OPERATOR_HPP +#ifndef HTOOL_DISTRIBUTED_OPERATOR_LOCAL_OPERATOR_HPP +#define HTOOL_DISTRIBUTED_OPERATOR_LOCAL_OPERATOR_HPP -#include "../clustering/cluster_node.hpp" // for user_to_clu... -#include "../local_operators/virtual_local_operator.hpp" // for VirtualLoca... -#include "../misc/logger.hpp" // for Logger, Log... -#include "../misc/misc.hpp" // for conj_if_com... -#include // for max, copy_n -#include // for basic_string -#include // for vector +#include "../../../clustering/cluster_node.hpp" // for user_to_clu... +#include "../../../misc/logger.hpp" // for Logger, Log... +#include "../../../misc/misc.hpp" // for conj_if_com... +#include "../../interfaces/virtual_local_operator.hpp" // for VirtualLoca... +#include // for max, copy_n +#include // for basic_string +#include // for vector namespace htool { -template +template > class LocalOperator : public VirtualLocalOperator { protected: const Cluster &m_target_cluster, &m_source_cluster; diff --git a/include/htool/distributed_operator/implementations/partition_from_cluster.hpp b/include/htool/distributed_operator/implementations/partition_from_cluster.hpp index 642bc76b..2d1bafb3 100644 --- a/include/htool/distributed_operator/implementations/partition_from_cluster.hpp +++ b/include/htool/distributed_operator/implementations/partition_from_cluster.hpp @@ -1,14 +1,14 @@ #ifndef HTOOL_DISTRIBUTED_OPERATOR_PARTITION_FROM_CLUSTER_HPP #define HTOOL_DISTRIBUTED_OPERATOR_PARTITION_FROM_CLUSTER_HPP -#include "../../clustering/cluster_node.hpp" // for global_to_root_cluster -#include "../../misc/misc.hpp" // for underlying_type -#include "../interfaces/partition.hpp" // for IPartition +#include "../../clustering/cluster_node.hpp" // for global_to_root_cluster +#include "../../misc/misc.hpp" // for underlying_type +#include "../interfaces/virtual_partition.hpp" // for IPartition namespace htool { template > -class PartitionFromCluster final : public IPartition { +class PartitionFromCluster final : public VirtualPartition { const Cluster &m_root_cluster; public: diff --git a/include/htool/local_operators/virtual_local_operator.hpp b/include/htool/distributed_operator/interfaces/virtual_local_operator.hpp similarity index 90% rename from include/htool/local_operators/virtual_local_operator.hpp rename to include/htool/distributed_operator/interfaces/virtual_local_operator.hpp index 82125c29..f9ace2df 100644 --- a/include/htool/local_operators/virtual_local_operator.hpp +++ b/include/htool/distributed_operator/interfaces/virtual_local_operator.hpp @@ -1,5 +1,5 @@ -#ifndef HTOOL_VIRTUAL_LOCAL_OPERATOR_HPP -#define HTOOL_VIRTUAL_LOCAL_OPERATOR_HPP +#ifndef HTOOL_DISTRIBUTED_OPERATOR_VIRTUAL_LOCAL_OPERATOR_HPP +#define HTOOL_DISTRIBUTED_OPERATOR_VIRTUAL_LOCAL_OPERATOR_HPP namespace htool { template diff --git a/include/htool/distributed_operator/interfaces/partition.hpp b/include/htool/distributed_operator/interfaces/virtual_partition.hpp similarity index 60% rename from include/htool/distributed_operator/interfaces/partition.hpp rename to include/htool/distributed_operator/interfaces/virtual_partition.hpp index 5d7e6363..18eb894c 100644 --- a/include/htool/distributed_operator/interfaces/partition.hpp +++ b/include/htool/distributed_operator/interfaces/virtual_partition.hpp @@ -1,10 +1,10 @@ -#ifndef HTOOL_DISTIRBUTED_OPERATOR_PARTITION_HPP -#define HTOOL_DISTIRBUTED_OPERATOR_PARTITION_HPP +#ifndef HTOOL_DISTIRBUTED_OPERATOR_VIRTUAL_PARTITION_HPP +#define HTOOL_DISTIRBUTED_OPERATOR_VIRTUAL_PARTITION_HPP namespace htool { template -class IPartition { +class VirtualPartition { public: virtual int get_size_of_partition(int subdomain_number) const = 0; virtual int get_offset_of_partition(int subdomain_number) const = 0; @@ -18,15 +18,15 @@ class IPartition { virtual void local_partition_to_local_numbering(int subdomain_number, const CoefficientPrecision *const in, CoefficientPrecision *const out) const = 0; virtual bool is_renumbering_local() const = 0; - virtual ~IPartition() = default; + virtual ~VirtualPartition() = default; protected: // no copy - IPartition() = default; - IPartition(const IPartition &) = default; - IPartition &operator=(const IPartition &) = default; - IPartition(IPartition &&IPartition) noexcept = default; - IPartition &operator=(IPartition &&IPartition) noexcept = default; + VirtualPartition() = default; + VirtualPartition(const VirtualPartition &) = default; + VirtualPartition &operator=(const VirtualPartition &) = default; + VirtualPartition(VirtualPartition &&IPartition) noexcept = default; + VirtualPartition &operator=(VirtualPartition &&IPartition) noexcept = default; }; } // namespace htool diff --git a/include/htool/distributed_operator/utility.hpp b/include/htool/distributed_operator/utility.hpp index 55aa50ac..7d2f5a15 100644 --- a/include/htool/distributed_operator/utility.hpp +++ b/include/htool/distributed_operator/utility.hpp @@ -1,17 +1,17 @@ #ifndef HTOOL_DISTRIBUTED_OPERATOR_UTILITY_HPP #define HTOOL_DISTRIBUTED_OPERATOR_UTILITY_HPP -#include "../clustering/cluster_node.hpp" // for Cluster -#include "../hmatrix/hmatrix.hpp" // for HMatrix -#include "../hmatrix/interfaces/virtual_generator.hpp" // for GeneratorW... -#include "../hmatrix/tree_builder/tree_builder.hpp" // for HMatrixTre... -#include "../local_operators/local_hmatrix.hpp" // for LocalHMatrix -#include "../local_operators/virtual_local_operator.hpp" // for VirtualLocalOperator -#include "../misc/misc.hpp" // for underlying... -#include "distributed_operator.hpp" // for Distribute... -#include "implementations/partition_from_cluster.hpp" // for PartitionF... -#include // for function -#include // for MPI_Comm +#include "../clustering/cluster_node.hpp" // for Cluster +#include "../hmatrix/hmatrix.hpp" // for HMatrix +#include "../hmatrix/interfaces/virtual_generator.hpp" // for GeneratorW... +#include "../hmatrix/tree_builder/tree_builder.hpp" // for HMatrixTre... +#include "../misc/misc.hpp" // for underlying... +#include "distributed_operator.hpp" // for Distribute... +#include "implementations/local_operators/local_hmatrix.hpp" // for LocalHMatrix +#include "implementations/partition_from_cluster.hpp" // for PartitionF... +#include "interfaces/virtual_local_operator.hpp" // for VirtualLocalOperator +#include // for function +#include // for MPI_Comm namespace htool { @@ -28,7 +28,7 @@ class CustomApproximationBuilder { }; template > -class DistributedOperatorFromHMatrix { +class DefaultApproximationBuilder { private: std::function get_rankWorld = [](MPI_Comm comm) { int rankWorld; @@ -36,7 +36,7 @@ class DistributedOperatorFromHMatrix { return rankWorld; }; public: - const HMatrix hmatrix; + HMatrix hmatrix; private: const LocalHMatrix local_hmatrix; @@ -46,55 +46,38 @@ class DistributedOperatorFromHMatrix { DistributedOperator &distributed_operator; const HMatrix *block_diagonal_hmatrix{nullptr}; - DistributedOperatorFromHMatrix(const VirtualInternalGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, const HMatrixTreeBuilder &hmatrix_builder, MPI_Comm communicator) : hmatrix(hmatrix_builder.build(generator)), local_hmatrix(hmatrix, hmatrix_builder.get_target_cluster().get_cluster_on_partition(get_rankWorld(communicator)), hmatrix_builder.get_source_cluster(), hmatrix_builder.get_symmetry(), hmatrix_builder.get_UPLO(), false, false), distributed_operator_holder(target_cluster, source_cluster, hmatrix_builder.get_symmetry(), hmatrix_builder.get_UPLO(), communicator, local_hmatrix), distributed_operator(distributed_operator_holder.distributed_operator) { - block_diagonal_hmatrix = hmatrix.get_sub_hmatrix(hmatrix_builder.get_target_cluster().get_cluster_on_partition(get_rankWorld(communicator)), hmatrix_builder.get_source_cluster().get_cluster_on_partition(get_rankWorld(communicator))); + DefaultApproximationBuilder(const VirtualInternalGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, const HMatrixTreeBuilder &hmatrix_tree_builder, MPI_Comm communicator) : hmatrix(hmatrix_tree_builder.build(generator, target_cluster, source_cluster, get_rankWorld(communicator), get_rankWorld(communicator))), local_hmatrix(hmatrix, target_cluster.get_cluster_on_partition(get_rankWorld(communicator)), source_cluster, hmatrix_tree_builder.get_symmetry(), hmatrix_tree_builder.get_UPLO(), false, false), distributed_operator_holder(target_cluster, source_cluster, hmatrix_tree_builder.get_symmetry(), hmatrix_tree_builder.get_UPLO(), communicator, local_hmatrix), distributed_operator(distributed_operator_holder.distributed_operator) { + block_diagonal_hmatrix = hmatrix.get_sub_hmatrix(target_cluster.get_cluster_on_partition(get_rankWorld(communicator)), source_cluster.get_cluster_on_partition(get_rankWorld(communicator))); } - DistributedOperatorFromHMatrix(const VirtualGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, const HMatrixTreeBuilder &hmatrix_builder, MPI_Comm communicator) : DistributedOperatorFromHMatrix(InternalGeneratorWithPermutation(generator, target_cluster.get_permutation().data(), source_cluster.get_permutation().data()), target_cluster, source_cluster, hmatrix_builder, communicator) { - } + DefaultApproximationBuilder(const VirtualGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, const HMatrixTreeBuilder &hmatrix_tree_builder, MPI_Comm communicator) : DefaultApproximationBuilder(InternalGeneratorWithPermutation(generator, target_cluster.get_permutation().data(), source_cluster.get_permutation().data()), target_cluster, source_cluster, hmatrix_tree_builder, communicator) {} }; template > -class DefaultApproximationBuilder { +class DefaultLocalApproximationBuilder { private: std::function get_rankWorld = [](MPI_Comm comm) { int rankWorld; MPI_Comm_rank(comm, &rankWorld); return rankWorld; }; - DistributedOperatorFromHMatrix distributed_operator_builder; - - public: - const HMatrix &hmatrix; public: - DistributedOperator &distributed_operator; - const HMatrix *block_diagonal_hmatrix{nullptr}; - - DefaultApproximationBuilder(const VirtualInternalGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, htool::underlying_type epsilon, htool::underlying_type eta, char symmetry, char UPLO, MPI_Comm communicator) : distributed_operator_builder(generator, target_cluster, source_cluster, HMatrixTreeBuilder(target_cluster, source_cluster, epsilon, eta, symmetry, UPLO, -1, get_rankWorld(communicator), get_rankWorld(communicator)), communicator), hmatrix(distributed_operator_builder.hmatrix), distributed_operator(distributed_operator_builder.distributed_operator), block_diagonal_hmatrix(distributed_operator_builder.block_diagonal_hmatrix) {} - - DefaultApproximationBuilder(const VirtualGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, htool::underlying_type epsilon, htool::underlying_type eta, char symmetry, char UPLO, MPI_Comm communicator) : DefaultApproximationBuilder(InternalGeneratorWithPermutation(generator, target_cluster.get_permutation().data(), source_cluster.get_permutation().data()), target_cluster, source_cluster, epsilon, eta, symmetry, UPLO, communicator) {} -}; + HMatrix hmatrix; -template > -class DefaultLocalApproximationBuilder { private: - std::function get_rankWorld = [](MPI_Comm comm) { - int rankWorld; - MPI_Comm_rank(comm, &rankWorld); - return rankWorld; }; - DistributedOperatorFromHMatrix distributed_operator_builder; - - public: - const HMatrix &hmatrix; + const LocalHMatrix local_hmatrix; + CustomApproximationBuilder distributed_operator_holder; public: DistributedOperator &distributed_operator; const HMatrix *block_diagonal_hmatrix{nullptr}; public: - DefaultLocalApproximationBuilder(const VirtualInternalGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, htool::underlying_type epsilon, htool::underlying_type eta, char symmetry, char UPLO, MPI_Comm communicator) : distributed_operator_builder(generator, target_cluster, source_cluster, HMatrixTreeBuilder(target_cluster.get_cluster_on_partition(get_rankWorld(communicator)), source_cluster.get_cluster_on_partition(get_rankWorld(communicator)), epsilon, eta, symmetry, UPLO, -1, get_rankWorld(communicator), get_rankWorld(communicator)), communicator), hmatrix(distributed_operator_builder.hmatrix), distributed_operator(distributed_operator_builder.distributed_operator), block_diagonal_hmatrix(distributed_operator_builder.block_diagonal_hmatrix) {} + DefaultLocalApproximationBuilder(const VirtualInternalGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, const HMatrixTreeBuilder &hmatrix_tree_builder, MPI_Comm communicator) : hmatrix(hmatrix_tree_builder.build(generator, target_cluster.get_cluster_on_partition(get_rankWorld(communicator)), source_cluster.get_cluster_on_partition(get_rankWorld(communicator)))), local_hmatrix(hmatrix, target_cluster.get_cluster_on_partition(get_rankWorld(communicator)), source_cluster.get_cluster_on_partition(get_rankWorld(communicator)), hmatrix_tree_builder.get_symmetry(), hmatrix_tree_builder.get_UPLO(), false, false), distributed_operator_holder(target_cluster, source_cluster, hmatrix_tree_builder.get_symmetry(), hmatrix_tree_builder.get_UPLO(), communicator, local_hmatrix), distributed_operator(distributed_operator_holder.distributed_operator) { + block_diagonal_hmatrix = hmatrix.get_sub_hmatrix(target_cluster.get_cluster_on_partition(get_rankWorld(communicator)), source_cluster.get_cluster_on_partition(get_rankWorld(communicator))); + } - DefaultLocalApproximationBuilder(const VirtualGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, htool::underlying_type epsilon, htool::underlying_type eta, char symmetry, char UPLO, MPI_Comm communicator) : DefaultLocalApproximationBuilder(InternalGeneratorWithPermutation(generator, target_cluster.get_permutation().data(), source_cluster.get_permutation().data()), target_cluster, source_cluster, epsilon, eta, symmetry, UPLO, communicator) {} + DefaultLocalApproximationBuilder(const VirtualGenerator &generator, const Cluster &target_cluster, const Cluster &source_cluster, const HMatrixTreeBuilder &hmatrix_tree_builder, MPI_Comm communicator) : DefaultLocalApproximationBuilder(InternalGeneratorWithPermutation(generator, target_cluster.get_permutation().data(), source_cluster.get_permutation().data()), target_cluster, source_cluster, hmatrix_tree_builder, communicator) {} }; } // namespace htool diff --git a/include/htool/hmatrix/hmatrix.hpp b/include/htool/hmatrix/hmatrix.hpp index 314600ee..6df5fef9 100644 --- a/include/htool/hmatrix/hmatrix.hpp +++ b/include/htool/hmatrix/hmatrix.hpp @@ -41,7 +41,7 @@ class HMatrix : public TreeNode> m_dense_data{nullptr}; - std::unique_ptr> m_low_rank_data{nullptr}; + std::unique_ptr> m_low_rank_data{nullptr}; // char m_symmetry_type_for_leaves{'N'}; @@ -57,7 +57,7 @@ class HMatrix : public TreeNode *target_cluster, const Cluster *source_cluster) : TreeNode>(parent), m_target_cluster(target_cluster), m_source_cluster(source_cluster) {} - HMatrix(const HMatrix &rhs) : TreeNode, HMatrixTreeData>(rhs), m_target_cluster(rhs.m_target_cluster), m_source_cluster(rhs.m_source_cluster), m_symmetry(rhs.m_symmetry), m_UPLO(rhs.m_UPLO), m_symmetry_type_for_leaves(), m_storage_type(rhs.m_storage_type) { + HMatrix(const HMatrix &rhs) : TreeNode, HMatrixTreeData>(rhs), m_target_cluster(rhs.m_target_cluster), m_source_cluster(rhs.m_source_cluster), m_symmetry(rhs.m_symmetry), m_UPLO(rhs.m_UPLO), m_symmetry_type_for_leaves(rhs.m_symmetry_type_for_leaves), m_UPLO_for_leaves(rhs.m_UPLO_for_leaves), m_storage_type(rhs.m_storage_type) { if (m_target_cluster->is_root() or is_cluster_on_partition(*m_target_cluster)) { Logger::get_instance().log(LogLevel::INFO, "Deep copy of HMatrix"); } @@ -72,7 +72,7 @@ class HMatrix : public TreeNode>(*rhs.m_dense_data); } if (rhs.m_low_rank_data) { - m_low_rank_data = std::make_unique>(*rhs.m_low_rank_data); + m_low_rank_data = std::make_unique>(*rhs.m_low_rank_data); } } HMatrix &operator=(const HMatrix &rhs) { @@ -99,7 +99,7 @@ class HMatrix : public TreeNode>(*rhs.m_dense_data); } if (rhs.m_low_rank_data) { - m_low_rank_data = std::make_unique>(*rhs.m_low_rank_data); + m_low_rank_data = std::make_unique>(*rhs.m_low_rank_data); } return *this; } @@ -145,8 +145,8 @@ class HMatrix : public TreeNode *get_dense_data() const { return m_dense_data.get(); } Matrix *get_dense_data() { return m_dense_data.get(); } - const LowRankMatrix *get_low_rank_data() const { return m_low_rank_data.get(); } - LowRankMatrix *get_low_rank_data() { return m_low_rank_data.get(); } + const LowRankMatrix *get_low_rank_data() const { return m_low_rank_data.get(); } + LowRankMatrix *get_low_rank_data() { return m_low_rank_data.get(); } char get_symmetry() const { return m_symmetry; } char get_UPLO() const { return m_UPLO; } const HMatrixTreeData *get_hmatrix_tree_data() const { return this->m_tree_data.get(); } @@ -205,7 +205,7 @@ class HMatrix : public TreeNodem_tree_data->m_eta = eta; } void set_epsilon(underlying_type epsilon) { this->m_tree_data->m_epsilon = epsilon; } - void set_low_rank_generator(std::shared_ptr> ptr) { this->m_tree_data->m_low_rank_generator = ptr; } + void set_low_rank_generator(std::shared_ptr> ptr) { this->m_tree_data->m_low_rank_generator = ptr; } void set_admissibility_condition(std::shared_ptr> ptr) { this->m_tree_data->m_admissibility_condition = ptr; } void set_minimal_target_depth(unsigned int minimal_target_depth) { this->m_tree_data->m_minimal_target_depth = minimal_target_depth; } void set_minimal_source_depth(unsigned int minimal_source_depth) { this->m_tree_data->m_minimal_source_depth = minimal_source_depth; } @@ -225,9 +225,15 @@ class HMatrix : public TreeNode &generator, const VirtualLowRankGenerator &low_rank_generator, int reqrank, underlying_type epsilon) { - m_low_rank_data = std::make_unique>(generator, low_rank_generator, *m_target_cluster, *m_source_cluster, reqrank, epsilon); + bool compute_low_rank_data(const VirtualInternalLowRankGenerator &low_rank_generator, int reqrank, underlying_type epsilon) { + m_low_rank_data = std::make_unique>(m_target_cluster->get_size(), m_source_cluster->get_size(), reqrank, epsilon); m_storage_type = StorageType::LowRank; + + if (reqrank > 0) { + return low_rank_generator.copy_low_rank_approximation(m_target_cluster->get_size(), m_source_cluster->get_size(), m_target_cluster->get_offset(), m_source_cluster->get_offset(), reqrank, *m_low_rank_data); + } else { + return low_rank_generator.copy_low_rank_approximation(m_target_cluster->get_size(), m_source_cluster->get_size(), m_target_cluster->get_offset(), m_source_cluster->get_offset(), *m_low_rank_data); + } } void clear_low_rank_data() { m_low_rank_data.reset(); } @@ -267,6 +273,28 @@ std::pair * return result; } +template > +std::vector *> get_low_rank_leaves_from(HMatrix &hmatrix) { + std::vector *> leaves; + + std::stack *> hmatrix_stack; + hmatrix_stack.push(&hmatrix); + + while (!hmatrix_stack.empty()) { + HMatrix *current_hmatrix = hmatrix_stack.top(); + hmatrix_stack.pop(); + + if (current_hmatrix->is_leaf() && current_hmatrix->is_low_rank()) { + leaves.push_back(current_hmatrix); + } + + for (const auto &child : current_hmatrix->get_children()) { + hmatrix_stack.push(child.get()); + } + } + return leaves; +} + template > void copy_to_dense(const HMatrix &hmatrix, CoefficientPrecision *ptr) { @@ -442,5 +470,18 @@ void copy_diagonal_in_user_numbering(const HMatrix> +bool left_hmatrix_ancestor_of_right_hmatrix(const HMatrix &hmatrix, const HMatrix &other_hmatrix) { + if ((left_cluster_contains_right_cluster(hmatrix.get_target_cluster(), other_hmatrix.get_target_cluster())) and (left_cluster_contains_right_cluster(hmatrix.get_source_cluster(), other_hmatrix.get_source_cluster()))) { + return true; + } + return false; +} + +template > +bool left_hmatrix_descendant_of_right_hmatrix(const HMatrix &hmatrix, const HMatrix &other_hmatrix) { + return left_hmatrix_ancestor_of_right_hmatrix(other_hmatrix, hmatrix); +} + } // namespace htool #endif diff --git a/include/htool/hmatrix/hmatrix_output.hpp b/include/htool/hmatrix/hmatrix_output.hpp index 0e7891d8..e928994d 100644 --- a/include/htool/hmatrix/hmatrix_output.hpp +++ b/include/htool/hmatrix/hmatrix_output.hpp @@ -85,16 +85,16 @@ template get_tree_parameters(const HMatrix &hmatrix) { const HMatrixTreeData *hmatrix_tree_data = hmatrix.get_hmatrix_tree_data(); std::map tree_parameters; - tree_parameters["Eta"] = std::to_string(hmatrix_tree_data->m_eta); - tree_parameters["Epsilon"] = std::to_string(hmatrix_tree_data->m_epsilon); - tree_parameters["MinTargetDepth"] = std::to_string(hmatrix_tree_data->m_minimal_target_depth); - tree_parameters["MinClusterSizeTarget"] = std::to_string(hmatrix.get_target_cluster().get_minclustersize()); - tree_parameters["MaxClusterDepthTarget"] = std::to_string(hmatrix.get_target_cluster().get_maximal_depth()); - tree_parameters["MinClusterDepthTarget"] = std::to_string(hmatrix.get_target_cluster().get_minimal_depth()); - tree_parameters["MinSourceDepth"] = std::to_string(hmatrix_tree_data->m_minimal_source_depth); - tree_parameters["MinClusterSizeSource"] = std::to_string(hmatrix.get_source_cluster().get_minclustersize()); - tree_parameters["MaxClusterDepthSource"] = std::to_string(hmatrix.get_source_cluster().get_maximal_depth()); - tree_parameters["MinClusterDepthSource"] = std::to_string(hmatrix.get_source_cluster().get_minimal_depth()); + tree_parameters["Eta"] = std::to_string(hmatrix_tree_data->m_eta); + tree_parameters["Epsilon"] = std::to_string(hmatrix_tree_data->m_epsilon); + tree_parameters["MinTargetDepth"] = std::to_string(hmatrix_tree_data->m_minimal_target_depth); + tree_parameters["MaxClusterLeafSizeTarget"] = std::to_string(hmatrix.get_target_cluster().get_maximal_leaf_size()); + tree_parameters["MaxClusterDepthTarget"] = std::to_string(hmatrix.get_target_cluster().get_maximal_depth()); + tree_parameters["MinClusterDepthTarget"] = std::to_string(hmatrix.get_target_cluster().get_minimal_depth()); + tree_parameters["MinSourceDepth"] = std::to_string(hmatrix_tree_data->m_minimal_source_depth); + tree_parameters["MaxClusterLeafSizeSource"] = std::to_string(hmatrix.get_source_cluster().get_maximal_leaf_size()); + tree_parameters["MaxClusterDepthSource"] = std::to_string(hmatrix.get_source_cluster().get_maximal_depth()); + tree_parameters["MinClusterDepthSource"] = std::to_string(hmatrix.get_source_cluster().get_minimal_depth()); return tree_parameters; } @@ -136,8 +136,8 @@ std::map get_hmatrix_information(const HMatrix hmatrix_information; - unsigned int nb_rows = hmatrix.get_target_cluster().get_size(); - unsigned int nb_cols = hmatrix.get_source_cluster().get_size(); + std::size_t nb_rows = hmatrix.get_target_cluster().get_size(); // unsigned int replaced by std::size_t + std::size_t nb_cols = hmatrix.get_source_cluster().get_size(); // 0 : dense mat ; 1 : lr mat ; 2 : rank ; 3 : local_size std::array maxinfos = {0, 0, 0}; diff --git a/include/htool/hmatrix/hmatrix_output_dot.hpp b/include/htool/hmatrix/hmatrix_output_dot.hpp new file mode 100644 index 00000000..9d703910 --- /dev/null +++ b/include/htool/hmatrix/hmatrix_output_dot.hpp @@ -0,0 +1,214 @@ +#ifndef HTOOL_HMATRIX_OUTPUT_DOT_HPP +#define HTOOL_HMATRIX_OUTPUT_DOT_HPP + +#include "hmatrix.hpp" + +/* +This file contain usefull functions for visualizing the block tree of a hierarchical matrix +*/ + +namespace htool { + +struct TreeCounts { + size_t call_count = 0; + size_t node_count = 0; +}; + +/** + * @brief Creates a DOT file visualizing the block tree of a hierarchical matrix. + * + * This function takes a hierarchical matrix and a set of submatrices to color + * and generates a DOT file which can be used to visualize the block tree of the hierarchical matrix. + * The DOT file will contain a node for each HMatrix in the hierarchical matrix, + * and an edge between two nodes if the corresponding HMatrices are in the same + * block tree. The nodes are colored according to the set of submatrices given as input. + * + * The DOT file also contains a tooltip that summarizes the block tree, giving + * the number of HMatrices and the number of submatrices to color. + * + * @param hmatrix The input hierarchical matrix for which the block tree is visualized. + * @param subhmatrices_to_color The set of submatrices to color in the block tree. + * @param dotFile The output stream to write the DOT file to. + */ +// template > +// void view_block_tree(HMatrix &hmatrix, std::vector *> &subhmatrices_to_color, std::ostream &dotFile) { +// // Start the DOT file content +// dotFile << "digraph {\n"; + +// // Create the block tree by adding nodes and edges to the DOT file +// TreeCounts counts; +// create_block_tree(hmatrix, dotFile, counts, subhmatrices_to_color); + +// // Add tooltip information to the DOT file summarizing the block tree +// dotFile << " tooltip=\"Block tree information: \\n" +// << "number of HMatrices: " << counts.node_count << "\\n" +// << "number of subHMatrices to color: " << subhmatrices_to_color.size() << "\";\n"; +// dotFile << "}\n"; +// } + +// overload for const +template > +void view_block_tree(HMatrix &hmatrix, const std::vector *> &subhmatrices_to_color, std::ostream &dotFile) { + // Start the DOT file content + dotFile << "digraph {\n"; + + // Create the block tree by adding nodes and edges to the DOT file + TreeCounts counts; + create_block_tree(hmatrix, dotFile, counts, subhmatrices_to_color); + + // Add tooltip information to the DOT file summarizing the block tree + dotFile << " tooltip=\"Block tree information: \\n" + << "number of HMatrices: " << counts.node_count << "\\n" + << "number of subHMatrices to color: " << subhmatrices_to_color.size() << "\";\n"; + dotFile << "}\n"; +} + +/** + * @brief Creates a unique identifier for a given HMatrix based on its target and source cluster offsets. + * + * This function generates an identifier by concatenating the minimum and maximum offsets of the target and source clusters. + * This identifier is used to label nodes in the block tree visualization of the HMatrix. + * + * @param hmatrix The input hierarchical matrix for which the identifier is generated. + * @return A unique identifier for the given HMatrix. + */ +template > +std::string get_hmatrix_id(const HMatrix &hmatrix) { + // Get the minimum and maximum offsets of the target and source clusters + std::string target_min = std::to_string(hmatrix.get_target_cluster().get_offset()); + std::string target_max = std::to_string(hmatrix.get_target_cluster().get_offset() + hmatrix.get_target_cluster().get_size() - 1); + std::string source_min = std::to_string(hmatrix.get_source_cluster().get_offset()); + std::string source_max = std::to_string(hmatrix.get_source_cluster().get_offset() + hmatrix.get_source_cluster().get_size() - 1); + + // Create the identifier by concatenating the offsets + return target_min + "_" + target_max + "_" + source_min + "_" + source_max; +} + +/** + * @brief Adds a node to the block tree in the DOT file. + * + * This function adds a new node to the dot file. + * It also adds the node information in a tooltip. + * If the node is in subhmatrices_to_color, it colors it in light blue. + * The function also increments the node count. + * + * @param hmatrix The input hierarchical matrix for which the node is created. + * @param dotFile The output stream to write the node data to. + * @param counts The TreeCounts object to increment the node count. + * @param subhmatrices_to_color A vector of nodes to be highlighted. + */ +// template > +// void add_node_to_block_tree(HMatrix &hmatrix, std::ostream &dotFile, TreeCounts &counts, std::vector *> &subhmatrices_to_color) { +// // Define node +// dotFile << " H_" << get_hmatrix_id(hmatrix) << " [tooltip=\"Node information: \\n"; + +// // Add the node information in a tooltip. +// auto hmatrix_info = get_hmatrix_information(hmatrix); +// for (auto &info : hmatrix_info) { +// dotFile << info.first << ": " << info.second << "\\n"; +// } + +// // Check if the current node is in subhmatrices_to_color and color it in light blue if true +// dotFile << "\""; +// if (std::find(subhmatrices_to_color.begin(), subhmatrices_to_color.end(), &hmatrix) != subhmatrices_to_color.end()) { +// dotFile << ", style=filled, fillcolor=lightblue"; +// } + +// // End of node definition +// dotFile << "];\n"; + +// // Increment node count +// counts.node_count++; +// } + +// overload for const +template > +void add_node_to_block_tree(HMatrix &hmatrix, std::ostream &dotFile, TreeCounts &counts, const std::vector *> &subhmatrices_to_color) { + // Define node + dotFile << " H_" << get_hmatrix_id(hmatrix) << " [tooltip=\"Node information: \\n"; + + // Add the node information in a tooltip. + auto hmatrix_info = get_hmatrix_information(hmatrix); + for (auto &info : hmatrix_info) { + dotFile << info.first << ": " << info.second << "\\n"; + } + + // Check if the current node is in subhmatrices_to_color and color it in light blue if true + dotFile << "\""; + if (std::find(subhmatrices_to_color.begin(), subhmatrices_to_color.end(), &hmatrix) != subhmatrices_to_color.end()) { + dotFile << ", style=filled, fillcolor=lightblue"; + } + + // End of node definition + dotFile << "];\n"; + + // Increment node count + counts.node_count++; +} + +/** + * @brief Constructs a block tree visualization for a hierarchical matrix (HMatrix). + * + * This function generates the structure of a block tree by recursively adding nodes + * and edges to a provided DOT file stream. It starts by adding the root node and + * then iterates over the children of each node, adding them to the DOT file and + * linking them with edges. The tree is constructed in a depth-first manner. + * Each node and edge is annotated with tooltips for additional information. + * + * @param hmatrix The input hierarchical matrix for which the block tree is created. + * @param dotFile The output stream to write the DOT representation of the block tree. + * @param counts A TreeCounts object that tracks the number of calls and nodes processed. + * @param subhmatrices_to_color A vector of pointers to HMatrix nodes, representing an initial set of nodes. + */ +// template > +// void create_block_tree(HMatrix &hmatrix, std::ostream &dotFile, TreeCounts &counts, std::vector *> &subhmatrices_to_color) { +// // Increment the call count +// counts.call_count++; + +// // Add root node +// if (counts.call_count == 1) { +// // Add the root node to the dot file +// add_node_to_block_tree(hmatrix, dotFile, counts, subhmatrices_to_color); +// } + +// // Add child nodes +// for (const auto &child : hmatrix.get_children()) { +// // Add the child node to the dot file +// add_node_to_block_tree(*child.get(), dotFile, counts, subhmatrices_to_color); + +// // Add an edge between the parent and child nodes +// dotFile << " H_" << get_hmatrix_id(hmatrix) << " -> H_" << get_hmatrix_id(*child.get()) << " [tooltip=\"" << get_hmatrix_id(hmatrix) << " -> " << get_hmatrix_id(*child.get()) << "\"];\n"; + +// // Recursively add child nodes +// create_block_tree(*child.get(), dotFile, counts, subhmatrices_to_color); +// } +// } + +// overload for const +template > +void create_block_tree(HMatrix &hmatrix, std::ostream &dotFile, TreeCounts &counts, const std::vector *> &subhmatrices_to_color) { + // Increment the call count + counts.call_count++; + + // Add root node + if (counts.call_count == 1) { + // Add the root node to the dot file + add_node_to_block_tree(hmatrix, dotFile, counts, subhmatrices_to_color); + } + + // Add child nodes + for (const auto &child : hmatrix.get_children()) { + // Add the child node to the dot file + add_node_to_block_tree(*child.get(), dotFile, counts, subhmatrices_to_color); + + // Add an edge between the parent and child nodes + dotFile << " H_" << get_hmatrix_id(hmatrix) << " -> H_" << get_hmatrix_id(*child.get()) << " [tooltip=\"" << get_hmatrix_id(hmatrix) << " -> " << get_hmatrix_id(*child.get()) << "\"];\n"; + + // Recursively add child nodes + create_block_tree(*child.get(), dotFile, counts, subhmatrices_to_color); + } +} + +} // namespace htool + +#endif \ No newline at end of file diff --git a/include/htool/hmatrix/hmatrix_tree_data.hpp b/include/htool/hmatrix/hmatrix_tree_data.hpp index 0624736e..8381837a 100644 --- a/include/htool/hmatrix/hmatrix_tree_data.hpp +++ b/include/htool/hmatrix/hmatrix_tree_data.hpp @@ -29,7 +29,7 @@ struct HMatrixTreeData { mutable std::map> m_timings; // Strategies - std::shared_ptr> + std::shared_ptr> m_low_rank_generator; std::shared_ptr> m_admissibility_condition; }; diff --git a/include/htool/hmatrix/interfaces/virtual_admissibility_condition.hpp b/include/htool/hmatrix/interfaces/virtual_admissibility_condition.hpp index c4bf5cda..6da2028c 100644 --- a/include/htool/hmatrix/interfaces/virtual_admissibility_condition.hpp +++ b/include/htool/hmatrix/interfaces/virtual_admissibility_condition.hpp @@ -1,5 +1,5 @@ -#ifndef HTOOL_BLOCKS_ADMISSIBILITY_CONDITIONS_HPP -#define HTOOL_BLOCKS_ADMISSIBILITY_CONDITIONS_HPP +#ifndef HTOOL_HMATRIX_VIRTUAL_BLOCKS_ADMISSIBILITY_CONDITIONS_HPP +#define HTOOL_HMATRIX_VIRTUAL_BLOCKS_ADMISSIBILITY_CONDITIONS_HPP #include "../../basic_types/vector.hpp" // for norm2 #include "../../clustering/cluster_node.hpp" // for Cluster diff --git a/include/htool/hmatrix/interfaces/virtual_dense_blocks_generator.hpp b/include/htool/hmatrix/interfaces/virtual_dense_blocks_generator.hpp index 31b15bcf..16458e69 100644 --- a/include/htool/hmatrix/interfaces/virtual_dense_blocks_generator.hpp +++ b/include/htool/hmatrix/interfaces/virtual_dense_blocks_generator.hpp @@ -1,5 +1,5 @@ -#ifndef HTOOL_DENSE_GENERATOR_HPP -#define HTOOL_DENSE_GENERATOR_HPP +#ifndef HTOOL_HMATRIX_VIRTUAL_DENSE_GENERATOR_HPP +#define HTOOL_HMATRIX_VIRTUAL_DENSE_GENERATOR_HPP #include // for vector diff --git a/include/htool/hmatrix/interfaces/virtual_generator.hpp b/include/htool/hmatrix/interfaces/virtual_generator.hpp index b1f44676..60b2fa35 100644 --- a/include/htool/hmatrix/interfaces/virtual_generator.hpp +++ b/include/htool/hmatrix/interfaces/virtual_generator.hpp @@ -1,5 +1,5 @@ -#ifndef HTOOL_GENERATOR_HPP -#define HTOOL_GENERATOR_HPP +#ifndef HTOOL_HMATRIX_VIRTUAL_GENERATOR_HPP +#define HTOOL_HMATRIX_VIRTUAL_GENERATOR_HPP namespace htool { diff --git a/include/htool/hmatrix/interfaces/virtual_lrmat_generator.hpp b/include/htool/hmatrix/interfaces/virtual_lrmat_generator.hpp index 020fa9db..6fab5f96 100644 --- a/include/htool/hmatrix/interfaces/virtual_lrmat_generator.hpp +++ b/include/htool/hmatrix/interfaces/virtual_lrmat_generator.hpp @@ -1,25 +1,60 @@ -#ifndef HTOOL_VIRTUAL_LRMAT_GENERATOR_HPP -#define HTOOL_VIRTUAL_LRMAT_GENERATOR_HPP +#ifndef HTOOL_HMATRIX_VIRTUAL_LRMAT_GENERATOR_HPP +#define HTOOL_HMATRIX_VIRTUAL_LRMAT_GENERATOR_HPP #include "../../clustering/cluster_node.hpp" // for Cluster +#include "../../hmatrix/lrmat/lrmat.hpp" // for LowRankMatrix #include "../../matrix/matrix.hpp" // for Matrix #include "../../misc/misc.hpp" // for underlying_type -#include "virtual_generator.hpp" // for VirtualGenerator namespace htool { +template +class VirtualInternalLowRankGenerator { + public: + VirtualInternalLowRankGenerator() {} + + // C style + virtual bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, LowRankMatrix &lrmat) const = 0; + + virtual bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, int reqrank, LowRankMatrix &lrmat) const = 0; + + virtual ~VirtualInternalLowRankGenerator() {} +}; + template > class VirtualLowRankGenerator { public: VirtualLowRankGenerator() {} // C style - virtual void copy_low_rank_approximation(const VirtualInternalGenerator &A, const Cluster &t, const Cluster &s, underlying_type epsilon, int &rank, Matrix &U, Matrix &V) const = 0; + virtual bool copy_low_rank_approximation(int M, int N, const int *rows, const int *cols, LowRankMatrix &lrmat) const = 0; + + virtual bool copy_low_rank_approximation(int M, int N, const int *rows, const int *cols, int reqrank, LowRankMatrix &lrmat) const = 0; - virtual bool is_htool_owning_data() const { return true; } virtual ~VirtualLowRankGenerator() {} }; +template +class InternalLowRankGenerator : public VirtualInternalLowRankGenerator { + + protected: + const VirtualLowRankGenerator &m_low_rank_generator; + const int *m_target_permutation; + const int *m_source_permutation; + + public: + InternalLowRankGenerator(const VirtualLowRankGenerator &low_rank_generator, const int *target_permutation, const int *source_permutation) : m_low_rank_generator(low_rank_generator), m_target_permutation(target_permutation), m_source_permutation(source_permutation) { + } + + virtual bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, LowRankMatrix &lrmat) const override { + return m_low_rank_generator.copy_low_rank_approximation(M, N, m_target_permutation + row_offset, m_source_permutation + col_offset, lrmat); + } + + virtual bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, int reqrank, LowRankMatrix &lrmat) const override { + return m_low_rank_generator.copy_low_rank_approximation(M, N, m_target_permutation + row_offset, m_source_permutation + col_offset, reqrank, lrmat); + } +}; + } // namespace htool #endif diff --git a/include/htool/hmatrix/linalg/add_hmatrix_hmatrix_product.hpp b/include/htool/hmatrix/linalg/add_hmatrix_hmatrix_product.hpp index 4a230203..8833de5d 100644 --- a/include/htool/hmatrix/linalg/add_hmatrix_hmatrix_product.hpp +++ b/include/htool/hmatrix/linalg/add_hmatrix_hmatrix_product.hpp @@ -20,8 +20,8 @@ namespace htool { -template -void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const HMatrix &B, CoefficientPrecision beta, LowRankMatrix &C) { +template > +void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const HMatrix &B, CoefficientPrecision beta, LowRankMatrix &C) { if (beta != CoefficientPrecision(1)) { scale(beta, C); } @@ -30,7 +30,7 @@ void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientP bool block_tree_not_consistent = (A.get_target_cluster().get_rank() < 0 || A.get_source_cluster().get_rank() < 0 || B.get_target_cluster().get_rank() < 0 || B.get_source_cluster().get_rank() < 0); - std::vector> low_rank_matrices; + std::vector> low_rank_matrices; std::vector *> output_clusters, middle_clusters, input_clusters; const Cluster &output_cluster = transa == 'N' ? A.get_target_cluster() : A.get_source_cluster(); @@ -75,7 +75,7 @@ void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientP for (auto &output_cluster_child : output_clusters) { for (auto &input_cluster_child : input_clusters) { - low_rank_matrices.emplace_back(C.get_epsilon()); + low_rank_matrices.emplace_back(output_cluster_child->get_size(), input_cluster_child->get_size(), C.get_epsilon()); low_rank_matrices.back().get_U().resize(output_cluster_child->get_size(), 0); low_rank_matrices.back().get_V().resize(0, input_cluster_child->get_size()); for (auto &middle_cluster_child : middle_clusters) { @@ -119,7 +119,7 @@ void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientP } } -template +template > void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const HMatrix &B, CoefficientPrecision beta, Matrix &C) { if (beta != CoefficientPrecision(1)) { @@ -206,7 +206,7 @@ void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientP } } -template +template > void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const HMatrix &B, CoefficientPrecision beta, HMatrix &C) { if (C.is_dense()) { @@ -276,7 +276,7 @@ void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientP } } else { if (A.is_low_rank() || B.is_low_rank()) { - LowRankMatrix lrmat(A.is_low_rank() ? A.get_low_rank_data()->get_epsilon() : B.get_low_rank_data()->get_epsilon()); + LowRankMatrix lrmat(A.nb_rows(), B.nb_cols(), A.is_low_rank() ? A.get_low_rank_data()->get_epsilon() : B.get_low_rank_data()->get_epsilon()); if (A.is_dense() and B.is_low_rank()) { add_matrix_lrmat_product(transa, transb, alpha, *A.get_dense_data(), *B.get_low_rank_data(), beta, lrmat); @@ -292,7 +292,7 @@ void internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientP internal_add_lrmat_hmatrix(lrmat, C); } else { - LowRankMatrix lrmat(C.get_epsilon()); + LowRankMatrix lrmat(A.nb_rows(), B.nb_cols(), C.get_epsilon()); if (A.is_dense() and B.is_dense()) { add_matrix_matrix_product(transa, transb, alpha, *A.get_dense_data(), *B.get_dense_data(), beta, lrmat); } else if (A.is_dense() and B.is_hierarchical()) { diff --git a/include/htool/hmatrix/linalg/add_hmatrix_lrmat_product.hpp b/include/htool/hmatrix/linalg/add_hmatrix_lrmat_product.hpp index 9f643414..7f0307fe 100644 --- a/include/htool/hmatrix/linalg/add_hmatrix_lrmat_product.hpp +++ b/include/htool/hmatrix/linalg/add_hmatrix_lrmat_product.hpp @@ -1,21 +1,21 @@ #ifndef HTOOL_HMATRIX_LINALG_ADD_HMATRIX_LOW_RANK_MATRIX_PRODUCT_HPP #define HTOOL_HMATRIX_LINALG_ADD_HMATRIX_LOW_RANK_MATRIX_PRODUCT_HPP -#include "../../matrix/linalg/scale.hpp" // for scale -#include "../../matrix/linalg/transpose.hpp" // for transpose -#include "../../matrix/matrix.hpp" // for Matrix -#include "../../misc/misc.hpp" // for conj_if_complex -#include "../hmatrix.hpp" // for HMatrix -#include "../lrmat/lrmat.hpp" // for LowRankMatrix -#include "../lrmat/utils/recompression.hpp" // for recompression -#include "add_hmatrix_matrix_product.hpp" // for add_hmatrix_m... -#include "add_lrmat_hmatrix.hpp" // for add_lrmat_hma... -#include // for copy_n +#include "../../matrix/linalg/scale.hpp" // for scale +#include "../../matrix/linalg/transpose.hpp" // for transpose +#include "../../matrix/matrix.hpp" // for Matrix +#include "../../misc/misc.hpp" // for conj_if_complex +#include "../hmatrix.hpp" // for HMatrix +#include "../lrmat/lrmat.hpp" // for LowRankMatrix +#include "../lrmat/utils/SVD_recompression.hpp" // for recompression +#include "add_hmatrix_matrix_product.hpp" // for add_hmatrix_m... +#include "add_lrmat_hmatrix.hpp" // for add_lrmat_hma... +#include // for copy_n namespace htool { -template -void internal_add_hmatrix_lrmat_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const LowRankMatrix &B, CoefficientPrecision beta, Matrix &C) { +template > +void internal_add_hmatrix_lrmat_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const LowRankMatrix &B, CoefficientPrecision beta, Matrix &C) { auto rank = B.rank_of(); if (rank != 0) { @@ -37,8 +37,8 @@ void internal_add_hmatrix_lrmat_product(char transa, char transb, CoefficientPre } } -template -void internal_add_hmatrix_lrmat_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const LowRankMatrix &B, CoefficientPrecision beta, LowRankMatrix &C) { +template > +void internal_add_hmatrix_lrmat_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const LowRankMatrix &B, CoefficientPrecision beta, LowRankMatrix &C) { auto rank = B.rank_of(); if (rank != 0) { @@ -108,18 +108,18 @@ void internal_add_hmatrix_lrmat_product(char transa, char transb, CoefficientPre C.get_U() = new_U; C.get_V() = new_V; - recompression(C); + SVD_recompression(C); } } } -template -void internal_add_hmatrix_lrmat_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const LowRankMatrix &B, CoefficientPrecision beta, HMatrix &C) { +template > +void internal_add_hmatrix_lrmat_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const LowRankMatrix &B, CoefficientPrecision beta, HMatrix &C) { if (beta != CoefficientPrecision(1)) { scale(beta, C); } - LowRankMatrix lrmat(B.get_epsilon()); + LowRankMatrix lrmat(A.nb_rows(), B.nb_cols(), B.get_epsilon()); internal_add_hmatrix_lrmat_product(transa, transb, alpha, A, B, CoefficientPrecision(1), lrmat); internal_add_lrmat_hmatrix(lrmat, C); } diff --git a/include/htool/hmatrix/linalg/add_hmatrix_matrix_product.hpp b/include/htool/hmatrix/linalg/add_hmatrix_matrix_product.hpp index 9aa3c834..7e3e738f 100644 --- a/include/htool/hmatrix/linalg/add_hmatrix_matrix_product.hpp +++ b/include/htool/hmatrix/linalg/add_hmatrix_matrix_product.hpp @@ -9,20 +9,35 @@ #include "../../wrappers/wrapper_blas.hpp" // for Blas #include "../hmatrix.hpp" // for HMatrix #include "../lrmat/lrmat.hpp" // for LowRankMatrix -#include "../lrmat/utils/recompression.hpp" // for recompression +#include "../lrmat/utils/SVD_recompression.hpp" // for recompression #include "add_hmatrix_matrix_product_row_major.hpp" // for sequential_ad... -#include // for copy_n, min -#include // for vector +#include "execution_policies.hpp" +#include // for copy_n, min +#include // for vector namespace htool { -template -void internal_add_hmatrix_matrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const Matrix &B, CoefficientPrecision beta, Matrix &C) { +template > +void internal_add_hmatrix_matrix_product(ExecutionPolicy &&, char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const Matrix &B, CoefficientPrecision beta, Matrix &C) { if (transb == 'N') { Matrix transposed_B(B.nb_cols(), B.nb_rows()), transposed_C(C.nb_cols(), C.nb_rows()); transpose(B, transposed_B); transpose(C, transposed_C); +#if defined(__cpp_lib_execution) && __cplusplus >= 201703L + if constexpr (std::is_execution_policy_v>) { + if constexpr (std::is_same_v, std::execution::parallel_policy>) { + openmp_internal_add_hmatrix_matrix_product_row_major(transa, transb, alpha, A, transposed_B.data(), beta, transposed_C.data(), transposed_C.nb_rows()); + } else if constexpr (std::is_same_v, std::execution::sequenced_policy>) { + sequential_internal_add_hmatrix_matrix_product_row_major(transa, transb, alpha, A, transposed_B.data(), beta, transposed_C.data(), transposed_C.nb_rows()); + } else { + static_assert(std::is_same_v, std::execution::sequenced_policy> || std::is_same_v, std::execution::parallel_policy>, "Invalid execution policy for add_hmatrix_vector_product."); + } + } else { + static_assert(std::is_execution_policy_v>, "Invalid execution policy for add_hmatrix_vector_product."); + } +#else sequential_internal_add_hmatrix_matrix_product_row_major(transa, transb, alpha, A, transposed_B.data(), beta, transposed_C.data(), transposed_C.nb_rows()); +#endif transpose(transposed_C, C); } else { Matrix transposed_C(C.nb_cols(), C.nb_rows()); @@ -34,13 +49,36 @@ void internal_add_hmatrix_matrix_product(char transa, char transb, CoefficientPr conj_if_complex(buffer_B.data(), buffer_B.size()); } +#if defined(__cpp_lib_execution) && __cplusplus >= 201703L + if constexpr (std::is_execution_policy_v>) { + if constexpr (std::is_same_v, std::execution::parallel_policy>) { + openmp_internal_add_hmatrix_matrix_product_row_major(transa, 'N', alpha, A, transb == 'C' ? buffer_B.data() : B.data(), beta, transposed_C.data(), transposed_C.nb_rows()); + } else if constexpr (std::is_same_v, std::execution::sequenced_policy>) { + sequential_internal_add_hmatrix_matrix_product_row_major(transa, 'N', alpha, A, transb == 'C' ? buffer_B.data() : B.data(), beta, transposed_C.data(), transposed_C.nb_rows()); + } else { + static_assert(!std::is_same_v, std::execution::sequenced_policy> && !std::is_same_v, std::execution::parallel_policy>, "Invalid execution policy for add_hmatrix_vector_product."); + } + } else { + static_assert(!std::is_execution_policy_v>, "Invalid execution policy for add_hmatrix_vector_product."); + } +#else sequential_internal_add_hmatrix_matrix_product_row_major(transa, 'N', alpha, A, transb == 'C' ? buffer_B.data() : B.data(), beta, transposed_C.data(), transposed_C.nb_rows()); +#endif transpose(transposed_C, C); } } -template -void internal_add_hmatrix_matrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const Matrix &B, CoefficientPrecision beta, LowRankMatrix &C) { +template > +void internal_add_hmatrix_matrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const Matrix &B, CoefficientPrecision beta, Matrix &C) { +#if defined(__cpp_lib_execution) && __cplusplus >= 201703L + internal_add_hmatrix_matrix_product(std::execution::seq, transa, transb, alpha, A, B, beta, C); +#else + internal_add_hmatrix_matrix_product(nullptr, transa, transb, alpha, A, B, beta, C); +#endif +} + +template > +void internal_add_hmatrix_matrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const Matrix &B, CoefficientPrecision beta, LowRankMatrix &C) { bool C_is_overwritten = (beta == CoefficientPrecision(0) || C.rank_of() == 0); int nb_rows = (transa == 'N') ? A.nb_rows() : A.nb_cols(); @@ -114,9 +152,62 @@ void internal_add_hmatrix_matrix_product(char transa, char transb, CoefficientPr // Set C C.get_U() = new_U; C.get_V() = new_V; - recompression(C); + SVD_recompression(C); } +#if !defined(__cpp_lib_execution) || __cplusplus < 201703L +# if defined(__clang__) +# pragma clang diagnostic push +# pragma clang diagnostic ignored "-Wunused-parameter" +# elif defined(__GNUC__) || defined(__GNUG__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wunused-parameter" +# endif +#endif +template > +void add_hmatrix_matrix_product(ExecutionPolicy &&execution_policy, char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const Matrix &B, CoefficientPrecision beta, Matrix &C, CoefficientPrecision *buffer = nullptr) { + auto &target_cluster = A.get_target_cluster(); + auto &source_cluster = A.get_source_cluster(); + + std::vector tmp(buffer == nullptr ? (target_cluster.get_size() + source_cluster.get_size()) * B.nb_cols() : 0, 0); + + Matrix permuted_B; + Matrix permuted_C; + permuted_B.assign(B.nb_rows(), B.nb_cols(), (buffer == nullptr) ? tmp.data() : buffer, false); + permuted_C.assign(C.nb_rows(), C.nb_cols(), (buffer == nullptr) ? tmp.data() + B.nb_rows() * B.nb_cols() : buffer + B.nb_rows() * B.nb_cols(), false); + + for (int i = 0; i < B.nb_cols(); i++) { + user_to_cluster(source_cluster, B.data() + source_cluster.get_size() * i, permuted_B.data() + source_cluster.get_size() * i); + } + for (int i = 0; i < C.nb_cols(); i++) { + user_to_cluster(target_cluster, C.data() + target_cluster.get_size() * i, permuted_C.data() + target_cluster.get_size() * i); + } + +#if defined(__cpp_lib_execution) && __cplusplus >= 201703L + internal_add_hmatrix_matrix_product(execution_policy, transa, transb, alpha, A, permuted_B, beta, permuted_C); +#else + internal_add_hmatrix_matrix_product(transa, transb, alpha, A, permuted_B, beta, permuted_C); +#endif + for (int i = 0; i < C.nb_cols(); i++) { + cluster_to_user(target_cluster, permuted_C.data() + target_cluster.get_size() * i, C.data() + target_cluster.get_size() * i); + } +} +#if !defined(__cpp_lib_execution) || __cplusplus < 201703L +# if defined(__clang__) +# pragma clang diagnostic pop +# elif defined(__GNUC__) || defined(__GNUG__) +# pragma GCC diagnostic pop +# endif +#endif + +template > +void add_hmatrix_matrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const Matrix &B, CoefficientPrecision beta, Matrix &C, CoefficientPrecision *buffer = nullptr) { +#if defined(__cpp_lib_execution) && __cplusplus >= 201703L + add_hmatrix_matrix_product(std::execution::seq, transa, transb, alpha, A, B, beta, C, buffer); +#else + add_hmatrix_matrix_product(nullptr, transa, transb, alpha, A, B, beta, C, buffer); +#endif +} } // namespace htool #endif diff --git a/include/htool/hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp b/include/htool/hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp index b7847f5b..0abeedce 100644 --- a/include/htool/hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp +++ b/include/htool/hmatrix/linalg/add_hmatrix_matrix_product_row_major.hpp @@ -54,7 +54,7 @@ void internal_add_hmatrix_matrix_product_row_major(char transa, char transb, std } } -template +template > void sequential_internal_add_hmatrix_matrix_product_row_major(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *B, CoefficientPrecision beta, CoefficientPrecision *C, int mu) { std::vector *> leaves; std::vector *> leaves_for_symmetry; @@ -109,7 +109,7 @@ void sequential_internal_add_hmatrix_matrix_product_row_major(char transa, char Blas::axpy(&out_size, &alpha, temp.data(), &incx, C, &incy); } -template +template > void openmp_internal_add_hmatrix_matrix_product_row_major(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *B, CoefficientPrecision beta, CoefficientPrecision *C, int mu) { std::vector *> leaves; std::vector *> leaves_for_symmetry; diff --git a/include/htool/hmatrix/linalg/add_hmatrix_vector_product.hpp b/include/htool/hmatrix/linalg/add_hmatrix_vector_product.hpp index 91da8197..d874dc2f 100644 --- a/include/htool/hmatrix/linalg/add_hmatrix_vector_product.hpp +++ b/include/htool/hmatrix/linalg/add_hmatrix_vector_product.hpp @@ -6,9 +6,10 @@ #include "../../wrappers/wrapper_blas.hpp" // for Blas #include "../hmatrix.hpp" // for HMatrix #include "../lrmat/linalg/add_lrmat_vector_product.hpp" // for add_lrm... -#include // for transform -#include // for complex -#include // for vector +#include "execution_policies.hpp" +#include // for transform +#include // for complex +#include // for vector namespace htool { @@ -52,7 +53,7 @@ void internal_add_hmatrix_vector_product(char trans, std::complex +template > void sequential_internal_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) { int out_size(A.get_target_cluster().get_size()); @@ -96,7 +97,7 @@ void sequential_internal_add_hmatrix_vector_product(char trans, CoefficientPreci } } -template +template > void openmp_internal_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out) { std::vector *> leaves; std::vector *> leaves_for_symmetry; @@ -158,6 +159,42 @@ void openmp_internal_add_hmatrix_vector_product(char trans, CoefficientPrecision } } +template > +void add_hmatrix_vector_product(ExecutionPolicy &&, char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, CoefficientPrecision *buffer = nullptr) { + auto &source_cluster = A.get_source_cluster(); + auto &target_cluster = A.get_target_cluster(); + std::vector tmp(buffer == nullptr ? target_cluster.get_size() + source_cluster.get_size() : 0, 0); + CoefficientPrecision *buffer_ptr = buffer == nullptr ? tmp.data() : buffer; + user_to_cluster(source_cluster, in, buffer_ptr); + user_to_cluster(target_cluster, out, buffer_ptr + source_cluster.get_size()); + +#if defined(__cpp_lib_execution) && __cplusplus >= 201703L + if constexpr (std::is_execution_policy_v>) { + if constexpr (std::is_same_v, std::execution::parallel_policy>) { + openmp_internal_add_hmatrix_vector_product(trans, alpha, A, buffer_ptr, beta, buffer_ptr + source_cluster.get_size()); + } else if constexpr (std::is_same_v, std::execution::sequenced_policy>) { + sequential_internal_add_hmatrix_vector_product(trans, alpha, A, buffer_ptr, beta, buffer_ptr + source_cluster.get_size()); + } else { + static_assert(std::is_same_v, std::execution::sequenced_policy> || std::is_same_v, std::execution::parallel_policy>, "Invalid execution policy for add_hmatrix_vector_product."); + } + } else { + static_assert(std::is_execution_policy_v>, "Invalid execution policy for add_hmatrix_vector_product."); + } +#else + sequential_internal_add_hmatrix_vector_product(trans, alpha, A, buffer_ptr, beta, buffer_ptr + source_cluster.get_size()); +#endif + cluster_to_user(target_cluster, buffer_ptr + source_cluster.get_size(), out); +} + +template > +void add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, CoefficientPrecision *buffer = nullptr) { +#if defined(__cpp_lib_execution) && __cplusplus >= 201703L + add_hmatrix_vector_product(std::execution::seq, trans, alpha, A, in, beta, out, buffer); +#else + add_hmatrix_vector_product(nullptr, trans, alpha, A, in, beta, out, buffer); +#endif +} + } // namespace htool #endif diff --git a/include/htool/hmatrix/linalg/add_lrmat_hmatrix.hpp b/include/htool/hmatrix/linalg/add_lrmat_hmatrix.hpp index 520d7b31..ab4d6576 100644 --- a/include/htool/hmatrix/linalg/add_lrmat_hmatrix.hpp +++ b/include/htool/hmatrix/linalg/add_lrmat_hmatrix.hpp @@ -11,8 +11,8 @@ #include // for vector namespace htool { -template -void internal_add_lrmat_hmatrix(const LowRankMatrix &lrmat, HMatrix &hmatrix) { +template > +void internal_add_lrmat_hmatrix(const LowRankMatrix &lrmat, HMatrix &hmatrix) { std::vector *> leaves; preorder_tree_traversal(hmatrix, [&leaves](HMatrix ¤t_node) { if (!current_node.is_hierarchical()) { @@ -51,5 +51,52 @@ void internal_add_lrmat_hmatrix(const LowRankMatrix +void internal_add_lrmat_hmatrix(const LowRankMatrix &lrmat, const Cluster &target_cluster_lrmat, const Cluster &source_cluster_lrmat, HMatrix &hmatrix) { + // Check if hmatrix is larger + if (target_cluster_lrmat.get_size() < hmatrix.get_target_cluster().get_size() || source_cluster_lrmat.get_size() < hmatrix.get_source_cluster().get_size()) { + htool::Logger::get_instance().log(LogLevel::ERROR, "LowRankMatrix is larger than HMatrix"); // LCOV_EXCL_LINE + } + + std::vector *> leaves; + preorder_tree_traversal(hmatrix, [&leaves](HMatrix ¤t_node) { + if (!current_node.is_hierarchical()) { + leaves.push_back(¤t_node); + } + }); + for (auto &leaf : leaves) { + auto &target_cluster = leaf->get_target_cluster(); + auto &source_cluster = leaf->get_source_cluster(); + + if (leaf->is_dense()) { + const Matrix &U = lrmat.get_U(); + const Matrix &V = lrmat.get_V(); + int row_offset = target_cluster.get_offset() - target_cluster_lrmat.get_offset(); + int col_offset = source_cluster.get_offset() - source_cluster_lrmat.get_offset(); + + const CoefficientPrecision *restricted_V_ptr = V.data() + col_offset * V.nb_rows(); + Matrix restricted_U(target_cluster.get_size(), lrmat.rank_of()); + for (int i = 0; i < lrmat.rank_of(); i++) { + std::copy_n(U.data() + U.nb_rows() * i + row_offset, target_cluster.get_size(), restricted_U.data() + restricted_U.nb_rows() * i); + } + + char transa = 'N'; + char transb = 'N'; + int M = target_cluster.get_size(); + int N = source_cluster.get_size(); + int K = lrmat.rank_of(); + int lda = M; + int ldb = K; + int ldc = M; + CoefficientPrecision alpha = 1; + CoefficientPrecision beta = 1; + Blas::gemm(&transa, &transb, &M, &N, &K, &alpha, restricted_U.data(), &lda, restricted_V_ptr, &ldb, &beta, leaf->get_dense_data()->data(), &ldc); + } else { // leaf is low rank matrix + add_lrmat_lrmat(lrmat, target_cluster_lrmat, source_cluster_lrmat, *leaf->get_low_rank_data(), target_cluster, source_cluster); + } + } +} + } // namespace htool #endif diff --git a/include/htool/hmatrix/linalg/add_lrmat_hmatrix_product.hpp b/include/htool/hmatrix/linalg/add_lrmat_hmatrix_product.hpp index eaccdf20..ca1841e4 100644 --- a/include/htool/hmatrix/linalg/add_lrmat_hmatrix_product.hpp +++ b/include/htool/hmatrix/linalg/add_lrmat_hmatrix_product.hpp @@ -1,21 +1,21 @@ #ifndef HTOOL_HMATRIX_LINALG_ADD_LOW_RANK_MATRIX_HMATRIX_PRODUCT_HPP #define HTOOL_HMATRIX_LINALG_ADD_LOW_RANK_MATRIX_HMATRIX_PRODUCT_HPP -#include "../../matrix/linalg/scale.hpp" // for scale -#include "../../matrix/linalg/transpose.hpp" // for transpose -#include "../../matrix/matrix.hpp" // for Matrix -#include "../../misc/misc.hpp" // for conj_if_complex -#include "../hmatrix.hpp" // for HMatrix -#include "../lrmat/lrmat.hpp" // for LowRankMatrix -#include "../lrmat/utils/recompression.hpp" // for recompression -#include "add_lrmat_hmatrix.hpp" // for add_lrmat_hma... -#include "add_matrix_hmatrix_product.hpp" // for add_matrix_hm... -#include // for copy_n +#include "../../matrix/linalg/scale.hpp" // for scale +#include "../../matrix/linalg/transpose.hpp" // for transpose +#include "../../matrix/matrix.hpp" // for Matrix +#include "../../misc/misc.hpp" // for conj_if_complex +#include "../hmatrix.hpp" // for HMatrix +#include "../lrmat/lrmat.hpp" // for LowRankMatrix +#include "../lrmat/utils/SVD_recompression.hpp" // for recompression +#include "add_lrmat_hmatrix.hpp" // for add_lrmat_hma... +#include "add_matrix_hmatrix_product.hpp" // for add_matrix_hm... +#include // for copy_n namespace htool { -template -void internal_add_lrmat_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const LowRankMatrix &A, const HMatrix &B, CoefficientPrecision beta, Matrix &C) { +template > +void internal_add_lrmat_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const LowRankMatrix &A, const HMatrix &B, CoefficientPrecision beta, Matrix &C) { auto rank = A.rank_of(); if (rank != 0) { auto &U = A.get_U(); @@ -32,8 +32,8 @@ void internal_add_lrmat_hmatrix_product(char transa, char transb, CoefficientPre } } -template -void internal_add_lrmat_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const LowRankMatrix &A, const HMatrix &B, CoefficientPrecision beta, LowRankMatrix &C) { +template > +void internal_add_lrmat_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const LowRankMatrix &A, const HMatrix &B, CoefficientPrecision beta, LowRankMatrix &C) { auto rank = A.rank_of(); if (rank != 0) { auto &U_A = A.get_U(); @@ -98,18 +98,18 @@ void internal_add_lrmat_hmatrix_product(char transa, char transb, CoefficientPre } C.get_U() = new_U; C.get_V() = new_V; - recompression(C); + SVD_recompression(C); } } } -template -void internal_add_lrmat_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const LowRankMatrix &A, const HMatrix &B, CoefficientPrecision beta, HMatrix &C) { +template > +void internal_add_lrmat_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const LowRankMatrix &A, const HMatrix &B, CoefficientPrecision beta, HMatrix &C) { if (beta != CoefficientPrecision(1)) { scale(beta, C); } - LowRankMatrix lrmat(A.get_epsilon()); + LowRankMatrix lrmat(A.nb_rows(), B.nb_cols(), A.get_epsilon()); internal_add_lrmat_hmatrix_product(transa, transb, alpha, A, B, CoefficientPrecision(1), lrmat); internal_add_lrmat_hmatrix(lrmat, C); } diff --git a/include/htool/hmatrix/linalg/add_matrix_hmatrix_product.hpp b/include/htool/hmatrix/linalg/add_matrix_hmatrix_product.hpp index b959315b..a03d97f5 100644 --- a/include/htool/hmatrix/linalg/add_matrix_hmatrix_product.hpp +++ b/include/htool/hmatrix/linalg/add_matrix_hmatrix_product.hpp @@ -9,14 +9,14 @@ #include "../../wrappers/wrapper_blas.hpp" // for Blas #include "../hmatrix.hpp" // for HMatrix #include "../lrmat/lrmat.hpp" // for LowRankMatrix -#include "../lrmat/utils/recompression.hpp" // for recompression +#include "../lrmat/utils/SVD_recompression.hpp" // for recompression #include "add_hmatrix_matrix_product_row_major.hpp" // for sequential_ad... #include // for copy_n, min #include // for vector namespace htool { -template +template > void internal_add_matrix_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const Matrix &A, const HMatrix &B, CoefficientPrecision beta, Matrix &C) { char new_transa = transb == 'N' ? 'T' : 'N'; @@ -50,8 +50,8 @@ void internal_add_matrix_hmatrix_product(char transa, char transb, CoefficientPr } } -template -void internal_add_matrix_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const Matrix &A, const HMatrix &B, CoefficientPrecision beta, LowRankMatrix &C) { +template > +void internal_add_matrix_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const Matrix &A, const HMatrix &B, CoefficientPrecision beta, LowRankMatrix &C) { bool C_is_overwritten = (beta == CoefficientPrecision(0) || C.rank_of() == 0); int nb_rows = (transa == 'N') ? A.nb_rows() : A.nb_cols(); @@ -125,7 +125,7 @@ void internal_add_matrix_hmatrix_product(char transa, char transb, CoefficientPr // Set C C.get_U() = new_U; C.get_V() = new_V; - recompression(C); + SVD_recompression(C); } } // namespace htool diff --git a/include/htool/hmatrix/linalg/execution_policies.hpp b/include/htool/hmatrix/linalg/execution_policies.hpp new file mode 100644 index 00000000..418f97ee --- /dev/null +++ b/include/htool/hmatrix/linalg/execution_policies.hpp @@ -0,0 +1,18 @@ +#ifndef HTOOL_HMATRIX_LINALG_EXECUTION_POLICIES_HPP +#define HTOOL_HMATRIX_LINALG_EXECUTION_POLICIES_HPP + +#if __has_include() && __cplusplus >= 201703L +# include +#endif + +// namespace htool { + +// namespace execution { +// class openmp_task_policy {}; + +// inline constexpr openmp_task_policy omp_task; +// } // namespace execution + +// } // namespace htool + +#endif diff --git a/include/htool/hmatrix/linalg/factorization.hpp b/include/htool/hmatrix/linalg/factorization.hpp index 61247163..775a0d96 100644 --- a/include/htool/hmatrix/linalg/factorization.hpp +++ b/include/htool/hmatrix/linalg/factorization.hpp @@ -19,6 +19,9 @@ void lu_factorization(HMatrix &hmatri if (!hmatrix.is_block_tree_consistent()) { htool::Logger::get_instance().log(LogLevel::ERROR, "lu_factorization is only implemented for consistent block tree."); // LCOV_EXCL_LINE } + if (hmatrix.get_UPLO() != 'N') { + htool::Logger::get_instance().log(LogLevel::ERROR, "lu_factorization cannot be used on a HMatrix with UPLO=" + std::string(1, hmatrix.get_UPLO()) + "!=N. You should use another factorization."); // LCOV_EXCL_LINE + } if (hmatrix.is_hierarchical()) { @@ -90,6 +93,10 @@ void cholesky_factorization(char UPLO, HMatrix()) + and (hmatrix.get_UPLO() != 'H' and is_complex())) { + htool::Logger::get_instance().log(LogLevel::ERROR, "cholesky_factorization cannot be used on a HMatrix with UPLO=" + std::string(1, hmatrix.get_UPLO()) + "!=N. You should use another factorization."); // LCOV_EXCL_LINE + } if (hmatrix.is_hierarchical()) { diff --git a/include/htool/hmatrix/linalg/scale.hpp b/include/htool/hmatrix/linalg/scale.hpp index 637f4121..580c0d8f 100644 --- a/include/htool/hmatrix/linalg/scale.hpp +++ b/include/htool/hmatrix/linalg/scale.hpp @@ -8,7 +8,7 @@ namespace htool { -template +template > void openmp_scale(CoefficientPrecision da, HMatrix &A) { std::vector *> leaves; preorder_tree_traversal(A, [&leaves](HMatrix ¤t_node) { @@ -34,7 +34,7 @@ void openmp_scale(CoefficientPrecision da, HMatrix +template > void sequential_scale(CoefficientPrecision da, HMatrix &A) { std::vector *> leaves; preorder_tree_traversal(A, [&leaves](HMatrix ¤t_node) { @@ -51,9 +51,9 @@ void sequential_scale(CoefficientPrecision da, HMatrix +template > void scale(CoefficientPrecision da, HMatrix &A) { - openmp_scale(da, A); + sequential_scale(da, A); } } // namespace htool diff --git a/include/htool/hmatrix/linalg/task_based_add_hmatrix_hmatrix_product.hpp b/include/htool/hmatrix/linalg/task_based_add_hmatrix_hmatrix_product.hpp new file mode 100644 index 00000000..56159f3e --- /dev/null +++ b/include/htool/hmatrix/linalg/task_based_add_hmatrix_hmatrix_product.hpp @@ -0,0 +1,200 @@ +#ifndef HTOOL_TASK_BASED_HMATRIX_LINALG_ADD_HMATRIX_HMATRIX_PRODUCT_HPP +#define HTOOL_TASK_BASED_HMATRIX_LINALG_ADD_HMATRIX_HMATRIX_PRODUCT_HPP + +// #include "../../matrix/linalg/add_matrix_vector_product.hpp" // for add_mat... +// #include "../../misc/misc.hpp" // for underly... +#include "../../wrappers/wrapper_blas.hpp" // for Blas +#include "../hmatrix.hpp" // for HMatrix +#include "add_lrmat_hmatrix.hpp" // for internal_add_lrmat_hmatrix +// #include "../lrmat/linalg/add_lrmat_vector_product.hpp" // for add_lrm... +#include // for transform, max +// #include // for complex +#include // for enumerate_dependence, find_l0... +#include +#include +#include // for vector + +// to remove warning from depend(iterator(it = 0 : read_deps_size), in : *read_deps[it]) +#if defined(__clang__) +#elif defined(__GNUC__) || defined(__GNUG__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuseless-cast" +#endif + +namespace htool { + +/** + * \brief Task-based internal addition of HMatrix-HMatrix product. + * + * This function computes the product of two HMatrix objects, `A` and `B` + * scaled by `alpha`, and adds the result to a third HMatrix object, `C`, + * scaled by `beta`. The computation is performed in a task-based manner using + * OpenMP tasks to parallelize the operations over the hierarchical structure + * of the matrices. + * + * \param transa A character indicating the transpose operation on matrix `A`. + * \param transb A character indicating the transpose operation on matrix `B`. + * \param alpha A scalar coefficient to scale the product of `A` and `B`. + * \param A The first input HMatrix. + * \param B The second input HMatrix. + * \param beta A scalar coefficient to scale the matrix `C`. + * \param C The output HMatrix to which the result is added. + * \param L0 A vector of pointers to nodes of the output HMatrix `C`. + * \param L0_A A vector of pointers to nodes of the input HMatrix `A`. + * \param L0_B A vector of pointers to nodes of the input HMatrix `B`. + * + * The function handles different types of matrix structures (dense, low-rank, + * hierarchical) and performs recursive calls if necessary to handle non-leaf + * nodes. It uses dependency tracking to ensure correct task execution order. + */ + +template +void task_based_internal_add_hmatrix_hmatrix_product(char transa, char transb, CoefficientPrecision alpha, const HMatrix &A, const HMatrix &B, CoefficientPrecision beta, HMatrix &C, std::vector *> &L0_A, std::vector *> &L0_B, std::vector *> &L0) { + + // int max_prio = std::max(0, omp_get_max_task_priority()); + + // Scale the output vector `out` with `beta` if `beta` is not equal to 1 + if (CoefficientPrecision(beta) != CoefficientPrecision(1)) { + for (auto &L0C_node : L0) { + +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(L0C_node, beta) \ + depend(inout : *L0C_node) +#endif + { + sequential_scale(beta, *L0C_node); + } + } + } + + // check if C is in L0. + bool is_C_in_L0 = false; + for (auto &L0C_node : L0) { + if (L0C_node->get_target_cluster() == C.get_target_cluster() && L0C_node->get_source_cluster() == C.get_source_cluster()) { + is_C_in_L0 = true; + break; + } + } + + // Fill the clusters for the output, middle, and input clusters + bool block_tree_not_consistent = (A.get_target_cluster().get_rank() < 0 || A.get_source_cluster().get_rank() < 0 || B.get_target_cluster().get_rank() < 0 || B.get_source_cluster().get_rank() < 0); + + std::vector *> output_clusters, middle_clusters, input_clusters; + const Cluster &output_cluster = transa == 'N' ? A.get_target_cluster() : A.get_source_cluster(); + const Cluster &middle_cluster = transa == 'N' ? A.get_source_cluster() : A.get_target_cluster(); + const Cluster &input_cluster = transb == 'N' ? B.get_source_cluster() : B.get_target_cluster(); + + fill_clusters(output_cluster, block_tree_not_consistent, output_clusters); + fill_clusters(input_cluster, block_tree_not_consistent, input_clusters); + fill_clusters(middle_cluster, block_tree_not_consistent, middle_clusters); + + // C += alpha A * B + if (!is_C_in_L0 && !A.is_leaf() && !B.is_leaf() && !C.is_leaf()) { // recursive call + // std::cout << "Recursive call" << std::endl; + for (auto &output_cluster_child : output_clusters) { + for (auto &input_cluster_child : input_clusters) { + for (auto &middle_cluster_child : middle_clusters) { + auto &&A_child = (transa == 'N') ? A.get_child_or_this(*output_cluster_child, *middle_cluster_child) : A.get_child_or_this(*middle_cluster_child, *output_cluster_child); + auto &&B_child = (transb == 'N') ? B.get_child_or_this(*middle_cluster_child, *input_cluster_child) : B.get_child_or_this(*input_cluster_child, *middle_cluster_child); + auto &&C_child = C.get_child_or_this(*output_cluster_child, *input_cluster_child); + if (A_child != nullptr && B_child != nullptr && C_child != nullptr) { + task_based_internal_add_hmatrix_hmatrix_product(transa, transb, alpha, *A_child, *B_child, CoefficientPrecision(1), *C.get_child_or_this(*output_cluster_child, *input_cluster_child), L0_A, L0_B, L0); + } + } + } + } + } else { // if (is_C_in_L0 || A.is_leaf() || B.is_leaf() ||C.is_leaf()) + std::vector *> read_deps, temp; + read_deps = enumerate_dependences(A, L0_A); + temp = enumerate_dependences(B, L0_B); + read_deps.insert(read_deps.end(), temp.begin(), temp.end()); // concatenating the two vectors + auto read_deps_size = read_deps.size(); + + if (is_C_in_L0) { +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(transa, transb, alpha, read_deps_size) \ + shared(A, B, C, read_deps) \ + depend(iterator(it = 0 : read_deps_size), in : *read_deps[it]) \ + depend(inout : C) +#endif + { + internal_add_hmatrix_hmatrix_product(transa, transb, alpha, A, B, CoefficientPrecision(1), C); + } + + } else { + auto AB = std::make_shared>(A.nb_rows(), B.nb_cols(), C.get_epsilon()); + +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(transa, transb, alpha, AB, read_deps_size) \ + shared(A, B, read_deps) \ + depend(inout : *AB) \ + depend(iterator(it = 0 : read_deps_size), in : *read_deps[it]) +#endif + { + if (A.is_low_rank() || B.is_low_rank()) { + if (A.is_dense() and B.is_low_rank()) { + add_matrix_lrmat_product(transa, transb, alpha, *A.get_dense_data(), *B.get_low_rank_data(), CoefficientPrecision(1), *AB); + } else if (A.is_hierarchical() and B.is_low_rank()) { + internal_add_hmatrix_lrmat_product(transa, transb, alpha, A, *B.get_low_rank_data(), CoefficientPrecision(1), *AB); + } else if (A.is_low_rank() and B.is_low_rank()) { + add_lrmat_lrmat_product(transa, transb, alpha, *A.get_low_rank_data(), *B.get_low_rank_data(), CoefficientPrecision(1), *AB); + } else if (A.is_low_rank() and B.is_dense()) { + add_lrmat_matrix_product(transa, transb, alpha, *A.get_low_rank_data(), *B.get_dense_data(), CoefficientPrecision(1), *AB); + } else if (A.is_low_rank() and B.is_hierarchical()) { + internal_add_lrmat_hmatrix_product(transa, transb, alpha, *A.get_low_rank_data(), B, CoefficientPrecision(1), *AB); + } + } else { + if (A.is_dense() and B.is_dense()) { + add_matrix_matrix_product(transa, transb, alpha, *A.get_dense_data(), *B.get_dense_data(), CoefficientPrecision(1), *AB); + } else if (A.is_dense() and B.is_hierarchical()) { + internal_add_matrix_hmatrix_product(transa, transb, alpha, *A.get_dense_data(), B, CoefficientPrecision(1), *AB); + } else if (A.is_hierarchical() and B.is_dense()) { + internal_add_hmatrix_matrix_product(transa, transb, alpha, A, *B.get_dense_data(), CoefficientPrecision(1), *AB); + } + } + } + // Use a ptr to keep output_cluster and input_cluster alive as long as any task is running + const Cluster *output_cluster_ptr = &output_cluster; + const Cluster *input_cluster_ptr = &input_cluster; + auto write_deps = enumerate_dependences(C, L0); + + for (auto &C_node : write_deps) { +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(AB, C_node, output_cluster_ptr, input_cluster_ptr, write_deps) \ + depend(inout : *C_node) \ + depend(in : *AB) +#endif + { + internal_add_lrmat_hmatrix(*AB, *output_cluster_ptr, *input_cluster_ptr, *C_node); + } + } + } // end of else (is_C_in_L0) + } // end of if (!is_C_in_L0 && A.is_leaf() && B.is_leaf() && C.is_leaf()) +} // end of task_based_internal_add_hmatrix_hmatrix_product + +template +void fill_clusters(const Cluster &cluster, bool block_tree_not_consistent, std::vector *> &clusters) { + if (cluster.is_leaf() || (block_tree_not_consistent && cluster.get_rank() >= 0)) { + clusters.push_back(&cluster); + } else if (block_tree_not_consistent) { + for (auto &child : cluster.get_clusters_on_partition()) { + clusters.push_back(child); + } + } else { + for (auto &child : cluster.get_children()) { + clusters.push_back(child.get()); + } + } +} + +} // namespace htool +#if defined(__clang__) +#elif defined(__GNUC__) || defined(__GNUG__) +# pragma GCC diagnostic pop +#endif + +#endif diff --git a/include/htool/hmatrix/linalg/task_based_add_hmatrix_vector_product.hpp b/include/htool/hmatrix/linalg/task_based_add_hmatrix_vector_product.hpp new file mode 100644 index 00000000..f17fbe7a --- /dev/null +++ b/include/htool/hmatrix/linalg/task_based_add_hmatrix_vector_product.hpp @@ -0,0 +1,117 @@ +#ifndef HTOOL_TASK_BASED_HMATRIX_LINALG_ADD_HMATRIX_VECTOR_PRODUCT_HPP +#define HTOOL_TASK_BASED_HMATRIX_LINALG_ADD_HMATRIX_VECTOR_PRODUCT_HPP + +// #include "../../matrix/linalg/add_matrix_vector_product.hpp" // for add_mat... +// #include "../../misc/misc.hpp" // for underly... +#include "../../wrappers/wrapper_blas.hpp" // for Blas +#include "../hmatrix.hpp" // for HMatrix +// #include "../lrmat/linalg/add_lrmat_vector_product.hpp" // for add_lrm... +#include // for transform, max +// #include // for complex +#include // for enumerate_dependence, find_l0... +#include +#include // for vector + +#include + +// to remove warning from depend(iterator(it = 0 : read_deps_size), in : *read_deps[it]) +#if defined(__clang__) +#elif defined(__GNUC__) || defined(__GNUG__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuseless-cast" +#endif + +namespace htool { + +/** + * @brief Task-based internal add hmatrix vector product, i.e., C = beta*C + alpha*A*x, where A is a HMatrix, x is a vector, and C is a vector. + * + * @details + * The function returns the result in the vector C. + * The function takes as input the HMatrix A, the vector x, the scalar alpha, and the scalar beta. + * The function also takes as input the vector L0, which is a vector of pointers to nodes of the tree of A, and the vectors in_L0 and out_L0, which are vectors of pointers to the cluster nodes of the input and output vectors x and C, respectively. + * The function also takes as input a pointer to a mutex, which is used to synchronize the output of the tasks. + * If the mutex is nullptr, the function does not use any synchronization. + */ +template +void task_based_internal_add_hmatrix_vector_product(char trans, CoefficientPrecision alpha, const HMatrix &A, const CoefficientPrecision *in, CoefficientPrecision beta, CoefficientPrecision *out, const std::vector *> &L0, const std::vector *> &in_L0, const std::vector *> &out_L0, std::mutex *cout_mutex = nullptr) { + + auto get_output_cluster{&HMatrix::get_target_cluster}; + auto get_input_cluster{&HMatrix::get_source_cluster}; + char sym = A.get_symmetry(); + char trans_sym = (A.get_symmetry_for_leaves() == 'S') ? 'T' : 'C'; + if (trans != 'N') { + get_input_cluster = &HMatrix::get_target_cluster; + get_output_cluster = &HMatrix::get_source_cluster; + trans_sym = 'N'; + } + + int local_input_offset = A.get_source_cluster().get_offset(); + int local_output_offset = A.get_target_cluster().get_offset(); + // int local_output_size = A.get_target_cluster().get_size(); + + // int max_prio = std::max(0, omp_get_max_task_priority()); + + // Scale the output vector `out` with `beta` if `beta` is not equal to 1 + if (CoefficientPrecision(beta) != CoefficientPrecision(1)) { + for (auto &out_L0_node : out_L0) { + int out_L0_node_offset = out_L0_node->get_offset(); + int out_L0_node_size = out_L0_node->get_size(); + +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(out_L0_node_offset, out_L0_node_size, out, beta, cout_mutex) \ + shared(out_L0_node, std::cout) \ + depend(inout : *out_L0_node) + // priority(max_prio - 1) +#endif // 'out' must be copied by the thread else stack-use-after-return error + { + if (cout_mutex != nullptr) { + std::lock_guard lock(*cout_mutex); + std::cout << "Scaling : out[" << out_L0_node_offset << " : " << out_L0_node_offset + out_L0_node_size << "], thread " << omp_get_thread_num() << "\n"; + } + int incx(1); + Blas::scal(&out_L0_node_size, &beta, out + out_L0_node_offset, &incx); + } + } + } + + for (auto &L0_node : L0) { + int input_offset = (*L0_node.*get_input_cluster)().get_offset(); + int output_offset = (*L0_node.*get_output_cluster)().get_offset(); + + std::vector *> write_cluster, read_cluster; + write_cluster = enumerate_dependences((*L0_node.*get_output_cluster)(), out_L0); + read_cluster = enumerate_dependences((*L0_node.*get_input_cluster)(), in_L0); + + // concatenate write_cluster and read_cluster + write_cluster.insert(write_cluster.end(), read_cluster.begin(), read_cluster.end()); + + auto write_deps_size = write_cluster.size(); + // auto read_deps_cluster_size = read_cluster.size(); + +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(sym, trans_sym, alpha, in, trans, out, write_deps_size, cout_mutex, input_offset, output_offset, local_input_offset, local_output_offset, L0_node) \ + shared(std::cout, write_cluster, read_cluster) \ + depend(iterator(it = 0 : write_deps_size), inout : *write_cluster[it]) \ + depend(in : *L0_node) + // priority(max_prio) + // depend(iterator(it = 0 : read_deps_cluster_size), in : *read_cluster[it]) +#endif // L0_node must be copied by the thread else heap-use-after-free error + { + sequential_internal_add_hmatrix_vector_product(trans, alpha, *L0_node, in + input_offset - local_input_offset, CoefficientPrecision(1), out + output_offset - local_output_offset); + if (sym != 'N' && input_offset != output_offset) { + sequential_internal_add_hmatrix_vector_product(trans_sym, alpha, *L0_node, in + output_offset - local_input_offset, CoefficientPrecision(1), out + input_offset - local_output_offset); + } + } + } +} // end of task_based_internal_add_hmatrix_vector_product + +} // namespace htool + +#if defined(__clang__) +#elif defined(__GNUC__) || defined(__GNUG__) +# pragma GCC diagnostic pop +#endif +#endif diff --git a/include/htool/hmatrix/linalg/task_based_factorization.hpp b/include/htool/hmatrix/linalg/task_based_factorization.hpp new file mode 100644 index 00000000..b31968f6 --- /dev/null +++ b/include/htool/hmatrix/linalg/task_based_factorization.hpp @@ -0,0 +1,196 @@ +#ifndef HTOOL_TASK_BASED_HMATRIX_LINALG_FACTORIZATION_HPP +#define HTOOL_TASK_BASED_HMATRIX_LINALG_FACTORIZATION_HPP + +#include "../../clustering/cluster_node.hpp" // for Cluster +#include "../../matrix/matrix.hpp" +#include "../../misc/logger.hpp" // for Logger +#include "../../misc/misc.hpp" // for is_c... +#include "../hmatrix.hpp" // for HMatrix +// #include "../linalg/triangular_hmatrix_hmatrix_solve.hpp" // for tria... +// #include "../linalg/triangular_hmatrix_matrix_solve.hpp" // for tria... +// #include "htool/hmatrix/linalg/add_hmatrix_hmatrix_product.hpp" // for add_... +#include "task_based_add_hmatrix_hmatrix_product.hpp" // for add_hmatrix_hmatrix_product +#include "task_based_triangular_hmatrix_hmatrix_solve.hpp" // for task_based_internal_triangular_hmatrix_hmatrix_solve +#include // for basi... +#include // for vector + +namespace htool { + +/** + * @brief Task-based LU factorization of a hierarchical matrix. + * + * This function performs a task-based LU factorization of a hierarchical matrix. The matrix is + * factorized as a product of a lower triangular matrix L and an upper triangular matrix U, i.e. + * A = LU. The block structure of the matrix is used to split the computation into tasks that are + * independent and can be executed concurrently. + * + * @param hmatrix The hierarchical matrix to be factorized. + * @param L0 A vector of pointers to nodes of the output 'hmatrix'. + * + * @see lu_factorization + */ +template > +void task_based_lu_factorization(HMatrix &hmatrix, std::vector *> &L0) { + if (!hmatrix.is_block_tree_consistent()) { + htool::Logger::get_instance().log(LogLevel::ERROR, "task_based_lu_factorization is only implemented for consistent block tree."); // LCOV_EXCL_LINE + } + + if (hmatrix.is_hierarchical()) { + // check if hmatrix is in L0. + bool is_hmatrix_in_L0 = false; + for (auto &L0_node : L0) { + if (L0_node->get_target_cluster() == hmatrix.get_target_cluster() && L0_node->get_source_cluster() == hmatrix.get_source_cluster()) { + is_hmatrix_in_L0 = true; + break; + } + } + + if (is_hmatrix_in_L0) { +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + shared(hmatrix) \ + depend(inout : hmatrix) + +#endif + { + lu_factorization(hmatrix); + } + } else { + + bool block_tree_not_consistent = (hmatrix.get_target_cluster().get_rank() < 0 || hmatrix.get_source_cluster().get_rank() < 0); + std::vector *> clusters; + const Cluster &cluster = hmatrix.get_target_cluster(); + fill_clusters(cluster, block_tree_not_consistent, clusters); + + for (auto &cluster_child : clusters) { + HMatrix *pivot = hmatrix.get_sub_hmatrix(*cluster_child, *cluster_child); + // Compute pivot block + task_based_lu_factorization(*pivot, L0); + + // Apply pivot block to row and column + for (auto &other_cluster_child : clusters) { + if (other_cluster_child->get_offset() > cluster_child->get_offset()) { + HMatrix *U = hmatrix.get_sub_hmatrix(*cluster_child, *other_cluster_child); + HMatrix *L = hmatrix.get_sub_hmatrix(*other_cluster_child, *cluster_child); + + task_based_internal_triangular_hmatrix_hmatrix_solve('L', 'L', 'N', 'U', CoefficientPrecision(1), *pivot, *U, L0, L0); + task_based_internal_triangular_hmatrix_hmatrix_solve('R', 'U', 'N', 'N', CoefficientPrecision(1), *pivot, *L, L0, L0); + } + } + + // Update Schur complement + for (auto &output_cluster_child : clusters) { + for (auto &input_cluster_child : clusters) { + if (output_cluster_child->get_offset() > cluster_child->get_offset() && input_cluster_child->get_offset() > cluster_child->get_offset()) { + HMatrix *A_child = hmatrix.get_sub_hmatrix(*output_cluster_child, *input_cluster_child); + const HMatrix *U = hmatrix.get_sub_hmatrix(*cluster_child, *input_cluster_child); + const HMatrix *L = hmatrix.get_sub_hmatrix(*output_cluster_child, *cluster_child); + + task_based_internal_add_hmatrix_hmatrix_product('N', 'N', CoefficientPrecision(-1), *L, *U, CoefficientPrecision(1), *A_child, L0, L0, L0); + } + } + } + } + } + } else if (hmatrix.is_dense()) { + lu_factorization(*hmatrix.get_dense_data()); + } else { + htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not implemented for task_based_lu_factorization (hmatrix is low-rank)"); // LCOV_EXCL_LINE + } +} // end of task_based_lu_factorization + +/** + * @brief Task-based Cholesky factorization of a hierarchical matrix. + * + * @param UPLO specifies whether the upper or lower triangular part of the matrix is used. + * @param hmatrix the hierarchical matrix to be factorized. + * @param L0 A vector of pointers to nodes of the output 'hmatrix'. + * + * This function performs a Cholesky factorization of a hierarchical matrix. The factorization is + * done in a task-based way, i.e. the factorization of the sub-matrices is done in parallel using + * OpenMP tasks. The function is only implemented for consistent block trees. + * + */ +template > +void task_based_cholesky_factorization(char UPLO, HMatrix &hmatrix, std::vector *> &L0) { + if (!hmatrix.is_block_tree_consistent()) { + htool::Logger::get_instance().log(LogLevel::ERROR, "task_based_cholesky_factorization is only implemented for consistent block tree."); // LCOV_EXCL_LINE + } + if ((hmatrix.get_UPLO() != 'S' and !is_complex()) + and (hmatrix.get_UPLO() != 'H' and is_complex())) { + htool::Logger::get_instance().log(LogLevel::ERROR, "task_based_cholesky_factorization cannot be used on a HMatrix with UPLO=" + std::string(1, hmatrix.get_UPLO()) + "!=N. You should use another factorization."); // LCOV_EXCL_LINE + } + + if (hmatrix.is_hierarchical()) { + // check if hmatrix is in L0. + bool is_hmatrix_in_L0 = false; + for (auto &L0_node : L0) { + if (L0_node->get_target_cluster() == hmatrix.get_target_cluster() && L0_node->get_source_cluster() == hmatrix.get_source_cluster()) { + is_hmatrix_in_L0 = true; + break; + } + } + + if (is_hmatrix_in_L0) { +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(UPLO) \ + shared(hmatrix) \ + depend(inout : hmatrix) +#endif + { + cholesky_factorization(UPLO, hmatrix); + } + } else { + + bool block_tree_not_consistent = (hmatrix.get_target_cluster().get_rank() < 0 || hmatrix.get_source_cluster().get_rank() < 0); + std::vector *> clusters; + const Cluster &cluster = hmatrix.get_target_cluster(); + fill_clusters(cluster, block_tree_not_consistent, clusters); + + for (auto &cluster_child : clusters) { + HMatrix *pivot = hmatrix.get_sub_hmatrix(*cluster_child, *cluster_child); + // Compute pivot block + task_based_cholesky_factorization(UPLO, *pivot, L0); + + // Apply pivot block to row and column + for (auto &other_cluster_child : clusters) { + if (other_cluster_child->get_offset() > cluster_child->get_offset()) { + if (UPLO == 'L') { + HMatrix *L = hmatrix.get_sub_hmatrix(*other_cluster_child, *cluster_child); + task_based_internal_triangular_hmatrix_hmatrix_solve('R', UPLO, is_complex() ? 'C' : 'T', 'N', CoefficientPrecision(1), *pivot, *L, L0, L0); + + } else { + HMatrix *U = hmatrix.get_sub_hmatrix(*cluster_child, *other_cluster_child); + task_based_internal_triangular_hmatrix_hmatrix_solve('L', UPLO, is_complex() ? 'C' : 'T', 'N', CoefficientPrecision(1), *pivot, *U, L0, L0); + } + } + } + + // Update Schur complement + for (auto &output_cluster_child : clusters) { + for (auto &input_cluster_child : clusters) { + if (UPLO == 'L' && output_cluster_child->get_offset() > cluster_child->get_offset() && input_cluster_child->get_offset() > cluster_child->get_offset() && output_cluster_child->get_offset() >= input_cluster_child->get_offset()) { + HMatrix *A_child = hmatrix.get_sub_hmatrix(*output_cluster_child, *input_cluster_child); + const HMatrix *L = hmatrix.get_sub_hmatrix(*output_cluster_child, *cluster_child); + task_based_internal_add_hmatrix_hmatrix_product('N', is_complex() ? 'C' : 'T', CoefficientPrecision(-1), *L, *L, CoefficientPrecision(1), *A_child, L0, L0, L0); + + } else if (UPLO == 'U' && output_cluster_child->get_offset() > cluster_child->get_offset() && input_cluster_child->get_offset() > cluster_child->get_offset() && input_cluster_child->get_offset() >= output_cluster_child->get_offset()) { + HMatrix *A_child = hmatrix.get_sub_hmatrix(*output_cluster_child, *input_cluster_child); + const HMatrix *U = hmatrix.get_sub_hmatrix(*cluster_child, *input_cluster_child); + task_based_internal_add_hmatrix_hmatrix_product(is_complex() ? 'C' : 'T', 'N', CoefficientPrecision(-1), *U, *U, CoefficientPrecision(1), *A_child, L0, L0, L0); + } + } + } + } + } + } else if (hmatrix.is_dense()) { + cholesky_factorization(UPLO, *hmatrix.get_dense_data()); + } else { + htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not implemented for task_based_cholesky_factorization (hmatrix is low-rank)"); // LCOV_EXCL_LINE + } +} // end of task_based_cholesky_factorization + +} // namespace htool + +#endif diff --git a/include/htool/hmatrix/linalg/task_based_triangular_hmatrix_hmatrix_solve.hpp b/include/htool/hmatrix/linalg/task_based_triangular_hmatrix_hmatrix_solve.hpp new file mode 100644 index 00000000..7b60f72f --- /dev/null +++ b/include/htool/hmatrix/linalg/task_based_triangular_hmatrix_hmatrix_solve.hpp @@ -0,0 +1,209 @@ +#ifndef HTOOL_TASK_BASED_HMATRIX_LINALG_TRIANGULAR_HMATRIX_HMATRIX_SOLVE_HPP +#define HTOOL_TASK_BASED_HMATRIX_LINALG_TRIANGULAR_HMATRIX_HMATRIX_SOLVE_HPP + +#include "../../clustering/cluster_node.hpp" +#include "../../matrix/linalg/factorization.hpp" +#include "../../matrix/matrix.hpp" +#include "../../misc/logger.hpp" +#include "../../misc/misc.hpp" +#include "../hmatrix.hpp" +#include "task_based_add_hmatrix_hmatrix_product.hpp" +#include "triangular_hmatrix_lrmat_solve.hpp" +#include "triangular_hmatrix_matrix_solve.hpp" +#include +#include +#include + +// to remove warning from depend(iterator(it = 0 : read_deps_size), in : *read_deps[it]) +#if defined(__clang__) +#elif defined(__GNUC__) || defined(__GNUG__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wuseless-cast" +#endif + +namespace htool { +/** + * @brief Solves a triangular linear system of equation of the form \f$Ax = B\f$ where \f$A\f$ is a triangular HMatrix and \f$B\f$ is an HMatrix. The solve is done in a task-based manner + * and uses task based HMatrix-HMatrix product. + * + * @param[in] side Indicates whether \f$A\f$ is on the left or right side of \f$x\f$. + * @param[in] UPLO Indicates whether the upper or lower triangular part of \f$A\f$ is used. + * @param[in] transa Indicates whether the matrix \f$A\f$ is transposed or not. + * @param[in] diag Is passed to internal_triangular_hmatrix_hmatrix_solve function + * @param[in] alpha The scalar \f$\alpha\f$. + * @param[in] A The triangular HMatrix \f$A\f$. + * @param[in,out] B The HMatrix \f$B\f$. + * @param[in] L0_A The L0 of \f$A\f$. + * @param[in] L0_B The L0 of \f$B\f$. + */ +template > +void task_based_internal_triangular_hmatrix_hmatrix_solve(char side, char UPLO, char transa, char diag, CoefficientPrecision alpha, const HMatrix &A, HMatrix &B, std::vector *> &L0_A, std::vector *> &L0_B) { + // if (alpha != CoefficientPrecision(1)) { + // scale(alpha, B); + // } + + // Scale 'B' with `alpha` if `alpha` is not equal to 1 + if (CoefficientPrecision(alpha) != CoefficientPrecision(1)) { + for (auto &L0_node : L0_B) { + +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(L0_node, alpha) \ + depend(inout : *L0_node) +#endif + { + scale(alpha, *L0_node); + } + } + } + + // check if B is in L0_B. + bool is_B_in_L0_B = false; + for (auto &L0_node : L0_B) { + if (L0_node->get_target_cluster() == B.get_target_cluster() && L0_node->get_source_cluster() == B.get_source_cluster()) { + is_B_in_L0_B = true; + break; + } + } + + if (is_B_in_L0_B) { + std::vector *> read_deps = enumerate_dependences(A, L0_A); + int read_deps_size = read_deps.size(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(side, UPLO, transa, alpha, diag) \ + shared(A, B, read_deps) \ + depend(in : A) \ + depend(inout : B) \ + depend(iterator(it = 0 : read_deps_size), in : *read_deps[it]) + +#endif + { + // internal_triangular_hmatrix_hmatrix_solve(side, UPLO, transa, 'N', alpha, A, B); + internal_triangular_hmatrix_hmatrix_solve(side, UPLO, transa, diag, CoefficientPrecision(1), A, B); // alpha == 1 because the scaling is done here + } + } else { + // Fill the clusters for the output, middle, and input clusters + bool block_tree_not_consistent = (A.get_target_cluster().get_rank() < 0 || A.get_source_cluster().get_rank() < 0 || B.get_target_cluster().get_rank() < 0 || B.get_source_cluster().get_rank() < 0); + + std::vector *> output_clusters, middle_clusters, input_clusters; + auto get_output_cluster_A{&HMatrix::get_target_cluster}; + auto get_input_cluster_A{&HMatrix::get_source_cluster}; + + if (transa != 'N') { + get_input_cluster_A = &HMatrix::get_target_cluster; + get_output_cluster_A = &HMatrix::get_source_cluster; + } + const Cluster &output_cluster = side == 'L' ? (A.*get_output_cluster_A)() : B.get_target_cluster(); + const Cluster &middle_cluster = side == 'L' ? (A.*get_input_cluster_A)() : (A.*get_output_cluster_A)(); + const Cluster &input_cluster = side == 'L' ? B.get_source_cluster() : (A.*get_input_cluster_A)(); + + fill_clusters(output_cluster, block_tree_not_consistent, output_clusters); + fill_clusters(input_cluster, block_tree_not_consistent, input_clusters); + fill_clusters(middle_cluster, block_tree_not_consistent, middle_clusters); + + // Forward, compute each block rows one after the other + if ((UPLO == 'L' && transa == 'N' && side == 'L') || (UPLO == 'U' && transa != 'N' && side == 'L')) { + // std::cout << "clusters' size: " << output_clusters.size() << ", " << middle_clusters.size() << ", " << input_clusters.size() << std::endl; + for (auto &output_cluster_child : output_clusters) { + for (auto &input_cluster_child : input_clusters) { + HMatrix *B_child_to_modify = B.get_sub_hmatrix(*output_cluster_child, *input_cluster_child); + + for (auto &middle_cluster_child : middle_clusters) { + const HMatrix *A_child = transa == 'N' ? A.get_sub_hmatrix(*output_cluster_child, *middle_cluster_child) : A.get_sub_hmatrix(*middle_cluster_child, *output_cluster_child); + if (*output_cluster_child == *middle_cluster_child) { + task_based_internal_triangular_hmatrix_hmatrix_solve(side, UPLO, transa, diag, CoefficientPrecision(1), *A_child, *B_child_to_modify, L0_A, L0_B); + } else if (output_cluster_child->get_offset() > middle_cluster_child->get_offset()) { + const HMatrix *B_child = B.get_sub_hmatrix(*middle_cluster_child, *input_cluster_child); + task_based_internal_add_hmatrix_hmatrix_product(transa, 'N', CoefficientPrecision(-1), *A_child, *B_child, CoefficientPrecision(1), *B_child_to_modify, L0_A, L0_B, L0_B); + } + } + } + } + } // Backward, compute each block rows one after the other + else if ((UPLO == 'U' && transa == 'N' && side == 'L') || (UPLO == 'L' && transa != 'N' && side == 'L')) { + for (auto output_it = output_clusters.rbegin(); output_it != output_clusters.rend(); ++output_it) { + auto &output_cluster_child = *output_it; + + for (auto input_it = input_clusters.rbegin(); input_it != input_clusters.rend(); ++input_it) { + auto &input_cluster_child = *input_it; + + HMatrix *B_child_to_modify = B.get_sub_hmatrix(*output_cluster_child, *input_cluster_child); + + for (auto middle_it = middle_clusters.rbegin(); middle_it != middle_clusters.rend(); ++middle_it) { + auto &middle_cluster_child = *middle_it; + + const HMatrix *A_child = transa == 'N' ? A.get_sub_hmatrix(*output_cluster_child, *middle_cluster_child) : A.get_sub_hmatrix(*middle_cluster_child, *output_cluster_child); + if (*output_cluster_child == *middle_cluster_child) { + task_based_internal_triangular_hmatrix_hmatrix_solve(side, UPLO, transa, diag, CoefficientPrecision(1), *A_child, *B_child_to_modify, L0_A, L0_B); + } else if (output_cluster_child->get_offset() < middle_cluster_child->get_offset()) { + + const HMatrix *B_child = B.get_sub_hmatrix(*middle_cluster_child, *input_cluster_child); + + task_based_internal_add_hmatrix_hmatrix_product(transa, 'N', CoefficientPrecision(-1), *A_child, *B_child, CoefficientPrecision(1), *B_child_to_modify, L0_A, L0_B, L0_B); + } + } + } + } + } // Forward, compute each block column one after the other + else if ((UPLO == 'U' && transa == 'N' && side == 'R') || (UPLO == 'L' && transa != 'N' && side == 'R')) { + + for (auto input_it = input_clusters.begin(); input_it != input_clusters.end(); ++input_it) { + auto &input_cluster_child = *input_it; + + for (auto output_it = output_clusters.begin(); output_it != output_clusters.end(); ++output_it) { + auto &output_cluster_child = *output_it; + + HMatrix *B_child_to_modify = B.get_sub_hmatrix(*output_cluster_child, *input_cluster_child); + + for (auto middle_it = middle_clusters.begin(); middle_it != middle_clusters.end(); ++middle_it) { + auto &middle_cluster_child = *middle_it; + + const HMatrix *A_child = transa == 'N' ? A.get_sub_hmatrix(*middle_cluster_child, *input_cluster_child) : A.get_sub_hmatrix(*input_cluster_child, *middle_cluster_child); + if (*input_cluster_child == *middle_cluster_child) { + task_based_internal_triangular_hmatrix_hmatrix_solve(side, UPLO, transa, diag, CoefficientPrecision(1), *A_child, *B_child_to_modify, L0_A, L0_B); + } else if (middle_cluster_child->get_offset() < input_cluster_child->get_offset()) { + + const HMatrix *B_child = B.get_sub_hmatrix(*output_cluster_child, *middle_cluster_child); + + task_based_internal_add_hmatrix_hmatrix_product('N', transa, CoefficientPrecision(-1), *B_child, *A_child, CoefficientPrecision(1), *B_child_to_modify, L0_B, L0_A, L0_B); + } + } + } + } + } // Backward, compute each block column one after the other + else if ((UPLO == 'L' && transa == 'N' && side == 'R') || (UPLO == 'U' && transa != 'N' && side == 'R')) { + + for (auto input_it = input_clusters.rbegin(); input_it != input_clusters.rend(); ++input_it) { + auto &input_cluster_child = *input_it; + + for (auto output_it = output_clusters.rbegin(); output_it != output_clusters.rend(); ++output_it) { + auto &output_cluster_child = *output_it; + + HMatrix *B_child_to_modify = B.get_sub_hmatrix(*output_cluster_child, *input_cluster_child); + + for (auto middle_it = middle_clusters.rbegin(); middle_it != middle_clusters.rend(); ++middle_it) { + auto &middle_cluster_child = *middle_it; + + const HMatrix *A_child = transa == 'N' ? A.get_sub_hmatrix(*middle_cluster_child, *input_cluster_child) : A.get_sub_hmatrix(*input_cluster_child, *middle_cluster_child); + if (*input_cluster_child == *middle_cluster_child) { + task_based_internal_triangular_hmatrix_hmatrix_solve(side, UPLO, transa, diag, CoefficientPrecision(1), *A_child, *B_child_to_modify, L0_A, L0_B); + } else if (middle_cluster_child->get_offset() > input_cluster_child->get_offset()) { + + const HMatrix *B_child = B.get_sub_hmatrix(*output_cluster_child, *middle_cluster_child); + + task_based_internal_add_hmatrix_hmatrix_product('N', transa, CoefficientPrecision(-1), *B_child, *A_child, CoefficientPrecision(1), *B_child_to_modify, L0_B, L0_A, L0_B); + } + } + } + } + } + } +} +} // namespace htool + +#if defined(__clang__) +#elif defined(__GNUC__) || defined(__GNUG__) +# pragma GCC diagnostic pop +#endif +#endif diff --git a/include/htool/hmatrix/linalg/triangular_hmatrix_lrmat_solve.hpp b/include/htool/hmatrix/linalg/triangular_hmatrix_lrmat_solve.hpp index afb4ed58..5deb22f5 100644 --- a/include/htool/hmatrix/linalg/triangular_hmatrix_lrmat_solve.hpp +++ b/include/htool/hmatrix/linalg/triangular_hmatrix_lrmat_solve.hpp @@ -8,7 +8,7 @@ namespace htool { template > -void internal_triangular_hmatrix_lrmat_solve(char side, char UPLO, char transa, char diag, CoefficientPrecision alpha, const HMatrix &A, LowRankMatrix &B) { +void internal_triangular_hmatrix_lrmat_solve(char side, char UPLO, char transa, char diag, CoefficientPrecision alpha, const HMatrix &A, LowRankMatrix &B) { if (alpha != CoefficientPrecision(1)) { scale(alpha, B); } diff --git a/include/htool/hmatrix/lrmat/SVD.hpp b/include/htool/hmatrix/lrmat/SVD.hpp index 253ec327..368cd71c 100644 --- a/include/htool/hmatrix/lrmat/SVD.hpp +++ b/include/htool/hmatrix/lrmat/SVD.hpp @@ -12,38 +12,38 @@ namespace htool { -template > -class SVD final : public VirtualLowRankGenerator { +template +class SVD final : public VirtualInternalLowRankGenerator { + + std::unique_ptr> internal_generator_w_permutation; + const VirtualInternalGenerator &m_A; public: - using VirtualLowRankGenerator::VirtualLowRankGenerator; + using VirtualInternalLowRankGenerator::VirtualInternalLowRankGenerator; - void copy_low_rank_approximation(const VirtualInternalGenerator &A, const Cluster &target_cluster, const Cluster &source_cluster, underlying_type epsilon, int &rank, Matrix &U, Matrix &V) const override { + SVD(const VirtualInternalGenerator &A) : m_A(A) {} + SVD(const VirtualGenerator &A, const int *target_permutation, const int *source_permutation) : internal_generator_w_permutation(std::make_unique>(A, target_permutation, source_permutation)), m_A(*internal_generator_w_permutation) {} - int M = target_cluster.get_size(); - int N = source_cluster.get_size(); + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, LowRankMatrix &lrmat) const override { //// Matrix assembling Matrix mat(M, N); - A.copy_submatrix(target_cluster.get_size(), source_cluster.get_size(), target_cluster.get_offset(), source_cluster.get_offset(), mat.data()); + m_A.copy_submatrix(M, N, row_offset, col_offset, mat.data()); //// SVD std::vector> singular_values(std::min(M, N)); Matrix u(M, M); Matrix vt(N, N); - int truncated_rank = SVD_truncation(mat, epsilon, u, vt, singular_values); - - if (rank == -1) { - if (truncated_rank * (M + N) > (M * N)) { - truncated_rank = -1; - } + int truncated_rank = SVD_truncation(mat, lrmat.get_epsilon(), u, vt, singular_values); - } else { - truncated_rank = std::min(rank, std::min(M, N)); + if (truncated_rank * (M + N) > (M * N)) { + return false; } if (truncated_rank > 0) { + auto &U = lrmat.get_U(); + auto &V = lrmat.get_V(); U.resize(M, truncated_rank); V.resize(truncated_rank, N); for (int i = 0; i < M; i++) { @@ -56,7 +56,39 @@ class SVD final : public VirtualLowRankGenerator &lrmat) const override { + //// Matrix assembling + Matrix mat(M, N); + m_A.copy_submatrix(M, N, row_offset, col_offset, mat.data()); + + //// SVD + std::vector> singular_values(std::min(M, N)); + Matrix u(M, M); + Matrix vt(N, N); + + int truncated_rank = SVD_truncation(mat, lrmat.get_epsilon(), u, vt, singular_values); + truncated_rank = std::min(rank, std::min(M, N)); + + auto &U = lrmat.get_U(); + auto &V = lrmat.get_V(); + U.resize(M, truncated_rank); + V.resize(truncated_rank, N); + for (int i = 0; i < M; i++) { + for (int j = 0; j < truncated_rank; j++) { + U(i, j) = u(i, j) * singular_values[j]; + } + } + for (int i = 0; i < truncated_rank; i++) { + for (int j = 0; j < N; j++) { + V(i, j) = vt(i, j); + } } + return true; } }; diff --git a/include/htool/hmatrix/lrmat/fullACA.hpp b/include/htool/hmatrix/lrmat/fullACA.hpp index f8de95a3..7dea4a25 100644 --- a/include/htool/hmatrix/lrmat/fullACA.hpp +++ b/include/htool/hmatrix/lrmat/fullACA.hpp @@ -12,44 +12,33 @@ #include // for vector namespace htool { -//================================// -// CLASSE MATRICE RANG FAIBLE // -//================================// -// -// Refs biblio: -// -// -> slides de Stéphanie Chaillat: -// http://uma.ensta-paristech.fr/var/files/chaillat/seance2.pdf -// et en particulier la slide 25 -// -// -> livre de M.Bebendorf: -// http://www.springer.com/kr/book/9783540771463 -// et en particulier le paragraphe 3.4 -// -// -> livre de Rjasanow-Steinbach: -// http://www.ems-ph.org/books/book.php?proj_nr=125 -// et en particulier le paragraphe 3.2 -// -//=================================// -template > -class fullACA final : public VirtualLowRankGenerator { + +template +class fullACA final : public VirtualInternalLowRankGenerator { + + std::unique_ptr> internal_generator_w_permutation; + const VirtualInternalGenerator &m_A; public: - //=========================// - // FULL PIVOT ACA // - //=========================// - // If reqrank=-1 (default value), we use the precision given by epsilon for the stopping criterion; - // otherwise, we use the required rank for the stopping criterion (!: at the end the rank could be lower) - using VirtualLowRankGenerator::VirtualLowRankGenerator; + using VirtualInternalLowRankGenerator::VirtualInternalLowRankGenerator; - void copy_low_rank_approximation(const VirtualInternalGenerator &A, const Cluster &target_cluster, const Cluster &source_cluster, underlying_type epsilon, int &rank, Matrix &U, Matrix &V) const override { + fullACA(const VirtualInternalGenerator &A) : m_A(A) {} + fullACA(const VirtualGenerator &A, const int *target_permutation, const int *source_permutation) : internal_generator_w_permutation(std::make_unique>(A, target_permutation, source_permutation)), m_A(*internal_generator_w_permutation) {} - int M = target_cluster.get_size(); - int N = source_cluster.get_size(); + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, LowRankMatrix &lrmat) const override { + int reqrank = -1; + return copy_low_rank_approximation(M, N, row_offset, col_offset, lrmat.get_epsilon(), reqrank, lrmat); + } + + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, int reqrank, LowRankMatrix &lrmat) const override { + return copy_low_rank_approximation(M, N, row_offset, col_offset, lrmat.get_epsilon(), reqrank, lrmat); + } + private: + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, underlying_type epsilon, int &rank, LowRankMatrix &lrmat) const { // Matrix assembling Matrix mat(M, N); - A.copy_submatrix(M, N, target_cluster.get_offset(), source_cluster.get_offset(), mat.data()); + m_A.copy_submatrix(M, N, row_offset, col_offset, mat.data()); // Full pivot int q = 0; @@ -83,6 +72,8 @@ class fullACA final : public VirtualLowRankGenerator 0) { + auto &U = lrmat.get_U(); + auto &V = lrmat.get_V(); U.resize(M, rank); V.resize(rank, N); for (int k = 0; k < rank; k++) { @@ -91,7 +82,9 @@ class fullACA final : public VirtualLowRankGenerator // for copy_n -#include // for basic_string +#include "../../../clustering/cluster_node.hpp" // for Cluster (ptr ... +#include "../../../hmatrix/lrmat/utils/SVD_recompression.hpp" // for recompression +#include "../../../matrix/matrix.hpp" // for Matrix +#include "../../../misc/logger.hpp" // for Logger, LogLevel +#include "../../../misc/misc.hpp" // for underlying_type +#include "../lrmat.hpp" // for LowRankMatrix +#include // for copy_n +#include // for basic_string namespace htool { template > -void add_lrmat_lrmat(const LowRankMatrix &X_lrmat, const Cluster &X_target_cluster, const Cluster &X_source_cluster, LowRankMatrix &Y_lrmat, const Cluster &Y_target_cluster, const Cluster &Y_source_cluster) { +void add_lrmat_lrmat(const LowRankMatrix &X_lrmat, const Cluster &X_target_cluster, const Cluster &X_source_cluster, LowRankMatrix &Y_lrmat, const Cluster &Y_target_cluster, const Cluster &Y_source_cluster) { if (left_cluster_contains_right_cluster(Y_target_cluster, X_target_cluster) && left_cluster_contains_right_cluster(Y_source_cluster, X_source_cluster)) { // extends X and add to Y int row_offset = X_target_cluster.get_offset() - Y_target_cluster.get_offset(); int col_offset = X_source_cluster.get_offset() - Y_source_cluster.get_offset(); @@ -56,7 +56,7 @@ void add_lrmat_lrmat(const LowRankMatrix // for copy_n @@ -55,7 +55,7 @@ void add_lrmat_matrix_product(char transa, char transb, CoefficientPrecision alp add_matrix_matrix_product(transa, transb, alpha, U_A, B, 0, V_C); } if (C.rank_of() * (C.nb_rows() + C.nb_cols()) > (C.nb_rows() * C.nb_cols())) { - recompression(C); + SVD_recompression(C); } } else { Matrix VB; @@ -101,7 +101,7 @@ void add_lrmat_matrix_product(char transa, char transb, CoefficientPrecision alp } C.get_U() = new_U; C.get_V() = new_V; - recompression(C); + SVD_recompression(C); } } } diff --git a/include/htool/hmatrix/lrmat/linalg/add_matrix_lrmat_product.hpp b/include/htool/hmatrix/lrmat/linalg/add_matrix_lrmat_product.hpp index b50a74ac..6836f4a3 100644 --- a/include/htool/hmatrix/lrmat/linalg/add_matrix_lrmat_product.hpp +++ b/include/htool/hmatrix/lrmat/linalg/add_matrix_lrmat_product.hpp @@ -7,7 +7,7 @@ #include "../../../matrix/matrix.hpp" // for Matrix #include "../../../misc/misc.hpp" // for conj... #include "../lrmat.hpp" // for LowR... -#include "../utils/recompression.hpp" // for reco... +#include "../utils/SVD_recompression.hpp" // for reco... #include // for copy_n namespace htool { @@ -105,7 +105,7 @@ void add_matrix_lrmat_product(char transa, char transb, CoefficientPrecision alp C.get_U() = new_U; C.get_V() = new_V; - recompression(C); + SVD_recompression(C); } } } diff --git a/include/htool/hmatrix/lrmat/linalg/add_matrix_matrix_product.hpp b/include/htool/hmatrix/lrmat/linalg/add_matrix_matrix_product.hpp index 02cffe5d..0d175fac 100644 --- a/include/htool/hmatrix/lrmat/linalg/add_matrix_matrix_product.hpp +++ b/include/htool/hmatrix/lrmat/linalg/add_matrix_matrix_product.hpp @@ -7,13 +7,13 @@ #include "../../../misc/misc.hpp" // for underlying_type #include "../../../wrappers/wrapper_blas.hpp" // for Blas #include "../lrmat.hpp" // for LowRankMatrix -#include "../utils/recompression.hpp" // for recompression +#include "../utils/SVD_recompression.hpp" // for recompression #include // for copy_n, min #include // for vector namespace htool { template > -void add_matrix_matrix_product(char transa, char transb, CoefficientPrecision alpha, const Matrix &A, const Matrix &B, CoefficientPrecision beta, LowRankMatrix &C) { +void add_matrix_matrix_product(char transa, char transb, CoefficientPrecision alpha, const Matrix &A, const Matrix &B, CoefficientPrecision beta, LowRankMatrix &C) { bool C_is_overwritten = (beta == CoefficientPrecision(0) || C.rank_of() == 0); @@ -86,7 +86,7 @@ void add_matrix_matrix_product(char transa, char transb, CoefficientPrecision al } C.get_U() = new_U; C.get_V() = new_V; - recompression(C); + SVD_recompression(C); } } // namespace htool diff --git a/include/htool/hmatrix/lrmat/linalg/scale.hpp b/include/htool/hmatrix/lrmat/linalg/scale.hpp index c9d3844c..fd952c23 100644 --- a/include/htool/hmatrix/lrmat/linalg/scale.hpp +++ b/include/htool/hmatrix/lrmat/linalg/scale.hpp @@ -7,7 +7,7 @@ namespace htool { template void scale(CoefficientPrecision da, LowRankMatrix &lrmat) { - if (lrmat.get_U().nb_rows() > lrmat.get_U().nb_cols()) { + if (lrmat.get_U().nb_rows() <= lrmat.get_V().nb_cols()) { scale(da, lrmat.get_U()); } else { scale(da, lrmat.get_V()); diff --git a/include/htool/hmatrix/lrmat/lrmat.hpp b/include/htool/hmatrix/lrmat/lrmat.hpp index d4cc908e..bbe33149 100644 --- a/include/htool/hmatrix/lrmat/lrmat.hpp +++ b/include/htool/hmatrix/lrmat/lrmat.hpp @@ -1,18 +1,18 @@ #ifndef HTOOL_LRMAT_HPP #define HTOOL_LRMAT_HPP -#include "../../clustering/cluster_node.hpp" // for Cluster -#include "../../hmatrix/interfaces/virtual_generator.hpp" // for VirtualGenerator -#include "../../matrix/matrix.hpp" // for Matrix -#include "../../misc/misc.hpp" // for underlying_type -#include "../../wrappers/wrapper_blas.hpp" // for Blas -#include "../interfaces/virtual_lrmat_generator.hpp" // for VirtualLowRankG... -#include // for fill -#include // for vector +#include "../../clustering/cluster_node.hpp" // for Cluster +#include "../../hmatrix/interfaces/virtual_generator.hpp" // for VirtualGenerator +#include "../../matrix/linalg/add_matrix_matrix_product.hpp" // for add_matrix_matrix_product +#include "../../matrix/matrix.hpp" // for Matrix +#include "../../misc/misc.hpp" // for underlying_type +#include "../../wrappers/wrapper_blas.hpp" // for Blas +#include // for fill +#include // for vector namespace htool { -template > +template class LowRankMatrix { protected: @@ -21,17 +21,20 @@ class LowRankMatrix { public: // Constructors - LowRankMatrix(const VirtualInternalGenerator &A, const VirtualLowRankGenerator &LRGenerator, const Cluster &target_cluster, const Cluster &source_cluster, int rank = -1, underlying_type epsilon = 1e-3) : m_U(), m_V(), m_epsilon(epsilon) { - - if (rank == 0) { - m_U.resize(target_cluster.get_size(), 0); - m_V.resize(0, source_cluster.get_size()); + explicit LowRankMatrix(int M, int N, int rank, underlying_type epsilon = 1e-3) : m_U(), m_V(), m_epsilon(epsilon) { + if (rank > 0) { + m_U.resize(M, rank); + m_V.resize(rank, N); } else { - LRGenerator.copy_low_rank_approximation(A, target_cluster, source_cluster, epsilon, rank, m_U, m_V); + m_U.resize(M, 0); + m_V.resize(0, N); } } - LowRankMatrix(underlying_type epsilon) : m_U(), m_V(), m_epsilon(epsilon) {} + explicit LowRankMatrix(int M, int N, underlying_type epsilon) : m_U(), m_V(), m_epsilon(epsilon) { + m_U.resize(M, 0); + m_V.resize(0, N); + } LowRankMatrix &operator=(const LowRankMatrix &rhs) = default; LowRankMatrix(const LowRankMatrix &rhs) = default; @@ -48,6 +51,7 @@ class LowRankMatrix { CoefficientPrecision get_U(int i, int j) const { return m_U(i, j); } CoefficientPrecision get_V(int i, int j) const { return m_V(i, j); } underlying_type get_epsilon() const { return m_epsilon; } + underlying_type &get_epsilon() { return m_epsilon; } std::vector operator*(const std::vector &a) const { return m_U * (m_V * a); @@ -95,17 +99,6 @@ class LowRankMatrix { } } - void - mvprod(const CoefficientPrecision *const in, CoefficientPrecision *const out) const { - if (m_U.nb_cols() == 0) { - std::fill(out, out + m_U.nb_cols(), 0); - } else { - std::vector a(m_U.nb_cols()); - m_V.mvprod(in, a.data()); - m_U.mvprod(a.data(), out); - } - } - void add_mvprod_row_major(const CoefficientPrecision *const in, CoefficientPrecision *const out, const int &mu, char transb = 'T', char op = 'N') const { if (m_U.nb_cols() != 0) { std::vector a(m_U.nb_cols() * mu); @@ -120,31 +113,22 @@ class LowRankMatrix { } void copy_to_dense(CoefficientPrecision *const out) const { - char transa = 'N'; - char transb = 'N'; - int M = m_U.nb_rows(); - int N = m_V.nb_cols(); - int K = m_U.nb_cols(); - CoefficientPrecision alpha = 1; - int lda = m_U.nb_rows(); - int ldb = m_V.nb_rows(); - CoefficientPrecision beta = 0; - int ldc = m_U.nb_rows(); - - Blas::gemm(&transa, &transb, &M, &N, &K, &alpha, m_U.data(), &lda, m_V.data(), &ldb, &beta, out, &ldc); + Matrix C; + C.assign(m_U.nb_rows(), m_V.nb_cols(), out, false); + add_matrix_matrix_product('N', 'N', CoefficientPrecision(1), m_U, m_V, CoefficientPrecision(0), C); } double compression_ratio() const { - return (m_U.nb_rows() * m_V.nb_cols()) / (double)(m_U.nb_cols() * (m_U.nb_rows() + m_V.nb_cols())); + return (m_U.nb_rows() * m_V.nb_cols()) / static_cast(m_U.nb_cols() * (m_U.nb_rows() + m_V.nb_cols())); } double space_saving() const { - return (1 - (m_U.nb_cols() * (1. / double(m_U.nb_rows()) + 1. / double(m_V.nb_cols())))); + return (1 - (m_U.nb_cols() * (1. / double(m_U.nb_rows()) + 1. / static_cast(m_V.nb_cols())))); } }; template > -underlying_type Frobenius_relative_error(const Cluster &target_cluster, const Cluster &source_cluster, const LowRankMatrix &lrmat, const VirtualInternalGenerator &ref, int reqrank = -1) { +underlying_type Frobenius_relative_error(const Cluster &target_cluster, const Cluster &source_cluster, const LowRankMatrix &lrmat, const VirtualInternalGenerator &ref, int reqrank = -1) { if (reqrank == -1) { reqrank = lrmat.rank_of(); } @@ -166,7 +150,7 @@ underlying_type Frobenius_relative_error(const Cluster> -underlying_type Frobenius_absolute_error(const Cluster &target_cluster, const Cluster &source_cluster, const LowRankMatrix &lrmat, const VirtualInternalGenerator &ref, int reqrank = -1) { +underlying_type Frobenius_absolute_error(const Cluster &target_cluster, const Cluster &source_cluster, const LowRankMatrix &lrmat, const VirtualInternalGenerator &ref, int reqrank = -1) { if (reqrank == -1) { reqrank = lrmat.rank_of(); } diff --git a/include/htool/hmatrix/lrmat/partialACA.hpp b/include/htool/hmatrix/lrmat/partialACA.hpp index d8262eef..8489de69 100644 --- a/include/htool/hmatrix/lrmat/partialACA.hpp +++ b/include/htool/hmatrix/lrmat/partialACA.hpp @@ -16,42 +16,35 @@ #include // for vector namespace htool { -//================================// -// CLASSE MATRICE RANG FAIBLE // -//================================// -// -// Refs biblio: -// -// -> slides de Stéphanie Chaillat: -// http://uma.ensta-paristech.fr/var/files/chaillat/seance2.pdf -// et en particulier la slide 25 -// -// -> livre de M.Bebendorf: -// http://www.springer.com/kr/book/9783540771463 -// et en particulier le paragraphe 3.4 -// -// -> livre de Rjasanow-Steinbach: -// http://www.ems-ph.org/books/book.php?proj_nr=125 -// et en particulier le paragraphe 3.2 -// -//=================================// -template > -class partialACA final : public VirtualLowRankGenerator { + +template +class partialACA final : public VirtualInternalLowRankGenerator { + + std::unique_ptr> internal_generator_w_permutation; + const VirtualInternalGenerator &m_A; public: - //=========================// - // PARTIAL PIVOT ACA // - //=========================// - // If reqrank=-1 (default value), we use the precision given by epsilon for the stopping criterion; - // otherwise, we use the required rank for the stopping criterion (!: at the end the rank could be lower) - using VirtualLowRankGenerator::VirtualLowRankGenerator; + using VirtualInternalLowRankGenerator::VirtualInternalLowRankGenerator; + + partialACA(const VirtualInternalGenerator &A) : m_A(A) {} + partialACA(const VirtualGenerator &A, const int *target_permutation, const int *source_permutation) : internal_generator_w_permutation(std::make_unique>(A, target_permutation, source_permutation)), m_A(*internal_generator_w_permutation) {} + + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, LowRankMatrix &lrmat) const override { + int reqrank = -1; + return copy_low_rank_approximation(M, N, row_offset, col_offset, lrmat.get_epsilon(), reqrank, lrmat); + } + + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, int reqrank, LowRankMatrix &lrmat) const override { + return copy_low_rank_approximation(M, N, row_offset, col_offset, lrmat.get_epsilon(), reqrank, lrmat); + } - void copy_low_rank_approximation(const VirtualInternalGenerator &A, const Cluster &target_cluster, const Cluster &source_cluster, underlying_type epsilon, int &rank, Matrix &U, Matrix &V) const override { + private: + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, underlying_type epsilon, int &rank, LowRankMatrix &lrmat) const { - int target_size = target_cluster.get_size(); - int source_size = source_cluster.get_size(); - int target_offset = target_cluster.get_offset(); - int source_offset = source_cluster.get_offset(); + int target_size = M; + int source_size = N; + int target_offset = row_offset; + int source_offset = col_offset; //// Choice of the first row (see paragraph 3.4.3 page 151 Bebendorf) // double dist = 1e30; @@ -97,7 +90,7 @@ class partialACA final : public VirtualLowRankGenerator::axpy(&(source_size), &(coef), vv[j].data(), &incx, r.data(), &incy); @@ -113,7 +106,6 @@ class partialACA final : public VirtualLowRankGenerator 1e-15) { std::fill(c.begin(), c.end(), CoefficientPrecision(0)); - A.copy_submatrix(target_size, 1, target_offset, J + source_offset, c.data()); + m_A.copy_submatrix(target_size, 1, target_offset, J + source_offset, c.data()); for (int k = 0; k < uu.size(); k++) { coef = -vv[k][J]; Blas::axpy(&(target_size), &(coef), uu[k].data(), &incx, c.data(), &incy); @@ -169,7 +161,6 @@ class partialACA final : public VirtualLowRankGenerator 0) { + auto &U = lrmat.get_U(); + auto &V = lrmat.get_V(); U.resize(target_size, rank); V.resize(rank, source_size); for (int k = 0; k < rank; k++) { @@ -185,7 +178,9 @@ class partialACA final : public VirtualLowRankGenerator + +namespace htool { + +template +class RecompressedLowRankGenerator final : public VirtualInternalLowRankGenerator { + const VirtualInternalLowRankGenerator &m_low_rank_generator; + std::function &)> m_recompression; + + public: + RecompressedLowRankGenerator(const VirtualInternalLowRankGenerator &low_rank_generator, std::function &)> recompression) : m_low_rank_generator(low_rank_generator), m_recompression(recompression) {} + + virtual bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, LowRankMatrix &lrmat) const { + bool info = m_low_rank_generator.copy_low_rank_approximation(M, N, row_offset, col_offset, lrmat); + m_recompression(lrmat); + return info; + } + + virtual bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, int reqrank, LowRankMatrix &lrmat) const { + bool info = m_low_rank_generator.copy_low_rank_approximation(M, N, row_offset, col_offset, reqrank, lrmat); + return info; + } +}; + +} // namespace htool + +#endif diff --git a/include/htool/hmatrix/lrmat/sympartialACA.hpp b/include/htool/hmatrix/lrmat/sympartialACA.hpp index 36e4575c..88dbcc6a 100644 --- a/include/htool/hmatrix/lrmat/sympartialACA.hpp +++ b/include/htool/hmatrix/lrmat/sympartialACA.hpp @@ -16,56 +16,48 @@ #include // for vector namespace htool { -//================================// -// CLASSE MATRICE RANG FAIBLE // -//================================// -// -// Refs biblio: -// -// -> slides de Stéphanie Chaillat: -// http://uma.ensta-paristech.fr/var/files/chaillat/seance2.pdf -// et en particulier la slide 25 -// -// -> livre de M.Bebendorf: -// http://www.springer.com/kr/book/9783540771463 -// et en particulier le paragraphe 3.4 -// -// -> livre de Rjasanow-Steinbach: -// http://www.ems-ph.org/books/book.php?proj_nr=125 -// et en particulier le paragraphe 3.2 -// -//=================================// -template > -class sympartialACA final : public VirtualLowRankGenerator { + +template +class sympartialACA final : public VirtualInternalLowRankGenerator { + std::unique_ptr> internal_generator_w_permutation; + const VirtualInternalGenerator &m_A; public: - //=========================// - // PARTIAL PIVOT ACA // - //=========================// - // If reqrank=-1 (default value), we use the precision given by epsilon for the stopping criterion; - // otherwise, we use the required rank for the stopping criterion (!: at the end the rank could be lower) - using VirtualLowRankGenerator::VirtualLowRankGenerator; + using VirtualInternalLowRankGenerator::VirtualInternalLowRankGenerator; + + sympartialACA(const VirtualInternalGenerator &A) : m_A(A) {} + sympartialACA(const VirtualGenerator &A, const int *target_permutation, const int *source_permutation) : internal_generator_w_permutation(std::make_unique>(A, target_permutation, source_permutation)), m_A(*internal_generator_w_permutation) {} + + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, LowRankMatrix &lrmat) const override { + int reqrank = -1; + return copy_low_rank_approximation(M, N, row_offset, col_offset, lrmat.get_epsilon(), reqrank, lrmat); + } + + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, int reqrank, LowRankMatrix &lrmat) const override { + return copy_low_rank_approximation(M, N, row_offset, col_offset, lrmat.get_epsilon(), reqrank, lrmat); + } - void copy_low_rank_approximation(const VirtualInternalGenerator &A, const Cluster &t, const Cluster &s, underlying_type epsilon, int &rank, Matrix &U, Matrix &V) const { + private: + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, underlying_type epsilon, int &rank, LowRankMatrix &lrmat) const { int n1, n2; int i1; int i2; // const double *x1; - if (t.get_offset() >= s.get_offset()) { + if (row_offset >= col_offset) { - n1 = t.get_size(); - n2 = s.get_size(); - i1 = t.get_offset(); - i2 = s.get_offset(); + n1 = M; + n2 = N; + i1 = row_offset; + i2 = col_offset; // x1 = xt; // cluster_1 = &t; } else { - n1 = s.get_size(); - n2 = t.get_size(); - i1 = s.get_offset(); - i2 = t.get_offset(); + n1 = N; + n2 = M; + i1 = col_offset; + i2 = row_offset; // x1 = xs; // cluster_1 = &s; } @@ -110,10 +102,10 @@ class sympartialACA final : public VirtualLowRankGenerator= s.get_offset()) { - A.copy_submatrix(1, n2, i1 + I1, i2, u1.data()); + if (row_offset >= col_offset) { + m_A.copy_submatrix(1, n2, i1 + I1, i2, u1.data()); } else { - A.copy_submatrix(n2, 1, i2, i1 + I1, u1.data()); + m_A.copy_submatrix(n2, 1, i2, i1 + I1, u1.data()); } for (int j = 0; j < uu.size(); j++) { @@ -139,10 +131,10 @@ class sympartialACA final : public VirtualLowRankGenerator 1e-15) { std::fill(u2.begin(), u2.end(), CoefficientPrecision(0)); - if (t.get_offset() >= s.get_offset()) { - A.copy_submatrix(n1, 1, i1, i2 + I2, u2.data()); + if (row_offset >= col_offset) { + m_A.copy_submatrix(n1, 1, i1, i2 + I2, u2.data()); } else { - A.copy_submatrix(1, n1, i2 + I2, i1, u2.data()); + m_A.copy_submatrix(1, n1, i2 + I2, i1, u2.data()); } for (int k = 0; k < uu.size(); k++) { coef = -vv[k][I2]; @@ -188,8 +180,8 @@ class sympartialACA final : public VirtualLowRankGenerator 0) { - U.resize(t.get_size(), rank); - V.resize(rank, s.get_size()); + auto &U = lrmat.get_U(); + auto &V = lrmat.get_V(); + U.resize(M, rank); + V.resize(rank, N); - if (t.get_offset() >= s.get_offset()) { + if (row_offset >= col_offset) { for (int k = 0; k < rank; k++) { - std::move(uu[k].begin(), uu[k].end(), U.data() + k * t.get_size()); - for (int j = 0; j < s.get_size(); j++) { + std::move(uu[k].begin(), uu[k].end(), U.data() + k * M); + for (int j = 0; j < N; j++) { V(k, j) = vv[k][j]; } } } else { for (int k = 0; k < rank; k++) { - std::move(vv[k].begin(), vv[k].end(), U.data() + k * t.get_size()); - for (int j = 0; j < s.get_size(); j++) { + std::move(vv[k].begin(), vv[k].end(), U.data() + k * M); + for (int j = 0; j < N; j++) { V(k, j) = uu[k][j]; } } } + return true; } + return false; } }; } // namespace htool diff --git a/include/htool/hmatrix/lrmat/utils/recompression.hpp b/include/htool/hmatrix/lrmat/utils/SVD_recompression.hpp similarity index 61% rename from include/htool/hmatrix/lrmat/utils/recompression.hpp rename to include/htool/hmatrix/lrmat/utils/SVD_recompression.hpp index 7e64e13f..7bd81c41 100644 --- a/include/htool/hmatrix/lrmat/utils/recompression.hpp +++ b/include/htool/hmatrix/lrmat/utils/SVD_recompression.hpp @@ -1,5 +1,5 @@ -#ifndef HTOOL_LRMAT_UTILS_RECOMPRESSION_HPP -#define HTOOL_LRMAT_UTILS_RECOMPRESSION_HPP +#ifndef HTOOL_LRMAT_UTILS_SVD_RECOMPRESSION_HPP +#define HTOOL_LRMAT_UTILS_SVD_RECOMPRESSION_HPP #include "../../../matrix/linalg/add_matrix_matrix_product.hpp" // for add_... #include "../../../matrix/matrix.hpp" // for Matrix @@ -16,7 +16,7 @@ namespace htool { template -void recompression(LowRankMatrix &lrmat) { +void SVD_recompression(LowRankMatrix &lrmat) { Matrix U = lrmat.get_U(); Matrix V = lrmat.get_V(); auto rank = lrmat.rank_of(); @@ -70,7 +70,7 @@ void recompression(LowRankMatrix &lrmat) { std::vector work(1); tau_QR.resize(std::min(M, N)); Lapack::geqrf(&M, &N, U.data(), &lda, tau_QR.data(), work.data(), &lwork, &info); - lwork = (int)std::real(work[0]); + lwork = static_cast(std::real(work[0])); work.resize(lwork); Lapack::geqrf(&M, &N, U.data(), &lda, tau_QR.data(), work.data(), &lwork, &info); } @@ -86,7 +86,7 @@ void recompression(LowRankMatrix &lrmat) { std::vector work(1); tau_LQ.resize(std::min(M, N)); Lapack::gelqf(&M, &N, V.data(), &lda, tau_LQ.data(), work.data(), &lwork, &info); - lwork = (int)std::real(work[0]); + lwork = static_cast(std::real(work[0])); work.resize(lwork); Lapack::gelqf(&M, &N, V.data(), &lda, tau_LQ.data(), work.data(), &lwork, &info); } @@ -115,76 +115,71 @@ void recompression(LowRankMatrix &lrmat) { Matrix vt(rank, rank); int truncated_rank = SVD_truncation(RL, epsilon, u, vt, singular_values); - // new_U=u*sqrt(tildeS) and new_V=sqrt(tildeS)*vt in the right dimensions - Matrix &new_U = lrmat.get_U(); - Matrix &new_V = lrmat.get_V(); - { - int M = U.nb_rows(); - int N = V.nb_cols(); - int incx = 1; - new_U.resize(M, truncated_rank); - new_V.resize(truncated_rank, N); - CoefficientPrecision scaling_coef; - for (int r = 0; r < truncated_rank; r++) { - scaling_coef = std::sqrt(singular_values[r]); - std::copy_n(u.data() + r * rank, rank, new_U.data() + r * M); - Blas::scal(&M, &scaling_coef, new_U.data() + r * M, &incx); - } - for (int r = 0; r < rank; r++) { - std::copy_n(vt.data() + r * rank, truncated_rank, new_V.data() + r * truncated_rank); - } + if (truncated_rank < rank) { + // new_U=u*sqrt(tildeS) and new_V=sqrt(tildeS)*vt in the right dimensions + Matrix &new_U = lrmat.get_U(); + Matrix &new_V = lrmat.get_V(); + { + int M = U.nb_rows(); + int N = V.nb_cols(); + int incx = 1; + new_U.resize(M, truncated_rank); + new_V.resize(truncated_rank, N); + CoefficientPrecision scaling_coef; + for (int r = 0; r < truncated_rank; r++) { + scaling_coef = std::sqrt(singular_values[r]); + std::copy_n(u.data() + r * rank, rank, new_U.data() + r * M); + Blas::scal(&M, &scaling_coef, new_U.data() + r * M, &incx); + } + for (int r = 0; r < rank; r++) { + std::copy_n(vt.data() + r * rank, truncated_rank, new_V.data() + r * truncated_rank); + } - for (int r = 0; r < truncated_rank; r++) { - for (int j = 0; j < rank; j++) { - new_V(r, j) = std::sqrt(singular_values[r]) * new_V(r, j); + for (int r = 0; r < truncated_rank; r++) { + for (int j = 0; j < rank; j++) { + new_V(r, j) = std::sqrt(singular_values[r]) * new_V(r, j); + } } } - } - // new_U=Q1*new_U - { - char size = 'L'; - char trans = 'N'; - int M = new_U.nb_rows(); - int N = new_U.nb_cols(); - int K = tau_QR.size(); - int ldc = M; - int lwork = -1; - int info; - std::vector work(1); - Lapack::mqr(&size, &trans, &M, &N, &K, U.data(), &M, tau_QR.data(), new_U.data(), &ldc, work.data(), &lwork, &info); - lwork = (int)std::real(work[0]); - work.resize(lwork); - Lapack::mqr(&size, &trans, &M, &N, &K, U.data(), &M, tau_QR.data(), new_U.data(), &ldc, work.data(), &lwork, &info); - } + // new_U=Q1*new_U + { + char size = 'L'; + char trans = 'N'; + int M = new_U.nb_rows(); + int N = new_U.nb_cols(); + int K = tau_QR.size(); + int ldc = M; + int lwork = -1; + int info; + std::vector work(1); + Lapack::mqr(&size, &trans, &M, &N, &K, U.data(), &M, tau_QR.data(), new_U.data(), &ldc, work.data(), &lwork, &info); + lwork = static_cast(std::real(work[0])); + work.resize(lwork); + Lapack::mqr(&size, &trans, &M, &N, &K, U.data(), &M, tau_QR.data(), new_U.data(), &ldc, work.data(), &lwork, &info); + } - // new_V=new_V*Q2 - { - char size = 'R'; - char trans = 'N'; - int M = new_V.nb_rows(); - int N = new_V.nb_cols(); - int K = tau_LQ.size(); - int lda = V.nb_rows(); - int ldc = M; - int lwork = -1; - int info; - std::vector work(1); - Lapack::mlq(&size, &trans, &M, &N, &K, V.data(), &lda, tau_LQ.data(), new_V.data(), &ldc, work.data(), &lwork, &info); - lwork = (int)std::real(work[0]); - work.resize(lwork); - Lapack::mlq(&size, &trans, &M, &N, &K, V.data(), &lda, tau_LQ.data(), new_V.data(), &ldc, work.data(), &lwork, &info); + // new_V=new_V*Q2 + { + char size = 'R'; + char trans = 'N'; + int M = new_V.nb_rows(); + int N = new_V.nb_cols(); + int K = tau_LQ.size(); + int lda = V.nb_rows(); + int ldc = M; + int lwork = -1; + int info; + std::vector work(1); + Lapack::mlq(&size, &trans, &M, &N, &K, V.data(), &lda, tau_LQ.data(), new_V.data(), &ldc, work.data(), &lwork, &info); + lwork = static_cast(std::real(work[0])); + work.resize(lwork); + Lapack::mlq(&size, &trans, &M, &N, &K, V.data(), &lda, tau_LQ.data(), new_V.data(), &ldc, work.data(), &lwork, &info); + } } } } -template -std::unique_ptr> recompression(const LowRankMatrix &lrmat) { - std::unique_ptr> new_lrmat_ptr = std::make_unique>(lrmat); - recompression(*new_lrmat_ptr); - return new_lrmat_ptr; -} - } // namespace htool #endif diff --git a/include/htool/hmatrix/tree_builder/task_based_tree_builder.hpp b/include/htool/hmatrix/tree_builder/task_based_tree_builder.hpp new file mode 100644 index 00000000..37bddcde --- /dev/null +++ b/include/htool/hmatrix/tree_builder/task_based_tree_builder.hpp @@ -0,0 +1,288 @@ +#ifndef HTOOL_HMATRIX_TASK_BASED_TREE_BUILDER_HPP +#define HTOOL_HMATRIX_TASK_BASED_TREE_BUILDER_HPP + +#include "../hmatrix.hpp" +namespace htool { + +/** + * @brief The cost_function associates with a node of the group tree a score + * representing an estimate of the amount of work associated with this leaf. + * + * The cost of a node is given by the number of points in the target cluster + * times the number of points in the source cluster. + * + * @param hmatrix The input hierarchical matrix. + * @return The naive cost of the node, i.e. the number of points in the target cluster + * times the number of points in the source cluster. + */ +template > +std::size_t cost_function(const HMatrix &hmatrix) { + std::size_t nb_rows = hmatrix.get_target_cluster().get_size(); + std::size_t nb_cols = hmatrix.get_source_cluster().get_size(); + return std::size_t(nb_rows * nb_cols); +} + +// overload for const Cluster +template +std::size_t cost_function(const Cluster &cluster) { + return std::size_t(cluster.get_size()); +} + +/** + * @brief Find the nodes of the group tree such that the total cost of their + * leaves is less than or equal to nb_nodes_max. + * + * This function performs a dichotomy search to find the optimal criterion + * such that the total cost of the nodes of the group tree is less than or + * equal to nb_nodes_max. + * + * @param root_hmatrix The root node of the group tree. + * @param nb_nodes_max The maximum total cost of the nodes of the group tree. + * @return A vector of pointers to the nodes of the group tree such that the + * total cost of their leaves is less than or equal to nb_nodes_max. + */ +template > +std::vector *> find_l0(HMatrix &root_hmatrix, const size_t nb_nodes_max) { + // Initialize criterion with the cost of the root node + double criterion = cost_function(root_hmatrix); + + // Find initial nodes that meet the criterion + std::vector *> old_result, result = count_nodes(root_hmatrix, criterion); + // Check if the initial result exceeds the maximum allowed nodes + if (result.size() > nb_nodes_max) { + std::cerr << "Error: no L0 can be defined." << std::endl; + return std::vector *>(); + } else { + // Perform a dichotomy search to find the optimal criterion + do { + // Save the result of this iteration + old_result = result; + + // If all nodes are leaves, return the result as it cannot be further divided + if (std::all_of(result.begin(), result.end(), [](HMatrix *hmatrix) { + return hmatrix->is_leaf(); + })) { + return result; + } + + // Reduce the criterion to explore smaller cost nodes + criterion /= 2; + + // Update the result with the new criterion + result = count_nodes(root_hmatrix, criterion); + } while (result.size() <= nb_nodes_max); // Loop until the result size exceeds nb_nodes_max + } + + // Return the last valid result + return old_result; +} + +// overload for const Cluster +template +std::vector *> find_l0(const Cluster &cluster, const size_t nb_nodes_max) { + // Initialize criterion with the cost of the root node + double criterion = cost_function(cluster); + + // Find initial nodes that meet the criterion + std::vector *> old_result, result = count_nodes(cluster, criterion); + + // Check if the initial result exceeds the maximum allowed nodes + if (result.size() > nb_nodes_max) { + std::cerr << "Error: no L0 can be defined." << std::endl; + return std::vector *>(); + } else { + // Perform a dichotomy search to find the optimal criterion + do { + // Save the result of this iteration + old_result = result; + + // If all nodes are leaves, return the result as it cannot be further divided + if (std::all_of(result.begin(), result.end(), [](const Cluster *leaf_cluster) { + return leaf_cluster->is_leaf(); + })) { + return result; + } + + // Reduce the criterion to explore smaller cost nodes + criterion /= 2; + + // Update the result with the new criterion + result = count_nodes(cluster, criterion); + } while (result.size() <= nb_nodes_max); // Loop until the result size exceeds nb_nodes_max + } + + // Return the last valid result + return old_result; +} + +/** + * @brief Performs a postorder tree traversal of the group tree and returns a vector of all the nodes of the group tree that have a cost less than criterion. + * + * The cost of a node is given by the cost_function: it is the product of the number of points in the target cluster and the number of points in the source cluster. + * + * The criterion is the maximum cost of the nodes to be returned. + * + * The result is a vector of pointers to the nodes of the group tree that have a cost less than criterion. + * + * @param hmatrix The input hierarchical matrix. + * @param criterion The maximum cost of the nodes to be returned. + * @return A vector of pointers to the nodes of the group tree that have a cost less than criterion. + */ +template > +std::vector *> count_nodes(HMatrix &hmatrix, double criterion) { + std::vector *> result; + + if (cost_function(hmatrix) <= criterion || hmatrix.is_leaf()) { + // if the node is a leaf or its cost is less than criterion, add it to the result + result.push_back(&hmatrix); + } else { + // if the node is not a leaf, traverse its children + for (auto &child : hmatrix.get_children()) { + // perform a postorder tree traversal of the subtree rooted at child + auto local_result = count_nodes(*child.get(), criterion); + // add the result of the subtree traversal to the result + result.insert(result.end(), local_result.begin(), local_result.end()); + } + } + return result; +} + +// overload for const Cluster +template +std::vector *> count_nodes(const Cluster &cluster, double criterion) { + std::vector *> result; + + if (cost_function(cluster) <= criterion || cluster.is_leaf()) { + // if the node is a leaf or its cost is less than criterion, add it to the result + result.push_back(&cluster); + } else { + // if the node is not a leaf, traverse its children + for (const auto &child : cluster.get_children()) { + // perform a postorder tree traversal of the subtree rooted at child + auto local_result = count_nodes(*child.get(), criterion); + // add the result of the subtree traversal to the result + result.insert(result.end(), local_result.begin(), local_result.end()); + } + } + return result; +} + +/** + * Enumerates the dependences of a given hierarchical matrix (HMatrix) with respect to a set of nodes (L0). + * + * This function determines the relationship of the input HMatrix with the nodes in the L0 vector, + * based on their hierarchical position, and returns a vector of dependent HMatrix pointers. + * + * Cases handled: + * 1. If the input HMatrix is present in L0, it directly returns the HMatrix. + * 2. If the input HMatrix is an ancestor of any node in L0, it returns those descendant nodes from L0. + * 3. If the input HMatrix is a descendant of any node in L0, it returns the unique ancestor from L0. + * 4. If none of the above cases are satisfied, it logs an error indicating an unexpected state. + * + * @param hmatrix The hierarchical matrix for which dependencies are to be enumerated. + * @param L0 A vector of hierarchical matrix pointers representing a set of nodes. + * @return A vector of pointers to the hierarchical matrices that are dependencies of the input matrix. + */ + +template > +std::vector *> enumerate_dependences(const HMatrix &hmatrix, const std::vector *> &L0) { + // Case 1 : hmatrix is in L0 + if (std::find(L0.begin(), L0.end(), &hmatrix) != L0.end()) { + return {&hmatrix}; + } + + // Case 2 : hmatrix is above L0. Find descendants of hmatrix in L0 + std::vector *> children; + for (const auto &hmatrix_on_L0 : L0) { + if (left_hmatrix_ancestor_of_right_hmatrix(hmatrix, *hmatrix_on_L0)) { + children.push_back(hmatrix_on_L0); + } + } + if (!children.empty()) { + return children; + } + + // Case 3 : hmatrix is below L0. Find the unique ancestor of hmatrix in L0 + + const auto it = std::find_if(L0.begin(), L0.end(), [&hmatrix](HMatrix *hmatrix_in_L0) { + return left_hmatrix_descendant_of_right_hmatrix(hmatrix, *hmatrix_in_L0); + }); + + if (it != L0.end()) { + return {*it}; + } + + // Case 4 : error + Logger::get_instance().log(LogLevel::ERROR, "No dependence found with L0. It should not happen."); + return {}; +} + +// overload for non-const HMatrix pointers +template > +std::vector *> enumerate_dependences(HMatrix &hmatrix, const std::vector *> &L0) { + // Case 1 : hmatrix is in L0 + if (std::find(L0.begin(), L0.end(), &hmatrix) != L0.end()) { + return {&hmatrix}; + } + + // Case 2 : hmatrix is above L0. Find descendants of hmatrix in L0 + std::vector *> children; + for (auto &hmatrix_on_L0 : L0) { + if (left_hmatrix_ancestor_of_right_hmatrix(hmatrix, *hmatrix_on_L0)) { + children.push_back(hmatrix_on_L0); + } + } + if (!children.empty()) { + return children; + } + + // Case 3 : hmatrix is below L0. Find the unique ancestor of hmatrix in L0 + + const auto it = std::find_if(L0.begin(), L0.end(), [&hmatrix](HMatrix *hmatrix_in_L0) { + return left_hmatrix_descendant_of_right_hmatrix(hmatrix, *hmatrix_in_L0); + }); + + if (it != L0.end()) { + return {*it}; + } + + // Case 4 : error + Logger::get_instance().log(LogLevel::ERROR, "No dependence found with L0. It should not happen."); + return {}; +} + +// overload for const Cluster pointers +template +std::vector *> enumerate_dependences(const Cluster &cluster, const std::vector *> &L0) { + // Case 1 : cluster is in L0 + if (std::find(L0.begin(), L0.end(), &cluster) != L0.end()) { + return {&cluster}; + } + + // Case 2 : cluster is above L0. Find descendants of cluster in L0 + std::vector *> children; + for (const auto &cluster_on_L0 : L0) { + if (left_cluster_contains_right_cluster(cluster, *cluster_on_L0)) { + children.push_back(cluster_on_L0); + } + } + if (!children.empty()) { + return children; + } + + // Case 3 : cluster is below L0. Find the unique ancestor of cluster in L0 + const auto it = std::find_if(L0.begin(), L0.end(), [&cluster](const Cluster *cluster_in_L0) { + return left_cluster_contains_right_cluster(*cluster_in_L0, cluster); + }); + if (it != L0.end()) { + return {*it}; + } + + // Case 4 : error + Logger::get_instance().log(LogLevel::ERROR, "No dependence found with L0. It should not happen."); + return {}; +} + +} // namespace htool + +#endif \ No newline at end of file diff --git a/include/htool/hmatrix/tree_builder/tree_builder.hpp b/include/htool/hmatrix/tree_builder/tree_builder.hpp index d08a746b..3d3d845f 100644 --- a/include/htool/hmatrix/tree_builder/tree_builder.hpp +++ b/include/htool/hmatrix/tree_builder/tree_builder.hpp @@ -10,14 +10,16 @@ #include "htool/clustering/cluster_node.hpp" // for left... #include "htool/hmatrix/interfaces/virtual_admissibility_condition.hpp" #include "htool/hmatrix/interfaces/virtual_dense_blocks_generator.hpp" -#include "htool/hmatrix/interfaces/virtual_lrmat_generator.hpp" // for Virt... -#include "htool/misc/misc.hpp" // for unde... -#include // for fill_n -#include // for dura... -#include // for shar... -#include // for stack -#include // for basi... -#include // for vector +#include "htool/hmatrix/interfaces/virtual_lrmat_generator.hpp" // for Virt... +#include "htool/misc/misc.hpp" // for unde... +#include // for fill_n +#include // for dura... +#include // for enumerate_dependence, find_l0... +#include // for list +#include // for shar... +#include // for stack +#include // for basi... +#include // for vector namespace htool { @@ -34,27 +36,34 @@ class HMatrixTreeBuilder { using ClusterType = Cluster; // Parameters - const ClusterType &m_target_root_cluster, &m_source_root_cluster; underlying_type m_epsilon{1e-6}; CoordinatePrecision m_eta{10}; int m_minsourcedepth{0}; int m_mintargetdepth{0}; int m_reqrank{-1}; - int m_target_partition_number{-1}; char m_symmetry_type{'N'}; char m_UPLO_type{'N'}; - int m_partition_number_for_symmetry{-1}; - // Views + // Cached information during build mutable std::vector m_admissible_tasks{}; mutable std::vector m_dense_tasks{}; + mutable int m_target_partition_number{-1}; + mutable int m_partition_number_for_symmetry{-1}; + mutable const Cluster *m_target_root_cluster{nullptr}; + mutable const Cluster *m_source_root_cluster{nullptr}; + mutable std::shared_ptr> m_used_low_rank_generator{nullptr}; // Information mutable int m_false_positive{0}; + mutable std::vector m_L0; + + // Internal storage for adapting user generator + mutable std::list> m_internal_generators; // using list to get pointer stability // Strategies - std::shared_ptr> m_low_rank_generator; + std::shared_ptr> m_internal_low_rank_generator{nullptr}; + std::shared_ptr> m_low_rank_generator{nullptr}; std::shared_ptr> m_admissibility_condition; std::shared_ptr> m_dense_blocks_generator; bool m_is_block_tree_consistent{true}; @@ -63,8 +72,21 @@ class HMatrixTreeBuilder { void build_block_tree(HMatrixType *current_hmatrix) const; void reset_root_of_block_tree(HMatrixType &) const; void compute_blocks(const VirtualInternalGenerator &generator) const; + void task_based_compute_blocks(const VirtualInternalGenerator &generator, const std::vector *> &L0) const; // Tests + void check_inputs() { + if (!((m_symmetry_type == 'N' || m_symmetry_type == 'H' || m_symmetry_type == 'S') + && (m_UPLO_type == 'N' || m_UPLO_type == 'L' || m_UPLO_type == 'U') + && ((m_symmetry_type == 'N' && m_UPLO_type == 'N') || (m_symmetry_type != 'N' && m_UPLO_type != 'N')) + && ((m_symmetry_type == 'H' && is_complex()) || m_symmetry_type != 'H'))) { + std::string error_message = "[Htool error] Invalid arguments to create HMatrix: m_symmetry_type="; + error_message.push_back(m_symmetry_type); + error_message += " and m_UPLO_type="; + error_message.push_back(m_UPLO_type); + htool::Logger::get_instance().log(LogLevel::ERROR, error_message); // LCOV_EXCL_LINE + } + } bool is_target_cluster_in_target_partition(const ClusterType &cluster) const { return (m_target_partition_number == -1) ? true : (m_target_partition_number == cluster.get_rank()); } @@ -73,14 +95,14 @@ class HMatrixTreeBuilder { && ((m_UPLO_type == 'U' && target_cluster.get_offset() >= (source_cluster.get_offset() + source_cluster.get_size()) && ((m_partition_number_for_symmetry == -1) - || (source_cluster.get_offset() >= m_source_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() and m_target_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() <= target_cluster.get_offset() && target_cluster.get_offset() + target_cluster.get_size() <= m_target_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() + m_target_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_size())) + || (source_cluster.get_offset() >= m_source_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() and m_target_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() <= target_cluster.get_offset() && target_cluster.get_offset() + target_cluster.get_size() <= m_target_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() + m_target_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_size())) // && ((m_target_partition_number != -1) // || source_cluster.get_offset() >= target_cluster.get_offset()) ) || (m_UPLO_type == 'L' && source_cluster.get_offset() >= (target_cluster.get_offset() + target_cluster.get_size()) && ((m_partition_number_for_symmetry == -1) - || (source_cluster.get_offset() < m_source_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() + m_source_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_size() && m_target_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() <= target_cluster.get_offset() && target_cluster.get_offset() + target_cluster.get_size() <= m_target_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() + m_target_root_cluster.get_clusters_on_partition()[m_partition_number_for_symmetry]->get_size())) + || (source_cluster.get_offset() < m_source_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() + m_source_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_size() && m_target_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() <= target_cluster.get_offset() && target_cluster.get_offset() + target_cluster.get_size() <= m_target_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_offset() + m_target_root_cluster->get_clusters_on_partition()[m_partition_number_for_symmetry]->get_size())) // && ((m_target_partition_number != -1) // || source_cluster.get_offset() < target_cluster.get_offset() + target_cluster.get_size()) )); @@ -91,10 +113,10 @@ class HMatrixTreeBuilder { const auto &source_cluster = hmatrix.get_source_cluster(); return (is_there_a_target_partition - && (target_cluster == *m_target_root_cluster.get_clusters_on_partition()[m_target_partition_number]) - && source_cluster == *m_source_root_cluster.get_clusters_on_partition()[m_target_partition_number]) + && (target_cluster == *m_target_root_cluster->get_clusters_on_partition()[m_target_partition_number]) + && source_cluster == *m_source_root_cluster->get_clusters_on_partition()[m_target_partition_number]) || (!is_there_a_target_partition - && target_cluster == m_target_root_cluster + && target_cluster == *m_target_root_cluster && target_cluster == source_cluster); } @@ -124,25 +146,41 @@ class HMatrixTreeBuilder { }); } } + std::pair *>, std::vector *>> get_leaves_from(HMatrix &hmatrix) const { + std::pair *>, std::vector *>> result; + auto &leaves = result.first; + auto &leaves_for_symmetry = result.second; + std::stack *, bool>> hmatrix_stack; + hmatrix_stack.push(std::pair *, bool>(&hmatrix, hmatrix.get_symmetry() != 'N')); + + while (!hmatrix_stack.empty()) { + auto ¤t_element = hmatrix_stack.top(); + hmatrix_stack.pop(); + HMatrix *current_hmatrix = current_element.first; + bool has_symmetric_ancestor = current_element.second; + + if (current_hmatrix->is_leaf()) { + leaves.push_back(current_hmatrix); + + if (has_symmetric_ancestor && current_hmatrix->get_target_cluster().get_offset() != current_hmatrix->get_source_cluster().get_offset()) { + leaves_for_symmetry.push_back(current_hmatrix); + } + } - public: - explicit HMatrixTreeBuilder(const ClusterType &root_cluster_tree_target, const ClusterType &root_source_cluster_tree, underlying_type epsilon, CoordinatePrecision eta, char symmetry, char UPLO, int reqrank, int target_partition_number, int partition_number_for_symmetry) : m_target_root_cluster(root_cluster_tree_target), m_source_root_cluster(root_source_cluster_tree), m_epsilon(epsilon), m_eta(eta), m_reqrank(reqrank), m_target_partition_number(target_partition_number), m_symmetry_type(symmetry), m_UPLO_type(UPLO), m_partition_number_for_symmetry(partition_number_for_symmetry), m_low_rank_generator(std::make_shared>()), m_admissibility_condition(std::make_shared>()) { - if (!((m_symmetry_type == 'N' || m_symmetry_type == 'H' || m_symmetry_type == 'S') - && (m_UPLO_type == 'N' || m_UPLO_type == 'L' || m_UPLO_type == 'U') - && ((m_symmetry_type == 'N' && m_UPLO_type == 'N') || (m_symmetry_type != 'N' && m_UPLO_type != 'N')) - && ((m_symmetry_type == 'H' && is_complex()) || m_symmetry_type != 'H'))) { - std::string error_message = "[Htool error] Invalid arguments to create HMatrix: m_symmetry_type="; - error_message.push_back(m_symmetry_type); - error_message += " and m_UPLO_type="; - error_message.push_back(m_UPLO_type); - htool::Logger::get_instance().log(LogLevel::ERROR, error_message); // LCOV_EXCL_LINE - } - if (target_partition_number != -1 && target_partition_number >= m_target_root_cluster.get_clusters_on_partition().size()) { - htool::Logger::get_instance().log(LogLevel::ERROR, "Target partition number cannot exceed number of partitions"); // LCOV_EXCL_LINE - } - if (partition_number_for_symmetry != -1 && partition_number_for_symmetry >= m_target_root_cluster.get_clusters_on_partition().size()) { - htool::Logger::get_instance().log(LogLevel::ERROR, "Partition number for symmetry cannot exceed number of partitions"); // LCOV_EXCL_LINE + for (auto &child : current_hmatrix->get_children()) { + hmatrix_stack.push(std::pair *, bool>(child.get(), current_hmatrix->get_symmetry() != 'N' || has_symmetric_ancestor)); + } } + return result; + } + + public: + explicit HMatrixTreeBuilder(underlying_type epsilon, CoordinatePrecision eta, char symmetry, char UPLO, int reqrank, std::shared_ptr> low_rank_strategy, std::shared_ptr> admissibility_condition_strategy = nullptr) : m_epsilon(epsilon), m_eta(eta), m_reqrank(reqrank), m_symmetry_type(symmetry), m_UPLO_type(UPLO), m_internal_low_rank_generator(low_rank_strategy), m_admissibility_condition(admissibility_condition_strategy ? admissibility_condition_strategy : std::make_shared>()) { + check_inputs(); + } + + explicit HMatrixTreeBuilder(underlying_type epsilon, CoordinatePrecision eta, char symmetry, char UPLO, int reqrank = -1, std::shared_ptr> low_rank_strategy = nullptr, std::shared_ptr> admissibility_condition_strategy = nullptr) : m_epsilon(epsilon), m_eta(eta), m_reqrank(reqrank), m_symmetry_type(symmetry), m_UPLO_type(UPLO), m_low_rank_generator(low_rank_strategy), m_admissibility_condition(admissibility_condition_strategy ? admissibility_condition_strategy : std::make_shared>()) { + check_inputs(); } HMatrixTreeBuilder(const HMatrixTreeBuilder &) = delete; @@ -152,13 +190,54 @@ class HMatrixTreeBuilder { virtual ~HMatrixTreeBuilder() = default; // Build - HMatrixType build(const VirtualInternalGenerator &generator) const; - HMatrixType build(const VirtualGenerator &generator) const { - return this->build(InternalGeneratorWithPermutation(generator, m_target_root_cluster.get_permutation().data(), m_source_root_cluster.get_permutation().data())); + HMatrixType build(const VirtualInternalGenerator &generator, const ClusterType &target_root_cluster_tree, const ClusterType &source_root_cluster_tree, int target_partition_number = -1, int partition_number_for_symmetry = -1, bool is_task_based = false, const size_t nb_nodes_max = 0) const; + + HMatrixType build(const VirtualGenerator &generator, const ClusterType &target_root_cluster_tree, const ClusterType &source_root_cluster_tree, int target_partition_number = -1, int partition_number_for_symmetry = -1, bool is_task_based = false, const size_t nb_nodes_max = 0) const { + m_internal_generators.emplace_back(generator, target_root_cluster_tree.get_permutation().data(), source_root_cluster_tree.get_permutation().data()); + return this->build(m_internal_generators.back(), target_root_cluster_tree, source_root_cluster_tree, target_partition_number, partition_number_for_symmetry, is_task_based, nb_nodes_max); } + // HMatrixType build(const VirtualInternalGenerator &generator, const ClusterType &target_root_cluster_tree, const ClusterType &source_root_cluster_tree, int target_partition_number) const { + // return this->build(generator, target_root_cluster_tree, source_root_cluster_tree, target_partition_number, target_partition_number); + // } + // HMatrixType build(const VirtualInternalGenerator &generator, const ClusterType &target_root_cluster_tree, const ClusterType &source_root_cluster_tree) const { + // return this->build(generator, target_root_cluster_tree, source_root_cluster_tree, -1, -1); + // } + + // HMatrixType build(const VirtualGenerator &generator, const ClusterType &target_root_cluster_tree, const ClusterType &source_root_cluster_tree, int target_partition_number, int partition_number_for_symmetry) const { + // return this->build(InternalGeneratorWithPermutation(generator, target_root_cluster_tree.get_permutation().data(), source_root_cluster_tree.get_permutation().data()), target_root_cluster_tree, source_root_cluster_tree, target_partition_number, partition_number_for_symmetry); + // } + // HMatrixType build(const VirtualGenerator &generator, const ClusterType &target_root_cluster_tree, const ClusterType &source_root_cluster_tree, int target_partition_number) const { + // return this->build(InternalGeneratorWithPermutation(generator, target_root_cluster_tree.get_permutation().data(), source_root_cluster_tree.get_permutation().data()), target_root_cluster_tree, source_root_cluster_tree, target_partition_number, target_partition_number); + // } + // HMatrixType build(const VirtualGenerator &generator, const ClusterType &target_root_cluster_tree, const ClusterType &source_root_cluster_tree) const { + // return this->build(InternalGeneratorWithPermutation(generator, target_root_cluster_tree.get_permutation().data(), source_root_cluster_tree.get_permutation().data()), target_root_cluster_tree, source_root_cluster_tree, -1, -1); + // ======= + // HMatrixType build(const VirtualInternalGenerator &generator, bool is_task_based = false, const size_t nb_nodes_max = 64) const; + // HMatrixType build(const VirtualGenerator &generator, bool is_task_based = false, const size_t nb_nodes_max = 64) const { + // m_internal_generators.emplace_back(generator, m_target_root_cluster.get_permutation().data(), m_source_root_cluster.get_permutation().data()); + // return this->build(m_internal_generators.back(), is_task_based); + // >>>>>>> 1394159 (first commit task based) + // } + // Setters - void set_low_rank_generator(std::shared_ptr> ptr) { m_low_rank_generator = ptr; } + void + set_symmetry(char symmetry_type) { + m_symmetry_type = symmetry_type; + check_inputs(); + } + void set_UPLO(char UPLO_type) { + m_UPLO_type = UPLO_type; + check_inputs(); + } + void set_low_rank_generator(std::shared_ptr> ptr) { + m_internal_low_rank_generator.reset(); + m_low_rank_generator = ptr; + } + void set_low_rank_generator(std::shared_ptr> ptr) { + m_low_rank_generator.reset(); + m_internal_low_rank_generator = ptr; + } void set_admissibility_condition(std::shared_ptr> ptr) { m_admissibility_condition = ptr; } void set_minimal_source_depth(int minimal_source_depth) { m_minsourcedepth = minimal_source_depth; } void set_minimal_target_depth(int minimal_target_depth) { m_mintargetdepth = minimal_target_depth; } @@ -171,18 +250,60 @@ class HMatrixTreeBuilder { } // Getters + int get_false_positive() const { return m_false_positive; } char get_symmetry() const { return m_symmetry_type; } char get_UPLO() const { return m_UPLO_type; } - const Cluster &get_target_cluster() const { return m_target_root_cluster; } - const Cluster &get_source_cluster() const { return m_source_root_cluster; } + double get_epsilon() const { return m_epsilon; } + double get_eta() const { return m_eta; } + std::shared_ptr> get_internal_low_rank_generator() const { return m_internal_low_rank_generator; } + std::shared_ptr> get_low_rank_generator() const { return m_low_rank_generator; } + std::vector get_L0() const { return m_L0; } }; template -HMatrix HMatrixTreeBuilder::build(const VirtualInternalGenerator &generator) const { +HMatrix HMatrixTreeBuilder::build(const VirtualInternalGenerator &generator, const ClusterType &root_target_cluster_tree, const ClusterType &root_source_cluster_tree, int target_partition_number, int partition_number_for_symmetry, bool is_task_based, const size_t nb_nodes_max) const { + + if (target_partition_number != -1 && target_partition_number >= root_target_cluster_tree.get_clusters_on_partition().size()) { + htool::Logger::get_instance().log(LogLevel::ERROR, "Target partition number cannot exceed number of partitions"); // LCOV_EXCL_LINE + } + if (partition_number_for_symmetry != -1 && partition_number_for_symmetry >= root_target_cluster_tree.get_clusters_on_partition().size()) { + htool::Logger::get_instance().log(LogLevel::ERROR, "Partition number for symmetry cannot exceed number of partitions"); // LCOV_EXCL_LINE + } + + // Cached information + m_target_root_cluster = &root_target_cluster_tree; + m_source_root_cluster = &root_source_cluster_tree; + m_target_partition_number = target_partition_number; + m_partition_number_for_symmetry = partition_number_for_symmetry; + if (!m_internal_low_rank_generator && !m_low_rank_generator) { + m_used_low_rank_generator = std::make_shared>(generator); + } else if (!m_internal_low_rank_generator) { + m_used_low_rank_generator = std::make_shared>(*m_low_rank_generator, root_target_cluster_tree.get_permutation().data(), root_source_cluster_tree.get_permutation().data()); + } else { + m_used_low_rank_generator = m_internal_low_rank_generator; + } + m_admissible_tasks.clear(); + m_dense_tasks.clear(); + m_false_positive = 0; + m_L0.clear(); + // ======= + // const Cluster &get_target_cluster() const { return m_target_root_cluster; } + // const Cluster &get_source_cluster() const { return m_source_root_cluster; } + // }; + + // template + // HMatrix HMatrixTreeBuilder::build(const VirtualInternalGenerator &generator, bool is_task_based, const size_t nb_nodes_max) const { + // // Clear HMatrix tree data + // m_admissible_tasks.clear(); + // m_dense_tasks.clear(); + // m_false_positive = 0; + // m_L0.clear(); + // >>>>>>> 1394159 (first commit task based) + // Create root hmatrix - HMatrixType root_hmatrix(m_target_root_cluster, m_source_root_cluster); + HMatrixType root_hmatrix(root_target_cluster_tree, root_source_cluster_tree); root_hmatrix.set_admissibility_condition(m_admissibility_condition); - root_hmatrix.set_low_rank_generator(m_low_rank_generator); + root_hmatrix.set_low_rank_generator(m_used_low_rank_generator); root_hmatrix.set_eta(m_eta); root_hmatrix.set_epsilon(m_epsilon); root_hmatrix.set_minimal_target_depth(m_mintargetdepth); @@ -191,30 +312,34 @@ HMatrix HMatrixTreeBuilder block_tree_build_duration = end - start; // Compute leave's data - start = std::chrono::steady_clock::now(); - compute_blocks(generator); - end = std::chrono::steady_clock::now(); - - std::chrono::duration block_comptations_duration = end - start; + if (is_task_based) { + // std::vector *> L0; + m_L0 = find_l0(root_hmatrix, nb_nodes_max); // TODO: make this parameterizable + task_based_compute_blocks(generator, m_L0); + } else { + start = std::chrono::steady_clock::now(); + compute_blocks(generator); + end = std::chrono::steady_clock::now(); + + // Set information + std::chrono::duration block_comptations_duration = end - start; + root_hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] = NbrToStr(m_false_positive); + root_hmatrix.get_hmatrix_tree_data()->m_timings["Blocks_computation_walltime"] = block_comptations_duration; + } + root_hmatrix.get_hmatrix_tree_data()->m_timings["Block_tree_walltime"] = block_tree_build_duration; - // set_symmetry_for_leaves(root_hmatrix); - // Set information - root_hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] = NbrToStr(m_false_positive); - root_hmatrix.get_hmatrix_tree_data()->m_timings["Block_tree_walltime"] = block_tree_build_duration; - root_hmatrix.get_hmatrix_tree_data()->m_timings["Blocks_computation_walltime"] = block_comptations_duration; - return root_hmatrix; } @@ -383,9 +508,9 @@ void HMatrixTreeBuilder::compute_bloc # pragma omp for schedule(guided) nowait #endif for (int p = 0; p < m_admissible_tasks.size(); p++) { - m_admissible_tasks[p]->compute_low_rank_data(generator, *m_low_rank_generator, m_reqrank, m_epsilon); + bool has_low_rank_approximation_succeded = m_admissible_tasks[p]->compute_low_rank_data(*m_used_low_rank_generator, m_reqrank, m_epsilon); // local_low_rank_leaves.emplace_back(m_admissible_tasks[p]); - if (m_admissible_tasks[p]->get_low_rank_data()->get_U().nb_rows() != m_admissible_tasks[p]->get_target_cluster().get_size() || m_admissible_tasks[p]->get_low_rank_data()->get_V().nb_cols() != m_admissible_tasks[p]->get_source_cluster().get_size()) { + if (!has_low_rank_approximation_succeded) { // local_low_rank_leaves.pop_back(); m_admissible_tasks[p]->clear_low_rank_data(); // if (m_dense_blocks_generator.get() == nullptr) { @@ -465,12 +590,10 @@ void HMatrixTreeBuilder::compute_bloc // template // void block_tree_preorder_traversal(const BlockTree &block_tree, PreOrderFunction pre_order_visitor) { // std::stack *> hmatrix_stack{std::deque *>{block_tree.get_root_hmatrix()}}; - // while (!hmatrix_stack.empty()) { // const HMatrix *current_hmatrix = hmatrix_stack.top(); // hmatrix_stack.pop(); // pre_order_visitor(*current_hmatrix, block_tree); - // const auto &children = current_hmatrix->get_children(); // for (auto child = children.rbegin(); child != children.rend(); child++) { // hmatrix_stack.push(*child); @@ -478,5 +601,52 @@ void HMatrixTreeBuilder::compute_bloc // } // } +template +void HMatrixTreeBuilder::task_based_compute_blocks(const VirtualInternalGenerator &generator, const std::vector *> &L0) const { + + // int max_prio = std::max(0, omp_get_max_task_priority()); + for (int p = 0; p < L0.size(); p++) { + +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp task default(none) \ + firstprivate(p, m_reqrank, m_epsilon) \ + shared(generator, m_false_positive, m_low_rank_generator, L0, m_admissible_tasks, m_dense_tasks) \ + depend(out : *L0[p]) + // priority(max_prio - 2) +#endif + { + std::vector *> leaves; + std::vector *> leaves_for_symmetry; + std::tie(leaves, leaves_for_symmetry) = get_leaves_from(*L0[p]); // C++17 structured binding + for (auto leaf : leaves) { + // check if leaf is removed by symmetry + if (!is_removed_by_symmetry(leaf->get_target_cluster(), leaf->get_source_cluster())) { + + // check if leaf is in m_admissible_tasks + auto it_admissible = std::find(m_admissible_tasks.begin(), m_admissible_tasks.end(), leaf); + if (it_admissible != m_admissible_tasks.end()) { + bool has_low_rank_approximation_succeded = leaf->compute_low_rank_data(*m_used_low_rank_generator, m_reqrank, m_epsilon); + + if (!has_low_rank_approximation_succeded) { + leaf->clear_low_rank_data(); + leaf->compute_dense_data(generator); + + m_false_positive += 1; + } + + } else { + // check if leaf is in m_dense_tasks + auto it_dense = std::find(m_dense_tasks.begin(), m_dense_tasks.end(), leaf); + if (it_dense != m_dense_tasks.end()) { + leaf->compute_dense_data(generator); + } + } + } + } + // Todo m_dense_blocks_generator + } + } +} + } // namespace htool #endif diff --git a/include/htool/hmatrix/utility.hpp b/include/htool/hmatrix/utility.hpp new file mode 100644 index 00000000..eea1746d --- /dev/null +++ b/include/htool/hmatrix/utility.hpp @@ -0,0 +1,31 @@ +#ifndef HTOOL_HMATRIX_BUILDER_HPP +#define HTOOL_HMATRIX_BUILDER_HPP + +#include "../clustering/cluster_node.hpp" +#include "../clustering/tree_builder/tree_builder.hpp" +#include "hmatrix.hpp" +#include "tree_builder/tree_builder.hpp" + +namespace htool { + +template > +class HMatrixBuilder { + public: + Cluster target_cluster; + Cluster source_cluster; // use an optional source cluster + + HMatrixBuilder(int target_number_of_points, int target_spatial_dimension, const CoordinatesPrecision *target_coordinates, const ClusterTreeBuilder *target_cluster_tree_builder, int source_number_of_points, int source_spatial_dimension, const CoordinatesPrecision *source_coordinates, const ClusterTreeBuilder *source_cluster_tree_builder) : target_cluster(target_cluster_tree_builder == nullptr ? ClusterTreeBuilder().create_cluster_tree(target_number_of_points, target_spatial_dimension, target_coordinates, 2, 2) : target_cluster_tree_builder->create_cluster_tree(target_number_of_points, target_spatial_dimension, target_coordinates, 2, 2)), source_cluster(source_cluster_tree_builder == nullptr ? ClusterTreeBuilder().create_cluster_tree(source_number_of_points, source_spatial_dimension, source_coordinates, 2, 2) : source_cluster_tree_builder->create_cluster_tree(source_number_of_points, source_spatial_dimension, source_coordinates, 2, 2)) { + } + + HMatrixBuilder(int target_number_of_points, int target_spatial_dimension, const CoordinatesPrecision *target_coordinates, int source_number_of_points, int source_spatial_dimension, const CoordinatesPrecision *source_coordinates) : HMatrixBuilder(target_number_of_points, target_spatial_dimension, target_coordinates, nullptr, source_number_of_points, source_spatial_dimension, source_coordinates, nullptr) { + } + + HMatrixBuilder(int number_of_points, int spatial_dimension, const CoordinatesPrecision *coordinates) : HMatrixBuilder(number_of_points, spatial_dimension, coordinates, nullptr, number_of_points, spatial_dimension, coordinates, nullptr) { + } + + HMatrix build(const VirtualGenerator &generator, const HMatrixTreeBuilder &hmatrix_tree_builder) { + return hmatrix_tree_builder.build(generator, target_cluster, source_cluster); + } +}; +} // namespace htool +#endif diff --git a/include/htool/hmatrix/utils/recompression.hpp b/include/htool/hmatrix/utils/recompression.hpp new file mode 100644 index 00000000..68852336 --- /dev/null +++ b/include/htool/hmatrix/utils/recompression.hpp @@ -0,0 +1,33 @@ +#ifndef HTOOL_HMATRIX_UTILS_RECOMPRESSION_HPP +#define HTOOL_HMATRIX_UTILS_RECOMPRESSION_HPP +#include "../hmatrix.hpp" +#include "../lrmat/utils/SVD_recompression.hpp" +namespace htool { + +template , class Recompression = decltype(SVD_recompression(std::declval &>()))(LowRankMatrix &)> +void recompression(HMatrix &hmatrix, Recompression recompression = &SVD_recompression) { + auto leaves = get_low_rank_leaves_from(hmatrix); // C++17 structured binding + for (auto &leaf : leaves) { + recompression(*leaf->get_low_rank_data()); + } +} + +template , class Recompression = decltype(SVD_recompression(std::declval &>()))(LowRankMatrix &)> +void openmp_recompression(HMatrix &hmatrix, Recompression recompression = &SVD_recompression) { + auto leaves = get_low_rank_leaves_from(hmatrix); // C++17 structured binding + +#if defined(_OPENMP) +# pragma omp parallel +#endif + { +#if defined(_OPENMP) +# pragma omp for schedule(guided) nowait +#endif + for (int i = 0; i < leaves.size(); i++) { + recompression(*leaves[i]->get_low_rank_data()); + } + } +} +} // namespace htool + +#endif diff --git a/include/htool/htool.hpp b/include/htool/htool.hpp index 9be47f90..2de0a2ec 100644 --- a/include/htool/htool.hpp +++ b/include/htool/htool.hpp @@ -5,7 +5,8 @@ #include "misc/define.hpp" #include "basic_types/vector.hpp" -#include "clustering/clustering.hpp" +#include "clustering/cluster_output.hpp" +#include "clustering/tree_builder/tree_builder.hpp" #include "distributed_operator/distributed_operator.hpp" #include "distributed_operator/utility.hpp" #include "hmatrix/hmatrix.hpp" diff --git a/include/htool/matrix/linalg/add_matrix_matrix_product.hpp b/include/htool/matrix/linalg/add_matrix_matrix_product.hpp index 4db4e083..e8bdfcdd 100644 --- a/include/htool/matrix/linalg/add_matrix_matrix_product.hpp +++ b/include/htool/matrix/linalg/add_matrix_matrix_product.hpp @@ -1,6 +1,7 @@ #ifndef HTOOL_MATRIX_LINALG_ADD_MATRIX_MATRIX_PRODUCT_HPP #define HTOOL_MATRIX_LINALG_ADD_MATRIX_MATRIX_PRODUCT_HPP +#include "../../matrix/linalg/scale.hpp" // for scale #include "../../matrix/matrix.hpp" // for Matrix #include "../../wrappers/wrapper_blas.hpp" // for Blas namespace htool { @@ -36,7 +37,11 @@ void add_matrix_matrix_product(char transa, char transb, T alpha, const Matrix::gemm(&transa, &transb, &M, &N, &K, &alpha, A.data(), &lda, B.data(), &ldb, &beta, C.data(), &ldc); + if (ldb > 0) { // AB is zero + Blas::gemm(&transa, &transb, &M, &N, &K, &alpha, A.data(), &lda, B.data(), &ldb, &beta, C.data(), &ldc); + } else { + scale(beta, C); + } // add_matrix_matrix_product(transa, transb, alpha, A, B.data(), beta, C.data(), C.nb_cols()); } @@ -108,12 +113,17 @@ void add_symmetric_matrix_matrix_product(char side, char UPLO, T alpha, const Ma template void add_hermitian_matrix_matrix_product(char side, char UPLO, T alpha, const Matrix &A, const Matrix &B, T beta, Matrix &C) { + add_symmetric_matrix_matrix_product(side, UPLO, alpha, A, B, beta, C); +} + +template +void add_hermitian_matrix_matrix_product(char side, char UPLO, std::complex alpha, const Matrix> &A, const Matrix> &B, std::complex beta, Matrix> &C) { int M = C.nb_rows(); int N = C.nb_cols(); int lda = (side == 'L') ? M : N; int ldb = M; int ldc = M; - Blas::hemm(&side, &UPLO, &M, &N, &alpha, A.data(), &lda, B.data(), &ldb, &beta, C.data(), &ldc); + Blas>::hemm(&side, &UPLO, &M, &N, &alpha, A.data(), &lda, B.data(), &ldb, &beta, C.data(), &ldc); } } // namespace htool diff --git a/include/htool/matrix/linalg/add_matrix_matrix_product_row_major.hpp b/include/htool/matrix/linalg/add_matrix_matrix_product_row_major.hpp index 762cfd70..8ea72234 100644 --- a/include/htool/matrix/linalg/add_matrix_matrix_product_row_major.hpp +++ b/include/htool/matrix/linalg/add_matrix_matrix_product_row_major.hpp @@ -138,53 +138,6 @@ void add_hermitian_matrix_matrix_product_row_major(char side, char UPLO, std::co } } -// template -// void add_matrix_matrix_product_symmetric_row_major(char side, char transa, char transb, std::complex alpha, const Matrix> &A, const std::complex *in, std::complex beta, std::complex *out, const int &mu, char UPLO, char symmetry) { -// int nr = A.nb_rows(); - -// if (transb != 'N') { -// htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not implemented for add_matrix_matrix_product_symmetric (transb=" + std::string(1, transb) + ")"); // LCOV_EXCL_LINE -// } - -// if (side != 'L') { -// htool::Logger::get_instance().log(LogLevel::ERROR, "Operation is not implemented for add_matrix_matrix_product_symmetric (side=" + std::string(1, side) + ")"); // LCOV_EXCL_LINE -// } - -// if (nr) { -// int lda = nr; -// char inverted_side = 'R'; -// int M = mu; -// int N = nr; -// int ldb = mu; -// int ldc = mu; - -// if (symmetry == 'S' && (transa == 'N' || transa == 'T')) { -// Blas>::symm(&inverted_side, &UPLO, &M, &N, &alpha, A.data(), &lda, in, &ldb, &beta, out, &ldc); -// } else if (symmetry == 'H' && transa == 'T') { -// Blas>::hemm(&inverted_side, &UPLO, &M, &N, &alpha, A.data(), &lda, in, &ldb, &beta, out, &ldc); -// } else if (symmetry == 'S' && transa == 'C') { -// std::vector> conjugate_in(nr * mu); -// std::complex conjugate_alpha = std::conj(alpha); -// std::complex conjugate_beta = std::conj(beta); -// std::transform(in, in + nr * mu, conjugate_in.data(), [](const std::complex &c) { return std::conj(c); }); -// conj_if_complex>(out, A.nb_cols() * mu); -// Blas>::symm(&inverted_side, &UPLO, &M, &N, &conjugate_alpha, A.data(), &lda, conjugate_in.data(), &ldb, &conjugate_beta, out, &ldc); -// conj_if_complex>(out, A.nb_cols() * mu); -// } else if (symmetry == 'H' && (transa == 'N' || transa == 'C')) { -// std::vector> conjugate_in(nr * mu); -// std::complex conjugate_alpha = std::conj(alpha); -// std::complex conjugate_beta = std::conj(beta); -// std::transform(in, in + nr * mu, conjugate_in.data(), [](const std::complex &c) { return std::conj(c); }); -// conj_if_complex>(out, A.nb_cols() * mu); -// Blas>::hemm(&inverted_side, &UPLO, &M, &N, &conjugate_alpha, A.data(), &lda, conjugate_in.data(), &ldb, &conjugate_beta, out, &ldc); -// conj_if_complex>(out, A.nb_cols() * mu); -// } else { -// htool::Logger::get_instance().log(LogLevel::ERROR, "Invalid arguments for add_matrix_product_symmetric_row_major: " + std::string(1, transa) + " with " + symmetry + ")\n"); // LCOV_EXCL_LINE -// // throw std::invalid_argument("[Htool error] Operation is not supported (" + std::string(1, trans) + " with " + symmetry + ")"); // LCOV_EXCL_LINE -// } -// } -// } - } // namespace htool #endif diff --git a/include/htool/matrix/linalg/factorization.hpp b/include/htool/matrix/linalg/factorization.hpp index 02619b03..3370b52e 100644 --- a/include/htool/matrix/linalg/factorization.hpp +++ b/include/htool/matrix/linalg/factorization.hpp @@ -42,13 +42,12 @@ void triangular_matrix_matrix_solve(char side, char UPLO, char transa, char diag int K1 = 1; int K2 = m; int incx = 1; - Blas::laswp(&n, B.data(), &ldb, &K1, &K2, ipiv.data(), &incx); + Lapack::laswp(&n, B.data(), &ldb, &K1, &K2, ipiv.data(), &incx); } else if (side == 'R' and transa != 'N') { - // C++17 std::swap_ranges - for (int i = 0; i < B.nb_rows(); i++) { - for (int j = 0; j < B.nb_cols(); j++) { - std::swap(B(i, ipiv[j] - 1), B(i, j)); - } + int incx = 1; + int incy = 1; + for (int j = 0; j < B.nb_cols(); j++) { + Blas::swap(&m, &B(0, ipiv[j] - 1), &incx, &B(0, j), &incy); } } } @@ -60,13 +59,12 @@ void triangular_matrix_matrix_solve(char side, char UPLO, char transa, char diag int K1 = 1; int K2 = m; int incx = -1; - Blas::laswp(&n, B.data(), &ldb, &K1, &K2, ipiv.data(), &incx); + Lapack::laswp(&n, B.data(), &ldb, &K1, &K2, ipiv.data(), &incx); } else if (side == 'R' and transa == 'N') { - // C++17 std::swap_ranges - for (int i = 0; i < B.nb_rows(); i++) { - for (int j = B.nb_cols() - 1; j >= 0; j--) { - std::swap(B(i, ipiv[j] - 1), B(i, j)); - } + int incx = 1; + int incy = 1; + for (int j = B.nb_cols() - 1; j >= 0; j--) { + Blas::swap(&m, &B(0, ipiv[j] - 1), &incx, &B(0, j), &incy); } } } diff --git a/include/htool/matrix/matrix.hpp b/include/htool/matrix/matrix.hpp index 9298e3bb..15a9b9fa 100644 --- a/include/htool/matrix/matrix.hpp +++ b/include/htool/matrix/matrix.hpp @@ -29,11 +29,13 @@ class Matrix { public: Matrix() : m_number_of_rows(0), m_number_of_cols(0), m_data(nullptr), m_is_owning_data(true), m_pivots(0) {} Matrix(int nbr, int nbc, T value = 0) : m_number_of_rows(nbr), m_number_of_cols(nbc), m_is_owning_data(true), m_pivots(0) { - m_data = new T[nbr * nbc]; - std::fill_n(m_data, nbr * nbc, value); + std::size_t size = std::size_t(nbr) * std::size_t(nbc); + m_data = size != 0 ? new T[size] : nullptr; + std::fill_n(m_data, std::size_t(nbr) * std::size_t(nbc), value); } Matrix(const Matrix &rhs) : m_number_of_rows(rhs.m_number_of_rows), m_number_of_cols(rhs.m_number_of_cols), m_is_owning_data(true), m_pivots(rhs.m_pivots) { - m_data = new T[rhs.m_number_of_rows * rhs.m_number_of_cols](); + std::size_t size = std::size_t(rhs.m_number_of_rows) * std::size_t(rhs.m_number_of_cols); + m_data = size != 0 ? new T[rhs.m_number_of_rows * rhs.m_number_of_cols]() : nullptr; std::copy_n(rhs.m_data, rhs.m_number_of_rows * rhs.m_number_of_cols, m_data); } @@ -41,17 +43,18 @@ class Matrix { if (&rhs == this) { return *this; } - if (m_number_of_rows * m_number_of_cols == rhs.m_number_of_cols * rhs.m_number_of_rows) { + std::size_t size = std::size_t(m_number_of_rows) * std::size_t(m_number_of_cols); + std::size_t size_rhs = std::size_t(rhs.m_number_of_rows) * std::size_t(rhs.m_number_of_cols); + if (size == size_rhs) { std::copy_n(rhs.m_data, m_number_of_rows * m_number_of_cols, m_data); m_number_of_rows = rhs.m_number_of_rows; m_number_of_cols = rhs.m_number_of_cols; - // m_is_owning_data = true; } else { m_number_of_rows = rhs.m_number_of_rows; m_number_of_cols = rhs.m_number_of_cols; if (m_is_owning_data) delete[] m_data; - m_data = new T[m_number_of_rows * m_number_of_cols](); + m_data = size_rhs != 0 ? new T[m_number_of_rows * m_number_of_cols]() : nullptr; std::copy_n(rhs.m_data, m_number_of_rows * m_number_of_cols, m_data); m_is_owning_data = true; } @@ -222,16 +225,20 @@ class Matrix { void resize(int nbr, int nbc, T value = 0) { if (m_data != nullptr and m_is_owning_data and m_number_of_rows * m_number_of_cols != nbr * nbc) { delete[] m_data; - m_data = nullptr; - m_data = new T[nbr * nbc]; + m_data = nullptr; + m_data = new T[nbr * nbc]; + m_is_owning_data = true; } else if (m_number_of_rows * m_number_of_cols != nbr * nbc) { - m_data = new T[nbr * nbc]; + m_data = new T[nbr * nbc]; + m_is_owning_data = true; + } else if (!m_is_owning_data and m_number_of_rows * m_number_of_cols == nbr * nbc) { + m_data = new T[nbr * nbc]; + m_is_owning_data = true; } m_number_of_rows = nbr; m_number_of_cols = nbc; std::fill_n(m_data, nbr * nbc, value); - m_is_owning_data = true; } //! ### Matrix-scalar product @@ -624,7 +631,7 @@ class Matrix { */ friend std::pair argmax(const Matrix &M) { int p = std::max_element(M.data(), M.data() + M.nb_cols() * M.nb_rows(), [](T a, T b) { return std::abs(a) < std::abs(b); }) - M.data(); - return std::pair(p % M.m_number_of_rows, (int)p / M.m_number_of_rows); + return std::pair(p % M.m_number_of_rows, p / M.m_number_of_rows); } //! ### Looking for the entry of maximal modulus @@ -640,9 +647,9 @@ class Matrix { } int rows = m_number_of_rows; int cols = m_number_of_cols; - out.write((char *)(&rows), sizeof(int)); - out.write((char *)(&cols), sizeof(int)); - out.write((char *)m_data, rows * cols * sizeof(T)); + out.write(reinterpret_cast(&rows), sizeof(int)); + out.write(reinterpret_cast(&cols), sizeof(int)); + out.write(reinterpret_cast(m_data), rows * cols * sizeof(T)); out.close(); return 0; @@ -661,15 +668,15 @@ class Matrix { } int rows = 0, cols = 0; - in.read((char *)(&rows), sizeof(int)); - in.read((char *)(&cols), sizeof(int)); + in.read(reinterpret_cast(&rows), sizeof(int)); + in.read(reinterpret_cast(&cols), sizeof(int)); if (m_number_of_rows != 0 && m_number_of_cols != 0 && m_is_owning_data) delete[] m_data; m_data = new T[rows * cols]; m_number_of_rows = rows; m_number_of_cols = cols; m_is_owning_data = true; - in.read((char *)&(m_data[0]), rows * cols * sizeof(T)); + in.read(reinterpret_cast(&(m_data[0])), rows * cols * sizeof(T)); in.close(); return 0; diff --git a/include/htool/matrix/utils/SVD_truncation.hpp b/include/htool/matrix/utils/SVD_truncation.hpp index 0f0e0e05..d17eccf5 100644 --- a/include/htool/matrix/utils/SVD_truncation.hpp +++ b/include/htool/matrix/utils/SVD_truncation.hpp @@ -28,7 +28,7 @@ int SVD_truncation(Matrix &A, htool::underlying_type> rwork(5 * std::min(M, N)); Lapack::gesvd("A", "A", &M, &N, A.data(), &lda, singular_values.data(), u.data(), &ldu, vt.data(), &ldvt, work.data(), &lwork, rwork.data(), &info); - lwork = (int)std::real(work[0]); + lwork = static_cast(std::real(work[0])); work.resize(lwork); Lapack::gesvd("A", "A", &M, &N, A.data(), &lda, singular_values.data(), u.data(), &ldu, vt.data(), &ldvt, work.data(), &lwork, rwork.data(), &info); } diff --git a/include/htool/misc/evp.hpp b/include/htool/misc/evp.hpp index 95eb27b0..a8c32a5f 100644 --- a/include/htool/misc/evp.hpp +++ b/include/htool/misc/evp.hpp @@ -3,16 +3,18 @@ #include "../basic_types/vector.hpp" // for dprod #include "../matrix/matrix.hpp" // for Matrix +#include // for array #include // for numeric_limits #include // for vector namespace htool { template -std::vector solve_EVP_2(const Matrix &cov) { +Matrix solve_EVP_2(const Matrix &cov) { std::vector dir(2, 0); std::vector eigs(2); Matrix I(2, 2); + Matrix result(2, 2); I(0, 0) = 1; I(1, 1) = 1; Matrix prod(2, 2); @@ -21,30 +23,34 @@ std::vector solve_EVP_2(const Matrix &cov) { eigs[0] = trace / static_cast(2.) + std::sqrt((trace * trace / static_cast(4.) - det)); eigs[1] = trace / static_cast(2.) - std::sqrt((trace * trace / static_cast(4.) - det)); if (std::abs(eigs[0]) > std::numeric_limits::epsilon()) { - - prod = (cov - eigs[1] * I); - int ind = 0; - T dirnorm = 0; - do { - dir[0] = prod(0, ind); - dir[1] = prod(1, ind); - dirnorm = std::sqrt(dir[0] * dir[0] + dir[1] * dir[1]); - ind++; - } while ((dirnorm < std::numeric_limits::epsilon()) && (ind < 2)); - if (dirnorm < std::numeric_limits::epsilon()) { - dir[0] = 1; - dir[1] = 0; - } else { - dir[0] /= dirnorm; - dir[1] /= dirnorm; + for (int index : {0, 1}) { + prod = (cov - eigs[(index + 1) % 2] * I); + int ind = 0; + T dirnorm = 0; + do { + result(0, index) = prod(0, ind); + result(1, index) = prod(1, ind); + dirnorm = std::sqrt(result(0, index) * result(0, index) + result(1, index) * result(1, index)); + ind++; + } while ((dirnorm < std::numeric_limits::epsilon()) && (ind < 2)); + if (dirnorm < std::numeric_limits::epsilon()) { + result(0, index) = 1; + result(1, index) = 0; + } else { + result(0, index) /= dirnorm; + result(1, index) /= dirnorm; + } } + } else { + result(0, 0) = 1; + result(1, 1) = 1; } - return dir; + return result; } template -std::vector solve_EVP_3(const Matrix &cov) { - std::vector dir(3, 0); +Matrix solve_EVP_3(const Matrix &cov) { + Matrix result(3, 3); T p1 = std::pow(cov(0, 1), 2) + std::pow(cov(0, 2), 2) + std::pow(cov(1, 2), 2); std::vector eigs(3); Matrix I(3, 3); @@ -54,28 +60,14 @@ std::vector solve_EVP_3(const Matrix &cov) { Matrix prod(3, 3); if (p1 < std::numeric_limits::epsilon()) { // cov is diagonal. - eigs[0] = cov(0, 0); - eigs[1] = cov(1, 1); - eigs[2] = cov(2, 2); - dir[0] = 1; - dir[1] = 0; - dir[2] = 0; - if (eigs[0] < eigs[1]) { - T tmp = eigs[0]; - eigs[0] = eigs[1]; - eigs[1] = tmp; - dir[0] = 0; - dir[1] = 1; - dir[2] = 0; - } - if (eigs[0] < eigs[2]) { - T tmp = eigs[0]; - eigs[0] = eigs[2]; - eigs[2] = tmp; - dir[0] = 0; - dir[1] = 0; - dir[2] = 1; - } + eigs[0] = cov(0, 0); + eigs[1] = cov(1, 1); + eigs[2] = cov(2, 2); + std::array indexes = {0, 1, 2}; + std::sort(indexes.begin(), indexes.end(), [&eigs](int a, int b) { return eigs[a] < eigs[b]; }); + result(indexes[2], 0) = 1; + result(indexes[1], 1) = 1; + result(indexes[0], 2) = 1; } else { T q = (cov(0, 0) + cov(1, 1) + cov(2, 2)) / static_cast(3.); T p2 = static_cast(std::pow(cov(0, 0) - q, 2)) + static_cast(std::pow(cov(1, 1) - q, 2)) + static_cast(std::pow(cov(2, 2) - q, 2)) + static_cast(2.) * p1; @@ -102,50 +94,63 @@ std::vector solve_EVP_3(const Matrix &cov) { eigs[2] = q + static_cast(2) * p * std::cos(phi + static_cast(2.094395102393195)); eigs[1] = static_cast(3) * q - eigs[0] - eigs[2]; // since trace(cov) = eig1 + eig2 + eig3 - if (std::abs(eigs[0]) < std::numeric_limits::epsilon()) - dir *= static_cast(0.); - else { - prod = (cov - eigs[0] * I); + if (std::abs(eigs[0]) > std::numeric_limits::epsilon()) { + for (int index : {0, 1, 2}) { + prod = (cov - eigs[index] * I); - const T *col0 = prod.data(); - const T *col1 = prod.data() + 3; - const T *col2 = prod.data() + 6; + const T *col0 = prod.data(); + const T *col1 = prod.data() + 3; + const T *col2 = prod.data() + 6; - std::vector c0xc1{col0[1] * col1[2] - col0[2] * col1[1], - col0[2] * col1[0] - col0[0] * col1[2], - col0[0] * col1[1] - col0[1] * col1[0]}; + std::vector c0xc1{col0[1] * col1[2] - col0[2] * col1[1], + col0[2] * col1[0] - col0[0] * col1[2], + col0[0] * col1[1] - col0[1] * col1[0]}; - std::vector c0xc2 = {col0[1] * col2[2] - col0[2] * col2[1], - col0[2] * col2[0] - col0[0] * col2[2], - col0[0] * col2[1] - col0[1] * col2[0]}; + std::vector c0xc2 = {col0[1] * col2[2] - col0[2] * col2[1], + col0[2] * col2[0] - col0[0] * col2[2], + col0[0] * col2[1] - col0[1] * col2[0]}; - std::vector c1xc2 = {col1[1] * col2[2] - col1[2] * col2[1], - col1[2] * col2[0] - col1[0] * col2[2], - col1[0] * col2[1] - col1[1] * col2[0]}; + std::vector c1xc2 = {col1[1] * col2[2] - col1[2] * col2[1], + col1[2] * col2[0] - col1[0] * col2[2], + col1[0] * col2[1] - col1[1] * col2[0]}; - T d0 = dprod(c0xc1, c0xc1); - T d1 = dprod(c0xc2, c0xc2); - T d2 = dprod(c1xc2, c1xc2); - T dmax = d0; - int imax = 0; - if (d1 > dmax) { - dmax = d1; - imax = 1; - } - if (d2 > dmax) { - imax = 2; - } + T d0 = dprod(c0xc1, c0xc1); + T d1 = dprod(c0xc2, c0xc2); + T d2 = dprod(c1xc2, c1xc2); + T dmax = d0; + int imax = 0; + if (d1 > dmax) { + dmax = d1; + imax = 1; + } + if (d2 > dmax) { + imax = 2; + } - if (imax == 0) { - dir = c0xc1 / std::sqrt(d0); - } else if (imax == 1) { - dir = c0xc2 / std::sqrt(d1); - } else { - dir = c1xc2 / std::sqrt(d2); + if (imax == 0) { + c0xc1 /= std::sqrt(d0); + result(0, index) = c0xc1[0]; + result(1, index) = c0xc1[1]; + result(2, index) = c0xc1[2]; + } else if (imax == 1) { + c0xc2 /= std::sqrt(d1); + result(0, index) = c0xc2[0]; + result(1, index) = c0xc2[1]; + result(2, index) = c0xc2[2]; + } else { + c1xc2 /= std::sqrt(d2); + result(0, index) = c1xc2[0]; + result(1, index) = c1xc2[1]; + result(2, index) = c1xc2[2]; + } } + } else { + result(0, 0) = 1; + result(1, 1) = 1; + result(2, 2) = 1; } } - return dir; + return result; } } // namespace htool #endif diff --git a/include/htool/solvers/ddm.hpp b/include/htool/solvers/ddm.hpp index 3a6d843b..e42e2a7c 100644 --- a/include/htool/solvers/ddm.hpp +++ b/include/htool/solvers/ddm.hpp @@ -64,7 +64,7 @@ class DDM { void facto_one_level() { double time = MPI_Wtime(); - double mytime, maxtime; + double mytime(0), maxtime(0); m_hpddm_op->callNumfact(); mytime = MPI_Wtime() - time; @@ -93,7 +93,7 @@ class DDM { // int rankWorld; // MPI_Comm_rank(MPI_COMM_WORLD, &rankWorld); // Z.csv_save("test_" + NbrToStr(rankWorld)); - CoefficientPrecision **Z_ptr_ptr = new CoefficientPrecision *[nevi]; + CoefficientPrecision **Z_ptr_ptr = nevi ? new CoefficientPrecision *[nevi] : nullptr; CoefficientPrecision *Z_ptr = Z.release(); for (int i = 0; i < nevi; i++) { Z_ptr_ptr[i] = Z_ptr + i * m_size_w_overlap; @@ -303,7 +303,7 @@ class DDM { MPI_Reduce(&(nevi_min), &(nevi_min), 1, MPI_INT, MPI_MIN, 0, m_hpddm_op->HA->get_comm()); } - infos["DDM_local_coarse_size_mean"] = NbrToStr((double)nevi_mean / (double)sizeWorld); + infos["DDM_local_coarse_size_mean"] = NbrToStr(static_cast(nevi_mean) / static_cast(sizeWorld)); infos["DDM_local_coarse_size_max"] = NbrToStr(nevi_max); infos["DDM_local_coarse_size_min"] = NbrToStr(nevi_min); @@ -388,7 +388,7 @@ DDM make_DDM_solver(const DistributedO int n = local_dense_matrix.nb_rows(); // Timing - double mytime, maxtime; + double mytime(0), maxtime(0); double time = MPI_Wtime(); // Symmetry and storage @@ -459,7 +459,7 @@ DDM make_DDM_solver_w_custom_local int n = local_hmatrix.get_target_cluster().get_size(); // Timing - double mytime, maxtime; + double mytime(0), maxtime(0); double time = MPI_Wtime(); // Symmetry and storage @@ -490,7 +490,7 @@ DDM make_DDM_solver_w_custom_local int n = local_hmatrix.get_target_cluster().get_size() + D.nb_rows(); // Timing - double mytime, maxtime; + double mytime(0), maxtime(0); double time = MPI_Wtime(); // Symmetry and storage diff --git a/include/htool/solvers/geneo/coarse_space_builder.hpp b/include/htool/solvers/geneo/coarse_space_builder.hpp index 125d9b72..14176fbd 100644 --- a/include/htool/solvers/geneo/coarse_space_builder.hpp +++ b/include/htool/solvers/geneo/coarse_space_builder.hpp @@ -87,10 +87,14 @@ class GeneoCoarseSpaceDenseBuilder : public VirtualCoarseSpaceBuilder::gv(&itype, "V", &m_uplo, &n, m_DAiD.data(), &lda, m_Bi.data(), &ldb, w.data(), work.data(), &lwork, rwork.data(), &info); - lwork = (int)std::real(work[0]); + lwork = static_cast(std::real(work[0])); work.resize(lwork); Lapack::gv(&itype, "V", &m_uplo, &n, m_DAiD.data(), &lda, m_Bi.data(), &ldb, w.data(), work.data(), &lwork, rwork.data(), &info); + if (info != 0) { + htool::Logger::get_instance().log(LogLevel::ERROR, "Local eigensolver failed with info=" + std::to_string(info) + "."); // LCOV_EXCL_LINE + } + // std::cout << "OUAAAAH 2\n"; std::sort(index.begin(), index.end(), [&](const int &a, const int &b) { return (std::abs(w[a]) > std::abs(w[b])); @@ -136,7 +140,7 @@ class GeneoCoarseSpaceDenseBuilder : public VirtualCoarseSpaceBuilder vl(n * n), vr(n * n); Lapack::ggev("N", "V", &n, m_DAiD.data(), &lda, m_Bi.data(), &ldb, alphar.data(), alphai.data(), beta.data(), vl.data(), &ldvl, vr.data(), &ldvr, work.data(), &lwork, rwork.data(), &info); - lwork = (int)std::real(work[0]); + lwork = static_cast(std::real(work[0])); work.resize(lwork); Lapack::ggev("N", "V", &n, m_DAiD.data(), &lda, m_Bi.data(), &ldb, alphar.data(), alphai.data(), beta.data(), vl.data(), &ldvl, vr.data(), &ldvr, work.data(), &lwork, rwork.data(), &info); diff --git a/include/htool/solvers/local_solvers/local_hmatrix_plus_overlap_solvers.hpp b/include/htool/solvers/local_solvers/local_hmatrix_plus_overlap_solvers.hpp index e552d561..28c6379c 100644 --- a/include/htool/solvers/local_solvers/local_hmatrix_plus_overlap_solvers.hpp +++ b/include/htool/solvers/local_solvers/local_hmatrix_plus_overlap_solvers.hpp @@ -99,7 +99,7 @@ class LocalHMatrixPlusOverlapSolver : public VirtualLocalSolver 0) { add_matrix_matrix_product('N', 'N', CoefficientPrecision(-1), m_C, b1, CoefficientPrecision(1), b2); - triangular_matrix_matrix_solve('L', 'L', 'N', 'U', CoefficientPrecision(1), m_D, b2); + triangular_matrix_matrix_solve('L', 'L', 'N', 'N', CoefficientPrecision(1), m_D, b2); triangular_matrix_matrix_solve('L', 'L', is_complex() ? 'C' : 'T', 'N', CoefficientPrecision(1), m_D, b2); add_matrix_matrix_product(is_complex() ? 'C' : 'T', 'N', CoefficientPrecision(-1), m_C, b2, CoefficientPrecision(1), b1); } @@ -109,7 +109,8 @@ class LocalHMatrixPlusOverlapSolver : public VirtualLocalSolver 0) { add_matrix_matrix_product(is_complex() ? 'C' : 'T', 'N', CoefficientPrecision(-1), m_B, b1, CoefficientPrecision(1), b2); - triangular_matrix_matrix_solve('L', 'U', 'N', 'U', CoefficientPrecision(1), m_D, b2); + triangular_matrix_matrix_solve('L', 'U', is_complex() ? 'C' : 'T', 'N', CoefficientPrecision(1), m_D, b2); + triangular_matrix_matrix_solve('L', 'U', 'N', 'N', CoefficientPrecision(1), m_D, b2); add_matrix_matrix_product('N', 'N', CoefficientPrecision(-1), m_B, b2, CoefficientPrecision(1), b1); } diff --git a/include/htool/solvers/utility.hpp b/include/htool/solvers/utility.hpp index 78de9b12..4e310330 100644 --- a/include/htool/solvers/utility.hpp +++ b/include/htool/solvers/utility.hpp @@ -2,7 +2,7 @@ #define HTOOL_SOLVERS_UTILITY_HPP #include "../clustering/cluster_node.hpp" // for Cluster -#include "../clustering/tree_builder/recursive_build.hpp" // for ClusterTre... +#include "../clustering/tree_builder/tree_builder.hpp" // for ClusterTre... #include "../distributed_operator/distributed_operator.hpp" // for DistributedOperator #include "../hmatrix/hmatrix.hpp" // for HMatrix #include "../hmatrix/interfaces/virtual_generator.hpp" // for VirtualGen... @@ -175,9 +175,9 @@ class DDMSolverWithDenseLocalSolver { LocalGeneratorInUserNumbering local_generator(generator0, local_to_global_numbering, local_to_global_numbering); // Local HMatrix - HMatrixTreeBuilder local_hmatrix_builder(*local_cluster, *local_cluster, epsilon0, eta0, symmetry0, symmetry0 != 'N' ? 'L' : 'N', -1, -1, -1); + HMatrixTreeBuilder local_hmatrix_builder(epsilon0, eta0, symmetry0, symmetry0 != 'N' ? 'L' : 'N'); - return local_hmatrix_builder.build(local_generator); + return local_hmatrix_builder.build(local_generator, *local_cluster, *local_cluster); }; public: @@ -227,7 +227,8 @@ class DDMSolverBuilder { if (!sym or block_diagonal_hmatrix0.get_UPLO() == 'U') { B.resize(local_size_without_overlap, local_size_with_overlap - local_size_without_overlap); generator0.copy_submatrix(local_size_without_overlap, local_size_with_overlap - local_size_without_overlap, inside_num.data(), overlap_num.data(), B.data()); - } else if (!sym or block_diagonal_hmatrix0.get_UPLO() == 'L') { + } + if (!sym or block_diagonal_hmatrix0.get_UPLO() == 'L') { C.resize(local_size_with_overlap - local_size_without_overlap, local_size_without_overlap); generator0.copy_submatrix(local_size_with_overlap - local_size_without_overlap, local_size_without_overlap, overlap_num.data(), inside_num.data(), C.data()); @@ -235,24 +236,31 @@ class DDMSolverBuilder { return blocks_in_overlap0; }; - std::function(int, const CoordinatePrecision *)> initialize_local_cluster = [this](int spatial_dimension0, const CoordinatePrecision *global_geometry0) { + std::function(int, const CoordinatePrecision *, const CoordinatePrecision *, const CoordinatePrecision *, const ClusterTreeBuilder &)> initialize_local_cluster = [this](int spatial_dimension0, const CoordinatePrecision *global_geometry0, const CoordinatePrecision *global_radii0, const CoordinatePrecision *global_weights0, const ClusterTreeBuilder &cluster_tree_builder) { // Local geometry int local_size = local_to_global_numbering.size(); std::vector local_geometry(spatial_dimension0 * local_size); + std::vector local_radii(global_radii0 == nullptr ? 0 : local_size); + std::vector local_weights(global_weights0 == nullptr ? 0 : local_size); for (int i = 0; i < local_to_global_numbering.size(); i++) { for (int dimension = 0; dimension < spatial_dimension0; dimension++) { local_geometry[spatial_dimension0 * i + dimension] = global_geometry0[spatial_dimension0 * local_to_global_numbering[i] + dimension]; + if (global_radii0) { + local_radii[i] = global_radii0[local_to_global_numbering[i]]; + } + if (global_weights0) { + local_weights[i] = global_weights0[local_to_global_numbering[i]]; + } } } // Local cluster - ClusterTreeBuilder recursive_build_strategy; - return recursive_build_strategy.create_cluster_tree(local_to_global_numbering.size(), spatial_dimension0, local_geometry.data(), 2, 2); + return cluster_tree_builder.create_cluster_tree(local_to_global_numbering.size(), spatial_dimension0, local_geometry.data(), global_radii0 == nullptr ? nullptr : local_radii.data(), global_weights0 == nullptr ? nullptr : local_weights.data(), 2, 2, nullptr); }; - std::unique_ptr> local_cluster; + std::unique_ptr> m_local_cluster; - std::function(const VirtualGenerator &, underlying_type, CoordinatePrecision, char, char)> initialize_local_hmatrix = [this](const VirtualGenerator &generator0, underlying_type epsilon0, CoordinatePrecision eta0, char symmetry0, char UPLO0) { + std::function(const VirtualGenerator &, const HMatrixTreeBuilder &)> initialize_local_hmatrix = [this](const VirtualGenerator &generator0, const HMatrixTreeBuilder &local_hmatrix_builder) { struct LocalGeneratorInUserNumbering : public VirtualGenerator { const std::vector &m_target_local_to_global_numbering; const std::vector &m_source_local_to_global_numbering; @@ -272,18 +280,64 @@ class DDMSolverBuilder { } }; + struct LocalLowRankGenerator : public VirtualInternalLowRankGenerator { + const Cluster &m_local_target_cluster; + const Cluster &m_local_source_cluster; + const std::vector &m_target_local_to_global_numbering; + const std::vector &m_source_local_to_global_numbering; + std::shared_ptr> m_low_rank_generator; + + LocalLowRankGenerator(std::shared_ptr> low_rank_generator, const Cluster &local_target_cluster, const Cluster &local_source_cluster, const std::vector &target_local_to_global_numbering, const std::vector &source_local_to_global_numbering) : m_local_target_cluster(local_target_cluster), m_local_source_cluster(local_source_cluster), m_target_local_to_global_numbering(target_local_to_global_numbering), m_source_local_to_global_numbering(source_local_to_global_numbering), m_low_rank_generator(low_rank_generator) {} + + std::pair, std::vector> renumber(int M, int N, int row_offset, int col_offset) const { + + const int *rows = m_local_target_cluster.get_permutation().data() + row_offset; + const int *cols = m_local_source_cluster.get_permutation().data() + col_offset; + std::pair, std::vector> result; + result.first.resize(M); + result.second.resize(N); + + for (int i = 0; i < M; i++) { + result.first[i] = m_target_local_to_global_numbering[rows[i]]; + } + for (int j = 0; j < N; j++) { + result.second[j] = m_source_local_to_global_numbering[cols[j]]; + } + return result; + } + + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, LowRankMatrix &lrmat) const override { + + auto new_numbering = renumber(M, N, row_offset, col_offset); + + return m_low_rank_generator->copy_low_rank_approximation(M, N, new_numbering.first.data(), new_numbering.second.data(), lrmat); + } + bool copy_low_rank_approximation(int M, int N, int row_offset, int col_offset, int reqrank, LowRankMatrix &lrmat) const override { + + auto new_numbering = renumber(M, N, row_offset, col_offset); + + return m_low_rank_generator->copy_low_rank_approximation(M, N, new_numbering.first.data(), new_numbering.second.data(), reqrank, lrmat); + } + }; + // Local Generator LocalGeneratorInUserNumbering local_generator(generator0, local_to_global_numbering, local_to_global_numbering); - // Local HMatrix - HMatrixTreeBuilder local_hmatrix_builder(*local_cluster, *local_cluster, epsilon0, eta0, symmetry0, UPLO0, -1, -1, -1); + // Local low rank generator + HMatrixTreeBuilder m_local_hmatrix_builder(local_hmatrix_builder.get_epsilon(), local_hmatrix_builder.get_eta(), local_hmatrix_builder.get_symmetry(), local_hmatrix_builder.get_UPLO()); - return local_hmatrix_builder.build(local_generator); + if (local_hmatrix_builder.get_internal_low_rank_generator() != nullptr) { + m_local_hmatrix_builder.set_low_rank_generator(local_hmatrix_builder.get_internal_low_rank_generator()); + } else if (local_hmatrix_builder.get_low_rank_generator() != nullptr) { + m_local_hmatrix_builder.set_low_rank_generator(std::make_shared(local_hmatrix_builder.get_low_rank_generator(), *m_local_cluster, *m_local_cluster, local_to_global_numbering, local_to_global_numbering)); + } + + return m_local_hmatrix_builder.build(local_generator, *m_local_cluster, *m_local_cluster); }; public: - std::unique_ptr> local_hmatrix; // A - std::array, 3> blocks_in_overlap; // B,C,D + std::unique_ptr> local_hmatrix; + std::array, 3> blocks_in_overlap; // B,C,D DDM solver; // Block Jacobi @@ -293,7 +347,9 @@ class DDMSolverBuilder { DDMSolverBuilder(DistributedOperator &distributed_operator, HMatrix &block_diagonal_hmatrix, const VirtualGenerator &generator, const std::vector &ovr_subdomain_to_global, const std::vector &cluster_to_ovr_subdomain, const std::vector &neighbors, const std::vector> &intersections) : m_local_numbering(ovr_subdomain_to_global, cluster_to_ovr_subdomain, intersections), local_to_global_numbering(m_local_numbering.local_to_global_numbering), blocks_in_overlap(initialize_blocks_in_overlap(distributed_operator, block_diagonal_hmatrix, generator)), solver(make_DDM_solver_w_custom_local_solver(distributed_operator, block_diagonal_hmatrix, blocks_in_overlap[0], blocks_in_overlap[1], blocks_in_overlap[2], neighbors, m_local_numbering.intersections)) {} // DDM building local hmatrix with overlap - DDMSolverBuilder(DistributedOperator &distributed_operator, const std::vector &ovr_subdomain_to_global, const std::vector &cluster_to_ovr_subdomain, const std::vector &neighbors, const std::vector> &intersections, const VirtualGenerator &generator, int spatial_dimension, const CoordinatePrecision *global_geometry, underlying_type epsilon, CoordinatePrecision eta) : m_local_numbering(ovr_subdomain_to_global, cluster_to_ovr_subdomain, intersections), local_to_global_numbering(m_local_numbering.local_to_global_numbering), local_cluster(std::make_unique>(initialize_local_cluster(spatial_dimension, global_geometry))), local_hmatrix(std::make_unique>(initialize_local_hmatrix(generator, epsilon, eta, distributed_operator.get_symmetry_type(), distributed_operator.get_storage_type()))), solver(make_DDM_solver_w_custom_local_solver(distributed_operator, *local_hmatrix, neighbors, m_local_numbering.intersections, true)) {} + DDMSolverBuilder(DistributedOperator &distributed_operator, const std::vector &ovr_subdomain_to_global, const std::vector &cluster_to_ovr_subdomain, const std::vector &neighbors, const std::vector> &intersections, const VirtualGenerator &generator, int spatial_dimension, const CoordinatePrecision *global_geometry, const CoordinatePrecision *radii, const CoordinatePrecision *weights, const ClusterTreeBuilder &cluster_tree_builder, const HMatrixTreeBuilder &local_hmatrix_builder) : m_local_numbering(ovr_subdomain_to_global, cluster_to_ovr_subdomain, intersections), local_to_global_numbering(m_local_numbering.local_to_global_numbering), m_local_cluster(std::make_unique>(initialize_local_cluster(spatial_dimension, global_geometry, radii, weights, cluster_tree_builder))), local_hmatrix(std::make_unique>(initialize_local_hmatrix(generator, local_hmatrix_builder))), solver(make_DDM_solver_w_custom_local_solver(distributed_operator, *local_hmatrix, neighbors, m_local_numbering.intersections, true)) {} + + DDMSolverBuilder(DistributedOperator &distributed_operator, const std::vector &ovr_subdomain_to_global, const std::vector &cluster_to_ovr_subdomain, const std::vector &neighbors, const std::vector> &intersections, const VirtualGenerator &generator, int spatial_dimension, const CoordinatePrecision *global_geometry, const ClusterTreeBuilder &cluster_tree_builder, const HMatrixTreeBuilder &local_hmatrix_builder) : DDMSolverBuilder(distributed_operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, generator, spatial_dimension, global_geometry, nullptr, nullptr, cluster_tree_builder, local_hmatrix_builder) {} }; } // namespace htool diff --git a/include/htool/testing/generate_test_case.hpp b/include/htool/testing/generate_test_case.hpp index 0558def0..69b83e1c 100644 --- a/include/htool/testing/generate_test_case.hpp +++ b/include/htool/testing/generate_test_case.hpp @@ -2,15 +2,15 @@ #ifndef HTOOL_TESTING_GENERATE_TEST_CASE_HPP #define HTOOL_TESTING_GENERATE_TEST_CASE_HPP -#include "../clustering/cluster_node.hpp" // for Cluster -#include "../clustering/tree_builder/recursive_build.hpp" // for Cluster... -#include "../hmatrix/interfaces/virtual_generator.hpp" // for GeneratorWithPermutation -#include "../misc/misc.hpp" // for underly... -#include "geometry.hpp" // for create_... -#include "partition.hpp" // for test_pa... -#include // for make_un... -#include // for uniform... -#include // for vector +#include "../clustering/cluster_node.hpp" // for Cluster +#include "../clustering/tree_builder/tree_builder.hpp" // for Cluster... +#include "../hmatrix/interfaces/virtual_generator.hpp" // for GeneratorWithPermutation +#include "../misc/misc.hpp" // for underly... +#include "geometry.hpp" // for create_... +#include "partition.hpp" // for test_pa... +#include // for make_un... +#include // for uniform... +#include // for vector namespace htool { diff --git a/include/htool/testing/generator_test.hpp b/include/htool/testing/generator_test.hpp index 49bd7c78..f91085eb 100644 --- a/include/htool/testing/generator_test.hpp +++ b/include/htool/testing/generator_test.hpp @@ -220,6 +220,22 @@ class GeneratorInUserNumberingFromMatrix : public VirtualGenerator { } }; +template +class GeneratorInUserNumberingFromMatrixToReal : public VirtualGenerator { + public: + const Matrix> &A; + + GeneratorInUserNumberingFromMatrixToReal(const Matrix> &A0) : A(A0) {} + + virtual void copy_submatrix(int M, int N, const int *rows, const int *cols, T *ptr) const override { + for (int i = 0; i < M; i++) { + for (int j = 0; j < N; j++) { + ptr[i + M * j] = A(rows[i], cols[j]).real(); + } + } + } +}; + // class GeneratorFromMatrix : public InternalGeneratorWithPermutation { // public: diff --git a/include/htool/testing/partition.hpp b/include/htool/testing/partition.hpp index 456729d6..bd22c859 100644 --- a/include/htool/testing/partition.hpp +++ b/include/htool/testing/partition.hpp @@ -13,7 +13,7 @@ namespace htool { template void test_partition(int spatial_dimension, int number_of_points, std::vector &coordinates, int partition_size, std::vector &partition) { // Compute largest extent - std::vector direction(spatial_dimension, 0); + Matrix direction(spatial_dimension, spatial_dimension); Matrix cov(spatial_dimension, spatial_dimension); std::vector permutation(number_of_points, 0); std::iota(permutation.begin(), permutation.end(), int(0)); diff --git a/include/htool/wrappers/wrapper_blas.hpp b/include/htool/wrappers/wrapper_blas.hpp index a14f20c3..e009c128 100644 --- a/include/htool/wrappers/wrapper_blas.hpp +++ b/include/htool/wrappers/wrapper_blas.hpp @@ -30,7 +30,8 @@ void HTOOL_BLAS_F77(C##symm)(const char *, const char *, const int *, const int *, const T *, const T *, const int *, const T *, const int *, const T *, T *, const int *) HTOOL_NOEXCEPT; \ void HTOOL_BLAS_F77(C##syrk)(const char *const, const char *const, const int *const, const int *const, const T *const, const T *const, const int *const, const T *const, T *const, const int *const) HTOOL_NOEXCEPT; \ void HTOOL_BLAS_F77(C##trsm)(const char *, const char *, const char *, const char *, const int *, const int *, const T *, const T *, const int *, T *, const int *) HTOOL_NOEXCEPT; \ - void HTOOL_BLAS_F77(C##laswp)(const int *, T *, const int *, const int *, const int *, const int *, const int *) HTOOL_NOEXCEPT; + void HTOOL_BLAS_F77(C##swap)(const int *, T *, const int *, T *, const int *) HTOOL_NOEXCEPT; \ + void HTOOL_BLAS_F77(C##ger)(const int *, const int *, const T *, const T *, const int *, const T *, const int *, T *, const int *) HTOOL_NOEXCEPT; #define HTOOL_GENERATE_EXTERN_BLAS_COMPLEX(C, T, B, U) \ HTOOL_GENERATE_EXTERN_BLAS(B, U) \ HTOOL_GENERATE_EXTERN_BLAS(C, T) \ @@ -111,8 +112,11 @@ struct Blas { * Solves a triangular system. */ static void trsm(const char *, const char *, const char *, const char *, const int *, const int *, const K *, const K *, const int *, K *, const int *); /* Function: - * Performs a series of row interchanges on the matrix A */ - static void laswp(const int *, K *, const int *, const int *, const int *, const int *, const int *); + * Interchanges two vectors. */ + static void swap(const int *, K *, const int *, K *, const int *); + /* Function: + * Performs a rank 1 operation */ + static void ger(const int *, const int *, const K *, const K *, const int *, const K *, const int *, K *, const int *); }; # define HTOOL_GENERATE_GEMM(C, T) \ @@ -177,9 +181,14 @@ struct Blas { (side, uplo, transa, diag, m, n, alpha, a, lda, b, ldb); \ } \ template <> \ - inline void Blas::laswp(const int *N, T *A, const int *lda, const int *K1, const int *K2, const int *ipiv, const int *incx) { \ - HTOOL_BLAS_F77(C##laswp) \ - (N, A, lda, K1, K2, ipiv, incx); \ + inline void Blas::ger(const int *M, const int *N, const T *alpha, const T *x, const int *incx, const T *y, const int *incy, T *A, const int *lda) { \ + HTOOL_BLAS_F77(C##ger) \ + (M, N, alpha, x, incx, y, incy, A, lda); \ + } \ + template <> \ + inline void Blas::swap(const int *N, T *x, const int *incx, T *y, const int *incy) { \ + HTOOL_BLAS_F77(C##swap) \ + (N, x, incx, y, incy); \ } # define HTOOL_GENERATE_BLAS_COMPLEX(C, T, B, U) \ HTOOL_GENERATE_BLAS(C, T) \ diff --git a/include/htool/wrappers/wrapper_hpddm.hpp b/include/htool/wrappers/wrapper_hpddm.hpp index 6655b0b3..cc325b2f 100644 --- a/include/htool/wrappers/wrapper_hpddm.hpp +++ b/include/htool/wrappers/wrapper_hpddm.hpp @@ -11,6 +11,7 @@ #if defined(__clang__) # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wsign-compare" +# pragma clang diagnostic ignored "-Wold-style-cast" # pragma clang diagnostic ignored "-Wshadow" # pragma clang diagnostic ignored "-Wdouble-promotion" # pragma clang diagnostic ignored "-Wunused-parameter" @@ -18,12 +19,14 @@ #elif defined(__GNUC__) || defined(__GNUG__) # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wsign-compare" +# pragma GCC diagnostic ignored "-Wold-style-cast" # pragma GCC diagnostic ignored "-Wshadow" # pragma GCC diagnostic ignored "-Wdouble-promotion" # pragma GCC diagnostic ignored "-Wunused-parameter" # pragma GCC diagnostic ignored "-Wnon-virtual-dtor" # pragma GCC diagnostic ignored "-Wuseless-cast" # pragma GCC diagnostic ignored "-Wunused-local-typedefs" +# pragma GCC diagnostic ignored "-Wmaybe-uninitialized" #endif #include @@ -72,13 +75,13 @@ class HPDDMCustomLocalSolver { }; template class LocalSolver> -class HPDDMOperator final : public HPDDM::Dense { +class HPDDMOperator final : public HPDDM::Dense { // 'S' means the coarse space is symmetric protected: const DistributedOperator *HA; std::vector *in_global, *buffer; public: - typedef HPDDM::Dense super; + typedef HPDDM::Dense super; HPDDMOperator(const DistributedOperator *A) : HA(A) { in_global = new std::vector; diff --git a/include/htool/wrappers/wrapper_lapack.hpp b/include/htool/wrappers/wrapper_lapack.hpp index e4f90993..64d7af4a 100644 --- a/include/htool/wrappers/wrapper_lapack.hpp +++ b/include/htool/wrappers/wrapper_lapack.hpp @@ -26,7 +26,10 @@ void HTOOL_LAPACK_F77(C##getrf)(const int *, const int *, T *, const int *, int *, int *) HTOOL_NOEXCEPT; \ void HTOOL_LAPACK_F77(C##getrs)(const char *, const int *, const int *, const T *, const int *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; \ void HTOOL_LAPACK_F77(C##potrf)(const char *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; \ - void HTOOL_LAPACK_F77(C##potrs)(const char *, const int *, const int *, const T *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; + void HTOOL_LAPACK_F77(C##potrs)(const char *, const int *, const int *, const T *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; \ + void HTOOL_LAPACK_F77(C##sytrf)(const char *, const int *, T *, const int *, int *, T *, const int *, int *) HTOOL_NOEXCEPT; \ + void HTOOL_LAPACK_F77(C##sytrs)(const char *, const int *, const int *, const T *, const int *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; \ + void HTOOL_LAPACK_F77(C##laswp)(const int *, T *, const int *, const int *, const int *, const int *, const int *) HTOOL_NOEXCEPT; #define HTOOL_GENERATE_EXTERN_LAPACK_COMPLEX(C, T, B, U) \ HTOOL_GENERATE_EXTERN_LAPACK(B, U) \ HTOOL_GENERATE_EXTERN_LAPACK(C, T) \ @@ -39,7 +42,9 @@ void HTOOL_LAPACK_F77(B##ormlq)(const char *, const char *, const int *, const int *, const int *, const U *, const int *, const U *, U *, const int *, U *, const int *, int *) HTOOL_NOEXCEPT; \ void HTOOL_LAPACK_F77(C##unmlq)(const char *, const char *, const int *, const int *, const int *, const T *, const int *, const T *, T *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; \ void HTOOL_LAPACK_F77(B##ormqr)(const char *, const char *, const int *, const int *, const int *, const U *, const int *, const U *, U *, const int *, U *, const int *, int *) HTOOL_NOEXCEPT; \ - void HTOOL_LAPACK_F77(C##unmqr)(const char *, const char *, const int *, const int *, const int *, const T *, const int *, const T *, T *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; + void HTOOL_LAPACK_F77(C##unmqr)(const char *, const char *, const int *, const int *, const int *, const T *, const int *, const T *, T *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; \ + void HTOOL_LAPACK_F77(C##hetrf)(const char *, const int *, T *, const int *, int *, T *, const int *, int *) HTOOL_NOEXCEPT; \ + void HTOOL_LAPACK_F77(C##hetrs)(const char *, const int *, const int *, const T *, const int *, const int *, T *, const int *, int *) HTOOL_NOEXCEPT; #if !defined(PETSC_HAVE_BLASLAPACK) # ifndef _MKL_H_ @@ -98,6 +103,21 @@ struct Lapack { /* Function: gv * Computes the eigenvalues and (optionally) the eigenvectors of a hermitian/symetric generalized eigenvalue problem. */ static void gv(const int *, const char *, const char *, const int *, K *, const int *, K *, const int *, underlying_type *, K *, const int *, underlying_type *, int *); + /* Function: sytrf + * Computes the Bunch--Kaufman factorization of a symmetric matrix. */ + static void sytrf(const char *, const int *, K *, const int *, int *, K *, int *, int *); + /* Function: sytrs + * Solves a system of linear equations with an LDLT-factored matrix. */ + static void sytrs(const char *, const int *, const int *, const K *, const int *, const int *, K *, const int *, int *); + /* Function: hetrf + * Computes the Bunch--Kaufman factorization of a hermintian matrix. */ + static void hetrf(const char *, const int *, K *, const int *, int *, K *, int *, int *); + /* Function: hetrs + * Solves a system of linear equations with an LDL*-factored matrix. */ + static void hetrs(const char *, const int *, const int *, const K *, const int *, const int *, K *, const int *, int *); + /* Function: + * Performs a series of row interchanges on the matrix A */ + static void laswp(const int *, K *, const int *, const int *, const int *, const int *, const int *); }; # define HTOOL_GENERATE_LAPACK(C, T) \ @@ -130,6 +150,21 @@ struct Lapack { inline void Lapack::potrs(const char *uplo, const int *n, const int *nrhs, const T *a, const int *lda, T *b, const int *ldb, int *info) { \ HTOOL_LAPACK_F77(C##potrs) \ (uplo, n, nrhs, a, lda, b, ldb, info); \ + } \ + template <> \ + inline void Lapack::sytrf(const char *uplo, const int *n, T *a, const int *lda, int *ipiv, T *work, int *lwork, int *info) { \ + HTOOL_LAPACK_F77(C##sytrf) \ + (uplo, n, a, lda, ipiv, work, lwork, info); \ + } \ + template <> \ + inline void Lapack::sytrs(const char *uplo, const int *n, const int *nrhs, const T *a, const int *lda, const int *ipiv, T *b, const int *ldb, int *info) { \ + HTOOL_LAPACK_F77(C##sytrs) \ + (uplo, n, nrhs, a, lda, ipiv, b, ldb, info); \ + } \ + template <> \ + inline void Lapack::laswp(const int *N, T *A, const int *lda, const int *K1, const int *K2, const int *ipiv, const int *incx) { \ + HTOOL_LAPACK_F77(C##laswp) \ + (N, A, lda, K1, K2, ipiv, incx); \ } # define HTOOL_GENERATE_LAPACK_COMPLEX(C, T, B, U) \ @@ -184,6 +219,16 @@ struct Lapack { inline void Lapack::gv(const int *itype, const char *jobz, const char *uplo, const int *n, T *a, const int *lda, T *b, const int *ldb, U *w, T *work, const int *lwork, U *rwork, int *info) { \ HTOOL_LAPACK_F77(C##hegv) \ (itype, jobz, uplo, n, a, lda, b, ldb, w, work, lwork, rwork, info); \ + } \ + template <> \ + inline void Lapack::hetrf(const char *uplo, const int *n, T *a, const int *lda, int *ipiv, T *work, int *lwork, int *info) { \ + HTOOL_LAPACK_F77(C##hetrf) \ + (uplo, n, a, lda, ipiv, work, lwork, info); \ + } \ + template <> \ + inline void Lapack::hetrs(const char *uplo, const int *n, const int *nrhs, const T *a, const int *lda, const int *ipiv, T *b, const int *ldb, int *info) { \ + HTOOL_LAPACK_F77(C##hetrs) \ + (uplo, n, nrhs, a, lda, ipiv, b, ldb, info); \ } HTOOL_GENERATE_LAPACK_COMPLEX(c, std::complex, s, float) diff --git a/tests/functional_tests/clustering/test_cluster.cpp b/tests/functional_tests/clustering/test_cluster.cpp index 85b66d0a..4967b191 100644 --- a/tests/functional_tests/clustering/test_cluster.cpp +++ b/tests/functional_tests/clustering/test_cluster.cpp @@ -1,7 +1,6 @@ #include "test_cluster.hpp" #include // for pow -#include -#include +#include using namespace std; using namespace htool; diff --git a/tests/functional_tests/clustering/test_cluster.hpp b/tests/functional_tests/clustering/test_cluster.hpp index 48a82caa..05a95258 100644 --- a/tests/functional_tests/clustering/test_cluster.hpp +++ b/tests/functional_tests/clustering/test_cluster.hpp @@ -1,7 +1,8 @@ #include // for equal #include // for cluster... #include // for save_cl... -#include // for Cluster... +#include // for Partitioning... +#include // for Cluster... #include // for NbrToStr #include // for generat... #include // for create_... @@ -45,12 +46,9 @@ bool test_cluster(int size, bool use_given_partition) { } ClusterTreeBuilder recursive_build_strategy; - recursive_build_strategy.set_direction_computation_strategy(std::make_shared()); - recursive_build_strategy.set_splitting_strategy(std::make_shared()); - // if (use_given_partition) { - // recursive_build_strategy.set_partition(partition); - // } - recursive_build_strategy.set_minclustersize(1); + recursive_build_strategy.set_partitioning_strategy(std::make_shared>()); + + recursive_build_strategy.set_maximal_leaf_size(10); Cluster root_cluster = recursive_build_strategy.create_cluster_tree(size, dim, coordinates.data(), nb_sons, sizeWorld, (use_given_partition) ? partition.data() : nullptr); is_error = is_error || !(root_cluster.is_root()); @@ -107,7 +105,7 @@ bool test_cluster(int size, bool use_given_partition) { Cluster copied_cluster = read_cluster_tree("test_save_" + NbrToStr(rankWorld) + "_" + NbrToStr(sizeWorld) + "_cluster_tree_properties.csv", "test_save_" + NbrToStr(rankWorld) + "_" + NbrToStr(sizeWorld) + "_cluster_tree.csv"); save_cluster_tree(copied_cluster, "test_save_2_" + NbrToStr(rankWorld) + "_" + NbrToStr(sizeWorld)); - is_error = is_error || !(root_cluster.get_minclustersize() == copied_cluster.get_minclustersize()); + is_error = is_error || !(root_cluster.get_maximal_leaf_size() == copied_cluster.get_maximal_leaf_size()); is_error = is_error || !(root_cluster.get_maximal_depth() == copied_cluster.get_maximal_depth()); is_error = is_error || !(root_cluster.get_minimal_depth() == copied_cluster.get_minimal_depth()); is_error = is_error || !(root_cluster.get_permutation() == copied_cluster.get_permutation()); diff --git a/tests/functional_tests/distributed_operator/test_distributed_operator.hpp b/tests/functional_tests/distributed_operator/test_distributed_operator.hpp index d1c03363..b6fc0e29 100644 --- a/tests/functional_tests/distributed_operator/test_distributed_operator.hpp +++ b/tests/functional_tests/distributed_operator/test_distributed_operator.hpp @@ -1,23 +1,23 @@ -#include // for copy_n -#include // for sqrt -#include // for norm2 -#include // for Cluster... -#include // for Cluster... -#include // for Default... -#include // for HMatrix -#include // for HMatrix... -#include // for LocalDe... -#include // for LocalHM... -#include // for LocalOp... -#include // for underly... -#include // for generat... -#include // for create_... -#include // for wrapper... -#include // for basic_o... -#include // for make_un... -#include // for MPI_COM... -#include // for srand -#include // for vector +#include // for copy_n +#include // for sqrt +#include // for norm2 +#include // for Cluster... +#include // for Cluster... +#include // for LocalDe... +#include // for LocalHM... +#include // for LocalOp... +#include // for Default... +#include // for HMatrix +#include // for HMatrix... +#include // for underly... +#include // for generat... +#include // for create_... +#include // for wrapper... +#include // for basic_o... +#include // for make_un... +#include // for MPI_COM... +#include // for srand +#include // for vector namespace htool { template class DistributedOperator; @@ -266,11 +266,11 @@ auto add_off_diagonal_operator(ClusterTreeBuilder> &re local_off_diagonal_operator_2 = make_unique>>(*off_diagonal_matrix_2, local_target_root_cluster, *local_off_diagonal_cluster_tree_2, 'N', 'N', false, true); } else if (data_type == DataType::HMatrix || data_type == DataType::DefaultHMatrix) { - HMatrixTreeBuilder> hmatrix_builder_1(local_target_root_cluster, *local_off_diagonal_cluster_tree_1, epsilon, eta, 'N', 'N', -1, -1, -1); - HMatrixTreeBuilder> hmatrix_builder_2(local_target_root_cluster, *local_off_diagonal_cluster_tree_2, epsilon, eta, 'N', 'N', -1, -1, -1); + HMatrixTreeBuilder> hmatrix_builder_1(epsilon, eta, 'N', 'N'); + HMatrixTreeBuilder> hmatrix_builder_2(epsilon, eta, 'N', 'N'); - off_diagonal_hmatrix_1 = make_unique>>(hmatrix_builder_1.build(*generator_off_diagonal)); - off_diagonal_hmatrix_2 = make_unique>>(hmatrix_builder_2.build(*generator_off_diagonal)); + off_diagonal_hmatrix_1 = make_unique>>(hmatrix_builder_1.build(*generator_off_diagonal, local_target_root_cluster, *local_off_diagonal_cluster_tree_1)); + off_diagonal_hmatrix_2 = make_unique>>(hmatrix_builder_2.build(*generator_off_diagonal, local_target_root_cluster, *local_off_diagonal_cluster_tree_2)); local_off_diagonal_operator_1 = make_unique>>(*off_diagonal_hmatrix_1, local_target_root_cluster, *local_off_diagonal_cluster_tree_1, 'N', 'N', false, true); local_off_diagonal_operator_2 = make_unique>>(*off_diagonal_hmatrix_2, local_target_root_cluster, *local_off_diagonal_cluster_tree_2, 'N', 'N', false, true); @@ -371,9 +371,10 @@ bool test_default_distributed_operator(int nr, int nc, int mu, bool use_permutat // Generator GeneratorTestType generator_in_user_numbering(3, p1, p2); + HMatrixTreeBuilder> hmatrix_tree_builder(epsilon, eta, Symmetry, UPLO); if (off_diagonal_approximation) { - DefaultLocalApproximationBuilder> distributed_operator_holder(generator_in_user_numbering, *target_root_cluster, *source_root_cluster, epsilon, eta, Symmetry, UPLO, MPI_COMM_WORLD); + DefaultLocalApproximationBuilder> distributed_operator_holder(generator_in_user_numbering, *target_root_cluster, *source_root_cluster, hmatrix_tree_builder, MPI_COMM_WORLD); DistributedOperator &distributed_operator = distributed_operator_holder.distributed_operator; distributed_operator.use_permutation() = use_permutation; @@ -381,7 +382,7 @@ bool test_default_distributed_operator(int nr, int nc, int mu, bool use_permutat test = test_vector_product(generator_in_user_numbering, distributed_operator, *target_root_cluster, MasterOffset_target, *source_root_cluster, MasterOffset_source, mu, op, use_permutation, epsilon); } else { - DefaultApproximationBuilder> distributed_operator_holder(generator_in_user_numbering, *target_root_cluster, *source_root_cluster, epsilon, eta, Symmetry, UPLO, MPI_COMM_WORLD); + DefaultApproximationBuilder> distributed_operator_holder(generator_in_user_numbering, *target_root_cluster, *source_root_cluster, hmatrix_tree_builder, MPI_COMM_WORLD); DistributedOperator &distributed_operator = distributed_operator_holder.distributed_operator; distributed_operator.use_permutation() = use_permutation; diff --git a/tests/functional_tests/hmatrix/CMakeLists.txt b/tests/functional_tests/hmatrix/CMakeLists.txt index cbd8f321..e8b6d758 100644 --- a/tests/functional_tests/hmatrix/CMakeLists.txt +++ b/tests/functional_tests/hmatrix/CMakeLists.txt @@ -6,6 +6,14 @@ add_custom_target(build-tests-hmatrix-build) add_dependencies(build-tests-hmatrix build-tests-hmatrix-build) add_subdirectory(hmatrix_build) +add_custom_target(build-tests-hmatrix-builder) +add_dependencies(build-tests-hmatrix build-tests-hmatrix-builder) +add_subdirectory(hmatrix_builder) + +add_custom_target(build-tests-hmatrix-task-based) +add_dependencies(build-tests-hmatrix build-tests-hmatrix-task-based) +add_subdirectory(hmatrix_task_based) + add_custom_target(build-tests-hmatrix-product) add_dependencies(build-tests-hmatrix build-tests-hmatrix-product) add_subdirectory(hmatrix_product) diff --git a/tests/functional_tests/hmatrix/hmatrix_builder/CMakeLists.txt b/tests/functional_tests/hmatrix/hmatrix_builder/CMakeLists.txt new file mode 100755 index 00000000..c0b917ca --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_builder/CMakeLists.txt @@ -0,0 +1,21 @@ +#=============================================================================# +#=========================== Executables =====================================# +#=============================================================================# + +set(types "double") +list(APPEND types "complex_double") + +foreach(type ${types}) + add_executable(Test_hmatrix_builder_${type} test_hmatrix_builder_${type}.cpp) + target_link_libraries(Test_hmatrix_builder_${type} htool) + add_dependencies(build-tests-hmatrix-builder Test_hmatrix_builder_${type}) + add_test(NAME Test_hmatrix_builder_${type}_1 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_hmatrix_builder_${type}) + set_tests_properties(Test_hmatrix_builder_${type}_1 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=4) + + add_test(NAME Test_hmatrix_builder_${type}_2 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 2 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_hmatrix_builder_${type}) + set_tests_properties(Test_hmatrix_builder_${type}_2 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=2) + + add_test(NAME Test_hmatrix_builder_${type}_4 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_hmatrix_builder_${type}) + set_tests_properties(Test_hmatrix_builder_${type}_4 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + +endforeach() diff --git a/tests/functional_tests/hmatrix/hmatrix_builder/test_hmatrix_builder_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_builder/test_hmatrix_builder_complex_double.cpp new file mode 100644 index 00000000..e3707da5 --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_builder/test_hmatrix_builder_complex_double.cpp @@ -0,0 +1,40 @@ +#include "../test_hmatrix_builder.hpp" // for test_hmatrix_build +#include // for max +#include // for complex, abs, operator- +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits +#include // for MPI_Finalize, MPI_Init + +using namespace std; +using namespace htool; + +int main(int argc, char *argv[]) { + MPI_Init(&argc, &argv); + + bool is_error = false; + + for (auto nr : {200, 400}) { + for (auto nc : {200, 400}) { + for (auto epsilon : {1e-14, 1e-6}) { + std::cout << nr << " " << nc << " " << epsilon << "\n"; + + is_error = is_error || test_hmatrix_builder, GeneratorTestComplexSymmetric>(nr, nc, 'N', 'N', epsilon); + if (nr == nc) { + for (auto UPLO : {'U', 'L'}) { + is_error = is_error || test_hmatrix_builder, GeneratorTestComplexSymmetric>(nr, nr, 'S', UPLO, epsilon); + + is_error = is_error || test_hmatrix_builder, GeneratorTestComplexHermitian>(nr, nr, 'H', UPLO, epsilon); + } + } + } + } + } + + MPI_Finalize(); + + if (is_error) { + return 1; + } + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_builder/test_hmatrix_builder_double.cpp b/tests/functional_tests/hmatrix/hmatrix_builder/test_hmatrix_builder_double.cpp new file mode 100644 index 00000000..ede99327 --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_builder/test_hmatrix_builder_double.cpp @@ -0,0 +1,39 @@ +#include "../test_hmatrix_builder.hpp" // for test_hmatrix_build +#include // for max +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits +#include // for MPI_Finalize, MPI_Init + +using namespace std; +using namespace htool; + +int main(int argc, char *argv[]) { + MPI_Init(&argc, &argv); + + bool is_error = false; + + for (auto nr : {200, 400}) { + for (auto nc : {200, 400}) { + for (auto epsilon : {1e-14, 1e-6}) { + std::cout << nr << " " << nc << " " << epsilon << "\n"; + + is_error = is_error || test_hmatrix_builder(nr, nc, 'N', 'N', epsilon); + + if (nr == nc) { + for (auto UPLO : {'U', 'L'}) { + std::cout << UPLO << "\n"; + is_error = is_error || test_hmatrix_builder(nr, nc, 'S', UPLO, epsilon); + } + } + } + } + } + + MPI_Finalize(); + + if (is_error) { + return 1; + } + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_factorization/CMakeLists.txt b/tests/functional_tests/hmatrix/hmatrix_factorization/CMakeLists.txt index 8443dae2..c17009ab 100644 --- a/tests/functional_tests/hmatrix/hmatrix_factorization/CMakeLists.txt +++ b/tests/functional_tests/hmatrix/hmatrix_factorization/CMakeLists.txt @@ -12,3 +12,11 @@ foreach(type ${types}) add_test(NAME Test_hmatrix_factorization_${type} COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_hmatrix_factorization_${type}) set_tests_properties(Test_hmatrix_factorization_${type} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=4) endforeach() + +foreach(type ${types}) + add_executable(Test_task_based_hmatrix_factorization_${type} test_task_based_hmatrix_factorization_${type}.cpp) + target_link_libraries(Test_task_based_hmatrix_factorization_${type} htool) + add_dependencies(build-tests-hmatrix-factorization Test_task_based_hmatrix_factorization_${type}) + add_test(NAME Test_task_based_hmatrix_factorization_${type} COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_task_based_hmatrix_factorization_${type}) + set_tests_properties(Test_task_based_hmatrix_factorization_${type} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=4) +endforeach() diff --git a/tests/functional_tests/hmatrix/hmatrix_factorization/test_task_based_hmatrix_factorization_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_factorization/test_task_based_hmatrix_factorization_complex_double.cpp new file mode 100644 index 00000000..7b379c81 --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_factorization/test_task_based_hmatrix_factorization_complex_double.cpp @@ -0,0 +1,44 @@ +#include "../test_task_based_hmatrix_factorization.hpp" // for test_task_based_lu_factorization, test_task_based_cholesky_factorization +#include // for max +#include // for complex, abs, operator- +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + + const int n1 = 1100; + const int n2 = 1000; + + for (auto epsilon : {1e-3, 1e-8}) { + for (auto trans : {'N', 'T'}) { + + std::cout << "task based LU factorization test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", trans = " << trans << "\n"; + + is_error = is_error || test_task_based_lu_factorization, GeneratorTestComplexHermitian>(trans, n1, n2, epsilon); + } + + for (auto UPLO : {'L', 'U'}) { + std::cout << "task based Cholesky factorization test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", UPLO = " << UPLO << "\n"; + + is_error = is_error || test_task_based_cholesky_factorization, GeneratorTestComplexHermitian>(UPLO, n1, n2, epsilon); + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_factorization case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_factorization cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_factorization/test_task_based_hmatrix_factorization_double.cpp b/tests/functional_tests/hmatrix/hmatrix_factorization/test_task_based_hmatrix_factorization_double.cpp new file mode 100644 index 00000000..6eaf3686 --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_factorization/test_task_based_hmatrix_factorization_double.cpp @@ -0,0 +1,43 @@ +#include "../test_task_based_hmatrix_factorization.hpp" // for test_task_based_lu_factorization, test_task_based_cholesky_factorization +#include // for max +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + + const int n1 = 1100; + const int n2 = 1000; + + for (auto epsilon : {1e-4, 1e-8}) { + for (auto trans : {'N', 'T'}) { + + std::cout << "task based LU factorization test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", trans = " << trans << "\n"; + + is_error = is_error || test_task_based_lu_factorization(trans, n1, n2, epsilon); + } + + for (auto UPLO : {'L', 'U'}) { + std::cout << "task based Cholesky factorization test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", UPLO = " << UPLO << "\n"; + + is_error = is_error || test_task_based_cholesky_factorization(UPLO, n1, n2, epsilon); + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_factorization case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_factorization cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_product/CMakeLists.txt b/tests/functional_tests/hmatrix/hmatrix_product/CMakeLists.txt index 0cc5e2f4..887d6d9a 100755 --- a/tests/functional_tests/hmatrix/hmatrix_product/CMakeLists.txt +++ b/tests/functional_tests/hmatrix/hmatrix_product/CMakeLists.txt @@ -19,3 +19,18 @@ foreach(type ${types}) set_tests_properties(Test_hmatrix_product_${type}_4 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) set_tests_properties(Test_hmatrix_product_${type}_4 PROPERTIES LABELS "mpi") endforeach() + +foreach(type ${types}) + add_executable(Test_task_based_hmatrix_product_${type} test_task_based_hmatrix_product_${type}.cpp) + target_link_libraries(Test_task_based_hmatrix_product_${type} htool) + add_dependencies(build-tests-hmatrix-product Test_task_based_hmatrix_product_${type}) + add_test(NAME Test_task_based_hmatrix_product_${type}_1 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_task_based_hmatrix_product_${type}) + set_tests_properties(Test_task_based_hmatrix_product_${type}_1 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=4) + set_tests_properties(Test_task_based_hmatrix_product_${type}_1 PROPERTIES LABELS "mpi") + add_test(NAME Test_task_based_hmatrix_product_${type}_2 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 2 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_task_based_hmatrix_product_${type}) + set_tests_properties(Test_task_based_hmatrix_product_${type}_2 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=2) + set_tests_properties(Test_task_based_hmatrix_product_${type}_2 PROPERTIES LABELS "mpi") + add_test(NAME Test_task_based_hmatrix_product_${type}_4 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_task_based_hmatrix_product_${type}) + set_tests_properties(Test_task_based_hmatrix_product_${type}_4 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + set_tests_properties(Test_task_based_hmatrix_product_${type}_4 PROPERTIES LABELS "mpi") +endforeach() diff --git a/tests/functional_tests/hmatrix/hmatrix_product/test_task_based_hmatrix_product_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_product/test_task_based_hmatrix_product_complex_double.cpp new file mode 100644 index 00000000..4a9d63c9 --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_product/test_task_based_hmatrix_product_complex_double.cpp @@ -0,0 +1,106 @@ +#include "../test_task_based_hmatrix_hmatrix_product.hpp" +#include "../test_task_based_hmatrix_vector_product.hpp" +#include // for max +#include // for complex, abs, operator- +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + + char sym = 'N'; + for (auto epsilon : {1e-4, 1e-8}) { + for (auto n1 : {1000}) { + for (auto n2 : {1000, 1100}) { + for (auto transa : {'N', 'T'}) { + for (auto block_tree_consistency : {true}) { + + // Non symmetric case + sym = 'N'; + + std::cout << "task based hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", sym = " << sym << ", transa = " << transa << ", block_tree_consistency = " << block_tree_consistency << "\n"; + + TestCaseProduct, GeneratorTestComplexSymmetric> test_case(transa, 'N', n1, n2, 1, 1, 2); + + is_error = is_error || test_task_based_hmatrix_vector_product, GeneratorTestComplexSymmetric, TestCaseProduct, GeneratorTestComplexSymmetric>>(test_case, sym, transa, epsilon, block_tree_consistency); + + // Symmetric case + if (n1 == n2 && block_tree_consistency) { + for (auto UPLO : {'L'}) { + sym = 'S'; + + std::cout << "task based symmetric hmatrix test case: " << "epsilon = " << epsilon << ", n1 = n2 = " << n1 << ", sym = " << sym << ", transa = " << transa << ", UPLO = " << UPLO << ", block_tree_consistency = " << block_tree_consistency << "\n"; + + TestCaseSymmetricProduct, GeneratorTestComplexSymmetric> sym_test_case(n1, n2, 2, 'L', sym, UPLO); + + is_error = is_error || test_task_based_hmatrix_vector_product, GeneratorTestComplexSymmetric, TestCaseSymmetricProduct, GeneratorTestComplexSymmetric>>(sym_test_case, sym, transa, epsilon, block_tree_consistency, UPLO); + } + } + } + } + } + } + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_vector_product case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_vector_product cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + + for (auto epsilon : {1e-6}) { + for (auto n1 : {1000}) { + for (auto n3 : {800}) { + for (auto transa : {'N', 'T', 'C'}) { + for (auto transb : {'N', 'T', 'C'}) { + for (auto n2 : {900}) { + + // Non symmetric case + std::cout << "task based hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", n3 = " << n3 << ", sym = " << 'N' << ", transa = " << transa << ", transb = " << transb << "\n"; + + TestCaseProduct, GeneratorTestComplexSymmetric> test_case(transa, transb, n1, n2, n3, 1, 1); + + is_error = is_error || test_task_based_hmatrix_hmatrix_product, GeneratorTestComplexSymmetric, TestCaseProduct, GeneratorTestComplexSymmetric>>(test_case, 'N', transa, transb, epsilon, true); + } + } + } + + // Symmetric case WIP ToDO: fix the test case side = 'R' + for (auto side : {'L', 'R'}) { + for (auto UPLO : {'U', 'L'}) { + std::cout << "task based symmetric hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n3 = " << n3 << ", sym = " << 'S' << ", side = " << side << ", UPLO = " << UPLO << "\n"; + + TestCaseSymmetricProduct, GeneratorTestComplexSymmetric> test_case(n1, n3, 2, side, 'S', UPLO); + + is_error = is_error || test_task_based_hmatrix_hmatrix_product, GeneratorTestComplexSymmetric, TestCaseSymmetricProduct, GeneratorTestComplexSymmetric>>(test_case, 'S', 'N', 'N', epsilon, true); + + std::cout << "task based hermitian hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n3 = " << n3 << ", sym = " << 'S' << ", side = " << side << ", UPLO = " << UPLO << "\n"; + + TestCaseSymmetricProduct, GeneratorTestComplexHermitian> test_case_herm(n1, n3, 2, side, 'S', UPLO); + is_error = is_error || test_task_based_hmatrix_hmatrix_product, GeneratorTestComplexHermitian, TestCaseSymmetricProduct, GeneratorTestComplexHermitian>>(test_case_herm, 'S', 'N', 'N', epsilon, true); + } + } + } + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_hmatrix_product case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_hmatrix_product cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_product/test_task_based_hmatrix_product_double.cpp b/tests/functional_tests/hmatrix/hmatrix_product/test_task_based_hmatrix_product_double.cpp new file mode 100644 index 00000000..4808b3ec --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_product/test_task_based_hmatrix_product_double.cpp @@ -0,0 +1,103 @@ +#include "../test_task_based_hmatrix_hmatrix_product.hpp" +#include "../test_task_based_hmatrix_vector_product.hpp" + +#include // for max +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + + char sym = 'N'; + for (auto epsilon : {1e-4}) { + for (auto n1 : {1000}) { + for (auto n2 : {1000, 1100}) { + for (auto transa : {'N', 'T'}) { + for (auto block_tree_consistency : {true}) { + + // Non symmetric case + sym = 'N'; + + std::cout << "task based hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", sym = " << sym << ", transa = " << transa << ", block_tree_consistency = " << block_tree_consistency << "\n"; + + TestCaseProduct test_case(transa, 'N', n1, n2, 1, 1, 2); + + is_error = is_error || test_task_based_hmatrix_vector_product>(test_case, sym, transa, epsilon, block_tree_consistency); + + // Symmetric case + if (n1 == n2 && block_tree_consistency) { + for (auto UPLO : {'L'}) { + sym = 'S'; + + std::cout << "task based symmetric hmatrix test case: " << "epsilon = " << epsilon << ", n1 = n2 = " << n1 << ", sym = " << sym << ", transa = " << transa << ", UPLO = " << UPLO << ", block_tree_consistency = " << block_tree_consistency << "\n"; + + TestCaseSymmetricProduct sym_test_case(n1, n2, 2, 'L', sym, UPLO); + + is_error = is_error || test_task_based_hmatrix_vector_product>(sym_test_case, sym, transa, epsilon, block_tree_consistency, UPLO); // with symmetry + + // is_error = is_error || test_task_based_hmatrix_vector_product>(sym_test_case, 'N', transa, epsilon, block_tree_consistency, 'N'); // without symmetry + } + } + } + } + } + } + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_vector_product case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_vector_product cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + + for (auto epsilon : {1e-6}) { + for (auto n1 : {1000}) { + for (auto n3 : {800}) { + for (auto transa : {'N', 'T'}) { + for (auto transb : {'N', 'T'}) { + for (auto n2 : {900}) { + + // Non symmetric case + std::cout << "task based hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", n3 = " << n3 << ", sym = " << 'N' << ", transa = " << transa << ", transb = " << transb << "\n"; + + TestCaseProduct test_case(transa, transb, n1, n2, n3, 1, 1); + + is_error = is_error || test_task_based_hmatrix_hmatrix_product>(test_case, 'N', transa, transb, epsilon, true); + } + } + } + + // Symmetric case WIP ToDO: fix the test case side = 'R' + for (auto side : {'L', 'R'}) { + for (auto UPLO : {'U', 'L'}) { + std::cout << "task based symmetric hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n3 = " << n3 << ", sym = " << 'S' << ", side = " << side << ", UPLO = " << UPLO << "\n"; + + TestCaseSymmetricProduct test_case(n1, n3, 2, side, 'S', UPLO); + + is_error = is_error || test_task_based_hmatrix_hmatrix_product>(test_case, 'S', 'N', 'N', epsilon, true); + } + } + } + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_hmatrix_product case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_hmatrix_product cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_task_based/CMakeLists.txt b/tests/functional_tests/hmatrix/hmatrix_task_based/CMakeLists.txt new file mode 100644 index 00000000..5d703602 --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_task_based/CMakeLists.txt @@ -0,0 +1,21 @@ +#=============================================================================# +#=========================== Executables =====================================# +#=============================================================================# + +set(types "double") +list(APPEND types "complex_double") + +foreach(type ${types}) + add_executable(Test_hmatrix_task_based_${type} test_hmatrix_task_based_${type}.cpp) + target_link_libraries(Test_hmatrix_task_based_${type} htool) + add_dependencies(build-tests-hmatrix-task-based Test_hmatrix_task_based_${type}) + add_test(NAME Test_hmatrix_task_based_${type}_1 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_hmatrix_task_based_${type}) + set_tests_properties(Test_hmatrix_task_based_${type}_1 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=4) + + add_test(NAME Test_hmatrix_task_based_${type}_2 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 2 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_hmatrix_task_based_${type}) + set_tests_properties(Test_hmatrix_task_based_${type}_2 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=2) + + add_test(NAME Test_hmatrix_task_based_${type}_4 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_hmatrix_task_based_${type}) + set_tests_properties(Test_hmatrix_task_based_${type}_4 PROPERTIES ENVIRONMENT OMP_NUM_THREADS=1) + +endforeach() diff --git a/tests/functional_tests/hmatrix/hmatrix_task_based/test_hmatrix_task_based_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_task_based/test_hmatrix_task_based_complex_double.cpp new file mode 100644 index 00000000..8ec4542c --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_task_based/test_hmatrix_task_based_complex_double.cpp @@ -0,0 +1,183 @@ +#include "../test_hmatrix_task_based.hpp" // for test_hmatrix_task_based +#include // for max +#include // for complex, abs, operator- +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + // is_error = is_error || test_hmatrix_task_based, GeneratorTestComplexSymmetric>(nr, nc, 'N', 'N', epsilon); + + /////////////////////////////////////////////////////////////////////////////// + // Tests for hmatrix vector product + /////////////////////////////////////////////////////////////////////////////// + if (false) { + char sym = 'N'; + for (auto epsilon : {1e-4, 1e-8}) { + for (auto n1 : {1000}) { + for (auto n2 : {1000, 1100}) { + for (auto transa : {'N', 'T'}) { + for (auto block_tree_consistency : {true}) { + + // Non symmetric case + sym = 'N'; + + std::cout << "task based hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", sym = " << sym << ", transa = " << transa << ", block_tree_consistency = " << block_tree_consistency << "\n"; + + TestCaseProduct, GeneratorTestComplexSymmetric> test_case(transa, 'N', n1, n2, 1, 1, 2); + + is_error = is_error || test_task_based_hmatrix_vector_product, GeneratorTestComplexSymmetric, TestCaseProduct, GeneratorTestComplexSymmetric>>(test_case, sym, transa, epsilon, block_tree_consistency); + + // Symmetric case + if (n1 == n2 && block_tree_consistency) { + for (auto UPLO : {'L'}) { + sym = 'S'; + + std::cout << "task based symmetric hmatrix test case: " << "epsilon = " << epsilon << ", n1 = n2 = " << n1 << ", sym = " << sym << ", transa = " << transa << ", UPLO = " << UPLO << ", block_tree_consistency = " << block_tree_consistency << "\n"; + + TestCaseSymmetricProduct, GeneratorTestComplexSymmetric> sym_test_case(n1, n2, 2, 'L', sym, UPLO); + + is_error = is_error || test_task_based_hmatrix_vector_product, GeneratorTestComplexSymmetric, TestCaseSymmetricProduct, GeneratorTestComplexSymmetric>>(sym_test_case, sym, transa, epsilon, block_tree_consistency, UPLO); + } + } + } + } + } + } + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_vector_product case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_vector_product cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + } // end of tests for hmatrix vector product + + /////////////////////////////////////////////////////////////////////////////// + // Tests for hmatrix hmatrix product + /////////////////////////////////////////////////////////////////////////////// + if (false) { + for (auto epsilon : {1e-6}) { + for (auto n1 : {1000}) { + for (auto n3 : {800}) { + for (auto transa : {'N', 'T', 'C'}) { + for (auto transb : {'N', 'T', 'C'}) { + for (auto n2 : {900}) { + + // Non symmetric case + std::cout << "task based hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", n3 = " << n3 << ", sym = " << 'N' << ", transa = " << transa << ", transb = " << transb << "\n"; + + TestCaseProduct, GeneratorTestComplexSymmetric> test_case(transa, transb, n1, n2, n3, 1, 1); + + is_error = is_error || test_task_based_hmatrix_hmatrix_product, GeneratorTestComplexSymmetric, TestCaseProduct, GeneratorTestComplexSymmetric>>(test_case, 'N', transa, transb, epsilon, true); + } + } + } + + // Symmetric case WIP ToDO: fix the test case side = 'R' + for (auto side : {'L', 'R'}) { + for (auto UPLO : {'U', 'L'}) { + std::cout << "task based symmetric hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n3 = " << n3 << ", sym = " << 'S' << ", side = " << side << ", UPLO = " << UPLO << "\n"; + + TestCaseSymmetricProduct, GeneratorTestComplexSymmetric> test_case(n1, n3, 2, side, 'S', UPLO); + + is_error = is_error || test_task_based_hmatrix_hmatrix_product, GeneratorTestComplexSymmetric, TestCaseSymmetricProduct, GeneratorTestComplexSymmetric>>(test_case, 'S', 'N', 'N', epsilon, true); + + std::cout << "task based hermitian hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n3 = " << n3 << ", sym = " << 'S' << ", side = " << side << ", UPLO = " << UPLO << "\n"; + + TestCaseSymmetricProduct, GeneratorTestComplexHermitian> test_case_herm(n1, n3, 2, side, 'S', UPLO); + is_error = is_error || test_task_based_hmatrix_hmatrix_product, GeneratorTestComplexHermitian, TestCaseSymmetricProduct, GeneratorTestComplexHermitian>>(test_case_herm, 'S', 'N', 'N', epsilon, true); + } + } + } + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_hmatrix_product case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_hmatrix_product cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + } // end of tests for hmatrix hmatrix product + + /////////////////////////////////////////////////////////////////////////////// + // Tests for hmatrix triangular solve + /////////////////////////////////////////////////////////////////////////////// + if (false) { + + const int n1 = 400; + const int number_of_rhs = 100; + + for (auto epsilon : {1e-6}) { + for (auto side : {'L', 'R'}) { + for (auto operation : {'N', 'T', 'C'}) { + for (auto diag : {'N', 'U'}) { + + std::cout << "task based hmatrix triangular solve test case: " << "number_of_rhs = " << number_of_rhs << ", epsilon = " << epsilon << ", n1 = " << n1 << ", side = " << side << ", operation = " << operation << ", diag = " << diag << "\n"; + + TestCaseSolve, GeneratorTestComplexHermitian> test_case(side, operation, n1, number_of_rhs, 1, -1); + + is_error = is_error || test_task_based_hmatrix_triangular_solve, GeneratorTestComplexHermitian, TestCaseSolve, GeneratorTestComplexHermitian>>(test_case, side, operation, diag, epsilon); + } + } + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_triangular_solve case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_triangular_solve cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + } + + /////////////////////////////////////////////////////////////////////////////// + // Tests for hmatrix factorization + /////////////////////////////////////////////////////////////////////////////// + if (false) { + const int n1 = 1100; + const int n2 = 1000; + + for (auto epsilon : {1e-3, 1e-8}) { + for (auto trans : {'N', 'T'}) { + + std::cout << "task based LU factorization test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", trans = " << trans << "\n"; + + // is_error = is_error || test_task_based_lu_factorization, GeneratorTestComplexHermitian>(trans, n1, n2, epsilon); + } + + for (auto UPLO : {'L', 'U'}) { + std::cout << "task based Cholesky factorization test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", UPLO = " << UPLO << "\n"; + + is_error = is_error || test_task_based_cholesky_factorization, GeneratorTestComplexHermitian>(UPLO, n1, n2, epsilon); + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_lu_factorization case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_lu_factorization cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + } + + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_task_based/test_hmatrix_task_based_double.cpp b/tests/functional_tests/hmatrix/hmatrix_task_based/test_hmatrix_task_based_double.cpp new file mode 100644 index 00000000..ef4422bf --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_task_based/test_hmatrix_task_based_double.cpp @@ -0,0 +1,178 @@ +#include "../test_hmatrix_task_based.hpp" // for test_hmatrix_task_based +#include // for max +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + + /////////////////////////////////////////////////////////////////////////////// + // Tests for hmatrix vector product + /////////////////////////////////////////////////////////////////////////////// + if (false) { + char sym = 'N'; + for (auto epsilon : {1e-4}) { + for (auto n1 : {2000}) { + for (auto n2 : {2100}) { + for (auto transa : {'N', 'T'}) { + for (auto block_tree_consistency : {true}) { + + // Non symmetric case + sym = 'N'; + + std::cout << "task based hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", sym = " << sym << ", transa = " << transa << ", block_tree_consistency = " << block_tree_consistency << "\n"; + + TestCaseProduct test_case(transa, 'N', n1, n2, 1, 1, 2); + + is_error = is_error || test_task_based_hmatrix_vector_product>(test_case, sym, transa, epsilon, block_tree_consistency); + + // Symmetric case + if (n1 == n2 && block_tree_consistency) { + for (auto UPLO : {'L'}) { + sym = 'S'; + + std::cout << "task based symmetric hmatrix test case: " << "epsilon = " << epsilon << ", n1 = n2 = " << n1 << ", sym = " << sym << ", transa = " << transa << ", UPLO = " << UPLO << ", block_tree_consistency = " << block_tree_consistency << "\n"; + + TestCaseSymmetricProduct sym_test_case(n1, n2, 2, 'L', sym, UPLO); + + is_error = is_error || test_task_based_hmatrix_vector_product>(sym_test_case, sym, transa, epsilon, block_tree_consistency, UPLO); // with symmetry + + // is_error = is_error || test_task_based_hmatrix_vector_product>(sym_test_case, 'N', transa, epsilon, block_tree_consistency, 'N'); // without symmetry + } + } + } + } + } + } + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_vector_product case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_vector_product cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + } // end of tests for hmatrix vector product + + /////////////////////////////////////////////////////////////////////////////// + // Tests for hmatrix hmatrix product + /////////////////////////////////////////////////////////////////////////////// + if (false) { + for (auto epsilon : {1e-6}) { + for (auto n1 : {1000}) { + for (auto n3 : {800}) { + for (auto transa : {'N', 'T'}) { + for (auto transb : {'N', 'T'}) { + for (auto n2 : {900}) { + + // Non symmetric case + std::cout << "task based hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", n3 = " << n3 << ", sym = " << 'N' << ", transa = " << transa << ", transb = " << transb << "\n"; + + TestCaseProduct test_case(transa, transb, n1, n2, n3, 1, 1); + + is_error = is_error || test_task_based_hmatrix_hmatrix_product>(test_case, 'N', transa, transb, epsilon, true); + } + } + } + + // Symmetric case WIP ToDO: fix the test case side = 'R' + for (auto side : {'L', 'R'}) { + for (auto UPLO : {'U', 'L'}) { + std::cout << "task based symmetric hmatrix test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n3 = " << n3 << ", sym = " << 'S' << ", side = " << side << ", UPLO = " << UPLO << "\n"; + + TestCaseSymmetricProduct test_case(n1, n3, 2, side, 'S', UPLO); + + is_error = is_error || test_task_based_hmatrix_hmatrix_product>(test_case, 'S', 'N', 'N', epsilon, true); + } + } + } + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_hmatrix_product case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_hmatrix_product cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + } // end of tests for hmatrix hmatrix product + + /////////////////////////////////////////////////////////////////////////////// + // Tests for hmatrix triangular solve + /////////////////////////////////////////////////////////////////////////////// + if (false) { + + const int n1 = 400; + const int number_of_rhs = 100; + + for (auto epsilon : {1e-6}) { + for (auto side : {'L', 'R'}) { + for (auto operation : {'N', 'T'}) { + for (auto diag : {'N', 'U'}) { + + std::cout << "task based hmatrix triangular solve test case: " << "number_of_rhs = " << number_of_rhs << ", epsilon = " << epsilon << ", n1 = " << n1 << ", side = " << side << ", operation = " << operation << ", diag = " << diag << "\n"; + + TestCaseSolve test_case(side, operation, n1, number_of_rhs, 1, -1); + + is_error = is_error || test_task_based_hmatrix_triangular_solve>(test_case, side, operation, diag, epsilon); + } + } + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_triangular_solve case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_triangular_solve cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + } + + /////////////////////////////////////////////////////////////////////////////// + // Tests for hmatrix factorization + /////////////////////////////////////////////////////////////////////////////// + if (true) { + const int n1 = 1100; + const int n2 = 1000; + + for (auto epsilon : {1e-3, 1e-8}) { + for (auto trans : {'N', 'T'}) { + + std::cout << "task based LU factorization test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", trans = " << trans << "\n"; + + is_error = is_error || test_task_based_lu_factorization(trans, n1, n2, epsilon); + } + + for (auto UPLO : {'L', 'U'}) { + std::cout << "task based Cholesky factorization test case: " << "epsilon = " << epsilon << ", n1 = " << n1 << ", n2 = " << n2 << ", UPLO = " << UPLO << "\n"; + + is_error = is_error || test_task_based_cholesky_factorization(UPLO, n1, n2, epsilon); + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_lu_factorization case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_lu_factorization cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + } + + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/CMakeLists.txt b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/CMakeLists.txt index 359ee5ea..fb13edaa 100644 --- a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/CMakeLists.txt +++ b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/CMakeLists.txt @@ -12,3 +12,11 @@ foreach(type ${types}) add_test(NAME Test_hmatrix_triangular_solve_${type} COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_hmatrix_triangular_solve_${type}) set_tests_properties(Test_hmatrix_triangular_solve_${type} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=4) endforeach() + +foreach(type ${types}) + add_executable(Test_task_based_hmatrix_triangular_solve_${type} test_task_based_hmatrix_triangular_solve_${type}.cpp) + target_link_libraries(Test_task_based_hmatrix_triangular_solve_${type} htool) + add_dependencies(build-tests-hmatrix-triangular-solve Test_task_based_hmatrix_triangular_solve_${type}) + add_test(NAME Test_task_based_hmatrix_triangular_solve_${type} COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_task_based_hmatrix_triangular_solve_${type}) + set_tests_properties(Test_task_based_hmatrix_triangular_solve_${type} PROPERTIES ENVIRONMENT OMP_NUM_THREADS=4) +endforeach() diff --git a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_complex_double.cpp index 7da23773..8efd2818 100644 --- a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_complex_double.cpp +++ b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_complex_double.cpp @@ -17,8 +17,10 @@ int main(int, char *[]) { for (auto epsilon : {1e-6, 1e-10}) { for (auto side : {'L', 'R'}) { for (auto operation : {'N', 'T', 'C'}) { - std::cout << epsilon << " " << number_of_rhs << " " << side << " " << operation << "\n"; - is_error = is_error || test_hmatrix_triangular_solve, GeneratorTestComplexHermitian>(side, operation, n1, number_of_rhs, epsilon, margin); + for (auto diag : {'N', 'U'}) { + std::cout << epsilon << " " << number_of_rhs << " " << side << " " << operation << " " << diag << "\n"; + is_error = is_error || test_hmatrix_triangular_solve, GeneratorTestComplexHermitian>(side, operation, diag, n1, number_of_rhs, epsilon, margin); + } } } } diff --git a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_double.cpp b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_double.cpp index 94c54e05..0fde3b96 100644 --- a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_double.cpp +++ b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_hmatrix_triangular_solve_double.cpp @@ -14,8 +14,10 @@ int main(int, char *[]) { for (auto epsilon : {1e-6, 1e-10}) { for (auto side : {'L', 'R'}) { for (auto operation : {'N', 'T'}) { - std::cout << epsilon << " " << number_of_rhs << " " << side << " " << operation << "\n"; - is_error = is_error || test_hmatrix_triangular_solve(side, operation, n1, number_of_rhs, epsilon, margin); + for (auto diag : {'N', 'U'}) { + std::cout << epsilon << " " << number_of_rhs << " " << side << " " << operation << " " << diag << "\n"; + is_error = is_error || test_hmatrix_triangular_solve(side, operation, diag, n1, number_of_rhs, epsilon, margin); + } } } } diff --git a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_task_based_hmatrix_triangular_solve_complex_double.cpp b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_task_based_hmatrix_triangular_solve_complex_double.cpp new file mode 100644 index 00000000..3c8fb04a --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_task_based_hmatrix_triangular_solve_complex_double.cpp @@ -0,0 +1,44 @@ +#include "../test_task_based_hmatrix_triangular_solve.hpp" +#include // for max +#include // for complex, abs, operator- +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + + const int n1 = 400; + const int number_of_rhs = 100; + + for (auto epsilon : {1e-6}) { + for (auto side : {'L', 'R'}) { + for (auto operation : {'N', 'T', 'C'}) { + for (auto diag : {'N', 'U'}) { + + std::cout << "task based hmatrix triangular solve test case: " << "number_of_rhs = " << number_of_rhs << ", epsilon = " << epsilon << ", n1 = " << n1 << ", side = " << side << ", operation = " << operation << ", diag = " << diag << "\n"; + + TestCaseSolve, GeneratorTestComplexHermitian> test_case(side, operation, n1, number_of_rhs, 1, -1); + + is_error = is_error || test_task_based_hmatrix_triangular_solve, GeneratorTestComplexHermitian, TestCaseSolve, GeneratorTestComplexHermitian>>(test_case, side, operation, diag, epsilon); + } + } + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_triangular_solve case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_triangular_solve cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + + return 0; +} diff --git a/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_task_based_hmatrix_triangular_solve_double.cpp b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_task_based_hmatrix_triangular_solve_double.cpp new file mode 100644 index 00000000..4422ddd0 --- /dev/null +++ b/tests/functional_tests/hmatrix/hmatrix_triangular_solve/test_task_based_hmatrix_triangular_solve_double.cpp @@ -0,0 +1,43 @@ +#include "../test_task_based_hmatrix_triangular_solve.hpp" +#include // for max +#include // for GeneratorTestComplexSymm... +#include // for initializer_list +#include // for basic_ostream, char_traits + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + bool is_error = false; + + const int n1 = 400; + const int number_of_rhs = 100; + + for (auto epsilon : {1e-6}) { + for (auto side : {'L', 'R'}) { + for (auto operation : {'N', 'T'}) { + for (auto diag : {'N', 'U'}) { + + std::cout << "task based hmatrix triangular solve test case: " << "number_of_rhs = " << number_of_rhs << ", epsilon = " << epsilon << ", n1 = " << n1 << ", side = " << side << ", operation = " << operation << ", diag = " << diag << "\n"; + + TestCaseSolve test_case(side, operation, n1, number_of_rhs, 1, -1); + + is_error = is_error || test_task_based_hmatrix_triangular_solve>(test_case, side, operation, diag, epsilon); + } + } + } + } + + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + if (is_error) { + std::cerr << "ERROR: At least one test_task_based_hmatrix_triangular_solve case failed." << std::endl; + return 1; + + } else { + std::cout << "SUCCESS: All test_task_based_hmatrix_triangular_solve cases passed." << std::endl; + } + std::cout << "+++++++++++++++++++++++++++++++++++++++++++" << std::endl; + + return 0; +} diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_build/CMakeLists.txt b/tests/functional_tests/hmatrix/lrmat/lrmat_build/CMakeLists.txt index d99559e5..f46e286b 100755 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_build/CMakeLists.txt +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_build/CMakeLists.txt @@ -5,6 +5,7 @@ #=== lrmat_SVD set(compressions "fullACA") list(APPEND compressions "partialACA") +list(APPEND compressions "recompressed_partialACA") list(APPEND compressions "sympartialACA") list(APPEND compressions "SVD") foreach(compression ${compressions}) diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_SVD.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_SVD.cpp index 1c3ec6f3..c83716b9 100644 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_SVD.cpp +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_SVD.cpp @@ -1,20 +1,20 @@ -#include "../test_lrmat_build.hpp" // for test_lrmat -#include // for min -#include // for pow, sqrt -#include // for real -#include // for operator<< -#include // for Cluster -#include // for Cluster... -#include // for Generat... -#include // for SVD -#include // for Frobeni... -#include // for Matrix -#include // for Generat... -#include // for create_... -#include // for Lapack -#include // for basic_o... -#include // for pair -#include // for vector +#include "../test_lrmat_build.hpp" // for test_lrmat +#include // for min +#include // for pow, sqrt +#include // for real +#include // for operator<< +#include // for Cluster +#include // for Cluster... +#include // for Generat... +#include // for SVD +#include // for Frobeni... +#include // for Matrix +#include // for Generat... +#include // for create_... +#include // for Lapack +#include // for basic_o... +#include // for pair +#include // for vector using namespace std; using namespace htool; @@ -51,10 +51,10 @@ int main(int, char *[]) { // SVD fixed rank int reqrank_max = 10; - SVD compressor_SVD; - test = test || !(compressor_SVD.is_htool_owning_data()); + SVD compressor_SVD(A_in_user_numbering, t.get_permutation().data(), s.get_permutation().data()); - LowRankMatrix A_SVD_fixed(A, compressor_SVD, t, s, reqrank_max, epsilon); + LowRankMatrix A_SVD_fixed(t.get_size(), s.get_size(), reqrank_max, epsilon); + compressor_SVD.copy_low_rank_approximation(t.get_size(), s.get_size(), t.get_offset(), s.get_offset(), reqrank_max, A_SVD_fixed); std::vector SVD_fixed_errors; std::vector SVD_errors_check(reqrank_max, 0); @@ -74,7 +74,7 @@ int main(int, char *[]) { std::vector rwork(5 * std::min(nr, nc)); Lapack::gesvd("A", "A", &nr, &nc, matrix.data(), &lda, singular_values.data(), u.data(), &ldu, vt.data(), &ldvt, work.data(), &lwork, rwork.data(), &info); - lwork = (int)std::real(work[0]); + lwork = static_cast(std::real(work[0])); work.resize(lwork); Lapack::gesvd("A", "A", &nr, &nc, matrix.data(), &lda, singular_values.data(), u.data(), &ldu, vt.data(), &ldvt, work.data(), &lwork, rwork.data(), &info); @@ -93,7 +93,8 @@ int main(int, char *[]) { cout << "> Errors computed with the remaining eigenvalues : " << SVD_errors_check << endl; // ACA automatic building - LowRankMatrix A_SVD(A, compressor_SVD, t, s, -1, epsilon); + LowRankMatrix A_SVD(t.get_size(), s.get_size(), epsilon); + compressor_SVD.copy_low_rank_approximation(t.get_size(), s.get_size(), t.get_offset(), s.get_offset(), A_SVD); std::pair fixed_compression_interval(0.87, 0.89); std::pair auto_compression_interval(0.95, 0.97); diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_fullACA.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_fullACA.cpp index bc41ae09..a99f5b3d 100644 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_fullACA.cpp +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_fullACA.cpp @@ -1,16 +1,16 @@ -#include "../test_lrmat_build.hpp" // for test_lrmat -#include // for copy, max -#include // for pow, sqrt -#include // for Cluster -#include // for Cluster... -#include // for Generat... -#include // for fullACA -#include // for LowRank... -#include // for Generat... -#include // for create_... -#include // for basic_o... -#include // for pair -#include // for vector +#include "../test_lrmat_build.hpp" // for test_lrmat +#include // for copy, max +#include // for pow, sqrt +#include // for Cluster +#include // for Cluster... +#include // for Generat... +#include // for fullACA +#include // for LowRank... +#include // for Generat... +#include // for create_... +#include // for basic_o... +#include // for pair +#include // for vector using namespace std; using namespace htool; @@ -46,13 +46,14 @@ int main(int, char *[]) { // fullACA fixed rank int reqrank_max = 10; - fullACA compressor; - test = test || !(compressor.is_htool_owning_data()); + fullACA compressor(A_in_user_numbering, target_cluster.get_permutation().data(), source_cluster.get_permutation().data()); - LowRankMatrix A_fullACA_fixed(A, compressor, target_cluster, source_cluster, reqrank_max, epsilon); + LowRankMatrix A_fullACA_fixed(target_cluster.get_size(), source_cluster.get_size(), reqrank_max, epsilon); + compressor.copy_low_rank_approximation(target_cluster.get_size(), source_cluster.get_size(), target_cluster.get_offset(), source_cluster.get_offset(), reqrank_max, A_fullACA_fixed); // ACA automatic building - LowRankMatrix A_fullACA(A, compressor, target_cluster, source_cluster, -1, epsilon); + LowRankMatrix A_fullACA(target_cluster.get_size(), source_cluster.get_size(), epsilon); + compressor.copy_low_rank_approximation(target_cluster.get_size(), source_cluster.get_size(), target_cluster.get_offset(), source_cluster.get_offset(), A_fullACA); std::pair fixed_compression_interval(0.87, 0.89); std::pair auto_compression_interval(0.95, 0.97); test = test || (test_lrmat(target_cluster, source_cluster, A, A_fullACA_fixed, A_fullACA, fixed_compression_interval, auto_compression_interval)); diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_partialACA.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_partialACA.cpp index 7e6fa7a0..17897c4c 100644 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_partialACA.cpp +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_partialACA.cpp @@ -1,16 +1,16 @@ -#include "../test_lrmat_build.hpp" // for test_lrmat -#include // for max, copy -#include // for pow, sqrt -#include // for Cluster -#include // for Cluster... -#include // for Generat... -#include // for LowRank... -#include // for partialACA -#include // for Generat... -#include // for create_... -#include // for basic_o... -#include // for pair -#include // for vector +#include "../test_lrmat_build.hpp" // for test_lrmat +#include // for max, copy +#include // for pow, sqrt +#include // for Cluster +#include // for Cluster... +#include // for Generat... +#include // for LowRank... +#include // for partialACA +#include // for Generat... +#include // for create_... +#include // for basic_o... +#include // for pair +#include // for vector using namespace std; using namespace htool; @@ -48,17 +48,18 @@ int main(int, char *[]) { // partialACA fixed rank int reqrank_max = 10; - partialACA compressor; - test = test || !(compressor.is_htool_owning_data()); + partialACA compressor(A_in_user_numbering, t.get_permutation().data(), s.get_permutation().data()); - LowRankMatrix A_fullACA_fixed(A, compressor, t, s, reqrank_max, epsilon); + LowRankMatrix A_partialACA_fixed(t.get_size(), s.get_size(), reqrank_max, epsilon); + compressor.copy_low_rank_approximation(t.get_size(), s.get_size(), t.get_offset(), s.get_offset(), reqrank_max, A_partialACA_fixed); // ACA automatic building - LowRankMatrix A_fullACA(A, compressor, t, s, -1, epsilon); + LowRankMatrix A_partialACA(t.get_size(), s.get_size(), epsilon); + compressor.copy_low_rank_approximation(t.get_size(), s.get_size(), t.get_offset(), s.get_offset(), A_partialACA); std::pair fixed_compression_interval(0.87, 0.89); std::pair auto_compression_interval(0.93, 0.96); - test = test || (test_lrmat(t, s, A, A_fullACA_fixed, A_fullACA, fixed_compression_interval, auto_compression_interval)); + test = test || (test_lrmat(t, s, A, A_partialACA_fixed, A_partialACA, fixed_compression_interval, auto_compression_interval)); } cout << "test : " << test << endl; diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_recompressed_partialACA.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_recompressed_partialACA.cpp new file mode 100644 index 00000000..a525661b --- /dev/null +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_recompressed_partialACA.cpp @@ -0,0 +1,70 @@ +#include "../test_lrmat_build.hpp" // for test_lrmat +#include // for max, copy +#include // for pow, sqrt +#include // for Cluster +#include // for Cluster... +#include // for Generat... +#include // for LowRank... +#include // for partialACA +#include // for partialACA +#include // for Generat... +#include // for create_... +#include // for basic_o... +#include // for pair +#include // for vector + +using namespace std; +using namespace htool; + +int main(int, char *[]) { + + const int ndistance = 4; + double distance[ndistance]; + distance[0] = 15; + distance[1] = 20; + distance[2] = 30; + distance[3] = 40; + + double epsilon = 0.0001; + + int nr = 500; + int nc = 100; + std::vector xt(3 * nr); + std::vector xs(3 * nc); + std::vector tabt(500); + std::vector tabs(100); + bool test = 0; + for (int idist = 0; idist < ndistance; idist++) { + + create_disk(3, 0., nr, xt.data()); + create_disk(3, distance[idist], nc, xs.data()); + + ClusterTreeBuilder recursive_build_strategy; + + Cluster t = recursive_build_strategy.create_cluster_tree(nr, 3, xt.data(), 2, 2); + Cluster s = recursive_build_strategy.create_cluster_tree(nc, 3, xt.data(), 2, 2); + + GeneratorTestDouble A_in_user_numbering(3, xt, xs); + InternalGeneratorWithPermutation A(A_in_user_numbering, t.get_permutation().data(), s.get_permutation().data()); + + // partialACA fixed rank + int reqrank_max = 10; + partialACA compressor_partialACA(A); + RecompressedLowRankGenerator compressor(compressor_partialACA, std::function &)>(SVD_recompression)); + + LowRankMatrix A_partialACA_fixed(t.get_size(), s.get_size(), reqrank_max, epsilon); + compressor.copy_low_rank_approximation(t.get_size(), s.get_size(), t.get_offset(), s.get_offset(), reqrank_max, A_partialACA_fixed); + + // ACA automatic building + LowRankMatrix A_partialACA(t.get_size(), s.get_size(), epsilon); + compressor.copy_low_rank_approximation(t.get_size(), s.get_size(), t.get_offset(), s.get_offset(), A_partialACA); + + std::pair fixed_compression_interval(0.87, 0.89); + std::pair auto_compression_interval(0.94, 0.97); + test = test || (test_lrmat(t, s, A, A_partialACA_fixed, A_partialACA, fixed_compression_interval, auto_compression_interval)); + } + + cout << "test : " << test << endl; + + return test; +} diff --git a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_sympartialACA.cpp b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_sympartialACA.cpp index 8400ecfc..74d2e2aa 100644 --- a/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_sympartialACA.cpp +++ b/tests/functional_tests/hmatrix/lrmat/lrmat_build/test_lrmat_build_sympartialACA.cpp @@ -1,16 +1,16 @@ -#include "../test_lrmat_build.hpp" // for test_lrmat -#include // for max, copy -#include // for pow, sqrt -#include // for Cluster -#include // for Cluster... -#include // for Generat... -#include // for LowRank... -#include // for sympart... -#include // for Generat... -#include // for create_... -#include // for basic_o... -#include // for pair -#include // for vector +#include "../test_lrmat_build.hpp" // for test_lrmat +#include // for max, copy +#include // for pow, sqrt +#include // for Cluster +#include // for Cluster... +#include // for Generat... +#include // for LowRank... +#include // for sympart... +#include // for Generat... +#include // for create_... +#include // for basic_o... +#include // for pair +#include // for vector using namespace std; using namespace htool; @@ -48,13 +48,14 @@ int main(int, char *[]) { // sympartialACA fixed rank int reqrank_max = 10; - sympartialACA compressor; - test = test || !(compressor.is_htool_owning_data()); + sympartialACA compressor(A_in_user_numbering, t.get_permutation().data(), s.get_permutation().data()); - LowRankMatrix A_sympartialACA_fixed(A, compressor, t, s, reqrank_max, epsilon); + LowRankMatrix A_sympartialACA_fixed(t.get_size(), s.get_size(), reqrank_max, epsilon); + compressor.copy_low_rank_approximation(t.get_size(), s.get_size(), t.get_offset(), s.get_offset(), reqrank_max, A_sympartialACA_fixed); // ACA automatic building - LowRankMatrix A_sympartialACA(A, compressor, t, s, -1, epsilon); + LowRankMatrix A_sympartialACA(t.get_size(), s.get_size(), epsilon); + compressor.copy_low_rank_approximation(t.get_size(), s.get_size(), t.get_offset(), s.get_offset(), A_sympartialACA); std::pair fixed_compression_interval(0.87, 0.89); std::pair auto_compression_interval(0.93, 0.96); diff --git a/tests/functional_tests/hmatrix/lrmat/test_lrmat_build.hpp b/tests/functional_tests/hmatrix/lrmat/test_lrmat_build.hpp index 4d3b2ff4..d7335f37 100644 --- a/tests/functional_tests/hmatrix/lrmat/test_lrmat_build.hpp +++ b/tests/functional_tests/hmatrix/lrmat/test_lrmat_build.hpp @@ -1,9 +1,9 @@ -#include // for operator<< -#include // for Frobenius_abs... -#include // for recompression -#include // for basic_ostream -#include // for pair -#include // for vector +#include // for operator<< +#include // for Frobenius_abs... +#include // for recompression +#include // for basic_ostream +#include // for pair +#include // for vector namespace htool { template class VirtualGenerator; @@ -42,26 +42,26 @@ bool test_lrmat(const Cluster &target_cluster, const Cluster &so cout << "> Compression rate : " << Fixed_approximation.space_saving() << endl; // Recompression with fixed rank - auto recompressed_fixed_approximation = recompression(Fixed_approximation); - if (recompressed_fixed_approximation) { - std::vector fixed_errors_after_recompression; - for (int k = 0; k < recompressed_fixed_approximation->rank_of() + 1; k++) { - fixed_errors_after_recompression.push_back(Frobenius_absolute_error(target_cluster, source_cluster, *recompressed_fixed_approximation, A, k)); - } - - // Test rank - cout << "Recompression with fixed rank" << endl; - cout << "> rank : " << recompressed_fixed_approximation->rank_of() << endl; - - // Test Frobenius errors - test = test || !(fixed_errors_after_recompression.back() < recompressed_fixed_approximation->get_epsilon()); - cout << "> Errors with Frobenius norm : " << fixed_errors_after_recompression << endl; - - // Test compression - test = test || !(Fixed_approximation.space_saving() <= recompressed_fixed_approximation->space_saving()); - cout << "> Compression rate : " << recompressed_fixed_approximation->space_saving() << endl; + LowRankMatrix recompressed_fixed_approximation = Fixed_approximation; + SVD_recompression(recompressed_fixed_approximation); + std::vector fixed_errors_after_recompression; + for (int k = 0; k < recompressed_fixed_approximation.rank_of() + 1; k++) { + fixed_errors_after_recompression.push_back(Frobenius_absolute_error(target_cluster, source_cluster, recompressed_fixed_approximation, A, k)); } + // Test rank + cout << "Recompression with fixed rank" << endl; + cout << "> rank : " << recompressed_fixed_approximation.rank_of() << endl; + + // Test Frobenius errors + test = test || !(fixed_errors_after_recompression.back() <= recompressed_fixed_approximation.get_epsilon()); + cout << "> Errors with Frobenius norm : " << fixed_errors_after_recompression << endl; + + // Test compression + std::cout << Fixed_approximation.space_saving() << " " << recompressed_fixed_approximation.space_saving() << "\n"; + test = test || !(Fixed_approximation.space_saving() <= recompressed_fixed_approximation.space_saving()); + cout << "> Compression rate : " << recompressed_fixed_approximation.space_saving() << endl; + // ACA automatic building std::vector auto_errors; for (int k = 0; k < Auto_approximation.rank_of() + 1; k++) { @@ -78,25 +78,24 @@ bool test_lrmat(const Cluster &target_cluster, const Cluster &so cout << "> Compression rate : " << Auto_approximation.space_saving() << endl; // Recompression with automatic rank - auto recompressed_auto_approximation = recompression(Auto_approximation); - if (recompressed_auto_approximation) { - std::vector fixed_errors_after_recompression; - for (int k = 0; k < recompressed_auto_approximation->rank_of() + 1; k++) { - fixed_errors_after_recompression.push_back(Frobenius_absolute_error(target_cluster, source_cluster, *recompressed_auto_approximation, A, k)); - } - - // Test rank - cout << "Recompression with auto rank" << endl; - cout << "> rank : " << recompressed_auto_approximation->rank_of() << endl; - - // Test Frobenius errors - test = test || !(fixed_errors_after_recompression.back() < recompressed_auto_approximation->get_epsilon()); - cout << "> Errors with Frobenius norm : " << fixed_errors_after_recompression << endl; - - // Test compression - test = test || !(Auto_approximation.space_saving() <= recompressed_auto_approximation->space_saving()); - cout << "> Compression rate : " << recompressed_auto_approximation->space_saving() << endl; + LowRankMatrix recompressed_auto_approximation = Auto_approximation; + SVD_recompression(recompressed_auto_approximation); + std::vector auto_errors_after_recompression; + for (int k = 0; k < recompressed_auto_approximation.rank_of() + 1; k++) { + auto_errors_after_recompression.push_back(Frobenius_absolute_error(target_cluster, source_cluster, recompressed_auto_approximation, A, k)); } + // Test rank + cout << "Recompression with auto rank" << endl; + cout << "> rank : " << recompressed_auto_approximation.rank_of() << endl; + + // Test Frobenius errors + test = test || !(auto_errors_after_recompression.back() <= recompressed_auto_approximation.get_epsilon()); + cout << "> Errors with Frobenius norm : " << auto_errors_after_recompression << endl; + + // Test compression + test = test || !(Auto_approximation.space_saving() <= recompressed_auto_approximation.space_saving()); + cout << "> Compression rate : " << recompressed_auto_approximation.space_saving() << endl; + return test; } diff --git a/tests/functional_tests/hmatrix/lrmat/test_lrmat_lrmat_addition.hpp b/tests/functional_tests/hmatrix/lrmat/test_lrmat_lrmat_addition.hpp index 79a3dc04..da3fe1f5 100644 --- a/tests/functional_tests/hmatrix/lrmat/test_lrmat_lrmat_addition.hpp +++ b/tests/functional_tests/hmatrix/lrmat/test_lrmat_lrmat_addition.hpp @@ -15,16 +15,24 @@ bool test_lrmat_lrmat_addition(int n1, int n2, htool::underlying_type epsilon TestCaseAddition test_case(n1, n2, 2); // lrmat - Compressor compressor; - LowRankMatrix A_approximation(*test_case.operator_A, compressor, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input, -1, epsilon); - LowRankMatrix zero_A_approximation(*test_case.operator_A, compressor, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input, 0, epsilon); - LowRankMatrix sub_zero_A_approximation(*test_case.operator_A, compressor, *test_case.root_cluster_B_output, *test_case.root_cluster_B_input, 0, epsilon); + Compressor compressor(*test_case.operator_A); + LowRankMatrix A_approximation(test_case.root_cluster_A_output->get_size(), test_case.root_cluster_A_input->get_size(), epsilon); + compressor.copy_low_rank_approximation(test_case.root_cluster_A_output->get_size(), test_case.root_cluster_A_input->get_size(), test_case.root_cluster_A_output->get_offset(), test_case.root_cluster_A_input->get_offset(), A_approximation); - std::unique_ptr> sub_A_approximation_ptr = std::make_unique>(*test_case.operator_A, compressor, *test_case.root_cluster_B_output, *test_case.root_cluster_B_input, -1, epsilon); - if (sub_A_approximation_ptr->rank_of() == 0) { - sub_A_approximation_ptr = std::make_unique>(*test_case.operator_A, compressor, *test_case.root_cluster_B_output, *test_case.root_cluster_B_input, (test_case.root_cluster_B_output->get_size() * test_case.root_cluster_B_input->get_size()) / (test_case.root_cluster_B_output->get_size() + test_case.root_cluster_B_input->get_size()), epsilon); + int zero_A_rank = 0; + LowRankMatrix zero_A_approximation(test_case.root_cluster_A_output->get_size(), test_case.root_cluster_A_input->get_size(), zero_A_rank, epsilon); + compressor.copy_low_rank_approximation(test_case.root_cluster_A_output->get_size(), test_case.root_cluster_A_input->get_size(), test_case.root_cluster_A_output->get_offset(), test_case.root_cluster_A_input->get_offset(), zero_A_rank, zero_A_approximation); + + int sub_zero_A_rank = 0; + LowRankMatrix sub_zero_A_approximation(test_case.root_cluster_B_output->get_size(), test_case.root_cluster_B_input->get_size(), sub_zero_A_rank, epsilon); + compressor.copy_low_rank_approximation(test_case.root_cluster_B_output->get_size(), test_case.root_cluster_B_input->get_size(), test_case.root_cluster_B_output->get_offset(), test_case.root_cluster_B_input->get_offset(), sub_zero_A_rank, sub_zero_A_approximation); + + LowRankMatrix sub_A_approximation(test_case.root_cluster_B_output->get_size(), test_case.root_cluster_B_input->get_size(), epsilon); + compressor.copy_low_rank_approximation(test_case.root_cluster_B_output->get_size(), test_case.root_cluster_B_input->get_size(), test_case.root_cluster_B_output->get_offset(), test_case.root_cluster_B_input->get_offset(), sub_A_approximation); + if (sub_A_approximation.rank_of() == 0) { + int sub_A_reqrank = (test_case.root_cluster_B_output->get_size() * test_case.root_cluster_B_input->get_size()) / (test_case.root_cluster_B_output->get_size() + test_case.root_cluster_B_input->get_size()); + compressor.copy_low_rank_approximation(test_case.root_cluster_B_output->get_size(), test_case.root_cluster_B_input->get_size(), test_case.root_cluster_B_output->get_offset(), test_case.root_cluster_B_input->get_offset(), sub_A_reqrank, sub_A_approximation); } - LowRankMatrix &sub_A_approximation = *sub_A_approximation_ptr; // Reference Matrix A_dense(A_approximation.nb_rows(), A_approximation.nb_cols(), 0); @@ -44,8 +52,7 @@ bool test_lrmat_lrmat_addition(int n1, int n2, htool::underlying_type epsilon // Addition htool::underlying_type error; - LowRankMatrix lrmat_test(epsilon); - lrmat_test = A_approximation; + LowRankMatrix lrmat_test = A_approximation; add_lrmat_lrmat(sub_A_approximation, *test_case.root_cluster_B_output, *test_case.root_cluster_B_input, lrmat_test, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); lrmat_test.copy_to_dense(matrix_test.data()); error = normFrob(matrix_result_w_sum - matrix_test) / normFrob(matrix_result_w_sum); diff --git a/tests/functional_tests/hmatrix/lrmat/test_lrmat_lrmat_product.hpp b/tests/functional_tests/hmatrix/lrmat/test_lrmat_lrmat_product.hpp index 5b91e89d..904f585d 100644 --- a/tests/functional_tests/hmatrix/lrmat/test_lrmat_lrmat_product.hpp +++ b/tests/functional_tests/hmatrix/lrmat/test_lrmat_lrmat_product.hpp @@ -12,7 +12,7 @@ bool test_lrmat_lrmat_product(char transa, char transb, T alpha, T beta, const L bool is_error = false; Matrix matrix_test, dense_lrmat_test; - LowRankMatrix lrmat_test(epsilon); + LowRankMatrix lrmat_test(A_auto_approximation.nb_rows(), B_auto_approximation.nb_cols(), epsilon); htool::underlying_type error; // Product with automatic rank diff --git a/tests/functional_tests/hmatrix/lrmat/test_lrmat_matrix_product.hpp b/tests/functional_tests/hmatrix/lrmat/test_lrmat_matrix_product.hpp index 9a74d094..c652e67a 100644 --- a/tests/functional_tests/hmatrix/lrmat/test_lrmat_matrix_product.hpp +++ b/tests/functional_tests/hmatrix/lrmat/test_lrmat_matrix_product.hpp @@ -17,7 +17,7 @@ using namespace htool; template bool test_lrmat_matrix_product(char transa, char transb, T alpha, T beta, T scaling_coefficient, const LowRankMatrix &A_auto_approximation, LowRankMatrix &C_auto_approximation, const Matrix &B_dense, const Matrix &C_dense, const Matrix &matrix_result_w_matrix_sum, const Matrix &matrix_result_wo_sum, const Matrix &matrix_result_w_lrmat_sum, htool::underlying_type epsilon, htool::underlying_type additional_compression_tolerance, htool::underlying_type additional_lrmat_sum_tolerance) { bool is_error = false; - LowRankMatrix lrmat_test(epsilon); + LowRankMatrix lrmat_test(A_auto_approximation.nb_rows(), B_dense.nb_cols(), epsilon); // Reference matrix Matrix matrix_test, dense_lrmat_test, transposed_B_dense(B_dense.nb_cols(), B_dense.nb_rows()), transposed_C_dense(C_dense.nb_cols(), C_dense.nb_rows()); diff --git a/tests/functional_tests/hmatrix/lrmat/test_lrmat_product.hpp b/tests/functional_tests/hmatrix/lrmat/test_lrmat_product.hpp index 9d62f381..f1947129 100644 --- a/tests/functional_tests/hmatrix/lrmat/test_lrmat_product.hpp +++ b/tests/functional_tests/hmatrix/lrmat/test_lrmat_product.hpp @@ -28,10 +28,15 @@ bool test_lrmat_product(char transa, char transb, int n1, int n2, int n3, htool: generate_random_scalar(scaling_coefficient); // lrmat - Compressor compressor; - LowRankMatrix A_auto_approximation(*test_case.operator_A, compressor, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input, -1, epsilon); - LowRankMatrix B_auto_approximation(*test_case.operator_B, compressor, *test_case.root_cluster_B_output, *test_case.root_cluster_B_input, -1, epsilon); - LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *test_case.root_cluster_C_output, *test_case.root_cluster_C_input, -1, epsilon); + Compressor compressor_A(*test_case.operator_A); + LowRankMatrix A_auto_approximation(test_case.root_cluster_A_output->get_size(), test_case.root_cluster_A_input->get_size(), epsilon); + compressor_A.copy_low_rank_approximation(test_case.root_cluster_A_output->get_size(), test_case.root_cluster_A_input->get_size(), test_case.root_cluster_A_output->get_offset(), test_case.root_cluster_A_input->get_offset(), A_auto_approximation); + Compressor compressor_B(*test_case.operator_B); + LowRankMatrix B_auto_approximation(test_case.root_cluster_B_output->get_size(), test_case.root_cluster_B_input->get_size(), epsilon); + compressor_B.copy_low_rank_approximation(test_case.root_cluster_B_output->get_size(), test_case.root_cluster_B_input->get_size(), test_case.root_cluster_B_output->get_offset(), test_case.root_cluster_B_input->get_offset(), B_auto_approximation); + Compressor compressor_C(*test_case.operator_C); + LowRankMatrix C_auto_approximation(test_case.root_cluster_C_output->get_size(), test_case.root_cluster_C_input->get_size(), epsilon); + compressor_C.copy_low_rank_approximation(test_case.root_cluster_C_output->get_size(), test_case.root_cluster_C_input->get_size(), test_case.root_cluster_C_output->get_offset(), test_case.root_cluster_C_input->get_offset(), C_auto_approximation); // dense Matrix A_dense(test_case.no_A, test_case.ni_A), B_dense(test_case.no_B, test_case.ni_B), C_dense(test_case.no_C, test_case.ni_C); diff --git a/tests/functional_tests/hmatrix/lrmat/test_matrix_lrmat_product.hpp b/tests/functional_tests/hmatrix/lrmat/test_matrix_lrmat_product.hpp index 460329f5..708c4e6c 100644 --- a/tests/functional_tests/hmatrix/lrmat/test_matrix_lrmat_product.hpp +++ b/tests/functional_tests/hmatrix/lrmat/test_matrix_lrmat_product.hpp @@ -11,7 +11,7 @@ template bool test_matrix_lrmat_product(char transa, char transb, T alpha, T beta, const LowRankMatrix &B_auto_approximation, const LowRankMatrix &C_auto_approximation, const Matrix &A_dense, const Matrix &C_dense, const Matrix &matrix_result_w_matrix_sum, const Matrix &matrix_result_wo_sum, const Matrix &matrix_result_w_lrmat_sum, htool::underlying_type epsilon, htool::underlying_type additional_compression_tolerance, htool::underlying_type additional_lrmat_sum_tolerance) { bool is_error = false; Matrix matrix_test, dense_lrmat_test; - LowRankMatrix lrmat_test(epsilon); + LowRankMatrix lrmat_test(A_dense.nb_rows(), B_auto_approximation.nb_cols(), epsilon); htool::underlying_type error; // Products with auto approximation diff --git a/tests/functional_tests/hmatrix/lrmat/test_matrix_matrix_product.hpp b/tests/functional_tests/hmatrix/lrmat/test_matrix_matrix_product.hpp index 5a5223b4..c4142b78 100644 --- a/tests/functional_tests/hmatrix/lrmat/test_matrix_matrix_product.hpp +++ b/tests/functional_tests/hmatrix/lrmat/test_matrix_matrix_product.hpp @@ -10,7 +10,7 @@ using namespace htool; template bool test_matrix_matrix_product(char transa, char transb, T alpha, T beta, const LowRankMatrix &C_auto_approximation, const Matrix &A_dense, const Matrix &B_dense, const Matrix &matrix_result_wo_sum, const Matrix &matrix_result_w_lrmat_sum, htool::underlying_type epsilon, htool::underlying_type additional_compression_tolerance, htool::underlying_type additional_lrmat_sum_tolerance) { bool is_error = false; - LowRankMatrix lrmat_test(epsilon); + LowRankMatrix lrmat_test(A_dense.nb_rows(), B_dense.nb_cols(), epsilon); Matrix matrix_test, dense_lrmat_test; htool::underlying_type error; diff --git a/tests/functional_tests/hmatrix/test_hmatrix_build.hpp b/tests/functional_tests/hmatrix/test_hmatrix_build.hpp index 0f097e60..f0ab5c3e 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_build.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_build.hpp @@ -1,16 +1,18 @@ -#include // for pow, sqrt -#include // for size_t -#include // for norm2 -#include // for Cluster... -#include // for Cluster... -#include // for copy_di... -#include // for print_d... -#include // for print_h... -#include // for Generat... -#include // for HMatrix... -#include // for Matrix -#include // for underly... -#include // for NbrToStr +#include // for pow, sqrt +#include // for size_t +#include // for norm2 +#include // for Cluster... +#include // for Cluster... +#include // for copy_di... +#include // for print_d... +#include // for print_h... +#include // for view_bl... +#include // for Generat... +#include // for HMatrix... +#include // for recomp... +#include // for Matrix +#include // for underly... +#include // for NbrToStr #include #include // for create_... #include // for test_pa... @@ -80,26 +82,29 @@ bool test_hmatrix_build(int nr, int nc, bool use_local_cluster, char Symmetry, c // HMatrix double eta = 10; - - std::unique_ptr>> hmatrix_tree_builder; - if (use_local_cluster) { - hmatrix_tree_builder = std::make_unique>>(target_root_cluster->get_cluster_on_partition(rankWorld), source_root_cluster->get_cluster_on_partition(rankWorld), epsilon, eta, Symmetry, UPLO, -1, -1, -1); - } else { - hmatrix_tree_builder = std::make_unique>>(*target_root_cluster, *source_root_cluster, epsilon, eta, Symmetry, UPLO, -1, rankWorld, rankWorld); - } - hmatrix_tree_builder->set_block_tree_consistency(block_tree_consistency); - + HMatrixTreeBuilder> hmatrix_tree_builder(epsilon, eta, Symmetry, UPLO); std::shared_ptr> dense_blocks_generator; if (use_dense_blocks_generator) { dense_blocks_generator = std::make_shared>(generator_with_permutation); } - hmatrix_tree_builder->set_dense_blocks_generator(dense_blocks_generator); + hmatrix_tree_builder.set_dense_blocks_generator(dense_blocks_generator); + hmatrix_tree_builder.set_block_tree_consistency(block_tree_consistency); // build - auto root_hmatrix = hmatrix_tree_builder->build(generator); + std::unique_ptr>> root_hmatrix_ptr; + if (use_local_cluster) { + root_hmatrix_ptr = std::make_unique>>(hmatrix_tree_builder.build(generator, target_root_cluster->get_cluster_on_partition(rankWorld), source_root_cluster->get_cluster_on_partition(rankWorld), -1, -1)); + } else { + root_hmatrix_ptr = std::make_unique>>(hmatrix_tree_builder.build(generator, *target_root_cluster, *source_root_cluster, rankWorld, rankWorld)); + } + auto &root_hmatrix = *root_hmatrix_ptr; + recompression(root_hmatrix); + openmp_recompression(root_hmatrix); save_leaves_with_rank(root_hmatrix, "leaves_" + htool::NbrToStr(rankWorld)); save_levels(root_hmatrix, "level_" + htool::NbrToStr(rankWorld) + "_", {0, 1, 2}); + std::ofstream dotfile("dotfile.dot"); + view_block_tree(root_hmatrix, {}, dotfile); // Check integrity of root's children std::size_t check_children_size{0}; diff --git a/tests/functional_tests/hmatrix/test_hmatrix_builder.hpp b/tests/functional_tests/hmatrix/test_hmatrix_builder.hpp new file mode 100644 index 00000000..998f90bf --- /dev/null +++ b/tests/functional_tests/hmatrix/test_hmatrix_builder.hpp @@ -0,0 +1,169 @@ +#include +#include +#include // for HMatrixBuilder +#include +#include // for create_... +#include // for vector + +using namespace std; +using namespace htool; + +template +bool test_hmatrix_builder(int nr, int nc, char Symmetry, char UPLO, double epsilon) { + bool is_error = false; + + // Geometry + double z1 = 1; + vector p1(3 * nr), p1_permuted, off_diagonal_p1; + vector p2(Symmetry == 'N' ? 3 * nc : 1), p2_permuted, off_diagonal_p2; + create_disk(3, z1, nr, p1.data()); + + // HMatrix builder + std::unique_ptr> hmatrix_builder; + if (Symmetry == 'N') { + hmatrix_builder = std::make_unique>(nr, 3, p1.data(), nc, 3, p2.data()); + + } else { + hmatrix_builder = std::make_unique>(nr, 3, p1.data()); + } + + // HMatrixTreeBuilder + double eta = 10; + HMatrixTreeBuilder hmatrix_tree_builder(epsilon, eta, Symmetry, UPLO); + + // Generator + GeneratorTestTypeInUserNumbering generator(3, p1, (Symmetry == 'N' ? p2 : p1)); + + // HMatrix + HMatrix hmatrix = hmatrix_builder->build(generator, hmatrix_tree_builder); + + // Reference solution + Matrix dense_hmatrix(nr, nc); + std::vector vec_in(nc, 1), vec_out(nr, 1), vec_result(nr, 0), vec_reference(nr, 0); + std::vector vec_buffer(nr + nc); + Matrix mat_in(nc, 10), mat_out(nr, 10), mat_result(nr, 10), mat_reference(nr, 10); + vector mat_buffer((nr + nc) * 10); + double error; + T alpha, beta; + generate_random_scalar(alpha); + generate_random_scalar(beta); + generate_random_vector(vec_in); + generate_random_vector(vec_out); + generate_random_matrix(mat_in); + generate_random_matrix(mat_out); + vec_reference = vec_out; + mat_reference = mat_out; + copy_to_dense_in_user_numbering(hmatrix, dense_hmatrix.data()); + + if (Symmetry == 'N') { + add_matrix_vector_product('N', alpha, dense_hmatrix, vec_in.data(), beta, vec_reference.data()); + add_matrix_matrix_product('N', 'N', alpha, dense_hmatrix, mat_in, beta, mat_reference); + } else if (Symmetry == 'S') { + add_symmetric_matrix_vector_product(UPLO, alpha, dense_hmatrix, vec_in.data(), beta, vec_reference.data()); + add_symmetric_matrix_matrix_product('L', UPLO, alpha, dense_hmatrix, mat_in, beta, mat_reference); + } else { + if constexpr (is_complex()) { + add_hermitian_matrix_vector_product(UPLO, alpha, dense_hmatrix, vec_in.data(), beta, vec_reference.data()); + add_hermitian_matrix_matrix_product('L', UPLO, alpha, dense_hmatrix, mat_in, beta, mat_reference); + } + } + +#if defined(__cpp_lib_execution) && __cplusplus >= 201703L + // Test sequential vector product without buffer + vec_result = vec_out; + add_hmatrix_vector_product(std::execution::seq, 'N', alpha, hmatrix, vec_in.data(), beta, vec_result.data()); + + error = norm2(vec_result - vec_reference) / norm2(vec_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a sequential hmatrix vector product wo buffer: " << error << endl; + + // Test parallel vector product without buffer + vec_result = vec_out; + add_hmatrix_vector_product(std::execution::par, 'N', alpha, hmatrix, vec_in.data(), beta, vec_result.data()); + + error = norm2(vec_result - vec_reference) / norm2(vec_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a parallel hmatrix vector product wo buffer: " << error << endl; + + // Test sequential vector product with buffer + vec_result = vec_out; + add_hmatrix_vector_product(std::execution::seq, 'N', alpha, hmatrix, vec_in.data(), beta, vec_result.data(), vec_buffer.data()); + + error = norm2(vec_result - vec_reference) / norm2(vec_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a sequential hmatrix vector product with buffer: " << error << endl; + + // Test parallel vector product with buffer + vec_result = vec_out; + add_hmatrix_vector_product(std::execution::par, 'N', alpha, hmatrix, vec_in.data(), beta, vec_result.data(), vec_buffer.data()); + + error = norm2(vec_result - vec_reference) / norm2(vec_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a paralel hmatrix vector product with buffer: " << error << endl; + + // Test sequential matrix product without buffer + mat_result = mat_out; + add_hmatrix_matrix_product(std::execution::seq, 'N', 'N', alpha, hmatrix, mat_in, beta, mat_result); + + error = normFrob(mat_result - mat_reference) / normFrob(mat_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a sequential hmatrix matrix product wo buffer: " << error << endl; + + // Test parallel matrix product without buffer + mat_result = mat_out; + add_hmatrix_matrix_product(std::execution::par, 'N', 'N', alpha, hmatrix, mat_in, beta, mat_result); + + error = normFrob(mat_result - mat_reference) / normFrob(mat_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a parallel hmatrix matrix product wo buffer: " << error << endl; + + // Test sequential matrix product with buffer + mat_result = mat_out; + add_hmatrix_matrix_product(std::execution::seq, 'N', 'N', alpha, hmatrix, mat_in, beta, mat_result, mat_buffer.data()); + + error = normFrob(mat_result - mat_reference) / normFrob(mat_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a sequential hmatrix vector product with buffer: " << error << endl; + + // Test parallel matrix product with buffer + mat_result = mat_out; + add_hmatrix_matrix_product(std::execution::par, 'N', 'N', alpha, hmatrix, mat_in, beta, mat_result, mat_buffer.data()); + + error = normFrob(mat_result - mat_reference) / normFrob(mat_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a paralel hmatrix matrix product with buffer: " << error << endl; +#else + // Test vector product without buffer + vec_result = vec_out; + add_hmatrix_vector_product('N', alpha, hmatrix, vec_in.data(), beta, vec_result.data()); + + error = norm2(vec_result - vec_reference) / norm2(vec_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a hmatrix vector product wo buffer: " << error << endl; + + // Test vector product with buffer + vec_result = vec_out; + add_hmatrix_vector_product('N', alpha, hmatrix, vec_in.data(), beta, vec_result.data(), vec_buffer.data()); + + error = norm2(vec_result - vec_reference) / norm2(vec_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a hmatrix vector product with buffer: " << error << endl; + + // Test sequential matrix product without buffer + mat_result = mat_out; + add_hmatrix_matrix_product('N', 'N', alpha, hmatrix, mat_in, beta, mat_result); + + error = normFrob(mat_result - mat_reference) / normFrob(mat_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a hmatrix vector product wo buffer: " << error << endl; + + // Test parallel matrix product with buffer + mat_result = mat_out; + add_hmatrix_matrix_product('N', 'N', alpha, hmatrix, mat_in, beta, mat_result, mat_buffer.data()); + + error = normFrob(mat_result - mat_reference) / normFrob(mat_reference); + is_error = is_error || !(error < epsilon); + cout << "> Errors on a hmatrix matrix product with buffer: " << error << endl; +#endif + return is_error; +} diff --git a/tests/functional_tests/hmatrix/test_hmatrix_factorization.hpp b/tests/functional_tests/hmatrix/test_hmatrix_factorization.hpp index 90bba286..b557bf7f 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_factorization.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_factorization.hpp @@ -22,8 +22,8 @@ bool test_hmatrix_lu(char trans, int n1, int n2, htool::underlying_type epsil htool::TestCaseSolve test_case('L', trans, n1, n2, 1, -1); // HMatrix - HMatrixTreeBuilder> hmatrix_tree_builder_A(*test_case.root_cluster_A_output, *test_case.root_cluster_A_input, epsilon, eta, 'N', 'N', -1, -1, -1); - HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A); + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N'); + HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_in_user_numbering_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); // Matrix int ni_A = test_case.root_cluster_A_input->get_size(); @@ -59,8 +59,8 @@ bool test_hmatrix_cholesky(char UPLO, int n1, int n2, htool::underlying_type htool::TestCaseSolve test_case('L', 'N', n1, n2, 1, -1); // HMatrix - HMatrixTreeBuilder> hmatrix_tree_builder_A(*test_case.root_cluster_A_output, *test_case.root_cluster_A_input, epsilon, eta, is_complex() ? 'H' : 'S', UPLO, -1, -1, -1); - HMatrix> HA = hmatrix_tree_builder_A.build(*test_case.operator_A); + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, is_complex() ? 'H' : 'S', UPLO); + HMatrix> HA = hmatrix_tree_builder_A.build(*test_case.operator_in_user_numbering_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); // Matrix int ni_A = test_case.root_cluster_A_input->get_size(); @@ -96,8 +96,8 @@ bool test_hmatrix_cholesky(char UPLO, int n1, int n2, htool::underlying_type htool::TestCaseSolve test_case('L', 'N', n1, n2, 1, -1); // HMatrix - HMatrixTreeBuilder> hmatrix_tree_builder_A(*test_case.root_cluster_A_output, *test_case.root_cluster_A_input, epsilon, eta, is_complex() ? 'H' : 'S', UPLO, -1, -1, -1); - HMatrix> HA = hmatrix_tree_builder_A.build(*test_case.operator_A); + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, is_complex() ? 'H' : 'S', UPLO); + HMatrix> HA = hmatrix_tree_builder_A.build(*test_case.operator_in_user_numbering_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); // Matrix int ni_A = test_case.root_cluster_A_input->get_size(); diff --git a/tests/functional_tests/hmatrix/test_hmatrix_hmatrix_product.hpp b/tests/functional_tests/hmatrix/test_hmatrix_hmatrix_product.hpp index 8375551c..391b954d 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_hmatrix_product.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_hmatrix_product.hpp @@ -57,15 +57,15 @@ bool test_hmatrix_hmatrix_product(const TestCaseProduct &t } } - HMatrixTreeBuilder> hmatrix_tree_builder_A(*root_cluster_A_output, *root_cluster_A_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); - HMatrixTreeBuilder> hmatrix_tree_builder_B(*root_cluster_B_output, *root_cluster_B_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); - HMatrixTreeBuilder> hmatrix_tree_builder_C(*root_cluster_C_output, *root_cluster_C_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N'); + HMatrixTreeBuilder> hmatrix_tree_builder_B(epsilon, eta, 'N', 'N'); + HMatrixTreeBuilder> hmatrix_tree_builder_C(epsilon, eta, 'N', 'N'); hmatrix_tree_builder_C.set_minimal_source_depth(2); // build - HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A); - HMatrix> B = hmatrix_tree_builder_B.build(*test_case.operator_B); - HMatrix> C = hmatrix_tree_builder_C.build(*test_case.operator_C); + HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input); + HMatrix> B = hmatrix_tree_builder_B.build(*test_case.operator_B, *root_cluster_B_output, *root_cluster_B_input); + HMatrix> C = hmatrix_tree_builder_C.build(*test_case.operator_C, *root_cluster_C_output, *root_cluster_C_input); HMatrix> hmatrix_test(C); // save_leaves_with_rank(A, "A_leaves_" + std::to_string(rankWorld)); // save_leaves_with_rank(B, "B_leaves_" + std::to_string(rankWorld)); @@ -92,9 +92,10 @@ bool test_hmatrix_hmatrix_product(const TestCaseProduct &t copy_to_dense(C, HC_dense.data()); // lrmat - SVD compressor; htool::underlying_type lrmat_tol = 1e-6; - LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tol), lrmat_test(lrmat_tol); + SVD compressor_C(*test_case.operator_C); + LowRankMatrix C_auto_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tol), lrmat_test(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tol); + compressor_C.copy_low_rank_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), root_cluster_C_output->get_offset(), root_cluster_C_input->get_offset(), C_auto_approximation); // Random Input T alpha(1), beta(0), scaling_coefficient; diff --git a/tests/functional_tests/hmatrix/test_hmatrix_lrmat_product.hpp b/tests/functional_tests/hmatrix/test_hmatrix_lrmat_product.hpp index eaee3740..938b0b25 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_lrmat_product.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_lrmat_product.hpp @@ -59,15 +59,13 @@ bool test_hmatrix_lrmat_product(const TestCaseProduct &tes } } - HMatrixTreeBuilder> hmatrix_tree_builder_A(*root_cluster_A_output, *root_cluster_A_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); - hmatrix_tree_builder_A.set_low_rank_generator(std::make_shared>()); + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N', -1, std::make_shared>(*test_case.operator_A)); - HMatrixTreeBuilder> hmatrix_tree_builder_C(*root_cluster_C_output, *root_cluster_C_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); - hmatrix_tree_builder_C.set_low_rank_generator(std::make_shared>()); + HMatrixTreeBuilder> hmatrix_tree_builder_C(epsilon, eta, 'N', 'N', -1, std::make_shared>(*test_case.operator_C)); // build - HMatrix> root_hmatrix = hmatrix_tree_builder_A.build(*test_case.operator_A); - HMatrix> C = hmatrix_tree_builder_C.build(*test_case.operator_C); + HMatrix> root_hmatrix = hmatrix_tree_builder_A.build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input); + HMatrix> C = hmatrix_tree_builder_C.build(*test_case.operator_C, *root_cluster_C_output, *root_cluster_C_input); HMatrix> hmatrix_test(C); // Dense matrix @@ -85,19 +83,21 @@ bool test_hmatrix_lrmat_product(const TestCaseProduct &tes Matrix matrix_test, dense_lrmat_test; // lrmat - SVD compressor; - htool::underlying_type lrmat_tolerance = 1e-6; - std::unique_ptr> B_auto_approximation_ptr = std::make_unique>(*test_case.operator_B, compressor, *root_cluster_B_output, *root_cluster_B_input, -1, lrmat_tolerance); - - // LowRankMatrix B_auto_approximation(*test_case.operator_B, compressor, *root_cluster_B_output, *root_cluster_B_input, -1, lrmat_tolerance); - LowRankMatrix lrmat_test(epsilon); - LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tolerance); - - if (B_auto_approximation_ptr->rank_of() == 0) { - B_auto_approximation_ptr = std::make_unique>(*test_case.operator_B, compressor, *root_cluster_B_output, *root_cluster_B_input, (root_cluster_B_output->get_size() * root_cluster_B_input->get_size()) / (root_cluster_B_output->get_size() + root_cluster_B_input->get_size()), epsilon); + htool::underlying_type lrmat_tolerance = 1e-6; + SVD compressor_B(*test_case.operator_B); + LowRankMatrix B_auto_approximation(root_cluster_B_output->get_size(), root_cluster_B_input->get_size(), lrmat_tolerance); + compressor_B.copy_low_rank_approximation(root_cluster_B_output->get_size(), root_cluster_B_input->get_size(), root_cluster_B_output->get_offset(), root_cluster_B_input->get_offset(), B_auto_approximation); + + if (B_auto_approximation.rank_of() == 0) { + int reqrank_B = (root_cluster_B_output->get_size() * root_cluster_B_input->get_size()) / (root_cluster_B_output->get_size() + root_cluster_B_input->get_size()); + compressor_B.copy_low_rank_approximation(root_cluster_B_output->get_size(), root_cluster_B_input->get_size(), root_cluster_B_output->get_offset(), root_cluster_B_input->get_offset(), reqrank_B, B_auto_approximation); } - LowRankMatrix &B_auto_approximation = *B_auto_approximation_ptr; + // LowRankMatrix B_auto_approximation(*test_case.operator_B, compressor, *root_cluster_B_output, *root_cluster_B_input, -1, lrmat_tolerance); + LowRankMatrix lrmat_test(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), epsilon); + LowRankMatrix C_auto_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tolerance); + SVD compressor_C(*test_case.operator_C); + compressor_C.copy_low_rank_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), root_cluster_C_output->get_offset(), root_cluster_C_input->get_offset(), C_auto_approximation); // Random Input matrix T alpha(1), beta(1), scaling_coefficient; diff --git a/tests/functional_tests/hmatrix/test_hmatrix_matrix_product.hpp b/tests/functional_tests/hmatrix/test_hmatrix_matrix_product.hpp index 8df9304a..2c3ac844 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_matrix_product.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_matrix_product.hpp @@ -68,11 +68,11 @@ bool test_hmatrix_matrix_product(const TestCaseProduct &te } } - HMatrixTreeBuilder> hmatrix_tree_builder(*root_cluster_A_output, *root_cluster_A_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); - hmatrix_tree_builder.set_low_rank_generator(std::make_shared>()); + HMatrixTreeBuilder> hmatrix_tree_builder(epsilon, eta, 'N', 'N'); + hmatrix_tree_builder.set_low_rank_generator(std::make_shared>(*test_case.operator_A)); // build - HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_A); + HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input); // Dense matrix int ni_A = root_cluster_A_input->get_size(); @@ -94,9 +94,10 @@ bool test_hmatrix_matrix_product(const TestCaseProduct &te copy_to_dense(root_hmatrix, HA_dense.data()); // lrmat - SVD compressor; + SVD compressor(*test_case.operator_C); htool::underlying_type lrmat_tolerance = 0.0001; - LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tolerance), lrmat_test(lrmat_tolerance); + LowRankMatrix C_auto_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tolerance), lrmat_test(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tolerance); + compressor.copy_low_rank_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), root_cluster_C_output->get_offset(), root_cluster_C_input->get_offset(), C_auto_approximation); // Random Input matrix std::vector B_vec, C_vec, test_vec; @@ -200,11 +201,10 @@ bool test_symmetric_hmatrix_matrix_product(const TestCaseSymmetricProductget_cluster_on_partition(rankWorld); root_cluster_C_input = &test_case.root_cluster_C_input->get_cluster_on_partition(rankWorld); - HMatrixTreeBuilder> hmatrix_tree_builder(*root_cluster_A_output, *root_cluster_A_input, epsilon, eta, 'S', UPLO, -1, -1, rankWorld); - hmatrix_tree_builder.set_low_rank_generator(std::make_shared>()); + HMatrixTreeBuilder> hmatrix_tree_builder(epsilon, eta, 'S', UPLO, -1, std::make_shared>(*test_case.operator_A)); // build - HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_A); + HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input); // Dense matrix int ni_A = root_cluster_A_input->get_size(); @@ -226,9 +226,10 @@ bool test_symmetric_hmatrix_matrix_product(const TestCaseSymmetricProduct compressor; + SVD compressor(*test_case.operator_C); htool::underlying_type lrmat_tolerance = 0.0001; - LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tolerance), lrmat_test(lrmat_tolerance); + LowRankMatrix C_auto_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tolerance), lrmat_test(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tolerance); + compressor.copy_low_rank_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), root_cluster_C_output->get_offset(), root_cluster_C_input->get_offset(), C_auto_approximation); // Random Input matrix std::vector B_vec, C_vec, test_vec; @@ -313,11 +314,10 @@ bool test_hermitian_hmatrix_matrix_product(const TestCaseSymmetricProductget_cluster_on_partition(rankWorld); root_cluster_C_input = &test_case.root_cluster_C_input->get_cluster_on_partition(rankWorld); - HMatrixTreeBuilder> hmatrix_tree_builder(*root_cluster_A_output, *root_cluster_A_input, epsilon, eta, 'H', UPLO, -1, -1, rankWorld); - hmatrix_tree_builder.set_low_rank_generator(std::make_shared>()); + HMatrixTreeBuilder> hmatrix_tree_builder(epsilon, eta, 'H', UPLO, -1, std::make_shared>(*test_case.operator_A)); // build - HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_A); + HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input); // Dense matrix int ni_A = root_cluster_A_input->get_size(); @@ -339,9 +339,9 @@ bool test_hermitian_hmatrix_matrix_product(const TestCaseSymmetricProduct compressor; + // SVD compressor(*test_case.operator_C); // htool::underlying_type lrmat_tolerance = 0.0001; - // LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tolerance), lrmat_test(lrmat_tolerance); + // LowRankMatrix C_auto_approximation( compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tolerance), lrmat_test(lrmat_tolerance); // Random Input matrix std::vector B_vec, C_vec, test_vec; diff --git a/tests/functional_tests/hmatrix/test_hmatrix_task_based.hpp b/tests/functional_tests/hmatrix/test_hmatrix_task_based.hpp new file mode 100644 index 00000000..6642883b --- /dev/null +++ b/tests/functional_tests/hmatrix/test_hmatrix_task_based.hpp @@ -0,0 +1,1046 @@ +#include // for pow, sqrt +#include // for size_t +#include // for norm2 +#include // for Cluster... +#include +#include // for Cluster... +#include // for copy_di... +#include // for print_d... +#include // for print_h... +#include // for view_block_tree... +#include // for Generat... +#include // for add_... +#include +#include +#include // for task_bas... +#include // for task_bas... +#include // for task_based_lu_factorization +#include // for task_based_triangular_hmatrix_hmatrix_solve +#include // for triangular_hmatrix_hmatrix_solve + +#include +#include // for enumerate_dependence, find_l0... +#include // for HMatrix... +#include // for Matrix +#include // for underly... +#include // for NbrToStr +#include +#include // for TestCaseSymmetricPro... +#include +#include // for create_... +#include // for test_pa... +#include // for operator<< +#include // for make_sh... +#include // for operator+ +#include // for vector + +using namespace std; +using namespace htool; + +template +bool test_task_based_hmatrix_vector_product(const TestCaseType &test_case, char sym, char transa, htool::underlying_type epsilon, bool block_tree_consistency, char UPLO = 'L') { + // custom tests to run + bool all_tests = true; // run all tests. Has priority over specific tests flags + bool basic_tests = false; // tests left_hmatrix_ancestor_of_right_hmatrix, enumerate_dependences and find_l0 + bool compute_blocks_tests = false; + bool hmatrix_vector_product_tests = false; + bool assembly_and_vector_product_tests = false; // compares the assembly of a hmatrix + the product with a vector between conjoint and disjoint cases + + double eta = 10; + std::cout << "eta = " << eta << std::endl; + double error_tol = 1e-14; + bool is_error = false; + + // Get test case parameters + const Cluster> *root_cluster_A_output, *root_cluster_A_input; + root_cluster_A_output = test_case.root_cluster_A_output; + root_cluster_A_input = test_case.root_cluster_A_input; + int ni_A = test_case.ni_A; + int no_A = test_case.no_A; + int no_B = test_case.no_B; + int no_C = test_case.no_C; + + // Tree builder + std::unique_ptr>> hmatrix_tree_builder; + if (sym == 'N') { + hmatrix_tree_builder = std::make_unique>>(epsilon, eta, 'N', 'N'); + } else if (sym == 'S') { + hmatrix_tree_builder = std::make_unique>>(epsilon, eta, 'S', UPLO); + } + hmatrix_tree_builder->set_low_rank_generator(std::make_shared>(*test_case.operator_A)); + hmatrix_tree_builder->set_block_tree_consistency(block_tree_consistency); + + // swap input and output clusters if transa is not 'N' + auto *input_cluster = root_cluster_A_input; + auto *output_cluster = root_cluster_A_output; + if (transa != 'N') { + std::swap(input_cluster, output_cluster); + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Basic tests + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (basic_tests || all_tests) { + // build + HMatrix> root_hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, false, 64); + + // Hmatrix Visualization + save_leaves_with_rank(root_hmatrix, "root_hmatrix_facto"); + // print_hmatrix_information(root_hmatrix, std::cout); + + // Basic sub tree + HMatrix &child1 = *root_hmatrix.get_children()[0].get(); + HMatrix &child2 = *root_hmatrix.get_children()[1].get(); + HMatrix &child1_child1 = *root_hmatrix.get_children()[0].get()->get_children()[0].get(); + HMatrix &child1_child2 = *root_hmatrix.get_children()[0].get()->get_children()[1].get(); + + // Tests for left_hmatrix_ancestor_of_right_hmatrix + std::cout << "left_hmatrix_ancestor_of_right_hmatrix tests..."; + is_error = is_error || !(left_hmatrix_ancestor_of_right_hmatrix(root_hmatrix, child1)); + is_error = is_error || !(left_hmatrix_ancestor_of_right_hmatrix(root_hmatrix, child2)); + is_error = is_error || !(left_hmatrix_ancestor_of_right_hmatrix(root_hmatrix, child1_child1)); + is_error = is_error || !(left_hmatrix_ancestor_of_right_hmatrix(root_hmatrix, child1_child2)); + + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + // Tests for left_hmatrix_descendant_of_right_hmatrix + std::cout << "left_hmatrix_descendant_of_right_hmatrix tests..."; + is_error = is_error || !(left_hmatrix_descendant_of_right_hmatrix(child1, root_hmatrix)); + is_error = is_error || !(left_hmatrix_descendant_of_right_hmatrix(child2, root_hmatrix)); + is_error = is_error || !(left_hmatrix_descendant_of_right_hmatrix(child1_child1, root_hmatrix)); + is_error = is_error || !(left_hmatrix_descendant_of_right_hmatrix(child1_child2, root_hmatrix)); + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + // Tests for enumerate_dependences + { + std::cout << "enumerate_dependences tests..."; + std::vector *> L0; + std::vector *> dependences; + + // Test case 1: hmatrix is in L0 + { + L0 = {&child1_child1, &child1_child2, &child2}; + dependences = enumerate_dependences(child2, L0); + + is_error = is_error || !(dependences.size() == 1); + is_error = is_error || !(dependences[0] == &child2); + } + + // Test case 2: hmatrix is above L0 + { + L0 = {&child1_child1, &child1_child2, &child2}; + dependences = enumerate_dependences(root_hmatrix, L0); + is_error = is_error || !(dependences.size() == 3); + is_error = is_error || !(dependences[0] == &child1_child1); + is_error = is_error || !(dependences[1] == &child1_child2); + is_error = is_error || !(dependences[2] == &child2); + } + + // Test case 3: hmatrix is below L0 + { + L0 = {&root_hmatrix}; + + dependences = enumerate_dependences(child1, L0); + is_error = is_error || !(dependences.size() == 1); + is_error = is_error || !(dependences[0] == &root_hmatrix); + + dependences = enumerate_dependences(child2, L0); + is_error = is_error || !(dependences.size() == 1); + is_error = is_error || !(dependences[0] == &root_hmatrix); + + dependences = enumerate_dependences(child1_child2, L0); + is_error = is_error || !(dependences.size() == 1); + is_error = is_error || !(dependences[0] == &root_hmatrix); + } + + // Test case 4 : L0 is empty + // L0.clear(); + // dependences = enumerate_dependences(root_hmatrix, L0); + // is_error = is_error || !(dependences.empty()); + + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + std::cout << "----------------------------------" << std::endl; + } // end of tests for enumerate_dependences + + // Tests for find_l0. Visual verification of the block tree in dotfile.dot + { + std::vector *> L0 = find_l0(root_hmatrix, 256); + std::ofstream dotfile("test_find_l0.dot"); + view_block_tree(root_hmatrix, L0, dotfile); + } + } // end of basic tests + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Tests for task_based_compute_blocks + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (compute_blocks_tests || all_tests) { + std::cout << "task_based_compute_blocks tests..."; + std::chrono::steady_clock::time_point start, end; + HMatrix task_based_hmatrix(*root_cluster_A_output, *root_cluster_A_input); + + // task based build + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); // build + } + end = std::chrono::steady_clock::now(); + std::chrono::duration task_based_duration = end - start; + + task_based_hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] = NbrToStr(hmatrix_tree_builder->get_false_positive()); // when task_based_compute_blocks is used in build, the number of false positives is not set in the hmatrix tree data because the tasks need to finish before we can know the number of false positive + + // classic build + start = std::chrono::steady_clock::now(); + auto hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, false, 64); // when compute_blocks is used in build, the number of false positives is set in the hmatrix tree data + end = std::chrono::steady_clock::now(); + std::chrono::duration classic_duration = end - start; + + // visu + // std::cout << "hmatrix:" << std::endl; + // print_hmatrix_information(hmatrix, std::cout); + // std::cout << "task_based_hmatrix:" << std::endl; + // print_hmatrix_information(task_based_hmatrix, std::cout); + save_leaves_with_rank(task_based_hmatrix, "TB_hmatrix_build"); + save_leaves_with_rank(hmatrix, "hmatrix_build"); + + // densification + Matrix densified_hmatrix(no_A, ni_A), densified_hmatrix_task_based(no_A, ni_A); + copy_to_dense(hmatrix, densified_hmatrix.data()); + copy_to_dense(task_based_hmatrix, densified_hmatrix_task_based.data()); + + // compare + + is_error = is_error || (std::isnan(normFrob(densified_hmatrix - densified_hmatrix_task_based) / normFrob(densified_hmatrix)) > error_tol); + is_error = is_error || (hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] < task_based_hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"]); + + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + + std::cout << " normFrob(densified_hmatrix - densified_hmatrix_task_based)/normFrob(densified_hmatrix) = " << normFrob(densified_hmatrix - densified_hmatrix_task_based) / normFrob(densified_hmatrix) << std::endl; + + std::cout << " hmatrix m_false_positive = " << hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] << std::endl; + std::cout << " task_based_hmatrix m_false_positive = " << task_based_hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] << std::endl + << std::endl; + std::cout << " classic_duration = " << classic_duration.count() << std::endl; + std::cout << " task_based_duration = " << task_based_duration.count() << std::endl + << std::endl; + + // check durations + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << " Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + std::cout << "----------------------------------" << std::endl; + } // end of tests for task_based_compute_blocks + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Tests for task_based_internal_add_hmatrix_vector_product + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (hmatrix_vector_product_tests || all_tests) { + std::cout << "task_based_internal_add_hmatrix_vector_product tests..."; + std::chrono::steady_clock::time_point start, end; + + // build + auto hmatrix_classic = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, false, 64); + auto hmatrix_task_based = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); // not in parallel area because it is not the focus of the test, but should nevertheless be called with the flag true in order to build m_L0 needed later when : hmatrix_tree_builder->get_L0(); + + // visu + // std::cout << "hmatrix_classic:" << std::endl; + // print_hmatrix_information(hmatrix_classic, std::cout); + // std::cout << "hmatrix_task_based:" << std::endl; + // print_hmatrix_information(hmatrix_task_based, std::cout); + save_leaves_with_rank(hmatrix_task_based, "TB_hmatrix_prod"); + save_leaves_with_rank(hmatrix_classic, "hmatrix_prod"); + + // L0 definitions + std::vector *> L0 = hmatrix_tree_builder->get_L0(); + std::vector> *> in_L0 = find_l0(*input_cluster, 64); + std::vector> *> out_L0 = find_l0(*output_cluster, 64); + // std::ofstream dotfile("test_find_l0.dot"); + // view_block_tree(hmatrix_task_based, L0, dotfile); + // for (auto &L0_nodes : L0) { + // // if (L0_nodes->get_target_cluster().get_offset() < L0_nodes->get_source_cluster().get_offset()) { + + // std::cout << L0_nodes->get_target_cluster().get_offset() << " " << L0_nodes->get_target_cluster().get_size() << " "; + // std::cout << L0_nodes->get_source_cluster().get_offset() << " " << L0_nodes->get_source_cluster().get_size() << " " << L0_nodes->get_symmetry_for_leaves() << std ::endl; + // // } + // } + + // Create a vector for the input and output + std::vector in(no_B), out(no_C, 11), out_task(no_C, 11); + generate_random_vector(in); + generate_random_vector(out); + out_task = out; + T alpha = T(3); + T beta = T(2); + int nb_products = 50; + + // Perform the classic H-matrix vector product + start = std::chrono::steady_clock::now(); + for (int i = 0; i < nb_products; i++) { + openmp_internal_add_hmatrix_vector_product(transa, alpha, hmatrix_classic, in.data(), beta, out.data()); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration classic_duration = end - start; + + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + for (int i = 0; i < nb_products; i++) { + // Perform the task-based internal add H-matrix vector product + task_based_internal_add_hmatrix_vector_product(transa, alpha, hmatrix_task_based, in.data(), beta, out_task.data(), L0, in_L0, out_L0); + // sequential_internal_add_hmatrix_vector_product(transa, alpha, hmatrix_task_based, in.data(), beta, out_task.data()); + } + } + end = std::chrono::steady_clock::now(); + + std::chrono::duration task_based_duration = end - start; + + // Compare the results + is_error = is_error || (std::isnan(norm2(out - out_task) / norm2(out)) > nb_products * error_tol); + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + + // std::cout << " \n out_final = \n"; + // for (int i = 0; i < 0 + 15; i++) { + // std::cout << out[i] << " "; + // } + // std::cout << std::endl; + + // std::cout << " \n out_task_final = \n"; + // for (int i = 0; i < 0 + 15; i++) { + // std::cout << out_task[i] << " "; + // } + // std::cout << std::endl; + + // std::cout << " argmax(out - out_task) = " << argmax(out - out_task) << std::endl; + // std::cout << " (out - out_task) = " << (out - out_task) << std::endl; + // std::cout << " max(out - out_task)/out(argmax(out - out_task)) = " << max(out - out_task) / std::abs(out[argmax(out - out_task)]) << std::endl; + std::cout << " norm2(out) = " << norm2(out) << std::endl; + + std::cout + << " norm2(out - out_task)/norm2(out) = " << norm2(out - out_task) / norm2(out) << std::endl + << std::endl; + + std::cout << " classic_duration = " << classic_duration.count() << std::endl; + std::cout << " task_based_duration = " << task_based_duration.count() << std::endl; + + // check durations + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + std::cout << "----------------------------------" << std::endl; + } // end of tests for task_based_internal_add_hmatrix_vector_product + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Tests for hmatrix vector product following assembly with task_based_compute_blocks and task_based_internal_add_hmatrix_vector_product + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (assembly_and_vector_product_tests || all_tests) { + std::cout << "assembly + hmatrix vector products tests..."; + + int nb_products = 100; + T alpha = T(3); + T beta = T(2); + std::chrono::duration disjoint_case_duration, conjoint_case_duration; + std::chrono::steady_clock::time_point start, end; + HMatrix task_based_hmatrix(*root_cluster_A_output, *root_cluster_A_input); + + std::vector> *> in_L0 = find_l0(*input_cluster, 64); + std::vector> *> out_L0 = find_l0(*output_cluster, 64); + std::vector in(no_B), out_task_dis(no_C, 11), out_task_con(no_C, 11); + generate_random_vector(in); + generate_random_vector(out_task_dis); + out_task_con = out_task_dis; + + // Case 1 : task graphs are disjoint + { + // build + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); + } + + // N hmatrix vector products + std::vector *> L0 = hmatrix_tree_builder->get_L0(); + +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + for (int i = 0; i < nb_products; i++) { + task_based_internal_add_hmatrix_vector_product(transa, alpha, task_based_hmatrix, in.data(), beta, out_task_dis.data(), L0, in_L0, out_L0); + } + } + end = std::chrono::steady_clock::now(); + disjoint_case_duration = end - start; + std::cout << "\n disjoint_case_TB_duration = " << disjoint_case_duration.count() << std::endl; + } // end of case 1 + + // Case 2 : task graphs are not conjoint + { + // build + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); + + // N hmatrix vector products + std::vector *> L0 = hmatrix_tree_builder->get_L0(); + + for (int i = 0; i < nb_products; i++) { + task_based_internal_add_hmatrix_vector_product(transa, alpha, task_based_hmatrix, in.data(), beta, out_task_con.data(), L0, in_L0, out_L0); + } + } + end = std::chrono::steady_clock::now(); + conjoint_case_duration = end - start; + std::cout << " conjoint_case_TB_duration = " << conjoint_case_duration.count() << std::endl; + + is_error = is_error || (std::isnan(norm2(out_task_con - out_task_dis) / norm2(out_task_con)) > error_tol); + std::cout << " norm2(out_task_con - out_task_dis)/norm2(out_task_con) = " << norm2(out_task_con - out_task_dis) / norm2(out_task_con) << std::endl; + } // end of case 2 + + if (disjoint_case_duration.count() < conjoint_case_duration.count()) { + std::cerr << "Careful: conjoint_case_TB_duration > disjoint_case_TB_duration. Ratio conjoint/disjoint = " << conjoint_case_duration.count() / disjoint_case_duration.count() << std::endl; + } + + std::cout << "----------------------------------" << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_hmatrix_vector_product current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_hmatrix_vector_product current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} + +template +bool test_task_based_hmatrix_hmatrix_product(const TestCaseType &test_case, char sym, char transa, char transb, htool::underlying_type epsilon, bool block_tree_consistency, char UPLO = 'L') { + bool is_error = false; + double eta = 10; + double error_tol = 1e-15; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_internal_add_hmatrix_hmatrix_product tests..."; + + // Get test case parameters + const Cluster> *root_cluster_A_output, *root_cluster_A_input, + *root_cluster_B_output, *root_cluster_B_input, *root_cluster_C_output, *root_cluster_C_input; + root_cluster_A_output = test_case.root_cluster_A_output; + root_cluster_A_input = test_case.root_cluster_A_input; + root_cluster_B_output = test_case.root_cluster_B_output; + root_cluster_B_input = test_case.root_cluster_B_input; + root_cluster_C_output = test_case.root_cluster_C_output; + root_cluster_C_input = test_case.root_cluster_C_input; + int ni_C = test_case.ni_C; + int no_C = test_case.no_C; + + // Tree builder + std::unique_ptr>> hmatrix_tree_builder_A, + hmatrix_tree_builder_B, hmatrix_tree_builder_C; + if (sym == 'N') { + hmatrix_tree_builder_A = std::make_unique>>(epsilon, eta, 'N', 'N'); + hmatrix_tree_builder_B = std::make_unique>>(epsilon, eta, 'N', 'N'); + hmatrix_tree_builder_C = std::make_unique>>(epsilon, eta, 'N', 'N'); + } else if (sym == 'S') { + hmatrix_tree_builder_A = std::make_unique>>(epsilon, eta, 'S', UPLO); + hmatrix_tree_builder_B = std::make_unique>>(epsilon, eta, 'S', UPLO); + hmatrix_tree_builder_C = std::make_unique>>(epsilon, eta, 'S', UPLO); + } + + hmatrix_tree_builder_A->set_low_rank_generator(std::make_shared>(*test_case.operator_A)); + hmatrix_tree_builder_A->set_block_tree_consistency(block_tree_consistency); + + // swap input and output clusters if transX is not 'N' + // auto *input_cluster_A = &hmatrix_tree_builder_A->get_source_cluster(); + // auto *output_cluster_A = &hmatrix_tree_builder_A->get_target_cluster(); + // if (transa != 'N') { + // std::swap(input_cluster, output_cluster); + // } + // auto *input_cluster_B = &hmatrix_tree_builder_B->get_source_cluster(); + // auto *output_cluster_B = &hmatrix_tree_builder_B->get_target_cluster(); + // if (transb != 'N') { + // std::swap(input_cluster_B, output_cluster_B); + // } + + // build + auto hmatrix_task_based_A = hmatrix_tree_builder_A->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); + auto hmatrix_task_based_B = hmatrix_tree_builder_B->build(*test_case.operator_B, *root_cluster_B_output, *root_cluster_B_input, -1, -1, true, 64); + auto hmatrix_classic_C = hmatrix_tree_builder_C->build(*test_case.operator_C, *root_cluster_C_output, *root_cluster_C_input, -1, -1, false, 64); + auto hmatrix_task_based_C = hmatrix_tree_builder_C->build(*test_case.operator_C, *root_cluster_C_output, *root_cluster_C_input, -1, -1, true, 64); + std::vector *> L0 = hmatrix_tree_builder_C->get_L0(); + std::vector *> L0_A = hmatrix_tree_builder_A->get_L0(); + std::vector *> L0_B = hmatrix_tree_builder_B->get_L0(); + + // visu + // std::cout << "hmatrix_task_based_A:" << std::endl; + // print_hmatrix_information(hmatrix_task_based_A, std::cout); + // std::cout << "hmatrix_task_based_B:" << std::endl; + // print_hmatrix_information(hmatrix_task_based_B, std::cout); + // std::cout << "hmatrix_classic_C:" << std::endl; + // print_hmatrix_information(hmatrix_classic_C, std::cout); + // std::cout << "hmatrix_task_based_C:" << std::endl; + // print_hmatrix_information(hmatrix_task_based_C, std::cout); + save_leaves_with_rank(hmatrix_task_based_A, "hmatrix_task_based_AAAAAAAAAAAAAAAAA"); + save_leaves_with_rank(hmatrix_task_based_B, "hmatrix_task_based_B"); + save_leaves_with_rank(hmatrix_classic_C, "hmatrix_classic_C"); + save_leaves_with_rank(hmatrix_task_based_C, "hmatrix_task_based_C"); + + // parameters + T alpha = T(3); + T beta = T(2); + int nb_products = 1; + + std::chrono::steady_clock::time_point start, end; + + // Perform the classic hmatrix hmatrix product + start = std::chrono::steady_clock::now(); + for (int i = 0; i < nb_products; i++) { + internal_add_hmatrix_hmatrix_product(transa, transb, alpha, hmatrix_task_based_A, hmatrix_task_based_B, beta, hmatrix_classic_C); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration classic_duration = end - start; + + // Perform the task-based internal add H-matrix hamtrix product + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + for (int i = 0; i < nb_products; i++) { + task_based_internal_add_hmatrix_hmatrix_product(transa, transb, alpha, hmatrix_task_based_A, hmatrix_task_based_B, beta, hmatrix_task_based_C, L0_A, L0_B, L0); + } + } + end = std::chrono::steady_clock::now(); + std::chrono::duration task_based_duration = end - start; + + // desify results + Matrix densified_hmatrix_classic_C(no_C, ni_C), densified_hmatrix_task_based_C(no_C, ni_C); + copy_to_dense(hmatrix_classic_C, densified_hmatrix_classic_C.data()); + copy_to_dense(hmatrix_task_based_C, densified_hmatrix_task_based_C.data()); + // Matrix test = densified_hmatrix_classic_C - densified_hmatrix_task_based_C; + // std::ofstream dense_classic_file("dense_classic.csv"); + // std::ofstream dense_task_file("dense_task.csv"); + // std::ofstream test_file("test.csv"); + // densified_hmatrix_classic_C.print(dense_classic_file, ","); + // densified_hmatrix_task_based_C.print(dense_task_file, ","); + // test.print(test_file, ","); + + // Compare the results + is_error = is_error || (std::isnan(normFrob(densified_hmatrix_classic_C - densified_hmatrix_task_based_C) / normFrob(densified_hmatrix_classic_C)) > nb_products * error_tol); + + // Print the results + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + + std::cout << " normFrob(densified_hmatrix_classic_C) = " << normFrob(densified_hmatrix_classic_C) << std::endl; + std::cout << " normFrob(densified_hmatrix_task_based_C) = " << normFrob(densified_hmatrix_task_based_C) << std::endl; + + std::cout << " normFrob(classic_C - task_based_C) / normFrob(classic_C) = " << normFrob(densified_hmatrix_classic_C - densified_hmatrix_task_based_C) / normFrob(densified_hmatrix_classic_C) << std::endl + << std::endl; + + std::cout << " classic_duration = " << classic_duration.count() << std::endl; + std::cout << " task_based_duration = " << task_based_duration.count() << std::endl; + + // check durations + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_hmatrix_hmatrix_product current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_hmatrix_hmatrix_product current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} + +template +bool test_task_based_hmatrix_triangular_solve(const TestCaseType &test_case, char side, char transa, char diag, double epsilon) { + + bool is_error = false; + double eta = 10; + htool::underlying_type error; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_internal_triangular_hmatrix_hmatrix_solve tests...\n"; + + // Random input + T alpha(1); + generate_random_scalar(alpha); + + // HMatrix + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N'); + HMatrixTreeBuilder> hmatrix_tree_builder_X(epsilon, eta, 'N', 'N'); + + HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input, -1, -1, true, 64); + HMatrix> X = hmatrix_tree_builder_X.build(*test_case.operator_X, *test_case.root_cluster_X_output, *test_case.root_cluster_X_input, -1, -1, true, 64); + HMatrix> B(X), UB(X), LB(X); + HMatrix> hmatrix_test(B); + + if (diag == 'U') { // to avoid conditioning issues with the unit diagonal + std::vector diagonal(A.nb_cols()); + copy_diagonal(A, diagonal.data()); + auto max_abs = *std::max_element(diagonal.begin(), diagonal.end(), [](const T &a, const T &b) { + return std::abs(a) < std::abs(b); + }); + scale(T(1) / (T(10) * std::abs(max_abs)), A); + } + + // Triangular hmatrices + HMatrix> LA(A); + HMatrix> UA(A); + preorder_tree_traversal(LA, [&diag](HMatrix> &hmatrix) { + if (hmatrix.is_leaf() and hmatrix.get_target_cluster() == hmatrix.get_source_cluster()) { + Matrix &dense_data = *hmatrix.get_dense_data(); + for (int i = 0; i < dense_data.nb_rows(); i++) { + if (diag == 'U') { + dense_data(i, i) = 1; + } + for (int j = i + 1; j < dense_data.nb_cols(); j++) { + dense_data(i, j) = 0; + } + } + } else { + std::vector>>> filtered_children; + for (auto &child : hmatrix.get_children_with_ownership()) { + if (child->get_target_cluster().get_offset() >= child->get_source_cluster().get_offset()) { + filtered_children.push_back(std::move(child)); + } + } + if (filtered_children.size() > 0) { + hmatrix.delete_children(); + hmatrix.assign_children(filtered_children); + } + } + }); + + preorder_tree_traversal(UA, [&diag](HMatrix> &hmatrix) { + if (hmatrix.is_leaf() and hmatrix.get_target_cluster() == hmatrix.get_source_cluster()) { + Matrix &dense_data = *hmatrix.get_dense_data(); + for (int j = 0; j < dense_data.nb_cols(); j++) { + if (diag == 'U') { + dense_data(j, j) = 1; + } + for (int i = j + 1; i < dense_data.nb_rows(); i++) { + dense_data(i, j) = 0; + } + } + } else { + std::vector>>> filtered_children; + for (auto &child : hmatrix.get_children_with_ownership()) { + if (child->get_target_cluster().get_offset() <= child->get_source_cluster().get_offset()) { + filtered_children.push_back(std::move(child)); + } + } + if (filtered_children.size() > 0) { + hmatrix.delete_children(); + hmatrix.assign_children(filtered_children); + } + } + }); + + // Matrix + int ni_A = test_case.root_cluster_A_input->get_size(); + int no_A = test_case.root_cluster_A_output->get_size(); + int ni_X = test_case.root_cluster_X_input->get_size(); + int no_X = test_case.root_cluster_X_output->get_size(); + Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; + test_case.operator_A->copy_submatrix(no_A, ni_A, test_case.root_cluster_A_output->get_offset(), test_case.root_cluster_A_input->get_offset(), A_dense.data()); + test_case.operator_X->copy_submatrix(no_X, ni_X, test_case.root_cluster_X_output->get_offset(), test_case.root_cluster_X_input->get_offset(), X_dense.data()); + + // Triangular matrices + if (side == 'L') { + internal_add_hmatrix_hmatrix_product(transa, 'N', T(1) / alpha, UA, X, T(0), UB); + internal_add_hmatrix_hmatrix_product(transa, 'N', T(1) / alpha, LA, X, T(0), LB); + } else { + internal_add_hmatrix_hmatrix_product('N', transa, T(1) / alpha, X, UA, T(0), UB); + internal_add_hmatrix_hmatrix_product('N', transa, T(1) / alpha, X, LA, T(0), LB); + } + + // save_leaves_with_rank(LA, "LA"); + // save_leaves_with_rank(LB, "LB"); + + // Tests + std::chrono::steady_clock::time_point start, end; + int max_nb_nodes = 32; + std::vector *> L0_LA = find_l0(LA, max_nb_nodes); + std::vector *> L0_UA = find_l0(UA, max_nb_nodes); + std::vector *> L0_test; + + //// internal_triangular_hmatrix_hmatrix_solve Lower + ////// Classic + hmatrix_test = LB; + start = std::chrono::steady_clock::now(); + internal_triangular_hmatrix_hmatrix_solve(side, 'L', transa, diag, alpha, LA, hmatrix_test); + end = std::chrono::steady_clock::now(); + std::chrono::duration classic_duration = end - start; + copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); + error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << ">> Lower case: " << endl; + cout << "> classic errors = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + ////// Task-based + hmatrix_test = LB; + L0_test = find_l0(hmatrix_test, max_nb_nodes); + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_internal_triangular_hmatrix_hmatrix_solve(side, 'L', transa, diag, alpha, LA, hmatrix_test, L0_LA, L0_test); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration task_based_duration = end - start; + copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); + error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based errors = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // internal_triangular_hmatrix_hmatrix_solve Upper + //// Classic + hmatrix_test = UB; + start = std::chrono::steady_clock::now(); + internal_triangular_hmatrix_hmatrix_solve(side, 'U', transa, diag, alpha, UA, hmatrix_test); + end = std::chrono::steady_clock::now(); + classic_duration = end - start; + copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); + error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << ">> Upper case: " << endl; + cout << "> classic error = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + //// Task-based + hmatrix_test = UB; + L0_test = find_l0(hmatrix_test, max_nb_nodes); + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_internal_triangular_hmatrix_hmatrix_solve(side, 'U', transa, diag, alpha, UA, hmatrix_test, L0_UA, L0_test); + } + end = std::chrono::steady_clock::now(); + task_based_duration = end - start; + copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); + error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based errors = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_hmatrix_triangular_solve current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_hmatrix_triangular_solve current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} // end of test_task_based_hmatrix_triangular_solve + +template +bool test_task_based_lu_factorization(char trans, int n1, int n2, htool::underlying_type epsilon) { + + bool is_error = false; + double eta = 100; + htool::underlying_type error; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_lu_factorization tests...\n"; + + // Setup test case + htool::TestCaseSolve test_case('L', trans, n1, n2, 1, -1); + + // HMatrix + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N'); + HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); + + // Matrix + int ni_A = test_case.root_cluster_A_input->get_size(); + int no_A = test_case.root_cluster_A_output->get_size(); + int ni_X = test_case.root_cluster_X_input->get_size(); + int no_X = test_case.root_cluster_X_output->get_size(); + Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; + std::vector identity(ni_A); + std::iota(identity.begin(), identity.end(), test_case.root_cluster_A_output->get_offset()); + test_case.operator_in_user_numbering_A->copy_submatrix(no_A, ni_A, identity.data(), identity.data(), A_dense.data()); + generate_random_matrix(X_dense); + add_matrix_matrix_product(trans, 'N', T(1.), A_dense, X_dense, T(0.), B_dense); + + // Tests + std::chrono::steady_clock::time_point start, end; + + //// Classic LU factorization + auto A_classic = A; + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); + lu_factorization(A_classic); + end = std::chrono::steady_clock::now(); + // save_leaves_with_rank(A_classic, "classic_hmatrix_facto"); + lu_solve(trans, A_classic, matrix_test); + std::chrono::duration classic_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> classic error = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + //// Task-based LU factorization + auto A_task_based = A; + int max_nb_nodes = 64; + std::vector *> L0_A = find_l0(A_task_based, max_nb_nodes); + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_lu_factorization(A_task_based, L0_A); + } + end = std::chrono::steady_clock::now(); + // save_leaves_with_rank(A_task_based, "TB_hmatrix_facto"); + lu_solve(trans, A_task_based, matrix_test); + std::chrono::duration task_based_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based error = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << std::endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_lu_factorization current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_lu_factorization current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} + +template ::value, bool> = true> +bool test_task_based_cholesky_factorization(char UPLO, int n1, int n2, htool::underlying_type epsilon) { + bool is_error = false; + double eta = 100; + htool::underlying_type error; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_cholesky_factorization tests...\n"; + + // Setup test case + htool::TestCaseSolve test_case('L', 'N', n1, n2, 1, -1); + + // HMatrix + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, is_complex() ? 'H' : 'S', UPLO); + HMatrix> HA = hmatrix_tree_builder_A.build(*test_case.operator_in_user_numbering_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); + + // Matrix + int ni_A = test_case.root_cluster_A_input->get_size(); + int no_A = test_case.root_cluster_A_output->get_size(); + int ni_X = test_case.root_cluster_X_input->get_size(); + int no_X = test_case.root_cluster_X_output->get_size(); + Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; + std::vector identity(ni_A); + std::iota(identity.begin(), identity.end(), test_case.root_cluster_A_output->get_offset()); + test_case.operator_in_user_numbering_A->copy_submatrix(no_A, ni_A, identity.data(), identity.data(), A_dense.data()); + generate_random_matrix(X_dense); + add_symmetric_matrix_matrix_product('L', UPLO, T(1.), A_dense, X_dense, T(0.), B_dense); + + // Tests + std::chrono::steady_clock::time_point start, end; + auto task_based_HA = HA; + + //// Classic Cholesky factorization + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); + cholesky_factorization(UPLO, HA); + end = std::chrono::steady_clock::now(); + cholesky_solve(UPLO, HA, matrix_test); + + std::chrono::duration classic_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> classic error = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + //// Task-based Cholesky factorization + int max_nb_nodes = 64; + std::vector *> L0_A = find_l0(task_based_HA, max_nb_nodes); + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_cholesky_factorization(UPLO, task_based_HA, L0_A); + } + end = std::chrono::steady_clock::now(); + cholesky_solve(UPLO, task_based_HA, matrix_test); + + std::chrono::duration task_based_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based error = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << std::endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_cholesky_factorization current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_cholesky_factorization current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} // end of test_task_based_cholesky_factorization + +template ::value, bool> = true> +bool test_task_based_cholesky_factorization(char UPLO, int n1, int n2, htool::underlying_type epsilon) { + bool is_error = false; + double eta = 100; + htool::underlying_type error; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_cholesky_factorization tests...\n"; + + // Setup test case + htool::TestCaseSolve test_case('L', 'N', n1, n2, 1, -1); + + // HMatrix + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, is_complex() ? 'H' : 'S', UPLO); + HMatrix> HA = hmatrix_tree_builder_A.build(*test_case.operator_in_user_numbering_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); + + // Matrix + int ni_A = test_case.root_cluster_A_input->get_size(); + int no_A = test_case.root_cluster_A_output->get_size(); + int ni_X = test_case.root_cluster_X_input->get_size(); + int no_X = test_case.root_cluster_X_output->get_size(); + Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; + std::vector identity(ni_A); + std::iota(identity.begin(), identity.end(), test_case.root_cluster_A_output->get_offset()); + test_case.operator_in_user_numbering_A->copy_submatrix(no_A, ni_A, identity.data(), identity.data(), A_dense.data()); + generate_random_matrix(X_dense); + add_hermitian_matrix_matrix_product('L', UPLO, T(1.), A_dense, X_dense, T(0.), B_dense); + + // Tests + std::chrono::steady_clock::time_point start, end; + auto task_based_HA = HA; + + //// Classic Cholesky factorization + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); + cholesky_factorization(UPLO, HA); + end = std::chrono::steady_clock::now(); + cholesky_solve(UPLO, HA, matrix_test); + std::chrono::duration classic_duration = end - start; + + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> classic error = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + //// Task-based Cholesky factorization + int max_nb_nodes = 64; + std::vector *> L0_A = find_l0(task_based_HA, max_nb_nodes); + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_cholesky_factorization(UPLO, task_based_HA, L0_A); + } + end = std::chrono::steady_clock::now(); + cholesky_solve(UPLO, task_based_HA, matrix_test); + + std::chrono::duration task_based_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based error = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << std::endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_cholesky_factorization current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_cholesky_factorization current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} diff --git a/tests/functional_tests/hmatrix/test_hmatrix_triangular_solve.hpp b/tests/functional_tests/hmatrix/test_hmatrix_triangular_solve.hpp index 5c5671ec..c1c8d676 100644 --- a/tests/functional_tests/hmatrix/test_hmatrix_triangular_solve.hpp +++ b/tests/functional_tests/hmatrix/test_hmatrix_triangular_solve.hpp @@ -21,7 +21,7 @@ using namespace std; using namespace htool; template -bool test_hmatrix_triangular_solve(char side, char transa, int n1, int n2, htool::underlying_type epsilon, htool::underlying_type margin) { +bool test_hmatrix_triangular_solve(char side, char transa, char diag, int n1, int n2, htool::underlying_type epsilon, htool::underlying_type margin) { bool is_error = false; double eta = 10; htool::underlying_type error; @@ -34,27 +34,37 @@ bool test_hmatrix_triangular_solve(char side, char transa, int n1, int n2, htool htool::TestCaseSolve test_case(side, transa, n1, n2, 1, -1); // HMatrix - HMatrixTreeBuilder> hmatrix_tree_builder_A(*test_case.root_cluster_A_output, *test_case.root_cluster_A_input, epsilon, eta, 'N', 'N', -1, -1, -1); - HMatrixTreeBuilder> hmatrix_tree_builder_X(*test_case.root_cluster_X_output, *test_case.root_cluster_X_input, epsilon, eta, 'N', 'N', -1, -1, -1); + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N'); + HMatrixTreeBuilder> hmatrix_tree_builder_X(epsilon, eta, 'N', 'N'); - HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A); - HMatrix> X = hmatrix_tree_builder_X.build(*test_case.operator_X); + HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); + HMatrix> X = hmatrix_tree_builder_X.build(*test_case.operator_X, *test_case.root_cluster_X_output, *test_case.root_cluster_X_input); HMatrix> B(X), UB(X), LB(X); HMatrix> hmatrix_test(B); // Lrmat rhs - SVD compressor; htool::underlying_type lrmat_tolerance = 1e-3; - LowRankMatrix X_lrmat(*test_case.operator_X, compressor, *test_case.root_cluster_X_output, *test_case.root_cluster_X_input, 15, lrmat_tolerance); - LowRankMatrix lrmat_test(epsilon); + SVD compressor(*test_case.operator_X); + int reqrank = 15; + LowRankMatrix X_lrmat(test_case.root_cluster_X_output->get_size(), test_case.root_cluster_X_input->get_size(), lrmat_tolerance, reqrank); + LowRankMatrix lrmat_test(test_case.root_cluster_X_output->get_size(), test_case.root_cluster_X_input->get_size(), epsilon); + compressor.copy_low_rank_approximation(test_case.root_cluster_X_output->get_size(), test_case.root_cluster_X_input->get_size(), test_case.root_cluster_X_output->get_offset(), test_case.root_cluster_X_input->get_offset(), X_lrmat); // Triangular hmatrices + if (diag == 'U') { + std::vector diagonal(A.nb_cols()); + copy_diagonal(A, diagonal.data()); + scale(1. / (*std::max_element(diagonal.begin(), diagonal.end(), [](T a, T b) { return (std::abs(a) < std::abs(b)); })), A); + } HMatrix> LA(A); HMatrix> UA(A); - preorder_tree_traversal(LA, [](HMatrix> &hmatrix) { + preorder_tree_traversal(LA, [&diag](HMatrix> &hmatrix) { if (hmatrix.is_leaf() and hmatrix.get_target_cluster() == hmatrix.get_source_cluster()) { Matrix &dense_data = *hmatrix.get_dense_data(); for (int i = 0; i < dense_data.nb_rows(); i++) { + if (diag == 'U') { + dense_data(i, i) = 1; + } for (int j = i + 1; j < dense_data.nb_cols(); j++) { dense_data(i, j) = 0; } @@ -73,10 +83,13 @@ bool test_hmatrix_triangular_solve(char side, char transa, int n1, int n2, htool } }); - preorder_tree_traversal(UA, [](HMatrix> &hmatrix) { + preorder_tree_traversal(UA, [&diag](HMatrix> &hmatrix) { if (hmatrix.is_leaf() and hmatrix.get_target_cluster() == hmatrix.get_source_cluster()) { Matrix &dense_data = *hmatrix.get_dense_data(); for (int j = 0; j < dense_data.nb_cols(); j++) { + if (diag == 'U') { + dense_data(j, j) = 1; + } for (int i = j + 1; i < dense_data.nb_rows(); i++) { dense_data(i, j) = 0; } @@ -105,10 +118,21 @@ bool test_hmatrix_triangular_solve(char side, char transa, int n1, int n2, htool test_case.operator_X->copy_submatrix(no_X, ni_X, test_case.root_cluster_X_output->get_offset(), test_case.root_cluster_X_input->get_offset(), X_dense.data()); dense_X_lrmat.resize(X_lrmat.nb_rows(), X_lrmat.nb_cols()); X_lrmat.copy_to_dense(dense_X_lrmat.data()); + if (diag == 'U') { + T max = 0; + for (int i = 0; i < A_dense.nb_cols(); i++) { + max = std::max(std::abs(max), std::abs(A_dense(i, i))); + } + scale(1. / max, A_dense); + } // Triangular matrices Matrix LA_dense(A_dense), UA_dense(A_dense), LB_dense(B.nb_rows(), B.nb_cols()), UB_dense(B.nb_rows(), B.nb_cols()); for (int i = 0; i < A.nb_rows(); i++) { + if (diag == 'U') { + UA_dense(i, i) = 1; + LA_dense(i, i) = 1; + } for (int j = 0; j < A.nb_cols(); j++) { if (i > j) { UA_dense(i, j) = 0; @@ -118,7 +142,7 @@ bool test_hmatrix_triangular_solve(char side, char transa, int n1, int n2, htool } } } - LowRankMatrix UB_lrmat(epsilon), LB_lrmat(epsilon); + LowRankMatrix UB_lrmat(UA_dense.nb_rows(), X_lrmat.nb_cols(), epsilon), LB_lrmat(LA_dense.nb_rows(), X_lrmat.nb_cols(), epsilon); if (side == 'L') { add_matrix_matrix_product(transa, 'N', T(1) / alpha, UA_dense, X_dense, T(0), UB_dense); add_matrix_matrix_product(transa, 'N', T(1) / alpha, LA_dense, X_dense, T(0), LB_dense); @@ -152,19 +176,19 @@ bool test_hmatrix_triangular_solve(char side, char transa, int n1, int n2, htool // triangular_matrix_matrix_solve matrix_test = LB_dense; - internal_triangular_hmatrix_matrix_solve(side, 'L', transa, 'N', alpha, LA, matrix_test); + internal_triangular_hmatrix_matrix_solve(side, 'L', transa, diag, alpha, LA, matrix_test); error = normFrob(X_dense - matrix_test) / normFrob(X_dense); is_error = is_error || !(error < epsilon * margin); cout << "> Errors on lower triangular hmatrix matrix solve: " << error << endl; matrix_test = UB_dense; - internal_triangular_hmatrix_matrix_solve(side, 'U', transa, 'N', alpha, UA, matrix_test); + internal_triangular_hmatrix_matrix_solve(side, 'U', transa, diag, alpha, UA, matrix_test); error = normFrob(X_dense - matrix_test) / normFrob(X_dense); is_error = is_error || !(error < epsilon * margin); cout << "> Errors on upper triangular hmatrix matrix solve: " << error << endl; lrmat_test = LB_lrmat; - internal_triangular_hmatrix_lrmat_solve(side, 'L', transa, 'N', alpha, LA, lrmat_test); + internal_triangular_hmatrix_lrmat_solve(side, 'L', transa, diag, alpha, LA, lrmat_test); dense_lrmat_test.resize(lrmat_test.get_U().nb_rows(), lrmat_test.get_V().nb_cols()); lrmat_test.copy_to_dense(dense_lrmat_test.data()); error = normFrob(dense_X_lrmat - dense_lrmat_test) / normFrob(dense_X_lrmat); @@ -172,7 +196,7 @@ bool test_hmatrix_triangular_solve(char side, char transa, int n1, int n2, htool cout << "> Errors on lower triangular hmatrix lrmat solve: " << error << endl; lrmat_test = UB_lrmat; - internal_triangular_hmatrix_lrmat_solve(side, 'U', transa, 'N', alpha, UA, lrmat_test); + internal_triangular_hmatrix_lrmat_solve(side, 'U', transa, diag, alpha, UA, lrmat_test); dense_lrmat_test.resize(lrmat_test.get_U().nb_rows(), lrmat_test.get_V().nb_cols()); lrmat_test.copy_to_dense(dense_lrmat_test.data()); error = normFrob(dense_X_lrmat - dense_lrmat_test) / normFrob(dense_X_lrmat); @@ -180,14 +204,14 @@ bool test_hmatrix_triangular_solve(char side, char transa, int n1, int n2, htool cout << "> Errors on upper triangular hmatrix lrmat solve: " << error << endl; hmatrix_test = LB; - internal_triangular_hmatrix_hmatrix_solve(side, 'L', transa, 'N', alpha, LA, hmatrix_test); + internal_triangular_hmatrix_hmatrix_solve(side, 'L', transa, diag, alpha, LA, hmatrix_test); copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); is_error = is_error || !(error < epsilon * margin); cout << "> Errors on lower triangular hmatrix hmatrix solve: " << error << endl; hmatrix_test = UB; - internal_triangular_hmatrix_hmatrix_solve(side, 'U', transa, 'N', alpha, UA, hmatrix_test); + internal_triangular_hmatrix_hmatrix_solve(side, 'U', transa, diag, alpha, UA, hmatrix_test); copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); is_error = is_error || !(error < epsilon * margin); diff --git a/tests/functional_tests/hmatrix/test_lrmat_hmatrix_addition.hpp b/tests/functional_tests/hmatrix/test_lrmat_hmatrix_addition.hpp index cdcfc14f..e2b62af9 100644 --- a/tests/functional_tests/hmatrix/test_lrmat_hmatrix_addition.hpp +++ b/tests/functional_tests/hmatrix/test_lrmat_hmatrix_addition.hpp @@ -21,13 +21,14 @@ bool test_lrmat_hmatrix_addition(int n1, int n2, htool::underlying_type epsil TestCaseProduct test_case('N', 'N', n1, n2, n2, 1, 4); // HMatrix - HMatrixTreeBuilder> hmatrix_tree_builder_A(*test_case.root_cluster_A_output, *test_case.root_cluster_A_input, epsilon, eta, 'N', 'N', -1, -1, -1); - HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A); + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N'); + HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); HMatrix> hmatrix_test(A); // lrmat - Compressor compressor; - LowRankMatrix C_approximation(*test_case.operator_C, compressor, *test_case.root_cluster_C_output, *test_case.root_cluster_C_input, -1, epsilon); + Compressor compressor(*test_case.operator_C); + LowRankMatrix C_approximation(test_case.root_cluster_C_output->get_size(), test_case.root_cluster_C_input->get_size(), epsilon); + compressor.copy_low_rank_approximation(test_case.root_cluster_C_output->get_size(), test_case.root_cluster_C_input->get_size(), test_case.root_cluster_C_output->get_offset(), test_case.root_cluster_C_input->get_offset(), C_approximation); // Reference Matrix A_dense(A.nb_rows(), A.nb_cols(), 0); diff --git a/tests/functional_tests/hmatrix/test_lrmat_hmatrix_product.hpp b/tests/functional_tests/hmatrix/test_lrmat_hmatrix_product.hpp index 92a098c4..da4ea1be 100644 --- a/tests/functional_tests/hmatrix/test_lrmat_hmatrix_product.hpp +++ b/tests/functional_tests/hmatrix/test_lrmat_hmatrix_product.hpp @@ -57,15 +57,13 @@ bool test_lrmat_hmatrix_product(const TestCaseProduct &tes } } - HMatrixTreeBuilder> hmatrix_tree_builder(*root_cluster_B_output, *root_cluster_B_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); - hmatrix_tree_builder.set_low_rank_generator(std::make_shared>()); + HMatrixTreeBuilder> hmatrix_tree_builder(epsilon, eta, 'N', 'N', -1, std::make_shared>(*test_case.operator_B)); - HMatrixTreeBuilder> hmatrix_tree_builder_C(*root_cluster_C_output, *root_cluster_C_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); - hmatrix_tree_builder_C.set_low_rank_generator(std::make_shared>()); + HMatrixTreeBuilder> hmatrix_tree_builder_C(epsilon, eta, 'N', 'N', -1, std::make_shared>(*test_case.operator_C)); // build - HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_B); - HMatrix> C = hmatrix_tree_builder_C.build(*test_case.operator_C); + HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_B, *root_cluster_B_output, *root_cluster_B_input); + HMatrix> C = hmatrix_tree_builder_C.build(*test_case.operator_C, *root_cluster_C_output, *root_cluster_C_input); HMatrix> hmatrix_test(C); // Dense matrix @@ -83,12 +81,16 @@ bool test_lrmat_hmatrix_product(const TestCaseProduct &tes Matrix matrix_test, dense_lrmat_test; // lrmat - SVD compressor; htool::underlying_type lrmat_tolerance = 1e-6; // std::unique_ptr> A_auto_approximation, C_auto_approximation; - LowRankMatrix lrmat_test(epsilon); - LowRankMatrix A_auto_approximation(*test_case.operator_A, compressor, *root_cluster_A_output, *root_cluster_A_input, -1, lrmat_tolerance); - LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tolerance); + LowRankMatrix lrmat_test(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), epsilon); + SVD compressor_A(*test_case.operator_A); + LowRankMatrix A_auto_approximation(root_cluster_A_output->get_size(), root_cluster_A_input->get_size(), lrmat_tolerance); + compressor_A.copy_low_rank_approximation(root_cluster_A_output->get_size(), root_cluster_A_input->get_size(), root_cluster_A_output->get_offset(), root_cluster_A_input->get_offset(), A_auto_approximation); + + SVD compressor_C(*test_case.operator_C); + LowRankMatrix C_auto_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tolerance); + compressor_C.copy_low_rank_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), root_cluster_C_output->get_offset(), root_cluster_C_input->get_offset(), C_auto_approximation); // Random Input matrix T alpha(1), beta(1), scaling_coefficient; diff --git a/tests/functional_tests/hmatrix/test_matrix_hmatrix_product.hpp b/tests/functional_tests/hmatrix/test_matrix_hmatrix_product.hpp index 1d7d1a78..9276be7e 100644 --- a/tests/functional_tests/hmatrix/test_matrix_hmatrix_product.hpp +++ b/tests/functional_tests/hmatrix/test_matrix_hmatrix_product.hpp @@ -58,11 +58,10 @@ bool test_matrix_hmatrix_product(const TestCaseProduct &te } } - HMatrixTreeBuilder> hmatrix_tree_builder(*root_cluster_B_output, *root_cluster_B_input, epsilon, eta, 'N', 'N', -1, -1, rankWorld); - hmatrix_tree_builder.set_low_rank_generator(std::make_shared>()); + HMatrixTreeBuilder> hmatrix_tree_builder(epsilon, eta, 'N', 'N', -1, std ::make_shared>(*test_case.operator_B)); // build - HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_B); + HMatrix> root_hmatrix = hmatrix_tree_builder.build(*test_case.operator_B, *root_cluster_B_output, *root_cluster_B_input); // Dense matrix int ni_A = root_cluster_A_input->get_size(); @@ -82,9 +81,10 @@ bool test_matrix_hmatrix_product(const TestCaseProduct &te copy_to_dense(root_hmatrix, HB_dense.data()); // lrmat - SVD compressor; + SVD compressor(*test_case.operator_C); htool::underlying_type lrmat_tolerance = 0.0001; - LowRankMatrix C_auto_approximation(*test_case.operator_C, compressor, *root_cluster_C_output, *root_cluster_C_input, -1, lrmat_tolerance), lrmat_test(lrmat_tolerance); + LowRankMatrix C_auto_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tolerance), lrmat_test(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), lrmat_tolerance); + compressor.copy_low_rank_approximation(root_cluster_C_output->get_size(), root_cluster_C_input->get_size(), root_cluster_C_output->get_offset(), root_cluster_C_input->get_offset(), C_auto_approximation); // Random Input matrix std::vector B_vec, C_vec, test_vec; diff --git a/tests/functional_tests/hmatrix/test_task_based_hmatrix_factorization.hpp b/tests/functional_tests/hmatrix/test_task_based_hmatrix_factorization.hpp new file mode 100644 index 00000000..29d7b959 --- /dev/null +++ b/tests/functional_tests/hmatrix/test_task_based_hmatrix_factorization.hpp @@ -0,0 +1,283 @@ +#include // for pow, sqrt +#include // for size_t +#include // for norm2 +#include // for Cluster... +#include +#include // for Cluster... +#include // for copy_di... +#include // for print_d... +#include // for print_h... +#include // for view_block_tree... +#include // for Generat... +#include // for add_... +#include +#include +#include // for task_bas... +#include // for task_bas... +#include // for task_based_lu_factorization +#include // for task_based_triangular_hmatrix_hmatrix_solve +#include // for triangular_hmatrix_hmatrix_solve + +#include +#include // for enumerate_dependence, find_l0... +#include // for HMatrix... +#include // for Matrix +#include // for underly... +#include // for NbrToStr +#include +#include // for TestCaseSymmetricPro... +#include +#include // for create_... +#include // for test_pa... +#include // for operator<< +#include // for make_sh... +#include // for operator+ +#include // for vector + +using namespace std; +using namespace htool; + +template +bool test_task_based_lu_factorization(char trans, int n1, int n2, htool::underlying_type epsilon) { + + bool is_error = false; + double eta = 100; + htool::underlying_type error; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_lu_factorization tests...\n"; + + // Setup test case + htool::TestCaseSolve test_case('L', trans, n1, n2, 1, -1); + + // HMatrix + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N'); + HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); + + // Matrix + int ni_A = test_case.root_cluster_A_input->get_size(); + int no_A = test_case.root_cluster_A_output->get_size(); + int ni_X = test_case.root_cluster_X_input->get_size(); + int no_X = test_case.root_cluster_X_output->get_size(); + Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; + std::vector identity(ni_A); + std::iota(identity.begin(), identity.end(), test_case.root_cluster_A_output->get_offset()); + test_case.operator_in_user_numbering_A->copy_submatrix(no_A, ni_A, identity.data(), identity.data(), A_dense.data()); + generate_random_matrix(X_dense); + add_matrix_matrix_product(trans, 'N', T(1.), A_dense, X_dense, T(0.), B_dense); + + // Tests + std::chrono::steady_clock::time_point start, end; + + //// Classic LU factorization + auto A_classic = A; + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); + lu_factorization(A_classic); + end = std::chrono::steady_clock::now(); + // save_leaves_with_rank(A_classic, "classic_hmatrix_facto"); + lu_solve(trans, A_classic, matrix_test); + std::chrono::duration classic_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> classic error = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + //// Task-based LU factorization + auto A_task_based = A; + int max_nb_nodes = 64; + std::vector *> L0_A = find_l0(A_task_based, max_nb_nodes); + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_lu_factorization(A_task_based, L0_A); + } + end = std::chrono::steady_clock::now(); + // save_leaves_with_rank(A_task_based, "TB_hmatrix_facto"); + lu_solve(trans, A_task_based, matrix_test); + std::chrono::duration task_based_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based error = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << std::endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_lu_factorization current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_lu_factorization current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} + +template ::value, bool> = true> +bool test_task_based_cholesky_factorization(char UPLO, int n1, int n2, htool::underlying_type epsilon) { + bool is_error = false; + double eta = 100; + htool::underlying_type error; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_cholesky_factorization tests...\n"; + + // Setup test case + htool::TestCaseSolve test_case('L', 'N', n1, n2, 1, -1); + + // HMatrix + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, is_complex() ? 'H' : 'S', UPLO); + HMatrix> HA = hmatrix_tree_builder_A.build(*test_case.operator_in_user_numbering_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); + + // Matrix + int ni_A = test_case.root_cluster_A_input->get_size(); + int no_A = test_case.root_cluster_A_output->get_size(); + int ni_X = test_case.root_cluster_X_input->get_size(); + int no_X = test_case.root_cluster_X_output->get_size(); + Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; + std::vector identity(ni_A); + std::iota(identity.begin(), identity.end(), test_case.root_cluster_A_output->get_offset()); + test_case.operator_in_user_numbering_A->copy_submatrix(no_A, ni_A, identity.data(), identity.data(), A_dense.data()); + generate_random_matrix(X_dense); + add_symmetric_matrix_matrix_product('L', UPLO, T(1.), A_dense, X_dense, T(0.), B_dense); + + // Tests + std::chrono::steady_clock::time_point start, end; + auto task_based_HA = HA; + + //// Classic Cholesky factorization + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); + cholesky_factorization(UPLO, HA); + end = std::chrono::steady_clock::now(); + cholesky_solve(UPLO, HA, matrix_test); + + std::chrono::duration classic_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> classic error = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + //// Task-based Cholesky factorization + int max_nb_nodes = 64; + std::vector *> L0_A = find_l0(task_based_HA, max_nb_nodes); + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_cholesky_factorization(UPLO, task_based_HA, L0_A); + } + end = std::chrono::steady_clock::now(); + cholesky_solve(UPLO, task_based_HA, matrix_test); + + std::chrono::duration task_based_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based error = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << std::endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_cholesky_factorization current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_cholesky_factorization current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} // end of test_task_based_cholesky_factorization + +template ::value, bool> = true> +bool test_task_based_cholesky_factorization(char UPLO, int n1, int n2, htool::underlying_type epsilon) { + bool is_error = false; + double eta = 100; + htool::underlying_type error; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_cholesky_factorization tests...\n"; + + // Setup test case + htool::TestCaseSolve test_case('L', 'N', n1, n2, 1, -1); + + // HMatrix + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, is_complex() ? 'H' : 'S', UPLO); + HMatrix> HA = hmatrix_tree_builder_A.build(*test_case.operator_in_user_numbering_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input); + + // Matrix + int ni_A = test_case.root_cluster_A_input->get_size(); + int no_A = test_case.root_cluster_A_output->get_size(); + int ni_X = test_case.root_cluster_X_input->get_size(); + int no_X = test_case.root_cluster_X_output->get_size(); + Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; + std::vector identity(ni_A); + std::iota(identity.begin(), identity.end(), test_case.root_cluster_A_output->get_offset()); + test_case.operator_in_user_numbering_A->copy_submatrix(no_A, ni_A, identity.data(), identity.data(), A_dense.data()); + generate_random_matrix(X_dense); + add_hermitian_matrix_matrix_product('L', UPLO, T(1.), A_dense, X_dense, T(0.), B_dense); + + // Tests + std::chrono::steady_clock::time_point start, end; + auto task_based_HA = HA; + + //// Classic Cholesky factorization + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); + cholesky_factorization(UPLO, HA); + end = std::chrono::steady_clock::now(); + cholesky_solve(UPLO, HA, matrix_test); + std::chrono::duration classic_duration = end - start; + + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> classic error = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + //// Task-based Cholesky factorization + int max_nb_nodes = 64; + std::vector *> L0_A = find_l0(task_based_HA, max_nb_nodes); + matrix_test = B_dense; + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_cholesky_factorization(UPLO, task_based_HA, L0_A); + } + end = std::chrono::steady_clock::now(); + cholesky_solve(UPLO, task_based_HA, matrix_test); + + std::chrono::duration task_based_duration = end - start; + error = normFrob(X_dense - matrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based error = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << std::endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_cholesky_factorization current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_cholesky_factorization current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} diff --git a/tests/functional_tests/hmatrix/test_task_based_hmatrix_hmatrix_product.hpp b/tests/functional_tests/hmatrix/test_task_based_hmatrix_hmatrix_product.hpp new file mode 100644 index 00000000..3d707864 --- /dev/null +++ b/tests/functional_tests/hmatrix/test_task_based_hmatrix_hmatrix_product.hpp @@ -0,0 +1,188 @@ +#include // for pow, sqrt +#include // for size_t +#include // for norm2 +#include // for Cluster... +#include +#include // for Cluster... +#include // for copy_di... +#include // for print_d... +#include // for print_h... +#include // for view_block_tree... +#include // for Generat... +#include // for add_... +#include +// #include +#include // for task_bas... +#include // for task_bas... +// #include // for task_based_lu_factorization +// #include // for task_based_triangular_hmatrix_hmatrix_solve +// #include // for triangular_hmatrix_hmatrix_solve + +#include +#include // for enumerate_dependence, find_l0... +#include // for HMatrix... +#include // for Matrix +#include // for underly... +#include // for NbrToStr +#include +#include // for TestCaseSymmetricPro... +#include +#include // for create_... +#include // for test_pa... +#include // for operator<< +#include // for make_sh... +#include // for operator+ +#include // for vector + +using namespace std; +using namespace htool; + +template +bool test_task_based_hmatrix_hmatrix_product(const TestCaseType &test_case, char sym, char transa, char transb, htool::underlying_type epsilon, bool block_tree_consistency, char UPLO = 'L') { + bool is_error = false; + double eta = 10; + double error_tol = 1e-15; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_internal_add_hmatrix_hmatrix_product tests..."; + + // Get test case parameters + const Cluster> *root_cluster_A_output, *root_cluster_A_input, + *root_cluster_B_output, *root_cluster_B_input, *root_cluster_C_output, *root_cluster_C_input; + root_cluster_A_output = test_case.root_cluster_A_output; + root_cluster_A_input = test_case.root_cluster_A_input; + root_cluster_B_output = test_case.root_cluster_B_output; + root_cluster_B_input = test_case.root_cluster_B_input; + root_cluster_C_output = test_case.root_cluster_C_output; + root_cluster_C_input = test_case.root_cluster_C_input; + int ni_C = test_case.ni_C; + int no_C = test_case.no_C; + + // Tree builder + std::unique_ptr>> hmatrix_tree_builder_A, + hmatrix_tree_builder_B, hmatrix_tree_builder_C; + if (sym == 'N') { + hmatrix_tree_builder_A = std::make_unique>>(epsilon, eta, 'N', 'N'); + hmatrix_tree_builder_B = std::make_unique>>(epsilon, eta, 'N', 'N'); + hmatrix_tree_builder_C = std::make_unique>>(epsilon, eta, 'N', 'N'); + } else if (sym == 'S') { + hmatrix_tree_builder_A = std::make_unique>>(epsilon, eta, 'S', UPLO); + hmatrix_tree_builder_B = std::make_unique>>(epsilon, eta, 'S', UPLO); + hmatrix_tree_builder_C = std::make_unique>>(epsilon, eta, 'S', UPLO); + } + + hmatrix_tree_builder_A->set_low_rank_generator(std::make_shared>(*test_case.operator_A)); + hmatrix_tree_builder_A->set_block_tree_consistency(block_tree_consistency); + + // swap input and output clusters if transX is not 'N' + // auto *input_cluster_A = &hmatrix_tree_builder_A->get_source_cluster(); + // auto *output_cluster_A = &hmatrix_tree_builder_A->get_target_cluster(); + // if (transa != 'N') { + // std::swap(input_cluster, output_cluster); + // } + // auto *input_cluster_B = &hmatrix_tree_builder_B->get_source_cluster(); + // auto *output_cluster_B = &hmatrix_tree_builder_B->get_target_cluster(); + // if (transb != 'N') { + // std::swap(input_cluster_B, output_cluster_B); + // } + + // build + auto hmatrix_task_based_A = hmatrix_tree_builder_A->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); + auto hmatrix_task_based_B = hmatrix_tree_builder_B->build(*test_case.operator_B, *root_cluster_B_output, *root_cluster_B_input, -1, -1, true, 64); + auto hmatrix_classic_C = hmatrix_tree_builder_C->build(*test_case.operator_C, *root_cluster_C_output, *root_cluster_C_input, -1, -1, false, 64); + auto hmatrix_task_based_C = hmatrix_tree_builder_C->build(*test_case.operator_C, *root_cluster_C_output, *root_cluster_C_input, -1, -1, true, 64); + std::vector *> L0 = hmatrix_tree_builder_C->get_L0(); + std::vector *> L0_A = hmatrix_tree_builder_A->get_L0(); + std::vector *> L0_B = hmatrix_tree_builder_B->get_L0(); + + // visu + // std::cout << "hmatrix_task_based_A:" << std::endl; + // print_hmatrix_information(hmatrix_task_based_A, std::cout); + // std::cout << "hmatrix_task_based_B:" << std::endl; + // print_hmatrix_information(hmatrix_task_based_B, std::cout); + // std::cout << "hmatrix_classic_C:" << std::endl; + // print_hmatrix_information(hmatrix_classic_C, std::cout); + // std::cout << "hmatrix_task_based_C:" << std::endl; + // print_hmatrix_information(hmatrix_task_based_C, std::cout); + save_leaves_with_rank(hmatrix_task_based_A, "hmatrix_task_based_AAAAAAAAAAAAAAAAA"); + save_leaves_with_rank(hmatrix_task_based_B, "hmatrix_task_based_B"); + save_leaves_with_rank(hmatrix_classic_C, "hmatrix_classic_C"); + save_leaves_with_rank(hmatrix_task_based_C, "hmatrix_task_based_C"); + + // parameters + T alpha = T(3); + T beta = T(2); + int nb_products = 1; + + std::chrono::steady_clock::time_point start, end; + + // Perform the classic hmatrix hmatrix product + start = std::chrono::steady_clock::now(); + for (int i = 0; i < nb_products; i++) { + internal_add_hmatrix_hmatrix_product(transa, transb, alpha, hmatrix_task_based_A, hmatrix_task_based_B, beta, hmatrix_classic_C); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration classic_duration = end - start; + + // Perform the task-based internal add H-matrix hamtrix product + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + for (int i = 0; i < nb_products; i++) { + task_based_internal_add_hmatrix_hmatrix_product(transa, transb, alpha, hmatrix_task_based_A, hmatrix_task_based_B, beta, hmatrix_task_based_C, L0_A, L0_B, L0); + } + } + end = std::chrono::steady_clock::now(); + std::chrono::duration task_based_duration = end - start; + + // desify results + Matrix densified_hmatrix_classic_C(no_C, ni_C), densified_hmatrix_task_based_C(no_C, ni_C); + copy_to_dense(hmatrix_classic_C, densified_hmatrix_classic_C.data()); + copy_to_dense(hmatrix_task_based_C, densified_hmatrix_task_based_C.data()); + // Matrix test = densified_hmatrix_classic_C - densified_hmatrix_task_based_C; + // std::ofstream dense_classic_file("dense_classic.csv"); + // std::ofstream dense_task_file("dense_task.csv"); + // std::ofstream test_file("test.csv"); + // densified_hmatrix_classic_C.print(dense_classic_file, ","); + // densified_hmatrix_task_based_C.print(dense_task_file, ","); + // test.print(test_file, ","); + + // Compare the results + is_error = is_error || (std::isnan(normFrob(densified_hmatrix_classic_C - densified_hmatrix_task_based_C) / normFrob(densified_hmatrix_classic_C)) > nb_products * error_tol); + + // Print the results + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + + std::cout << " normFrob(densified_hmatrix_classic_C) = " << normFrob(densified_hmatrix_classic_C) << std::endl; + std::cout << " normFrob(densified_hmatrix_task_based_C) = " << normFrob(densified_hmatrix_task_based_C) << std::endl; + + std::cout << " normFrob(classic_C - task_based_C) / normFrob(classic_C) = " << normFrob(densified_hmatrix_classic_C - densified_hmatrix_task_based_C) / normFrob(densified_hmatrix_classic_C) << std::endl + << std::endl; + + std::cout << " classic_duration = " << classic_duration.count() << std::endl; + std::cout << " task_based_duration = " << task_based_duration.count() << std::endl; + + // check durations + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_hmatrix_hmatrix_product current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_hmatrix_hmatrix_product current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} diff --git a/tests/functional_tests/hmatrix/test_task_based_hmatrix_triangular_solve.hpp b/tests/functional_tests/hmatrix/test_task_based_hmatrix_triangular_solve.hpp new file mode 100644 index 00000000..0c4e74d0 --- /dev/null +++ b/tests/functional_tests/hmatrix/test_task_based_hmatrix_triangular_solve.hpp @@ -0,0 +1,235 @@ +#include // for pow, sqrt +#include // for size_t +#include // for norm2 +#include // for Cluster... +#include +#include // for Cluster... +#include // for copy_di... +#include // for print_d... +#include // for print_h... +#include // for view_block_tree... +#include // for Generat... +#include // for add_... +#include +// #include +#include // for task_bas... +#include // for task_bas... +// #include // for task_based_lu_factorization +#include // for task_based_triangular_hmatrix_hmatrix_solve +#include // for triangular_hmatrix_hmatrix_solve + +#include +#include // for enumerate_dependence, find_l0... +#include // for HMatrix... +#include // for Matrix +#include // for underly... +#include // for NbrToStr +#include +#include // for TestCaseSymmetricPro... +#include +#include // for create_... +#include // for test_pa... +#include // for operator<< +#include // for make_sh... +#include // for operator+ +#include // for vector + +using namespace std; +using namespace htool; + +template +bool test_task_based_hmatrix_triangular_solve(const TestCaseType &test_case, char side, char transa, char diag, double epsilon) { + + bool is_error = false; + double eta = 10; + htool::underlying_type error; + std::cout << "eta = " << eta << std::endl; + std::cout << "task_based_internal_triangular_hmatrix_hmatrix_solve tests...\n"; + + // Random input + T alpha(1); + generate_random_scalar(alpha); + + // HMatrix + HMatrixTreeBuilder> hmatrix_tree_builder_A(epsilon, eta, 'N', 'N'); + HMatrixTreeBuilder> hmatrix_tree_builder_X(epsilon, eta, 'N', 'N'); + + HMatrix> A = hmatrix_tree_builder_A.build(*test_case.operator_A, *test_case.root_cluster_A_output, *test_case.root_cluster_A_input, -1, -1, true, 64); + HMatrix> X = hmatrix_tree_builder_X.build(*test_case.operator_X, *test_case.root_cluster_X_output, *test_case.root_cluster_X_input, -1, -1, true, 64); + HMatrix> B(X), UB(X), LB(X); + HMatrix> hmatrix_test(B); + + if (diag == 'U') { // to avoid conditioning issues with the unit diagonal + std::vector diagonal(A.nb_cols()); + copy_diagonal(A, diagonal.data()); + auto max_abs = *std::max_element(diagonal.begin(), diagonal.end(), [](const T &a, const T &b) { + return std::abs(a) < std::abs(b); + }); + scale(T(1) / (T(10) * std::abs(max_abs)), A); + } + + // Triangular hmatrices + HMatrix> LA(A); + HMatrix> UA(A); + preorder_tree_traversal(LA, [&diag](HMatrix> &hmatrix) { + if (hmatrix.is_leaf() and hmatrix.get_target_cluster() == hmatrix.get_source_cluster()) { + Matrix &dense_data = *hmatrix.get_dense_data(); + for (int i = 0; i < dense_data.nb_rows(); i++) { + if (diag == 'U') { + dense_data(i, i) = 1; + } + for (int j = i + 1; j < dense_data.nb_cols(); j++) { + dense_data(i, j) = 0; + } + } + } else { + std::vector>>> filtered_children; + for (auto &child : hmatrix.get_children_with_ownership()) { + if (child->get_target_cluster().get_offset() >= child->get_source_cluster().get_offset()) { + filtered_children.push_back(std::move(child)); + } + } + if (filtered_children.size() > 0) { + hmatrix.delete_children(); + hmatrix.assign_children(filtered_children); + } + } + }); + + preorder_tree_traversal(UA, [&diag](HMatrix> &hmatrix) { + if (hmatrix.is_leaf() and hmatrix.get_target_cluster() == hmatrix.get_source_cluster()) { + Matrix &dense_data = *hmatrix.get_dense_data(); + for (int j = 0; j < dense_data.nb_cols(); j++) { + if (diag == 'U') { + dense_data(j, j) = 1; + } + for (int i = j + 1; i < dense_data.nb_rows(); i++) { + dense_data(i, j) = 0; + } + } + } else { + std::vector>>> filtered_children; + for (auto &child : hmatrix.get_children_with_ownership()) { + if (child->get_target_cluster().get_offset() <= child->get_source_cluster().get_offset()) { + filtered_children.push_back(std::move(child)); + } + } + if (filtered_children.size() > 0) { + hmatrix.delete_children(); + hmatrix.assign_children(filtered_children); + } + } + }); + + // Matrix + int ni_A = test_case.root_cluster_A_input->get_size(); + int no_A = test_case.root_cluster_A_output->get_size(); + int ni_X = test_case.root_cluster_X_input->get_size(); + int no_X = test_case.root_cluster_X_output->get_size(); + Matrix A_dense(no_A, ni_A), X_dense(no_X, ni_X), B_dense(X_dense), densified_hmatrix_test(B_dense), matrix_test; + test_case.operator_A->copy_submatrix(no_A, ni_A, test_case.root_cluster_A_output->get_offset(), test_case.root_cluster_A_input->get_offset(), A_dense.data()); + test_case.operator_X->copy_submatrix(no_X, ni_X, test_case.root_cluster_X_output->get_offset(), test_case.root_cluster_X_input->get_offset(), X_dense.data()); + + // Triangular matrices + if (side == 'L') { + internal_add_hmatrix_hmatrix_product(transa, 'N', T(1) / alpha, UA, X, T(0), UB); + internal_add_hmatrix_hmatrix_product(transa, 'N', T(1) / alpha, LA, X, T(0), LB); + } else { + internal_add_hmatrix_hmatrix_product('N', transa, T(1) / alpha, X, UA, T(0), UB); + internal_add_hmatrix_hmatrix_product('N', transa, T(1) / alpha, X, LA, T(0), LB); + } + + // save_leaves_with_rank(LA, "LA"); + // save_leaves_with_rank(LB, "LB"); + + // Tests + std::chrono::steady_clock::time_point start, end; + int max_nb_nodes = 32; + std::vector *> L0_LA = find_l0(LA, max_nb_nodes); + std::vector *> L0_UA = find_l0(UA, max_nb_nodes); + std::vector *> L0_test; + + //// internal_triangular_hmatrix_hmatrix_solve Lower + ////// Classic + hmatrix_test = LB; + start = std::chrono::steady_clock::now(); + internal_triangular_hmatrix_hmatrix_solve(side, 'L', transa, diag, alpha, LA, hmatrix_test); + end = std::chrono::steady_clock::now(); + std::chrono::duration classic_duration = end - start; + copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); + error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << ">> Lower case: " << endl; + cout << "> classic errors = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + ////// Task-based + hmatrix_test = LB; + L0_test = find_l0(hmatrix_test, max_nb_nodes); + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_internal_triangular_hmatrix_hmatrix_solve(side, 'L', transa, diag, alpha, LA, hmatrix_test, L0_LA, L0_test); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration task_based_duration = end - start; + copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); + error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based errors = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // internal_triangular_hmatrix_hmatrix_solve Upper + //// Classic + hmatrix_test = UB; + start = std::chrono::steady_clock::now(); + internal_triangular_hmatrix_hmatrix_solve(side, 'U', transa, diag, alpha, UA, hmatrix_test); + end = std::chrono::steady_clock::now(); + classic_duration = end - start; + copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); + error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << ">> Upper case: " << endl; + cout << "> classic error = " << error << endl; + cout << " classic_duration = " << classic_duration.count() << std::endl; + + //// Task-based + hmatrix_test = UB; + L0_test = find_l0(hmatrix_test, max_nb_nodes); + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_internal_triangular_hmatrix_hmatrix_solve(side, 'U', transa, diag, alpha, UA, hmatrix_test, L0_UA, L0_test); + } + end = std::chrono::steady_clock::now(); + task_based_duration = end - start; + copy_to_dense(hmatrix_test, densified_hmatrix_test.data()); + error = normFrob(X_dense - densified_hmatrix_test) / normFrob(X_dense); + is_error = is_error || !(error < epsilon); + cout << "> task_based errors = " << error << endl; + cout << " task_based_duration = " << task_based_duration.count() << endl; + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_hmatrix_triangular_solve current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_hmatrix_triangular_solve current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} // end of test_task_based_hmatrix_triangular_solve diff --git a/tests/functional_tests/hmatrix/test_task_based_hmatrix_vector_product.hpp b/tests/functional_tests/hmatrix/test_task_based_hmatrix_vector_product.hpp new file mode 100644 index 00000000..f06256af --- /dev/null +++ b/tests/functional_tests/hmatrix/test_task_based_hmatrix_vector_product.hpp @@ -0,0 +1,454 @@ +#include // for pow, sqrt +#include // for size_t +#include // for norm2 +#include // for Cluster... +#include +#include // for Cluster... +#include // for copy_di... +#include // for print_d... +#include // for print_h... +#include // for view_block_tree... +#include // for Generat... +// #include // for add_... +#include +// #include +// #include // for task_bas... +#include // for task_bas... +// #include // for task_based_lu_factorization +// #include // for task_based_triangular_hmatrix_hmatrix_solve +// #include // for triangular_hmatrix_hmatrix_solve + +#include +#include // for enumerate_dependence, find_l0... +#include // for HMatrix... +#include // for Matrix +#include // for underly... +#include // for NbrToStr +#include +#include // for TestCaseSymmetricPro... +#include +#include // for create_... +#include // for test_pa... +#include // for operator<< +#include // for make_sh... +#include // for operator+ +#include // for vector + +using namespace std; +using namespace htool; + +template +bool test_task_based_hmatrix_vector_product(const TestCaseType &test_case, char sym, char transa, htool::underlying_type epsilon, bool block_tree_consistency, char UPLO = 'L') { + // custom tests to run + bool all_tests = true; // run all tests. Has priority over specific tests flags + bool basic_tests = false; // tests left_hmatrix_ancestor_of_right_hmatrix, enumerate_dependences and find_l0 + bool compute_blocks_tests = false; + bool hmatrix_vector_product_tests = false; + bool assembly_and_vector_product_tests = false; // compares the assembly of a hmatrix + the product with a vector between conjoint and disjoint cases + + double eta = 10; + std::cout << "eta = " << eta << std::endl; + double error_tol = 1e-14; + bool is_error = false; + + // Get test case parameters + const Cluster> *root_cluster_A_output, *root_cluster_A_input; + root_cluster_A_output = test_case.root_cluster_A_output; + root_cluster_A_input = test_case.root_cluster_A_input; + int ni_A = test_case.ni_A; + int no_A = test_case.no_A; + int no_B = test_case.no_B; + int no_C = test_case.no_C; + + // Tree builder + std::unique_ptr>> hmatrix_tree_builder; + if (sym == 'N') { + hmatrix_tree_builder = std::make_unique>>(epsilon, eta, 'N', 'N'); + } else if (sym == 'S') { + hmatrix_tree_builder = std::make_unique>>(epsilon, eta, 'S', UPLO); + } + hmatrix_tree_builder->set_low_rank_generator(std::make_shared>(*test_case.operator_A)); + hmatrix_tree_builder->set_block_tree_consistency(block_tree_consistency); + + // swap input and output clusters if transa is not 'N' + auto *input_cluster = root_cluster_A_input; + auto *output_cluster = root_cluster_A_output; + if (transa != 'N') { + std::swap(input_cluster, output_cluster); + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Basic tests + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (basic_tests || all_tests) { + // build + HMatrix> root_hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, false, 64); + + // Hmatrix Visualization + save_leaves_with_rank(root_hmatrix, "root_hmatrix_facto"); + // print_hmatrix_information(root_hmatrix, std::cout); + + // Basic sub tree + HMatrix &child1 = *root_hmatrix.get_children()[0].get(); + HMatrix &child2 = *root_hmatrix.get_children()[1].get(); + HMatrix &child1_child1 = *root_hmatrix.get_children()[0].get()->get_children()[0].get(); + HMatrix &child1_child2 = *root_hmatrix.get_children()[0].get()->get_children()[1].get(); + + // Tests for left_hmatrix_ancestor_of_right_hmatrix + std::cout << "left_hmatrix_ancestor_of_right_hmatrix tests..."; + is_error = is_error || !(left_hmatrix_ancestor_of_right_hmatrix(root_hmatrix, child1)); + is_error = is_error || !(left_hmatrix_ancestor_of_right_hmatrix(root_hmatrix, child2)); + is_error = is_error || !(left_hmatrix_ancestor_of_right_hmatrix(root_hmatrix, child1_child1)); + is_error = is_error || !(left_hmatrix_ancestor_of_right_hmatrix(root_hmatrix, child1_child2)); + + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + // Tests for left_hmatrix_descendant_of_right_hmatrix + std::cout << "left_hmatrix_descendant_of_right_hmatrix tests..."; + is_error = is_error || !(left_hmatrix_descendant_of_right_hmatrix(child1, root_hmatrix)); + is_error = is_error || !(left_hmatrix_descendant_of_right_hmatrix(child2, root_hmatrix)); + is_error = is_error || !(left_hmatrix_descendant_of_right_hmatrix(child1_child1, root_hmatrix)); + is_error = is_error || !(left_hmatrix_descendant_of_right_hmatrix(child1_child2, root_hmatrix)); + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + std::cout << "----------------------------------" << std::endl; + + // Tests for enumerate_dependences + { + std::cout << "enumerate_dependences tests..."; + std::vector *> L0; + std::vector *> dependences; + + // Test case 1: hmatrix is in L0 + { + L0 = {&child1_child1, &child1_child2, &child2}; + dependences = enumerate_dependences(child2, L0); + + is_error = is_error || !(dependences.size() == 1); + is_error = is_error || !(dependences[0] == &child2); + } + + // Test case 2: hmatrix is above L0 + { + L0 = {&child1_child1, &child1_child2, &child2}; + dependences = enumerate_dependences(root_hmatrix, L0); + is_error = is_error || !(dependences.size() == 3); + is_error = is_error || !(dependences[0] == &child1_child1); + is_error = is_error || !(dependences[1] == &child1_child2); + is_error = is_error || !(dependences[2] == &child2); + } + + // Test case 3: hmatrix is below L0 + { + L0 = {&root_hmatrix}; + + dependences = enumerate_dependences(child1, L0); + is_error = is_error || !(dependences.size() == 1); + is_error = is_error || !(dependences[0] == &root_hmatrix); + + dependences = enumerate_dependences(child2, L0); + is_error = is_error || !(dependences.size() == 1); + is_error = is_error || !(dependences[0] == &root_hmatrix); + + dependences = enumerate_dependences(child1_child2, L0); + is_error = is_error || !(dependences.size() == 1); + is_error = is_error || !(dependences[0] == &root_hmatrix); + } + + // Test case 4 : L0 is empty + // L0.clear(); + // dependences = enumerate_dependences(root_hmatrix, L0); + // is_error = is_error || !(dependences.empty()); + + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + std::cout << "----------------------------------" << std::endl; + } // end of tests for enumerate_dependences + + // Tests for find_l0. Visual verification of the block tree in dotfile.dot + { + std::vector *> L0 = find_l0(root_hmatrix, 256); + std::ofstream dotfile("test_find_l0.dot"); + view_block_tree(root_hmatrix, L0, dotfile); + } + } // end of basic tests + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Tests for task_based_compute_blocks + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (compute_blocks_tests || all_tests) { + std::cout << "task_based_compute_blocks tests..."; + std::chrono::steady_clock::time_point start, end; + HMatrix task_based_hmatrix(*root_cluster_A_output, *root_cluster_A_input); + + // task based build + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); // build + } + end = std::chrono::steady_clock::now(); + std::chrono::duration task_based_duration = end - start; + + task_based_hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] = NbrToStr(hmatrix_tree_builder->get_false_positive()); // when task_based_compute_blocks is used in build, the number of false positives is not set in the hmatrix tree data because the tasks need to finish before we can know the number of false positive + + // classic build + start = std::chrono::steady_clock::now(); + auto hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, false, 64); // when compute_blocks is used in build, the number of false positives is set in the hmatrix tree data + end = std::chrono::steady_clock::now(); + std::chrono::duration classic_duration = end - start; + + // visu + // std::cout << "hmatrix:" << std::endl; + // print_hmatrix_information(hmatrix, std::cout); + // std::cout << "task_based_hmatrix:" << std::endl; + // print_hmatrix_information(task_based_hmatrix, std::cout); + save_leaves_with_rank(task_based_hmatrix, "TB_hmatrix_build"); + save_leaves_with_rank(hmatrix, "hmatrix_build"); + + // densification + Matrix densified_hmatrix(no_A, ni_A), densified_hmatrix_task_based(no_A, ni_A); + copy_to_dense(hmatrix, densified_hmatrix.data()); + copy_to_dense(task_based_hmatrix, densified_hmatrix_task_based.data()); + + // compare + + is_error = is_error || (std::isnan(normFrob(densified_hmatrix - densified_hmatrix_task_based) / normFrob(densified_hmatrix)) > error_tol); + is_error = is_error || (hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] < task_based_hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"]); + + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + + std::cout << " normFrob(densified_hmatrix - densified_hmatrix_task_based)/normFrob(densified_hmatrix) = " << normFrob(densified_hmatrix - densified_hmatrix_task_based) / normFrob(densified_hmatrix) << std::endl; + + std::cout << " hmatrix m_false_positive = " << hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] << std::endl; + std::cout << " task_based_hmatrix m_false_positive = " << task_based_hmatrix.get_hmatrix_tree_data()->m_information["Number_of_false_positive"] << std::endl + << std::endl; + std::cout << " classic_duration = " << classic_duration.count() << std::endl; + std::cout << " task_based_duration = " << task_based_duration.count() << std::endl + << std::endl; + + // check durations + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << " Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + + std::cout << "----------------------------------" << std::endl; + } // end of tests for task_based_compute_blocks + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Tests for task_based_internal_add_hmatrix_vector_product + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (hmatrix_vector_product_tests || all_tests) { + std::cout << "task_based_internal_add_hmatrix_vector_product tests..."; + std::chrono::steady_clock::time_point start, end; + + // build + auto hmatrix_classic = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, false, 64); + auto hmatrix_task_based = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); // not in parallel area because it is not the focus of the test, but should nevertheless be called with the flag true in order to build m_L0 needed later when : hmatrix_tree_builder->get_L0(); + + // visu + // std::cout << "hmatrix_classic:" << std::endl; + // print_hmatrix_information(hmatrix_classic, std::cout); + // std::cout << "hmatrix_task_based:" << std::endl; + // print_hmatrix_information(hmatrix_task_based, std::cout); + save_leaves_with_rank(hmatrix_task_based, "TB_hmatrix_prod"); + save_leaves_with_rank(hmatrix_classic, "hmatrix_prod"); + + // L0 definitions + std::vector *> L0 = hmatrix_tree_builder->get_L0(); + std::vector> *> in_L0 = find_l0(*input_cluster, 64); + std::vector> *> out_L0 = find_l0(*output_cluster, 64); + // std::ofstream dotfile("test_find_l0.dot"); + // view_block_tree(hmatrix_task_based, L0, dotfile); + // for (auto &L0_nodes : L0) { + // // if (L0_nodes->get_target_cluster().get_offset() < L0_nodes->get_source_cluster().get_offset()) { + + // std::cout << L0_nodes->get_target_cluster().get_offset() << " " << L0_nodes->get_target_cluster().get_size() << " "; + // std::cout << L0_nodes->get_source_cluster().get_offset() << " " << L0_nodes->get_source_cluster().get_size() << " " << L0_nodes->get_symmetry_for_leaves() << std ::endl; + // // } + // } + + // Create a vector for the input and output + std::vector in(no_B), out(no_C, 11), out_task(no_C, 11); + generate_random_vector(in); + generate_random_vector(out); + out_task = out; + T alpha = T(3); + T beta = T(2); + int nb_products = 50; + + // Perform the classic H-matrix vector product + start = std::chrono::steady_clock::now(); + for (int i = 0; i < nb_products; i++) { + openmp_internal_add_hmatrix_vector_product(transa, alpha, hmatrix_classic, in.data(), beta, out.data()); + } + end = std::chrono::steady_clock::now(); + std::chrono::duration classic_duration = end - start; + + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + for (int i = 0; i < nb_products; i++) { + // Perform the task-based internal add H-matrix vector product + task_based_internal_add_hmatrix_vector_product(transa, alpha, hmatrix_task_based, in.data(), beta, out_task.data(), L0, in_L0, out_L0); + // sequential_internal_add_hmatrix_vector_product(transa, alpha, hmatrix_task_based, in.data(), beta, out_task.data()); + } + } + end = std::chrono::steady_clock::now(); + + std::chrono::duration task_based_duration = end - start; + + // Compare the results + is_error = is_error || (std::isnan(norm2(out - out_task) / norm2(out)) > nb_products * error_tol); + if (is_error) { + std::cout << "ERROR" << std::endl; + } else { + std::cout << "SUCCESS" << std::endl; + } + + // std::cout << " \n out_final = \n"; + // for (int i = 0; i < 0 + 15; i++) { + // std::cout << out[i] << " "; + // } + // std::cout << std::endl; + + // std::cout << " \n out_task_final = \n"; + // for (int i = 0; i < 0 + 15; i++) { + // std::cout << out_task[i] << " "; + // } + // std::cout << std::endl; + + // std::cout << " argmax(out - out_task) = " << argmax(out - out_task) << std::endl; + // std::cout << " (out - out_task) = " << (out - out_task) << std::endl; + // std::cout << " max(out - out_task)/out(argmax(out - out_task)) = " << max(out - out_task) / std::abs(out[argmax(out - out_task)]) << std::endl; + std::cout << " norm2(out) = " << norm2(out) << std::endl; + + std::cout + << " norm2(out - out_task)/norm2(out) = " << norm2(out - out_task) / norm2(out) << std::endl + << std::endl; + + std::cout << " classic_duration = " << classic_duration.count() << std::endl; + std::cout << " task_based_duration = " << task_based_duration.count() << std::endl; + + // check durations + if (task_based_duration.count() > classic_duration.count()) { + std::cerr << "Careful: task_based_duration > classic_duration. Ratio TB/Classic = " << task_based_duration.count() / classic_duration.count() << std::endl; + } + std::cout << "----------------------------------" << std::endl; + } // end of tests for task_based_internal_add_hmatrix_vector_product + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Tests for hmatrix vector product following assembly with task_based_compute_blocks and task_based_internal_add_hmatrix_vector_product + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (assembly_and_vector_product_tests || all_tests) { + std::cout << "assembly + hmatrix vector products tests..."; + + int nb_products = 100; + T alpha = T(3); + T beta = T(2); + std::chrono::duration disjoint_case_duration, conjoint_case_duration; + std::chrono::steady_clock::time_point start, end; + HMatrix task_based_hmatrix(*root_cluster_A_output, *root_cluster_A_input); + + std::vector> *> in_L0 = find_l0(*input_cluster, 64); + std::vector> *> out_L0 = find_l0(*output_cluster, 64); + std::vector in(no_B), out_task_dis(no_C, 11), out_task_con(no_C, 11); + generate_random_vector(in); + generate_random_vector(out_task_dis); + out_task_con = out_task_dis; + + // Case 1 : task graphs are disjoint + { + // build + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); + } + + // N hmatrix vector products + std::vector *> L0 = hmatrix_tree_builder->get_L0(); + +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + for (int i = 0; i < nb_products; i++) { + task_based_internal_add_hmatrix_vector_product(transa, alpha, task_based_hmatrix, in.data(), beta, out_task_dis.data(), L0, in_L0, out_L0); + } + } + end = std::chrono::steady_clock::now(); + disjoint_case_duration = end - start; + std::cout << "\n disjoint_case_TB_duration = " << disjoint_case_duration.count() << std::endl; + } // end of case 1 + + // Case 2 : task graphs are not conjoint + { + // build + start = std::chrono::steady_clock::now(); +#if defined(_OPENMP) && !defined(HTOOL_WITH_PYTHON_INTERFACE) +# pragma omp parallel +# pragma omp single +#endif + { + task_based_hmatrix = hmatrix_tree_builder->build(*test_case.operator_A, *root_cluster_A_output, *root_cluster_A_input, -1, -1, true, 64); + + // N hmatrix vector products + std::vector *> L0 = hmatrix_tree_builder->get_L0(); + + for (int i = 0; i < nb_products; i++) { + task_based_internal_add_hmatrix_vector_product(transa, alpha, task_based_hmatrix, in.data(), beta, out_task_con.data(), L0, in_L0, out_L0); + } + } + end = std::chrono::steady_clock::now(); + conjoint_case_duration = end - start; + std::cout << " conjoint_case_TB_duration = " << conjoint_case_duration.count() << std::endl; + + is_error = is_error || (std::isnan(norm2(out_task_con - out_task_dis) / norm2(out_task_con)) > error_tol); + std::cout << " norm2(out_task_con - out_task_dis)/norm2(out_task_con) = " << norm2(out_task_con - out_task_dis) / norm2(out_task_con) << std::endl; + } // end of case 2 + + if (disjoint_case_duration.count() < conjoint_case_duration.count()) { + std::cerr << "Careful: conjoint_case_TB_duration > disjoint_case_TB_duration. Ratio conjoint/disjoint = " << conjoint_case_duration.count() / disjoint_case_duration.count() << std::endl; + } + + std::cout << "----------------------------------" << std::endl; + } + + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + // Print the results + // ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + if (is_error) { + std::cerr << "ERROR: test_task_based_hmatrix_vector_product current case failed." << std::endl; + } else { + std::cout << "SUCCESS: test_task_based_hmatrix_vector_product current case passed." << std::endl; + std::cout << "===============================================================\n" + << std::endl; + } + return is_error; +} diff --git a/tests/functional_tests/matrix/matrix_triangular_solve/test_matrix_triangular_solve_complex_double.cpp b/tests/functional_tests/matrix/matrix_triangular_solve/test_matrix_triangular_solve_complex_double.cpp index f8b49835..65d18881 100644 --- a/tests/functional_tests/matrix/matrix_triangular_solve/test_matrix_triangular_solve_complex_double.cpp +++ b/tests/functional_tests/matrix/matrix_triangular_solve/test_matrix_triangular_solve_complex_double.cpp @@ -14,9 +14,11 @@ int main(int, char *[]) { for (auto number_of_rhs : {1, 100}) { for (auto side : {'L', 'R'}) { for (auto operation : {'N', 'T', 'C'}) { - std::cout << number_of_rhs << " " << side << " " << operation << "\n"; - // Square matrix - is_error = is_error || test_matrix_triangular_solve>(number_of_rows, number_of_rhs, side, operation); + for (auto diag : {'N', 'U'}) { + std::cout << number_of_rhs << " " << side << " " << operation << " " << diag << "\n"; + // Square matrix + is_error = is_error || test_matrix_triangular_solve>(number_of_rows, number_of_rhs, side, operation, diag); + } } } } diff --git a/tests/functional_tests/matrix/matrix_triangular_solve/test_matrix_triangular_solve_double.cpp b/tests/functional_tests/matrix/matrix_triangular_solve/test_matrix_triangular_solve_double.cpp index ab1746ed..a791b599 100644 --- a/tests/functional_tests/matrix/matrix_triangular_solve/test_matrix_triangular_solve_double.cpp +++ b/tests/functional_tests/matrix/matrix_triangular_solve/test_matrix_triangular_solve_double.cpp @@ -11,9 +11,11 @@ int main(int, char *[]) { for (auto number_of_rhs : {1, 100}) { for (auto side : {'L', 'R'}) { for (auto operation : {'N', 'T'}) { - std::cout << number_of_rhs << " " << side << " " << operation << "\n"; - // Square matrix - is_error = is_error || test_matrix_triangular_solve(number_of_rows, number_of_rhs, side, operation); + for (auto diag : {'N', 'U'}) { + std::cout << number_of_rhs << " " << side << " " << operation << " " << diag << "\n"; + // Square matrix + is_error = is_error || test_matrix_triangular_solve(number_of_rows, number_of_rhs, side, operation, diag); + } } } } diff --git a/tests/functional_tests/matrix/test_matrix_product.hpp b/tests/functional_tests/matrix/test_matrix_product.hpp index 284b2935..7c2eaa06 100644 --- a/tests/functional_tests/matrix/test_matrix_product.hpp +++ b/tests/functional_tests/matrix/test_matrix_product.hpp @@ -159,6 +159,14 @@ bool test_matrix_symmetric_product(int n1, int n2, char side, char UPLO) { error = normFrob(matrix_result_w_matrix_sum - C) / normFrob(matrix_result_w_matrix_sum); is_error = is_error || !(error < 1e-14); cout << "> Errors on a symmetry matrix vector product: " << error << endl; + + if constexpr (!is_complex()) { + C = Y; + add_hermitian_matrix_vector_product(UPLO, alpha, A, B.data(), beta, C.data()); + error = normFrob(matrix_result_w_matrix_sum - C) / normFrob(matrix_result_w_matrix_sum); + is_error = is_error || !(error < 1e-14); + cout << "> Errors on a hermitian matrix vector product: " << error << endl; + } } C = Y; add_symmetric_matrix_matrix_product(side, UPLO, alpha, A, B, beta, C); @@ -172,6 +180,20 @@ bool test_matrix_symmetric_product(int n1, int n2, char side, char UPLO) { is_error = is_error || !(error < 1e-14); cout << "> Errors on a symmetric matrix matrix product with row major input: " << error << endl; + if constexpr (!is_complex()) { + C = Y; + add_hermitian_matrix_matrix_product(side, UPLO, alpha, A, B, beta, C); + error = normFrob(matrix_result_w_matrix_sum - C) / normFrob(matrix_result_w_matrix_sum); + is_error = is_error || !(error < 1e-14); + cout << "> Errors on a hermitian matrix matrix product: " << error << endl; + + C = Yt; + add_hermitian_matrix_matrix_product_row_major(side, UPLO, alpha, A, Bt.data(), beta, C.data(), n2); + error = normFrob(transposed_matrix_result_w_matrix_sum - C) / normFrob(transposed_matrix_result_w_matrix_sum); + is_error = is_error || !(error < 1e-14); + cout << "> Errors on a hermitian matrix matrix product with row major input: " << error << endl; + } + return is_error; } diff --git a/tests/functional_tests/matrix/test_matrix_triangular_solve.hpp b/tests/functional_tests/matrix/test_matrix_triangular_solve.hpp index 1bf9cffc..a732b93f 100644 --- a/tests/functional_tests/matrix/test_matrix_triangular_solve.hpp +++ b/tests/functional_tests/matrix/test_matrix_triangular_solve.hpp @@ -9,7 +9,7 @@ using namespace std; using namespace htool; template -bool test_matrix_triangular_solve(int n, int nrhs, char side, char transa) { +bool test_matrix_triangular_solve(int n, int nrhs, char side, char transa, char diag) { bool is_error = false; @@ -33,10 +33,20 @@ bool test_matrix_triangular_solve(int n, int nrhs, char side, char transa) { } A(i, i) = sum; } - + if (diag == 'U') { + T max = 0; + for (int i = 0; i < n; i++) { + max = std::max(std::abs(max), std::abs(A(i, i))); + } + scale(1. / max, A); + } // Triangular setup Matrix LA(A), UA(A), LB(B.nb_rows(), B.nb_cols()), permuted_LB(B.nb_rows(), B.nb_cols()), UB(B.nb_rows(), B.nb_cols()); for (int i = 0; i < A.nb_rows(); i++) { + if (diag == 'U') { + UA(i, i) = 1; + LA(i, i) = 1; + } for (int j = 0; j < A.nb_cols(); j++) { if (i > j) { UA(i, j) = 0; @@ -109,22 +119,28 @@ bool test_matrix_triangular_solve(int n, int nrhs, char side, char transa) { // triangular_matrix_matrix_solve test_factorization = LA; test_solve = LB; - triangular_matrix_matrix_solve(side, 'L', transa, 'N', alpha, test_factorization, test_solve); + triangular_matrix_matrix_solve(side, 'L', transa, diag, alpha, test_factorization, test_solve); + // test_solve.print(std::cout, ","); error = normFrob(result - test_solve) / normFrob(result); is_error = is_error || !(error < 1e-9); cout << "> Errors on lower triangular matrix matrix solve: " << error << endl; test_factorization = LA; test_factorization.get_pivots() = ipiv; - test_solve = permuted_LB; - triangular_matrix_matrix_solve(side, 'L', transa, 'N', alpha, test_factorization, test_solve); + // std::cout << ipiv.size() << "\n"; + // for (auto elt : ipiv) { + // std::cout << elt << " "; + // } + // std::cout << "\n"; + test_solve = permuted_LB; + triangular_matrix_matrix_solve(side, 'L', transa, diag, alpha, test_factorization, test_solve); error = normFrob(result - test_solve) / normFrob(result); is_error = is_error || !(error < 1e-9); cout << "> Errors on lower triangular matrix matrix solve with permutation: " << error << endl; test_factorization = UA; test_solve = UB; - triangular_matrix_matrix_solve(side, 'U', transa, 'N', alpha, test_factorization, test_solve); + triangular_matrix_matrix_solve(side, 'U', transa, diag, alpha, test_factorization, test_solve); error = normFrob(result - test_solve) / normFrob(result); is_error = is_error || !(error < 1e-9); cout << "> Errors on upper triangular matrix matrix solve: " << error << endl; diff --git a/tests/functional_tests/solvers/CMakeLists.txt b/tests/functional_tests/solvers/CMakeLists.txt index ac17472d..2e15efd3 100644 --- a/tests/functional_tests/solvers/CMakeLists.txt +++ b/tests/functional_tests/solvers/CMakeLists.txt @@ -9,24 +9,35 @@ FetchContent_Declare( GIT_TAG origin/main GIT_SHALLOW TRUE GIT_SUBMODULES_RECURSE FALSE - GIT_SUBMODULES "") + GIT_SUBMODULES "" SOURCE_SUBDIR AVOID_calling_ADD_SUBDIR_in_subseqeuent_MAKEAVILABLE) +# weird hack, see https://gitlab.kitware.com/cmake/cmake/-/issues/26220 and -FetchContent_GetProperties(data_test_repository) -if(NOT data_test_repository_POPULATED) - FetchContent_Populate(data_test_repository) -endif() +FetchContent_MakeAvailable(data_test_repository) set(Test_solver_ARGS ${data_test_repository_SOURCE_DIR}/data/) -add_executable(Test_solver test_solver.cpp) -target_link_libraries(Test_solver htool) -add_dependencies(build-tests-solvers Test_solver) +add_executable(Test_solver_double test_solver_double.cpp) +target_link_libraries(Test_solver_double htool) +add_dependencies(build-tests-solvers Test_solver_double) -add_test(NAME Test_solver_1 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver ${Test_solver_ARGS}) -set_tests_properties(Test_solver_1 PROPERTIES LABELS "mpi") -add_test(NAME Test_solver_2 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 2 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver ${Test_solver_ARGS}) -set_tests_properties(Test_solver_2 PROPERTIES LABELS "mpi") -add_test(NAME Test_solver_3 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 3 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver ${Test_solver_ARGS}) -set_tests_properties(Test_solver_3 PROPERTIES LABELS "mpi") -add_test(NAME Test_solver_4 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver ${Test_solver_ARGS}) -set_tests_properties(Test_solver_4 PROPERTIES LABELS "mpi") +add_executable(Test_solver_complex_double test_solver_complex_double.cpp) +target_link_libraries(Test_solver_complex_double htool) +add_dependencies(build-tests-solvers Test_solver_complex_double) + +add_test(NAME Test_solver_double_1 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver_double ${Test_solver_ARGS}) +set_tests_properties(Test_solver_double_1 PROPERTIES LABELS "mpi") +add_test(NAME Test_solver_double_2 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 2 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver_double ${Test_solver_ARGS}) +set_tests_properties(Test_solver_double_2 PROPERTIES LABELS "mpi") +add_test(NAME Test_solver_double_3 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 3 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver_double ${Test_solver_ARGS}) +set_tests_properties(Test_solver_double_3 PROPERTIES LABELS "mpi") +add_test(NAME Test_solver_double_4 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver_double ${Test_solver_ARGS}) +set_tests_properties(Test_solver_double_4 PROPERTIES LABELS "mpi") + +add_test(NAME Test_solver_complex_double_1 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 1 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver_complex_double ${Test_solver_ARGS}) +set_tests_properties(Test_solver_complex_double_1 PROPERTIES LABELS "mpi") +add_test(NAME Test_solver_complex_double_2 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 2 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver_complex_double ${Test_solver_ARGS}) +set_tests_properties(Test_solver_complex_double_2 PROPERTIES LABELS "mpi") +add_test(NAME Test_solver_complex_double_3 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 3 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver_complex_double ${Test_solver_ARGS}) +set_tests_properties(Test_solver_complex_double_3 PROPERTIES LABELS "mpi") +add_test(NAME Test_solver_complex_double_4 COMMAND ${MPIEXEC} ${MPIEXEC_NUMPROC_FLAG} 4 ${MPIEXEC_PREFLAGS} ${CMAKE_CURRENT_BINARY_DIR}/Test_solver_complex_double ${Test_solver_ARGS}) +set_tests_properties(Test_solver_complex_double_4 PROPERTIES LABELS "mpi") diff --git a/tests/functional_tests/solvers/test_solver_complex_double.cpp b/tests/functional_tests/solvers/test_solver_complex_double.cpp new file mode 100644 index 00000000..7fabd63d --- /dev/null +++ b/tests/functional_tests/solvers/test_solver_complex_double.cpp @@ -0,0 +1,51 @@ +#include "test_solver_ddm.hpp" // for test_so... +#include "test_solver_ddm_adding_overlap.hpp" // for test_so... +#include "test_solver_wo_overlap.hpp" // for test_so... +#include // for max, copy +#include // for complex +#include // for DDMSolv... +#include // for initial... +#include // for basic_o... +#include // for MPI_Fin... +#include // for operator<< +#include // for vector + +int main(int argc, char *argv[]) { + // Initialize the MPI environment + MPI_Init(&argc, &argv); + + // Input file + if (argc < 2) { // argc should be 5 or more for correct execution + // We print argv[0] assuming it is the program name + cout << "usage: " << argv[0] << " datapath\n"; // LCOV_EXCL_LINE + return 1; // LCOV_EXCL_LINE + } + string datapath = argv[1]; + + bool is_error = false; + std::string datapath_final = datapath + "/output_non_sym/"; + + for (auto nb_rhs : {1, 5}) { + for (auto data_symmetry : {'N'}) { + std::vector symmetries = {'N'}; + for (auto symmetry : symmetries) { + std::cout << nb_rhs << " " << data_symmetry << " " << symmetry << "\n"; + + is_error = is_error || test_solver_wo_overlap, double, DDMSolverWithDenseLocalSolver, double>>(argc, argv, nb_rhs, symmetry, symmetry == 'N' ? 'N' : 'L', datapath_final); + is_error = is_error || test_solver_ddm_adding_overlap, double, DDMSolverWithDenseLocalSolver>>(argc, argv, nb_rhs, data_symmetry, symmetry, symmetry == 'N' ? 'N' : 'L', datapath_final); + is_error = is_error || test_solver_ddm, double, DDMSolverWithDenseLocalSolver>>(argc, argv, nb_rhs, data_symmetry, symmetry, symmetry == 'N' ? 'N' : 'L', datapath_final); + + is_error = is_error || test_solver_wo_overlap, double, DDMSolverBuilder, double>>(argc, argv, nb_rhs, symmetry, 'N', datapath_final); + test_solver_ddm_adding_overlap, double, DDMSolverBuilder, double>>(argc, argv, nb_rhs, data_symmetry, symmetry, 'N', datapath_final); + is_error = is_error || test_solver_ddm, double, DDMSolverBuilder>>(argc, argv, nb_rhs, data_symmetry, symmetry, 'N', datapath_final); + } + } + } + + // Finalize the MPI environment. + MPI_Finalize(); + if (is_error) { + return 1; + } + return 0; +} diff --git a/tests/functional_tests/solvers/test_solver_ddm.hpp b/tests/functional_tests/solvers/test_solver_ddm.hpp index 2af540b5..63fb3325 100644 --- a/tests/functional_tests/solvers/test_solver_ddm.hpp +++ b/tests/functional_tests/solvers/test_solver_ddm.hpp @@ -23,227 +23,62 @@ using namespace std; using namespace htool; -template -int test_solver_ddm(int argc, char *argv[], int mu, char data_symmetry, char symmetric, char UPLO, std::string datapath) { - - // Get the number of processes - int size; - MPI_Comm_size(MPI_COMM_WORLD, &size); - - // Get the rank of the process - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - // - bool test = 0; - - // HPDDM verbosity - HPDDM::Option &opt = *HPDDM::Option::get(); - opt.parse(argc, argv, rank == 0); - double tol = opt.val("tol", 1e-6); - if (rank != 0) - opt.remove("verbosity"); - opt.parse("-hpddm_max_it 200"); - - // HTOOL - double epsilon = tol; - // epsilon = 1e-10; - double eta = 10; - - // Clustering - if (rank == 0) - std::cout << "Creating cluster tree" << std::endl; - Cluster target_cluster = read_cluster_tree(datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree_properties.csv", datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree.csv"); - - // Matrix - if (rank == 0) - std::cout << "Creating generators" << std::endl; - Matrix> A; - A.bytes_to_matrix(datapath + "/matrix.bin"); - GeneratorInUserNumberingFromMatrix> Generator(A); - int n = A.nb_rows(); - - // Right-hand side - if (rank == 0) - std::cout << "Building rhs" << std::endl; - Matrix> f_global(n, mu); - std::vector> temp(n); - bytes_to_vector(temp, datapath + "/rhs.bin"); - for (int i = 0; i < mu; i++) { - f_global.set_col(i, temp); - } - - // Hmatrix - if (rank == 0) - std::cout << "Creating HMatrix" << std::endl; - - DefaultApproximationBuilder, htool::underlying_type>> default_build(Generator, target_cluster, target_cluster, epsilon, eta, symmetric, UPLO, MPI_COMM_WORLD); - - DistributedOperator> &Operator = default_build.distributed_operator; - - // Global vectors - Matrix> - x_global(n, mu), x_ref(n, mu), test_global(n, mu); - bytes_to_vector(temp, datapath + "sol.bin"); - for (int i = 0; i < mu; i++) { - x_ref.set_col(i, temp); - } - - // Partition - std::vector cluster_to_ovr_subdomain; - std::vector ovr_subdomain_to_global; - std::vector neighbors; - std::vector> intersections; - bytes_to_vector(cluster_to_ovr_subdomain, datapath + "/cluster_to_ovr_subdomain_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - bytes_to_vector(ovr_subdomain_to_global, datapath + "/ovr_subdomain_to_global_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - bytes_to_vector(neighbors, datapath + "/neighbors_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - - intersections.resize(neighbors.size()); - for (int p = 0; p < neighbors.size(); p++) { - bytes_to_vector(intersections[p], datapath + "/intersections_" + NbrToStr(size) + "_" + NbrToStr(rank) + "_" + NbrToStr(p) + ".bin"); - } - - // Error - double error2; - - // Solve - if (rank == 0) - std::cout << "Creating HMatrix" << std::endl; - std::vector geometry(n); - bytes_to_vector(geometry, datapath + "/geometry.bin"); - solver_builder default_ddm_solver(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, Generator, 3, geometry.data(), epsilon, eta); - auto &ddm_with_overlap = default_ddm_solver.solver; - - // No precond with overlap - if (rank == 0) - std::cout << "No precond with overlap:" << std::endl; - - opt.parse("-hpddm_schwarz_method none"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; - - // DDM one level ASM with overlap - if (rank == 0) - std::cout << "ASM one level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method asm "); - Matrix> Ki; - if (data_symmetry == 'S' && size > 1) { - opt.remove("geneo_threshold"); - opt.parse("-hpddm_geneo_nu 2"); - Ki.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); - int local_size_with_overlap = Ki.nb_cols(); - auto geneo_coarse_space_dense_builder = GeneoCoarseSpaceDenseBuilder>::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, *default_ddm_solver.local_hmatrix, Ki, symmetric, UPLO, 2); - // geneo_coarse_space_dense_builder.set_geneo_nu(2); - GeneoCoarseOperatorBuilder> geneo_coarse_operator_builder(Operator); - ddm_with_overlap.build_coarse_space(geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); - } - - ddm_with_overlap.facto_one_level(); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; - - // DDM one level RAS with overlap - if (rank == 0) - std::cout << "RAS one level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method ras "); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } +template +class CustomSVD final : public VirtualLowRankGenerator { + const VirtualGenerator &m_A; - test = test || !(error2 < tol); - - x_global = 0; + private: + bool copy_low_rank_approximation_impl(int M, int N, const int *rows, const int *cols, LowRankMatrix &lrmat, int rank) const { + Matrix mat(M, N); + m_A.copy_submatrix(M, N, rows, cols, mat.data()); - // DDM two level ASM with overlap - if (data_symmetry == 'S' && size > 1) { - if (rank == 0) - std::cout << "ASM two level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method asm -hpddm_schwarz_coarse_correction additive"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } + //// SVD + std::vector> singular_values(std::min(M, N)); + Matrix u(M, M); + Matrix vt(N, N); - test = test || !(error2 < tol); + int truncated_rank = SVD_truncation(mat, lrmat.get_epsilon(), u, vt, singular_values); + truncated_rank = rank > 0 ? std::min(rank, std::min(M, N)) : truncated_rank; - x_global = 0; - - if (rank == 0) - std::cout << "RAS two level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - ddm_with_overlap.clean(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; + if (truncated_rank * (M + N) > (M * N)) { + return false; } - test = test || !(error2 < tol); - - x_global = 0; - - // DDM solver with threshold - if (rank == 0) - std::cout << "RAS two level with overlap and threshold:" << std::endl; - opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); - // DDM> ddm_with_overlap_threshold(Generator, &Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); - solver_builder default_ddm_solver_with_threshold(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, Generator, 3, geometry.data(), epsilon, eta); - auto &ddm_with_overlap_threshold = default_ddm_solver_with_threshold.solver; - Ki.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); - int local_size_with_overlap = Ki.nb_cols(); - auto geneo_coarse_space_dense_builder = GeneoCoarseSpaceDenseBuilder>::GeneoWithThreshold(local_size_wo_overlap, local_size_with_overlap, *default_ddm_solver.local_hmatrix, Ki, symmetric, UPLO, 100); - // geneo_coarse_space_dense_builder.set_geneo_threshold(100); - GeneoCoarseOperatorBuilder> geneo_coarse_operator_builder(Operator); - ddm_with_overlap_threshold.build_coarse_space(geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); - ddm_with_overlap_threshold.facto_one_level(); - ddm_with_overlap_threshold.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap_threshold.print_infos(); - ddm_with_overlap_threshold.clean(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; + if (truncated_rank > 0) { + auto &U = lrmat.get_U(); + auto &V = lrmat.get_V(); + U.resize(M, truncated_rank); + V.resize(truncated_rank, N); + for (int i = 0; i < M; i++) { + for (int j = 0; j < truncated_rank; j++) { + U(i, j) = u(i, j) * singular_values[j]; + } + } + for (int i = 0; i < truncated_rank; i++) { + for (int j = 0; j < N; j++) { + V(i, j) = vt(i, j); + } + } + return true; } + return false; + } - test = test || !(error2 < tol); + public: + CustomSVD(const VirtualGenerator &A) : m_A(A) {} - x_global = 0; + // C style + virtual bool copy_low_rank_approximation(int M, int N, const int *rows, const int *cols, LowRankMatrix &lrmat) const override { + return copy_low_rank_approximation_impl(M, N, rows, cols, lrmat, -1); } - return test; -} + virtual bool copy_low_rank_approximation(int M, int N, const int *rows, const int *cols, int rank, LowRankMatrix &lrmat) const override { + return copy_low_rank_approximation_impl(M, N, rows, cols, lrmat, rank); + } +}; -template <> -int test_solver_ddm>>(int argc, char *argv[], int mu, char data_symmetry, char symmetric, char UPLO, std::string datapath) { +template +int test_solver_ddm(int argc, char *argv[], int mu, char data_symmetry, char symmetric, char UPLO, std::string datapath) { // Get the number of processes int size; @@ -259,53 +94,78 @@ int test_solver_ddm>>(int argc, char *argv[], i // HPDDM verbosity HPDDM::Option &opt = *HPDDM::Option::get(); opt.parse(argc, argv, rank == 0); - double tol = opt.val("tol", 1e-6); + CoordinatePrecision tol = opt.val("tol", 1e-6); if (rank != 0) opt.remove("verbosity"); opt.parse("-hpddm_max_it 200"); // HTOOL - double epsilon = tol; + CoordinatePrecision epsilon = tol; // epsilon = 1e-10; - double eta = 10; + CoordinatePrecision eta = 10; // Clustering if (rank == 0) std::cout << "Creating cluster tree" << std::endl; - Cluster target_cluster = read_cluster_tree(datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree_properties.csv", datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree.csv"); + Cluster target_cluster = read_cluster_tree(datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree_properties.csv", datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree.csv"); // Matrix if (rank == 0) std::cout << "Creating generators" << std::endl; - Matrix> A; - A.bytes_to_matrix(datapath + "/matrix.bin"); - GeneratorInUserNumberingFromMatrix> Generator(A); + std::unique_ptr> generator; + Matrix A; + Matrix> A_original; + A_original.bytes_to_matrix(datapath + "/matrix.bin"); + if constexpr (htool::is_complex()) { + generator = std::make_unique>>(A_original); + A = A_original; + } else { + generator = std::make_unique>(A_original); + A.resize(A_original.nb_rows(), A_original.nb_cols()); + for (int i = 0; i < A_original.nb_rows(); i++) { + for (int j = 0; j < A_original.nb_cols(); j++) { + A(i, j) = A_original(i, j).real(); + } + } + } int n = A.nb_rows(); // Right-hand side if (rank == 0) std::cout << "Building rhs" << std::endl; - Matrix> f_global(n, mu); + Matrix f_global(n, mu); std::vector> temp(n); bytes_to_vector(temp, datapath + "/rhs.bin"); for (int i = 0; i < mu; i++) { - f_global.set_col(i, temp); + if constexpr (htool::is_complex()) { + f_global.set_col(i, temp); + } else { + for (int j = 0; j < f_global.nb_rows(); j++) { + f_global(j, i) = temp[j].real(); + } + } } // Hmatrix if (rank == 0) std::cout << "Creating HMatrix" << std::endl; + HMatrixTreeBuilder hmatrix_builder(epsilon, eta, symmetric, UPLO); + hmatrix_builder.set_low_rank_generator(std::make_shared>(*generator)); + DefaultApproximationBuilder> default_build(*generator, target_cluster, target_cluster, hmatrix_builder, MPI_COMM_WORLD); - DefaultApproximationBuilder, htool::underlying_type>> default_build(Generator, target_cluster, target_cluster, epsilon, eta, symmetric, UPLO, MPI_COMM_WORLD); - - DistributedOperator> &Operator = default_build.distributed_operator; + DistributedOperator &Operator = default_build.distributed_operator; // Global vectors - Matrix> - x_global(n, mu), x_ref(n, mu), test_global(n, mu); + Matrix x_global(n, mu), x_ref(n, mu), test_global(n, mu); bytes_to_vector(temp, datapath + "sol.bin"); for (int i = 0; i < mu; i++) { - x_ref.set_col(i, temp); + if constexpr (htool::is_complex()) { + x_ref.set_col(i, temp); + } else { + for (int j = 0; j < x_ref.nb_rows(); j++) { + x_ref(j, i) = temp[j].real(); + } + } } // Partition @@ -328,10 +188,18 @@ int test_solver_ddm>>(int argc, char *argv[], i // Solve if (rank == 0) std::cout << "Creating HMatrix" << std::endl; - std::vector geometry(n); + std::vector geometry(n); bytes_to_vector(geometry, datapath + "/geometry.bin"); - DDMSolverBuilder> default_ddm_solver(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, Generator, 3, geometry.data(), epsilon, eta); - auto &ddm_with_overlap = default_ddm_solver.solver; + std::unique_ptr default_ddm_solver_ptr; + if constexpr (std::is_same_v>) { + default_ddm_solver_ptr = std::make_unique(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, *generator, 3, geometry.data(), epsilon, eta); + } else { + default_ddm_solver_ptr = std::make_unique(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, *generator, 3, geometry.data(), ClusterTreeBuilder(), hmatrix_builder); + } + auto &default_ddm_solver = *default_ddm_solver_ptr; + auto &ddm_with_overlap = default_ddm_solver.solver; + + // DDMSolverBuilder> default_ddm_solver(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, Generator, 3, geometry.data(), ClusterTreeBuilder(), HMatrixTreeBuilder>(epsilon, eta, symmetric, UPLO)); // No precond with overlap if (rank == 0) @@ -355,17 +223,34 @@ int test_solver_ddm>>(int argc, char *argv[], i std::cout << "ASM one level with overlap:" << std::endl; MPI_Barrier(MPI_COMM_WORLD); opt.parse("-hpddm_schwarz_method asm "); - Matrix> Ki; - if (data_symmetry == 'S' && size > 1) { - opt.remove("geneo_threshold"); - opt.parse("-hpddm_geneo_nu 2"); - Ki.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); - int local_size_with_overlap = Ki.nb_cols(); - auto geneo_coarse_space_dense_builder = GeneoCoarseSpaceDenseBuilder>::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, *default_ddm_solver.local_hmatrix, Ki, symmetric, UPLO, 2); - // geneo_coarse_space_dense_builder.set_geneo_nu(2); - GeneoCoarseOperatorBuilder> geneo_coarse_operator_builder(Operator); - ddm_with_overlap.build_coarse_space(geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); + Matrix Ki; + if constexpr (!htool::is_complex()) { + if (data_symmetry == 'S' && size > 1) { + opt.remove("geneo_threshold"); + opt.parse("-hpddm_geneo_nu 2"); + Matrix> tmp; + tmp.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); + if constexpr (htool::is_complex()) { + Ki = tmp; + } else { + Ki.resize(tmp.nb_rows(), tmp.nb_cols()); + for (int i = 0; i < tmp.nb_rows(); i++) { + for (int j = 0; j < tmp.nb_cols(); j++) { + Ki(i, j) = tmp(i, j).real(); + } + } + } + int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); + int local_size_with_overlap = Ki.nb_cols(); + std::unique_ptr> geneo_coarse_space_dense_builder; + if constexpr (std::is_same_v>) { + geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, default_ddm_solver.block_diagonal_dense_matrix, Ki, symmetric, UPLO, 2)); + } else { + geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, *default_build.block_diagonal_hmatrix, Ki, symmetric, UPLO, 2)); + } + GeneoCoarseOperatorBuilder geneo_coarse_operator_builder(Operator); + ddm_with_overlap.build_coarse_space(*geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); + } } ddm_with_overlap.facto_one_level(); @@ -398,64 +283,89 @@ int test_solver_ddm>>(int argc, char *argv[], i x_global = 0; // DDM two level ASM with overlap - if (data_symmetry == 'S' && size > 1) { - if (rank == 0) - std::cout << "ASM two level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method asm -hpddm_schwarz_coarse_correction additive"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; - - if (rank == 0) - std::cout << "RAS two level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - ddm_with_overlap.clean(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; + if constexpr (!htool::is_complex()) { + if (data_symmetry == 'S' && size > 1) { + if (rank == 0) + std::cout << "ASM two level with overlap:" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + opt.parse("-hpddm_schwarz_method asm -hpddm_schwarz_coarse_correction additive"); + ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); + ddm_with_overlap.print_infos(); + error2 = normFrob(f_global - A * x_global) / normFrob(f_global); + if (rank == 0) { + cout << "error: " << error2 << endl; + } + + test = test || !(error2 < tol); + + x_global = 0; + + if (rank == 0) + std::cout << "RAS two level with overlap:" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); + ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); + ddm_with_overlap.print_infos(); + ddm_with_overlap.clean(); + error2 = normFrob(f_global - A * x_global) / normFrob(f_global); + if (rank == 0) { + cout << "error: " << error2 << endl; + } + + test = test || !(error2 < tol); + + x_global = 0; + + // DDM solver with threshold + if (rank == 0) + std::cout << "RAS two level with overlap and threshold:" << std::endl; + opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); + + std::unique_ptr default_ddm_solver_with_threshold_ptr; + if constexpr (std::is_same_v>) { + default_ddm_solver_with_threshold_ptr = std::make_unique(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, *generator, 3, geometry.data(), epsilon, eta); + } else { + default_ddm_solver_with_threshold_ptr = std::make_unique(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, *generator, 3, geometry.data(), ClusterTreeBuilder(), HMatrixTreeBuilder(epsilon, eta, symmetric, UPLO)); + } + auto &default_ddm_solver_with_threshold = *default_ddm_solver_with_threshold_ptr; + + auto &ddm_with_overlap_threshold = default_ddm_solver_with_threshold.solver; + Matrix> tmp; + tmp.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); + if constexpr (htool::is_complex()) { + Ki = tmp; + } else { + Ki.resize(tmp.nb_rows(), tmp.nb_cols()); + for (int i = 0; i < tmp.nb_rows(); i++) { + for (int j = 0; j < tmp.nb_cols(); j++) { + Ki(i, j) = tmp(i, j).real(); + } + } + } + int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); + int local_size_with_overlap = Ki.nb_cols(); + std::unique_ptr> geneo_coarse_space_dense_builder; + if constexpr (std::is_same_v>) { + geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithThreshold(local_size_wo_overlap, local_size_with_overlap, default_ddm_solver.block_diagonal_dense_matrix, Ki, symmetric, UPLO, 100)); + } else { + geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithThreshold(local_size_wo_overlap, local_size_with_overlap, *default_build.block_diagonal_hmatrix, Ki, symmetric, UPLO, 100)); + } + // geneo_coarse_space_dense_builder.set_geneo_threshold(100); + GeneoCoarseOperatorBuilder geneo_coarse_operator_builder(Operator); + ddm_with_overlap_threshold.build_coarse_space(*geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); + ddm_with_overlap_threshold.facto_one_level(); + ddm_with_overlap_threshold.solve(f_global.data(), x_global.data(), mu); + ddm_with_overlap_threshold.print_infos(); + ddm_with_overlap_threshold.clean(); + error2 = normFrob(f_global - A * x_global) / normFrob(f_global); + if (rank == 0) { + cout << "error: " << error2 << endl; + } + + test = test || !(error2 < tol); + + x_global = 0; } - - test = test || !(error2 < tol); - - x_global = 0; - - // DDM solver with threshold - if (rank == 0) - std::cout << "RAS two level with overlap and threshold:" << std::endl; - opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); - // DDM> ddm_with_overlap_threshold(Generator, &Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); - DDMSolverBuilder> default_ddm_solver_with_threshold(Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections, Generator, 3, geometry.data(), epsilon, eta); - auto &ddm_with_overlap_threshold = default_ddm_solver_with_threshold.solver; - Ki.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); - int local_size_with_overlap = Ki.nb_cols(); - auto geneo_coarse_space_dense_builder = GeneoCoarseSpaceDenseBuilder>::GeneoWithThreshold(local_size_wo_overlap, local_size_with_overlap, *default_ddm_solver.local_hmatrix, Ki, symmetric, UPLO, 100); - // geneo_coarse_space_dense_builder.set_geneo_threshold(100); - GeneoCoarseOperatorBuilder> geneo_coarse_operator_builder(Operator); - ddm_with_overlap_threshold.build_coarse_space(geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); - ddm_with_overlap_threshold.facto_one_level(); - ddm_with_overlap_threshold.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap_threshold.print_infos(); - ddm_with_overlap_threshold.clean(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; } return test; diff --git a/tests/functional_tests/solvers/test_solver_ddm_adding_overlap.hpp b/tests/functional_tests/solvers/test_solver_ddm_adding_overlap.hpp index 2dba05c5..db372e4e 100644 --- a/tests/functional_tests/solvers/test_solver_ddm_adding_overlap.hpp +++ b/tests/functional_tests/solvers/test_solver_ddm_adding_overlap.hpp @@ -25,8 +25,8 @@ using namespace std; using namespace htool; -template -int test_solver_ddm_adding_overlap(int argc, char *argv[], int mu, char data_symmetry, char symmetric, std::string datapath) { +template +int test_solver_ddm_adding_overlap(int argc, char *argv[], int mu, char data_symmetry, char symmetric, char UPLO, std::string datapath) { // Get the number of processes int size; @@ -42,275 +42,80 @@ int test_solver_ddm_adding_overlap(int argc, char *argv[], int mu, char data_sym // HPDDM verbosity HPDDM::Option &opt = *HPDDM::Option::get(); opt.parse(argc, argv, rank == 0); - double tol = opt.val("tol", 1e-6); + CoordinatePrecision tol = opt.val("tol", 1e-6); if (rank != 0) opt.remove("verbosity"); opt.parse("-hpddm_max_it 200"); // HTOOL - double epsilon = tol; - double eta = 10; - - char UPLO = 'N'; - if (symmetric != 'N') { - UPLO = 'L'; - } + CoordinatePrecision epsilon = tol; + CoordinatePrecision eta = 10; // Clustering if (rank == 0) std::cout << "Creating cluster tree" << std::endl; - Cluster target_cluster = read_cluster_tree(datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree_properties.csv", datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree.csv"); + Cluster target_cluster = read_cluster_tree(datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree_properties.csv", datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree.csv"); // Matrix if (rank == 0) std::cout << "Creating generators" << std::endl; - Matrix> A; - A.bytes_to_matrix(datapath + "/matrix.bin"); - GeneratorInUserNumberingFromMatrix> Generator(A); - int n = A.nb_rows(); - - // Right-hand side - if (rank == 0) - std::cout << "Building rhs" << std::endl; - Matrix> f_global(n, mu); - std::vector> temp(n); - bytes_to_vector(temp, datapath + "/rhs.bin"); - for (int i = 0; i < mu; i++) { - f_global.set_col(i, temp); - } - - // Hmatrix - if (rank == 0) - std::cout << "Creating HMatrix" << std::endl; - - DefaultApproximationBuilder> default_build(Generator, target_cluster, target_cluster, epsilon, eta, symmetric, UPLO, MPI_COMM_WORLD); - - DistributedOperator &Operator = default_build.distributed_operator; - HMatrix local_block_diagonal_hmatrix = *default_build.block_diagonal_hmatrix; - - // Global vectors - Matrix> - x_global(n, mu), x_ref(n, mu), test_global(n, mu); - bytes_to_vector(temp, datapath + "sol.bin"); - for (int i = 0; i < mu; i++) { - x_ref.set_col(i, temp); - } - - // Partition - std::vector cluster_to_ovr_subdomain; - std::vector ovr_subdomain_to_global; - std::vector neighbors; - std::vector> intersections; - bytes_to_vector(cluster_to_ovr_subdomain, datapath + "/cluster_to_ovr_subdomain_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - bytes_to_vector(ovr_subdomain_to_global, datapath + "/ovr_subdomain_to_global_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - bytes_to_vector(neighbors, datapath + "/neighbors_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - - intersections.resize(neighbors.size()); - for (int p = 0; p < neighbors.size(); p++) { - bytes_to_vector(intersections[p], datapath + "/intersections_" + NbrToStr(size) + "_" + NbrToStr(rank) + "_" + NbrToStr(p) + ".bin"); - } - - // Errors - double error2; - - // Solve - if (rank == 0) - std::cout << "Creating HMatrix" << std::endl; - solver_builder default_ddm_solver(Operator, local_block_diagonal_hmatrix, Generator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); - auto &ddm_with_overlap = default_ddm_solver.solver; - - // No precond with overlap - if (rank == 0) - std::cout << "No precond with overlap:" << std::endl; - - opt.parse("-hpddm_schwarz_method none"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; - - // DDM one level ASM with overlap - if (rank == 0) - std::cout << "ASM one level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method asm "); - Matrix> Ki; - if (data_symmetry == 'S' && size > 1) { - opt.remove("geneo_threshold"); - opt.parse("-hpddm_geneo_nu 2"); - Ki.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); - int local_size_with_overlap = Ki.nb_cols(); - auto geneo_coarse_space_dense_builder = GeneoCoarseSpaceDenseBuilder>::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, default_ddm_solver.block_diagonal_dense_matrix, Ki, symmetric, UPLO, 2); - // geneo_coarse_space_dense_builder.set_geneo_nu(4); - GeneoCoarseOperatorBuilder> geneo_coarse_operator_builder(Operator); - ddm_with_overlap.build_coarse_space(geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); - } - - ddm_with_overlap.facto_one_level(); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; - - // DDM one level RAS with overlap - if (rank == 0) - std::cout << "RAS one level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method ras "); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; - - // DDM two level ASM with overlap - if (data_symmetry == 'S' && size > 1) { - if (rank == 0) - std::cout << "ASM two level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method asm -hpddm_schwarz_coarse_correction additive"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; + std::unique_ptr> generator; + Matrix A; + Matrix> A_original; + A_original.bytes_to_matrix(datapath + "/matrix.bin"); + if constexpr (htool::is_complex()) { + generator = std::make_unique>>(A_original); + A = A_original; + } else { + generator = std::make_unique>(A_original); + A.resize(A_original.nb_rows(), A_original.nb_cols()); + for (int i = 0; i < A_original.nb_rows(); i++) { + for (int j = 0; j < A_original.nb_cols(); j++) { + A(i, j) = A_original(i, j).real(); + } } - - test = test || !(error2 < tol); - - x_global = 0; - - if (rank == 0) - std::cout << "RAS two level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - ddm_with_overlap.clean(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; - - // DDM solver with threshold - if (rank == 0) - std::cout << "RAS two level with overlap and threshold:" << std::endl; - opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive -hpddm_geneo_threshold 100"); - // DDM> ddm_with_overlap_threshold(Generator, &Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); - solver_builder default_ddm_solver_with_threshold(Operator, local_block_diagonal_hmatrix, Generator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); - auto &ddm_with_overlap_threshold = default_ddm_solver_with_threshold.solver; - - Ki.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); - int local_size_with_overlap = Ki.nb_cols(); - auto geneo_coarse_space_dense_builder = GeneoCoarseSpaceDenseBuilder>::GeneoWithThreshold(local_size_wo_overlap, local_size_with_overlap, default_ddm_solver.block_diagonal_dense_matrix, Ki, symmetric, UPLO, 100); - // geneo_coarse_space_dense_builder.set_geneo_threshold(100); - GeneoCoarseOperatorBuilder> geneo_coarse_operator_builder(Operator); - ddm_with_overlap_threshold.build_coarse_space(geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); - ddm_with_overlap_threshold.facto_one_level(); - ddm_with_overlap_threshold.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap_threshold.print_infos(); - ddm_with_overlap_threshold.clean(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; } - - return test; -} - -int test_solver_ddm_adding_overlap(int argc, char *argv[], int mu, char data_symmetry, char symmetric, char UPLO, std::string datapath) { - - // Get the number of processes - int size; - MPI_Comm_size(MPI_COMM_WORLD, &size); - - // Get the rank of the process - int rank; - MPI_Comm_rank(MPI_COMM_WORLD, &rank); - - // - bool test = 0; - - // HPDDM verbosity - HPDDM::Option &opt = *HPDDM::Option::get(); - opt.parse(argc, argv, rank == 0); - double tol = opt.val("tol", 1e-6); - if (rank != 0) - opt.remove("verbosity"); - opt.parse("-hpddm_max_it 200"); - - // HTOOL - double epsilon = tol; - double eta = 10; - - // Clustering - if (rank == 0) - std::cout << "Creating cluster tree" << std::endl; - Cluster target_cluster = read_cluster_tree(datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree_properties.csv", datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree.csv"); - - // Matrix - if (rank == 0) - std::cout << "Creating generators" << std::endl; - Matrix> A; - A.bytes_to_matrix(datapath + "/matrix.bin"); - GeneratorInUserNumberingFromMatrix> Generator(A); int n = A.nb_rows(); // Right-hand side if (rank == 0) std::cout << "Building rhs" << std::endl; - Matrix> f_global(n, mu); + Matrix f_global(n, mu); std::vector> temp(n); bytes_to_vector(temp, datapath + "/rhs.bin"); for (int i = 0; i < mu; i++) { - f_global.set_col(i, temp); + if constexpr (htool::is_complex()) { + f_global.set_col(i, temp); + } else { + for (int j = 0; j < f_global.nb_rows(); j++) { + f_global(j, i) = temp[j].real(); + } + } } // Hmatrix if (rank == 0) std::cout << "Creating HMatrix" << std::endl; - DefaultApproximationBuilder> default_build(Generator, target_cluster, target_cluster, epsilon, eta, symmetric, UPLO, MPI_COMM_WORLD); + DefaultApproximationBuilder> default_build(*generator, target_cluster, target_cluster, HMatrixTreeBuilder(epsilon, eta, symmetric, UPLO), MPI_COMM_WORLD); - DistributedOperator &Operator = default_build.distributed_operator; - HMatrix local_block_diagonal_hmatrix = *default_build.block_diagonal_hmatrix; + DistributedOperator &Operator = default_build.distributed_operator; + HMatrix local_block_diagonal_hmatrix = *default_build.block_diagonal_hmatrix; + HMatrix local_block_diagonal_hmatrix_bis = *default_build.block_diagonal_hmatrix; + HMatrix local_block_diagonal_hmatrix_ter = *default_build.block_diagonal_hmatrix; // Global vectors - Matrix> + Matrix x_global(n, mu), x_ref(n, mu), test_global(n, mu); bytes_to_vector(temp, datapath + "sol.bin"); for (int i = 0; i < mu; i++) { - x_ref.set_col(i, temp); + if constexpr (htool::is_complex()) { + x_ref.set_col(i, temp); + } else { + for (int j = 0; j < x_ref.nb_rows(); j++) { + x_ref(j, i) = temp[j].real(); + } + } } // Partition @@ -333,7 +138,7 @@ int test_solver_ddm_adding_overlap(int argc, char *argv[], int mu, char data_sym // Solve if (rank == 0) std::cout << "Creating HMatrix" << std::endl; - DDMSolverBuilder> default_ddm_solver(Operator, local_block_diagonal_hmatrix, Generator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); + solver_builder default_ddm_solver(Operator, local_block_diagonal_hmatrix, *generator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); auto &ddm_with_overlap = default_ddm_solver.solver; // No precond with overlap @@ -358,17 +163,35 @@ int test_solver_ddm_adding_overlap(int argc, char *argv[], int mu, char data_sym std::cout << "ASM one level with overlap:" << std::endl; MPI_Barrier(MPI_COMM_WORLD); opt.parse("-hpddm_schwarz_method asm "); - Matrix> Ki; - if (data_symmetry == 'S' && size > 1) { - opt.remove("geneo_threshold"); - opt.parse("-hpddm_geneo_nu 2"); - Ki.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); - int local_size_with_overlap = Ki.nb_cols(); - auto geneo_coarse_space_dense_builder = GeneoCoarseSpaceDenseBuilder>::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, *default_build.block_diagonal_hmatrix, Ki, symmetric, UPLO, 2); - // geneo_coarse_space_dense_builder.set_geneo_nu(4); - GeneoCoarseOperatorBuilder> geneo_coarse_operator_builder(Operator); - ddm_with_overlap.build_coarse_space(geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); + Matrix Ki; + if constexpr (!htool::is_complex()) { + if (data_symmetry == 'S' && size > 1) { + opt.remove("geneo_threshold"); + opt.parse("-hpddm_geneo_nu 2"); + Matrix> tmp; + tmp.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); + if constexpr (htool::is_complex()) { + Ki = tmp; + } else { + Ki.resize(tmp.nb_rows(), tmp.nb_cols()); + for (int i = 0; i < tmp.nb_rows(); i++) { + for (int j = 0; j < tmp.nb_cols(); j++) { + Ki(i, j) = tmp(i, j).real(); + } + } + } + int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); + int local_size_with_overlap = Ki.nb_cols(); + std::unique_ptr> geneo_coarse_space_dense_builder; + if constexpr (std::is_same_v>) { + geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, default_ddm_solver.block_diagonal_dense_matrix, Ki, symmetric, UPLO, 2)); + } else { + geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, *default_build.block_diagonal_hmatrix, Ki, symmetric, UPLO, 2)); + } + // geneo_coarse_space_dense_builder.set_geneo_nu(4); + GeneoCoarseOperatorBuilder geneo_coarse_operator_builder(Operator); + ddm_with_overlap.build_coarse_space(*geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); + } } ddm_with_overlap.facto_one_level(); @@ -401,66 +224,123 @@ int test_solver_ddm_adding_overlap(int argc, char *argv[], int mu, char data_sym x_global = 0; // DDM two level ASM with overlap - if (data_symmetry == 'S' && size > 1) { - if (rank == 0) - std::cout << "ASM two level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method asm -hpddm_schwarz_coarse_correction additive"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; - - if (rank == 0) - std::cout << "RAS two level with overlap:" << std::endl; - MPI_Barrier(MPI_COMM_WORLD); - opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); - ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap.print_infos(); - ddm_with_overlap.clean(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; + if constexpr (!htool::is_complex()) { + if (data_symmetry == 'S' && size > 1) { + if (rank == 0) + std::cout << "ASM two level with overlap:" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + opt.parse("-hpddm_schwarz_method asm -hpddm_schwarz_coarse_correction additive"); + ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); + ddm_with_overlap.print_infos(); + error2 = normFrob(f_global - A * x_global) / normFrob(f_global); + if (rank == 0) { + cout << "error: " << error2 << endl; + } + + test = test || !(error2 < tol); + + x_global = 0; + + if (rank == 0) + std::cout << "RAS two level with overlap:" << std::endl; + MPI_Barrier(MPI_COMM_WORLD); + opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); + ddm_with_overlap.solve(f_global.data(), x_global.data(), mu); + ddm_with_overlap.print_infos(); + ddm_with_overlap.clean(); + error2 = normFrob(f_global - A * x_global) / normFrob(f_global); + if (rank == 0) { + cout << "error: " << error2 << endl; + } + + test = test || !(error2 < tol); + + x_global = 0; + + // DDM solver with threshold + if (rank == 0) + std::cout << "RAS two level with overlap and threshold:" << std::endl; + opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive -hpddm_geneo_threshold 100"); + // DDM> ddm_with_overlap_threshold(*generator, &Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); + solver_builder default_ddm_solver_with_threshold(Operator, local_block_diagonal_hmatrix_bis, *generator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); + auto &ddm_with_overlap_threshold = default_ddm_solver_with_threshold.solver; + + Matrix> tmp; + tmp.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); + if constexpr (htool::is_complex()) { + Ki = tmp; + } else { + Ki.resize(tmp.nb_rows(), tmp.nb_cols()); + for (int i = 0; i < tmp.nb_rows(); i++) { + for (int j = 0; j < tmp.nb_cols(); j++) { + Ki(i, j) = tmp(i, j).real(); + } + } + } + int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); + int local_size_with_overlap = Ki.nb_cols(); + std::unique_ptr> geneo_coarse_space_dense_builder; + if constexpr (std::is_same_v>) { + geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithThreshold(local_size_wo_overlap, local_size_with_overlap, default_ddm_solver.block_diagonal_dense_matrix, Ki, symmetric, UPLO, 100)); + } else { + geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithThreshold(local_size_wo_overlap, local_size_with_overlap, *default_build.block_diagonal_hmatrix, Ki, symmetric, UPLO, 100)); + } + + GeneoCoarseOperatorBuilder geneo_coarse_operator_builder(Operator); + ddm_with_overlap_threshold.build_coarse_space(*geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); + ddm_with_overlap_threshold.facto_one_level(); + ddm_with_overlap_threshold.solve(f_global.data(), x_global.data(), mu); + ddm_with_overlap_threshold.print_infos(); + ddm_with_overlap_threshold.clean(); + error2 = normFrob(f_global - A * x_global) / normFrob(f_global); + if (rank == 0) { + cout << "error: " << error2 << endl; + } + + test = test || !(error2 < tol); + + x_global = 0; + + // DDM solver with non uniform coarse space + if (rank == 0) + std::cout << "RAS two level with overlap and threshold:" << std::endl; + opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive"); + // DDM> ddm_with_overlap_threshold(*generator, &Operator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); + solver_builder default_ddm_solver_non_uniform_coarse_space(Operator, local_block_diagonal_hmatrix_ter, *generator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); + auto &ddm_with_non_uniform_coarse_space = default_ddm_solver_non_uniform_coarse_space.solver; + + tmp.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); + if constexpr (htool::is_complex()) { + Ki = tmp; + } else { + Ki.resize(tmp.nb_rows(), tmp.nb_cols()); + for (int i = 0; i < tmp.nb_rows(); i++) { + for (int j = 0; j < tmp.nb_cols(); j++) { + Ki(i, j) = tmp(i, j).real(); + } + } + } + std::unique_ptr> non_uniform_geneo_coarse_space_dense_builder; + if constexpr (std::is_same_v>) { + non_uniform_geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, default_ddm_solver.block_diagonal_dense_matrix, Ki, symmetric, UPLO, rank == 0 ? 0 : 2)); + } else { + non_uniform_geneo_coarse_space_dense_builder = std::make_unique>(GeneoCoarseSpaceDenseBuilder::GeneoWithNu(local_size_wo_overlap, local_size_with_overlap, *default_build.block_diagonal_hmatrix, Ki, symmetric, UPLO, rank == 0 ? 0 : 2)); + } + + ddm_with_non_uniform_coarse_space.build_coarse_space(*non_uniform_geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); + ddm_with_non_uniform_coarse_space.facto_one_level(); + ddm_with_non_uniform_coarse_space.solve(f_global.data(), x_global.data(), mu); + ddm_with_non_uniform_coarse_space.print_infos(); + ddm_with_non_uniform_coarse_space.clean(); + error2 = normFrob(f_global - A * x_global) / normFrob(f_global); + if (rank == 0) { + cout << "error: " << error2 << endl; + } + + test = test || !(error2 < tol); + + x_global = 0; } - - test = test || !(error2 < tol); - - x_global = 0; - - // DDM solver with threshold - if (rank == 0) - std::cout << "RAS two level with overlap and threshold:" << std::endl; - opt.parse("-hpddm_schwarz_method ras -hpddm_schwarz_coarse_correction additive -hpddm_geneo_threshold 100"); - - auto local_block_diagonal_hmatrix_bis = *default_build.block_diagonal_hmatrix; - DDMSolverBuilder> default_ddm_solver_with_threshold(Operator, local_block_diagonal_hmatrix_bis, Generator, ovr_subdomain_to_global, cluster_to_ovr_subdomain, neighbors, intersections); - auto &ddm_with_overlap_threshold = default_ddm_solver_with_threshold.solver; - - Ki.bytes_to_matrix(datapath + "/Ki_" + NbrToStr(size) + "_" + NbrToStr(rank) + ".bin"); - int local_size_wo_overlap = Operator.get_target_partition().get_size_of_partition(rank); - int local_size_with_overlap = Ki.nb_cols(); - auto geneo_coarse_space_dense_builder = GeneoCoarseSpaceDenseBuilder>::GeneoWithThreshold(local_size_wo_overlap, local_size_with_overlap, *default_build.block_diagonal_hmatrix, Ki, symmetric, UPLO, 100); - // geneo_coarse_space_dense_builder.set_geneo_threshold(100); - GeneoCoarseOperatorBuilder> geneo_coarse_operator_builder(Operator); - ddm_with_overlap_threshold.build_coarse_space(geneo_coarse_space_dense_builder, geneo_coarse_operator_builder); - ddm_with_overlap_threshold.facto_one_level(); - ddm_with_overlap_threshold.solve(f_global.data(), x_global.data(), mu); - ddm_with_overlap_threshold.print_infos(); - ddm_with_overlap_threshold.clean(); - error2 = normFrob(f_global - A * x_global) / normFrob(f_global); - if (rank == 0) { - cout << "error: " << error2 << endl; - } - - test = test || !(error2 < tol); - - x_global = 0; } return test; diff --git a/tests/functional_tests/solvers/test_solver.cpp b/tests/functional_tests/solvers/test_solver_double.cpp similarity index 59% rename from tests/functional_tests/solvers/test_solver.cpp rename to tests/functional_tests/solvers/test_solver_double.cpp index 6508ee04..87fc4a45 100644 --- a/tests/functional_tests/solvers/test_solver.cpp +++ b/tests/functional_tests/solvers/test_solver_double.cpp @@ -22,20 +22,18 @@ int main(int argc, char *argv[]) { } string datapath = argv[1]; - bool is_error = false; + bool is_error = false; + std::string datapath_final = datapath + "/output_sym/"; for (auto nb_rhs : {1, 5}) { for (auto data_symmetry : {'N', 'S'}) { - std::string datapath_final = data_symmetry == 'S' ? datapath + "/output_sym/" : datapath + "/output_non_sym/"; - std::vector symmetries = {'N'}; - if (data_symmetry == 'S') { - symmetries.push_back('S'); - } + std::vector symmetries = {'N', 'S'}; for (auto symmetry : symmetries) { + std::cout << nb_rhs << " " << data_symmetry << " " << symmetry << "\n"; - is_error = is_error || test_solver_wo_overlap, double>>(argc, argv, nb_rhs, symmetry, symmetry == 'N' ? 'N' : 'L', datapath_final); - is_error = is_error || test_solver_ddm_adding_overlap>>(argc, argv, nb_rhs, data_symmetry, symmetry, datapath_final); - is_error = is_error || test_solver_ddm>>(argc, argv, nb_rhs, data_symmetry, symmetry, symmetry == 'N' ? 'N' : 'L', datapath_final); + is_error = is_error || test_solver_wo_overlap>(argc, argv, nb_rhs, symmetry, symmetry == 'N' ? 'N' : 'L', datapath_final); + is_error = is_error || test_solver_ddm_adding_overlap>(argc, argv, nb_rhs, data_symmetry, symmetry, symmetry == 'N' ? 'N' : 'L', datapath_final); + is_error = is_error || test_solver_ddm>(argc, argv, nb_rhs, data_symmetry, symmetry, symmetry == 'N' ? 'N' : 'L', datapath_final); std::vector storage; if (symmetry == 'N') { @@ -45,9 +43,9 @@ int main(int argc, char *argv[]) { storage.push_back('L'); } for (auto UPLO : storage) { - is_error = is_error || test_solver_wo_overlap, double>>(argc, argv, nb_rhs, symmetry, UPLO, datapath_final); - test_solver_ddm_adding_overlap(argc, argv, nb_rhs, data_symmetry, symmetry, UPLO, datapath_final); - is_error = is_error || test_solver_ddm>>(argc, argv, nb_rhs, data_symmetry, symmetry, UPLO, datapath_final); + is_error = is_error || test_solver_wo_overlap>(argc, argv, nb_rhs, symmetry, UPLO, datapath_final); + is_error = is_error || test_solver_ddm_adding_overlap>(argc, argv, nb_rhs, data_symmetry, symmetry, UPLO, datapath_final); + is_error = is_error || test_solver_ddm>(argc, argv, nb_rhs, data_symmetry, symmetry, UPLO, datapath_final); } } } diff --git a/tests/functional_tests/solvers/test_solver_wo_overlap.hpp b/tests/functional_tests/solvers/test_solver_wo_overlap.hpp index cb8fbd90..abd12bfb 100644 --- a/tests/functional_tests/solvers/test_solver_wo_overlap.hpp +++ b/tests/functional_tests/solvers/test_solver_wo_overlap.hpp @@ -25,7 +25,7 @@ class DistributedOperator; using namespace std; using namespace htool; -template +template int test_solver_wo_overlap(int argc, char *argv[], int mu, char symmetric, char UPLO, std::string datapath) { // Get the number of processes @@ -42,53 +42,77 @@ int test_solver_wo_overlap(int argc, char *argv[], int mu, char symmetric, char // HPDDM verbosity HPDDM::Option &opt = *HPDDM::Option::get(); opt.parse(argc, argv, rank == 0); - double tol = opt.val("tol", 1e-6); + CoordinatePrecision tol = opt.val("tol", 1e-6); if (rank != 0) opt.remove("verbosity"); opt.parse("-hpddm_max_it 200"); // HTOOL - double epsilon = tol; - double eta = 10; + CoordinatePrecision epsilon = tol; + CoordinatePrecision eta = 10; // Clustering if (rank == 0) std::cout << "Creating cluster tree" << std::endl; - Cluster target_cluster = read_cluster_tree(datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree_properties.csv", datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree.csv"); + Cluster target_cluster = read_cluster_tree(datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree_properties.csv", datapath + "/cluster_" + NbrToStr(size) + "_cluster_tree.csv"); // Matrix if (rank == 0) std::cout << "Creating generators" << std::endl; - Matrix> A; - A.bytes_to_matrix(datapath + "/matrix.bin"); - GeneratorInUserNumberingFromMatrix> Generator(A); + std::unique_ptr> generator; + Matrix A; + Matrix> A_original; + A_original.bytes_to_matrix(datapath + "/matrix.bin"); + if constexpr (htool::is_complex()) { + generator = std::make_unique>>(A_original); + A = A_original; + } else { + generator = std::make_unique>(A_original); + A.resize(A_original.nb_rows(), A_original.nb_cols()); + for (int i = 0; i < A_original.nb_rows(); i++) { + for (int j = 0; j < A_original.nb_cols(); j++) { + A(i, j) = A_original(i, j).real(); + } + } + } int n = A.nb_rows(); // Right-hand side if (rank == 0) std::cout << "Building rhs" << std::endl; - Matrix> f_global(n, mu); + Matrix f_global(n, mu); std::vector> temp(n); bytes_to_vector(temp, datapath + "/rhs.bin"); for (int i = 0; i < mu; i++) { - f_global.set_col(i, temp); + if constexpr (htool::is_complex()) { + f_global.set_col(i, temp); + } else { + for (int j = 0; j < f_global.nb_rows(); j++) { + f_global(j, i) = temp[j].real(); + } + } } // Hmatrix if (rank == 0) std::cout << "Creating HMatrix" << std::endl; - DefaultApproximationBuilder> default_build(Generator, target_cluster, target_cluster, epsilon, eta, symmetric, UPLO, MPI_COMM_WORLD); + DefaultApproximationBuilder> default_build(*generator, target_cluster, target_cluster, HMatrixTreeBuilder(epsilon, eta, symmetric, UPLO), MPI_COMM_WORLD); - DistributedOperator &Operator = default_build.distributed_operator; - HMatrix local_block_diagonal_hmatrix = *default_build.block_diagonal_hmatrix; + DistributedOperator &Operator = default_build.distributed_operator; + HMatrix local_block_diagonal_hmatrix = *default_build.block_diagonal_hmatrix; // Global vectors - Matrix> - x_global(n, mu), x_ref(n, mu), test_global(n, mu); + Matrix x_global(n, mu), x_ref(n, mu), test_global(n, mu); bytes_to_vector(temp, datapath + "sol.bin"); for (int i = 0; i < mu; i++) { - x_ref.set_col(i, temp); + if constexpr (htool::is_complex()) { + x_ref.set_col(i, temp); + } else { + for (int j = 0; j < x_ref.nb_rows(); j++) { + x_ref(j, i) = temp[j].real(); + } + } } // Partition