From 7a8b38249dbb582218f41c077c53c4d310e2341f Mon Sep 17 00:00:00 2001 From: rebcabin Date: Sun, 4 Feb 2024 20:21:02 -0800 Subject: [PATCH 01/16] create vector-backend branch --- .idea/.gitignore | 8 ++++++++ .idea/customTargets.xml | 12 ++++++++++++ .idea/lpython.iml | 2 ++ .idea/misc.xml | 4 ++++ .idea/modules.xml | 8 ++++++++ .idea/tools/External Tools.xml | 9 +++++++++ .idea/vcs.xml | 6 ++++++ 7 files changed, 49 insertions(+) create mode 100644 .idea/.gitignore create mode 100644 .idea/customTargets.xml create mode 100644 .idea/lpython.iml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/tools/External Tools.xml create mode 100644 .idea/vcs.xml diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000000..13566b81b0 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/customTargets.xml b/.idea/customTargets.xml new file mode 100644 index 0000000000..98fb12e6b5 --- /dev/null +++ b/.idea/customTargets.xml @@ -0,0 +1,12 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/lpython.iml b/.idea/lpython.iml new file mode 100644 index 0000000000..f08604bb65 --- /dev/null +++ b/.idea/lpython.iml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000000..79b3c94830 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000000..83d184b0d9 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/tools/External Tools.xml b/.idea/tools/External Tools.xml new file mode 100644 index 0000000000..8cdd0760c9 --- /dev/null +++ b/.idea/tools/External Tools.xml @@ -0,0 +1,9 @@ + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000000..35eb1ddfbb --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file From 1ebba8bf7062e420b7cb1b087efa94decf927557 Mon Sep 17 00:00:00 2001 From: rebcabin Date: Sun, 4 Feb 2024 21:28:22 -0800 Subject: [PATCH 02/16] beginning matmul --- tests/matmul.py | 95 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 tests/matmul.py diff --git a/tests/matmul.py b/tests/matmul.py new file mode 100644 index 0000000000..e2bfd56dd6 --- /dev/null +++ b/tests/matmul.py @@ -0,0 +1,95 @@ +import numpy +from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64) + +# https://numpy.org/devdocs/reference/typing.html +# from numpy.typing import NDArray + + +# plan for 30 Jan 2024 -- +# step 0: comment out this code and ./build_baryon.sh to run on APU +# emulator; or ./run_full_emulation.sh to run in CPython. +# step 1: side-by-side numpy implementation in full-emulation +# - get there line-by-line +# = focus on gvml_add_u16 first + + +def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, + A: CPtr, B: CPtr, C: CPtr) -> \ + None: # NDArray[numpy.int16]: + VR_SIZE: i32 = 32_768 + + # In the primary example, n = 15, m = 3, l = 32_768, + # M1 = 1, M2 = 5 + + # source GSI L4 arrays + pA_nm: Pointer[i16[:]] = c_p_pointer(A, i16[:], array([n * m])) + pB_ml: Pointer[i16[:]] = c_p_pointer(B, i16[:], array([m * l])) + + ######## ALL THE LINES WITH EIGHT COMMENT MARKS ARE THE ONES WE NEED TO + ######## BRING UP! AS IT STANDS, THIS CODE WORKS IN LPYTHON MAIN AS OF 4 + ######## FEBRUARY 2024. + + # source numpy arrays + ######## A_nm: NDArray[numpy.int16] = numpy.zeros((n, m), dtype=numpy.int16) + ######## for row in range(n): + ######## A_nm[row,:] = pA_nm[(row * m):((row + 1) * m)] + + ######## B_ml: NDArray[numpy.int16] = numpy.zeros((m, l), dtype=numpy.int16) + ######## for row in range(m): + ######## B_ml[row,:] = pB_ml[(row * l):((row + 1) * l)] + + # # destination numpy array + ######## C_nl: NDArray[numpy.int16] = numpy.zeros((n, l), dtype=numpy.int16) + + # destination GSI L4 array + pC_nl: Pointer[i16[:]] = c_p_pointer(C, i16[:], array([n * l])) + + # First, accumulate outer product without blocking. This is + # the code we would -ultimately- like to compile. Notice that + # all GSI-specific L1, L4, MMB are hidden. + + k: i32 + ######## for k in range(0, m): + ######## C_nl += numpy.outer(A_nm[:,k], B_ml[k,:]) + ######## pass + + # expect + # [[ 5 8 11 ... 20 23 26], + # [ 8 14 20 ... 38 44 50], + # [11 20 29 ... 56 65 74], ... + # + # [ 8 14 20 ... 38 44 50], + # [11 20 29 ... 56 65 74], + # [14 26 38 ... 74 86 98]] + set_breakpoint_here_and_inspect_C_nl : i32 = 0 + + # Second, with explicit blocking. This is a stepping-stone + # for our back-end. Notice that L1 and MMB are hidden. + + # T_1l: NDArray[numpy.int16] = numpy.zeros((1, l), dtype=numpy.int16) + # B_1l: NDArray[numpy.int16] = numpy.zeros((1, l), dtype=numpy.int16) + A_ik: i16 + jj: i32 + ii: i32 + i: i32 + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # zero-out rows of C + ######## C_nl[i + ii, :] = 0 + pass + for k in range(0, m): # rows of B + # B_1l[0, :] = B_ml[k, :] + for i in range(0, M2): + ######## A_ik = A_nm[i + ii, k] + # broadcast a single element of A + # T_1l[0, :] = A_ik + # pointwise (Hadamard) product: + # T_1l[0, :] = np.multiply(B_1l[0, :], T_1l[0, :]) + # C_nl[i + ii, :] += T_1l[0, :] + # optimization without the temporaries + ######## C_nl[i + ii, :] += B_ml[k, :] * A_ik + pass + + set_breakpoint_here_and_inspect_C_nl = 0 + + ######## return C_nl From 6d39319a1192cb18a47feaef2517392dbe126d1b Mon Sep 17 00:00:00 2001 From: rebcabin Date: Sun, 4 Feb 2024 21:30:41 -0800 Subject: [PATCH 03/16] better comments --- tests/matmul.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/matmul.py b/tests/matmul.py index e2bfd56dd6..6f14983158 100644 --- a/tests/matmul.py +++ b/tests/matmul.py @@ -1,8 +1,12 @@ import numpy from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64) +######## ALL THE LINES WITH EIGHT COMMENT MARKS ARE THE ONES WE NEED TO +######## BRING UP! AS IT STANDS, THIS CODE WORKS IN LPYTHON MAIN AS OF 4 +######## FEBRUARY 2024. + # https://numpy.org/devdocs/reference/typing.html -# from numpy.typing import NDArray +######## from numpy.typing import NDArray # plan for 30 Jan 2024 -- @@ -15,7 +19,7 @@ def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, A: CPtr, B: CPtr, C: CPtr) -> \ - None: # NDArray[numpy.int16]: + None: ######## NDArray[numpy.int16]: VR_SIZE: i32 = 32_768 # In the primary example, n = 15, m = 3, l = 32_768, @@ -25,10 +29,6 @@ def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, pA_nm: Pointer[i16[:]] = c_p_pointer(A, i16[:], array([n * m])) pB_ml: Pointer[i16[:]] = c_p_pointer(B, i16[:], array([m * l])) - ######## ALL THE LINES WITH EIGHT COMMENT MARKS ARE THE ONES WE NEED TO - ######## BRING UP! AS IT STANDS, THIS CODE WORKS IN LPYTHON MAIN AS OF 4 - ######## FEBRUARY 2024. - # source numpy arrays ######## A_nm: NDArray[numpy.int16] = numpy.zeros((n, m), dtype=numpy.int16) ######## for row in range(n): From 394a8aa79e6f0db8ce0286a50fcd640fdf39c2d3 Mon Sep 17 00:00:00 2001 From: rebcabin Date: Mon, 5 Feb 2024 10:58:04 -0800 Subject: [PATCH 04/16] issues 2479 and 2480 --- ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.py | 117 ++++++++++++++++++ ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.txt | 55 ++++++++ .../UNHANDLED-EXCEPTIONS/Issue2480.py | 21 ++++ ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.txt | 33 +++++ integration_tests/matmul_apu_backend.py | 44 +++++++ integration_tests/matmul_integration.py | 117 ++++++++++++++++++ 6 files changed, 387 insertions(+) create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.py create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.txt rename tests/matmul.py => ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.py (89%) create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.txt create mode 100644 integration_tests/matmul_apu_backend.py create mode 100644 integration_tests/matmul_integration.py diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.py b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.py new file mode 100644 index 0000000000..917166f0bd --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.py @@ -0,0 +1,117 @@ +import numpy +from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64) + +######## ALL THE LINES WITH EIGHT COMMENT MARKS ARE THE ONES WE NEED TO +######## BRING UP! AS IT STANDS, THIS CODE WORKS IN LPYTHON MAIN AS OF 4 +######## FEBRUARY 2024. + +# https://numpy.org/devdocs/reference/typing.html +######## from numpy.typing import NDArray + + +# plan for 30 Jan 2024 -- +# step 0: comment out this code and ./build_baryon.sh to run on APU +# emulator; or ./run_full_emulation.sh to run in CPython. +# step 1: side-by-side numpy implementation in full-emulation +# - get there line-by-line +# = focus on gvml_add_u16 first + + +def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, + A: CPtr, B: CPtr, C: CPtr) -> \ + None: ######## NDArray[numpy.int16]: + VR_SIZE: i32 = 32_768 + + # In the primary example, n = 15, m = 3, l = 32_768, + # M1 = 1, M2 = 5 + + # source GSI L4 arrays + pA_nm: Pointer[i16[:]] = c_p_pointer(A, i16[:], array([n * m])) + pB_ml: Pointer[i16[:]] = c_p_pointer(B, i16[:], array([m * l])) + + # source numpy arrays + ######## A_nm: NDArray[numpy.int16] = numpy.zeros((n, m), dtype=numpy.int16) + ######## for row in range(n): + ######## A_nm[row,:] = pA_nm[(row * m):((row + 1) * m)] + A_nm: Array[i16, n, m] + row : i32 + for row in range(n): + col : i32 + for col in range(m): + A_nm[row, col] = pA_nm[(row * m):((row * m) + col)] + + ######## B_ml: NDArray[numpy.int16] = numpy.zeros((m, l), dtype=numpy.int16) + ######## for row in range(m): + ######## B_ml[row,:] = pB_ml[(row * l):((row + 1) * l)] + + # # destination numpy array + ######## C_nl: NDArray[numpy.int16] = numpy.zeros((n, l), dtype=numpy.int16) + + # destination GSI L4 array + pC_nl: Pointer[i16[:]] = c_p_pointer(C, i16[:], array([n * l])) + + # First, accumulate outer product without blocking. This is + # the code we would -ultimately- like to compile. Notice that + # all GSI-specific L1, L4, MMB are hidden. + + k: i32 + ######## for k in range(0, m): + ######## C_nl += numpy.outer(A_nm[:,k], B_ml[k,:]) + ######## pass + + # expect + # [[ 5 8 11 ... 20 23 26], + # [ 8 14 20 ... 38 44 50], + # [11 20 29 ... 56 65 74], ... + # + # [ 8 14 20 ... 38 44 50], + # [11 20 29 ... 56 65 74], + # [14 26 38 ... 74 86 98]] + set_breakpoint_here_and_inspect_C_nl : i32 = 0 + + # Second, with explicit blocking. This is a stepping-stone + # for our back-end. Notice that L1 and MMB are hidden. + + # T_1l: NDArray[numpy.int16] = numpy.zeros((1, l), dtype=numpy.int16) + # B_1l: NDArray[numpy.int16] = numpy.zeros((1, l), dtype=numpy.int16) + A_ik: i16 + jj: i32 + ii: i32 + i: i32 + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # zero-out rows of C + ######## C_nl[i + ii, :] = 0 + pass + for k in range(0, m): # rows of B + # B_1l[0, :] = B_ml[k, :] + for i in range(0, M2): + ######## A_ik = A_nm[i + ii, k] + # broadcast a single element of A + # T_1l[0, :] = A_ik + # pointwise (Hadamard) product: + # T_1l[0, :] = np.multiply(B_1l[0, :], T_1l[0, :]) + # C_nl[i + ii, :] += T_1l[0, :] + # optimization without the temporaries + ######## C_nl[i + ii, :] += B_ml[k, :] * A_ik + pass + + set_breakpoint_here_and_inspect_C_nl = 0 + + ######## return C_nl + +def main(): + n : i32 = 15 + m : i32 = 3 + l : i32 = 32_768 + M1 : i32 = 1 + M2 : i32 = 5 + A_l4 : CPtr + B_l4 : CPtr + C_l4 : CPtr + numpy_side_by_side(n, m, l, M1, M2, A_l4, B_l4, C_l4) + print ("hello, world!") + + +if __name__ == "__main__": + main() diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.txt b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.txt new file mode 100644 index 0000000000..85662064e2 --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.txt @@ -0,0 +1,55 @@ +└─(10:46:54 on vector-backend ✖ ✭)──> lpython ../ISSUES/UNHANDLED-EXCEPTIONS/Issue2479.py 1 ↵ ──(Mon,Feb05)─┘ +Internal Compiler Error: Unhandled exception +Traceback (most recent call last): + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/bin/lpython.cpp", line 1872 + err = compile_python_to_object_file(arg_file, tmp_o, runtime_library_dir, + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/bin/lpython.cpp", line 824 + res = fe.get_llvm3(*asr, pass_manager, diagnostics, infile); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/lpython/python_evaluator.cpp", line 71 + run_fn, infile); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 9282 + v.visit_asr((ASR::asr_t&)asr); + File "../libasr/asr.h", line 5057 + File "../libasr/asr.h", line 5033 + File "../libasr/asr.h", line 5058 + File "../libasr/asr.h", line 4766 + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 932 + ASR::symbol_t *mod = x.m_symtab->get_symbol(item); + File "../libasr/asr.h", line 5060 + File "../libasr/asr.h", line 4774 + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 2976 + finish_module_init_function_prototype(x); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 3927 + ASR::Function_t *s = ASR::down_cast(item.second); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 3683 + visit_procedures(x); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 3880 + this->visit_stmt(*x.m_body[i]); + File "../libasr/asr.h", line 5077 + File "../libasr/asr.h", line 4827 + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 5603 + create_loop(x.m_name, [=]() { + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 321 + start_new_block(loopbody); { + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 5609 + this->visit_stmt(*x.m_body[i]); + File "../libasr/asr.h", line 5077 + File "../libasr/asr.h", line 4834 + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 5063 + this->visit_stmt(*(block->m_body[i])); + File "../libasr/asr.h", line 5077 + File "../libasr/asr.h", line 4827 + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 5603 + create_loop(x.m_name, [=]() { + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 321 + start_new_block(loopbody); { + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 5609 + this->visit_stmt(*x.m_body[i]); + File "../libasr/asr.h", line 5077 + File "../libasr/asr.h", line 4800 + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 4283 + handle_array_section_association_to_pointer(x); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 4247 + LCOMPILERS_ASSERT(target_rank > 0); +AssertFailed: target_rank > 0 +(lp) ┌─(~/Documents/GitHub/lpython/integration_tests)───────────────────────────────────────(brian@MacBook-Pro:s001)─┐ diff --git a/tests/matmul.py b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.py similarity index 89% rename from tests/matmul.py rename to ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.py index 6f14983158..a26e5c230e 100644 --- a/tests/matmul.py +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.py @@ -33,6 +33,11 @@ def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, ######## A_nm: NDArray[numpy.int16] = numpy.zeros((n, m), dtype=numpy.int16) ######## for row in range(n): ######## A_nm[row,:] = pA_nm[(row * m):((row + 1) * m)] + A_nm: Array[i16, n, m] + row : i32 + for row in range(n): + A_nm[row,:] = pA_nm[(row * m):((row + 1) * m)] + ######## B_ml: NDArray[numpy.int16] = numpy.zeros((m, l), dtype=numpy.int16) ######## for row in range(m): @@ -93,3 +98,19 @@ def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, set_breakpoint_here_and_inspect_C_nl = 0 ######## return C_nl + +def main(): + n : i32 = 15 + m : i32 = 3 + l : i32 = 32_768 + M1 : i32 = 1 + M2 : i32 = 5 + A_l4 : CPtr + B_l4 : CPtr + C_l4 : CPtr + numpy_side_by_side(n, m, l, M1, M2, A_l4, B_l4, C_l4) + print ("hello, world!") + + +if __name__ == "__main__": + main() diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.txt b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.txt new file mode 100644 index 0000000000..d9604ba789 --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.txt @@ -0,0 +1,33 @@ +└─(10:50:28 on vector-backend ✖ ✭)──> lpython ../ISSUES/UNHANDLED-EXCEPTIONS/Issue2480.py 1 ↵ ──(Mon,Feb05)─┘ +Traceback (most recent call last): + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/bin/lpython.cpp", line 1872 + err = compile_python_to_object_file(arg_file, tmp_o, runtime_library_dir, + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/bin/lpython.cpp", line 824 + res = fe.get_llvm3(*asr, pass_manager, diagnostics, infile); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/lpython/python_evaluator.cpp", line 71 + run_fn, infile); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/codegen/asr_to_llvm.cpp", line 9276 + pass_manager.apply_passes(al, &asr, co.po, diagnostics); + File "/Users/brian/Documents/GitHub/lpython/src/libasr/pass/pass_manager.h", line 299 + apply_passes(al, asr, _passes, pass_options, diagnostics); + File "/Users/brian/Documents/GitHub/lpython/src/libasr/pass/pass_manager.h", line 160 + _passes_db[passes[i]](al, *asr, pass_options); + File "/Users/brian/Dropbox/Mac/Documents/GitHub/lpython/src/libasr/pass/array_op.cpp", line 1910 + u.visit_TranslationUnit(unit); + File "../libasr/asr.h", line 5277 + File "../libasr/asr.h", line 5060 + File "../libasr/asr.h", line 4774 + File "../libasr/pass/pass_utils.h", line 317 + File "../libasr/asr.h", line 5290 + File "../libasr/asr.h", line 5060 + File "../libasr/asr.h", line 4775 + File "../libasr/pass/pass_utils.h", line 298 + File "../libasr/asr.h", line 5303 + File "../libasr/asr.h", line 5077 + File "../libasr/asr.h", line 4805 + File "../libasr/asr.h", line 5441 + File "../libasr/asr.h", line 5077 + File "../libasr/asr.h", line 5077 + Binary file "/usr/lib/system/libsystem_platform.dylib", local address: 0x18046da23 +Segfault: Signal SIGSEGV (segmentation fault) received +(lp) ┌─(~/Documents/GitHub/lpython/integration_tests)───────────────────────────────────────(brian@MacBook-Pro:s001)─┐ diff --git a/integration_tests/matmul_apu_backend.py b/integration_tests/matmul_apu_backend.py new file mode 100644 index 0000000000..10c4b7bec8 --- /dev/null +++ b/integration_tests/matmul_apu_backend.py @@ -0,0 +1,44 @@ +from numpy import empty, uint16 +from lpython import Annotation, u16, SIMD, L1, i32, ccall, i16, i64, CPtr, TypeVar + + +n = TypeVar("n") +m = TypeVar("m") +l = TypeVar("l") + +def matmul_test(l: i32, n: i32, m: i32, M1: i32, M2: i32, V: i32, + A: u16[n, m], B: u16[m, l], C: u16[n, l]): + L1cache: Annotation[u16[M2 + 1, V], L1] = empty((M2 + 1, V), dtype=uint16) + B_vr: Annotation[u16[V], SIMD] = empty((V,), dtype=uint16) + C_vr: Annotation[u16[V], SIMD] = empty((V,), dtype=uint16) + T_vr: Annotation[u16[V], SIMD] = empty((V,), dtype=uint16) + i: i32; j: i32; jj: i32; ii: i32; kk: i32; k: i32; A_ik: u16 + + L1_B_index: i32 = 0 + L1_C_base: i32 = 1 + + # I think in the APU code, M1=1, so the loops over kk and k get fused + # Closer to the APU code + + # Wrong value on purpose to check that all entries will be overriden + C[:,:] = u16(1) + for jj in range(0, l, V): + for ii in range(0, n, M2): + for i in range(M2): + C_vr[:] = u16(0) + L1cache[L1_C_base+i,:] = C_vr[:] + for kk in range(0, m, M1): + for k in range(M1): + L1cache[L1_B_index,:] = B[kk+k, jj:jj+V] + B_vr[:] = L1cache[L1_B_index,:] + for i in range(M2): + A_ik = A[ii+i,kk+k] + C_vr[:] = L1cache[L1_C_base+i,:] + T_vr[:] = A_ik + T_vr[:] = B_vr[:] * T_vr[:] + C_vr[:] = C_vr[:] + T_vr[:] + L1cache[L1_C_base+i,:] = C_vr[:] + for i in range(M2): + C[ii+i,jj:jj+V] = L1cache[L1_C_base+i,:] + + print(C) diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py new file mode 100644 index 0000000000..917166f0bd --- /dev/null +++ b/integration_tests/matmul_integration.py @@ -0,0 +1,117 @@ +import numpy +from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64) + +######## ALL THE LINES WITH EIGHT COMMENT MARKS ARE THE ONES WE NEED TO +######## BRING UP! AS IT STANDS, THIS CODE WORKS IN LPYTHON MAIN AS OF 4 +######## FEBRUARY 2024. + +# https://numpy.org/devdocs/reference/typing.html +######## from numpy.typing import NDArray + + +# plan for 30 Jan 2024 -- +# step 0: comment out this code and ./build_baryon.sh to run on APU +# emulator; or ./run_full_emulation.sh to run in CPython. +# step 1: side-by-side numpy implementation in full-emulation +# - get there line-by-line +# = focus on gvml_add_u16 first + + +def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, + A: CPtr, B: CPtr, C: CPtr) -> \ + None: ######## NDArray[numpy.int16]: + VR_SIZE: i32 = 32_768 + + # In the primary example, n = 15, m = 3, l = 32_768, + # M1 = 1, M2 = 5 + + # source GSI L4 arrays + pA_nm: Pointer[i16[:]] = c_p_pointer(A, i16[:], array([n * m])) + pB_ml: Pointer[i16[:]] = c_p_pointer(B, i16[:], array([m * l])) + + # source numpy arrays + ######## A_nm: NDArray[numpy.int16] = numpy.zeros((n, m), dtype=numpy.int16) + ######## for row in range(n): + ######## A_nm[row,:] = pA_nm[(row * m):((row + 1) * m)] + A_nm: Array[i16, n, m] + row : i32 + for row in range(n): + col : i32 + for col in range(m): + A_nm[row, col] = pA_nm[(row * m):((row * m) + col)] + + ######## B_ml: NDArray[numpy.int16] = numpy.zeros((m, l), dtype=numpy.int16) + ######## for row in range(m): + ######## B_ml[row,:] = pB_ml[(row * l):((row + 1) * l)] + + # # destination numpy array + ######## C_nl: NDArray[numpy.int16] = numpy.zeros((n, l), dtype=numpy.int16) + + # destination GSI L4 array + pC_nl: Pointer[i16[:]] = c_p_pointer(C, i16[:], array([n * l])) + + # First, accumulate outer product without blocking. This is + # the code we would -ultimately- like to compile. Notice that + # all GSI-specific L1, L4, MMB are hidden. + + k: i32 + ######## for k in range(0, m): + ######## C_nl += numpy.outer(A_nm[:,k], B_ml[k,:]) + ######## pass + + # expect + # [[ 5 8 11 ... 20 23 26], + # [ 8 14 20 ... 38 44 50], + # [11 20 29 ... 56 65 74], ... + # + # [ 8 14 20 ... 38 44 50], + # [11 20 29 ... 56 65 74], + # [14 26 38 ... 74 86 98]] + set_breakpoint_here_and_inspect_C_nl : i32 = 0 + + # Second, with explicit blocking. This is a stepping-stone + # for our back-end. Notice that L1 and MMB are hidden. + + # T_1l: NDArray[numpy.int16] = numpy.zeros((1, l), dtype=numpy.int16) + # B_1l: NDArray[numpy.int16] = numpy.zeros((1, l), dtype=numpy.int16) + A_ik: i16 + jj: i32 + ii: i32 + i: i32 + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # zero-out rows of C + ######## C_nl[i + ii, :] = 0 + pass + for k in range(0, m): # rows of B + # B_1l[0, :] = B_ml[k, :] + for i in range(0, M2): + ######## A_ik = A_nm[i + ii, k] + # broadcast a single element of A + # T_1l[0, :] = A_ik + # pointwise (Hadamard) product: + # T_1l[0, :] = np.multiply(B_1l[0, :], T_1l[0, :]) + # C_nl[i + ii, :] += T_1l[0, :] + # optimization without the temporaries + ######## C_nl[i + ii, :] += B_ml[k, :] * A_ik + pass + + set_breakpoint_here_and_inspect_C_nl = 0 + + ######## return C_nl + +def main(): + n : i32 = 15 + m : i32 = 3 + l : i32 = 32_768 + M1 : i32 = 1 + M2 : i32 = 5 + A_l4 : CPtr + B_l4 : CPtr + C_l4 : CPtr + numpy_side_by_side(n, m, l, M1, M2, A_l4, B_l4, C_l4) + print ("hello, world!") + + +if __name__ == "__main__": + main() From 3c44d8ebd3c1cabfb6247934d852928a4b8d0092 Mon Sep 17 00:00:00 2001 From: rebcabin Date: Mon, 5 Feb 2024 20:17:14 -0800 Subject: [PATCH 05/16] w.i.p. in working state --- ISSUES/UNHANDLED-EXCEPTIONS/Issue2487.py | 8 +++++ integration_tests/matmul_integration.py | 42 +++++++++++++++++------- 2 files changed, 39 insertions(+), 11 deletions(-) create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2487.py diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2487.py b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2487.py new file mode 100644 index 0000000000..2bf167c2ff --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2487.py @@ -0,0 +1,8 @@ +from lpython import CPtr + +A : CPtr + +def foo(a : CPtr) -> None: + pass + +foo(A) diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index 917166f0bd..78e23e67b4 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -1,5 +1,7 @@ import numpy -from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64) +from numpy import array +from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64, + ccall, sizeof) ######## ALL THE LINES WITH EIGHT COMMENT MARKS ARE THE ONES WE NEED TO ######## BRING UP! AS IT STANDS, THIS CODE WORKS IN LPYTHON MAIN AS OF 4 @@ -29,16 +31,27 @@ def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, pA_nm: Pointer[i16[:]] = c_p_pointer(A, i16[:], array([n * m])) pB_ml: Pointer[i16[:]] = c_p_pointer(B, i16[:], array([m * l])) + print(pA_nm[0]) + assert pA_nm[0] == i16(0) + + pA_nm[0] = i16(32_767) + assert pA_nm[0] == i16(0x7FFF) + print(pA_nm[0]) + + pA_nm[0] += i16(1) + assert pA_nm[0] == i16(-32_768) + print(pA_nm[0]) + # source numpy arrays ######## A_nm: NDArray[numpy.int16] = numpy.zeros((n, m), dtype=numpy.int16) ######## for row in range(n): ######## A_nm[row,:] = pA_nm[(row * m):((row + 1) * m)] - A_nm: Array[i16, n, m] - row : i32 - for row in range(n): - col : i32 - for col in range(m): - A_nm[row, col] = pA_nm[(row * m):((row * m) + col)] + # A_nm: Array[i16, n, m] + # row : i32 + # for row in range(n): + # col : i32 + # for col in range(m): + # A_nm[row, col] = pA_nm[(row * m):((row * m) + col)] ######## B_ml: NDArray[numpy.int16] = numpy.zeros((m, l), dtype=numpy.int16) ######## for row in range(m): @@ -100,16 +113,23 @@ def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, ######## return C_nl + +@ccall +def _lfortran_malloc(size : i32) -> CPtr: + """borrowed from bindc_07.py in integration_tests""" + pass + + def main(): n : i32 = 15 m : i32 = 3 l : i32 = 32_768 M1 : i32 = 1 M2 : i32 = 5 - A_l4 : CPtr - B_l4 : CPtr - C_l4 : CPtr - numpy_side_by_side(n, m, l, M1, M2, A_l4, B_l4, C_l4) + Anm_l4 : CPtr = _lfortran_malloc( (n * m) * i32(sizeof(i16)) ) + Bml_l4 : CPtr = _lfortran_malloc( (m * l) * i32(sizeof(i16)) ) + Cnl_l4 : CPtr = _lfortran_malloc( (n * l) * i32(sizeof(i16)) ) + numpy_side_by_side(n, m, l, M1, M2, Anm_l4, Bml_l4, Cnl_l4) print ("hello, world!") From 67527c446ba43fc25b968f9faf61d66d2c4e545b Mon Sep 17 00:00:00 2001 From: rebcabin Date: Tue, 6 Feb 2024 20:18:43 -0800 Subject: [PATCH 06/16] interim w.i.p. --- ISSUES/Issue2496.py | 22 +++++ ISSUES/Issue2499.py | 11 +++ ISSUES/SIGSEGV/Issue2498.py | 20 ++++ ISSUES/UNHANDLED-EXCEPTIONS/Issue2491.py | 13 +++ ISSUES/UNHANDLED-EXCEPTIONS/Issue2492.py | 22 +++++ ISSUES/UNHANDLED-EXCEPTIONS/Issue2494.py | 15 +++ ISSUES/UNHANDLED-EXCEPTIONS/Issue2495.py | 7 ++ ISSUES/UNHANDLED-EXCEPTIONS/Issue2497.py | 48 ++++++++++ integration_tests/matmul_integration.py | 116 +++++++++++++++++++---- 9 files changed, 258 insertions(+), 16 deletions(-) create mode 100644 ISSUES/Issue2496.py create mode 100644 ISSUES/Issue2499.py create mode 100644 ISSUES/SIGSEGV/Issue2498.py create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2491.py create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2492.py create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2494.py create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2495.py create mode 100644 ISSUES/UNHANDLED-EXCEPTIONS/Issue2497.py diff --git a/ISSUES/Issue2496.py b/ISSUES/Issue2496.py new file mode 100644 index 0000000000..8dab66fc9a --- /dev/null +++ b/ISSUES/Issue2496.py @@ -0,0 +1,22 @@ +from numpy import array, empty, int16 +from lpython import (i16, i32, c_p_pointer, Pointer, CPtr, TypeVar) + + +Tn = TypeVar("Tn") +Tm = TypeVar("Tm") +Tl = TypeVar("Tl") + + +def THIS_WORKS(Anm_l4: CPtr, Tn: i32, Tm: i32, l: i32) -> i16[Tn, Tm]: + A_nm: i16[Tn, Tm] = empty((Tn, Tm), dtype=int16) + return A_nm + + +def THIS_DOESNT_WORK(d: i16[Tm, Tn], b: CPtr, Tm: i32, Tn: i32) -> None: + B: Pointer[i16[:]] = c_p_pointer(b, i16[:], array([Tm * Tn])) + i: i32 + j: i32 + for i in range(Tm): + for j in range(Tn): + d[i, j] = B[(i * Tn) + j] + diff --git a/ISSUES/Issue2499.py b/ISSUES/Issue2499.py new file mode 100644 index 0000000000..73f5ae8d42 --- /dev/null +++ b/ISSUES/Issue2499.py @@ -0,0 +1,11 @@ +from lpython import i32 +i: i32 +for i in range(10): + print(i) +for i in range(0, 10): + print(i) +for i in range(0, 10, 2): + print(i) +M2: i32 = 1 +for i in range(0, 10): + print(i) \ No newline at end of file diff --git a/ISSUES/SIGSEGV/Issue2498.py b/ISSUES/SIGSEGV/Issue2498.py new file mode 100644 index 0000000000..cbfc0860b8 --- /dev/null +++ b/ISSUES/SIGSEGV/Issue2498.py @@ -0,0 +1,20 @@ +from numpy import array, empty, int16 +from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64, + ccall, sizeof, Array, Allocatable, TypeVar, Const) + + +rows = TypeVar("rows") +cols = TypeVar("cols") + + +def spot_print_lpython_array(a: i16[:], rows: i32, cols: i32) -> i16[rows, cols]: + pass + + +def main() -> i32: + print ("hello, world!") + return 0 + + +if __name__ == "__main__": + main() diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2491.py b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2491.py new file mode 100644 index 0000000000..baca03ced2 --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2491.py @@ -0,0 +1,13 @@ +import numpy +from numpy import empty + +from lpython import (i16) + + +def main(): + A_nm: i16[15, 3] = empty((15, 3), dtype=numpy.int16) + print ("hello, world!") + + +if __name__ == "__main__": + main() diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2492.py b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2492.py new file mode 100644 index 0000000000..635a5e8134 --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2492.py @@ -0,0 +1,22 @@ +from numpy import empty, int16 + +from lpython import (i16, i32, CPtr, ccall, sizeof) + + +@ccall +def _lfortran_malloc(size : i32) -> CPtr: + """borrowed from bindc_07.py in integration_tests""" + pass + + +def main(): + n : i32 = 15 + m : i32 = 3 + + # Emulate getting stuff from the C side. + Anm_l4 : CPtr = _lfortran_malloc( (n * m) * i32(sizeof(i16)) ) + A_nm: i16[n, m] = empty((n, m), dtype=int16) + + +if __name__ == "__main__": + main() diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2494.py b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2494.py new file mode 100644 index 0000000000..745b743cce --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2494.py @@ -0,0 +1,15 @@ +from numpy import empty, int16 + +from lpython import (i16, i32, Allocatable) + + +# doesn't work: +# def to_lpython_array(n: i32, m: i32) -> Array[i16, n, m]: #ndarray(Any, dtype=int16): +# works: +# def to_lpython_array(n: i32, m: i32) -> Array[i16, 15, 3]: #ndarray(Any, dtype=int16): +# doesn't work: +def to_lpython_array(n: i32, m: i32) -> Allocatable[i16[:]]: + A_nm: i16[n, m] = empty((n, m), dtype=int16) + return A_nm + + diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2495.py b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2495.py new file mode 100644 index 0000000000..07ad62e468 --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2495.py @@ -0,0 +1,7 @@ +from lpython import i16, Allocatable +from numpy import empty, int16 + + +def foo() -> Allocatable[i16[:]]: + result: i16[1] = empty((1,), dtype=int16) + return result \ No newline at end of file diff --git a/ISSUES/UNHANDLED-EXCEPTIONS/Issue2497.py b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2497.py new file mode 100644 index 0000000000..ca17294ab7 --- /dev/null +++ b/ISSUES/UNHANDLED-EXCEPTIONS/Issue2497.py @@ -0,0 +1,48 @@ +import numpy +from numpy import array, empty, int16 +from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64, + ccall, sizeof, Array, Allocatable, TypeVar, Const) + + +@ccall +def _lfortran_malloc(size : i32) -> CPtr: + """Borrow from bindc_07.py in integration_tests.""" + pass + + +rows = TypeVar("rows") +cols = TypeVar("cols") + + +def load_lpython_array_from_c_fortran_array(b: CPtr, rows: i32, cols: i32) -> i16[rows, cols]: + """Load an LPython array from a C / Fortran array.""" + B: Pointer[i16[:]] = c_p_pointer(b, i16[:], array([rows * cols])) + D: i16[rows, cols] = empty((rows, cols), dtype=int16) + i: i32 + j: i32 + for i in range(rows): + for j in range(cols): + D[i, j] = B[(i * cols) + j] + return D + + +def spot_print_lpython_array(a: i16[:], n: i32, m: i32) -> None: + pass + + +def main() -> i32: + + # "Const" lets these appear in type declarations such as i16[n, m] + n : Const[i32] = 15 + m : Const[i32] = 3 + + Anm_l4 : CPtr = _lfortran_malloc((n * m) * i32(sizeof(i16))) + + Anm: i16[n, m] = load_lpython_array_from_c_fortran_array(Anm_l4, n, m) + spot_print_lpython_array(Anm, n, m) + + return 0 + + +if __name__ == "__main__": + main() diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index 78e23e67b4..d01baabe3f 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -1,7 +1,7 @@ import numpy -from numpy import array +from numpy import array, empty, int16 from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64, - ccall, sizeof) + ccall, sizeof, Array, Allocatable, TypeVar, Const) ######## ALL THE LINES WITH EIGHT COMMENT MARKS ARE THE ONES WE NEED TO ######## BRING UP! AS IT STANDS, THIS CODE WORKS IN LPYTHON MAIN AS OF 4 @@ -46,12 +46,12 @@ def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, ######## A_nm: NDArray[numpy.int16] = numpy.zeros((n, m), dtype=numpy.int16) ######## for row in range(n): ######## A_nm[row,:] = pA_nm[(row * m):((row + 1) * m)] - # A_nm: Array[i16, n, m] - # row : i32 + A_nm: i16[n, m] = empty((n, m), dtype=int16) + # row : i32 = 0 # for row in range(n): # col : i32 # for col in range(m): - # A_nm[row, col] = pA_nm[(row * m):((row * m) + col)] + # A_nm[row, col] = pA_nm[(row * m) + col] ######## B_ml: NDArray[numpy.int16] = numpy.zeros((m, l), dtype=numpy.int16) ######## for row in range(m): @@ -116,21 +116,105 @@ def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, @ccall def _lfortran_malloc(size : i32) -> CPtr: - """borrowed from bindc_07.py in integration_tests""" + """Borrow from bindc_07.py in integration_tests.""" pass -def main(): - n : i32 = 15 - m : i32 = 3 - l : i32 = 32_768 - M1 : i32 = 1 - M2 : i32 = 5 - Anm_l4 : CPtr = _lfortran_malloc( (n * m) * i32(sizeof(i16)) ) - Bml_l4 : CPtr = _lfortran_malloc( (m * l) * i32(sizeof(i16)) ) - Cnl_l4 : CPtr = _lfortran_malloc( (n * l) * i32(sizeof(i16)) ) - numpy_side_by_side(n, m, l, M1, M2, Anm_l4, Bml_l4, Cnl_l4) +def init_c_fortran_array(b: CPtr, rows: i32, cols: i32, mod: i32) -> None: + """Initialize a C / Fortran array with test data.""" + B: Pointer[i16[:]] = c_p_pointer(b, i16[:], array([rows * cols])) + i: i32 + j: i32 + for i in range(rows): + for j in range(cols): + B[(i * cols) + j] = i16((i + j) % mod) + + +def zero_c_fortran_array(b: CPtr, rows: i32, cols: i32) -> None: + """Zero out a C / Fortran array.""" + B: Pointer[i16[:]] = c_p_pointer(b, i16[:], array([rows * cols])) + i: i32 + j: i32 + for i in range(rows): + for j in range(cols): + B[(i * cols) + j] = i16(0) + + +rows = TypeVar("rows") +cols = TypeVar("cols") + + +def load_lpython_array_from_c_fortran_array(b: CPtr, rows: i32, cols: i32) -> i16[rows, cols]: + """Load an LPython array from a C / Fortran array.""" + B: Pointer[i16[:]] = c_p_pointer(b, i16[:], array([rows * cols])) + D: i16[rows, cols] = empty((rows, cols), dtype=int16) + i: i32 + j: i32 + for i in range(rows): + for j in range(cols): + D[i, j] = B[(i * cols) + j] + return D + + +def spot_print(a: i16[:]) -> None: + """Issue2497""" + return + + +def main() -> i32: + + # "Const" lets these appear in type declarations such as i16[n, m] + n : Const[i32] = 15 + m : Const[i32] = 3 + l : Const[i32] = 32_768 + M1 : Const[i32] = 1 + M2 : Const[i32] = 5 + + Anm_l4 : CPtr = _lfortran_malloc((n * m) * i32(sizeof(i16))) + Bml_l4 : CPtr = _lfortran_malloc((m * l) * i32(sizeof(i16))) + Cnl_l4 : CPtr = _lfortran_malloc((n * l) * i32(sizeof(i16))) + + init_c_fortran_array(Anm_l4, n, m, 11) + init_c_fortran_array(Bml_l4, m, l, 13) + zero_c_fortran_array(Cnl_l4, n, l) + + print (Anm_l4) + + Anm: i16[n, m] = load_lpython_array_from_c_fortran_array(Anm_l4, n, m) + # Issue2497 spot_print(Anm) + print(Anm) + Bml: i16[m, l] = load_lpython_array_from_c_fortran_array(Bml_l4, m, l) + # print(Bml) + Cnl: i16[n, l] = load_lpython_array_from_c_fortran_array(Cnl_l4, n, l) + # print(Cnl) + + VR_SIZE: i32 = 32_768 + k: i32 + A_ik: i16 + jj: i32 + ii: i32 + i: i32 + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n):#, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # zero-out rows of C + ######## C_nl[i + ii, :] = 0 + pass + for k in range(0, m): # rows of B + # B_1l[0, :] = B_ml[k, :] + for i in range(0, M2): + ######## A_ik = A_nm[i + ii, k] + # broadcast a single element of A + # T_1l[0, :] = A_ik + # pointwise (Hadamard) product: + # T_1l[0, :] = np.multiply(B_1l[0, :], T_1l[0, :]) + # C_nl[i + ii, :] += T_1l[0, :] + # optimization without the temporaries + ######## C_nl[i + ii, :] += B_ml[k, :] * A_ik + pass + + print ("hello, world!") + return 0 if __name__ == "__main__": From 3a3782e13ae34e764ad8bc57c18111795f5c8845 Mon Sep 17 00:00:00 2001 From: rebcabin Date: Wed, 7 Feb 2024 10:41:31 -0800 Subject: [PATCH 07/16] matmul is done --- ISSUES/Issue2499.py | 27 ++-- ISSUES/Issue2503.py | 25 ++++ integration_tests/matmul_integration.py | 165 ++++++++---------------- 3 files changed, 99 insertions(+), 118 deletions(-) create mode 100644 ISSUES/Issue2503.py diff --git a/ISSUES/Issue2499.py b/ISSUES/Issue2499.py index 73f5ae8d42..797650693c 100644 --- a/ISSUES/Issue2499.py +++ b/ISSUES/Issue2499.py @@ -1,11 +1,18 @@ -from lpython import i32 +from lpython import i32, i16, Const +VR_SIZE: i32 = 32_768 +l: Const[i32] = VR_SIZE +n: Const[i32] = 15 +m: Const[i32] = 3 +k: i32 +M2: Const[i32] = 5 +A_ik: i16 +jj: i32 +ii: i32 i: i32 -for i in range(10): - print(i) -for i in range(0, 10): - print(i) -for i in range(0, 10, 2): - print(i) -M2: i32 = 1 -for i in range(0, 10): - print(i) \ No newline at end of file +for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # zero-out rows of C + pass + for k in range(0, m): # rows of B + for i in range(0, M2): + pass diff --git a/ISSUES/Issue2503.py b/ISSUES/Issue2503.py new file mode 100644 index 0000000000..0b0068fc06 --- /dev/null +++ b/ISSUES/Issue2503.py @@ -0,0 +1,25 @@ +from lpython import (i16, i32, Const) +from numpy import empty, int16 +dim: Const[i32] = 10 + + +def foo(): + """Negative indices produce random results each run.""" + A: i16[dim] = empty((dim,), dtype=int16) + ww: i32 + for ww in range(dim): + A[ww] = i16(ww + 1) + print(A[0], A[1], A[2], "...", A[-3], A[-2], A[-1]) + + +def bar(dim_: i32): + """Negative indices always produce zero when 'dim' is a parameter.""" + A: i16[dim_] = empty((dim_,), dtype=int16) + ww: i32 + for ww in range(dim_): + A[ww] = i16(ww + 1) + print(A[0], A[1], A[2], "...", A[-3], A[-2], A[-1]) + + +foo() +bar(10) \ No newline at end of file diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index d01baabe3f..aa9abbfde4 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -19,101 +19,6 @@ # = focus on gvml_add_u16 first -def numpy_side_by_side(n: i32, m: i32, l: i32, M1: i32, M2: i32, - A: CPtr, B: CPtr, C: CPtr) -> \ - None: ######## NDArray[numpy.int16]: - VR_SIZE: i32 = 32_768 - - # In the primary example, n = 15, m = 3, l = 32_768, - # M1 = 1, M2 = 5 - - # source GSI L4 arrays - pA_nm: Pointer[i16[:]] = c_p_pointer(A, i16[:], array([n * m])) - pB_ml: Pointer[i16[:]] = c_p_pointer(B, i16[:], array([m * l])) - - print(pA_nm[0]) - assert pA_nm[0] == i16(0) - - pA_nm[0] = i16(32_767) - assert pA_nm[0] == i16(0x7FFF) - print(pA_nm[0]) - - pA_nm[0] += i16(1) - assert pA_nm[0] == i16(-32_768) - print(pA_nm[0]) - - # source numpy arrays - ######## A_nm: NDArray[numpy.int16] = numpy.zeros((n, m), dtype=numpy.int16) - ######## for row in range(n): - ######## A_nm[row,:] = pA_nm[(row * m):((row + 1) * m)] - A_nm: i16[n, m] = empty((n, m), dtype=int16) - # row : i32 = 0 - # for row in range(n): - # col : i32 - # for col in range(m): - # A_nm[row, col] = pA_nm[(row * m) + col] - - ######## B_ml: NDArray[numpy.int16] = numpy.zeros((m, l), dtype=numpy.int16) - ######## for row in range(m): - ######## B_ml[row,:] = pB_ml[(row * l):((row + 1) * l)] - - # # destination numpy array - ######## C_nl: NDArray[numpy.int16] = numpy.zeros((n, l), dtype=numpy.int16) - - # destination GSI L4 array - pC_nl: Pointer[i16[:]] = c_p_pointer(C, i16[:], array([n * l])) - - # First, accumulate outer product without blocking. This is - # the code we would -ultimately- like to compile. Notice that - # all GSI-specific L1, L4, MMB are hidden. - - k: i32 - ######## for k in range(0, m): - ######## C_nl += numpy.outer(A_nm[:,k], B_ml[k,:]) - ######## pass - - # expect - # [[ 5 8 11 ... 20 23 26], - # [ 8 14 20 ... 38 44 50], - # [11 20 29 ... 56 65 74], ... - # - # [ 8 14 20 ... 38 44 50], - # [11 20 29 ... 56 65 74], - # [14 26 38 ... 74 86 98]] - set_breakpoint_here_and_inspect_C_nl : i32 = 0 - - # Second, with explicit blocking. This is a stepping-stone - # for our back-end. Notice that L1 and MMB are hidden. - - # T_1l: NDArray[numpy.int16] = numpy.zeros((1, l), dtype=numpy.int16) - # B_1l: NDArray[numpy.int16] = numpy.zeros((1, l), dtype=numpy.int16) - A_ik: i16 - jj: i32 - ii: i32 - i: i32 - for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C - for ii in range(0, n, M2): # each M2 block in A cols and B rows - for i in range(0, M2): # zero-out rows of C - ######## C_nl[i + ii, :] = 0 - pass - for k in range(0, m): # rows of B - # B_1l[0, :] = B_ml[k, :] - for i in range(0, M2): - ######## A_ik = A_nm[i + ii, k] - # broadcast a single element of A - # T_1l[0, :] = A_ik - # pointwise (Hadamard) product: - # T_1l[0, :] = np.multiply(B_1l[0, :], T_1l[0, :]) - # C_nl[i + ii, :] += T_1l[0, :] - # optimization without the temporaries - ######## C_nl[i + ii, :] += B_ml[k, :] * A_ik - pass - - set_breakpoint_here_and_inspect_C_nl = 0 - - ######## return C_nl - - @ccall def _lfortran_malloc(size : i32) -> CPtr: """Borrow from bindc_07.py in integration_tests.""" @@ -161,14 +66,21 @@ def spot_print(a: i16[:]) -> None: return +def clear_row(a: i16[:], row: i32, cols: i32) -> None: + j: i32 + for j in range(cols): + a[row, j] = i16(0) + + def main() -> i32: # "Const" lets these appear in type declarations such as i16[n, m] n : Const[i32] = 15 m : Const[i32] = 3 l : Const[i32] = 32_768 - M1 : Const[i32] = 1 - M2 : Const[i32] = 5 + + M1 : i32 = 1 + M2 : i32 = 5 # Issue 2499 -- can't be Const Anm_l4 : CPtr = _lfortran_malloc((n * m) * i32(sizeof(i16))) Bml_l4 : CPtr = _lfortran_malloc((m * l) * i32(sizeof(i16))) @@ -181,39 +93,76 @@ def main() -> i32: print (Anm_l4) Anm: i16[n, m] = load_lpython_array_from_c_fortran_array(Anm_l4, n, m) - # Issue2497 spot_print(Anm) + # Issue 2497 spot_print(Anm) print(Anm) Bml: i16[m, l] = load_lpython_array_from_c_fortran_array(Bml_l4, m, l) # print(Bml) Cnl: i16[n, l] = load_lpython_array_from_c_fortran_array(Cnl_l4, n, l) # print(Cnl) + # Temporaries (TODO: get rid of them, as indicated by proposed syntax below) + B1l: i16[1, l] = empty((1, l), dtype=int16) + T1l: i16[1, l] = empty((1, l), dtype=int16) + VR_SIZE: i32 = 32_768 k: i32 A_ik: i16 jj: i32 ii: i32 i: i32 + ww: i32 # "ww" is short for "workaround_index." for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C - for ii in range(0, n):#, M2): # each M2 block in A cols and B rows + for ii in range(0, n, M2): # each M2 block in A cols and B rows for i in range(0, M2): # zero-out rows of C - ######## C_nl[i + ii, :] = 0 + # Due to Issue 2496, I cannot pass an array to a function + # clear_row(Cnl, i + ii, l) + # Due to Issue 2500, I cannot broadcast a constant + # Cnl[i + ii, :] = 0 + for ww in range(0, l): + Cnl[i + ii, ww] = i16(0) pass for k in range(0, m): # rows of B + # Issues 2496 and 2500 prevent the desirable form and workaround # B_1l[0, :] = B_ml[k, :] + for ww in range(0, l): + B1l[0, ww] = Bml[k, ww] for i in range(0, M2): - ######## A_ik = A_nm[i + ii, k] - # broadcast a single element of A - # T_1l[0, :] = A_ik + A_ik = Anm[i + ii, k] + # broadcast a single element of A (why? might have a SIMD vector register for T1l) + # T1l[0, :] = A_ik + for ww in range(0, l): + T1l[0, ww] = A_ik # pointwise (Hadamard) product: - # T_1l[0, :] = np.multiply(B_1l[0, :], T_1l[0, :]) - # C_nl[i + ii, :] += T_1l[0, :] + # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) + for ww in range(0, l): + T1l[0, ww] *= B1l[0, ww] + # Accumulated outer product: + # Cnl[i + ii, :] += T1l[0, :] + for ww in range(0, l): + Cnl[i + ii, ww] += T1l[0, ww] # optimization without the temporaries - ######## C_nl[i + ii, :] += B_ml[k, :] * A_ik + ######## Cnl[i + ii, :] += Bml[k, :] * A_ik pass - - print ("hello, world!") + print("Expect:") + print("[[ 5 8 11 ... 20 23 26],") + print(" [ 8 14 20 ... 38 44 50],") + print(" [11 20 29 ... 56 65 74], ...") + print("") + print(" [ 8 14 20 ... 38 44 50],") + print(" [11 20 29 ... 56 65 74],") + print(" [14 26 38 ... 74 86 98]]") + print("") + print("Actual:") + for ww in range(0, 3): + print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) + print("...") + for ww in range(n-3, n): + print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) + # print(Cnl) + # for ww in range(0, l): + # T1l[0, ww] = Cnl[0, ww] + # print(T1l) return 0 From 97fa7624c833d1d1ec67a7e3e586592938e756ef Mon Sep 17 00:00:00 2001 From: rebcabin Date: Thu, 8 Feb 2024 08:10:23 -0800 Subject: [PATCH 08/16] hand-optimized branch --- integration_tests/matmul_integration.asr | 2999 ++++++++++++++++++++++ integration_tests/matmul_integration.py | 80 +- 2 files changed, 3048 insertions(+), 31 deletions(-) create mode 100644 integration_tests/matmul_integration.asr diff --git a/integration_tests/matmul_integration.asr b/integration_tests/matmul_integration.asr new file mode 100644 index 0000000000..eb0e0fbafd --- /dev/null +++ b/integration_tests/matmul_integration.asr @@ -0,0 +1,2999 @@ +(TranslationUnit + (SymbolTable + 1 + { + __main__: + (Module + (SymbolTable + 2 + { + __lcompilers_dummy: + (Variable + 2 + __lcompilers_dummy + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + __main__global_stmts: + (Function + (SymbolTable + 244 + { + + }) + __main__global_stmts + (FunctionType + [] + () + Source + Implementation + () + .false. + .false. + .false. + .false. + .false. + [] + .false. + ) + [main] + [] + [(If + (StringCompare + (Var 2 __name__) + Eq + (StringConstant + "__main__" + (Character 1 8 ()) + ) + (Logical 4) + () + ) + [(= + (Var 2 __lcompilers_dummy) + (FunctionCall + 2 main + () + [((LogicalConstant + .true. + (Logical 4) + ))] + (Integer 4) + () + () + ) + () + )] + [] + )] + () + Public + .false. + .false. + () + ), + __name__: + (Variable + 2 + __name__ + [] + Local + (StringConstant + "__main__" + (Character 1 8 ()) + ) + (StringConstant + "__main__" + (Character 1 8 ()) + ) + Default + (Character 1 8 ()) + () + Source + Public + Required + .false. + ), + _lfortran_malloc: + (Function + (SymbolTable + 211 + { + _lpython_return_variable: + (Variable + 211 + _lpython_return_variable + [] + ReturnVar + () + () + Default + (CPtr) + () + BindC + Public + Required + .false. + ), + size: + (Variable + 211 + size + [] + In + () + () + Default + (Integer 4) + () + BindC + Public + Required + .true. + ) + }) + _lfortran_malloc + (FunctionType + [(Integer 4)] + (CPtr) + BindC + Interface + () + .false. + .false. + .false. + .false. + .false. + [] + .false. + ) + [] + [(Var 211 size)] + [] + (Var 211 _lpython_return_variable) + Public + .false. + .false. + () + ), + clear_row: + (Function + (SymbolTable + 216 + { + a: + (Variable + 216 + a + [] + InOut + () + () + Default + (Array + (Integer 2) + [(() + ())] + DescriptorArray + ) + () + Source + Public + Required + .false. + ), + cols: + (Variable + 216 + cols + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + j: + (Variable + 216 + j + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + row: + (Variable + 216 + row + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ) + }) + clear_row + (FunctionType + [(Array + (Integer 2) + [(() + ())] + DescriptorArray + ) + (Integer 4) + (Integer 4)] + () + Source + Implementation + () + .false. + .false. + .false. + .false. + .false. + [] + .false. + ) + [] + [(Var 216 a) + (Var 216 row) + (Var 216 cols)] + [(DoLoop + () + ((Var 216 j) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 216 cols) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 216 a) + [(() + (Var 216 row) + ()) + (() + (Var 216 j) + ())] + (Integer 2) + RowMajor + () + ) + (Cast + (IntegerConstant 0 (Integer 4)) + IntegerToInteger + (Integer 2) + (IntegerConstant 0 (Integer 2)) + ) + () + )] + )] + () + Public + .false. + .false. + () + ), + cols: + (Variable + 2 + cols + [] + Local + () + () + Default + (TypeParameter + cols + ) + () + Source + Public + Required + .false. + ), + init_c_fortran_array: + (Function + (SymbolTable + 212 + { + B: + (Variable + 212 + B + [] + Local + () + () + Default + (Pointer + (Array + (Integer 2) + [(() + ())] + DescriptorArray + ) + ) + () + Source + Public + Required + .false. + ), + b: + (Variable + 212 + b + [] + In + () + () + Default + (CPtr) + () + Source + Public + Required + .false. + ), + block: + (Block + (SymbolTable + 218 + { + block: + (Block + (SymbolTable + 219 + { + _mod: + (ExternalSymbol + 219 + _mod + 86 _mod + lpython_builtin + [] + _mod + Private + ), + _mod@__lpython_overloaded_2___mod: + (ExternalSymbol + 219 + _mod@__lpython_overloaded_2___mod + 86 __lpython_overloaded_2___mod + lpython_builtin + [] + __lpython_overloaded_2___mod + Public + ) + }) + block + [(= + (ArrayItem + (Var 212 B) + [(() + (IntegerBinOp + (IntegerBinOp + (Var 212 i) + Mul + (Var 212 cols) + (Integer 4) + () + ) + Add + (Var 212 j) + (Integer 4) + () + ) + ())] + (Integer 2) + RowMajor + () + ) + (Cast + (FunctionCall + 219 _mod@__lpython_overloaded_2___mod + 219 _mod + [((IntegerBinOp + (Var 212 i) + Add + (Var 212 j) + (Integer 4) + () + )) + ((Var 212 mod))] + (Integer 4) + () + () + ) + IntegerToInteger + (Integer 2) + () + ) + () + )] + ) + }) + block + [(DoLoop + () + ((Var 212 j) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 212 cols) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(BlockCall + -1 + 218 block + )] + )] + ), + cols: + (Variable + 212 + cols + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + i: + (Variable + 212 + i + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + j: + (Variable + 212 + j + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + mod: + (Variable + 212 + mod + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + rows: + (Variable + 212 + rows + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ) + }) + init_c_fortran_array + (FunctionType + [(CPtr) + (Integer 4) + (Integer 4) + (Integer 4)] + () + Source + Implementation + () + .false. + .false. + .false. + .false. + .false. + [] + .false. + ) + [] + [(Var 212 b) + (Var 212 rows) + (Var 212 cols) + (Var 212 mod)] + [(CPtrToPointer + (Var 212 b) + (Var 212 B) + (ArrayConstant + [(IntegerBinOp + (Var 212 rows) + Mul + (Var 212 cols) + (Integer 4) + () + )] + (Array + (Integer 4) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4)))] + PointerToDataArray + ) + RowMajor + ) + (ArrayConstant + [(IntegerConstant 0 (Integer 4))] + (Array + (Integer 4) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4)))] + PointerToDataArray + ) + RowMajor + ) + ) + (DoLoop + () + ((Var 212 i) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 212 rows) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(BlockCall + -1 + 212 block + )] + )] + () + Public + .false. + .false. + () + ), + load_lpython_array_from_c_fortran_array: + (Function + (SymbolTable + 214 + { + B: + (Variable + 214 + B + [] + Local + () + () + Default + (Pointer + (Array + (Integer 2) + [(() + ())] + DescriptorArray + ) + ) + () + Source + Public + Required + .false. + ), + D: + (Variable + 214 + D + [rows + cols] + Local + () + () + Default + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (Var 214 rows)) + ((IntegerConstant 0 (Integer 4)) + (Var 214 cols))] + PointerToDataArray + ) + () + Source + Public + Required + .false. + ), + _lpython_return_variable: + (Variable + 214 + _lpython_return_variable + [rows + cols] + ReturnVar + () + () + Default + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (Var 214 rows)) + ((IntegerConstant 0 (Integer 4)) + (Var 214 cols))] + PointerToDataArray + ) + () + Source + Public + Required + .false. + ), + b: + (Variable + 214 + b + [] + In + () + () + Default + (CPtr) + () + Source + Public + Required + .false. + ), + cols: + (Variable + 214 + cols + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + i: + (Variable + 214 + i + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + j: + (Variable + 214 + j + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + rows: + (Variable + 214 + rows + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ) + }) + load_lpython_array_from_c_fortran_array + (FunctionType + [(CPtr) + (Integer 4) + (Integer 4)] + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (FunctionParam + 1 + (Integer 4) + () + )) + ((IntegerConstant 0 (Integer 4)) + (FunctionParam + 2 + (Integer 4) + () + ))] + PointerToDataArray + ) + Source + Implementation + () + .false. + .false. + .false. + .false. + .false. + [] + .false. + ) + [] + [(Var 214 b) + (Var 214 rows) + (Var 214 cols)] + [(CPtrToPointer + (Var 214 b) + (Var 214 B) + (ArrayConstant + [(IntegerBinOp + (Var 214 rows) + Mul + (Var 214 cols) + (Integer 4) + () + )] + (Array + (Integer 4) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4)))] + PointerToDataArray + ) + RowMajor + ) + (ArrayConstant + [(IntegerConstant 0 (Integer 4))] + (Array + (Integer 4) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4)))] + PointerToDataArray + ) + RowMajor + ) + ) + (= + (Var 214 D) + (ArrayConstant + [] + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (Var 214 rows)) + ((IntegerConstant 0 (Integer 4)) + (Var 214 cols))] + PointerToDataArray + ) + RowMajor + ) + () + ) + (DoLoop + () + ((Var 214 i) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 214 rows) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(DoLoop + () + ((Var 214 j) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 214 cols) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 214 D) + [(() + (Var 214 i) + ()) + (() + (Var 214 j) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 214 B) + [(() + (IntegerBinOp + (IntegerBinOp + (Var 214 i) + Mul + (Var 214 cols) + (Integer 4) + () + ) + Add + (Var 214 j) + (Integer 4) + () + ) + ())] + (Integer 2) + RowMajor + () + ) + () + )] + )] + ) + (= + (Var 214 _lpython_return_variable) + (Var 214 D) + () + ) + (Return)] + (Var 214 _lpython_return_variable) + Public + .false. + .false. + () + ), + main: + (Function + (SymbolTable + 217 + { + A_ik: + (Variable + 217 + A_ik + [] + Local + () + () + Default + (Integer 2) + () + Source + Public + Required + .false. + ), + Anm: + (Variable + 217 + Anm + [] + Local + () + () + Default + (Array + (Integer 2) + [((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 15 (Const + (Integer 4) + ))) + ((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 3 (Const + (Integer 4) + )))] + FixedSizeArray + ) + () + Source + Public + Required + .false. + ), + Anm_l4: + (Variable + 217 + Anm_l4 + [] + Local + () + () + Default + (CPtr) + () + Source + Public + Required + .false. + ), + B1l: + (Variable + 217 + B1l + [] + Local + () + () + Default + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4))) + ((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 32768 (Const + (Integer 4) + )))] + FixedSizeArray + ) + () + Source + Public + Required + .false. + ), + Bml: + (Variable + 217 + Bml + [] + Local + () + () + Default + (Array + (Integer 2) + [((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 3 (Const + (Integer 4) + ))) + ((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 32768 (Const + (Integer 4) + )))] + FixedSizeArray + ) + () + Source + Public + Required + .false. + ), + Bml_l4: + (Variable + 217 + Bml_l4 + [] + Local + () + () + Default + (CPtr) + () + Source + Public + Required + .false. + ), + Cnl: + (Variable + 217 + Cnl + [] + Local + () + () + Default + (Array + (Integer 2) + [((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 15 (Const + (Integer 4) + ))) + ((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 32768 (Const + (Integer 4) + )))] + FixedSizeArray + ) + () + Source + Public + Required + .false. + ), + Cnl_l4: + (Variable + 217 + Cnl_l4 + [] + Local + () + () + Default + (CPtr) + () + Source + Public + Required + .false. + ), + M1: + (Variable + 217 + M1 + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + M2: + (Variable + 217 + M2 + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + T1l: + (Variable + 217 + T1l + [] + Local + () + () + Default + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4))) + ((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 32768 (Const + (Integer 4) + )))] + FixedSizeArray + ) + () + Source + Public + Required + .false. + ), + VR_SIZE: + (Variable + 217 + VR_SIZE + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + _lpython_return_variable: + (Variable + 217 + _lpython_return_variable + [] + ReturnVar + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + i: + (Variable + 217 + i + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + ii: + (Variable + 217 + ii + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + jj: + (Variable + 217 + jj + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + k: + (Variable + 217 + k + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + l: + (Variable + 217 + l + [] + Local + (IntegerConstant 32768 (Integer 4)) + (IntegerConstant 32768 (Integer 4)) + Parameter + (Const + (Integer 4) + ) + () + Source + Public + Required + .false. + ), + m: + (Variable + 217 + m + [] + Local + (IntegerConstant 3 (Integer 4)) + (IntegerConstant 3 (Integer 4)) + Parameter + (Const + (Integer 4) + ) + () + Source + Public + Required + .false. + ), + n: + (Variable + 217 + n + [] + Local + (IntegerConstant 15 (Integer 4)) + (IntegerConstant 15 (Integer 4)) + Parameter + (Const + (Integer 4) + ) + () + Source + Public + Required + .false. + ), + optimize: + (Variable + 217 + optimize + [] + In + () + () + Default + (Logical 4) + () + Source + Public + Required + .false. + ), + ww: + (Variable + 217 + ww + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ) + }) + main + (FunctionType + [(Logical 4)] + (Integer 4) + Source + Implementation + () + .false. + .false. + .false. + .false. + .false. + [] + .false. + ) + [_lfortran_malloc + init_c_fortran_array + zero_c_fortran_array + load_lpython_array_from_c_fortran_array] + [(Var 217 optimize)] + [(= + (Var 217 M1) + (IntegerConstant 1 (Integer 4)) + () + ) + (= + (Var 217 M2) + (IntegerConstant 5 (Integer 4)) + () + ) + (= + (Var 217 Anm_l4) + (FunctionCall + 2 _lfortran_malloc + () + [((IntegerBinOp + (IntegerBinOp + (Var 217 n) + Mul + (Var 217 m) + (Integer 4) + (IntegerConstant 45 (Integer 4)) + ) + Mul + (Cast + (SizeOfType + (Integer 2) + (Integer 8) + () + ) + IntegerToInteger + (Integer 4) + () + ) + (Integer 4) + () + ))] + (CPtr) + () + () + ) + () + ) + (= + (Var 217 Bml_l4) + (FunctionCall + 2 _lfortran_malloc + () + [((IntegerBinOp + (IntegerBinOp + (Var 217 m) + Mul + (Var 217 l) + (Integer 4) + (IntegerConstant 98304 (Integer 4)) + ) + Mul + (Cast + (SizeOfType + (Integer 2) + (Integer 8) + () + ) + IntegerToInteger + (Integer 4) + () + ) + (Integer 4) + () + ))] + (CPtr) + () + () + ) + () + ) + (= + (Var 217 Cnl_l4) + (FunctionCall + 2 _lfortran_malloc + () + [((IntegerBinOp + (IntegerBinOp + (Var 217 n) + Mul + (Var 217 l) + (Integer 4) + (IntegerConstant 491520 (Integer 4)) + ) + Mul + (Cast + (SizeOfType + (Integer 2) + (Integer 8) + () + ) + IntegerToInteger + (Integer 4) + () + ) + (Integer 4) + () + ))] + (CPtr) + () + () + ) + () + ) + (SubroutineCall + 2 init_c_fortran_array + () + [((Var 217 Anm_l4)) + ((Var 217 n)) + ((Var 217 m)) + ((IntegerConstant 11 (Integer 4)))] + () + ) + (SubroutineCall + 2 init_c_fortran_array + () + [((Var 217 Bml_l4)) + ((Var 217 m)) + ((Var 217 l)) + ((IntegerConstant 13 (Integer 4)))] + () + ) + (SubroutineCall + 2 zero_c_fortran_array + () + [((Var 217 Cnl_l4)) + ((Var 217 n)) + ((Var 217 l))] + () + ) + (Print + [(Var 217 Anm_l4)] + () + () + ) + (= + (Var 217 Anm) + (FunctionCall + 2 load_lpython_array_from_c_fortran_array + () + [((Var 217 Anm_l4)) + ((Var 217 n)) + ((Var 217 m))] + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (Var 217 n)) + ((IntegerConstant 0 (Integer 4)) + (Var 217 m))] + FixedSizeArray + ) + () + () + ) + () + ) + (Print + [(Var 217 Anm)] + () + () + ) + (= + (Var 217 Bml) + (FunctionCall + 2 load_lpython_array_from_c_fortran_array + () + [((Var 217 Bml_l4)) + ((Var 217 m)) + ((Var 217 l))] + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (Var 217 m)) + ((IntegerConstant 0 (Integer 4)) + (Var 217 l))] + FixedSizeArray + ) + () + () + ) + () + ) + (= + (Var 217 Cnl) + (FunctionCall + 2 load_lpython_array_from_c_fortran_array + () + [((Var 217 Cnl_l4)) + ((Var 217 n)) + ((Var 217 l))] + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (Var 217 n)) + ((IntegerConstant 0 (Integer 4)) + (Var 217 l))] + FixedSizeArray + ) + () + () + ) + () + ) + (= + (Var 217 B1l) + (ArrayConstant + [] + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4))) + ((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 32768 (Const + (Integer 4) + )))] + FixedSizeArray + ) + RowMajor + ) + () + ) + (= + (Var 217 T1l) + (ArrayConstant + [] + (Array + (Integer 2) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4))) + ((IntegerConstant 0 (Const + (Integer 4) + )) + (IntegerConstant 32768 (Const + (Integer 4) + )))] + FixedSizeArray + ) + RowMajor + ) + () + ) + (= + (Var 217 VR_SIZE) + (IntegerConstant 32768 (Integer 4)) + () + ) + (Print + [(StringConstant + "optimized by hand ? " + (Character 1 20 ()) + ) + (Var 217 optimize) + (StringConstant + "\n" + (Character 1 1 ()) + )] + () + () + ) + (If + (Var 217 optimize) + [(DoLoop + () + ((Var 217 jj) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Add + (IntegerBinOp + (IntegerBinOp + (Cast + (IntegerCompare + (Var 217 VR_SIZE) + GtE + (IntegerConstant 0 (Integer 4)) + (Logical 4) + () + ) + LogicalToInteger + (Integer 4) + () + ) + Mul + (IntegerConstant -1 (Integer 4)) + (Integer 4) + () + ) + Add + (IntegerBinOp + (Cast + (IntegerCompare + (Var 217 VR_SIZE) + Lt + (IntegerConstant 0 (Integer 4)) + (Logical 4) + () + ) + LogicalToInteger + (Integer 4) + () + ) + Mul + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (Integer 4) + () + ) + (Integer 4) + () + ) + (Var 217 VR_SIZE)) + [(DoLoop + () + ((Var 217 ii) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 n) + Add + (IntegerBinOp + (IntegerBinOp + (Cast + (IntegerCompare + (Var 217 M2) + GtE + (IntegerConstant 0 (Integer 4)) + (Logical 4) + () + ) + LogicalToInteger + (Integer 4) + () + ) + Mul + (IntegerConstant -1 (Integer 4)) + (Integer 4) + () + ) + Add + (IntegerBinOp + (Cast + (IntegerCompare + (Var 217 M2) + Lt + (IntegerConstant 0 (Integer 4)) + (Logical 4) + () + ) + LogicalToInteger + (Integer 4) + () + ) + Mul + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (Integer 4) + () + ) + (Integer 4) + () + ) + (Var 217 M2)) + [(DoLoop + () + ((Var 217 i) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 M2) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(DoLoop + () + ((Var 217 ww) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 217 Cnl) + [(() + (IntegerBinOp + (Var 217 i) + Add + (Var 217 ii) + (Integer 4) + () + ) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (Cast + (IntegerConstant 0 (Integer 4)) + IntegerToInteger + (Integer 2) + (IntegerConstant 0 (Integer 2)) + ) + () + )] + )] + ) + (DoLoop + () + ((Var 217 k) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 m) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 2 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(DoLoop + () + ((Var 217 i) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 M2) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(DoLoop + () + ((Var 217 ww) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 217 Cnl) + [(() + (IntegerBinOp + (Var 217 i) + Add + (Var 217 ii) + (Integer 4) + () + ) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (IntegerBinOp + (ArrayItem + (Var 217 Cnl) + [(() + (IntegerBinOp + (Var 217 i) + Add + (Var 217 ii) + (Integer 4) + () + ) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + Add + (IntegerBinOp + (ArrayItem + (Var 217 Bml) + [(() + (Var 217 k) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + Mul + (ArrayItem + (Var 217 Anm) + [(() + (IntegerBinOp + (Var 217 i) + Add + (Var 217 ii) + (Integer 4) + () + ) + ()) + (() + (Var 217 k) + ())] + (Integer 2) + RowMajor + () + ) + (Integer 2) + () + ) + (Integer 2) + () + ) + () + )] + )] + )] + )] + )] + )] + [(DoLoop + () + ((Var 217 jj) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Add + (IntegerBinOp + (IntegerBinOp + (Cast + (IntegerCompare + (Var 217 VR_SIZE) + GtE + (IntegerConstant 0 (Integer 4)) + (Logical 4) + () + ) + LogicalToInteger + (Integer 4) + () + ) + Mul + (IntegerConstant -1 (Integer 4)) + (Integer 4) + () + ) + Add + (IntegerBinOp + (Cast + (IntegerCompare + (Var 217 VR_SIZE) + Lt + (IntegerConstant 0 (Integer 4)) + (Logical 4) + () + ) + LogicalToInteger + (Integer 4) + () + ) + Mul + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (Integer 4) + () + ) + (Integer 4) + () + ) + (Var 217 VR_SIZE)) + [(DoLoop + () + ((Var 217 ii) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 n) + Add + (IntegerBinOp + (IntegerBinOp + (Cast + (IntegerCompare + (Var 217 M2) + GtE + (IntegerConstant 0 (Integer 4)) + (Logical 4) + () + ) + LogicalToInteger + (Integer 4) + () + ) + Mul + (IntegerConstant -1 (Integer 4)) + (Integer 4) + () + ) + Add + (IntegerBinOp + (Cast + (IntegerCompare + (Var 217 M2) + Lt + (IntegerConstant 0 (Integer 4)) + (Logical 4) + () + ) + LogicalToInteger + (Integer 4) + () + ) + Mul + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (Integer 4) + () + ) + (Integer 4) + () + ) + (Var 217 M2)) + [(DoLoop + () + ((Var 217 i) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 M2) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(DoLoop + () + ((Var 217 ww) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 217 Cnl) + [(() + (IntegerBinOp + (Var 217 i) + Add + (Var 217 ii) + (Integer 4) + () + ) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (Cast + (IntegerConstant 0 (Integer 4)) + IntegerToInteger + (Integer 2) + (IntegerConstant 0 (Integer 2)) + ) + () + )] + )] + ) + (DoLoop + () + ((Var 217 k) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 m) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 2 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(DoLoop + () + ((Var 217 ww) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 217 B1l) + [(() + (IntegerConstant 0 (Integer 4)) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Bml) + [(() + (Var 217 k) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + () + )] + ) + (DoLoop + () + ((Var 217 i) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 M2) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(= + (Var 217 A_ik) + (ArrayItem + (Var 217 Anm) + [(() + (IntegerBinOp + (Var 217 i) + Add + (Var 217 ii) + (Integer 4) + () + ) + ()) + (() + (Var 217 k) + ())] + (Integer 2) + RowMajor + () + ) + () + ) + (DoLoop + () + ((Var 217 ww) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 217 T1l) + [(() + (IntegerConstant 0 (Integer 4)) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (Var 217 A_ik) + () + )] + ) + (DoLoop + () + ((Var 217 ww) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 217 T1l) + [(() + (IntegerConstant 0 (Integer 4)) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (IntegerBinOp + (ArrayItem + (Var 217 T1l) + [(() + (IntegerConstant 0 (Integer 4)) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + Mul + (ArrayItem + (Var 217 B1l) + [(() + (IntegerConstant 0 (Integer 4)) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (Integer 2) + () + ) + () + )] + ) + (DoLoop + () + ((Var 217 ww) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 217 Cnl) + [(() + (IntegerBinOp + (Var 217 i) + Add + (Var 217 ii) + (Integer 4) + () + ) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (IntegerBinOp + (ArrayItem + (Var 217 Cnl) + [(() + (IntegerBinOp + (Var 217 i) + Add + (Var 217 ii) + (Integer 4) + () + ) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + Add + (ArrayItem + (Var 217 T1l) + [(() + (IntegerConstant 0 (Integer 4)) + ()) + (() + (Var 217 ww) + ())] + (Integer 2) + RowMajor + () + ) + (Integer 2) + () + ) + () + )] + )] + )] + )] + )] + )] + ) + (Print + [(StringConstant + "Expect:" + (Character 1 7 ()) + )] + () + () + ) + (Print + [(StringConstant + "[[ 5 8 11 ... 20 23 26]," + (Character 1 25 ()) + )] + () + () + ) + (Print + [(StringConstant + " [ 8 14 20 ... 38 44 50]," + (Character 1 25 ()) + )] + () + () + ) + (Print + [(StringConstant + " [11 20 29 ... 56 65 74], ..." + (Character 1 29 ()) + )] + () + () + ) + (Print + [(StringConstant + "" + (Character 1 0 ()) + )] + () + () + ) + (Print + [(StringConstant + " [ 8 14 20 ... 38 44 50]," + (Character 1 25 ()) + )] + () + () + ) + (Print + [(StringConstant + " [11 20 29 ... 56 65 74]," + (Character 1 25 ()) + )] + () + () + ) + (Print + [(StringConstant + " [14 26 38 ... 74 86 98]]" + (Character 1 25 ()) + )] + () + () + ) + (Print + [(StringConstant + "" + (Character 1 0 ()) + )] + () + () + ) + (Print + [(StringConstant + "Actual:" + (Character 1 7 ()) + )] + () + () + ) + (DoLoop + () + ((Var 217 ww) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (IntegerConstant 3 (Integer 4)) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 2 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(Print + [(ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerConstant 0 (Integer 4)) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerConstant 1 (Integer 4)) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerConstant 2 (Integer 4)) + ())] + (Integer 2) + RowMajor + () + ) + (StringConstant + "..." + (Character 1 3 ()) + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 3 (Integer 4)) + (Integer 4) + (IntegerConstant 32765 (Integer 4)) + ) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 2 (Integer 4)) + (Integer 4) + (IntegerConstant 32766 (Integer 4)) + ) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + ())] + (Integer 2) + RowMajor + () + )] + () + () + )] + ) + (Print + [(StringConstant + "..." + (Character 1 3 ()) + )] + () + () + ) + (DoLoop + () + ((Var 217 ww) + (IntegerBinOp + (Var 217 n) + Sub + (IntegerConstant 3 (Integer 4)) + (Integer 4) + (IntegerConstant 12 (Integer 4)) + ) + (IntegerBinOp + (Var 217 n) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 14 (Integer 4)) + ) + (IntegerConstant 1 (Integer 4))) + [(Print + [(ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerConstant 0 (Integer 4)) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerConstant 1 (Integer 4)) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerConstant 2 (Integer 4)) + ())] + (Integer 2) + RowMajor + () + ) + (StringConstant + "..." + (Character 1 3 ()) + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 3 (Integer 4)) + (Integer 4) + (IntegerConstant 32765 (Integer 4)) + ) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 2 (Integer 4)) + (Integer 4) + (IntegerConstant 32766 (Integer 4)) + ) + ())] + (Integer 2) + RowMajor + () + ) + (ArrayItem + (Var 217 Cnl) + [(() + (Var 217 ww) + ()) + (() + (IntegerBinOp + (Var 217 l) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + (IntegerConstant 32767 (Integer 4)) + ) + ())] + (Integer 2) + RowMajor + () + )] + () + () + )] + ) + (= + (Var 217 _lpython_return_variable) + (IntegerConstant 0 (Integer 4)) + () + ) + (Return)] + (Var 217 _lpython_return_variable) + Public + .false. + .false. + () + ), + rows: + (Variable + 2 + rows + [] + Local + () + () + Default + (TypeParameter + rows + ) + () + Source + Public + Required + .false. + ), + spot_print: + (Function + (SymbolTable + 215 + { + a: + (Variable + 215 + a + [] + InOut + () + () + Default + (Array + (Integer 2) + [(() + ())] + DescriptorArray + ) + () + Source + Public + Required + .false. + ) + }) + spot_print + (FunctionType + [(Array + (Integer 2) + [(() + ())] + DescriptorArray + )] + () + Source + Implementation + () + .false. + .false. + .false. + .false. + .false. + [] + .false. + ) + [] + [(Var 215 a)] + [(Return)] + () + Public + .false. + .false. + () + ), + zero_c_fortran_array: + (Function + (SymbolTable + 213 + { + B: + (Variable + 213 + B + [] + Local + () + () + Default + (Pointer + (Array + (Integer 2) + [(() + ())] + DescriptorArray + ) + ) + () + Source + Public + Required + .false. + ), + b: + (Variable + 213 + b + [] + In + () + () + Default + (CPtr) + () + Source + Public + Required + .false. + ), + cols: + (Variable + 213 + cols + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + i: + (Variable + 213 + i + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + j: + (Variable + 213 + j + [] + Local + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ), + rows: + (Variable + 213 + rows + [] + In + () + () + Default + (Integer 4) + () + Source + Public + Required + .false. + ) + }) + zero_c_fortran_array + (FunctionType + [(CPtr) + (Integer 4) + (Integer 4)] + () + Source + Implementation + () + .false. + .false. + .false. + .false. + .false. + [] + .false. + ) + [] + [(Var 213 b) + (Var 213 rows) + (Var 213 cols)] + [(CPtrToPointer + (Var 213 b) + (Var 213 B) + (ArrayConstant + [(IntegerBinOp + (Var 213 rows) + Mul + (Var 213 cols) + (Integer 4) + () + )] + (Array + (Integer 4) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4)))] + PointerToDataArray + ) + RowMajor + ) + (ArrayConstant + [(IntegerConstant 0 (Integer 4))] + (Array + (Integer 4) + [((IntegerConstant 0 (Integer 4)) + (IntegerConstant 1 (Integer 4)))] + PointerToDataArray + ) + RowMajor + ) + ) + (DoLoop + () + ((Var 213 i) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 213 rows) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(DoLoop + () + ((Var 213 j) + (IntegerConstant 0 (Integer 4)) + (IntegerBinOp + (Var 213 cols) + Sub + (IntegerConstant 1 (Integer 4)) + (Integer 4) + () + ) + (IntegerConstant 1 (Integer 4))) + [(= + (ArrayItem + (Var 213 B) + [(() + (IntegerBinOp + (IntegerBinOp + (Var 213 i) + Mul + (Var 213 cols) + (Integer 4) + () + ) + Add + (Var 213 j) + (Integer 4) + () + ) + ())] + (Integer 2) + RowMajor + () + ) + (Cast + (IntegerConstant 0 (Integer 4)) + IntegerToInteger + (Integer 2) + (IntegerConstant 0 (Integer 2)) + ) + () + )] + )] + )] + () + Public + .false. + .false. + () + ) + }) + __main__ + [lpython_builtin + numpy] + .false. + .false. + ), + lpython_builtin: + (IntrinsicModule lpython_builtin), + main_program: + (Program + (SymbolTable + 245 + { + __main__global_stmts: + (ExternalSymbol + 245 + __main__global_stmts + 2 __main__global_stmts + __main__ + [] + __main__global_stmts + Public + ) + }) + main_program + [__main__] + [(SubroutineCall + 245 __main__global_stmts + 2 __main__global_stmts + [] + () + )] + ), + numpy: + (Module numpy) + }) + [] +) diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index aa9abbfde4..c88759bc97 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -72,7 +72,7 @@ def clear_row(a: i16[:], row: i32, cols: i32) -> None: a[row, j] = i16(0) -def main() -> i32: +def main(optimize: bool = False) -> i32: # "Const" lets these appear in type declarations such as i16[n, m] n : Const[i32] = 15 @@ -111,38 +111,56 @@ def main() -> i32: ii: i32 i: i32 ww: i32 # "ww" is short for "workaround_index." - for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C - for ii in range(0, n, M2): # each M2 block in A cols and B rows - for i in range(0, M2): # zero-out rows of C - # Due to Issue 2496, I cannot pass an array to a function - # clear_row(Cnl, i + ii, l) - # Due to Issue 2500, I cannot broadcast a constant - # Cnl[i + ii, :] = 0 - for ww in range(0, l): - Cnl[i + ii, ww] = i16(0) - pass - for k in range(0, m): # rows of B - # Issues 2496 and 2500 prevent the desirable form and workaround - # B_1l[0, :] = B_ml[k, :] - for ww in range(0, l): - B1l[0, ww] = Bml[k, ww] - for i in range(0, M2): - A_ik = Anm[i + ii, k] - # broadcast a single element of A (why? might have a SIMD vector register for T1l) - # T1l[0, :] = A_ik + print("optimized by hand ? ", optimize, "\n") + if optimize: + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # zero-out rows of C + # Due to Issue 2496, I cannot pass an array to a function + # clear_row(Cnl, i + ii, l) + # Due to Issue 2500, I cannot broadcast a constant + # Cnl[i + ii, :] = 0 for ww in range(0, l): - T1l[0, ww] = A_ik - # pointwise (Hadamard) product: - # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) + Cnl[i + ii, ww] = i16(0) + for k in range(0, m): # rows of B + for i in range(0, M2): + for ww in range(0, l): + # optimization without the temporaries + ######## Cnl[i + ii, :] += Bml[k, :] * A_ik + Cnl[i + ii, ww] += Bml[k, ww] * Anm[i + ii, k] + else: + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # zero-out rows of C + # Due to Issue 2496, I cannot pass an array to a function + # clear_row(Cnl, i + ii, l) + # Due to Issue 2500, I cannot broadcast a constant + # Cnl[i + ii, :] = 0 for ww in range(0, l): - T1l[0, ww] *= B1l[0, ww] - # Accumulated outer product: - # Cnl[i + ii, :] += T1l[0, :] - for ww in range(0, l): - Cnl[i + ii, ww] += T1l[0, ww] - # optimization without the temporaries - ######## Cnl[i + ii, :] += Bml[k, :] * A_ik + Cnl[i + ii, ww] = i16(0) pass + for k in range(0, m): # rows of B + # Issues 2496 and 2500 prevent the desirable form and workaround + # B_1l[0, :] = B_ml[k, :] + for ww in range(0, l): + B1l[0, ww] = Bml[k, ww] + for i in range(0, M2): + A_ik = Anm[i + ii, k] + # broadcast a single element of A (why? might have a SIMD vector register for T1l) + # T1l[0, :] = A_ik + for ww in range(0, l): + T1l[0, ww] = A_ik + # pointwise (Hadamard) product: + # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) + for ww in range(0, l): + T1l[0, ww] *= B1l[0, ww] + # Accumulated outer product: + # Cnl[i + ii, :] += T1l[0, :] + for ww in range(0, l): + Cnl[i + ii, ww] += T1l[0, ww] + # optimization without the temporaries + ######## Cnl[i + ii, :] += Bml[k, :] * A_ik + pass print("Expect:") print("[[ 5 8 11 ... 20 23 26],") @@ -167,4 +185,4 @@ def main() -> i32: if __name__ == "__main__": - main() + main(optimize=True) From 995ca7c84274a643a328c021b78d5ccc7f3b552f Mon Sep 17 00:00:00 2001 From: rebcabin Date: Thu, 8 Feb 2024 12:22:23 -0800 Subject: [PATCH 09/16] add naive --- integration_tests/matmul_integration.py | 77 ++++++++++++++++--------- 1 file changed, 51 insertions(+), 26 deletions(-) diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index c88759bc97..ca9360722e 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -72,6 +72,18 @@ def clear_row(a: i16[:], row: i32, cols: i32) -> None: a[row, j] = i16(0) +def print_expected(): + print("Expected result:") + print("[[ 5 8 11 ... 20 23 26],") + print(" [ 8 14 20 ... 38 44 50],") + print(" [11 20 29 ... 56 65 74], ...") + print("") + print(" [ 8 14 20 ... 38 44 50],") + print(" [11 20 29 ... 56 65 74],") + print(" [14 26 38 ... 74 86 98]]") + print("") + + def main(optimize: bool = False) -> i32: # "Const" lets these appear in type declarations such as i16[n, m] @@ -111,36 +123,37 @@ def main(optimize: bool = False) -> i32: ii: i32 i: i32 ww: i32 # "ww" is short for "workaround_index." - print("optimized by hand ? ", optimize, "\n") + + print("hand-blocked accumulated outer product; block size = M2 =", M2) if optimize: + print("optimized by hand to remove temporaries") for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C for ii in range(0, n, M2): # each M2 block in A cols and B rows - for i in range(0, M2): # zero-out rows of C - # Due to Issue 2496, I cannot pass an array to a function + for i in range(0, M2): # Zero-out rows of C. + # Due to Issue 2496, I cannot pass an array to a function. # clear_row(Cnl, i + ii, l) - # Due to Issue 2500, I cannot broadcast a constant + # Due to Issue 2500, I cannot broadcast a constant. # Cnl[i + ii, :] = 0 for ww in range(0, l): Cnl[i + ii, ww] = i16(0) for k in range(0, m): # rows of B for i in range(0, M2): for ww in range(0, l): - # optimization without the temporaries - ######## Cnl[i + ii, :] += Bml[k, :] * A_ik Cnl[i + ii, ww] += Bml[k, ww] * Anm[i + ii, k] else: + print("liberal usage of temporaries") for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C for ii in range(0, n, M2): # each M2 block in A cols and B rows - for i in range(0, M2): # zero-out rows of C - # Due to Issue 2496, I cannot pass an array to a function + for i in range(0, M2): # Zero-out rows of C. + # Due to Issue 2496, I cannot pass an array to a function. # clear_row(Cnl, i + ii, l) - # Due to Issue 2500, I cannot broadcast a constant + # Due to Issue 2500, I cannot broadcast a constant. # Cnl[i + ii, :] = 0 for ww in range(0, l): Cnl[i + ii, ww] = i16(0) pass for k in range(0, m): # rows of B - # Issues 2496 and 2500 prevent the desirable form and workaround + # Issues 2496 and 2500 prevent the desirable form and workaround. # B_1l[0, :] = B_ml[k, :] for ww in range(0, l): B1l[0, ww] = Bml[k, ww] @@ -154,33 +167,45 @@ def main(optimize: bool = False) -> i32: # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) for ww in range(0, l): T1l[0, ww] *= B1l[0, ww] - # Accumulated outer product: + # Accumulated Outer Product: # Cnl[i + ii, :] += T1l[0, :] for ww in range(0, l): Cnl[i + ii, ww] += T1l[0, ww] - # optimization without the temporaries - ######## Cnl[i + ii, :] += Bml[k, :] * A_ik pass - print("Expect:") - print("[[ 5 8 11 ... 20 23 26],") - print(" [ 8 14 20 ... 38 44 50],") - print(" [11 20 29 ... 56 65 74], ...") - print("") - print(" [ 8 14 20 ... 38 44 50],") - print(" [11 20 29 ... 56 65 74],") - print(" [14 26 38 ... 74 86 98]]") + print_expected() + + print("Actual result:") + # Due to Issue 2496, I cannot pass an array to a function; just inline + # the code for 'spot-print'. + for ww in range(0, 3): + print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) + print("...") + for ww in range(n-3, n): + print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) + print("") - print("Actual:") + print("unblocked, naive Accumulated Outer Product for reference") + for i in range(0, n): + # Due to Issue 2496, I cannot pass an array to a function. + # clear_row(Cnl, i + ii, l) + # Due to Issue 2500, I cannot broadcast a constant. + # Cnl[i + ii, :] = 0 + for ww in range(0, l): + Cnl[i, ww] = i16(0) + for k in range(0, m): # rows of B + for ww in range(0, l): + Cnl[i, ww] += Bml[k, ww] * Anm[i, k] + + print("Actual result:") + # Due to Issue 2496, I cannot pass an array to a function; just inline + # the code for 'spot-print'. for ww in range(0, 3): print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) print("...") for ww in range(n-3, n): print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) - # print(Cnl) - # for ww in range(0, l): - # T1l[0, ww] = Cnl[0, ww] - # print(T1l) + return 0 From e18ecc8d204477381fefc55cfd1c226874c18b20 Mon Sep 17 00:00:00 2001 From: rebcabin Date: Thu, 8 Feb 2024 19:03:30 -0800 Subject: [PATCH 10/16] ISSUES/Issue2521.py --- ISSUES/Issue2521.py | 31 +++++++++++++++++++++++++ integration_tests/matmul_integration.py | 20 ++++++++++------ 2 files changed, 44 insertions(+), 7 deletions(-) create mode 100644 ISSUES/Issue2521.py diff --git a/ISSUES/Issue2521.py b/ISSUES/Issue2521.py new file mode 100644 index 0000000000..ee9ce4429d --- /dev/null +++ b/ISSUES/Issue2521.py @@ -0,0 +1,31 @@ +import numpy +from numpy import empty, int16 +from lpython import (i16, i32, Const) + + +# ~~~~~~~~~~~~~~~~ ATTENTION ~~~~~~~~~~~~~~~~~~~~ +# | +# v +def spot_print(a: i16[:], n: i32, l: i32) -> None: + """Issue2497, Issue2521""" + ww: i32 + for ww in range(0, 3): + print(a[ww, 0], a[ww, 1], a[ww, 2], "...", a[ww, l - 3], a[ww, l - 2], a[ww, l - 1]) + print("...") + for ww in range(n-3, n): + print(a[ww, 0], a[ww, 1], a[ww, 2], "...", a[ww, l - 3], a[ww, l - 2], [ww, l - 1]) + + +def main() -> i32: + + # "Const" lets these appear in type declarations such as i16[n, m] + n : Const[i32] = 15 + l : Const[i32] = 32_768 + + Cnl: i16[n, l] = empty((n, l), dtype=int16) + + spot_print(Cnl, i32(n), i32(l)) + + +if __name__ == "__main__": + main() diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index ca9360722e..b3a3ab0356 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -61,9 +61,14 @@ def load_lpython_array_from_c_fortran_array(b: CPtr, rows: i32, cols: i32) -> i1 return D -def spot_print(a: i16[:]) -> None: +def spot_print(a: i16[:, :], n: i32, l: i32) -> None: """Issue2497""" - return + ww: i32 + for ww in range(0, 3): + print(a[ww, 0], a[ww, 1], a[ww, 2], "...", a[ww, l - 3], a[ww, l - 2], a[ww, l - 1]) + print("...") + for ww in range(n-3, n): + print(a[ww, 0], a[ww, 1], a[ww, 2], "...", a[ww, l - 3], a[ww, l - 2], [ww, l - 1]) def clear_row(a: i16[:], row: i32, cols: i32) -> None: @@ -176,13 +181,14 @@ def main(optimize: bool = False) -> i32: print_expected() print("Actual result:") + spot_print(Cnl, i32(n), i32(l)) # Due to Issue 2496, I cannot pass an array to a function; just inline # the code for 'spot-print'. - for ww in range(0, 3): - print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) - print("...") - for ww in range(n-3, n): - print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) + # for ww in range(0, 3): + # print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) + # print("...") + # for ww in range(n-3, n): + # print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) print("") print("unblocked, naive Accumulated Outer Product for reference") From 060dbf190b53e5645f3f70ef749d3995383ecb2d Mon Sep 17 00:00:00 2001 From: rebcabin Date: Thu, 8 Feb 2024 19:59:27 -0800 Subject: [PATCH 11/16] factor row operations --- ISSUES/Issue2521.py | 31 ------ integration_tests/matmul_integration.py | 140 ++++++++++++------------ 2 files changed, 72 insertions(+), 99 deletions(-) delete mode 100644 ISSUES/Issue2521.py diff --git a/ISSUES/Issue2521.py b/ISSUES/Issue2521.py deleted file mode 100644 index ee9ce4429d..0000000000 --- a/ISSUES/Issue2521.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy -from numpy import empty, int16 -from lpython import (i16, i32, Const) - - -# ~~~~~~~~~~~~~~~~ ATTENTION ~~~~~~~~~~~~~~~~~~~~ -# | -# v -def spot_print(a: i16[:], n: i32, l: i32) -> None: - """Issue2497, Issue2521""" - ww: i32 - for ww in range(0, 3): - print(a[ww, 0], a[ww, 1], a[ww, 2], "...", a[ww, l - 3], a[ww, l - 2], a[ww, l - 1]) - print("...") - for ww in range(n-3, n): - print(a[ww, 0], a[ww, 1], a[ww, 2], "...", a[ww, l - 3], a[ww, l - 2], [ww, l - 1]) - - -def main() -> i32: - - # "Const" lets these appear in type declarations such as i16[n, m] - n : Const[i32] = 15 - l : Const[i32] = 32_768 - - Cnl: i16[n, l] = empty((n, l), dtype=int16) - - spot_print(Cnl, i32(n), i32(l)) - - -if __name__ == "__main__": - main() diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index b3a3ab0356..4470cdc602 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -3,12 +3,9 @@ from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64, ccall, sizeof, Array, Allocatable, TypeVar, Const) -######## ALL THE LINES WITH EIGHT COMMENT MARKS ARE THE ONES WE NEED TO -######## BRING UP! AS IT STANDS, THIS CODE WORKS IN LPYTHON MAIN AS OF 4 -######## FEBRUARY 2024. # https://numpy.org/devdocs/reference/typing.html -######## from numpy.typing import NDArray +# from numpy.typing import NDArray # plan for 30 Jan 2024 -- @@ -61,22 +58,67 @@ def load_lpython_array_from_c_fortran_array(b: CPtr, rows: i32, cols: i32) -> i1 return D -def spot_print(a: i16[:, :], n: i32, l: i32) -> None: - """Issue2497""" - ww: i32 - for ww in range(0, 3): - print(a[ww, 0], a[ww, 1], a[ww, 2], "...", a[ww, l - 3], a[ww, l - 2], a[ww, l - 1]) +def spot_print(Anl: i16[:, :], n: i32, l: i32) -> None: + j: i32 + for j in range(0, 3): + spot_print_row(Anl, l, j) print("...") - for ww in range(n-3, n): - print(a[ww, 0], a[ww, 1], a[ww, 2], "...", a[ww, l - 3], a[ww, l - 2], [ww, l - 1]) + for j in range(n - 3, n): + spot_print_row(Anl, l, j) + + +def spot_print_row(Anl: i16[:, :], cols: i32, row: i32): + if (cols > 3): + print(Anl[row, 0], Anl[row, 1], Anl[row, 2], "...", + Anl[row, cols - 3], Anl[row, cols - 2], Anl[row, cols - 1]) + else: + print(Anl[row, 0], Anl[row, 1], Anl[row, 2]) -def clear_row(a: i16[:], row: i32, cols: i32) -> None: +def clear_row(a: i16[:, :], row: i32, cols: i32) -> None: + # Due to Issue 2500, I cannot broadcast a constant. j: i32 for j in range(cols): a[row, j] = i16(0) +def broadcast_i16_row( + a: i16[:, :], row: i32, val: i16, + cols: i32) -> None: + # Due to Issue 2500, I cannot broadcast a constant. + j: i32 + for j in range(cols): + a[row, j] = i16(val) + + +def broadcast_copy_row( + dest: i16[:, :], dest_row: i32, + src: i16[:, :], src_row: i32, + cols: i32) -> None: + # Due to Issue 2500, I cannot broadcast. + j: i32 + for j in range(cols): + dest[dest_row, j] = src[src_row, j] + + +def hadamard_product_in_place_row( + dest: i16[:, :], dest_row: i32, + src: i16[:, :], src_row: i32, + cols: i32) -> None: + j: i32 + for j in range(cols): + dest[dest_row, j] *= src[src_row, j] + + +def accumulate_in_place_row( + dest: i16[:, :], dest_row: i32, + src: i16[:, :], src_row: i32, + cols: i32) -> None: + j: i32 + for j in range(cols): + dest[dest_row, j] += src[src_row, j] + + def print_expected(): print("Expected result:") print("[[ 5 8 11 ... 20 23 26],") @@ -110,12 +152,14 @@ def main(optimize: bool = False) -> i32: print (Anm_l4) Anm: i16[n, m] = load_lpython_array_from_c_fortran_array(Anm_l4, n, m) - # Issue 2497 spot_print(Anm) - print(Anm) + print("Anm[", n, ",", m, "]") + spot_print(Anm, n, m) Bml: i16[m, l] = load_lpython_array_from_c_fortran_array(Bml_l4, m, l) - # print(Bml) + print("Bml[", m, ",", l, "]") + spot_print(Bml, m, l) Cnl: i16[n, l] = load_lpython_array_from_c_fortran_array(Cnl_l4, n, l) - # print(Cnl) + print("Cnl[", n, ",", l, "]") + spot_print(Cnl, n, l) # Temporaries (TODO: get rid of them, as indicated by proposed syntax below) B1l: i16[1, l] = empty((1, l), dtype=int16) @@ -123,7 +167,6 @@ def main(optimize: bool = False) -> i32: VR_SIZE: i32 = 32_768 k: i32 - A_ik: i16 jj: i32 ii: i32 i: i32 @@ -135,12 +178,7 @@ def main(optimize: bool = False) -> i32: for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C for ii in range(0, n, M2): # each M2 block in A cols and B rows for i in range(0, M2): # Zero-out rows of C. - # Due to Issue 2496, I cannot pass an array to a function. - # clear_row(Cnl, i + ii, l) - # Due to Issue 2500, I cannot broadcast a constant. - # Cnl[i + ii, :] = 0 - for ww in range(0, l): - Cnl[i + ii, ww] = i16(0) + clear_row(Cnl, i + ii, l) for k in range(0, m): # rows of B for i in range(0, M2): for ww in range(0, l): @@ -150,70 +188,36 @@ def main(optimize: bool = False) -> i32: for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C for ii in range(0, n, M2): # each M2 block in A cols and B rows for i in range(0, M2): # Zero-out rows of C. - # Due to Issue 2496, I cannot pass an array to a function. - # clear_row(Cnl, i + ii, l) - # Due to Issue 2500, I cannot broadcast a constant. - # Cnl[i + ii, :] = 0 - for ww in range(0, l): - Cnl[i + ii, ww] = i16(0) - pass + clear_row(Cnl, i + ii, l) for k in range(0, m): # rows of B - # Issues 2496 and 2500 prevent the desirable form and workaround. # B_1l[0, :] = B_ml[k, :] - for ww in range(0, l): - B1l[0, ww] = Bml[k, ww] + broadcast_copy_row(B1l, 0, Bml, k, l) for i in range(0, M2): - A_ik = Anm[i + ii, k] - # broadcast a single element of A (why? might have a SIMD vector register for T1l) - # T1l[0, :] = A_ik - for ww in range(0, l): - T1l[0, ww] = A_ik - # pointwise (Hadamard) product: + # T1l[0, :] = Anm[i + ii, k] + broadcast_i16_row(T1l, 0, Anm[i + ii, k], l) # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) - for ww in range(0, l): - T1l[0, ww] *= B1l[0, ww] - # Accumulated Outer Product: + hadamard_product_in_place_row(T1l, 0, B1l, 0, l) # Cnl[i + ii, :] += T1l[0, :] - for ww in range(0, l): - Cnl[i + ii, ww] += T1l[0, ww] - pass - - print_expected() + accumulate_in_place_row(Cnl, i + ii, T1l, 0, l) print("Actual result:") - spot_print(Cnl, i32(n), i32(l)) - # Due to Issue 2496, I cannot pass an array to a function; just inline - # the code for 'spot-print'. - # for ww in range(0, 3): - # print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) - # print("...") - # for ww in range(n-3, n): - # print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) + spot_print(Cnl, n, l) print("") print("unblocked, naive Accumulated Outer Product for reference") for i in range(0, n): - # Due to Issue 2496, I cannot pass an array to a function. - # clear_row(Cnl, i + ii, l) - # Due to Issue 2500, I cannot broadcast a constant. - # Cnl[i + ii, :] = 0 - for ww in range(0, l): - Cnl[i, ww] = i16(0) + clear_row(Cnl, i, l) for k in range(0, m): # rows of B for ww in range(0, l): Cnl[i, ww] += Bml[k, ww] * Anm[i, k] print("Actual result:") - # Due to Issue 2496, I cannot pass an array to a function; just inline - # the code for 'spot-print'. - for ww in range(0, 3): - print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) - print("...") - for ww in range(n-3, n): - print(Cnl[ww, 0], Cnl[ww, 1], Cnl[ww, 2], "...", Cnl[ww, l - 3], Cnl[ww, l - 2], Cnl[ww, l - 1]) + spot_print(Cnl, n, l) return 0 if __name__ == "__main__": + print_expected() + main(optimize=False) main(optimize=True) From 5f5acbd2230509f377c41d331d08715092b7d8da Mon Sep 17 00:00:00 2001 From: rebcabin Date: Thu, 8 Feb 2024 20:16:49 -0800 Subject: [PATCH 12/16] refactored out workarounds --- integration_tests/matmul_integration.py | 123 +++++++++++++++--------- 1 file changed, 75 insertions(+), 48 deletions(-) diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index 4470cdc602..ebd4ec8843 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -120,7 +120,7 @@ def accumulate_in_place_row( def print_expected(): - print("Expected result:") + print("\nExpected result:") print("[[ 5 8 11 ... 20 23 26],") print(" [ 8 14 20 ... 38 44 50],") print(" [11 20 29 ... 56 65 74], ...") @@ -131,14 +131,14 @@ def print_expected(): print("") -def main(optimize: bool = False) -> i32: +def main() -> i32: # "Const" lets these appear in type declarations such as i16[n, m] n : Const[i32] = 15 m : Const[i32] = 3 l : Const[i32] = 32_768 - M1 : i32 = 1 + # M1 : i32 = 1 # Unused M2 : i32 = 5 # Issue 2499 -- can't be Const Anm_l4 : CPtr = _lfortran_malloc((n * m) * i32(sizeof(i16))) @@ -149,7 +149,7 @@ def main(optimize: bool = False) -> i32: init_c_fortran_array(Bml_l4, m, l, 13) zero_c_fortran_array(Cnl_l4, n, l) - print (Anm_l4) + print("\nInputs:") Anm: i16[n, m] = load_lpython_array_from_c_fortran_array(Anm_l4, n, m) print("Anm[", n, ",", m, "]") @@ -161,63 +161,90 @@ def main(optimize: bool = False) -> i32: print("Cnl[", n, ",", l, "]") spot_print(Cnl, n, l) - # Temporaries (TODO: get rid of them, as indicated by proposed syntax below) - B1l: i16[1, l] = empty((1, l), dtype=int16) - T1l: i16[1, l] = empty((1, l), dtype=int16) + print_expected() VR_SIZE: i32 = 32_768 - k: i32 - jj: i32 - ii: i32 - i: i32 ww: i32 # "ww" is short for "workaround_index." - print("hand-blocked accumulated outer product; block size = M2 =", M2) - if optimize: - print("optimized by hand to remove temporaries") - for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C - for ii in range(0, n, M2): # each M2 block in A cols and B rows - for i in range(0, M2): # Zero-out rows of C. - clear_row(Cnl, i + ii, l) - for k in range(0, m): # rows of B - for i in range(0, M2): - for ww in range(0, l): - Cnl[i + ii, ww] += Bml[k, ww] * Anm[i + ii, k] - else: - print("liberal usage of temporaries") - for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C - for ii in range(0, n, M2): # each M2 block in A cols and B rows - for i in range(0, M2): # Zero-out rows of C. - clear_row(Cnl, i + ii, l) - for k in range(0, m): # rows of B - # B_1l[0, :] = B_ml[k, :] - broadcast_copy_row(B1l, 0, Bml, k, l) - for i in range(0, M2): - # T1l[0, :] = Anm[i + ii, k] - broadcast_i16_row(T1l, 0, Anm[i + ii, k], l) - # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) - hadamard_product_in_place_row(T1l, 0, B1l, 0, l) - # Cnl[i + ii, :] += T1l[0, :] - accumulate_in_place_row(Cnl, i + ii, T1l, 0, l) - - print("Actual result:") + print("\nhand-blocked accumulated outer product; block size = M2 =", M2) + hand_optimized_to_remove_temporaries(Anm, Bml, Cnl, n, m, l, VR_SIZE, M2) + + print("\nActual result:") spot_print(Cnl, n, l) - print("") - print("unblocked, naive Accumulated Outer Product for reference") + with_liberal_use_of_temporaries(Anm, Bml, Cnl, n, m, l, VR_SIZE, M2) + + print("\nActual result:") + spot_print(Cnl, n, l) + + unblocked_accumulated_outer_product(Anm, Bml, Cnl, n, m, l) + + print("\nActual result:") + spot_print(Cnl, n, l) + + return 0 + + +def unblocked_accumulated_outer_product( + Anm: i16[:, :], Bml: i16[:, :], Cnl: i16[:, :], + n: i32, m: i32, l: i32): + print("\nunblocked, naive Accumulated Outer Product for reference") + i: i32 + k: i32 + ww: i32 for i in range(0, n): clear_row(Cnl, i, l) for k in range(0, m): # rows of B for ww in range(0, l): Cnl[i, ww] += Bml[k, ww] * Anm[i, k] - print("Actual result:") - spot_print(Cnl, n, l) - return 0 +def with_liberal_use_of_temporaries( + Anm: i16[:, :], Bml: i16[:, :], Cnl: i16[:, :], + n: i32, m: i32, l: i32, VR_SIZE: i32, M2: i32): + k: i32 + jj: i32 + ii: i32 + i: i32 + ww: i32 + print("\nliberal usage of temporaries") + # Temporaries (TODO: get rid of them, as indicated by proposed syntax below) + B1l: i16[1, l] = empty((1, l), dtype=int16) + T1l: i16[1, l] = empty((1, l), dtype=int16) + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # Zero-out rows of C. + clear_row(Cnl, i + ii, l) + for k in range(0, m): # rows of B + # B_1l[0, :] = B_ml[k, :] + broadcast_copy_row(B1l, 0, Bml, k, l) + for i in range(0, M2): + # T1l[0, :] = Anm[i + ii, k] + broadcast_i16_row(T1l, 0, Anm[i + ii, k], l) + # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) + hadamard_product_in_place_row(T1l, 0, B1l, 0, l) + # Cnl[i + ii, :] += T1l[0, :] + accumulate_in_place_row(Cnl, i + ii, T1l, 0, l) + + +def hand_optimized_to_remove_temporaries( + Anm: i16[:, :], Bml: i16[:, :], Cnl: i16[:, :], + n: i32, m: i32, l: i32, VR_SIZE: i32, M2: i32): + k: i32 + jj: i32 + ii: i32 + i: i32 + ww: i32 + print("\noptimized by hand to remove temporaries") + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # Zero-out rows of C. + clear_row(Cnl, i + ii, l) + for k in range(0, m): # rows of B + for i in range(0, M2): + for ww in range(0, l): + Cnl[i + ii, ww] += Bml[k, ww] * Anm[i + ii, k] if __name__ == "__main__": - print_expected() - main(optimize=False) - main(optimize=True) + main() From 827415562a54349837b190e5c1323db17ca76215 Mon Sep 17 00:00:00 2001 From: rebcabin Date: Thu, 8 Feb 2024 20:28:19 -0800 Subject: [PATCH 13/16] one more row op --- integration_tests/matmul_integration.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index ebd4ec8843..067568de03 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -119,6 +119,17 @@ def accumulate_in_place_row( dest[dest_row, j] += src[src_row, j] +def accumulate_in_place_outer_product_row( + dest: i16[:, :], dest_row: i32, + src1: i16[:, :], src1_row: i32, + src2: i16[:, :], + cols: i32) -> None: + ww: i32 + for ww in range(0, cols): + dest[dest_row, ww] += src1[src1_row, ww] * src2[dest_row, src1_row] + + + def print_expected(): print("\nExpected result:") print("[[ 5 8 11 ... 20 23 26],") @@ -195,8 +206,7 @@ def unblocked_accumulated_outer_product( for i in range(0, n): clear_row(Cnl, i, l) for k in range(0, m): # rows of B - for ww in range(0, l): - Cnl[i, ww] += Bml[k, ww] * Anm[i, k] + accumulate_in_place_outer_product_row(Cnl, i, Bml, k, Anm, l) def with_liberal_use_of_temporaries( @@ -242,8 +252,8 @@ def hand_optimized_to_remove_temporaries( clear_row(Cnl, i + ii, l) for k in range(0, m): # rows of B for i in range(0, M2): - for ww in range(0, l): - Cnl[i + ii, ww] += Bml[k, ww] * Anm[i + ii, k] + accumulate_in_place_outer_product_row( + Cnl, i + ii, Bml, k, Anm, l) if __name__ == "__main__": From d5358567132f24d33c6087d92dccad447dcc314a Mon Sep 17 00:00:00 2001 From: rebcabin Date: Mon, 12 Feb 2024 09:01:17 -0800 Subject: [PATCH 14/16] block & tile (M1 & M2) --- .idea/lpython.iml | 16 ++++- .idea/misc.xml | 8 +++ ISSUES/Issue2509.py | 5 ++ integration_tests/matmul_integration.py | 81 +++++++++++++++++++------ 4 files changed, 91 insertions(+), 19 deletions(-) create mode 100644 ISSUES/Issue2509.py diff --git a/.idea/lpython.iml b/.idea/lpython.iml index f08604bb65..0a962387a1 100644 --- a/.idea/lpython.iml +++ b/.idea/lpython.iml @@ -1,2 +1,16 @@ - \ No newline at end of file + + + + + + + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml index 79b3c94830..7fa6796e56 100644 --- a/.idea/misc.xml +++ b/.idea/misc.xml @@ -1,4 +1,12 @@ + + + + + + + \ No newline at end of file diff --git a/ISSUES/Issue2509.py b/ISSUES/Issue2509.py new file mode 100644 index 0000000000..e379eee4b9 --- /dev/null +++ b/ISSUES/Issue2509.py @@ -0,0 +1,5 @@ + +def main(option: bool = False): + print("option: ", option) + +main() diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index 067568de03..5a56523ea9 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -1,7 +1,9 @@ import numpy -from numpy import array, empty, int16 -from lpython import (i16, i32, ccallback, c_p_pointer, Pointer, u64, CPtr, i64, - ccall, sizeof, Array, Allocatable, TypeVar, Const) +from numpy import array, empty, int16, uint16 +from lpython import (i16, i32, c_p_pointer, Pointer, CPtr, + ccall, sizeof, TypeVar, Const, + # Annotation, SIMD # TODO + ) # https://numpy.org/devdocs/reference/typing.html @@ -23,7 +25,9 @@ def _lfortran_malloc(size : i32) -> CPtr: def init_c_fortran_array(b: CPtr, rows: i32, cols: i32, mod: i32) -> None: - """Initialize a C / Fortran array with test data.""" + """Initialize a C / Fortran array with test data. A C / Fortran + array is, mathematically, a 2D structure. Its 2D indices are + converted inline to a 1D index into the 1D physical array.""" B: Pointer[i16[:]] = c_p_pointer(b, i16[:], array([rows * cols])) i: i32 j: i32 @@ -46,7 +50,8 @@ def zero_c_fortran_array(b: CPtr, rows: i32, cols: i32) -> None: cols = TypeVar("cols") -def load_lpython_array_from_c_fortran_array(b: CPtr, rows: i32, cols: i32) -> i16[rows, cols]: +def load_lpython_array_from_c_fortran_array( + b: CPtr, rows: i32, cols: i32) -> i16[rows, cols]: """Load an LPython array from a C / Fortran array.""" B: Pointer[i16[:]] = c_p_pointer(b, i16[:], array([rows * cols])) D: i16[rows, cols] = empty((rows, cols), dtype=int16) @@ -67,7 +72,7 @@ def spot_print(Anl: i16[:, :], n: i32, l: i32) -> None: spot_print_row(Anl, l, j) -def spot_print_row(Anl: i16[:, :], cols: i32, row: i32): +def spot_print_row(Anl: i16[:, :], cols: i32, row: i32) -> None: if (cols > 3): print(Anl[row, 0], Anl[row, 1], Anl[row, 2], "...", Anl[row, cols - 3], Anl[row, cols - 2], Anl[row, cols - 1]) @@ -129,8 +134,7 @@ def accumulate_in_place_outer_product_row( dest[dest_row, ww] += src1[src1_row, ww] * src2[dest_row, src1_row] - -def print_expected(): +def print_expected() -> None: print("\nExpected result:") print("[[ 5 8 11 ... 20 23 26],") print(" [ 8 14 20 ... 38 44 50],") @@ -149,7 +153,7 @@ def main() -> i32: m : Const[i32] = 3 l : Const[i32] = 32_768 - # M1 : i32 = 1 # Unused + M1 : i32 = 1 M2 : i32 = 5 # Issue 2499 -- can't be Const Anm_l4 : CPtr = _lfortran_malloc((n * m) * i32(sizeof(i16))) @@ -165,9 +169,11 @@ def main() -> i32: Anm: i16[n, m] = load_lpython_array_from_c_fortran_array(Anm_l4, n, m) print("Anm[", n, ",", m, "]") spot_print(Anm, n, m) + Bml: i16[m, l] = load_lpython_array_from_c_fortran_array(Bml_l4, m, l) print("Bml[", m, ",", l, "]") spot_print(Bml, m, l) + Cnl: i16[n, l] = load_lpython_array_from_c_fortran_array(Cnl_l4, n, l) print("Cnl[", n, ",", l, "]") spot_print(Cnl, n, l) @@ -175,40 +181,82 @@ def main() -> i32: print_expected() VR_SIZE: i32 = 32_768 - ww: i32 # "ww" is short for "workaround_index." - + # ---------------------------------------------------------------- print("\nhand-blocked accumulated outer product; block size = M2 =", M2) hand_optimized_to_remove_temporaries(Anm, Bml, Cnl, n, m, l, VR_SIZE, M2) print("\nActual result:") spot_print(Cnl, n, l) - + # ---------------------------------------------------------------- with_liberal_use_of_temporaries(Anm, Bml, Cnl, n, m, l, VR_SIZE, M2) print("\nActual result:") spot_print(Cnl, n, l) + # ---------------------------------------------------------------- + blocked_and_tiled_with_temporaries(Anm, Bml, Cnl, n, m, l, VR_SIZE, M1, M2) + print("\nActual result:") + spot_print(Cnl, n, l) + # ---------------------------------------------------------------- unblocked_accumulated_outer_product(Anm, Bml, Cnl, n, m, l) print("\nActual result:") spot_print(Cnl, n, l) - + # ---------------------------------------------------------------- return 0 def unblocked_accumulated_outer_product( Anm: i16[:, :], Bml: i16[:, :], Cnl: i16[:, :], - n: i32, m: i32, l: i32): + n: i32, m: i32, l: i32) -> None: print("\nunblocked, naive Accumulated Outer Product for reference") i: i32 k: i32 - ww: i32 for i in range(0, n): clear_row(Cnl, i, l) for k in range(0, m): # rows of B accumulate_in_place_outer_product_row(Cnl, i, Bml, k, Anm, l) +def blocked_and_tiled_with_temporaries( + Anm: i16[:, :], Bml: i16[:, :], Cnl: i16[:, :], + n: i32, m: i32, l: i32, VR_SIZE: i32, + M1: i32, M2: i32): + # L1cache: Annotation[u16[VR_SIZE], SIMD] = empty((M2 + 1), VR_SIZE, dtype=uint16) # TODO + # B_vr: Annotation[u16[V], SIMD] = empty((V,), dtype=uint16) # TODO + # C_vr: Annotation[u16[V], SIMD] = empty((V,), dtype=uint16) # TODO + # T_vr: Annotation[u16[V], SIMD] = empty((V,), dtype=uint16) # TODO + L1_B_index: i32 = 0 + L1_C_base: i32 = 1 + k: i32 + kk: i32 + jj: i32 + i: i32 + ii: i32 + print("\nbloced and tiled with temporaries") + B1l: i16[1, l] = empty((1, l), dtype=int16) + T1l: i16[1, l] = empty((1, l), dtype=int16) + for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C + for ii in range(0, n, M2): # each M2 block in A cols and B rows + for i in range(0, M2): # Zero-out rows of C. + clear_row(Cnl, i + ii, l) + # L1cache[L1_C_base + i, :] = C_vr[:] # TODO + for kk in range(0, m, M1): + for k in range(0, M1): # rows of Bml + # L1cache[L1_B_index, :] = Bml[kk + k, jj : (jj + VR_SIZE)] # TODO + # B_vr[:] = l1cache[L1_B_index, :] # TODO + # -------------------------------------------- + # B_1l[0, :] = B_ml[k + kk, :] + broadcast_copy_row(B1l, 0, Bml, k + kk, l) + for i in range(0, M2): + # T1l[0, :] = Anm[i + ii, k + kk] + broadcast_i16_row(T1l, 0, Anm[i + ii, k + kk], l) + # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) + hadamard_product_in_place_row(T1l, 0, B1l, 0, l) + # Cnl[i + ii, :] += T1l[0, :] + accumulate_in_place_row(Cnl, i + ii, T1l, 0, l) + + def with_liberal_use_of_temporaries( Anm: i16[:, :], Bml: i16[:, :], Cnl: i16[:, :], n: i32, m: i32, l: i32, VR_SIZE: i32, M2: i32): @@ -216,9 +264,7 @@ def with_liberal_use_of_temporaries( jj: i32 ii: i32 i: i32 - ww: i32 print("\nliberal usage of temporaries") - # Temporaries (TODO: get rid of them, as indicated by proposed syntax below) B1l: i16[1, l] = empty((1, l), dtype=int16) T1l: i16[1, l] = empty((1, l), dtype=int16) for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C @@ -244,7 +290,6 @@ def hand_optimized_to_remove_temporaries( jj: i32 ii: i32 i: i32 - ww: i32 print("\noptimized by hand to remove temporaries") for jj in range(0, l, VR_SIZE): # each VR-col chunk in B and C for ii in range(0, n, M2): # each M2 block in A cols and B rows From 6fdc4b32cb98f14f7705658308a0963cc88eb993 Mon Sep 17 00:00:00 2001 From: rebcabin Date: Mon, 12 Feb 2024 09:14:12 -0800 Subject: [PATCH 15/16] better comments, more M1 --- integration_tests/matmul_integration.py | 33 ++++++++++++++++++++----- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/integration_tests/matmul_integration.py b/integration_tests/matmul_integration.py index 5a56523ea9..ae88adaa0b 100644 --- a/integration_tests/matmul_integration.py +++ b/integration_tests/matmul_integration.py @@ -233,6 +233,7 @@ def blocked_and_tiled_with_temporaries( jj: i32 i: i32 ii: i32 + A_ik: i16 print("\nbloced and tiled with temporaries") B1l: i16[1, l] = empty((1, l), dtype=int16) T1l: i16[1, l] = empty((1, l), dtype=int16) @@ -240,21 +241,37 @@ def blocked_and_tiled_with_temporaries( for ii in range(0, n, M2): # each M2 block in A cols and B rows for i in range(0, M2): # Zero-out rows of C. clear_row(Cnl, i + ii, l) - # L1cache[L1_C_base + i, :] = C_vr[:] # TODO + # L1cache[(L1_C_base + i), :] = C_vr[:] # TODO for kk in range(0, m, M1): for k in range(0, M1): # rows of Bml - # L1cache[L1_B_index, :] = Bml[kk + k, jj : (jj + VR_SIZE)] # TODO + # L1cache[L1_B_index, :] = Bml[(kk + k), jj : (jj + VR_SIZE)] # TODO # B_vr[:] = l1cache[L1_B_index, :] # TODO - # -------------------------------------------- - # B_1l[0, :] = B_ml[k + kk, :] + # ------------------------------------------------------------ + # B_1l[0, :] = B_ml[(k + kk), :] broadcast_copy_row(B1l, 0, Bml, k + kk, l) for i in range(0, M2): - # T1l[0, :] = Anm[i + ii, k + kk] - broadcast_i16_row(T1l, 0, Anm[i + ii, k + kk], l) + # C_vr[:] = L1cache[(L1_C_base + i), :] # TODO + # -------------------------------------------------------- + # T_vr[:] = A_ik # TODO + # -------------------------------------------------------- + # T1l[0, :] = Anm[(i + ii), (k + kk)] + A_ik = Anm[i + ii, k + kk] + broadcast_i16_row(T1l, 0, A_ik, l) + # -------------------------------------------------------- + # T_vr[:] = B_vr[:] * T_vr[:] # Hadamard product # TODO + # T_vr[:] *= B_vr[:] # Hadamard product alternative # TODO + # -------------------------------------------------------- # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) hadamard_product_in_place_row(T1l, 0, B1l, 0, l) + # -------------------------------------------------------- + # C_vr[:] = C_vr[:] + T_vr[:] # TODO + # C_vr[:] += T_vr[:] # Alternative # TODO + # -------------------------------------------------------- # Cnl[i + ii, :] += T1l[0, :] accumulate_in_place_row(Cnl, i + ii, T1l, 0, l) + # L1cache[(L1_C_base + i), :] = C_vr[:] # TODO + # for i in range(0, M2): # TODO + # Cnl[(ii + i), jj : (jj + VR_SIZE)] = L1cache[(L1_C_base + i), :] # TODO def with_liberal_use_of_temporaries( @@ -272,13 +289,17 @@ def with_liberal_use_of_temporaries( for i in range(0, M2): # Zero-out rows of C. clear_row(Cnl, i + ii, l) for k in range(0, m): # rows of B + # ------------------------------------------------------------ # B_1l[0, :] = B_ml[k, :] broadcast_copy_row(B1l, 0, Bml, k, l) for i in range(0, M2): + # -------------------------------------------------------- # T1l[0, :] = Anm[i + ii, k] broadcast_i16_row(T1l, 0, Anm[i + ii, k], l) + # -------------------------------------------------------- # T1l[0, :] = np.multiply(B1l[0, :], T1l[0, :]) hadamard_product_in_place_row(T1l, 0, B1l, 0, l) + # -------------------------------------------------------- # Cnl[i + ii, :] += T1l[0, :] accumulate_in_place_row(Cnl, i + ii, T1l, 0, l) From eb2af0103957fdc78cc2bd52c05b807edd2a429b Mon Sep 17 00:00:00 2001 From: rebcabin Date: Mon, 12 Feb 2024 09:23:14 -0800 Subject: [PATCH 16/16] remove .idea and add .idea to .gitignore --- .gitignore | 1 + .idea/.gitignore | 8 -------- .idea/customTargets.xml | 12 ------------ .idea/lpython.iml | 16 ---------------- .idea/misc.xml | 12 ------------ .idea/modules.xml | 8 -------- .idea/tools/External Tools.xml | 9 --------- .idea/vcs.xml | 6 ------ 8 files changed, 1 insertion(+), 71 deletions(-) delete mode 100644 .idea/.gitignore delete mode 100644 .idea/customTargets.xml delete mode 100644 .idea/lpython.iml delete mode 100644 .idea/misc.xml delete mode 100644 .idea/modules.xml delete mode 100644 .idea/tools/External Tools.xml delete mode 100644 .idea/vcs.xml diff --git a/.gitignore b/.gitignore index 267f5ee03c..5a5df47e61 100644 --- a/.gitignore +++ b/.gitignore @@ -169,6 +169,7 @@ MANIFEST ## Editor Files .vscode/ .vs/ +.idea/ ## Build Files */bin/lpython diff --git a/.idea/.gitignore b/.idea/.gitignore deleted file mode 100644 index 13566b81b0..0000000000 --- a/.idea/.gitignore +++ /dev/null @@ -1,8 +0,0 @@ -# Default ignored files -/shelf/ -/workspace.xml -# Editor-based HTTP Client requests -/httpRequests/ -# Datasource local storage ignored files -/dataSources/ -/dataSources.local.xml diff --git a/.idea/customTargets.xml b/.idea/customTargets.xml deleted file mode 100644 index 98fb12e6b5..0000000000 --- a/.idea/customTargets.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/lpython.iml b/.idea/lpython.iml deleted file mode 100644 index 0a962387a1..0000000000 --- a/.idea/lpython.iml +++ /dev/null @@ -1,16 +0,0 @@ - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml deleted file mode 100644 index 7fa6796e56..0000000000 --- a/.idea/misc.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - - - - \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml deleted file mode 100644 index 83d184b0d9..0000000000 --- a/.idea/modules.xml +++ /dev/null @@ -1,8 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/.idea/tools/External Tools.xml b/.idea/tools/External Tools.xml deleted file mode 100644 index 8cdd0760c9..0000000000 --- a/.idea/tools/External Tools.xml +++ /dev/null @@ -1,9 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 35eb1ddfbb..0000000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file